Skip to content

Commit cbc421e

Browse files
committed
Add fast path for ASCII bytes in UTF-8 validation
1 parent 6f49474 commit cbc421e

File tree

1 file changed

+9
-1
lines changed

1 file changed

+9
-1
lines changed

ext/dom/html_document.c

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -517,8 +517,16 @@ static bool dom_decode_encode_fast_path(
517517
const lxb_char_t *buf_ref = *buf_ref_ref;
518518
const lxb_char_t *last_output = buf_ref;
519519
while (buf_ref != buf_end) {
520-
const lxb_char_t *buf_ref_backup = buf_ref;
521520
/* Fast path converts non-validated UTF-8 -> validated UTF-8 */
521+
if (decoding_encoding_ctx->decode.u.utf_8.need == 0 && *buf_ref < 0x80) {
522+
/* Fast path within the fast path: try to skip non-mb bytes in bulk if we are not in a state where we
523+
* need more UTF-8 bytes to complete a sequence.
524+
* It might be tempting to use SIMD here, but it turns out that this is less efficient because
525+
* we need to process the same byte multiple times sometimes when mixing ASCII with multibyte. */
526+
buf_ref++;
527+
continue;
528+
}
529+
const lxb_char_t *buf_ref_backup = buf_ref;
522530
lxb_codepoint_t codepoint = lxb_encoding_decode_utf_8_single(&decoding_encoding_ctx->decode, &buf_ref, buf_end);
523531
if (UNEXPECTED(codepoint > LXB_ENCODING_MAX_CODEPOINT)) {
524532
size_t skip = buf_ref - buf_ref_backup; /* Skip invalid data, it's replaced by the UTF-8 replacement bytes */

0 commit comments

Comments
 (0)