From cc2ddbee7757c76f59b8e34af1c8e8ede4568e4a Mon Sep 17 00:00:00 2001 From: overlookmotel Date: Fri, 16 Feb 2024 12:49:01 +0000 Subject: [PATCH] refactor(parser): catch all illegal UTF-8 bytes (#2415) Catch all illegal UTF-8 bytes with the `UER` byte handler. From https://datatracker.ietf.org/doc/html/rfc3629: > The octet values C0, C1, F5 to FF never appear. This change *should* make no difference at all, as a valid `&str` may not contain any of these byte values anyway. But it's possible if the user has e.g. created the string with `str::from_utf8_unchecked` and not obeyed the safety constraints. This will at least contain the damage if that's happened, and panic rather than lead to UB. And since we're already catching other error conditions, may as well catch them all. --- crates/oxc_parser/src/lexer/byte_handlers.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/crates/oxc_parser/src/lexer/byte_handlers.rs b/crates/oxc_parser/src/lexer/byte_handlers.rs index c368f527c..415f7255e 100644 --- a/crates/oxc_parser/src/lexer/byte_handlers.rs +++ b/crates/oxc_parser/src/lexer/byte_handlers.rs @@ -31,10 +31,10 @@ static BYTE_HANDLERS: [ByteHandler; 256] = [ UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, // 9 UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, // A UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, // B - UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // C + UER, UER, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // C UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // D UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // E - UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UER, UER, UER, UER, UER, UER, UER, UER, // F + UNI, UNI, UNI, UNI, UNI, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, // F ]; /// Macro for defining 
a byte handler. @@ -679,11 +679,11 @@ byte_handler!(UNI(lexer) { lexer.unicode_char_handler() }); -// UTF-8 continuation bytes (128-191) (i.e. middle of a multi-byte UTF-8 sequence) -// + and byte values which are not legal in UTF-8 strings (248-255). -// `handle_byte()` should only be called with 1st byte of a valid UTF-8 char, +// UTF-8 continuation bytes (0x80 - 0xBF) (i.e. middle of a multi-byte UTF-8 sequence) +// + and byte values which are not legal in UTF-8 strings (0xC0, 0xC1, 0xF5 - 0xFF). +// `handle_byte()` should only be called with 1st byte of a valid UTF-8 character, // so something has gone wrong if we get here. -// https://en.wikipedia.org/wiki/UTF-8 +// https://datatracker.ietf.org/doc/html/rfc3629 // NB: Must not use `ascii_byte_handler!` macro, as this handler is for non-ASCII bytes. byte_handler!(UER(_lexer) { unreachable!();