From 5279e8955f057f2bd7c4d17918f31cfe4ff04a04 Mon Sep 17 00:00:00 2001 From: overlookmotel Date: Wed, 31 Jan 2024 10:57:47 +0000 Subject: [PATCH] refactor(parser): byte handler for illegal bytes (#2229) This adds a separate byte handler to the lexer for byte values which should never be encountered: 1. UTF-8 continuation bytes (i.e. middle of a multi-byte UTF-8 byte sequence). 2. Byte values which are illegal in valid UTF-8 strings. At present, this function is impossible to reach, because `std::str::Chars` ensures the next byte is always the *start* of a valid UTF-8 byte sequence. But later changes I intend to make, which introduce unsafe code, will make it possible (but highly undesirable!). In the meantime, I don't think it does any harm to handle this case. --- crates/oxc_parser/src/lexer/byte_handlers.rs | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/crates/oxc_parser/src/lexer/byte_handlers.rs b/crates/oxc_parser/src/lexer/byte_handlers.rs index 09efc66bc..4c2926ad1 100644 --- a/crates/oxc_parser/src/lexer/byte_handlers.rs +++ b/crates/oxc_parser/src/lexer/byte_handlers.rs @@ -28,14 +28,14 @@ static BYTE_HANDLERS: [ByteHandler; 256] = [ IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, BTO, ESC, BTC, CRT, IDT, // 5 TPL, L_A, L_B, L_C, L_D, L_E, L_F, L_G, IDT, L_I, IDT, L_K, L_L, L_M, L_N, L_O, // 6 L_P, IDT, L_R, L_S, L_T, L_U, L_V, L_W, IDT, L_Y, IDT, BEO, PIP, BEC, TLD, ERR, // 7 - UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // 8 - UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // 9 - UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // A - UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // B + UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, // 8 + UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, // 9 + UER, UER, UER, UER, UER, UER, 
UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, // A + UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, // B UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // C UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // D UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // E - UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // F + UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UER, UER, UER, UER, UER, UER, UER, UER, // F ]; #[allow(clippy::unnecessary_safety_comment)] @@ -586,3 +586,11 @@ ascii_byte_handler!(L_Y(lexer) match &lexer.identifier_name_handler()[1..] { // NB: Must not use `ascii_byte_handler!()` macro, as this handler is for non-ASCII chars. #[allow(clippy::redundant_closure_for_method_calls)] const UNI: ByteHandler = |lexer| lexer.unicode_char_handler(); + +// Handler for UTF-8 continuation bytes (128-191 / 0x80-0xBF) (i.e. middle of a multi-byte UTF-8 sequence) +// + and byte values which are not legal in UTF-8 strings (248-255 / 0xF8-0xFF). +// `handle_byte()` should only be called with 1st byte of a valid UTF-8 char, +// so something has gone wrong if we get here; the handler panics via `unreachable!()`. +// NOTE(review): 0xC0, 0xC1 and 0xF5-0xF7 are also never valid in UTF-8 but still map to `UNI` in the table - confirm whether they belong here too. See https://en.wikipedia.org/wiki/UTF-8 +// NB: Must not use `ascii_byte_handler!()` macro, as this handler is for non-ASCII bytes. +const UER: ByteHandler = |_| unreachable!();