From 5279e8955f057f2bd7c4d17918f31cfe4ff04a04 Mon Sep 17 00:00:00 2001
From: overlookmotel <theoverlookmotel@gmail.com>
Date: Wed, 31 Jan 2024 10:57:47 +0000
Subject: [PATCH] refactor(parser): byte handler for illegal bytes (#2229)

This adds a separate byte handler to the lexer for byte values which
should never be encountered:

1. UTF-8 continuation bytes (i.e. middle of a multi-byte UTF-8 byte
sequence).
2. Byte values which are illegal in valid UTF-8 strings.

At present, this function is impossible to reach, because
`std::str::Chars` ensures the next byte is always the *start* of a valid
UTF-8 byte sequence. But later changes I intend to make will introduce
unsafe code, which will make reaching it possible (though highly
undesirable!). In the meantime, I don't think it does any harm to handle
this case.
---
 crates/oxc_parser/src/lexer/byte_handlers.rs | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/crates/oxc_parser/src/lexer/byte_handlers.rs b/crates/oxc_parser/src/lexer/byte_handlers.rs
index 09efc66bc..4c2926ad1 100644
--- a/crates/oxc_parser/src/lexer/byte_handlers.rs
+++ b/crates/oxc_parser/src/lexer/byte_handlers.rs
@@ -28,14 +28,14 @@ static BYTE_HANDLERS: [ByteHandler; 256] = [
     IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, BTO, ESC, BTC, CRT, IDT, // 5
     TPL, L_A, L_B, L_C, L_D, L_E, L_F, L_G, IDT, L_I, IDT, L_K, L_L, L_M, L_N, L_O, // 6
     L_P, IDT, L_R, L_S, L_T, L_U, L_V, L_W, IDT, L_Y, IDT, BEO, PIP, BEC, TLD, ERR, // 7
-    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // 8
-    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // 9
-    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // A
-    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // B
+    UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, // 8
+    UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, // 9
+    UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, // A
+    UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, UER, // B
     UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // C
     UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // D
     UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // E
-    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // F
+    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UER, UER, UER, UER, UER, UER, UER, UER, // F
 ];
 
 #[allow(clippy::unnecessary_safety_comment)]
@@ -586,3 +586,11 @@ ascii_byte_handler!(L_Y(lexer) match &lexer.identifier_name_handler()[1..] {
 // NB: Must not use `ascii_byte_handler!()` macro, as this handler is for non-ASCII chars.
 #[allow(clippy::redundant_closure_for_method_calls)]
 const UNI: ByteHandler = |lexer| lexer.unicode_char_handler();
+
+// Handler for bytes that should never be dispatched on: UTF-8 continuation bytes
+// (128-191, i.e. the middle of a multi-byte sequence) and the illegal byte values 248-255.
+// NOTE(review): 0xC0, 0xC1, 0xF5-0xF7 are also invalid UTF-8 yet still map to `UNI` - confirm.
+// `handle_byte()` should only be called with 1st byte of a valid UTF-8 char, so something
+// has gone wrong (and we panic) if we get here. https://en.wikipedia.org/wiki/UTF-8
+// NB: Must not use `ascii_byte_handler!()` macro, as this handler is for non-ASCII bytes.
+const UER: ByteHandler = |_| unreachable!();