diff --git a/crates/oxc_parser/src/lexer/byte_handlers.rs b/crates/oxc_parser/src/lexer/byte_handlers.rs index 1a182943e..1e9a420b6 100644 --- a/crates/oxc_parser/src/lexer/byte_handlers.rs +++ b/crates/oxc_parser/src/lexer/byte_handlers.rs @@ -417,8 +417,7 @@ ascii_byte_handler!(QST(lexer) { match next_2_bytes[0] { b'?' => { if next_2_bytes[1] == b'=' { - lexer.consume_char(); - lexer.consume_char(); + lexer.consume_2_chars(); Kind::Question2Eq } else { lexer.consume_char(); diff --git a/crates/oxc_parser/src/lexer/mod.rs b/crates/oxc_parser/src/lexer/mod.rs index eecad34c8..307f7287c 100644 --- a/crates/oxc_parser/src/lexer/mod.rs +++ b/crates/oxc_parser/src/lexer/mod.rs @@ -251,6 +251,18 @@ impl<'a> Lexer<'a> { self.source.next_char().unwrap() } + /// Consume the current char and the next if not at EOF + #[inline] + fn next_2_chars(&mut self) -> Option<[char; 2]> { + self.source.next_2_chars() + } + + /// Consume the current char and the next + #[inline] + fn consume_2_chars(&mut self) -> [char; 2] { + self.next_2_chars().unwrap() + } + /// Peek the next byte without advancing the position #[inline] fn peek_byte(&self) -> Option<u8> { diff --git a/crates/oxc_parser/src/lexer/punctuation.rs b/crates/oxc_parser/src/lexer/punctuation.rs index 803071394..f49364c80 100644 --- a/crates/oxc_parser/src/lexer/punctuation.rs +++ b/crates/oxc_parser/src/lexer/punctuation.rs @@ -4,8 +4,7 @@ impl<'a> Lexer<'a> { /// Section 12.8 Punctuators pub(super) fn read_dot(&mut self) -> Kind { if self.peek_2_bytes() == Some([b'.', b'.']) { - self.consume_char(); - self.consume_char(); + self.consume_2_chars(); return Kind::Dot3; } if self.peek_byte().is_some_and(|b| b.is_ascii_digit()) { diff --git a/crates/oxc_parser/src/lexer/source.rs b/crates/oxc_parser/src/lexer/source.rs index 3531b39b1..4a5b6a256 100644 --- a/crates/oxc_parser/src/lexer/source.rs +++ b/crates/oxc_parser/src/lexer/source.rs @@ -374,6 +374,36 @@ impl<'a> Source<'a> { Some(c) } + /// Get next 2 chars of source, and
advance position to after them. + #[inline] + pub(super) fn next_2_chars(&mut self) -> Option<[char; 2]> { + // Check not at EOF and handle if 2 x ASCII bytes + let [byte1, byte2] = self.peek_2_bytes()?; + if byte1.is_ascii() && byte2.is_ascii() { + // SAFETY: We just checked that there are at least 2 bytes remaining, + // and next 2 bytes are ASCII, so advancing by 2 bytes must put `ptr` + // in bounds and on a UTF-8 character boundary + unsafe { self.ptr = self.ptr.add(2) }; + return Some([byte1 as char, byte2 as char]); + } + + // At least one of next 2 bytes is part of a multi-byte Unicode character. + // Check invariant that `ptr` is on a UTF-8 character boundary. + debug_assert!(!is_utf8_cont_byte(byte1)); + + // Create a `Chars` iterator, get next 2 chars from it, and then update `self.ptr` + // to match `Chars` iterator's updated pointer afterwards. + // `Chars` iterator upholds same invariants as `Source`, so its pointer is guaranteed + // to be valid as `self.ptr`. + let mut chars = self.remaining().chars(); + // SAFETY: We know that there are at least 2 bytes remaining, so first call to + // `chars.next()` must return `Some(_)` + let c1 = unsafe { chars.next().unwrap_unchecked() }; + let c2 = chars.next()?; + self.ptr = chars.as_str().as_ptr(); + Some([c1, c2]) + } + /// Get next byte of source, and advance position to after it. /// /// # SAFETY diff --git a/crates/oxc_parser/src/lexer/unicode.rs b/crates/oxc_parser/src/lexer/unicode.rs index 752318b2e..a52cb3d41 100644 --- a/crates/oxc_parser/src/lexer/unicode.rs +++ b/crates/oxc_parser/src/lexer/unicode.rs @@ -194,8 +194,7 @@ impl<'a> Lexer<'a> { return Some(SurrogatePair::CodePoint(high)); } - self.consume_char(); - self.consume_char(); + self.consume_2_chars(); let low = self.hex_4_digits()?;