perf(parser): support peeking over bytes (#4304)

Closes https://github.com/oxc-project/oxc/issues/3291
2026-05-19 04:08:41 +00:00 · 2024-07-30 17:53:13 +00:00 · 2024-07-30 17:53:13 +00:00 · c9c38a187c
commit c9c38a187c
parent 732f4e2591
11 changed files with 116 additions and 76 deletions
--- a/crates/oxc_ast/src/ast_impl/literal.rs
+++ b/crates/oxc_ast/src/ast_impl/literal.rs
@@ -108,6 +108,24 @@ impl TryFrom<char> for RegExpFlags {
    }
 }

+impl TryFrom<u8> for RegExpFlags { // byte-based counterpart of the `TryFrom<char>` impl above
+    type Error = u8; // on failure the rejected byte is handed back unchanged
+
+    fn try_from(value: u8) -> Result<Self, Self::Error> { // maps one ASCII flag byte to its flag constant
+        match value {
+            b'g' => Ok(Self::G),
+            b'i' => Ok(Self::I),
+            b'm' => Ok(Self::M),
+            b's' => Ok(Self::S),
+            b'u' => Ok(Self::U),
+            b'y' => Ok(Self::Y),
+            b'd' => Ok(Self::D),
+            b'v' => Ok(Self::V),
+            _ => Err(value), // any byte other than the eight listed above is not a flag
+        }
+    }
+}
+
 impl fmt::Display for RegExpFlags {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        if self.contains(Self::G) {
--- a/crates/oxc_parser/src/lexer/byte_handlers.rs
+++ b/crates/oxc_parser/src/lexer/byte_handlers.rs
@@ -336,12 +336,12 @@ ascii_byte_handler!(PRD(lexer) {
 // /
 ascii_byte_handler!(SLH(lexer) {
    lexer.consume_char();
-    match lexer.peek() {
-        Some('/') => {
+    match lexer.peek_byte() {
+        Some(b'/') => {
            lexer.consume_char();
            lexer.skip_single_line_comment()
        }
-        Some('*') => {
+        Some(b'*') => {
            lexer.consume_char();
            lexer.skip_multi_line_comment()
        }
@@ -418,9 +418,9 @@ ascii_byte_handler!(QST(lexer) {
        } else {
            Kind::Question2
        }
-    } else if lexer.peek() == Some('.') {
+    } else if lexer.peek_byte() == Some(b'.') {
        // parse `?.1` as `?` `.1`
-        if lexer.peek2().is_some_and(|c| c.is_ascii_digit()) {
+        if lexer.peek_char2().is_some_and(|c| c.is_ascii_digit()) {
            Kind::Question
        } else {
            lexer.consume_char();
--- a/crates/oxc_parser/src/lexer/identifier.rs
+++ b/crates/oxc_parser/src/lexer/identifier.rs
@@ -98,7 +98,7 @@ impl<'a> Lexer<'a> {
    /// Any number of characters can have already been consumed from `self.source` prior to it.
    /// `self.source` should be positioned at start of Unicode character.
    fn identifier_tail_unicode(&mut self, start_pos: SourcePosition) -> &'a str {
-        let c = self.peek().unwrap();
+        let c = self.peek_char().unwrap();
        if is_identifier_part_unicode(c) {
            self.consume_char();
            self.identifier_tail_after_unicode(start_pos)
@@ -115,7 +115,7 @@ impl<'a> Lexer<'a> {
    pub(super) fn identifier_tail_after_unicode(&mut self, start_pos: SourcePosition) -> &'a str {
        // Identifier contains a Unicode chars, so probably contains more.
        // So just iterate over chars now, instead of bytes.
-        while let Some(c) = self.peek() {
+        while let Some(c) = self.peek_char() {
            if is_identifier_part(c) {
                self.consume_char();
            } else if c == '\\' {
@@ -177,7 +177,7 @@ impl<'a> Lexer<'a> {
            // Consume chars until reach end of identifier or another escape
            let chunk_start = self.source.position();
            loop {
-                let maybe_char = self.peek();
+                let maybe_char = self.peek_char();
                if maybe_char.is_some_and(is_identifier_part) {
                    self.consume_char();
                    continue;
@@ -272,7 +272,7 @@ impl<'a> Lexer<'a> {
    fn private_identifier_not_ascii_id(&mut self) -> Kind {
        let b = self.source.peek_byte().unwrap();
        if !b.is_ascii() {
-            let c = self.peek().unwrap();
+            let c = self.peek_char().unwrap();
            if is_identifier_start_unicode(c) {
                let start_pos = self.source.position();
                self.consume_char();
--- a/crates/oxc_parser/src/lexer/jsx.rs
+++ b/crates/oxc_parser/src/lexer/jsx.rs
@@ -61,12 +61,12 @@ impl<'a> Lexer<'a> {
    /// `JSXFragment`
    /// { `JSXChildExpressionopt` }
    fn read_jsx_child(&mut self) -> Kind {
-        match self.peek() {
-            Some('<') => {
+        match self.peek_byte() {
+            Some(b'<') => {
                self.consume_char();
                Kind::LAngle
            }
-            Some('{') => {
+            Some(b'{') => {
                self.consume_char();
                Kind::LCurly
            }
@@ -122,7 +122,7 @@ impl<'a> Lexer<'a> {
            // Unicode chars are rare in identifiers, so cold branch to keep common path for ASCII
            // as fast as possible
            cold_branch(|| {
-                while let Some(c) = self.peek() {
+                while let Some(c) = self.peek_char() {
                    if c == '-' || is_identifier_part(c) {
                        self.consume_char();
                    } else {
--- a/crates/oxc_parser/src/lexer/kind.rs
+++ b/crates/oxc_parser/src/lexer/kind.rs
@@ -206,11 +206,11 @@ impl Kind {
        )
    }

-    pub fn matches_number_char(self, c: char) -> bool {
+    pub fn matches_number_char(self, c: u8) -> bool {
        match self {
            Decimal => c.is_ascii_digit(),
-            Binary => matches!(c, '0'..='1'),
-            Octal => matches!(c, '0'..='7'),
+            Binary => matches!(c, b'0'..=b'1'),
+            Octal => matches!(c, b'0'..=b'7'),
            Hex => c.is_ascii_hexdigit(),
            _ => unreachable!(),
        }
--- a/crates/oxc_parser/src/lexer/mod.rs
+++ b/crates/oxc_parser/src/lexer/mod.rs
@@ -251,15 +251,27 @@ impl<'a> Lexer<'a> {
        self.source.next_char().unwrap()
    }

+    /// Peek the next byte without advancing the position (delegates to `self.source`)
+    #[inline]
+    fn peek_byte(&self) -> Option<u8> { // the byte may be non-ASCII; use `peek_char` for a whole char
+        self.source.peek_byte()
+    }
+
+    /// Peek the next two bytes without advancing the position (delegates to `self.source`)
+    #[inline]
+    fn peek_2_bytes(&self) -> Option<[u8; 2]> { // `Source::peek_2_bytes` yields `None` when < 2 bytes remain
+        self.source.peek_2_bytes()
+    }
+
    /// Peek the next char without advancing the position
    #[inline]
-    fn peek(&self) -> Option<char> {
+    fn peek_char(&self) -> Option<char> {
        self.source.peek_char()
    }

    /// Peek the next next char without advancing the position
    #[inline]
-    fn peek2(&self) -> Option<char> {
+    fn peek_char2(&self) -> Option<char> {
        self.source.peek_char2()
    }

@@ -284,7 +296,7 @@ impl<'a> Lexer<'a> {
    /// Return `IllegalCharacter` Error or `UnexpectedEnd` if EOF
    fn unexpected_err(&mut self) {
        let offset = self.current_offset();
-        match self.peek() {
+        match self.peek_char() {
            Some(c) => self.error(diagnostics::invalid_character(c, offset)),
            None => self.error(diagnostics::unexpected_end(offset)),
        }
--- a/crates/oxc_parser/src/lexer/numeric.rs
+++ b/crates/oxc_parser/src/lexer/numeric.rs
@@ -6,19 +6,19 @@ use crate::diagnostics;
 impl<'a> Lexer<'a> {
    /// 12.9.3 Numeric Literals with `0` prefix
    pub(super) fn read_zero(&mut self) -> Kind {
-        match self.peek() {
-            Some('b' | 'B') => self.read_non_decimal(Kind::Binary),
-            Some('o' | 'O') => self.read_non_decimal(Kind::Octal),
-            Some('x' | 'X') => self.read_non_decimal(Kind::Hex),
-            Some('e' | 'E') => {
+        match self.peek_byte() {
+            Some(b'b' | b'B') => self.read_non_decimal(Kind::Binary),
+            Some(b'o' | b'O') => self.read_non_decimal(Kind::Octal),
+            Some(b'x' | b'X') => self.read_non_decimal(Kind::Hex),
+            Some(b'e' | b'E') => {
                self.consume_char();
                self.read_decimal_exponent()
            }
-            Some('.') => {
+            Some(b'.') => {
                self.consume_char();
                self.decimal_literal_after_decimal_point_after_digits()
            }
-            Some('n') => {
+            Some(b'n') => {
                self.consume_char();
                self.check_after_numeric_literal(Kind::Decimal)
            }
@@ -42,23 +42,23 @@ impl<'a> Lexer<'a> {
    fn read_non_decimal(&mut self, kind: Kind) -> Kind {
        self.consume_char();

-        if self.peek().is_some_and(|c| kind.matches_number_char(c)) {
+        if self.peek_byte().is_some_and(|c| kind.matches_number_char(c)) {
            self.consume_char();
        } else {
            self.unexpected_err();
            return Kind::Undetermined;
        }

-        while let Some(c) = self.peek() {
+        while let Some(c) = self.peek_byte() {
            match c {
-                '_' => {
+                b'_' => {
                    self.consume_char();
                    // NOTE: it looks invalid numeric tokens are still parsed.
                    // This seems to be a waste. It also requires us to put this
                    // call here instead of after we ensure the next character
                    // is a number character
                    self.token.set_has_separator();
-                    if self.peek().is_some_and(|c| kind.matches_number_char(c)) {
+                    if self.peek_byte().is_some_and(|c| kind.matches_number_char(c)) {
                        self.consume_char();
                    } else {
                        self.unexpected_err();
@@ -71,20 +71,18 @@ impl<'a> Lexer<'a> {
                _ => break,
            }
        }
-        if self.peek() == Some('n') {
-            self.consume_char();
-        }
+        self.next_ascii_char_eq(b'n');
        self.check_after_numeric_literal(kind)
    }

    fn read_legacy_octal(&mut self) -> Kind {
        let mut kind = Kind::Octal;
        loop {
-            match self.peek() {
-                Some('0'..='7') => {
+            match self.peek_byte() {
+                Some(b'0'..=b'7') => {
                    self.consume_char();
                }
-                Some('8'..='9') => {
+                Some(b'8'..=b'9') => {
                    self.consume_char();
                    kind = Kind::Decimal;
                }
@@ -92,14 +90,14 @@ impl<'a> Lexer<'a> {
            }
        }

-        match self.peek() {
+        match self.peek_byte() {
            // allow 08.5 and 09.5
-            Some('.') if kind == Kind::Decimal => {
+            Some(b'.') if kind == Kind::Decimal => {
                self.consume_char();
                self.decimal_literal_after_decimal_point_after_digits()
            }
            // allow 08e1 and 09e1
-            Some('e') if kind == Kind::Decimal => {
+            Some(b'e') if kind == Kind::Decimal => {
                self.consume_char();
                self.read_decimal_exponent()
            }
@@ -108,12 +106,12 @@ impl<'a> Lexer<'a> {
    }

    fn read_decimal_exponent(&mut self) -> Kind {
-        let kind = match self.peek() {
-            Some('-') => {
+        let kind = match self.peek_byte() {
+            Some(b'-') => {
                self.consume_char();
                Kind::NegativeExponential
            }
-            Some('+') => {
+            Some(b'+') => {
                self.consume_char();
                Kind::PositiveExponential
            }
@@ -124,7 +122,7 @@ impl<'a> Lexer<'a> {
    }

    fn read_decimal_digits(&mut self) {
-        if self.peek().is_some_and(|c| c.is_ascii_digit()) {
+        if self.peek_byte().is_some_and(|b| b.is_ascii_digit()) {
            self.consume_char();
        } else {
            self.unexpected_err();
@@ -135,23 +133,23 @@ impl<'a> Lexer<'a> {
    }

    fn read_decimal_digits_after_first_digit(&mut self) {
-        while let Some(c) = self.peek() {
-            match c {
-                '_' => {
+        while let Some(b) = self.peek_byte() {
+            match b {
+                b'_' => {
                    self.consume_char();
                    // NOTE: it looks invalid numeric tokens are still parsed.
                    // This seems to be a waste. It also requires us to put this
                    // call here instead of after we ensure the next character
                    // is an ASCII digit
                    self.token.set_has_separator();
-                    if self.peek().is_some_and(|c| c.is_ascii_digit()) {
+                    if self.peek_byte().is_some_and(|b| b.is_ascii_digit()) {
                        self.consume_char();
                    } else {
                        self.unexpected_err();
                        return;
                    }
                }
-                '0'..='9' => {
+                b'0'..=b'9' => {
                    self.consume_char();
                }
                _ => break,
@@ -172,16 +170,14 @@ impl<'a> Lexer<'a> {
    }

    fn optional_decimal_digits(&mut self) {
-        if self.peek().is_some_and(|c| c.is_ascii_digit()) {
+        if self.peek_byte().is_some_and(|b| b.is_ascii_digit()) {
            self.consume_char();
-        } else {
-            return;
+            self.read_decimal_digits_after_first_digit();
        }
-        self.read_decimal_digits_after_first_digit();
    }

    fn optional_exponent(&mut self) -> Option<Kind> {
-        if matches!(self.peek(), Some('e' | 'E')) {
+        if matches!(self.peek_byte(), Some(b'e' | b'E')) {
            self.consume_char();
            return Some(self.read_decimal_exponent());
        }
@@ -191,12 +187,12 @@ impl<'a> Lexer<'a> {
    fn check_after_numeric_literal(&mut self, kind: Kind) -> Kind {
        let offset = self.offset();
        // The SourceCharacter immediately following a NumericLiteral must not be an IdentifierStart or DecimalDigit.
-        let c = self.peek();
+        let c = self.peek_char();
        if c.is_none() || c.is_some_and(|ch| !ch.is_ascii_digit() && !is_identifier_start(ch)) {
            return kind;
        }
        self.consume_char();
-        while let Some(c) = self.peek() {
+        while let Some(c) = self.peek_char() {
            if is_identifier_start(c) {
                self.consume_char();
            } else {
--- a/crates/oxc_parser/src/lexer/punctuation.rs
+++ b/crates/oxc_parser/src/lexer/punctuation.rs
@@ -3,12 +3,12 @@ use super::{Kind, Lexer, Token};
 impl<'a> Lexer<'a> {
    /// Section 12.8 Punctuators
    pub(super) fn read_dot(&mut self) -> Kind {
-        if self.peek() == Some('.') && self.peek2() == Some('.') {
+        if self.peek_2_bytes() == Some([b'.', b'.']) {
            self.consume_char();
            self.consume_char();
            return Kind::Dot3;
        }
-        if self.peek().is_some_and(|c| c.is_ascii_digit()) {
+        if self.peek_byte().is_some_and(|b| b.is_ascii_digit()) {
            self.decimal_literal_after_decimal_point()
        } else {
            Kind::Dot
@@ -25,7 +25,7 @@ impl<'a> Lexer<'a> {
            }
        } else if self.next_ascii_char_eq(b'=') {
            Some(Kind::LtEq)
-        } else if self.peek() == Some('!')
+        } else if self.peek_byte() == Some(b'!')
            // SingleLineHTMLOpenComment `<!--` in script mode
            && self.source_type.is_script()
            && self.remaining().starts_with("!--")
--- a/crates/oxc_parser/src/lexer/regex.rs
+++ b/crates/oxc_parser/src/lexer/regex.rs
@@ -58,14 +58,16 @@ impl<'a> Lexer<'a> {
        let pattern_end = self.offset() - 1; // -1 to exclude `/`
        let mut flags = RegExpFlags::empty();

-        while let Some(ch @ ('$' | '_' | 'a'..='z' | 'A'..='Z' | '0'..='9')) = self.peek() {
+        while let Some(ch @ (b'$' | b'_' | b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9')) =
+            self.peek_byte()
+        {
            self.consume_char();
            let Ok(flag) = RegExpFlags::try_from(ch) else {
-                self.error(diagnostics::reg_exp_flag(ch, self.current_offset()));
+                self.error(diagnostics::reg_exp_flag(ch as char, self.current_offset()));
                continue;
            };
            if flags.contains(flag) {
-                self.error(diagnostics::reg_exp_flag_twice(ch, self.current_offset()));
+                self.error(diagnostics::reg_exp_flag_twice(ch as char, self.current_offset()));
                continue;
            }
            flags |= flag;
--- a/crates/oxc_parser/src/lexer/source.rs
+++ b/crates/oxc_parser/src/lexer/source.rs
@@ -498,6 +498,19 @@ impl<'a> Source<'a> {
        }
    }

+    /// Peek next two bytes of source without consuming them, or `None` if fewer than 2 bytes remain.
+    #[inline]
+    pub(super) fn peek_2_bytes(&self) -> Option<[u8; 2]> {
+        if (self.end as usize).saturating_sub(self.ptr as usize) >= 2 {
+            // SAFETY: The check above ensures that there are at least 2 bytes to
+            // read from `self.ptr` without overflowing past `self.end`.
+            let bytes = unsafe { self.position().read2() };
+            Some(bytes)
+        } else {
+            None // fewer than 2 bytes between `self.ptr` and `self.end`
+        }
+    }
+
    /// Peek next byte of source without consuming it, without EOF bounds-check.
    ///
    /// # SAFETY
--- a/crates/oxc_parser/src/lexer/unicode.rs
+++ b/crates/oxc_parser/src/lexer/unicode.rs
@@ -18,7 +18,7 @@ enum SurrogatePair {

 impl<'a> Lexer<'a> {
    pub(super) fn unicode_char_handler(&mut self) -> Kind {
-        let c = self.peek().unwrap();
+        let c = self.peek_char().unwrap();
        match c {
            c if is_identifier_start_unicode(c) => {
                let start_pos = self.source.position();
@@ -60,7 +60,7 @@ impl<'a> Lexer<'a> {
            return;
        }

-        let value = match self.peek() {
+        let value = match self.peek_char() {
            Some('{') => self.unicode_code_point(),
            _ => self.surrogate_pair(),
        };
@@ -109,7 +109,7 @@ impl<'a> Lexer<'a> {
        text: &mut String<'a>,
        is_valid_escape_sequence: &mut bool,
    ) {
-        let value = match self.peek() {
+        let value = match self.peek_char() {
            Some('{') => self.unicode_code_point(),
            _ => self.surrogate_pair(),
        };
@@ -160,10 +160,10 @@ impl<'a> Lexer<'a> {
    }

    fn hex_digit(&mut self) -> Option<u32> {
-        let value = match self.peek() {
-            Some(c @ '0'..='9') => c as u32 - '0' as u32,
-            Some(c @ 'a'..='f') => 10 + (c as u32 - 'a' as u32),
-            Some(c @ 'A'..='F') => 10 + (c as u32 - 'A' as u32),
+        let value = match self.peek_byte() {
+            Some(c @ b'0'..=b'9') => u32::from(c) - '0' as u32,
+            Some(c @ b'a'..=b'f') => 10 + (u32::from(c) - 'a' as u32),
+            Some(c @ b'A'..=b'F') => 10 + (u32::from(c) - 'A' as u32),
            _ => return None,
        };
        self.consume_char();
@@ -188,9 +188,8 @@ impl<'a> Lexer<'a> {
    fn surrogate_pair(&mut self) -> Option<SurrogatePair> {
        let high = self.hex_4_digits()?;
        // The first code unit of a surrogate pair is always in the range from 0xD800 to 0xDBFF, and is called a high surrogate or a lead surrogate.
-        let is_pair = (0xD800..=0xDBFF).contains(&high)
-            && self.peek() == Some('\\')
-            && self.peek2() == Some('u');
+        let is_pair =
+            (0xD800..=0xDBFF).contains(&high) && self.peek_2_bytes() == Some([b'\\', b'u']);
        if !is_pair {
            return Some(SurrogatePair::CodePoint(high));
        }
@@ -266,7 +265,7 @@ impl<'a> Lexer<'a> {
                    self.string_unicode_escape_sequence(text, is_valid_escape_sequence);
                }
                // 0 [lookahead ∉ DecimalDigit]
-                '0' if !self.peek().is_some_and(|c| c.is_ascii_digit()) => text.push('\0'),
+                '0' if !self.peek_byte().is_some_and(|b| b.is_ascii_digit()) => text.push('\0'),
                // Section 12.9.4 String Literals
                // LegacyOctalEscapeSequence
                // NonOctalDecimalEscapeSequence
@@ -275,16 +274,16 @@ impl<'a> Lexer<'a> {
                    num.push(a);
                    match a {
                        '4'..='7' => {
-                            if matches!(self.peek(), Some('0'..='7')) {
+                            if matches!(self.peek_byte(), Some(b'0'..=b'7')) {
                                let b = self.consume_char();
                                num.push(b);
                            }
                        }
                        '0'..='3' => {
-                            if matches!(self.peek(), Some('0'..='7')) {
+                            if matches!(self.peek_byte(), Some(b'0'..=b'7')) {
                                let b = self.consume_char();
                                num.push(b);
-                                if matches!(self.peek(), Some('0'..='7')) {
+                                if matches!(self.peek_byte(), Some(b'0'..=b'7')) {
                                    let c = self.consume_char();
                                    num.push(c);
                                }
@@ -297,7 +296,7 @@ impl<'a> Lexer<'a> {
                        char::from_u32(u32::from_str_radix(num.as_str(), 8).unwrap()).unwrap();
                    text.push(value);
                }
-                '0' if in_template && self.peek().is_some_and(|c| c.is_ascii_digit()) => {
+                '0' if in_template && self.peek_byte().is_some_and(|b| b.is_ascii_digit()) => {
                    self.consume_char();
                    // error raised within the parser by `diagnostics::template_literal`
                    *is_valid_escape_sequence = false;