perf(lexer): remove an extra branch from peek (#841)

2026-05-24 12:21:58 +00:00 · 2023-09-03 00:02:42 +08:00 · 2023-09-03 00:02:42 +08:00 · d25355c9e8
commit d25355c9e8
parent 53b094e46c
1 changed file with 73 additions and 75 deletions
--- a/crates/oxc_parser/src/lexer/mod.rs
+++ b/crates/oxc_parser/src/lexer/mod.rs
@ -20,7 +20,7 @@ use oxc_span::{SourceType, Span};
 use oxc_syntax::{
    identifier::{
        is_identifier_part, is_identifier_start_all, is_irregular_line_terminator,
-        is_irregular_whitespace, is_line_terminator, CR, EOF, FF, LF, LS, PS, TAB, VT,
+        is_irregular_whitespace, is_line_terminator, CR, FF, LF, LS, PS, TAB, VT,
    },
    unicode_id_start::is_id_start_unicode,
 };
@ -84,7 +84,7 @@ impl<'a> Lexer<'a> {
            source_type,
            current,
            errors: vec![],
-            lookahead: VecDeque::with_capacity(4),
+            lookahead: VecDeque::with_capacity(4), // 4 is the maximum lookahead for TypeScript
            context: LexerContext::Regular,
            trivia_builder: TriviaBuilder::default(),
        }
@ -254,21 +254,22 @@ impl<'a> Lexer<'a> {

    /// Peek the next char without advancing the position
    #[inline]
-    fn peek(&self) -> char {
-        self.current.chars.clone().next().unwrap_or(EOF)
+    fn peek(&self) -> Option<char> {
+        self.current.chars.clone().next()
    }

    /// Peek the char after the next one (one past `peek`) without advancing the position
-    fn peek2(&self) -> char {
+    #[inline]
+    fn peek2(&self) -> Option<char> {
        let mut chars = self.current.chars.clone();
        chars.next();
-        chars.next().unwrap_or(EOF)
+        chars.next()
    }

    /// Peek the next character, and advance the current position if it matches
    #[inline]
    fn next_eq(&mut self, c: char) -> bool {
-        let matched = self.peek() == c;
+        let matched = self.peek() == Some(c);
        if matched {
            self.current.chars.next();
        }
@ -282,11 +283,10 @@ impl<'a> Lexer<'a> {

    /// Report an `InvalidCharacter` error, or `UnexpectedEnd` if at EOF
    fn unexpected_err(&mut self) {
-        let c = self.peek();
-        if c == EOF {
-            self.error(diagnostics::UnexpectedEnd(self.current_offset()));
-        } else {
-            self.error(diagnostics::InvalidCharacter(c, self.current_offset()));
+        let offset = self.current_offset();
+        match self.peek() {
+            Some(c) => self.error(diagnostics::InvalidCharacter(c, offset)),
+            None => self.error(diagnostics::UnexpectedEnd(offset)),
        }
    }

@ -419,8 +419,7 @@ impl<'a> Lexer<'a> {
    /// Section 12.6.1 Identifier Names
    fn identifier_tail(&mut self, mut builder: AutoCow<'a>) -> (bool, &'a str) {
        // ident tail
-        loop {
-            let c = self.peek();
+        while let Some(c) = self.peek() {
            if !is_identifier_part(c) {
                if c == '\\' {
                    self.current.chars.next();
@ -453,12 +452,12 @@ impl<'a> Lexer<'a> {

    /// Section 12.7 Punctuators
    fn read_dot(&mut self, builder: &mut AutoCow<'a>) -> Kind {
-        if self.peek() == '.' && self.peek2() == '.' {
+        if self.peek() == Some('.') && self.peek2() == Some('.') {
            self.current.chars.next();
            self.current.chars.next();
            return Kind::Dot3;
        }
-        if self.peek().is_ascii_digit() {
+        if self.peek().is_some_and(|c| c.is_ascii_digit()) {
            builder.push_matching('.');
            self.decimal_literal_after_decimal_point(builder)
        } else {
@ -476,7 +475,7 @@ impl<'a> Lexer<'a> {
            }
        } else if self.next_eq('=') {
            Some(Kind::LtEq)
-        } else if self.peek() == '!'
+        } else if self.peek() == Some('!')
            // SingleLineHTMLOpenComment `<!--` in script mode
            && self.source_type.is_script()
            && self.remaining().starts_with("!--")
@ -557,25 +556,25 @@ impl<'a> Lexer<'a> {
    /// 12.8.3 Numeric Literals with `0` prefix
    fn read_zero(&mut self, builder: &mut AutoCow<'a>) -> Kind {
        match self.peek() {
-            'b' | 'B' => self.read_non_decimal(Kind::Binary, builder),
-            'o' | 'O' => self.read_non_decimal(Kind::Octal, builder),
-            'x' | 'X' => self.read_non_decimal(Kind::Hex, builder),
-            c @ ('e' | 'E') => {
+            Some('b' | 'B') => self.read_non_decimal(Kind::Binary, builder),
+            Some('o' | 'O') => self.read_non_decimal(Kind::Octal, builder),
+            Some('x' | 'X') => self.read_non_decimal(Kind::Hex, builder),
+            Some(c @ ('e' | 'E')) => {
                self.current.chars.next();
                builder.push_matching(c);
                self.read_decimal_exponent(builder)
            }
-            '.' => {
+            Some('.') => {
                self.current.chars.next();
                builder.push_matching('.');
                self.decimal_literal_after_decimal_point_after_digits(builder)
            }
-            'n' => {
+            Some('n') => {
                self.current.chars.next();
                builder.push_matching('n');
                self.check_after_numeric_literal(Kind::Decimal)
            }
-            n if n.is_ascii_digit() => self.read_legacy_octal(builder),
+            Some(n) if n.is_ascii_digit() => self.read_legacy_octal(builder),
            _ => self.check_after_numeric_literal(Kind::Decimal),
        }
    }
@ -584,7 +583,7 @@ impl<'a> Lexer<'a> {
        let c = self.current.chars.next().unwrap();
        builder.push_matching(c);

-        if kind.matches_number_char(self.peek()) {
+        if self.peek().is_some_and(|c| kind.matches_number_char(c)) {
            let c = self.current.chars.next().unwrap();
            builder.push_matching(c);
        } else {
@ -592,14 +591,13 @@ impl<'a> Lexer<'a> {
            return Kind::Undetermined;
        }

-        loop {
-            match self.peek() {
+        while let Some(c) = self.peek() {
+            match c {
                '_' => {
                    self.current.chars.next();
                    builder.force_allocation_without_current_ascii_char(self);
-                    let c = self.peek();
-                    if kind.matches_number_char(c) {
-                        self.current.chars.next();
+                    if self.peek().is_some_and(|c| kind.matches_number_char(c)) {
+                        let c = self.current.chars.next().unwrap();
                        builder.push_matching(c);
                    } else {
                        self.unexpected_err();
@ -613,7 +611,7 @@ impl<'a> Lexer<'a> {
                _ => break,
            }
        }
-        if self.peek() == 'n' {
+        if self.peek() == Some('n') {
            self.current.chars.next();
            builder.push_matching('n');
        }
@ -624,10 +622,10 @@ impl<'a> Lexer<'a> {
        let mut kind = Kind::Octal;
        loop {
            match self.peek() {
-                '0'..='7' => {
+                Some('0'..='7') => {
                    self.current.chars.next();
                }
-                '8'..='9' => {
+                Some('8'..='9') => {
                    self.current.chars.next();
                    kind = Kind::Decimal;
                }
@ -637,13 +635,13 @@ impl<'a> Lexer<'a> {

        match self.peek() {
            // allow 08.5 and 09.5
-            '.' if kind == Kind::Decimal => {
+            Some('.') if kind == Kind::Decimal => {
                self.current.chars.next();
                builder.push_matching('.');
                self.decimal_literal_after_decimal_point_after_digits(builder)
            }
            // allow 08e1 and 09e1
-            'e' if kind == Kind::Decimal => {
+            Some('e') if kind == Kind::Decimal => {
                self.current.chars.next();
                builder.push_matching('e');
                self.read_decimal_exponent(builder)
@ -668,12 +666,12 @@ impl<'a> Lexer<'a> {

    fn read_decimal_exponent(&mut self, builder: &mut AutoCow<'a>) -> Kind {
        let kind = match self.peek() {
-            '-' => {
+            Some('-') => {
                self.current.chars.next();
                builder.push_matching('-');
                Kind::NegativeExponential
            }
-            '+' => {
+            Some('+') => {
                self.current.chars.next();
                builder.push_matching('+');
                Kind::PositiveExponential
@ -685,7 +683,7 @@ impl<'a> Lexer<'a> {
    }

    fn read_decimal_digits(&mut self, builder: &mut AutoCow<'a>) {
-        if self.peek().is_ascii_digit() {
+        if self.peek().is_some_and(|c| c.is_ascii_digit()) {
            let c = self.current.chars.next().unwrap();
            builder.push_matching(c);
        } else {
@ -697,12 +695,12 @@ impl<'a> Lexer<'a> {
    }

    fn read_decimal_digits_after_first_digit(&mut self, builder: &mut AutoCow<'a>) {
-        loop {
-            match self.peek() {
+        while let Some(c) = self.peek() {
+            match c {
                '_' => {
                    self.current.chars.next();
                    builder.force_allocation_without_current_ascii_char(self);
-                    if self.peek().is_ascii_digit() {
+                    if self.peek().is_some_and(|c| c.is_ascii_digit()) {
                        let c = self.current.chars.next().unwrap();
                        builder.push_matching(c);
                    } else {
@ -735,7 +733,7 @@ impl<'a> Lexer<'a> {
    }

    fn optional_decimal_digits(&mut self, builder: &mut AutoCow<'a>) {
-        if self.peek().is_ascii_digit() {
+        if self.peek().is_some_and(|c| c.is_ascii_digit()) {
            let c = self.current.chars.next().unwrap();
            builder.push_matching(c);
        } else {
@ -745,7 +743,7 @@ impl<'a> Lexer<'a> {
    }

    fn optional_exponent(&mut self, builder: &mut AutoCow<'a>) -> Option<Kind> {
-        if matches!(self.peek(), 'e' | 'E') {
+        if matches!(self.peek(), Some('e' | 'E')) {
            let c = self.current.chars.next().unwrap();
            builder.push_matching(c);
            return Some(self.read_decimal_exponent(builder));
@ -756,14 +754,13 @@ impl<'a> Lexer<'a> {
    fn check_after_numeric_literal(&mut self, kind: Kind) -> Kind {
        let offset = self.offset();
        // The SourceCharacter immediately following a NumericLiteral must not be an IdentifierStart or DecimalDigit.
-        let ch = self.peek();
-        if !ch.is_ascii_digit() && !is_identifier_start_all(ch) {
+        let c = self.peek();
+        if c.is_none() || c.is_some_and(|ch| !ch.is_ascii_digit() && !is_identifier_start_all(ch)) {
            return kind;
        }
        self.current.chars.next();
-        loop {
-            let c = self.peek();
-            if c != EOF && is_identifier_start_all(c) {
+        while let Some(c) = self.peek() {
+            if is_identifier_start_all(c) {
                self.current.chars.next();
            } else {
                break;
@ -843,7 +840,7 @@ impl<'a> Lexer<'a> {

        let mut flags = RegExpFlags::empty();

-        while let ch @ ('$' | '_' | 'a'..='z' | 'A'..='Z' | '0'..='9') = self.peek() {
+        while let Some(ch @ ('$' | '_' | 'a'..='z' | 'A'..='Z' | '0'..='9')) = self.peek() {
            self.current.chars.next();
            if !ch.is_ascii_lowercase() {
                self.error(diagnostics::RegExpFlag(ch, self.current_offset()));
@ -881,7 +878,7 @@ impl<'a> Lexer<'a> {
        let mut is_valid_escape_sequence = true;
        while let Some(c) = self.current.chars.next() {
            match c {
-                '$' if self.peek() == '{' => {
+                '$' if self.peek() == Some('{') => {
                    if is_valid_escape_sequence {
                        self.current.token.value =
                            self.string_to_token_value(builder.finish_without_push(self));
@ -921,15 +918,13 @@ impl<'a> Lexer<'a> {
        let prev_str = &self.source[start_offset as usize..self.offset() as usize];

        let mut builder = AutoCow::new(self);
-        loop {
-            let c = self.peek();
+        while let Some(c) = self.peek() {
            if c == '-' || is_identifier_start_all(c) {
                self.current.chars.next();
                builder.push_matching(c);
-                loop {
-                    let c = self.peek();
+                while let Some(c) = self.peek() {
                    if is_identifier_part(c) {
-                        self.current.chars.next();
+                        let c = self.current.chars.next().unwrap();
                        builder.push_matching(c);
                    } else {
                        break;
@ -953,22 +948,21 @@ impl<'a> Lexer<'a> {
    /// { `JSXChildExpressionopt` }
    fn read_jsx_child(&mut self) -> Kind {
        match self.peek() {
-            '<' => {
+            Some('<') => {
                self.current.chars.next();
                Kind::LAngle
            }
-            '{' => {
+            Some('{') => {
                self.current.chars.next();
                Kind::LCurly
            }
-            EOF => Kind::Eof,
-            c => {
+            Some(c) => {
                let mut builder = AutoCow::new(self);
                builder.push_matching(c);
                loop {
                    // `>` and `}` are errors in TypeScript but not Babel
                    // let's make this less strict so we can parse more code
-                    if matches!(self.peek(), '{' | '<') {
+                    if matches!(self.peek(), Some('{' | '<')) {
                        break;
                    }
                    if let Some(c) = self.current.chars.next() {
@ -980,6 +974,7 @@ impl<'a> Lexer<'a> {
                self.current.token.value = self.string_to_token_value(builder.finish(self));
                Kind::JSXText
            }
+            None => Kind::Eof,
        }
    }

@ -1034,7 +1029,7 @@ impl<'a> Lexer<'a> {
        }

        let value = match self.peek() {
-            '{' => self.unicode_code_point(),
+            Some('{') => self.unicode_code_point(),
            _ => self.surrogate_pair(),
        };

@ -1086,7 +1081,7 @@ impl<'a> Lexer<'a> {
        is_valid_escape_sequence: &mut bool,
    ) {
        let value = match self.peek() {
-            '{' => self.unicode_code_point(),
+            Some('{') => self.unicode_code_point(),
            _ => self.surrogate_pair(),
        };

@ -1137,9 +1132,9 @@ impl<'a> Lexer<'a> {

    fn hex_digit(&mut self) -> Option<u32> {
        let value = match self.peek() {
-            c @ '0'..='9' => c as u32 - '0' as u32,
-            c @ 'a'..='f' => 10 + (c as u32 - 'a' as u32),
-            c @ 'A'..='F' => 10 + (c as u32 - 'A' as u32),
+            Some(c @ '0'..='9') => c as u32 - '0' as u32,
+            Some(c @ 'a'..='f') => 10 + (c as u32 - 'a' as u32),
+            Some(c @ 'A'..='F') => 10 + (c as u32 - 'A' as u32),
            _ => return None,
        };
        self.current.chars.next();
@ -1164,7 +1159,10 @@ impl<'a> Lexer<'a> {
    fn surrogate_pair(&mut self) -> Option<SurrogatePair> {
        let high = self.hex_4_digits()?;
        // The first code unit of a surrogate pair is always in the range from 0xD800 to 0xDBFF, and is called a high surrogate or a lead surrogate.
-        if !((0xD800..=0xDBFF).contains(&high) && self.peek() == '\\' && self.peek2() == 'u') {
+        if !((0xD800..=0xDBFF).contains(&high)
+            && self.peek() == Some('\\')
+            && self.peek2() == Some('u'))
+        {
            return Some(SurrogatePair::CodePoint(high));
        }

@ -1231,7 +1229,7 @@ impl<'a> Lexer<'a> {
                    self.string_unicode_escape_sequence(text, is_valid_escape_sequence);
                }
                // 0 [lookahead ∉ DecimalDigit]
-                '0' if !self.peek().is_ascii_digit() => text.push('\0'),
+                '0' if !self.peek().is_some_and(|c| c.is_ascii_digit()) => text.push('\0'),
                // Section 12.8.4 String Literals
                // LegacyOctalEscapeSequence
                // NonOctalDecimalEscapeSequence
@ -1240,16 +1238,16 @@ impl<'a> Lexer<'a> {
                    num.push(a);
                    match a {
                        '4'..='7' => {
-                            if matches!(self.peek(), '0'..='7') {
+                            if matches!(self.peek(), Some('0'..='7')) {
                                let b = self.current.chars.next().unwrap();
                                num.push(b);
                            }
                        }
                        '0'..='3' => {
-                            if matches!(self.peek(), '0'..='7') {
+                            if matches!(self.peek(), Some('0'..='7')) {
                                let b = self.current.chars.next().unwrap();
                                num.push(b);
-                                if matches!(self.peek(), '0'..='7') {
+                                if matches!(self.peek(), Some('0'..='7')) {
                                    let c = self.current.chars.next().unwrap();
                                    num.push(c);
                                }
@ -1262,7 +1260,7 @@ impl<'a> Lexer<'a> {
                        char::from_u32(u32::from_str_radix(num.as_str(), 8).unwrap()).unwrap();
                    text.push(value);
                }
-                '0' if in_template && self.peek().is_ascii_digit() => {
+                '0' if in_template && self.peek().is_some_and(|c| c.is_ascii_digit()) => {
                    self.current.chars.next();
                    // error raised within the parser by `diagnostics::TemplateLiteral`
                    *is_valid_escape_sequence = false;
@ -1464,11 +1462,11 @@ const PRD: ByteHandler = |lexer| {
 const SLH: ByteHandler = |lexer| {
    lexer.consume_char();
    match lexer.peek() {
-        '/' => {
+        Some('/') => {
            lexer.current.chars.next();
            lexer.skip_single_line_comment()
        }
-        '*' => {
+        Some('*') => {
            lexer.current.chars.next();
            lexer.skip_multi_line_comment()
        }
@ -1553,9 +1551,9 @@ const QST: ByteHandler = |lexer| {
        } else {
            Kind::Question2
        }
-    } else if lexer.peek() == '.' {
+    } else if lexer.peek() == Some('.') {
        // parse `?.1` as `?` `.1`
-        if lexer.peek2().is_ascii_digit() {
+        if lexer.peek2().is_some_and(|c| c.is_ascii_digit()) {
            Kind::Question
        } else {
            lexer.current.chars.next();