perf(parser): lexer match byte not char (#2025)

Two related changes to the lexer's `read_next_token()`: 1. Hint to the branch predictor that Unicode identifiers and non-standard whitespace are rare by marking that branch `#[cold]`. 2. The branch is on whether the next character is ASCII or not. This check only requires reading 1 byte, as ASCII characters are always a single byte in UTF-8. So only do the work of getting a `char` in the cold path, once it has been established that the character is not ASCII and this work is required.
2026-05-24 12:21:58 +00:00 · 2024-01-14 10:50:11 +00:00 · 2024-01-14 10:50:11 +00:00 · 60a927d8f5
commit 60a927d8f5
parent a356918d83
1 changed files with 20 additions and 17 deletions
--- a/crates/oxc_parser/src/lexer/mod.rs
+++ b/crates/oxc_parser/src/lexer/mod.rs
@@ -374,28 +374,31 @@ impl<'a> Lexer<'a> {
            let offset = self.offset();
            self.current.token.start = offset;
-            if let Some(c) = self.current.chars.clone().next() {
+            let remaining = self.current.chars.as_str();
-                let kind = self.match_char(c);
+            if remaining.is_empty() {
                if !matches!(
                    kind,
                    Kind::WhiteSpace | Kind::NewLine | Kind::Comment | Kind::MultiLineComment
                ) {
                    return kind;
                }
            } else {
                return Kind::Eof;
            }
            let byte = remaining.as_bytes()[0];
            let kind = if byte < 128 {
                BYTE_HANDLERS[byte as usize](self)
            } else {
                self.match_unicode_char()
            };
            if !matches!(
                kind,
                Kind::WhiteSpace | Kind::NewLine | Kind::Comment | Kind::MultiLineComment
            ) {
                return kind;
            }
        }
    }
-    #[inline]
+    // `#[cold]` to hint to branch predictor that unicode identifiers and irregular whitespace are rare
-    fn match_char(&mut self, c: char) -> Kind {
+    #[cold]
-        let size = c as usize;
+    fn match_unicode_char(&mut self) -> Kind {
-
+        let c = self.current.chars.clone().next().unwrap();
        if size < 128 {
            return BYTE_HANDLERS[size](self);
        }
        match c {
            c if is_id_start_unicode(c) => {
                let mut builder = AutoCow::new(self);