perf(lexer): fine tune identifier checking with less redundant branches (#154)

perf(lexer): fine tune checking of identifiers with less redundant branches
2026-05-24 12:21:58 +00:00 · 2023-03-08 06:00:15 -08:00 · 2023-03-08 06:00:15 -08:00 · 94fdd54368
commit 94fdd54368
parent be2231b689
10 changed files with 67 additions and 32 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -1471,9 +1471,9 @@ checksum = "d54675592c1dbefd78cbd98db9bacd89886e1ca50692a0692baefffdeb92dd58"

 [[package]]
 name = "unicode-id-start"
-version = "1.0.3"
+version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "238a3d5702128479aa8f25de86d12dde3ef71859109b6c1be6ce62dd4e76b160"
+checksum = "e15598ae4ac01f33d64525526e028beb497b216015dd768c44187a62b44cbce8"

 [[package]]
 name = "unicode-ident"
--- a/crates/oxc_parser/Cargo.toml
+++ b/crates/oxc_parser/Cargo.toml
@ -17,7 +17,7 @@ oxc_diagnostics = { path = "../oxc_diagnostics" }
 bitflags = { workspace = true }
 rustc-hash = { workspace = true }

-unicode-id-start = "1.0.3"
+unicode-id-start = "1.1.0"
 num-bigint = "0.4.3"

 [dev-dependencies]
--- a/crates/oxc_parser/src/lexer/constants.rs
+++ b/crates/oxc_parser/src/lexer/constants.rs
@ -1,5 +1,3 @@
-use unicode_id_start::{is_id_continue, is_id_start};
-
 use super::Kind;

 pub const EOF: char = '\0';
@ -65,17 +63,50 @@ pub fn is_line_terminator(c: char) -> bool {
    is_regular_line_terminator(c) || is_irregular_line_terminator(c)
 }

+const T: bool = true;
+const F: bool = false;
+
+#[repr(C, align(64))]
+pub struct Align64<T>(pub(crate) T);
+
+// This contains `$` (36) and `_` (95)
+pub const ASCII_START: Align64<[bool; 128]> = Align64([
+    F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    F, F, F, F, T, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, T,
+    F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, F,
+]);
+
+// This contains `$` (36), the digits `0`-`9` (48-57) and `_` (95)
+pub const ASCII_CONTINUE: Align64<[bool; 128]> = Align64([
+    F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    F, F, F, F, T, F, F, F, F, F, F, F, F, F, F, F, T, T, T, T, T, T, T, T, T, T, F, F, F, F, F, F,
+    F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, T,
+    F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, F,
+]);
+
+#[inline]
+pub fn is_identifier_start_ascii(c: char) -> bool {
+    ASCII_START.0[c as usize]
+}
+
 /// Section 12.6 Detect `IdentifierStartChar`
 #[inline]
-pub fn is_identifier_start(c: char) -> bool {
-    c == '$' || c == '_' || is_id_start(c)
+pub fn is_identifier_start_all(c: char) -> bool {
+    if c.is_ascii() {
+        return is_identifier_start_ascii(c);
+    }
+    unicode_id_start::is_id_start_unicode(c)
 }

 /// Section 12.6 Detect `IdentifierPartChar`
 /// NOTE 2: The nonterminal `IdentifierPart` derives _ via `UnicodeIDContinue`.
 #[inline]
 pub fn is_identifier_part(c: char) -> bool {
-    c == '$' || is_id_continue(c) || c == ZWNJ || c == ZWJ
+    if c.is_ascii() {
+        return ASCII_CONTINUE.0[c as usize];
+    }
+    unicode_id_start::is_id_continue_unicode(c) || c == ZWNJ || c == ZWJ
 }

 pub const SINGLE_CHAR_TOKENS: &[Kind; 128] = &[
--- a/crates/oxc_parser/src/lexer/mod.rs
+++ b/crates/oxc_parser/src/lexer/mod.rs
@ -24,8 +24,9 @@ pub use token::{RegExp, Token, TokenValue};
 pub use self::kind::Kind;
 use self::{
    constants::{
-        is_identifier_part, is_identifier_start, is_irregular_line_terminator,
-        is_irregular_whitespace, is_line_terminator, EOF, SINGLE_CHAR_TOKENS,
+        is_identifier_part, is_identifier_start_all, is_identifier_start_ascii,
+        is_irregular_line_terminator, is_irregular_whitespace, is_line_terminator, EOF,
+        SINGLE_CHAR_TOKENS,
    },
    number::{parse_big_int, parse_float, parse_int},
    string_builder::AutoCow,
@ -366,22 +367,22 @@ impl<'a> Lexer<'a> {
        // fast path for single character tokens
        // '{'  '}'  '('  ')'  '['  ']'  ';' ',' ':' '~'
        let size = c as usize;
-        if size <= 127 {
+        if size < 128 {
            let kind = SINGLE_CHAR_TOKENS[size];
            if kind != Kind::Undetermined {
                return kind;
            }
+            // fast path for identifiers
+            if is_identifier_start_ascii(c) {
+                builder.push_matching(c);
+                return self.identifier_name_or_keyword(builder);
+            }
        }
        // NOTE: matching order is significant here, by real world occurrences
        // see https://blog.mozilla.org/nnethercote/2011/07/01/faster-javascript-parsing/
        // > the rough order of frequency for different token kinds is as follows:
        // identifiers/keywords, ‘.’, ‘=’, strings, decimal numbers, ‘:’, ‘+’, hex/octal numbers, and then everything else
        match c {
-            // fast path for identifiers
-            c if c.is_ascii_alphabetic() => {
-                builder.push_matching(c);
-                self.identifier_name_or_keyword(builder)
-            }
            '.' => {
                let kind = self.read_dot(&mut builder);
                if kind.is_number() {
@ -452,7 +453,7 @@ impl<'a> Lexer<'a> {
                self.identifier_unicode_escape_sequence(&mut builder, true);
                self.identifier_name_or_keyword(builder)
            }
-            c if is_identifier_start(c) => {
+            c if unicode_id_start::is_id_start_unicode(c) => {
                builder.push_matching(c);
                self.identifier_name_or_keyword(builder)
            }
@ -718,7 +719,7 @@ impl<'a> Lexer<'a> {
    fn private_identifier(&mut self, mut builder: AutoCow<'a>) -> Kind {
        let start = self.offset();
        match self.current.chars.next() {
-            Some(c) if is_identifier_start(c) => {
+            Some(c) if is_identifier_start_all(c) => {
                builder.push_matching(c);
            }
            Some('\\') => {
@ -935,13 +936,13 @@ impl<'a> Lexer<'a> {
        let offset = self.offset();
        // The SourceCharacter immediately following a NumericLiteral must not be an IdentifierStart or DecimalDigit.
        let ch = self.peek();
-        if !ch.is_ascii_digit() && !is_identifier_start(ch) {
+        if !ch.is_ascii_digit() && !is_identifier_start_all(ch) {
            return kind;
        }
        self.current.chars.next();
        loop {
            let c = self.peek();
-            if c != EOF && is_identifier_start(c) {
+            if c != EOF && is_identifier_start_all(c) {
                self.current.chars.next();
            } else {
                break;
@ -1106,7 +1107,7 @@ impl<'a> Lexer<'a> {
        let mut builder = AutoCow::new(self);
        loop {
            let c = self.peek();
-            if c == '-' || is_identifier_start(c) {
+            if c == '-' || is_identifier_start_all(c) {
                self.current.chars.next();
                builder.push_matching(c);
                loop {
@ -1239,8 +1240,11 @@ impl<'a> Lexer<'a> {
            }
        };

-        let is_valid =
-            if check_identifier_start { is_identifier_start(ch) } else { is_identifier_part(ch) };
+        let is_valid = if check_identifier_start {
+            is_identifier_start_all(ch)
+        } else {
+            is_identifier_part(ch)
+        };

        if !is_valid {
            self.error(diagnostics::InvalidCharacter(ch, self.current_offset()));
--- a/tasks/coverage/babel
+++ b/tasks/coverage/babel
@ -1 +1 @@
-Subproject commit c38bf12f010520ea7abe8a286f62922b2d1e1f1b
+Subproject commit a547f8724a5c6b4395b8a8f597e3edd44de74bf3
--- a/tasks/coverage/printer.snap
+++ b/tasks/coverage/printer.snap
@ -1,3 +1,3 @@
 Printer Summary:
-AST Parsed     : 44488/44488 (100.00%)
-Positive Passed: 44488/44488 (100.00%)
+AST Parsed     : 44494/44494 (100.00%)
+Positive Passed: 44494/44494 (100.00%)
--- a/tasks/coverage/test262
+++ b/tasks/coverage/test262
@ -1 +1 @@
-Subproject commit d216cc197269fc41eb6eca14710529c3d6650535
+Subproject commit 53e5ef817eb212d0d4f6f0ab44275094e5bf876d
--- a/tasks/coverage/test262.snap
+++ b/tasks/coverage/test262.snap
@ -1,6 +1,6 @@
 Test262 Summary:
-AST Parsed     : 43994/44003 (99.98%)
-Positive Passed: 43994/44003 (99.98%)
+AST Parsed     : 44000/44009 (99.98%)
+Positive Passed: 44000/44009 (99.98%)
 Negative Passed: 1934/3917 (49.37%)
 Expect Syntax Error: "annexB/language/expressions/template-literal/legacy-octal-escape-sequence-strict.js"
 Expect Syntax Error: "annexB/language/statements/for-in/const-initializer.js"
--- a/tasks/coverage/typescript
+++ b/tasks/coverage/typescript
@ -1 +1 @@
-Subproject commit 8f40d5633fc36df04b4fd4392e3877558149987f
+Subproject commit 746a6feb2e7ba6987b6c72db538dd498b35cd461
--- a/tasks/coverage/typescript.snap
+++ b/tasks/coverage/typescript.snap
@ -1,6 +1,6 @@
 TypeScript Summary:
-AST Parsed     : 4327/4867 (88.90%)
-Positive Passed: 4327/4867 (88.90%)
+AST Parsed     : 4329/4869 (88.91%)
+Positive Passed: 4329/4869 (88.91%)
 Expect to Parse: "async/es2017/asyncArrowFunction/asyncArrowFunction6_es2017.ts"

  × Automatic Semicolon Insertion