From 94fdd54368052bdd6660f5ab7766755feab9a7b5 Mon Sep 17 00:00:00 2001
From: Boshen <boshenc@gmail.com>
Date: Wed, 8 Mar 2023 06:00:15 -0800
Subject: [PATCH] perf(lexer): fine tune identifier checking with less
 redundant branches (#154)

perf(lexer): fine tune checking of identifiers with less redundant branches
---
 Cargo.lock                               |  4 +--
 crates/oxc_parser/Cargo.toml             |  2 +-
 crates/oxc_parser/src/lexer/constants.rs | 41 +++++++++++++++++++++---
 crates/oxc_parser/src/lexer/mod.rs       | 34 +++++++++++---------
 tasks/coverage/babel                     |  2 +-
 tasks/coverage/printer.snap              |  4 +--
 tasks/coverage/test262                   |  2 +-
 tasks/coverage/test262.snap              |  4 +--
 tasks/coverage/typescript                |  2 +-
 tasks/coverage/typescript.snap           |  4 +--
 10 files changed, 67 insertions(+), 32 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index e01d8d16b..4c96ee884 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1471,9 +1471,9 @@ checksum = "d54675592c1dbefd78cbd98db9bacd89886e1ca50692a0692baefffdeb92dd58"
 
 [[package]]
 name = "unicode-id-start"
-version = "1.0.3"
+version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "238a3d5702128479aa8f25de86d12dde3ef71859109b6c1be6ce62dd4e76b160"
+checksum = "e15598ae4ac01f33d64525526e028beb497b216015dd768c44187a62b44cbce8"
 
 [[package]]
 name = "unicode-ident"
diff --git a/crates/oxc_parser/Cargo.toml b/crates/oxc_parser/Cargo.toml
index 1d5ba1917..087c71847 100644
--- a/crates/oxc_parser/Cargo.toml
+++ b/crates/oxc_parser/Cargo.toml
@@ -17,7 +17,7 @@ oxc_diagnostics = { path = "../oxc_diagnostics" }
 bitflags = { workspace = true }
 rustc-hash = { workspace = true }
 
-unicode-id-start = "1.0.3"
+unicode-id-start = "1.1.0"
 num-bigint = "0.4.3"
 
 [dev-dependencies]
diff --git a/crates/oxc_parser/src/lexer/constants.rs b/crates/oxc_parser/src/lexer/constants.rs
index ad4af0f4c..3f1ed2c14 100644
--- a/crates/oxc_parser/src/lexer/constants.rs
+++ b/crates/oxc_parser/src/lexer/constants.rs
@@ -1,5 +1,3 @@
-use unicode_id_start::{is_id_continue, is_id_start};
-
 use super::Kind;
 
 pub const EOF: char = '\0';
@@ -65,17 +63,50 @@ pub fn is_line_terminator(c: char) -> bool {
     is_regular_line_terminator(c) || is_irregular_line_terminator(c)
 }
 
+const T: bool = true;
+const F: bool = false;
+
+#[repr(C, align(64))]
+pub struct Align64<T>(pub(crate) T);
+
+// This contains `$` (36) and `_` (95)
+pub const ASCII_START: Align64<[bool; 128]> = Align64([
+    F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    F, F, F, F, T, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, T,
+    F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, F,
+]);
+
+// This contains `$` (36) and `_` (95)
+pub const ASCII_CONTINUE: Align64<[bool; 128]> = Align64([
+    F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    F, F, F, F, T, F, F, F, F, F, F, F, F, F, F, F, T, T, T, T, T, T, T, T, T, T, F, F, F, F, F, F,
+    F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, T,
+    F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, F,
+]);
+
+#[inline]
+pub fn is_identifier_start_ascii(c: char) -> bool {
+    ASCII_START.0[c as usize]
+}
+
 /// Section 12.6 Detect `IdentifierStartChar`
 #[inline]
-pub fn is_identifier_start(c: char) -> bool {
-    c == '$' || c == '_' || is_id_start(c)
+pub fn is_identifier_start_all(c: char) -> bool {
+    if c.is_ascii() {
+        return is_identifier_start_ascii(c);
+    }
+    unicode_id_start::is_id_start_unicode(c)
 }
 
 /// Section 12.6 Detect `IdentifierPartChar`
 /// NOTE 2: The nonterminal `IdentifierPart` derives _ via `UnicodeIDContinue`.
 #[inline]
 pub fn is_identifier_part(c: char) -> bool {
-    c == '$' || is_id_continue(c) || c == ZWNJ || c == ZWJ
+    if c.is_ascii() {
+        return ASCII_CONTINUE.0[c as usize];
+    }
+    unicode_id_start::is_id_continue_unicode(c) || c == ZWNJ || c == ZWJ
 }
 
 pub const SINGLE_CHAR_TOKENS: &[Kind; 128] = &[
diff --git a/crates/oxc_parser/src/lexer/mod.rs b/crates/oxc_parser/src/lexer/mod.rs
index 60e8d7ef0..2f2224eac 100644
--- a/crates/oxc_parser/src/lexer/mod.rs
+++ b/crates/oxc_parser/src/lexer/mod.rs
@@ -24,8 +24,9 @@ pub use token::{RegExp, Token, TokenValue};
 pub use self::kind::Kind;
 use self::{
     constants::{
-        is_identifier_part, is_identifier_start, is_irregular_line_terminator,
-        is_irregular_whitespace, is_line_terminator, EOF, SINGLE_CHAR_TOKENS,
+        is_identifier_part, is_identifier_start_all, is_identifier_start_ascii,
+        is_irregular_line_terminator, is_irregular_whitespace, is_line_terminator, EOF,
+        SINGLE_CHAR_TOKENS,
     },
     number::{parse_big_int, parse_float, parse_int},
     string_builder::AutoCow,
@@ -366,22 +367,22 @@ impl<'a> Lexer<'a> {
         // fast path for single character tokens
         // '{'  '}'  '('  ')'  '['  ']'  ';' ',' ':' '~'
         let size = c as usize;
-        if size <= 127 {
+        if size < 128 {
             let kind = SINGLE_CHAR_TOKENS[size];
             if kind != Kind::Undetermined {
                 return kind;
             }
+            // fast path for identifiers
+            if is_identifier_start_ascii(c) {
+                builder.push_matching(c);
+                return self.identifier_name_or_keyword(builder);
+            }
         }
         // NOTE: matching order is significant here, by real world occurrences
         // see https://blog.mozilla.org/nnethercote/2011/07/01/faster-javascript-parsing/
         // > the rough order of frequency for different token kinds is as follows:
         // identifiers/keywords, ‘.’, ‘=’, strings, decimal numbers, ‘:’, ‘+’, hex/octal numbers, and then everything else
         match c {
-            // fast path for identifiers
-            c if c.is_ascii_alphabetic() => {
-                builder.push_matching(c);
-                self.identifier_name_or_keyword(builder)
-            }
             '.' => {
                 let kind = self.read_dot(&mut builder);
                 if kind.is_number() {
@@ -452,7 +453,7 @@ impl<'a> Lexer<'a> {
                 self.identifier_unicode_escape_sequence(&mut builder, true);
                 self.identifier_name_or_keyword(builder)
             }
-            c if is_identifier_start(c) => {
+            c if unicode_id_start::is_id_start_unicode(c) => {
                 builder.push_matching(c);
                 self.identifier_name_or_keyword(builder)
             }
@@ -718,7 +719,7 @@ impl<'a> Lexer<'a> {
     fn private_identifier(&mut self, mut builder: AutoCow<'a>) -> Kind {
         let start = self.offset();
         match self.current.chars.next() {
-            Some(c) if is_identifier_start(c) => {
+            Some(c) if is_identifier_start_all(c) => {
                 builder.push_matching(c);
             }
             Some('\\') => {
@@ -935,13 +936,13 @@ impl<'a> Lexer<'a> {
         let offset = self.offset();
         // The SourceCharacter immediately following a NumericLiteral must not be an IdentifierStart or DecimalDigit.
         let ch = self.peek();
-        if !ch.is_ascii_digit() && !is_identifier_start(ch) {
+        if !ch.is_ascii_digit() && !is_identifier_start_all(ch) {
             return kind;
         }
         self.current.chars.next();
         loop {
             let c = self.peek();
-            if c != EOF && is_identifier_start(c) {
+            if c != EOF && is_identifier_start_all(c) {
                 self.current.chars.next();
             } else {
                 break;
@@ -1106,7 +1107,7 @@ impl<'a> Lexer<'a> {
         let mut builder = AutoCow::new(self);
         loop {
             let c = self.peek();
-            if c == '-' || is_identifier_start(c) {
+            if c == '-' || is_identifier_start_all(c) {
                 self.current.chars.next();
                 builder.push_matching(c);
                 loop {
@@ -1239,8 +1240,11 @@ impl<'a> Lexer<'a> {
             }
         };
 
-        let is_valid =
-            if check_identifier_start { is_identifier_start(ch) } else { is_identifier_part(ch) };
+        let is_valid = if check_identifier_start {
+            is_identifier_start_all(ch)
+        } else {
+            is_identifier_part(ch)
+        };
 
         if !is_valid {
             self.error(diagnostics::InvalidCharacter(ch, self.current_offset()));
diff --git a/tasks/coverage/babel b/tasks/coverage/babel
index c38bf12f0..a547f8724 160000
--- a/tasks/coverage/babel
+++ b/tasks/coverage/babel
@@ -1 +1 @@
-Subproject commit c38bf12f010520ea7abe8a286f62922b2d1e1f1b
+Subproject commit a547f8724a5c6b4395b8a8f597e3edd44de74bf3
diff --git a/tasks/coverage/printer.snap b/tasks/coverage/printer.snap
index 5ee96dd63..c36eb74ad 100644
--- a/tasks/coverage/printer.snap
+++ b/tasks/coverage/printer.snap
@@ -1,3 +1,3 @@
 Printer Summary:
-AST Parsed     : 44488/44488 (100.00%)
-Positive Passed: 44488/44488 (100.00%)
+AST Parsed     : 44494/44494 (100.00%)
+Positive Passed: 44494/44494 (100.00%)
diff --git a/tasks/coverage/test262 b/tasks/coverage/test262
index d216cc197..53e5ef817 160000
--- a/tasks/coverage/test262
+++ b/tasks/coverage/test262
@@ -1 +1 @@
-Subproject commit d216cc197269fc41eb6eca14710529c3d6650535
+Subproject commit 53e5ef817eb212d0d4f6f0ab44275094e5bf876d
diff --git a/tasks/coverage/test262.snap b/tasks/coverage/test262.snap
index cb85b5324..86d1ca3d8 100644
--- a/tasks/coverage/test262.snap
+++ b/tasks/coverage/test262.snap
@@ -1,6 +1,6 @@
 Test262 Summary:
-AST Parsed     : 43994/44003 (99.98%)
-Positive Passed: 43994/44003 (99.98%)
+AST Parsed     : 44000/44009 (99.98%)
+Positive Passed: 44000/44009 (99.98%)
 Negative Passed: 1934/3917 (49.37%)
 Expect Syntax Error: "annexB/language/expressions/template-literal/legacy-octal-escape-sequence-strict.js"
 Expect Syntax Error: "annexB/language/statements/for-in/const-initializer.js"
diff --git a/tasks/coverage/typescript b/tasks/coverage/typescript
index 8f40d5633..746a6feb2 160000
--- a/tasks/coverage/typescript
+++ b/tasks/coverage/typescript
@@ -1 +1 @@
-Subproject commit 8f40d5633fc36df04b4fd4392e3877558149987f
+Subproject commit 746a6feb2e7ba6987b6c72db538dd498b35cd461
diff --git a/tasks/coverage/typescript.snap b/tasks/coverage/typescript.snap
index 2c77ae076..14741c022 100644
--- a/tasks/coverage/typescript.snap
+++ b/tasks/coverage/typescript.snap
@@ -1,6 +1,6 @@
 TypeScript Summary:
-AST Parsed     : 4327/4867 (88.90%)
-Positive Passed: 4327/4867 (88.90%)
+AST Parsed     : 4329/4869 (88.91%)
+Positive Passed: 4329/4869 (88.91%)
 Expect to Parse: "async/es2017/asyncArrowFunction/asyncArrowFunction6_es2017.ts"
 
   × Automatic Semicolon Insertion