From 94fdd54368052bdd6660f5ab7766755feab9a7b5 Mon Sep 17 00:00:00 2001 From: Boshen Date: Wed, 8 Mar 2023 06:00:15 -0800 Subject: [PATCH] perf(lexer): fine tune identifier checking with less redundant branches (#154) perf(lexer): fine tune checking of identifiers with less redundant branches --- Cargo.lock | 4 +-- crates/oxc_parser/Cargo.toml | 2 +- crates/oxc_parser/src/lexer/constants.rs | 41 +++++++++++++++++++++--- crates/oxc_parser/src/lexer/mod.rs | 34 +++++++++++--------- tasks/coverage/babel | 2 +- tasks/coverage/printer.snap | 4 +-- tasks/coverage/test262 | 2 +- tasks/coverage/test262.snap | 4 +-- tasks/coverage/typescript | 2 +- tasks/coverage/typescript.snap | 4 +-- 10 files changed, 67 insertions(+), 32 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e01d8d16b..4c96ee884 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1471,9 +1471,9 @@ checksum = "d54675592c1dbefd78cbd98db9bacd89886e1ca50692a0692baefffdeb92dd58" [[package]] name = "unicode-id-start" -version = "1.0.3" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "238a3d5702128479aa8f25de86d12dde3ef71859109b6c1be6ce62dd4e76b160" +checksum = "e15598ae4ac01f33d64525526e028beb497b216015dd768c44187a62b44cbce8" [[package]] name = "unicode-ident" diff --git a/crates/oxc_parser/Cargo.toml b/crates/oxc_parser/Cargo.toml index 1d5ba1917..087c71847 100644 --- a/crates/oxc_parser/Cargo.toml +++ b/crates/oxc_parser/Cargo.toml @@ -17,7 +17,7 @@ oxc_diagnostics = { path = "../oxc_diagnostics" } bitflags = { workspace = true } rustc-hash = { workspace = true } -unicode-id-start = "1.0.3" +unicode-id-start = "1.1.0" num-bigint = "0.4.3" [dev-dependencies] diff --git a/crates/oxc_parser/src/lexer/constants.rs b/crates/oxc_parser/src/lexer/constants.rs index ad4af0f4c..3f1ed2c14 100644 --- a/crates/oxc_parser/src/lexer/constants.rs +++ b/crates/oxc_parser/src/lexer/constants.rs @@ -1,5 +1,3 @@ -use unicode_id_start::{is_id_continue, is_id_start}; - use super::Kind; 
pub const EOF: char = '\0'; @@ -65,17 +63,50 @@ pub fn is_line_terminator(c: char) -> bool { is_regular_line_terminator(c) || is_irregular_line_terminator(c) } +const T: bool = true; +const F: bool = false; + +#[repr(C, align(64))] +pub struct Align64<T>(pub(crate) T); + +// This contains `$` (36) and `_` (95) +pub const ASCII_START: Align64<[bool; 128]> = Align64([ + F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + F, F, F, F, T, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, T, + F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, F, +]); + +// This contains `$` (36) +pub const ASCII_CONTINUE: Align64<[bool; 128]> = Align64([ + F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + F, F, F, F, T, F, F, F, F, F, F, F, F, F, F, F, T, T, T, T, T, T, T, T, T, T, F, F, F, F, F, F, + F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, T, + F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, F, +]); + +#[inline] +pub fn is_identifier_start_ascii(c: char) -> bool { + ASCII_START.0[c as usize] +} + /// Section 12.6 Detect `IdentifierStartChar` #[inline] -pub fn is_identifier_start(c: char) -> bool { - c == '$' || c == '_' || is_id_start(c) +pub fn is_identifier_start_all(c: char) -> bool { + if c.is_ascii() { + return is_identifier_start_ascii(c); + } + unicode_id_start::is_id_start_unicode(c) } /// Section 12.6 Detect `IdentifierPartChar` /// NOTE 2: The nonterminal `IdentifierPart` derives _ via `UnicodeIDContinue`. 
#[inline] pub fn is_identifier_part(c: char) -> bool { - c == '$' || is_id_continue(c) || c == ZWNJ || c == ZWJ + if c.is_ascii() { + return ASCII_CONTINUE.0[c as usize]; + } + unicode_id_start::is_id_continue_unicode(c) || c == ZWNJ || c == ZWJ } pub const SINGLE_CHAR_TOKENS: &[Kind; 128] = &[ diff --git a/crates/oxc_parser/src/lexer/mod.rs b/crates/oxc_parser/src/lexer/mod.rs index 60e8d7ef0..2f2224eac 100644 --- a/crates/oxc_parser/src/lexer/mod.rs +++ b/crates/oxc_parser/src/lexer/mod.rs @@ -24,8 +24,9 @@ pub use token::{RegExp, Token, TokenValue}; pub use self::kind::Kind; use self::{ constants::{ - is_identifier_part, is_identifier_start, is_irregular_line_terminator, - is_irregular_whitespace, is_line_terminator, EOF, SINGLE_CHAR_TOKENS, + is_identifier_part, is_identifier_start_all, is_identifier_start_ascii, + is_irregular_line_terminator, is_irregular_whitespace, is_line_terminator, EOF, + SINGLE_CHAR_TOKENS, }, number::{parse_big_int, parse_float, parse_int}, string_builder::AutoCow, @@ -366,22 +367,22 @@ impl<'a> Lexer<'a> { // fast path for single character tokens // '{' '}' '(' ')' '[' ']' ';' ',' ':' '~' let size = c as usize; - if size <= 127 { + if size < 128 { let kind = SINGLE_CHAR_TOKENS[size]; if kind != Kind::Undetermined { return kind; } + // fast path for identifiers + if is_identifier_start_ascii(c) { + builder.push_matching(c); + return self.identifier_name_or_keyword(builder); + } } // NOTE: matching order is significant here, by real world occurrences // see https://blog.mozilla.org/nnethercote/2011/07/01/faster-javascript-parsing/ // > the rough order of frequency for different token kinds is as follows: // identifiers/keywords, ‘.’, ‘=’, strings, decimal numbers, ‘:’, ‘+’, hex/octal numbers, and then everything else match c { - // fast path for identifiers - c if c.is_ascii_alphabetic() => { - builder.push_matching(c); - self.identifier_name_or_keyword(builder) - } '.' 
=> { let kind = self.read_dot(&mut builder); if kind.is_number() { @@ -452,7 +453,7 @@ impl<'a> Lexer<'a> { self.identifier_unicode_escape_sequence(&mut builder, true); self.identifier_name_or_keyword(builder) } - c if is_identifier_start(c) => { + c if unicode_id_start::is_id_start_unicode(c) => { builder.push_matching(c); self.identifier_name_or_keyword(builder) } @@ -718,7 +719,7 @@ impl<'a> Lexer<'a> { fn private_identifier(&mut self, mut builder: AutoCow<'a>) -> Kind { let start = self.offset(); match self.current.chars.next() { - Some(c) if is_identifier_start(c) => { + Some(c) if is_identifier_start_all(c) => { builder.push_matching(c); } Some('\\') => { @@ -935,13 +936,13 @@ impl<'a> Lexer<'a> { let offset = self.offset(); // The SourceCharacter immediately following a NumericLiteral must not be an IdentifierStart or DecimalDigit. let ch = self.peek(); - if !ch.is_ascii_digit() && !is_identifier_start(ch) { + if !ch.is_ascii_digit() && !is_identifier_start_all(ch) { return kind; } self.current.chars.next(); loop { let c = self.peek(); - if c != EOF && is_identifier_start(c) { + if c != EOF && is_identifier_start_all(c) { self.current.chars.next(); } else { break; @@ -1106,7 +1107,7 @@ impl<'a> Lexer<'a> { let mut builder = AutoCow::new(self); loop { let c = self.peek(); - if c == '-' || is_identifier_start(c) { + if c == '-' || is_identifier_start_all(c) { self.current.chars.next(); builder.push_matching(c); loop { @@ -1239,8 +1240,11 @@ impl<'a> Lexer<'a> { } }; - let is_valid = - if check_identifier_start { is_identifier_start(ch) } else { is_identifier_part(ch) }; + let is_valid = if check_identifier_start { + is_identifier_start_all(ch) + } else { + is_identifier_part(ch) + }; if !is_valid { self.error(diagnostics::InvalidCharacter(ch, self.current_offset())); diff --git a/tasks/coverage/babel b/tasks/coverage/babel index c38bf12f0..a547f8724 160000 --- a/tasks/coverage/babel +++ b/tasks/coverage/babel @@ -1 +1 @@ -Subproject commit 
c38bf12f010520ea7abe8a286f62922b2d1e1f1b +Subproject commit a547f8724a5c6b4395b8a8f597e3edd44de74bf3 diff --git a/tasks/coverage/printer.snap b/tasks/coverage/printer.snap index 5ee96dd63..c36eb74ad 100644 --- a/tasks/coverage/printer.snap +++ b/tasks/coverage/printer.snap @@ -1,3 +1,3 @@ Printer Summary: -AST Parsed : 44488/44488 (100.00%) -Positive Passed: 44488/44488 (100.00%) +AST Parsed : 44494/44494 (100.00%) +Positive Passed: 44494/44494 (100.00%) diff --git a/tasks/coverage/test262 b/tasks/coverage/test262 index d216cc197..53e5ef817 160000 --- a/tasks/coverage/test262 +++ b/tasks/coverage/test262 @@ -1 +1 @@ -Subproject commit d216cc197269fc41eb6eca14710529c3d6650535 +Subproject commit 53e5ef817eb212d0d4f6f0ab44275094e5bf876d diff --git a/tasks/coverage/test262.snap b/tasks/coverage/test262.snap index cb85b5324..86d1ca3d8 100644 --- a/tasks/coverage/test262.snap +++ b/tasks/coverage/test262.snap @@ -1,6 +1,6 @@ Test262 Summary: -AST Parsed : 43994/44003 (99.98%) -Positive Passed: 43994/44003 (99.98%) +AST Parsed : 44000/44009 (99.98%) +Positive Passed: 44000/44009 (99.98%) Negative Passed: 1934/3917 (49.37%) Expect Syntax Error: "annexB/language/expressions/template-literal/legacy-octal-escape-sequence-strict.js" Expect Syntax Error: "annexB/language/statements/for-in/const-initializer.js" diff --git a/tasks/coverage/typescript b/tasks/coverage/typescript index 8f40d5633..746a6feb2 160000 --- a/tasks/coverage/typescript +++ b/tasks/coverage/typescript @@ -1 +1 @@ -Subproject commit 8f40d5633fc36df04b4fd4392e3877558149987f +Subproject commit 746a6feb2e7ba6987b6c72db538dd498b35cd461 diff --git a/tasks/coverage/typescript.snap b/tasks/coverage/typescript.snap index 2c77ae076..14741c022 100644 --- a/tasks/coverage/typescript.snap +++ b/tasks/coverage/typescript.snap @@ -1,6 +1,6 @@ TypeScript Summary: -AST Parsed : 4327/4867 (88.90%) -Positive Passed: 4327/4867 (88.90%) +AST Parsed : 4329/4869 (88.91%) +Positive Passed: 4329/4869 (88.91%) Expect to Parse: 
"async/es2017/asyncArrowFunction/asyncArrowFunction6_es2017.ts" × Automatic Semicolon Insertion