perf(lexer): fine-tune identifier checking with fewer redundant branches (#154)

perf(lexer): fine-tune checking of identifiers with fewer redundant branches
This commit is contained in:
Boshen 2023-03-08 06:00:15 -08:00 committed by GitHub
parent be2231b689
commit 94fdd54368
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 67 additions and 32 deletions

4
Cargo.lock generated
View file

@ -1471,9 +1471,9 @@ checksum = "d54675592c1dbefd78cbd98db9bacd89886e1ca50692a0692baefffdeb92dd58"
[[package]]
name = "unicode-id-start"
version = "1.0.3"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "238a3d5702128479aa8f25de86d12dde3ef71859109b6c1be6ce62dd4e76b160"
checksum = "e15598ae4ac01f33d64525526e028beb497b216015dd768c44187a62b44cbce8"
[[package]]
name = "unicode-ident"

View file

@ -17,7 +17,7 @@ oxc_diagnostics = { path = "../oxc_diagnostics" }
bitflags = { workspace = true }
rustc-hash = { workspace = true }
unicode-id-start = "1.0.3"
unicode-id-start = "1.1.0"
num-bigint = "0.4.3"
[dev-dependencies]

View file

@ -1,5 +1,3 @@
use unicode_id_start::{is_id_continue, is_id_start};
use super::Kind;
pub const EOF: char = '\0';
@ -65,17 +63,50 @@ pub fn is_line_terminator(c: char) -> bool {
is_regular_line_terminator(c) || is_irregular_line_terminator(c)
}
// Single-letter aliases that keep the 128-entry lookup tables below readable.
const T: bool = true;
const F: bool = false;

/// Wrapper that forces its contents onto a 64-byte alignment boundary.
///
/// NOTE(review): presumably chosen so each 128-byte table starts on a cache
/// line boundary — confirm against the target's cache line size.
#[repr(C, align(64))]
pub struct Align64<T>(pub(crate) T);

/// Lookup table indexed by ASCII code point (0..=127): `true` where the
/// character may start an identifier.
///
/// Marked entries: `$` (36), `A`-`Z` (65-90), `_` (95) and `a`-`z` (97-122).
pub const ASCII_START: Align64<[bool; 128]> = Align64([
    F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    F, F, F, F, T, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, T,
    F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, F,
]);

/// Lookup table indexed by ASCII code point (0..=127): `true` where the
/// character may continue an identifier.
///
/// Marked entries: `$` (36), `0`-`9` (48-57), `A`-`Z` (65-90), `_` (95) and
/// `a`-`z` (97-122).
pub const ASCII_CONTINUE: Align64<[bool; 128]> = Align64([
    F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    F, F, F, F, T, F, F, F, F, F, F, F, F, F, F, F, T, T, T, T, T, T, T, T, T, T, F, F, F, F, F, F,
    F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, T,
    F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, F,
]);

/// Returns `true` if `c` is an ASCII character that may start an identifier
/// (`$`, `_`, or an ASCII letter), as recorded in [`ASCII_START`].
///
/// Any `char` outside the ASCII range yields `false` instead of indexing past
/// the end of the table, so the function is total over `char`. The checked
/// lookup is the same single range comparison that plain indexing performs
/// for its bounds check.
#[inline]
pub fn is_identifier_start_ascii(c: char) -> bool {
    ASCII_START.0.get(c as usize).copied().unwrap_or(false)
}
/// Section 12.6 Detect `IdentifierStartChar`
#[inline]
pub fn is_identifier_start(c: char) -> bool {
c == '$' || c == '_' || is_id_start(c)
pub fn is_identifier_start_all(c: char) -> bool {
if c.is_ascii() {
return is_identifier_start_ascii(c);
}
unicode_id_start::is_id_start_unicode(c)
}
/// Section 12.6 Detect `IdentifierPartChar`
/// NOTE 2: The nonterminal `IdentifierPart` derives _ via `UnicodeIDContinue`.
#[inline]
pub fn is_identifier_part(c: char) -> bool {
c == '$' || is_id_continue(c) || c == ZWNJ || c == ZWJ
if c.is_ascii() {
return ASCII_CONTINUE.0[c as usize];
}
unicode_id_start::is_id_continue_unicode(c) || c == ZWNJ || c == ZWJ
}
pub const SINGLE_CHAR_TOKENS: &[Kind; 128] = &[

View file

@ -24,8 +24,9 @@ pub use token::{RegExp, Token, TokenValue};
pub use self::kind::Kind;
use self::{
constants::{
is_identifier_part, is_identifier_start, is_irregular_line_terminator,
is_irregular_whitespace, is_line_terminator, EOF, SINGLE_CHAR_TOKENS,
is_identifier_part, is_identifier_start_all, is_identifier_start_ascii,
is_irregular_line_terminator, is_irregular_whitespace, is_line_terminator, EOF,
SINGLE_CHAR_TOKENS,
},
number::{parse_big_int, parse_float, parse_int},
string_builder::AutoCow,
@ -366,22 +367,22 @@ impl<'a> Lexer<'a> {
// fast path for single character tokens
// '{' '}' '(' ')' '[' ']' ';' ',' ':' '~'
let size = c as usize;
if size <= 127 {
if size < 128 {
let kind = SINGLE_CHAR_TOKENS[size];
if kind != Kind::Undetermined {
return kind;
}
// fast path for identifiers
if is_identifier_start_ascii(c) {
builder.push_matching(c);
return self.identifier_name_or_keyword(builder);
}
}
// NOTE: matching order is significant here, by real world occurrences
// see https://blog.mozilla.org/nnethercote/2011/07/01/faster-javascript-parsing/
// > the rough order of frequency for different token kinds is as follows:
// identifiers/keywords, ., =, strings, decimal numbers, :, +, hex/octal numbers, and then everything else
match c {
// fast path for identifiers
c if c.is_ascii_alphabetic() => {
builder.push_matching(c);
self.identifier_name_or_keyword(builder)
}
'.' => {
let kind = self.read_dot(&mut builder);
if kind.is_number() {
@ -452,7 +453,7 @@ impl<'a> Lexer<'a> {
self.identifier_unicode_escape_sequence(&mut builder, true);
self.identifier_name_or_keyword(builder)
}
c if is_identifier_start(c) => {
c if unicode_id_start::is_id_start_unicode(c) => {
builder.push_matching(c);
self.identifier_name_or_keyword(builder)
}
@ -718,7 +719,7 @@ impl<'a> Lexer<'a> {
fn private_identifier(&mut self, mut builder: AutoCow<'a>) -> Kind {
let start = self.offset();
match self.current.chars.next() {
Some(c) if is_identifier_start(c) => {
Some(c) if is_identifier_start_all(c) => {
builder.push_matching(c);
}
Some('\\') => {
@ -935,13 +936,13 @@ impl<'a> Lexer<'a> {
let offset = self.offset();
// The SourceCharacter immediately following a NumericLiteral must not be an IdentifierStart or DecimalDigit.
let ch = self.peek();
if !ch.is_ascii_digit() && !is_identifier_start(ch) {
if !ch.is_ascii_digit() && !is_identifier_start_all(ch) {
return kind;
}
self.current.chars.next();
loop {
let c = self.peek();
if c != EOF && is_identifier_start(c) {
if c != EOF && is_identifier_start_all(c) {
self.current.chars.next();
} else {
break;
@ -1106,7 +1107,7 @@ impl<'a> Lexer<'a> {
let mut builder = AutoCow::new(self);
loop {
let c = self.peek();
if c == '-' || is_identifier_start(c) {
if c == '-' || is_identifier_start_all(c) {
self.current.chars.next();
builder.push_matching(c);
loop {
@ -1239,8 +1240,11 @@ impl<'a> Lexer<'a> {
}
};
let is_valid =
if check_identifier_start { is_identifier_start(ch) } else { is_identifier_part(ch) };
let is_valid = if check_identifier_start {
is_identifier_start_all(ch)
} else {
is_identifier_part(ch)
};
if !is_valid {
self.error(diagnostics::InvalidCharacter(ch, self.current_offset()));

@ -1 +1 @@
Subproject commit c38bf12f010520ea7abe8a286f62922b2d1e1f1b
Subproject commit a547f8724a5c6b4395b8a8f597e3edd44de74bf3

View file

@ -1,3 +1,3 @@
Printer Summary:
AST Parsed : 44488/44488 (100.00%)
Positive Passed: 44488/44488 (100.00%)
AST Parsed : 44494/44494 (100.00%)
Positive Passed: 44494/44494 (100.00%)

@ -1 +1 @@
Subproject commit d216cc197269fc41eb6eca14710529c3d6650535
Subproject commit 53e5ef817eb212d0d4f6f0ab44275094e5bf876d

View file

@ -1,6 +1,6 @@
Test262 Summary:
AST Parsed : 43994/44003 (99.98%)
Positive Passed: 43994/44003 (99.98%)
AST Parsed : 44000/44009 (99.98%)
Positive Passed: 44000/44009 (99.98%)
Negative Passed: 1934/3917 (49.37%)
Expect Syntax Error: "annexB/language/expressions/template-literal/legacy-octal-escape-sequence-strict.js"
Expect Syntax Error: "annexB/language/statements/for-in/const-initializer.js"

@ -1 +1 @@
Subproject commit 8f40d5633fc36df04b4fd4392e3877558149987f
Subproject commit 746a6feb2e7ba6987b6c72db538dd498b35cd461

View file

@ -1,6 +1,6 @@
TypeScript Summary:
AST Parsed : 4327/4867 (88.90%)
Positive Passed: 4327/4867 (88.90%)
AST Parsed : 4329/4869 (88.91%)
Positive Passed: 4329/4869 (88.91%)
Expect to Parse: "async/es2017/asyncArrowFunction/asyncArrowFunction6_es2017.ts"
× Automatic Semicolon Insertion