mirror of
https://github.com/danbulant/oxc
synced 2026-05-24 12:21:58 +00:00
perf(lexer): fine tune identifier checking with less redundant branches (#154)
perf(lexer): fine tune checking of identifiers with less redundant branches
This commit is contained in:
parent
be2231b689
commit
94fdd54368
10 changed files with 67 additions and 32 deletions
4
Cargo.lock
generated
4
Cargo.lock
generated
|
|
@ -1471,9 +1471,9 @@ checksum = "d54675592c1dbefd78cbd98db9bacd89886e1ca50692a0692baefffdeb92dd58"
|
|||
|
||||
[[package]]
|
||||
name = "unicode-id-start"
|
||||
version = "1.0.3"
|
||||
version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "238a3d5702128479aa8f25de86d12dde3ef71859109b6c1be6ce62dd4e76b160"
|
||||
checksum = "e15598ae4ac01f33d64525526e028beb497b216015dd768c44187a62b44cbce8"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-ident"
|
||||
|
|
|
|||
|
|
@ -17,7 +17,7 @@ oxc_diagnostics = { path = "../oxc_diagnostics" }
|
|||
bitflags = { workspace = true }
|
||||
rustc-hash = { workspace = true }
|
||||
|
||||
unicode-id-start = "1.0.3"
|
||||
unicode-id-start = "1.1.0"
|
||||
num-bigint = "0.4.3"
|
||||
|
||||
[dev-dependencies]
|
||||
|
|
|
|||
|
|
@ -1,5 +1,3 @@
|
|||
use unicode_id_start::{is_id_continue, is_id_start};
|
||||
|
||||
use super::Kind;
|
||||
|
||||
pub const EOF: char = '\0';
|
||||
|
|
@ -65,17 +63,50 @@ pub fn is_line_terminator(c: char) -> bool {
|
|||
is_regular_line_terminator(c) || is_irregular_line_terminator(c)
|
||||
}
|
||||
|
||||
const T: bool = true;
|
||||
const F: bool = false;
|
||||
|
||||
#[repr(C, align(64))]
|
||||
pub struct Align64<T>(pub(crate) T);
|
||||
|
||||
// ASCII identifier-start lookup table: the letters `A`-`Z` (65-90) and `a`-`z` (97-122), plus `$` (36) and `_` (95)
|
||||
pub const ASCII_START: Align64<[bool; 128]> = Align64([
|
||||
F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
|
||||
F, F, F, F, T, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
|
||||
F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, T,
|
||||
F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, F,
|
||||
]);
|
||||
|
||||
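// ASCII identifier-continue lookup table: the letters `A`-`Z` (65-90) and `a`-`z` (97-122), the digits `0`-`9` (48-57), plus `$` (36) and `_` (95)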
|
||||
pub const ASCII_CONTINUE: Align64<[bool; 128]> = Align64([
|
||||
F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
|
||||
F, F, F, F, T, F, F, F, F, F, F, F, F, F, F, F, T, T, T, T, T, T, T, T, T, T, F, F, F, F, F, F,
|
||||
F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, T,
|
||||
F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, F,
|
||||
]);
|
||||
|
||||
#[inline]
|
||||
pub fn is_identifier_start_ascii(c: char) -> bool {
|
||||
ASCII_START.0[c as usize]
|
||||
}
|
||||
|
||||
/// Section 12.6 Detect `IdentifierStartChar`
|
||||
#[inline]
|
||||
pub fn is_identifier_start(c: char) -> bool {
|
||||
c == '$' || c == '_' || is_id_start(c)
|
||||
pub fn is_identifier_start_all(c: char) -> bool {
|
||||
if c.is_ascii() {
|
||||
return is_identifier_start_ascii(c);
|
||||
}
|
||||
unicode_id_start::is_id_start_unicode(c)
|
||||
}
|
||||
|
||||
/// Section 12.6 Detect `IdentifierPartChar`
|
||||
/// NOTE 2: The nonterminal `IdentifierPart` derives _ via `UnicodeIDContinue`.
|
||||
#[inline]
|
||||
pub fn is_identifier_part(c: char) -> bool {
|
||||
c == '$' || is_id_continue(c) || c == ZWNJ || c == ZWJ
|
||||
if c.is_ascii() {
|
||||
return ASCII_CONTINUE.0[c as usize];
|
||||
}
|
||||
unicode_id_start::is_id_continue_unicode(c) || c == ZWNJ || c == ZWJ
|
||||
}
|
||||
|
||||
pub const SINGLE_CHAR_TOKENS: &[Kind; 128] = &[
|
||||
|
|
|
|||
|
|
@ -24,8 +24,9 @@ pub use token::{RegExp, Token, TokenValue};
|
|||
pub use self::kind::Kind;
|
||||
use self::{
|
||||
constants::{
|
||||
is_identifier_part, is_identifier_start, is_irregular_line_terminator,
|
||||
is_irregular_whitespace, is_line_terminator, EOF, SINGLE_CHAR_TOKENS,
|
||||
is_identifier_part, is_identifier_start_all, is_identifier_start_ascii,
|
||||
is_irregular_line_terminator, is_irregular_whitespace, is_line_terminator, EOF,
|
||||
SINGLE_CHAR_TOKENS,
|
||||
},
|
||||
number::{parse_big_int, parse_float, parse_int},
|
||||
string_builder::AutoCow,
|
||||
|
|
@ -366,22 +367,22 @@ impl<'a> Lexer<'a> {
|
|||
// fast path for single character tokens
|
||||
// '{' '}' '(' ')' '[' ']' ';' ',' ':' '~'
|
||||
let size = c as usize;
|
||||
if size <= 127 {
|
||||
if size < 128 {
|
||||
let kind = SINGLE_CHAR_TOKENS[size];
|
||||
if kind != Kind::Undetermined {
|
||||
return kind;
|
||||
}
|
||||
// fast path for identifiers
|
||||
if is_identifier_start_ascii(c) {
|
||||
builder.push_matching(c);
|
||||
return self.identifier_name_or_keyword(builder);
|
||||
}
|
||||
}
|
||||
// NOTE: matching order is significant here, by real world occurrences
|
||||
// see https://blog.mozilla.org/nnethercote/2011/07/01/faster-javascript-parsing/
|
||||
// > the rough order of frequency for different token kinds is as follows:
|
||||
// identifiers/keywords, ‘.’, ‘=’, strings, decimal numbers, ‘:’, ‘+’, hex/octal numbers, and then everything else
|
||||
match c {
|
||||
// fast path for identifiers
|
||||
c if c.is_ascii_alphabetic() => {
|
||||
builder.push_matching(c);
|
||||
self.identifier_name_or_keyword(builder)
|
||||
}
|
||||
'.' => {
|
||||
let kind = self.read_dot(&mut builder);
|
||||
if kind.is_number() {
|
||||
|
|
@ -452,7 +453,7 @@ impl<'a> Lexer<'a> {
|
|||
self.identifier_unicode_escape_sequence(&mut builder, true);
|
||||
self.identifier_name_or_keyword(builder)
|
||||
}
|
||||
c if is_identifier_start(c) => {
|
||||
c if unicode_id_start::is_id_start_unicode(c) => {
|
||||
builder.push_matching(c);
|
||||
self.identifier_name_or_keyword(builder)
|
||||
}
|
||||
|
|
@ -718,7 +719,7 @@ impl<'a> Lexer<'a> {
|
|||
fn private_identifier(&mut self, mut builder: AutoCow<'a>) -> Kind {
|
||||
let start = self.offset();
|
||||
match self.current.chars.next() {
|
||||
Some(c) if is_identifier_start(c) => {
|
||||
Some(c) if is_identifier_start_all(c) => {
|
||||
builder.push_matching(c);
|
||||
}
|
||||
Some('\\') => {
|
||||
|
|
@ -935,13 +936,13 @@ impl<'a> Lexer<'a> {
|
|||
let offset = self.offset();
|
||||
// The SourceCharacter immediately following a NumericLiteral must not be an IdentifierStart or DecimalDigit.
|
||||
let ch = self.peek();
|
||||
if !ch.is_ascii_digit() && !is_identifier_start(ch) {
|
||||
if !ch.is_ascii_digit() && !is_identifier_start_all(ch) {
|
||||
return kind;
|
||||
}
|
||||
self.current.chars.next();
|
||||
loop {
|
||||
let c = self.peek();
|
||||
if c != EOF && is_identifier_start(c) {
|
||||
if c != EOF && is_identifier_start_all(c) {
|
||||
self.current.chars.next();
|
||||
} else {
|
||||
break;
|
||||
|
|
@ -1106,7 +1107,7 @@ impl<'a> Lexer<'a> {
|
|||
let mut builder = AutoCow::new(self);
|
||||
loop {
|
||||
let c = self.peek();
|
||||
if c == '-' || is_identifier_start(c) {
|
||||
if c == '-' || is_identifier_start_all(c) {
|
||||
self.current.chars.next();
|
||||
builder.push_matching(c);
|
||||
loop {
|
||||
|
|
@ -1239,8 +1240,11 @@ impl<'a> Lexer<'a> {
|
|||
}
|
||||
};
|
||||
|
||||
let is_valid =
|
||||
if check_identifier_start { is_identifier_start(ch) } else { is_identifier_part(ch) };
|
||||
let is_valid = if check_identifier_start {
|
||||
is_identifier_start_all(ch)
|
||||
} else {
|
||||
is_identifier_part(ch)
|
||||
};
|
||||
|
||||
if !is_valid {
|
||||
self.error(diagnostics::InvalidCharacter(ch, self.current_offset()));
|
||||
|
|
|
|||
|
|
@ -1 +1 @@
|
|||
Subproject commit c38bf12f010520ea7abe8a286f62922b2d1e1f1b
|
||||
Subproject commit a547f8724a5c6b4395b8a8f597e3edd44de74bf3
|
||||
|
|
@ -1,3 +1,3 @@
|
|||
Printer Summary:
|
||||
AST Parsed : 44488/44488 (100.00%)
|
||||
Positive Passed: 44488/44488 (100.00%)
|
||||
AST Parsed : 44494/44494 (100.00%)
|
||||
Positive Passed: 44494/44494 (100.00%)
|
||||
|
|
|
|||
|
|
@ -1 +1 @@
|
|||
Subproject commit d216cc197269fc41eb6eca14710529c3d6650535
|
||||
Subproject commit 53e5ef817eb212d0d4f6f0ab44275094e5bf876d
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
Test262 Summary:
|
||||
AST Parsed : 43994/44003 (99.98%)
|
||||
Positive Passed: 43994/44003 (99.98%)
|
||||
AST Parsed : 44000/44009 (99.98%)
|
||||
Positive Passed: 44000/44009 (99.98%)
|
||||
Negative Passed: 1934/3917 (49.37%)
|
||||
Expect Syntax Error: "annexB/language/expressions/template-literal/legacy-octal-escape-sequence-strict.js"
|
||||
Expect Syntax Error: "annexB/language/statements/for-in/const-initializer.js"
|
||||
|
|
|
|||
|
|
@ -1 +1 @@
|
|||
Subproject commit 8f40d5633fc36df04b4fd4392e3877558149987f
|
||||
Subproject commit 746a6feb2e7ba6987b6c72db538dd498b35cd461
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
TypeScript Summary:
|
||||
AST Parsed : 4327/4867 (88.90%)
|
||||
Positive Passed: 4327/4867 (88.90%)
|
||||
AST Parsed : 4329/4869 (88.91%)
|
||||
Positive Passed: 4329/4869 (88.91%)
|
||||
Expect to Parse: "async/es2017/asyncArrowFunction/asyncArrowFunction6_es2017.ts"
|
||||
|
||||
× Automatic Semicolon Insertion
|
||||
|
|
|
|||
Loading…
Reference in a new issue