mirror of
https://github.com/danbulant/oxc
synced 2026-05-19 20:28:58 +00:00
perf(syntax): optimize is_identifier_name (#5425)
Optimize `oxc_syntax::identifier::is_identifier_name`. Add a fast path for ASCII, which will be the common case. Only fall back to iterating over `char`s and using the more expensive test functions e.g. `is_identifier_start_unicode` if non-ASCII chars are found.
This commit is contained in:
parent
3ccb065695
commit
aeda84f904
1 changed files with 132 additions and 2 deletions
|
|
@ -136,7 +136,137 @@ pub fn is_identifier_part_unicode(c: char) -> bool {
|
|||
is_id_continue_unicode(c) || c == ZWNJ || c == ZWJ
|
||||
}
|
||||
|
||||
/// Determine if a string is a valid JS identifier.
|
||||
#[allow(clippy::missing_panics_doc)]
|
||||
pub fn is_identifier_name(name: &str) -> bool {
|
||||
let mut chars = name.chars();
|
||||
chars.next().is_some_and(is_identifier_start) && chars.all(is_identifier_part)
|
||||
// This function contains a fast path for ASCII (common case), iterating over bytes and using
|
||||
// the cheap `is_identifier_start_ascii` and `is_identifier_part_ascii` to test bytes.
|
||||
// Only if a Unicode char is found, fall back to iterating over `char`s, and using the more
|
||||
// expensive `is_identifier_start_unicode` and `is_identifier_part`.
|
||||
|
||||
// Get first byte. Exit if empty string.
|
||||
let bytes = name.as_bytes();
|
||||
let Some(&first_byte) = bytes.first() else { return false };
|
||||
|
||||
let mut chars = if first_byte.is_ascii() {
|
||||
// First byte is ASCII
|
||||
if !is_identifier_start_ascii(first_byte as char) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// `'outer` loop never actually loops - only here to allow breaking out of it when Unicode found
|
||||
#[allow(clippy::never_loop)]
|
||||
let index = 'outer: loop {
|
||||
for (index, &b) in bytes[1..].iter().enumerate() {
|
||||
if b.is_ascii() {
|
||||
if !is_identifier_part_ascii(b as char) {
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
// Unicode byte found
|
||||
break 'outer index;
|
||||
}
|
||||
}
|
||||
// We got to end without finding any non-identifier of Unicode characters
|
||||
return true;
|
||||
};
|
||||
|
||||
// Unicode byte found - search rest of string (from this byte onwards) as Unicode.
|
||||
// `index + 1` because `index` returned from the loop is relative to start of `bytes[1..]`.
|
||||
name[index + 1..].chars()
|
||||
} else {
|
||||
// First char is Unicode.
|
||||
// NB: `unwrap()` cannot fail because we already checked the string is not empty.
|
||||
let mut chars = name.chars();
|
||||
let first_char = chars.next().unwrap();
|
||||
if !is_identifier_start_unicode(first_char) {
|
||||
return false;
|
||||
}
|
||||
// Search rest of string as Unicode
|
||||
chars
|
||||
};
|
||||
|
||||
// A Unicode char was found - search rest of string as Unicode
|
||||
chars.all(is_identifier_part)
|
||||
}
|
||||
|
||||
/// Every string in `cases` must be accepted by `is_identifier_name`.
#[test]
fn is_identifier_name_true() {
    let cases = [
        // 1 char ASCII
        "a",
        "z",
        "A",
        "Z",
        "_",
        "$",
        // 1 char Unicode
        "µ", // 2 bytes
        "ख", // 3 bytes
        "𐀀", // 4 bytes
        // Multiple chars ASCII
        "az",
        "AZ",
        "_a",
        "$Z",
        "a0",
        "A9",
        "_0",
        "$9",
        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_$",
        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_$",
        "_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789$",
        "$abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_",
        // Multiple chars Unicode
        "µख𐀀",
        // ASCII + Unicode, starting with ASCII
        "AµBखC𐀀D",
        // ASCII + Unicode, starting with Unicode
        "µAखB𐀀",
    ];

    for case in cases {
        // Include the offending input in the failure message so a regression is easy to locate
        assert!(is_identifier_name(case), "expected {case:?} to be a valid identifier name");
    }
}
|
||||
|
||||
/// Every string in `cases` must be rejected by `is_identifier_name`.
#[test]
fn is_identifier_name_false() {
    let cases = [
        // Empty string
        "",
        // 1 char ASCII
        "0",
        "9",
        "-",
        "~",
        "+",
        // 1 char Unicode
        "£", // 2 bytes
        "৸", // 3 bytes
        "𐄬", // 4 bytes
        // Multiple chars ASCII
        "0a",
        "9a",
        "-a",
        "+a",
        "a-Z",
        "A+z",
        "a-",
        "a+",
        // Multiple chars Unicode
        "£৸𐄬",
        // ASCII + Unicode, starting with ASCII
        "A£",
        "A৸",
        "A𐄬",
        // ASCII + Unicode, starting with Unicode
        "£A",
        "৸A",
        "𐄬A",
    ];

    for case in cases {
        // Include the offending input in the failure message so a regression is easy to locate
        assert!(!is_identifier_name(case), "expected {case:?} to be rejected as an identifier name");
    }
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue