perf(syntax): further optimize is_identifier_name (#5426)

Follow-on after #5425. Further optimize `oxc_syntax::identifier::is_identifier_name` by processing string in blocks of 8 bytes, and checking if all bytes in a block are ASCII in one go, rather than testing each byte individually.
This commit is contained in:
overlookmotel 2024-09-03 13:48:36 +00:00
parent aeda84f904
commit bfabd8facc
3 changed files with 82 additions and 14 deletions

1
Cargo.lock generated
View file

@ -1884,6 +1884,7 @@ dependencies = [
name = "oxc_syntax"
version = "0.26.0"
dependencies = [
"assert-unchecked",
"bitflags 2.6.0",
"dashmap 6.0.1",
"nonmax",

View file

@ -25,6 +25,7 @@ oxc_span = { workspace = true }
oxc_ast_macros = { workspace = true }
oxc_allocator = { workspace = true }
assert-unchecked = { workspace = true }
unicode-id-start = { workspace = true }
bitflags = { workspace = true }
rustc-hash = { workspace = true }

View file

@ -1,3 +1,5 @@
use assert_unchecked::assert_unchecked;
use unicode_id_start::{is_id_continue_unicode, is_id_start_unicode};
pub const EOF: char = '\0';
@ -143,6 +145,7 @@ pub fn is_identifier_name(name: &str) -> bool {
// the cheap `is_identifier_start_ascii` and `is_identifier_part_ascii` to test bytes.
// Only if a Unicode char is found, fall back to iterating over `char`s, and using the more
// expensive `is_identifier_start_unicode` and `is_identifier_part`.
// As a further optimization, we test if bytes are ASCII in blocks of 8 or 4 bytes, rather than 1 by 1.
// Get first byte. Exit if empty string.
let bytes = name.as_bytes();
@ -154,26 +157,83 @@ pub fn is_identifier_name(name: &str) -> bool {
return false;
}
// `'outer` loop never actually loops - only here to allow breaking out of it when Unicode found
#[allow(clippy::never_loop)]
let index = 'outer: loop {
for (index, &b) in bytes[1..].iter().enumerate() {
if b.is_ascii() {
let mut index = 1;
'outer: loop {
// Check blocks of 8 bytes, then 4 bytes, then single bytes
let bytes_remaining = bytes.len() - index;
if bytes_remaining >= 8 {
// Process block of 8 bytes.
// Check that next 8 bytes are all ASCII.
// SAFETY: We checked above that there are at least 8 bytes to read starting at `index`
#[allow(clippy::cast_ptr_alignment)]
let next8_as_u64 = unsafe {
let ptr = bytes.as_ptr().add(index).cast::<u64>();
ptr.read_unaligned()
};
let high_bits = next8_as_u64 & 0x8080_8080_8080_8080;
if high_bits != 0 {
// Some chars in this block are non-ASCII
break;
}
let next8 = next8_as_u64.to_ne_bytes();
for b in next8 {
// SAFETY: We just checked all these bytes are ASCII
unsafe { assert_unchecked!(b.is_ascii()) };
if !is_identifier_part_ascii(b as char) {
return false;
}
} else {
// Unicode byte found
break 'outer index;
}
index += 8;
} else if bytes_remaining >= 4 {
// Process block of 4 bytes.
// Check that next 4 bytes are all ASCII.
// SAFETY: We checked above that there are at least 4 bytes to read starting at `index`
#[allow(clippy::cast_ptr_alignment)]
let next4_as_u32 = unsafe {
let ptr = bytes.as_ptr().add(index).cast::<u32>();
ptr.read_unaligned()
};
let high_bits = next4_as_u32 & 0x8080_8080;
if high_bits != 0 {
// Some chars in this block are non-ASCII
break;
}
let next4 = next4_as_u32.to_ne_bytes();
for b in next4 {
// SAFETY: We just checked all these bytes are ASCII
unsafe { assert_unchecked!(b.is_ascii()) };
if !is_identifier_part_ascii(b as char) {
return false;
}
}
index += 4;
} else {
loop {
let Some(&b) = bytes.get(index) else {
// We got to the end with no non-identifier chars found
return true;
};
if b.is_ascii() {
if !is_identifier_part_ascii(b as char) {
return false;
}
} else {
// Unicode byte found
break 'outer;
}
index += 1;
}
}
// We got to end without finding any non-identifier of Unicode characters
return true;
};
}
// Unicode byte found - search rest of string (from this byte onwards) as Unicode.
// `index + 1` because `index` returned from the loop is relative to start of `bytes[1..]`.
name[index + 1..].chars()
// Unicode byte found - search rest of string (from this byte onwards) as Unicode
name[index..].chars()
} else {
// First char is Unicode.
// NB: `unwrap()` cannot fail because we already checked the string is not empty.
@ -260,6 +320,12 @@ fn is_identifier_name_false() {
"",
"A৸",
"A𐄬",
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_$abc£",
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_$abc৸",
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_$abc𐄬",
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_$abc£abcdefghijklmnopqrstuvwxyz",
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_$abc৸abcdefghijklmnopqrstuvwxyz",
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_$abc𐄬abcdefghijklmnopqrstuvwxyz",
// ASCII + Unicode, starting with Unicode
"£A",
"৸A",