perf(syntax): further optimize is_identifier_name (#5426)

Follow-on after #5425. Further optimize `oxc_syntax::identifier::is_identifier_name` by processing string in blocks of 8 bytes, and checking if all bytes in a block are ASCII in one go, rather than testing each byte individually.
2026-05-19 12:19:15 +00:00 · 2024-09-03 13:48:36 +00:00 · 2024-09-03 13:48:36 +00:00 · bfabd8facc
commit bfabd8facc
parent aeda84f904
3 changed files with 82 additions and 14 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -1884,6 +1884,7 @@ dependencies = [
 name = "oxc_syntax"
 version = "0.26.0"
 dependencies = [
+ "assert-unchecked",
 "bitflags 2.6.0",
 "dashmap 6.0.1",
 "nonmax",
--- a/crates/oxc_syntax/Cargo.toml
+++ b/crates/oxc_syntax/Cargo.toml
@ -25,6 +25,7 @@ oxc_span       = { workspace = true }
 oxc_ast_macros = { workspace = true }
 oxc_allocator  = { workspace = true }

+assert-unchecked = { workspace = true }
 unicode-id-start = { workspace = true }
 bitflags         = { workspace = true }
 rustc-hash       = { workspace = true }
--- a/crates/oxc_syntax/src/identifier.rs
+++ b/crates/oxc_syntax/src/identifier.rs
@ -1,3 +1,5 @@
+use assert_unchecked::assert_unchecked;
+
 use unicode_id_start::{is_id_continue_unicode, is_id_start_unicode};

 pub const EOF: char = '\0';
@ -143,6 +145,7 @@ pub fn is_identifier_name(name: &str) -> bool {
    // the cheap `is_identifier_start_ascii` and `is_identifier_part_ascii` to test bytes.
    // Only if a Unicode char is found, fall back to iterating over `char`s, and using the more
    // expensive `is_identifier_start_unicode` and `is_identifier_part`.
+    // As a further optimization, we test if bytes are ASCII in blocks of 8 or 4 bytes, rather than 1 by 1.

    // Get first byte. Exit if empty string.
    let bytes = name.as_bytes();
@ -154,26 +157,83 @@ pub fn is_identifier_name(name: &str) -> bool {
            return false;
        }

-        // `'outer` loop never actually loops - only here to allow breaking out of it when Unicode found
-        #[allow(clippy::never_loop)]
-        let index = 'outer: loop {
-            for (index, &b) in bytes[1..].iter().enumerate() {
-                if b.is_ascii() {
+        let mut index = 1;
+        'outer: loop {
+            // Check blocks of 8 bytes, then 4 bytes, then single bytes
+            let bytes_remaining = bytes.len() - index;
+            if bytes_remaining >= 8 {
+                // Process block of 8 bytes.
+                // Check that next 8 bytes are all ASCII.
+                // SAFETY: We checked above that there are at least 8 bytes to read starting at `index`
+                #[allow(clippy::cast_ptr_alignment)]
+                let next8_as_u64 = unsafe {
+                    let ptr = bytes.as_ptr().add(index).cast::<u64>();
+                    ptr.read_unaligned()
+                };
+                let high_bits = next8_as_u64 & 0x8080_8080_8080_8080;
+                if high_bits != 0 {
+                    // Some chars in this block are non-ASCII
+                    break;
+                }
+
+                let next8 = next8_as_u64.to_ne_bytes();
+                for b in next8 {
+                    // SAFETY: We just checked all these bytes are ASCII
+                    unsafe { assert_unchecked!(b.is_ascii()) };
                    if !is_identifier_part_ascii(b as char) {
                        return false;
                    }
-                } else {
-                    // Unicode byte found
-                    break 'outer index;
+                }
+
+                index += 8;
+            } else if bytes_remaining >= 4 {
+                // Process block of 4 bytes.
+                // Check that next 4 bytes are all ASCII.
+                // SAFETY: We checked above that there are at least 4 bytes to read starting at `index`
+                #[allow(clippy::cast_ptr_alignment)]
+                let next4_as_u32 = unsafe {
+                    let ptr = bytes.as_ptr().add(index).cast::<u32>();
+                    ptr.read_unaligned()
+                };
+                let high_bits = next4_as_u32 & 0x8080_8080;
+                if high_bits != 0 {
+                    // Some chars in this block are non-ASCII
+                    break;
+                }
+
+                let next4 = next4_as_u32.to_ne_bytes();
+                for b in next4 {
+                    // SAFETY: We just checked all these bytes are ASCII
+                    unsafe { assert_unchecked!(b.is_ascii()) };
+                    if !is_identifier_part_ascii(b as char) {
+                        return false;
+                    }
+                }
+
+                index += 4;
+            } else {
+                loop {
+                    let Some(&b) = bytes.get(index) else {
+                        // We got to the end with no non-identifier chars found
+                        return true;
+                    };
+
+                    if b.is_ascii() {
+                        if !is_identifier_part_ascii(b as char) {
+                            return false;
+                        }
+                    } else {
+                        // Unicode byte found
+                        break 'outer;
+                    }
+
+                    index += 1;
                }
            }
-            // We got to end without finding any non-identifier of Unicode characters
-            return true;
-        };
+        }

-        // Unicode byte found - search rest of string (from this byte onwards) as Unicode.
-        // `index + 1` because `index` returned from the loop is relative to start of `bytes[1..]`.
-        name[index + 1..].chars()
+        // Unicode byte found - search rest of string (from this byte onwards) as Unicode
+        name[index..].chars()
    } else {
        // First char is Unicode.
        // NB: `unwrap()` cannot fail because we already checked the string is not empty.
@ -260,6 +320,12 @@ fn is_identifier_name_false() {
        "A£",
        "A৸",
        "A𐄬",
+        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_$abc£",
+        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_$abc৸",
+        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_$abc𐄬",
+        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_$abc£abcdefghijklmnopqrstuvwxyz",
+        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_$abc৸abcdefghijklmnopqrstuvwxyz",
+        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_$abc𐄬abcdefghijklmnopqrstuvwxyz",
        // ASCII + Unicode, starting with Unicode
        "£A",
        "৸A",