perf(mangler): optimize base54 function (#8557)

Optimize mangler by creating identifiers in an inline array, instead of using the more expensive `CompactString`, and optimize `base54` function. There's an unfortunate workaround necessary because of debug mode. In "normal" mode, identifiers are maximum 11 bytes (`usize::MAX` -> `ZrN6rN6rN6r`) but in debug mode they can be up to 25 bytes (`usize::MAX` -> `slot_18446744073709551615`). So this PR splits `build_with_symbols_and_scopes` into 2 branches for "normal" and "debug" modes with a generic function parameterized by max length of the string. This is not ideal - it will increase binary size a bit, but everything else I tried (e.g. allocating strings into arena) was much slower. The main motivation for this change wasn't actually performance. While working on allocator, I discovered that `CompactString`s were being allocated in arena (`reserved_names: ArenaVec<CompactStr>`), and wanted to remove them.
2026-05-19 04:08:41 +00:00 · 2025-01-17 05:47:27 +00:00 · 2025-01-17 05:47:27 +00:00 · d17021c834
commit d17021c834
parent 64d38b86b6
3 changed files with 150 additions and 19 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -1839,7 +1839,6 @@ dependencies = [
 name = "oxc_mangler"
 version = "0.46.0"
 dependencies = [
- "compact_str",
 "itertools",
 "oxc_allocator",
 "oxc_ast",
--- a/crates/oxc_mangler/Cargo.toml
+++ b/crates/oxc_mangler/Cargo.toml
@ -27,6 +27,5 @@ oxc_index = { workspace = true }
 oxc_semantic = { workspace = true }
 oxc_span = { workspace = true }

-compact_str = { workspace = true }
 itertools = { workspace = true }
 rustc-hash = { workspace = true }
--- a/crates/oxc_mangler/src/lib.rs
+++ b/crates/oxc_mangler/src/lib.rs
@ -1,4 +1,5 @@
-use compact_str::CompactString;
+use std::ops::Deref;
+
 use itertools::Itertools;
 use rustc_hash::FxHashSet;

@ -6,7 +7,7 @@ use oxc_allocator::{Allocator, Vec};
 use oxc_ast::ast::{Declaration, Program, Statement};
 use oxc_index::Idx;
 use oxc_semantic::{ReferenceId, ScopeTree, SemanticBuilder, SymbolId, SymbolTable};
-use oxc_span::{Atom, CompactStr};
+use oxc_span::Atom;

 #[derive(Default, Debug, Clone, Copy)]
 pub struct MangleOptions {
@ -94,10 +95,27 @@ impl Mangler {

    #[must_use]
    pub fn build_with_symbols_and_scopes(
+        self,
+        symbol_table: SymbolTable,
+        scope_tree: &ScopeTree,
+        program: &Program<'_>,
+    ) -> Mangler {
+        if self.options.debug {
+            self.build_with_symbols_and_scopes_impl(symbol_table, scope_tree, program, debug_name)
+        } else {
+            self.build_with_symbols_and_scopes_impl(symbol_table, scope_tree, program, base54)
+        }
+    }
+
+    fn build_with_symbols_and_scopes_impl<
+        const CAPACITY: usize,
+        G: Fn(usize) -> InlineString<CAPACITY>,
+    >(
        mut self,
        symbol_table: SymbolTable,
        scope_tree: &ScopeTree,
        program: &Program<'_>,
+        generate_name: G,
    ) -> Mangler {
        let (exported_names, exported_symbols) = if self.options.top_level {
            Mangler::collect_exported_symbols(program)
@ -170,7 +188,6 @@ impl Mangler {

        let mut reserved_names = Vec::with_capacity_in(total_number_of_slots, &allocator);

-        let generate_name = if self.options.debug { debug_name } else { base54 };
        let mut count = 0;
        for _ in 0..total_number_of_slots {
            let name = loop {
@ -209,7 +226,7 @@ impl Mangler {
        let mut slice_of_same_len_strings = std::vec::Vec::with_capacity(100);
        // 2. "N number of vars are going to be assigned names of the same length"
        for (_, slice_of_same_len_strings_group) in
-            &reserved_names.into_iter().chunk_by(CompactStr::len)
+            &reserved_names.into_iter().chunk_by(InlineString::len)
        {
            // 1. "The most frequent vars get the shorter names"
            // (freq_iter is sorted by frequency from highest to lowest,
@ -235,7 +252,7 @@ impl Mangler {
            // rename the variables
            for (symbol_to_rename, new_name) in symbols_to_rename_with_new_names {
                for &symbol_id in &symbol_to_rename.symbol_ids {
-                    symbol_table.set_name(symbol_id, new_name.as_str());
+                    symbol_table.set_name(symbol_id, new_name);
                }
            }
        }
@ -330,29 +347,145 @@ fn is_keyword(s: &str) -> bool {
            | "void" | "with")
 }

-const BASE54_CHARS: &[u8; 64] = b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ$_0123456789";
+#[repr(C, align(64))]
+struct Aligned64([u8; 64]);
+
+const BASE54_CHARS: Aligned64 =
+    Aligned64(*b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ$_0123456789");

 /// Get the shortest mangled name for a given n.
 /// Code adapted from [terser](https://github.com/terser/terser/blob/8b966d687395ab493d2c6286cc9dd38650324c11/lib/scope.js#L1041-L1051)
-fn base54(n: usize) -> CompactStr {
+//
+// Maximum length of string is 11 (`ZrN6rN6rN6r` for `u64::MAX`), but set `CAPACITY` as 12,
+// so the total size of `InlineString` is 16, including the `len` field.
+// Then initializing the `InlineString` is a single `xmm` set, and with luck it'll sit in a register
+// throughout this function.
+#[expect(clippy::items_after_statements)]
+fn base54(n: usize) -> InlineString<12> {
+    let mut str = InlineString::new();
+
    let mut num = n;
+
    // Base 54 at first because these are the usable first characters in JavaScript identifiers
    // <https://tc39.es/ecma262/#prod-IdentifierStart>
-    let base = 54usize;
-    // SAFETY: `BASE54_CHARS` is utf8.
-    let mut s = unsafe { CompactString::from_utf8_unchecked([BASE54_CHARS[num % base]]) };
-    num /= base;
+    const FIRST_BASE: usize = 54;
+    let byte = BASE54_CHARS.0[num % FIRST_BASE];
+    // SAFETY: All `BASE54_CHARS` are ASCII. This is first byte we push, so can't be out of bounds.
+    unsafe { str.push_unchecked(byte) };
+    num /= FIRST_BASE;
+
    // Base 64 for the rest because after the first character we can also use 0-9 too
    // <https://tc39.es/ecma262/#prod-IdentifierPart>
-    let base = 64usize;
+    const REST_BASE: usize = 64;
    while num > 0 {
        num -= 1;
-        s.push(BASE54_CHARS[num % base] as char);
-        num /= base;
+        let byte = BASE54_CHARS.0[num % REST_BASE];
+        // SAFETY: All `BASE54_CHARS` are ASCII.
+        // String for `u64::MAX` is `ZrN6rN6rN6r` (11 bytes), so cannot push more than `CAPACITY` (12).
+        unsafe { str.push_unchecked(byte) };
+        num /= REST_BASE;
    }
-    CompactStr::from(s)
+
+    str
 }

-fn debug_name(n: usize) -> CompactStr {
-    CompactStr::from(format!("slot_{n}"))
+// Maximum length of string is 25 (`slot_18446744073709551615` for `u64::MAX`)
+// but set `CAPACITY` as 28 so the total size of `InlineString` is 32, including the `len` field.
+fn debug_name(n: usize) -> InlineString<28> {
+    InlineString::from_str(&format!("slot_{n}"))
+}
+
+/// Short inline string.
+///
+/// `CAPACITY` determines the maximum length of the string.
+#[repr(align(16))]
+struct InlineString<const CAPACITY: usize> {
+    len: u32,
+    bytes: [u8; CAPACITY],
+}
+
+impl<const CAPACITY: usize> InlineString<CAPACITY> {
+    /// Create empty [`InlineString`].
+    #[inline]
+    fn new() -> Self {
+        const { assert!(CAPACITY <= u32::MAX as usize) };
+
+        Self { bytes: [0; CAPACITY], len: 0 }
+    }
+
+    /// Create [`InlineString`] from `&str`.
+    ///
+    /// # Panics
+    /// Panics if `s.len() > CAPACITY`.
+    fn from_str(s: &str) -> Self {
+        let mut bytes = [0; CAPACITY];
+        let slice = &mut bytes[..s.len()];
+        slice.copy_from_slice(s.as_bytes());
+        Self { bytes, len: u32::try_from(s.len()).unwrap() }
+    }
+
+    /// Push a byte to the string.
+    ///
+    /// # SAFETY
+    /// * Must not push more than `CAPACITY` bytes.
+    /// * `byte` must be < 128 (an ASCII character).
+    #[inline]
+    unsafe fn push_unchecked(&mut self, byte: u8) {
+        debug_assert!((self.len as usize) < CAPACITY);
+        debug_assert!(byte.is_ascii());
+
+        *self.bytes.get_unchecked_mut(self.len as usize) = byte;
+        self.len += 1;
+    }
+
+    /// Get length of string as `u32`.
+    #[inline]
+    fn len(&self) -> u32 {
+        self.len
+    }
+
+    /// Get string as `&str` slice.
+    #[inline]
+    fn as_str(&self) -> &str {
+        // SAFETY: If safety conditions of `push_unchecked` have been upheld,
+        // slice cannot be out of bounds, and contents of that slice is a valid UTF-8 string
+        unsafe {
+            let slice = self.bytes.get_unchecked(..self.len as usize);
+            std::str::from_utf8_unchecked(slice)
+        }
+    }
+}
+
+impl<const CAPACITY: usize> Deref for InlineString<CAPACITY> {
+    type Target = str;
+
+    #[inline]
+    fn deref(&self) -> &str {
+        self.as_str()
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::base54;
+
+    #[test]
+    fn test_base54() {
+        assert_eq!(&*base54(0), "a");
+        assert_eq!(&*base54(25), "z");
+        assert_eq!(&*base54(26), "A");
+        assert_eq!(&*base54(51), "Z");
+        assert_eq!(&*base54(52), "$");
+        assert_eq!(&*base54(53), "_");
+        assert_eq!(&*base54(54), "aa");
+        assert_eq!(&*base54(55), "ab");
+
+        if cfg!(target_pointer_width = "64") {
+            assert_eq!(&*base54(usize::MAX), "ZrN6rN6rN6r");
+        }
+
+        if cfg!(target_pointer_width = "32") {
+            assert_eq!(&*base54(usize::MAX), "vUdzUd");
+        }
+    }
 }