mirror of
https://github.com/danbulant/oxc
synced 2026-05-19 12:19:15 +00:00
perf(traverse): faster string operations generating UIDs (#5626)
`get_unique_name` perform string manipulation faster, avoiding bounds checks, and use `itoa` crate for faster conversion of integers to strings for postfixes above 100. No apparent difference on benchmarks, but I imagine that's only because it's outweighed by cost of hashing strings. If measured alone, I believe it would be a perf improvement.
This commit is contained in:
parent
4996874dda
commit
e8013d259e
4 changed files with 208 additions and 41 deletions
1
Cargo.lock
generated
1
Cargo.lock
generated
|
|
@ -1980,6 +1980,7 @@ name = "oxc_traverse"
|
|||
version = "0.27.0"
|
||||
dependencies = [
|
||||
"compact_str",
|
||||
"itoa",
|
||||
"memoffset",
|
||||
"oxc_allocator",
|
||||
"oxc_ast",
|
||||
|
|
|
|||
|
|
@ -136,6 +136,7 @@ ignore = "0.4.22"
|
|||
indexmap = "2.3.0"
|
||||
insta = "1.39.0"
|
||||
itertools = "0.13.0"
|
||||
itoa = "1.0.11"
|
||||
jemallocator = "0.5.4"
|
||||
json-strip-comments = "1.0.4"
|
||||
language-tags = "0.3.2"
|
||||
|
|
|
|||
|
|
@ -30,5 +30,6 @@ oxc_span = { workspace = true }
|
|||
oxc_syntax = { workspace = true }
|
||||
|
||||
compact_str = { workspace = true }
|
||||
itoa = { workspace = true }
|
||||
memoffset = { workspace = true }
|
||||
rustc-hash = { workspace = true }
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
use std::{cell::Cell, str};
|
||||
|
||||
use compact_str::{format_compact, CompactString};
|
||||
use compact_str::CompactString;
|
||||
use itoa::Buffer as ItoaBuffer;
|
||||
use rustc_hash::FxHashSet;
|
||||
|
||||
#[allow(clippy::wildcard_imports)]
|
||||
|
|
@ -513,6 +514,14 @@ fn get_unique_name(base: &str, uid_names: &FxHashSet<CompactStr>) -> CompactStr
|
|||
CompactStr::from(get_unique_name_impl(base, uid_names))
|
||||
}
|
||||
|
||||
// TODO: We could make this function more performant, especially when it checks a lot of names
|
||||
// before it reaches one that is unused.
|
||||
// This function repeatedly creates strings which have only differ from each other by digits added on end,
|
||||
// and then hashes each of those strings to test them against the hash set `uid_names`.
|
||||
// Hashing strings is fairly expensive. As here only the end of the string changes on each iteration,
|
||||
// we could calculate an "unfinished" hash not including the last block, and then just add the final
|
||||
// block to "finish" the hash on each iteration. With `FxHash` this would be straight line code and only
|
||||
// a few operations.
|
||||
fn get_unique_name_impl(base: &str, uid_names: &FxHashSet<CompactStr>) -> CompactString {
|
||||
// Create `CompactString` prepending name with `_`, and with 1 byte excess capacity.
|
||||
// The extra byte is to avoid reallocation if need to add a digit on the end later,
|
||||
|
|
@ -522,63 +531,218 @@ fn get_unique_name_impl(base: &str, uid_names: &FxHashSet<CompactStr>) -> Compac
|
|||
name.push('_');
|
||||
name.push_str(base);
|
||||
|
||||
let name_is_unique = |name: &str| !uid_names.contains(name);
|
||||
|
||||
// Try the name without a numerical postfix (i.e. plain `_temp`)
|
||||
if name_is_unique(&name) {
|
||||
return name;
|
||||
}
|
||||
|
||||
// It's fairly common that UIDs may need a numerical postfix, so we try to keep string
|
||||
// operations to a minimum for postfixes up to 99 - using `replace_range` on a single
|
||||
// `CompactStr`, rather than generating a new string on each attempt.
|
||||
// operations to a minimum for postfixes up to 99 - reusing a single `CompactString`,
|
||||
// rather than generating a new string on each attempt.
|
||||
// For speed we manipulate the string as bytes.
|
||||
// Postfixes greater than 99 should be very uncommon, so don't bother optimizing.
|
||||
//
|
||||
// SAFETY: Only modifications to string are replacing last byte/last 2 bytes with ASCII digits.
|
||||
// These bytes are already ASCII chars, so cannot produce an invalid UTF-8 string.
|
||||
// Writes are always in bounds (`bytes` is redefined after string grows due to `push`).
|
||||
unsafe {
|
||||
let name_is_unique = |bytes: &[u8]| {
|
||||
let name = str::from_utf8_unchecked(bytes);
|
||||
!uid_names.contains(name)
|
||||
};
|
||||
|
||||
// Try single-digit postfixes (i.e. `_temp2`, `_temp3` ... `_temp9`)
|
||||
name.push('2');
|
||||
if name_is_unique(&name) {
|
||||
return name;
|
||||
}
|
||||
for c in b'3'..=b'9' {
|
||||
name.replace_range(name.len() - 1.., str::from_utf8(&[c]).unwrap());
|
||||
if name_is_unique(&name) {
|
||||
// Try the name without a numerical postfix (i.e. plain `_temp`)
|
||||
let bytes = name.as_bytes_mut();
|
||||
if name_is_unique(bytes) {
|
||||
return name;
|
||||
}
|
||||
}
|
||||
|
||||
// Try double-digit postfixes (i.e. `_temp10` ... `_temp99`)
|
||||
name.replace_range(name.len() - 1.., "1");
|
||||
name.push('0');
|
||||
let mut c1 = b'1';
|
||||
loop {
|
||||
if name_is_unique(&name) {
|
||||
// Try single-digit postfixes (i.e. `_temp2`, `_temp3` ... `_temp9`)
|
||||
name.push('2');
|
||||
let bytes = name.as_bytes_mut();
|
||||
if name_is_unique(bytes) {
|
||||
return name;
|
||||
}
|
||||
for c2 in b'1'..=b'9' {
|
||||
name.replace_range(name.len() - 1.., str::from_utf8(&[c2]).unwrap());
|
||||
if name_is_unique(&name) {
|
||||
|
||||
let last_index = bytes.len() - 1;
|
||||
for c in b'3'..=b'9' {
|
||||
*bytes.get_unchecked_mut(last_index) = c;
|
||||
if name_is_unique(bytes) {
|
||||
return name;
|
||||
}
|
||||
}
|
||||
if c1 == b'9' {
|
||||
break;
|
||||
|
||||
// Try double-digit postfixes (i.e. `_temp10` ... `_temp99`)
|
||||
*bytes.get_unchecked_mut(last_index) = b'1';
|
||||
name.push('0');
|
||||
let bytes = name.as_bytes_mut();
|
||||
let last_index = last_index + 1;
|
||||
|
||||
let mut c1 = b'1';
|
||||
loop {
|
||||
if name_is_unique(bytes) {
|
||||
return name;
|
||||
}
|
||||
for c2 in b'1'..=b'9' {
|
||||
*bytes.get_unchecked_mut(last_index) = c2;
|
||||
if name_is_unique(bytes) {
|
||||
return name;
|
||||
}
|
||||
}
|
||||
if c1 == b'9' {
|
||||
break;
|
||||
}
|
||||
c1 += 1;
|
||||
|
||||
let last_two: &mut [u8; 2] =
|
||||
bytes.get_unchecked_mut(last_index - 1..=last_index).try_into().unwrap();
|
||||
*last_two = [c1, b'0'];
|
||||
}
|
||||
c1 += 1;
|
||||
name.replace_range(name.len() - 2.., str::from_utf8(&[c1, b'0']).unwrap());
|
||||
}
|
||||
|
||||
// Try longer postfixes (`_temp100` upwards)
|
||||
let name_base = {
|
||||
name.pop();
|
||||
name.pop();
|
||||
&*name
|
||||
};
|
||||
for n in 100..=usize::MAX {
|
||||
let name = format_compact!("{}{}", name_base, n);
|
||||
if name_is_unique(&name) {
|
||||
|
||||
// Reserve space for 1 more byte for the additional 3rd digit.
|
||||
// Do this here so that `name.push_str(digits)` will never need to grow the string until it reaches
|
||||
// `n == 1000`, which makes the branch on "is there sufficient capacity to push?" in the loop below
|
||||
// completely predictable for `n < 1000`.
|
||||
name.reserve(1);
|
||||
|
||||
// At this point, `name` has had 2 digits added on end. `base_len` is length without those 2 digits.
|
||||
let base_len = name.len() - 2;
|
||||
|
||||
let mut buffer = ItoaBuffer::new();
|
||||
for n in 100..=u32::MAX {
|
||||
let digits = buffer.format(n);
|
||||
// SAFETY: `base_len` is always shorter than current `name.len()`, on a UTF-8 char boundary,
|
||||
// and `name` contains at least `base_len` initialized bytes
|
||||
unsafe { name.set_len(base_len) };
|
||||
name.push_str(digits);
|
||||
if !uid_names.contains(name.as_str()) {
|
||||
return name;
|
||||
}
|
||||
}
|
||||
|
||||
panic!("Cannot generate UID");
|
||||
// Limit for size of source text is `u32::MAX` bytes, so there cannot be `u32::MAX`
|
||||
// identifier names in the AST. So loop above cannot fail to find an unused name.
|
||||
unreachable!();
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
#[test]
|
||||
fn test_get_unique_name() {
|
||||
let cases: &[(&[&str], &str, &str)] = &[
|
||||
(&[], "foo", "_foo"),
|
||||
(&["_foo"], "foo", "_foo2"),
|
||||
(&["_foo0", "_foo1"], "foo", "_foo"),
|
||||
(&["_foo2", "_foo3", "_foo4"], "foo", "_foo"),
|
||||
(&["_foo", "_foo2"], "foo", "_foo3"),
|
||||
(&["_foo", "_foo2", "_foo4"], "foo", "_foo3"),
|
||||
(&["_foo", "_foo2", "_foo3", "_foo4", "_foo5", "_foo6", "_foo7", "_foo8"], "foo", "_foo9"),
|
||||
(
|
||||
&["_foo", "_foo2", "_foo3", "_foo4", "_foo5", "_foo6", "_foo7", "_foo8", "_foo9"],
|
||||
"foo",
|
||||
"_foo10",
|
||||
),
|
||||
(
|
||||
&[
|
||||
"_foo", "_foo2", "_foo3", "_foo4", "_foo5", "_foo6", "_foo7", "_foo8", "_foo9",
|
||||
"_foo10",
|
||||
],
|
||||
"foo",
|
||||
"_foo11",
|
||||
),
|
||||
(
|
||||
&[
|
||||
"_foo", "_foo2", "_foo3", "_foo4", "_foo5", "_foo6", "_foo7", "_foo8", "_foo9",
|
||||
"_foo10", "_foo11",
|
||||
],
|
||||
"foo",
|
||||
"_foo12",
|
||||
),
|
||||
(
|
||||
&[
|
||||
"_foo", "_foo2", "_foo3", "_foo4", "_foo5", "_foo6", "_foo7", "_foo8", "_foo9",
|
||||
"_foo10", "_foo11", "_foo12", "_foo13", "_foo14", "_foo15", "_foo16", "_foo17",
|
||||
"_foo18",
|
||||
],
|
||||
"foo",
|
||||
"_foo19",
|
||||
),
|
||||
(
|
||||
&[
|
||||
"_foo", "_foo2", "_foo3", "_foo4", "_foo5", "_foo6", "_foo7", "_foo8", "_foo9",
|
||||
"_foo10", "_foo11", "_foo12", "_foo13", "_foo14", "_foo15", "_foo16", "_foo17",
|
||||
"_foo18", "_foo19",
|
||||
],
|
||||
"foo",
|
||||
"_foo20",
|
||||
),
|
||||
(
|
||||
&[
|
||||
"_foo", "_foo2", "_foo3", "_foo4", "_foo5", "_foo6", "_foo7", "_foo8", "_foo9",
|
||||
"_foo10", "_foo11", "_foo12", "_foo13", "_foo14", "_foo15", "_foo16", "_foo17",
|
||||
"_foo18", "_foo19", "_foo20",
|
||||
],
|
||||
"foo",
|
||||
"_foo21",
|
||||
),
|
||||
(
|
||||
&[
|
||||
"_foo", "_foo2", "_foo3", "_foo4", "_foo5", "_foo6", "_foo7", "_foo8", "_foo9",
|
||||
"_foo10", "_foo11", "_foo12", "_foo13", "_foo14", "_foo15", "_foo16", "_foo17",
|
||||
"_foo18", "_foo19", "_foo20", "_foo21", "_foo22", "_foo23", "_foo24", "_foo25",
|
||||
"_foo26", "_foo27", "_foo28", "_foo29", "_foo30", "_foo31", "_foo32", "_foo33",
|
||||
"_foo34", "_foo35", "_foo36", "_foo37", "_foo38", "_foo39", "_foo40", "_foo41",
|
||||
"_foo42", "_foo43", "_foo44", "_foo45", "_foo46", "_foo47", "_foo48", "_foo49",
|
||||
"_foo50", "_foo51", "_foo52", "_foo53", "_foo54", "_foo55", "_foo56", "_foo57",
|
||||
"_foo58", "_foo59", "_foo60", "_foo61", "_foo62", "_foo63", "_foo64", "_foo65",
|
||||
"_foo66", "_foo67", "_foo68", "_foo69", "_foo70", "_foo71", "_foo72", "_foo73",
|
||||
"_foo74", "_foo75", "_foo76", "_foo77", "_foo78", "_foo79", "_foo80", "_foo81",
|
||||
"_foo82", "_foo83", "_foo84", "_foo85", "_foo86", "_foo87", "_foo88", "_foo89",
|
||||
"_foo90", "_foo91", "_foo92", "_foo93", "_foo94", "_foo95", "_foo96", "_foo97",
|
||||
"_foo98",
|
||||
],
|
||||
"foo",
|
||||
"_foo99",
|
||||
),
|
||||
(
|
||||
&[
|
||||
"_foo", "_foo2", "_foo3", "_foo4", "_foo5", "_foo6", "_foo7", "_foo8", "_foo9",
|
||||
"_foo10", "_foo11", "_foo12", "_foo13", "_foo14", "_foo15", "_foo16", "_foo17",
|
||||
"_foo18", "_foo19", "_foo20", "_foo21", "_foo22", "_foo23", "_foo24", "_foo25",
|
||||
"_foo26", "_foo27", "_foo28", "_foo29", "_foo30", "_foo31", "_foo32", "_foo33",
|
||||
"_foo34", "_foo35", "_foo36", "_foo37", "_foo38", "_foo39", "_foo40", "_foo41",
|
||||
"_foo42", "_foo43", "_foo44", "_foo45", "_foo46", "_foo47", "_foo48", "_foo49",
|
||||
"_foo50", "_foo51", "_foo52", "_foo53", "_foo54", "_foo55", "_foo56", "_foo57",
|
||||
"_foo58", "_foo59", "_foo60", "_foo61", "_foo62", "_foo63", "_foo64", "_foo65",
|
||||
"_foo66", "_foo67", "_foo68", "_foo69", "_foo70", "_foo71", "_foo72", "_foo73",
|
||||
"_foo74", "_foo75", "_foo76", "_foo77", "_foo78", "_foo79", "_foo80", "_foo81",
|
||||
"_foo82", "_foo83", "_foo84", "_foo85", "_foo86", "_foo87", "_foo88", "_foo89",
|
||||
"_foo90", "_foo91", "_foo92", "_foo93", "_foo94", "_foo95", "_foo96", "_foo97",
|
||||
"_foo98", "_foo99",
|
||||
],
|
||||
"foo",
|
||||
"_foo100",
|
||||
),
|
||||
(
|
||||
&[
|
||||
"_foo", "_foo2", "_foo3", "_foo4", "_foo5", "_foo6", "_foo7", "_foo8", "_foo9",
|
||||
"_foo10", "_foo11", "_foo12", "_foo13", "_foo14", "_foo15", "_foo16", "_foo17",
|
||||
"_foo18", "_foo19", "_foo20", "_foo21", "_foo22", "_foo23", "_foo24", "_foo25",
|
||||
"_foo26", "_foo27", "_foo28", "_foo29", "_foo30", "_foo31", "_foo32", "_foo33",
|
||||
"_foo34", "_foo35", "_foo36", "_foo37", "_foo38", "_foo39", "_foo40", "_foo41",
|
||||
"_foo42", "_foo43", "_foo44", "_foo45", "_foo46", "_foo47", "_foo48", "_foo49",
|
||||
"_foo50", "_foo51", "_foo52", "_foo53", "_foo54", "_foo55", "_foo56", "_foo57",
|
||||
"_foo58", "_foo59", "_foo60", "_foo61", "_foo62", "_foo63", "_foo64", "_foo65",
|
||||
"_foo66", "_foo67", "_foo68", "_foo69", "_foo70", "_foo71", "_foo72", "_foo73",
|
||||
"_foo74", "_foo75", "_foo76", "_foo77", "_foo78", "_foo79", "_foo80", "_foo81",
|
||||
"_foo82", "_foo83", "_foo84", "_foo85", "_foo86", "_foo87", "_foo88", "_foo89",
|
||||
"_foo90", "_foo91", "_foo92", "_foo93", "_foo94", "_foo95", "_foo96", "_foo97",
|
||||
"_foo98", "_foo99", "_foo100",
|
||||
],
|
||||
"foo",
|
||||
"_foo101",
|
||||
),
|
||||
];
|
||||
|
||||
for (used, name, expected) in cases {
|
||||
let used = used.iter().map(|name| CompactStr::from(*name)).collect();
|
||||
assert_eq!(get_unique_name(name, &used), expected);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue