diff --git a/Cargo.lock b/Cargo.lock index 582dcaa5b..f9c2f7114 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1737,6 +1737,7 @@ dependencies = [ name = "oxc_traverse" version = "0.13.1" dependencies = [ + "compact_str", "memoffset", "oxc_allocator", "oxc_ast", diff --git a/crates/oxc_semantic/src/scope.rs b/crates/oxc_semantic/src/scope.rs index 3689313b1..66ff5135c 100644 --- a/crates/oxc_semantic/src/scope.rs +++ b/crates/oxc_semantic/src/scope.rs @@ -1,7 +1,7 @@ use std::hash::BuildHasherDefault; use indexmap::IndexMap; -use oxc_ast::{ast::Expression, syntax_directed_operations::GatherNodeParts}; + use oxc_index::IndexVec; use oxc_span::CompactStr; pub use oxc_syntax::scope::{ScopeFlags, ScopeId}; @@ -175,30 +175,4 @@ impl ScopeTree { ) -> &mut UnresolvedReferences { &mut self.unresolved_references[scope_id] } - - // TODO: - // - pub fn generate_uid_based_on_node(&self, expr: &Expression) -> CompactStr { - let mut parts = std::vec::Vec::with_capacity(1); - expr.gather(&mut |part| parts.push(part)); - let name = parts.join("$"); - let name = name.trim_start_matches('_'); - self.generate_uid(name) - } - - // - pub fn generate_uid(&self, name: &str) -> CompactStr { - for i in 0.. { - let name = Self::internal_generate_uid(name, i); - if !self.has_binding(ScopeId::new(0), &name) { - return name; - } - } - unreachable!() - } - - // - fn internal_generate_uid(name: &str, i: i32) -> CompactStr { - CompactStr::from(if i > 1 { format!("_{name}{i}") } else { format!("_{name}") }) - } } diff --git a/crates/oxc_traverse/Cargo.toml b/crates/oxc_traverse/Cargo.toml index fb99d6212..131cb7b45 100644 --- a/crates/oxc_traverse/Cargo.toml +++ b/crates/oxc_traverse/Cargo.toml @@ -25,7 +25,8 @@ oxc_semantic = { workspace = true } oxc_span = { workspace = true } oxc_syntax = { workspace = true } -memoffset = { workspace = true } +compact_str = { workspace = true } +memoffset = { workspace = true } [dev-dependencies] trybuild = { workspace = true } diff --git a/crates/oxc_traverse/src/context/mod.rs b/crates/oxc_traverse/src/context/mod.rs index 74ed4129a..1f067ce7a 100644 --- a/crates/oxc_traverse/src/context/mod.rs +++ b/crates/oxc_traverse/src/context/mod.rs @@ -1,7 +1,11 @@ use oxc_allocator::{Allocator, Box}; use oxc_ast::AstBuilder; use oxc_semantic::{ScopeTree, SymbolTable}; -use oxc_syntax::scope::{ScopeFlags, ScopeId}; +use oxc_span::CompactStr; +use oxc_syntax::{ + scope::{ScopeFlags, ScopeId}, + symbol::SymbolFlags, +}; use crate::ancestor::{Ancestor, AncestorType}; @@ -269,6 +273,25 @@ impl<'a> TraverseCtx<'a> { { self.scoping.find_scope_by_flags(finder) } + + /// Generate UID. + /// + /// This is a shortcut for `ctx.scoping.generate_uid`. + pub fn generate_uid( + &mut self, + name: &str, + scope_id: ScopeId, + flags: SymbolFlags, + ) -> CompactStr { + self.scoping.generate_uid(name, scope_id, flags) + } + + /// Generate UID in current scope. + /// + /// This is a shortcut for `ctx.scoping.generate_uid_in_current_scope`. + pub fn generate_uid_in_current_scope(&mut self, name: &str, flags: SymbolFlags) -> CompactStr { + self.scoping.generate_uid_in_current_scope(name, flags) + } } // Methods used internally within crate diff --git a/crates/oxc_traverse/src/context/scoping.rs b/crates/oxc_traverse/src/context/scoping.rs index 513a18ce7..18aabccda 100644 --- a/crates/oxc_traverse/src/context/scoping.rs +++ b/crates/oxc_traverse/src/context/scoping.rs @@ -1,5 +1,13 @@ +use std::str; + +use compact_str::{format_compact, CompactString}; + use oxc_semantic::{ScopeTree, SymbolTable}; -use oxc_syntax::scope::{ScopeFlags, ScopeId}; +use oxc_span::{CompactStr, SPAN}; +use oxc_syntax::{ + scope::{ScopeFlags, ScopeId}, + symbol::SymbolFlags, +}; use super::FinderRet; @@ -92,6 +100,95 @@ impl TraverseScoping { finder(flags) }) } + + /// Generate UID. + /// + /// Finds a unique variable name which does clash with any other variables used in the program. + /// Generates a binding for it in scope provided. + /// + /// Based on Babel's `scope.generateUid` logic. + /// + /// + /// # Differences from Babel + /// + /// This implementation aims to replicate Babel's behavior, but differs from Babel + /// in the following ways: + /// + /// 1. Does not check that name is a valid JS identifier name. + /// In most cases, we'll be creating a UID based on an existing variable name, in which case + /// this check is redundant. + /// Caller must ensure `name` is a valid JS identifier, after a `_` is prepended on start. + /// The fact that a `_` will be prepended on start means providing an empty string or a string + /// starting with a digit (0-9) is fine. + /// + /// 2. Does not convert to camel case. + /// This seems unimportant. + /// + /// 3. Does not check var name against list of globals or "contextVariables" + /// (which Babel does in `hasBinding`). + /// No globals or "contextVariables" start with `_` anyway, so no need for this check. + /// + /// 4. Does not check this name is unique if used as a named statement label, only that it's unique + /// as an identifier. + /// If we need to generate unique labels for named statements, we should create a separate method + /// `generate_uid_label`. + /// + /// 5. Does not check against list of other UIDs that have been created. + /// `TraverseScoping::generate_uid` adds this name to symbols table, so when creating next UID, + /// this one will be found and avoided, like any other existing binding. So it's not needed. + /// + /// # Potential improvements + /// + /// TODO(improve-on-babel) + /// + /// This function is fairly expensive, because it aims to replicate Babel's output. + /// `name_is_unique` method below searches through every single binding in the entire program + /// and does a string comparison on each. It also searches through every reference in entire program + /// (though it will avoid string comparison on most of them). + /// If the first name tried is already in use, it will repeat that entire search with a new name, + /// potentially multiple times. + /// + /// We could improve this in one of 2 ways: + /// + /// 1. Semantic generate a hash set of all identifier names used in program. + /// Check for uniqueness would then be just 1 x hashset lookup for each name that's tried. + /// This would maintain output parity with Babel. + /// But building the hash set would add some overhead to semantic. + /// + /// 2. Use a much simpler method, but with less "nicely-named" UIDs: + /// + /// * Name all UIDs `_temp0`, `_temp1` etc. + /// * During initial semantic pass, check for any existing identifiers matching `_temp\d+`. + /// * Find the existing identifier with the largest number after `_temp`. + /// * Store that number in a counter which is global across the whole program. + /// * To create UIDs, increment the counter and postfix it to `_temp`. + /// * If creating or renaming any other identifiers as part of a transform, ensure they don't clash + /// with UIDs already created, and increase the counter if they match `_temp\d+`. + /// + /// i.e. if source contains identifiers `_temp1` and `_temp30`, create UIDs named `_temp_31`, + /// `_temp32` etc. They'll all be guaranteed to be unique within the program. + /// This would make for less nicely-named identifiers, but does that really matter? + /// Minification will rename them all anyway. + pub fn generate_uid( + &mut self, + name: &str, + scope_id: ScopeId, + flags: SymbolFlags, + ) -> CompactStr { + // Get name for UID + let name = CompactStr::new(&self.find_uid_name(name)); + + // Add binding to scope + let symbol_id = self.symbols.create_symbol(SPAN, name.as_str(), flags, scope_id); + self.scopes.add_binding(scope_id, name.clone(), symbol_id); + + name + } + + /// Generate UID in current scope. + pub fn generate_uid_in_current_scope(&mut self, name: &str, flags: SymbolFlags) -> CompactStr { + self.generate_uid(name, self.current_scope_id, flags) + } } // Methods used internally within crate @@ -111,4 +208,115 @@ impl TraverseScoping { pub(crate) fn set_current_scope_id(&mut self, scope_id: ScopeId) { self.current_scope_id = scope_id; } + + /// Find a variable name which can be used as a UID + fn find_uid_name(&self, name: &str) -> CompactString { + let mut name = create_uid_name_base(name); + + // Try the name without a numerical postfix (i.e. plain `_temp`) + if self.name_is_unique(&name) { + return name; + } + + // It's fairly common that UIDs may need a numerical postfix, so we try to keep string + // operations to a minimum for postfixes up to 99 - using `replace_range` on a single + // `CompactStr`, rather than generating a new string on each attempt. + // Postfixes greater than 99 should be very uncommon, so don't bother optimizing. + + // Try single-digit postfixes (i.e. `_temp1`, `_temp2` ... `_temp9`) + name.push('2'); + if self.name_is_unique(&name) { + return name; + } + for c in b'3'..=b'9' { + name.replace_range(name.len() - 1.., str::from_utf8(&[c]).unwrap()); + if self.name_is_unique(&name) { + return name; + } + } + + // Try double-digit postfixes (i.e. `_temp10` ... `_temp99`) + name.replace_range(name.len() - 1.., "1"); + name.push('0'); + let mut c1 = b'1'; + loop { + if self.name_is_unique(&name) { + return name; + } + for c2 in b'1'..=b'9' { + name.replace_range(name.len() - 1.., str::from_utf8(&[c2]).unwrap()); + if self.name_is_unique(&name) { + return name; + } + } + if c1 == b'9' { + break; + } + c1 += 1; + name.replace_range(name.len() - 2.., str::from_utf8(&[c1, b'0']).unwrap()); + } + + // Try longer postfixes (`_temp100` upwards) + let name_base = { + name.pop(); + name.pop(); + &*name + }; + for n in 100..=usize::MAX { + let name = format_compact!("{}{}", name_base, n); + if self.name_is_unique(&name) { + return name; + } + } + + panic!("Cannot generate UID"); + } + + fn name_is_unique(&self, name: &str) -> bool { + // Check if any bindings in program with this name + if self.symbols.names.iter().any(|n| n.as_str() == name) { + return false; + } + + // Check for unbound references in program with this name + !self.symbols.references.iter().any(|reference| { + if reference.symbol_id().is_some() { + // Skip string comparison on bound references, as they'll also be in `symbols.names` + // which already checked above + false + } else { + reference.name().as_str() == name + } + }) + } +} + +/// Create base for UID name based on provided `name`. +/// i.e. if `name` is "foo", returns "_foo". +/// We use `CompactString` to avoid any allocations where `name` is less than 22 bytes (the common case). +fn create_uid_name_base(name: &str) -> CompactString { + // Trim `_`s from start, and `0-9`s from end. + // Code below is equivalent to + // `let name = name.trim_start_matches('_').trim_end_matches(|c: char| c.is_ascii_digit());` + // but more efficient as operates on bytes not chars. + let mut bytes = name.as_bytes(); + while bytes.first() == Some(&b'_') { + bytes = &bytes[1..]; + } + while matches!(bytes.last(), Some(b) if b.is_ascii_digit()) { + bytes = &bytes[0..bytes.len() - 1]; + } + // SAFETY: We started with a valid UTF8 `&str` and have only trimmed off ASCII characters, + // so remainder must still be valid UTF8 + #[allow(unsafe_code)] + let name = unsafe { str::from_utf8_unchecked(bytes) }; + + // Create `CompactString` prepending name with `_`, and with 1 byte excess capacity. + // The extra byte is to avoid reallocation if need to add a digit on the end later, + // which will not be too uncommon. + // Having to add 2 digits will be uncommon, so we don't allocate 2 extra bytes for 2 digits. + let mut str = CompactString::with_capacity(name.len() + 2); + str.push('_'); + str.push_str(name); + str }