From bcdc658bbb4276e71b1a12c984288d15ed8f9ffd Mon Sep 17 00:00:00 2001 From: overlookmotel Date: Fri, 24 May 2024 10:19:02 +0100 Subject: [PATCH] feat(transformer): add `TraverseCtx::generate_uid` (#3394) Add `TraverseCtx::generate_uid` method. This is modelled on Babel's `scope.generateUid()` method. As discussed in https://github.com/oxc-project/oxc/discussions/3251#discussioncomment-9416826, this is required to fix most of the remaining failing tests in transformer Milestone 1. I have implemented this to work as closely as possible to Babel, so that it will generate same output as Babel for our tests. However, as mentioned in the code comments, this means it's a pretty expensive function to call. Those code comments suggest 2 ways in which we could make it much more efficient, but we'd need to decide how we're going to handle divergence from Babel before we can decide which route to go. I've left it as a `TODO(improve-on-babel)` for now. --- Cargo.lock | 1 + crates/oxc_semantic/src/scope.rs | 28 +-- crates/oxc_traverse/Cargo.toml | 3 +- crates/oxc_traverse/src/context/mod.rs | 25 ++- crates/oxc_traverse/src/context/scoping.rs | 210 ++++++++++++++++++++- 5 files changed, 237 insertions(+), 30 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 582dcaa5b..f9c2f7114 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1737,6 +1737,7 @@ dependencies = [ name = "oxc_traverse" version = "0.13.1" dependencies = [ + "compact_str", "memoffset", "oxc_allocator", "oxc_ast", diff --git a/crates/oxc_semantic/src/scope.rs b/crates/oxc_semantic/src/scope.rs index 3689313b1..66ff5135c 100644 --- a/crates/oxc_semantic/src/scope.rs +++ b/crates/oxc_semantic/src/scope.rs @@ -1,7 +1,7 @@ use std::hash::BuildHasherDefault; use indexmap::IndexMap; -use oxc_ast::{ast::Expression, syntax_directed_operations::GatherNodeParts}; + use oxc_index::IndexVec; use oxc_span::CompactStr; pub use oxc_syntax::scope::{ScopeFlags, ScopeId}; @@ -175,30 +175,4 @@ impl ScopeTree { ) -> &mut UnresolvedReferences { &mut self.unresolved_references[scope_id] } - - // TODO: - // - pub fn generate_uid_based_on_node(&self, expr: &Expression) -> CompactStr { - let mut parts = std::vec::Vec::with_capacity(1); - expr.gather(&mut |part| parts.push(part)); - let name = parts.join("$"); - let name = name.trim_start_matches('_'); - self.generate_uid(name) - } - - // - pub fn generate_uid(&self, name: &str) -> CompactStr { - for i in 0.. { - let name = Self::internal_generate_uid(name, i); - if !self.has_binding(ScopeId::new(0), &name) { - return name; - } - } - unreachable!() - } - - // - fn internal_generate_uid(name: &str, i: i32) -> CompactStr { - CompactStr::from(if i > 1 { format!("_{name}{i}") } else { format!("_{name}") }) - } } diff --git a/crates/oxc_traverse/Cargo.toml b/crates/oxc_traverse/Cargo.toml index fb99d6212..131cb7b45 100644 --- a/crates/oxc_traverse/Cargo.toml +++ b/crates/oxc_traverse/Cargo.toml @@ -25,7 +25,8 @@ oxc_semantic = { workspace = true } oxc_span = { workspace = true } oxc_syntax = { workspace = true } -memoffset = { workspace = true } +compact_str = { workspace = true } +memoffset = { workspace = true } [dev-dependencies] trybuild = { workspace = true } diff --git a/crates/oxc_traverse/src/context/mod.rs b/crates/oxc_traverse/src/context/mod.rs index 74ed4129a..1f067ce7a 100644 --- a/crates/oxc_traverse/src/context/mod.rs +++ b/crates/oxc_traverse/src/context/mod.rs @@ -1,7 +1,11 @@ use oxc_allocator::{Allocator, Box}; use oxc_ast::AstBuilder; use oxc_semantic::{ScopeTree, SymbolTable}; -use oxc_syntax::scope::{ScopeFlags, ScopeId}; +use oxc_span::CompactStr; +use oxc_syntax::{ + scope::{ScopeFlags, ScopeId}, + symbol::SymbolFlags, +}; use crate::ancestor::{Ancestor, AncestorType}; @@ -269,6 +273,25 @@ impl<'a> TraverseCtx<'a> { { self.scoping.find_scope_by_flags(finder) } + + /// Generate UID. + /// + /// This is a shortcut for `ctx.scoping.generate_uid`. + pub fn generate_uid( + &mut self, + name: &str, + scope_id: ScopeId, + flags: SymbolFlags, + ) -> CompactStr { + self.scoping.generate_uid(name, scope_id, flags) + } + + /// Generate UID in current scope. + /// + /// This is a shortcut for `ctx.scoping.generate_uid_in_current_scope`. + pub fn generate_uid_in_current_scope(&mut self, name: &str, flags: SymbolFlags) -> CompactStr { + self.scoping.generate_uid_in_current_scope(name, flags) + } } // Methods used internally within crate diff --git a/crates/oxc_traverse/src/context/scoping.rs b/crates/oxc_traverse/src/context/scoping.rs index 513a18ce7..18aabccda 100644 --- a/crates/oxc_traverse/src/context/scoping.rs +++ b/crates/oxc_traverse/src/context/scoping.rs @@ -1,5 +1,13 @@ +use std::str; + +use compact_str::{format_compact, CompactString}; + use oxc_semantic::{ScopeTree, SymbolTable}; -use oxc_syntax::scope::{ScopeFlags, ScopeId}; +use oxc_span::{CompactStr, SPAN}; +use oxc_syntax::{ + scope::{ScopeFlags, ScopeId}, + symbol::SymbolFlags, +}; use super::FinderRet; @@ -92,6 +100,95 @@ impl TraverseScoping { finder(flags) }) } + + /// Generate UID. + /// + /// Finds a unique variable name which does clash with any other variables used in the program. + /// Generates a binding for it in scope provided. + /// + /// Based on Babel's `scope.generateUid` logic. + /// + /// + /// # Differences from Babel + /// + /// This implementation aims to replicate Babel's behavior, but differs from Babel + /// in the following ways: + /// + /// 1. Does not check that name is a valid JS identifier name. + /// In most cases, we'll be creating a UID based on an existing variable name, in which case + /// this check is redundant. + /// Caller must ensure `name` is a valid JS identifier, after a `_` is prepended on start. + /// The fact that a `_` will be prepended on start means providing an empty string or a string + /// starting with a digit (0-9) is fine. + /// + /// 2. Does not convert to camel case. + /// This seems unimportant. + /// + /// 3. Does not check var name against list of globals or "contextVariables" + /// (which Babel does in `hasBinding`). + /// No globals or "contextVariables" start with `_` anyway, so no need for this check. + /// + /// 4. Does not check this name is unique if used as a named statement label, only that it's unique + /// as an identifier. + /// If we need to generate unique labels for named statements, we should create a separate method + /// `generate_uid_label`. + /// + /// 5. Does not check against list of other UIDs that have been created. + /// `TraverseScoping::generate_uid` adds this name to symbols table, so when creating next UID, + /// this one will be found and avoided, like any other existing binding. So it's not needed. + /// + /// # Potential improvements + /// + /// TODO(improve-on-babel) + /// + /// This function is fairly expensive, because it aims to replicate Babel's output. + /// `name_is_unique` method below searches through every single binding in the entire program + /// and does a string comparison on each. It also searches through every reference in entire program + /// (though it will avoid string comparison on most of them). + /// If the first name tried is already in use, it will repeat that entire search with a new name, + /// potentially multiple times. + /// + /// We could improve this in one of 2 ways: + /// + /// 1. Semantic generate a hash set of all identifier names used in program. + /// Check for uniqueness would then be just 1 x hashset lookup for each name that's tried. + /// This would maintain output parity with Babel. + /// But building the hash set would add some overhead to semantic. + /// + /// 2. Use a much simpler method, but with less "nicely-named" UIDs: + /// + /// * Name all UIDs `_temp0`, `_temp1` etc. + /// * During initial semantic pass, check for any existing identifiers matching `_temp\d+`. + /// * Find the existing identifier with the largest number after `_temp`. + /// * Store that number in a counter which is global across the whole program. + /// * To create UIDs, increment the counter and postfix it to `_temp`. + /// * If creating or renaming any other identifiers as part of a transform, ensure they don't clash + /// with UIDs already created, and increase the counter if they match `_temp\d+`. + /// + /// i.e. if source contains identifiers `_temp1` and `_temp30`, create UIDs named `_temp_31`, + /// `_temp32` etc. They'll all be guaranteed to be unique within the program. + /// This would make for less nicely-named identifiers, but does that really matter? + /// Minification will rename them all anyway. + pub fn generate_uid( + &mut self, + name: &str, + scope_id: ScopeId, + flags: SymbolFlags, + ) -> CompactStr { + // Get name for UID + let name = CompactStr::new(&self.find_uid_name(name)); + + // Add binding to scope + let symbol_id = self.symbols.create_symbol(SPAN, name.as_str(), flags, scope_id); + self.scopes.add_binding(scope_id, name.clone(), symbol_id); + + name + } + + /// Generate UID in current scope. + pub fn generate_uid_in_current_scope(&mut self, name: &str, flags: SymbolFlags) -> CompactStr { + self.generate_uid(name, self.current_scope_id, flags) + } } // Methods used internally within crate @@ -111,4 +208,115 @@ impl TraverseScoping { pub(crate) fn set_current_scope_id(&mut self, scope_id: ScopeId) { self.current_scope_id = scope_id; } + + /// Find a variable name which can be used as a UID + fn find_uid_name(&self, name: &str) -> CompactString { + let mut name = create_uid_name_base(name); + + // Try the name without a numerical postfix (i.e. plain `_temp`) + if self.name_is_unique(&name) { + return name; + } + + // It's fairly common that UIDs may need a numerical postfix, so we try to keep string + // operations to a minimum for postfixes up to 99 - using `replace_range` on a single + // `CompactStr`, rather than generating a new string on each attempt. + // Postfixes greater than 99 should be very uncommon, so don't bother optimizing. + + // Try single-digit postfixes (i.e. `_temp1`, `_temp2` ... `_temp9`) + name.push('2'); + if self.name_is_unique(&name) { + return name; + } + for c in b'3'..=b'9' { + name.replace_range(name.len() - 1.., str::from_utf8(&[c]).unwrap()); + if self.name_is_unique(&name) { + return name; + } + } + + // Try double-digit postfixes (i.e. `_temp10` ... `_temp99`) + name.replace_range(name.len() - 1.., "1"); + name.push('0'); + let mut c1 = b'1'; + loop { + if self.name_is_unique(&name) { + return name; + } + for c2 in b'1'..=b'9' { + name.replace_range(name.len() - 1.., str::from_utf8(&[c2]).unwrap()); + if self.name_is_unique(&name) { + return name; + } + } + if c1 == b'9' { + break; + } + c1 += 1; + name.replace_range(name.len() - 2.., str::from_utf8(&[c1, b'0']).unwrap()); + } + + // Try longer postfixes (`_temp100` upwards) + let name_base = { + name.pop(); + name.pop(); + &*name + }; + for n in 100..=usize::MAX { + let name = format_compact!("{}{}", name_base, n); + if self.name_is_unique(&name) { + return name; + } + } + + panic!("Cannot generate UID"); + } + + fn name_is_unique(&self, name: &str) -> bool { + // Check if any bindings in program with this name + if self.symbols.names.iter().any(|n| n.as_str() == name) { + return false; + } + + // Check for unbound references in program with this name + !self.symbols.references.iter().any(|reference| { + if reference.symbol_id().is_some() { + // Skip string comparison on bound references, as they'll also be in `symbols.names` + // which already checked above + false + } else { + reference.name().as_str() == name + } + }) + } +} + +/// Create base for UID name based on provided `name`. +/// i.e. if `name` is "foo", returns "_foo". +/// We use `CompactString` to avoid any allocations where `name` is less than 22 bytes (the common case). +fn create_uid_name_base(name: &str) -> CompactString { + // Trim `_`s from start, and `0-9`s from end. + // Code below is equivalent to + // `let name = name.trim_start_matches('_').trim_end_matches(|c: char| c.is_ascii_digit());` + // but more efficient as operates on bytes not chars. + let mut bytes = name.as_bytes(); + while bytes.first() == Some(&b'_') { + bytes = &bytes[1..]; + } + while matches!(bytes.last(), Some(b) if b.is_ascii_digit()) { + bytes = &bytes[0..bytes.len() - 1]; + } + // SAFETY: We started with a valid UTF8 `&str` and have only trimmed off ASCII characters, + // so remainder must still be valid UTF8 + #[allow(unsafe_code)] + let name = unsafe { str::from_utf8_unchecked(bytes) }; + + // Create `CompactString` prepending name with `_`, and with 1 byte excess capacity. + // The extra byte is to avoid reallocation if need to add a digit on the end later, + // which will not be too uncommon. + // Having to add 2 digits will be uncommon, so we don't allocate 2 extra bytes for 2 digits. + let mut str = CompactString::with_capacity(name.len() + 2); + str.push('_'); + str.push_str(name); + str }