diff --git a/crates/oxc_linter/src/snapshots/no_invalid_regexp.snap b/crates/oxc_linter/src/snapshots/no_invalid_regexp.snap index fcf8911ec..ce3ca46e0 100644 --- a/crates/oxc_linter/src/snapshots/no_invalid_regexp.snap +++ b/crates/oxc_linter/src/snapshots/no_invalid_regexp.snap @@ -200,7 +200,7 @@ source: crates/oxc_linter/src/tester.rs ╰──── ⚠ eslint(no-invalid-regexp): Invalid regular expression: Duplicated capturing group names - ╭─[no_invalid_regexp.tsx:1:23] + ╭─[no_invalid_regexp.tsx:1:16] 1 │ new RegExp('(?a)(?b)') - · ─ + · ─ ─ ╰──── diff --git a/crates/oxc_regular_expression/README.md b/crates/oxc_regular_expression/README.md index b2e10ad66..facadc40e 100644 --- a/crates/oxc_regular_expression/README.md +++ b/crates/oxc_regular_expression/README.md @@ -5,3 +5,8 @@ Implements ECMAScript® 2024 Language Specification - https://tc39.es/ecma262/2024/multipage/ecmascript-language-lexical-grammar.html#sec-literals-regular-expression-literals - https://tc39.es/ecma262/2024/multipage/text-processing.html#sec-regexp-regular-expression-objects - https://tc39.es/ecma262/2024/multipage/additional-ecmascript-features-for-web-browsers.html#sec-regular-expressions-patterns + +And, Stage 3 proposals + +- https://github.com/tc39/proposal-duplicate-named-capturing-groups +- https://github.com/tc39/proposal-regexp-modifiers diff --git a/crates/oxc_regular_expression/src/parser/mod.rs b/crates/oxc_regular_expression/src/parser/mod.rs index 4b7d5df8e..c53684cdb 100644 --- a/crates/oxc_regular_expression/src/parser/mod.rs +++ b/crates/oxc_regular_expression/src/parser/mod.rs @@ -101,10 +101,17 @@ mod test { (r"\1()", "u"), (r"(?..)(?..)", ""), // ES2025 --- - // TODO: Duplicate named capturing groups - // (r"(?..)|(?..)", ""), - // (r"(?[0-9]{4})-[0-9]{2}|[0-9]{2}-(?[0-9]{4})", ""), - // (r"(?:(?x)|(?y))\k", ""), + // Duplicate named capturing groups + (r"(?..)|(?..)", ""), + (r"(?[0-9]{4})-[0-9]{2}|[0-9]{2}-(?[0-9]{4})", ""), + (r"(?:(?x)|(?y))\k", ""), + (r"(?a)|(?b)", ""), + (r"(?:(?a)|(?a)(?b))(?:(?c)|(?d))", ""), + (r"(?:(?a)|(?b))\\k", ""), + (r"(?:(?:(?a)|(?b)|c)\\k){2}", ""), + (r"(?:(?:(?a)|(?b))\\k){2}", ""), + (r"(?:(?:(?a)\\k|(?b)\\k)|(?:))\\k", ""), + (r"(?:(?:(?a\\k)|(?b\\k))|(?:))\\k", ""), // Modifiers (r"(?:.)", ""), (r"(?s:.)", ""), @@ -190,9 +197,15 @@ mod test { (r"[\q{]", "v"), (r"[\q{\a}]", "v"), // ES2025 --- - // TODO: Duplicate named capturing groups - (r"(?..)|(?..)", ""), // This will be valid - // (r"(?|(?))", ""), // Nested, still invalid + // Duplicate named capturing groups + (r"(?.)(?.)", ""), + (r"(?.(?..))", "u"), + ("(?)|(?)(?)", ""), + ("(((((((?.)))))))(?)", ""), + ("(?:(?a)|(?b))(?c)", ""), + ("(?a)(?:(?b)|(?c))", ""), + ("(?:(?:(?a)|(?b)))(?c)", ""), + ("(?:(?:(?a)|(?b))|(?:))(?c)", ""), // Modifiers (r"(?a:.)", ""), (r"(?-S:.)", ""), @@ -255,6 +268,11 @@ mod test { (r"[[[[[^[[[[\q{ng}]]]]]]]]]", "v", true), (r"[^[[[[[[[[[[[[[[[[\q{ng}]]]]]]]]]]]]]]]]]", "v", true), // ES2025 --- + // Duplicated named capture groups + ("(?:(?a)|(?b))(?c)", "", true), + ("(?:(?a)|(?b))(?c)", "", false), + ("(?a)(?:(?b)|(?c))", "", true), + ("(?a)|(?:(?b)|(?c))", "", false), // Modifiers (r"(?ii:.)", "", true), (r"(?-ss:.)", "", true), diff --git a/crates/oxc_regular_expression/src/parser/pattern_parser/pattern_parser_impl.rs b/crates/oxc_regular_expression/src/parser/pattern_parser/pattern_parser_impl.rs index 9d4e566f0..75697d54c 100644 --- a/crates/oxc_regular_expression/src/parser/pattern_parser/pattern_parser_impl.rs +++ b/crates/oxc_regular_expression/src/parser/pattern_parser/pattern_parser_impl.rs @@ -46,8 +46,14 @@ impl<'a> PatternParser<'a> { // - Cons: 1st pass is completely useless if the pattern does not contain any capturing groups // We may re-consider this if we need more performance rather than simplicity. let checkpoint = self.reader.checkpoint(); - let duplicated_named_capturing_groups = - self.state.initialize_with_parsing(&mut self.reader); + + // [SS:EE] Pattern :: Disjunction + // It is a Syntax Error if Pattern contains two or more GroupSpecifiers for which the CapturingGroupName of GroupSpecifier is the same. + self.state.initialize_with_parsing(&mut self.reader).map_err(|offsets| { + diagnostics::duplicated_capturing_group_names( + offsets.iter().map(|&(start, end)| self.span_factory.create(start, end)).collect(), + ) + })?; self.reader.rewind(checkpoint); // [SS:EE] Pattern :: Disjunction @@ -58,16 +64,6 @@ impl<'a> PatternParser<'a> { if u32::MAX == self.state.num_of_capturing_groups { return Err(diagnostics::too_may_capturing_groups(self.span_factory.create(0, 0))); } - // [SS:EE] Pattern :: Disjunction - // It is a Syntax Error if Pattern contains two or more GroupSpecifiers for which the CapturingGroupName of GroupSpecifier is the same. - if !duplicated_named_capturing_groups.is_empty() { - return Err(diagnostics::duplicated_capturing_group_names( - duplicated_named_capturing_groups - .iter() - .map(|&(start, end)| self.span_factory.create(start, end)) - .collect(), - )); - } // Let's start parsing! let span_start = self.reader.offset(); diff --git a/crates/oxc_regular_expression/src/parser/pattern_parser/state.rs b/crates/oxc_regular_expression/src/parser/pattern_parser/state.rs index 320c1df04..7fa3732b1 100644 --- a/crates/oxc_regular_expression/src/parser/pattern_parser/state.rs +++ b/crates/oxc_regular_expression/src/parser/pattern_parser/state.rs @@ -1,5 +1,5 @@ use oxc_span::Atom; -use rustc_hash::FxHashSet; +use rustc_hash::{FxHashMap, FxHashSet}; use crate::parser::reader::Reader; @@ -16,6 +16,8 @@ pub struct State<'a> { pub capturing_group_names: FxHashSet>, } +type DuplicatedNamedCapturingGroupOffsets = Vec<(u32, u32)>; + impl<'a> State<'a> { pub fn new(unicode_mode: bool, unicode_sets_mode: bool) -> Self { Self { @@ -27,12 +29,11 @@ impl<'a> State<'a> { } } - pub fn initialize_with_parsing(&mut self, reader: &mut Reader<'a>) -> Vec<(u32, u32)> { - let ( - num_of_left_capturing_parens, - capturing_group_names, - duplicated_named_capturing_groups, - ) = parse_capturing_groups(reader); + pub fn initialize_with_parsing( + &mut self, + reader: &mut Reader<'a>, + ) -> Result<(), DuplicatedNamedCapturingGroupOffsets> { + let (num_of_left_capturing_parens, capturing_group_names) = parse_capturing_groups(reader)?; // In Annex B, this is `false` by default. // It is `true` @@ -44,26 +45,43 @@ impl<'a> State<'a> { self.num_of_capturing_groups = num_of_left_capturing_parens; self.capturing_group_names = capturing_group_names; - duplicated_named_capturing_groups + Ok(()) } } -/// Returns: (num_of_left_parens, capturing_group_names, duplicated_named_capturing_groups) +enum SimpleUnit<'a> { + Open, + Close, + Pipe, + GroupName(Atom<'a>), +} + +/// Returns: Result<(num_of_left_parens, capturing_group_names), duplicated_named_capturing_group_offsets> fn parse_capturing_groups<'a>( reader: &mut Reader<'a>, -) -> (u32, FxHashSet>, Vec<(u32, u32)>) { - let mut num_of_left_capturing_parens = 0; - let mut capturing_group_names = FxHashSet::default(); - let mut duplicated_named_capturing_groups = vec![]; - - let mut in_escape = false; - let mut in_character_class = false; - +) -> Result<(u32, FxHashSet>), DuplicatedNamedCapturingGroupOffsets> { // Count only normal CapturingGroup(named, unnamed) // (?...), (...) // IgnoreGroup, and LookaroundAssertions are ignored // (?:...) // (?=...), (?!...), (?<=...), (?, (u32, u32)> = FxHashMap::default(); + // At the same time, check duplicates + // If you want to process this most efficiently: + // - define a scope for each Disjunction + // - then check for duplicates for each `|` while inheriting the parent-child relationship + // ref. https://source.chromium.org/chromium/chromium/src/+/main:v8/src/regexp/regexp-parser.cc;l=1644 + // However, duplicates are rare in the first place. + // And as long as it works simply, this may be enough. + let mut may_duplicates: FxHashMap, DuplicatedNamedCapturingGroupOffsets> = + FxHashMap::default(); + let mut simplified: Vec> = vec![]; + + let mut in_escape = false; + let mut in_character_class = false; while let Some(cp) = reader.peek() { if in_escape { in_escape = false; @@ -73,9 +91,15 @@ fn parse_capturing_groups<'a>( in_character_class = true; } else if cp == ']' as u32 { in_character_class = false; + } else if !in_character_class && cp == '|' as u32 { + simplified.push(SimpleUnit::Pipe); + } else if !in_character_class && cp == ')' as u32 { + simplified.push(SimpleUnit::Close); } else if !in_character_class && cp == '(' as u32 { reader.advance(); + simplified.push(SimpleUnit::Open); + // Skip IgnoreGroup if reader.eat2('?', ':') // Skip LookAroundAssertion @@ -102,20 +126,84 @@ fn parse_capturing_groups<'a>( let span_end = reader.offset(); if reader.eat('>') { - // May be duplicated - if !capturing_group_names.insert(reader.atom(span_start, span_end)) { - // Report them with `Span` - duplicated_named_capturing_groups.push((span_start, span_end)); + let group_name = reader.atom(span_start, span_end); + + simplified.push(SimpleUnit::GroupName(group_name.clone())); + // Check duplicates later + if let Some(last_span) = group_names.get(&group_name) { + let entry = may_duplicates.entry(group_name).or_default(); + entry.push(*last_span); + entry.push((span_start, span_end)); + } else { + group_names.insert(group_name, (span_start, span_end)); } + continue; } } + + // Unnamed + continue; } reader.advance(); } - (num_of_left_capturing_parens, capturing_group_names, duplicated_named_capturing_groups) + // Check duplicates and emit error if exists + if !may_duplicates.is_empty() { + // Check must be done for each group name + for (group_name, spans) in may_duplicates { + let iter = simplified.iter().clone(); + + let mut alternative_depth = FxHashSet::default(); + let mut depth = 0_u32; + let mut is_first = true; + + 'outer: for token in iter { + match token { + SimpleUnit::Open => { + depth += 1; + } + SimpleUnit::Close => { + // May panic if the pattern has invalid, unbalanced parens + depth = depth.saturating_sub(1); + } + SimpleUnit::Pipe => { + if !is_first { + alternative_depth.insert(depth); + } + } + SimpleUnit::GroupName(name) => { + // Check target group name only + if *name != group_name { + continue; + } + // Skip the first one, because it is not duplicated + if is_first { + is_first = false; + continue; + } + + // If left outer `|` is found, both can participate + // `|(?)` + // ^ ^ depth: 1 + // ^ depth: 0 + for i in (0..depth).rev() { + if alternative_depth.contains(&i) { + // Remove it, next duplicates requires another `|` + alternative_depth.remove(&i); + continue 'outer; + } + } + + return Err(spans); + } + } + } + } + } + + Ok((num_of_left_capturing_parens, group_names.keys().cloned().collect())) } #[cfg(test)] @@ -123,35 +211,39 @@ mod tests { use super::*; #[test] - fn test_count_capturing_groups() { + fn count_capturing_groups() { for (source_text, expected) in [ - ("()", (1, 0, false)), - (r"\1()", (1, 0, false)), - ("(foo)", (1, 0, false)), - ("(foo)(bar)", (2, 0, false)), - ("(foo(bar))", (2, 0, false)), - ("(foo)[(bar)]", (1, 0, false)), - (r"(foo)\(bar\)", (1, 0, false)), - ("(foo)(?bar)", (2, 1, false)), - ("(foo)(?=...)(?!...)(?<=...)(?bar)(?baz)", (3, 2, false)), - ("(?.)(?..)", (2, 1, true)), - ("(?.(?..))", (2, 1, true)), + ("()", (1, 0)), + (r"\1()", (1, 0)), + ("(foo)", (1, 0)), + ("(foo)(bar)", (2, 0)), + ("(foo(bar))", (2, 0)), + ("(foo)[(bar)]", (1, 0)), + (r"(foo)\(bar\)", (1, 0)), + ("(foo)(?bar)", (2, 1)), + ("(foo)(?=...)(?!...)(?<=...)(?bar)(?baz)", (3, 2)), + ("(?.)(?.)|(?..)|(?.)", (4, 2)), + ("(?.)(?.)|(?:..)|(?.)", (3, 2)), ] { let mut reader = Reader::initialize(source_text, true, false).unwrap(); - let ( - num_of_left_capturing_parens, - capturing_group_names, - duplicated_named_capturing_groups, - ) = parse_capturing_groups(&mut reader); + let (num_of_left_capturing_parens, capturing_group_names) = + parse_capturing_groups(&mut reader).unwrap(); - let actual = ( - num_of_left_capturing_parens, - capturing_group_names.len(), - !duplicated_named_capturing_groups.is_empty(), - ); + let actual = (num_of_left_capturing_parens, capturing_group_names.len()); assert_eq!(expected, actual, "{source_text}"); } } + + #[test] + fn duplicated_named_capturing_groups() { + for source_text in + ["(?.)(?..)", "(?.(?..))", "|(?.(?..))", "(?.)|(?.(?..))"] + { + let mut reader = Reader::initialize(source_text, true, false).unwrap(); + + assert!(parse_capturing_groups(&mut reader).is_err(), "{source_text}"); + } + } } diff --git a/tasks/coverage/snapshots/codegen_test262.snap b/tasks/coverage/snapshots/codegen_test262.snap index fd4ada3a9..325c9ffb4 100644 --- a/tasks/coverage/snapshots/codegen_test262.snap +++ b/tasks/coverage/snapshots/codegen_test262.snap @@ -1,5 +1,5 @@ commit: 06454619 codegen_test262 Summary: -AST Parsed : 43832/43832 (100.00%) -Positive Passed: 43832/43832 (100.00%) +AST Parsed : 43851/43851 (100.00%) +Positive Passed: 43851/43851 (100.00%) diff --git a/tasks/coverage/snapshots/minifier_test262.snap b/tasks/coverage/snapshots/minifier_test262.snap index 84a1c136b..7c80ba79c 100644 --- a/tasks/coverage/snapshots/minifier_test262.snap +++ b/tasks/coverage/snapshots/minifier_test262.snap @@ -1,5 +1,5 @@ commit: 06454619 minifier_test262 Summary: -AST Parsed : 43832/43832 (100.00%) -Positive Passed: 43832/43832 (100.00%) +AST Parsed : 43851/43851 (100.00%) +Positive Passed: 43851/43851 (100.00%) diff --git a/tasks/coverage/snapshots/parser_test262.snap b/tasks/coverage/snapshots/parser_test262.snap index be4fb00f7..9b1150366 100644 --- a/tasks/coverage/snapshots/parser_test262.snap +++ b/tasks/coverage/snapshots/parser_test262.snap @@ -1,8 +1,8 @@ commit: 06454619 parser_test262 Summary: -AST Parsed : 43832/43832 (100.00%) -Positive Passed: 43832/43832 (100.00%) +AST Parsed : 43851/43851 (100.00%) +Positive Passed: 43851/43851 (100.00%) Negative Passed: 4320/4322 (99.95%) Expect Syntax Error: tasks/coverage/test262/test/language/import/import-attributes/json-invalid.js Expect Syntax Error: tasks/coverage/test262/test/language/import/import-attributes/json-named-bindings.js @@ -21345,31 +21345,31 @@ Expect Syntax Error: tasks/coverage/test262/test/language/import/import-attribut ╰──── × Invalid regular expression: Duplicated capturing group names - ╭─[test262/test/language/literals/regexp/named-groups/invalid-duplicate-groupspecifier-2-u.js:20:19] + ╭─[test262/test/language/literals/regexp/named-groups/invalid-duplicate-groupspecifier-2-u.js:20:5] 19 │ 20 │ /(?a)(?b)(?a)/u; - · ─ + · ─ ─ ╰──── × Invalid regular expression: Duplicated capturing group names - ╭─[test262/test/language/literals/regexp/named-groups/invalid-duplicate-groupspecifier-2.js:20:19] + ╭─[test262/test/language/literals/regexp/named-groups/invalid-duplicate-groupspecifier-2.js:20:5] 19 │ 20 │ /(?a)(?b)(?a)/; - · ─ + · ─ ─ ╰──── × Invalid regular expression: Duplicated capturing group names - ╭─[test262/test/language/literals/regexp/named-groups/invalid-duplicate-groupspecifier-u.js:20:12] + ╭─[test262/test/language/literals/regexp/named-groups/invalid-duplicate-groupspecifier-u.js:20:5] 19 │ 20 │ /(?a)(?a)/u; - · ─ + · ─ ─ ╰──── × Invalid regular expression: Duplicated capturing group names - ╭─[test262/test/language/literals/regexp/named-groups/invalid-duplicate-groupspecifier.js:20:12] + ╭─[test262/test/language/literals/regexp/named-groups/invalid-duplicate-groupspecifier.js:20:5] 19 │ 20 │ /(?a)(?a)/; - · ─ + · ─ ─ ╰──── × Invalid regular expression: Unterminated capturing group name diff --git a/tasks/coverage/snapshots/parser_typescript.snap b/tasks/coverage/snapshots/parser_typescript.snap index 738f8cc54..4cd3a5c3a 100644 --- a/tasks/coverage/snapshots/parser_typescript.snap +++ b/tasks/coverage/snapshots/parser_typescript.snap @@ -10975,19 +10975,11 @@ Expect to Parse: tasks/coverage/typescript/tests/cases/conformance/salsa/private 14 │ /(?)((?bar)bar)(?baz)|(foo(?foo))(?)/, ╰──── - × Invalid regular expression: Duplicated capturing group names - ╭─[typescript/tests/cases/compiler/regularExpressionScanning.ts:14:46] - 13 │ /\2()(\12)(foo)\1\0[\0\1\01\123\08\8](\3\03)\5\005\9\009/u, - 14 │ /(?)((?bar)bar)(?baz)|(foo(?foo))(?)/, - · ─── ─── - 15 │ /(\k)\k(?foo)|(?)((?)|(bar(?bar)))/, - ╰──── - - × Invalid regular expression: Duplicated capturing group names - ╭─[typescript/tests/cases/compiler/regularExpressionScanning.ts:15:59] + × Invalid regular expression: Group specifier is empty + ╭─[typescript/tests/cases/compiler/regularExpressionScanning.ts:15:12] 14 │ /(?)((?bar)bar)(?baz)|(foo(?foo))(?)/, 15 │ /(\k)\k(?foo)|(?)((?)|(bar(?bar)))/, - · ─── + · ────────── 16 │ // Quantifiers ╰──── diff --git a/tasks/coverage/snapshots/semantic_test262.snap b/tasks/coverage/snapshots/semantic_test262.snap index 77fc7b096..7e436cc38 100644 --- a/tasks/coverage/snapshots/semantic_test262.snap +++ b/tasks/coverage/snapshots/semantic_test262.snap @@ -1,8 +1,8 @@ commit: 06454619 semantic_test262 Summary: -AST Parsed : 43832/43832 (100.00%) -Positive Passed: 43632/43832 (99.54%) +AST Parsed : 43851/43851 (100.00%) +Positive Passed: 43651/43851 (99.54%) tasks/coverage/test262/test/annexB/language/function-code/if-decl-else-decl-a-func-block-scoping.js semantic error: Symbol scope ID mismatch for "f": after transform: SymbolId(3): ScopeId(4294967294) diff --git a/tasks/coverage/snapshots/transformer_test262.snap b/tasks/coverage/snapshots/transformer_test262.snap index f20ae8e23..3ef1c8a6d 100644 --- a/tasks/coverage/snapshots/transformer_test262.snap +++ b/tasks/coverage/snapshots/transformer_test262.snap @@ -1,5 +1,5 @@ commit: 06454619 transformer_test262 Summary: -AST Parsed : 43832/43832 (100.00%) -Positive Passed: 43832/43832 (100.00%) +AST Parsed : 43851/43851 (100.00%) +Positive Passed: 43851/43851 (100.00%) diff --git a/tasks/coverage/src/test262/mod.rs b/tasks/coverage/src/test262/mod.rs index 0e060a025..b9224db64 100644 --- a/tasks/coverage/src/test262/mod.rs +++ b/tasks/coverage/src/test262/mod.rs @@ -105,8 +105,6 @@ impl Case for Test262Case { fn skip_test_case(&self) -> bool { [ - // ES2025 https://github.com/tc39/proposal-duplicate-named-capturing-groups - "regexp-duplicate-named-groups", // stage 3 https://github.com/tc39/proposal-source-phase-imports "source-phase-imports", ]