feat(regular_expression)!: Support ES2025 Duplicated named capture groups (#6847)

Closes #6358

@preyneyv I know you've been working on this problem.

This is an implementation that has been dormant on my local for a while.

- All tests are passing
- However, the approach is simple but not general, so there might be some edge cases that were missed
- There's also room for improvement in terms of performance

For these reasons, it was marked as WIP for me.

I believe the test cases and other parts are usable, so feel free to fork and replace them with your implementation if you'd like.
This commit is contained in:
leaysgur 2024-10-25 02:13:57 +00:00
parent f49b3e2088
commit 90c786c420
12 changed files with 198 additions and 97 deletions

View file

@ -200,7 +200,7 @@ source: crates/oxc_linter/src/tester.rs
╰────
⚠ eslint(no-invalid-regexp): Invalid regular expression: Duplicated capturing group names
╭─[no_invalid_regexp.tsx:1:23]
╭─[no_invalid_regexp.tsx:1:16]
1 │ new RegExp('(?<k>a)(?<k>b)')
·
·
╰────

View file

@ -5,3 +5,8 @@ Implements ECMAScript® 2024 Language Specification
- https://tc39.es/ecma262/2024/multipage/ecmascript-language-lexical-grammar.html#sec-literals-regular-expression-literals
- https://tc39.es/ecma262/2024/multipage/text-processing.html#sec-regexp-regular-expression-objects
- https://tc39.es/ecma262/2024/multipage/additional-ecmascript-features-for-web-browsers.html#sec-regular-expressions-patterns
And, Stage 3 proposals
- https://github.com/tc39/proposal-duplicate-named-capturing-groups
- https://github.com/tc39/proposal-regexp-modifiers

View file

@ -101,10 +101,17 @@ mod test {
(r"\1()", "u"),
(r"(?<n1>..)(?<n2>..)", ""),
// ES2025 ---
// TODO: Duplicate named capturing groups
// (r"(?<n1>..)|(?<n1>..)", ""),
// (r"(?<year>[0-9]{4})-[0-9]{2}|[0-9]{2}-(?<year>[0-9]{4})", ""),
// (r"(?:(?<a>x)|(?<a>y))\k<a>", ""),
// Duplicate named capturing groups
(r"(?<n1>..)|(?<n1>..)", ""),
(r"(?<year>[0-9]{4})-[0-9]{2}|[0-9]{2}-(?<year>[0-9]{4})", ""),
(r"(?:(?<a>x)|(?<a>y))\k<a>", ""),
(r"(?<x>a)|(?<x>b)", ""),
(r"(?:(?<x>a)|(?<y>a)(?<x>b))(?:(?<z>c)|(?<z>d))", ""),
(r"(?:(?<x>a)|(?<x>b))\\k<x>", ""),
(r"(?:(?:(?<x>a)|(?<x>b)|c)\\k<x>){2}", ""),
(r"(?:(?:(?<x>a)|(?<x>b))\\k<x>){2}", ""),
(r"(?:(?:(?<x>a)\\k<x>|(?<x>b)\\k<x>)|(?:))\\k<x>", ""),
(r"(?:(?:(?<x>a\\k<x>)|(?<x>b\\k<x>))|(?:))\\k<x>", ""),
// Modifiers
(r"(?:.)", ""),
(r"(?s:.)", ""),
@ -190,9 +197,15 @@ mod test {
(r"[\q{]", "v"),
(r"[\q{\a}]", "v"),
// ES2025 ---
// TODO: Duplicate named capturing groups
(r"(?<n>..)|(?<n>..)", ""), // This will be valid
// (r"(?<a>|(?<a>))", ""), // Nested, still invalid
// Duplicate named capturing groups
(r"(?<n>.)(?<n>.)", ""),
(r"(?<n>.(?<n>..))", "u"),
("(?<n>)|(?<n>)(?<n>)", ""),
("(((((((?<n>.)))))))(?<n>)", ""),
("(?:(?<x>a)|(?<x>b))(?<x>c)", ""),
("(?<x>a)(?:(?<x>b)|(?<x>c))", ""),
("(?:(?:(?<x>a)|(?<x>b)))(?<x>c)", ""),
("(?:(?:(?<x>a)|(?<x>b))|(?:))(?<x>c)", ""),
// Modifiers
(r"(?a:.)", ""),
(r"(?-S:.)", ""),
@ -255,6 +268,11 @@ mod test {
(r"[[[[[^[[[[\q{ng}]]]]]]]]]", "v", true),
(r"[^[[[[[[[[[[[[[[[[\q{ng}]]]]]]]]]]]]]]]]]", "v", true),
// ES2025 ---
// Duplicated named capture groups
("(?:(?<x>a)|(?<x>b))(?<x>c)", "", true),
("(?:(?<x>a)|(?<x>b))(?<X>c)", "", false),
("(?<x>a)(?:(?<x>b)|(?<x>c))", "", true),
("(?<x>a)|(?:(?<x>b)|(?<x>c))", "", false),
// Modifiers
(r"(?ii:.)", "", true),
(r"(?-ss:.)", "", true),

View file

@ -46,8 +46,14 @@ impl<'a> PatternParser<'a> {
// - Cons: 1st pass is completely useless if the pattern does not contain any capturing groups
// We may re-consider this if we need more performance rather than simplicity.
let checkpoint = self.reader.checkpoint();
let duplicated_named_capturing_groups =
self.state.initialize_with_parsing(&mut self.reader);
// [SS:EE] Pattern :: Disjunction
// It is a Syntax Error if Pattern contains two or more GroupSpecifiers for which the CapturingGroupName of GroupSpecifier is the same.
self.state.initialize_with_parsing(&mut self.reader).map_err(|offsets| {
diagnostics::duplicated_capturing_group_names(
offsets.iter().map(|&(start, end)| self.span_factory.create(start, end)).collect(),
)
})?;
self.reader.rewind(checkpoint);
// [SS:EE] Pattern :: Disjunction
@ -58,16 +64,6 @@ impl<'a> PatternParser<'a> {
if u32::MAX == self.state.num_of_capturing_groups {
return Err(diagnostics::too_may_capturing_groups(self.span_factory.create(0, 0)));
}
// [SS:EE] Pattern :: Disjunction
// It is a Syntax Error if Pattern contains two or more GroupSpecifiers for which the CapturingGroupName of GroupSpecifier is the same.
if !duplicated_named_capturing_groups.is_empty() {
return Err(diagnostics::duplicated_capturing_group_names(
duplicated_named_capturing_groups
.iter()
.map(|&(start, end)| self.span_factory.create(start, end))
.collect(),
));
}
// Let's start parsing!
let span_start = self.reader.offset();

View file

@ -1,5 +1,5 @@
use oxc_span::Atom;
use rustc_hash::FxHashSet;
use rustc_hash::{FxHashMap, FxHashSet};
use crate::parser::reader::Reader;
@ -16,6 +16,8 @@ pub struct State<'a> {
pub capturing_group_names: FxHashSet<Atom<'a>>,
}
type DuplicatedNamedCapturingGroupOffsets = Vec<(u32, u32)>;
impl<'a> State<'a> {
pub fn new(unicode_mode: bool, unicode_sets_mode: bool) -> Self {
Self {
@ -27,12 +29,11 @@ impl<'a> State<'a> {
}
}
pub fn initialize_with_parsing(&mut self, reader: &mut Reader<'a>) -> Vec<(u32, u32)> {
let (
num_of_left_capturing_parens,
capturing_group_names,
duplicated_named_capturing_groups,
) = parse_capturing_groups(reader);
pub fn initialize_with_parsing(
&mut self,
reader: &mut Reader<'a>,
) -> Result<(), DuplicatedNamedCapturingGroupOffsets> {
let (num_of_left_capturing_parens, capturing_group_names) = parse_capturing_groups(reader)?;
// In Annex B, this is `false` by default.
// It is `true`
@ -44,26 +45,43 @@ impl<'a> State<'a> {
self.num_of_capturing_groups = num_of_left_capturing_parens;
self.capturing_group_names = capturing_group_names;
duplicated_named_capturing_groups
Ok(())
}
}
/// Returns: (num_of_left_parens, capturing_group_names, duplicated_named_capturing_groups)
enum SimpleUnit<'a> {
Open,
Close,
Pipe,
GroupName(Atom<'a>),
}
/// Returns: Result<(num_of_left_parens, capturing_group_names), duplicated_named_capturing_group_offsets>
fn parse_capturing_groups<'a>(
reader: &mut Reader<'a>,
) -> (u32, FxHashSet<Atom<'a>>, Vec<(u32, u32)>) {
let mut num_of_left_capturing_parens = 0;
let mut capturing_group_names = FxHashSet::default();
let mut duplicated_named_capturing_groups = vec![];
let mut in_escape = false;
let mut in_character_class = false;
) -> Result<(u32, FxHashSet<Atom<'a>>), DuplicatedNamedCapturingGroupOffsets> {
// Count only normal CapturingGroup(named, unnamed)
// (?<name>...), (...)
// IgnoreGroup, and LookaroundAssertions are ignored
// (?:...)
// (?=...), (?!...), (?<=...), (?<!...)
let mut num_of_left_capturing_parens = 0;
// Collect capturing group names
let mut group_names: FxHashMap<Atom<'a>, (u32, u32)> = FxHashMap::default();
// At the same time, check duplicates
// If you want to process this most efficiently:
// - define a scope for each Disjunction
// - then check for duplicates for each `|` while inheriting the parent-child relationship
// ref. https://source.chromium.org/chromium/chromium/src/+/main:v8/src/regexp/regexp-parser.cc;l=1644
// However, duplicates are rare in the first place.
// And as long as it works simply, this may be enough.
let mut may_duplicates: FxHashMap<Atom<'a>, DuplicatedNamedCapturingGroupOffsets> =
FxHashMap::default();
let mut simplified: Vec<SimpleUnit<'a>> = vec![];
let mut in_escape = false;
let mut in_character_class = false;
while let Some(cp) = reader.peek() {
if in_escape {
in_escape = false;
@ -73,9 +91,15 @@ fn parse_capturing_groups<'a>(
in_character_class = true;
} else if cp == ']' as u32 {
in_character_class = false;
} else if !in_character_class && cp == '|' as u32 {
simplified.push(SimpleUnit::Pipe);
} else if !in_character_class && cp == ')' as u32 {
simplified.push(SimpleUnit::Close);
} else if !in_character_class && cp == '(' as u32 {
reader.advance();
simplified.push(SimpleUnit::Open);
// Skip IgnoreGroup
if reader.eat2('?', ':')
// Skip LookAroundAssertion
@ -102,20 +126,84 @@ fn parse_capturing_groups<'a>(
let span_end = reader.offset();
if reader.eat('>') {
// May be duplicated
if !capturing_group_names.insert(reader.atom(span_start, span_end)) {
// Report them with `Span`
duplicated_named_capturing_groups.push((span_start, span_end));
let group_name = reader.atom(span_start, span_end);
simplified.push(SimpleUnit::GroupName(group_name.clone()));
// Check duplicates later
if let Some(last_span) = group_names.get(&group_name) {
let entry = may_duplicates.entry(group_name).or_default();
entry.push(*last_span);
entry.push((span_start, span_end));
} else {
group_names.insert(group_name, (span_start, span_end));
}
continue;
}
}
// Unnamed
continue;
}
reader.advance();
}
(num_of_left_capturing_parens, capturing_group_names, duplicated_named_capturing_groups)
// Check duplicates and emit error if exists
if !may_duplicates.is_empty() {
// Check must be done for each group name
for (group_name, spans) in may_duplicates {
let iter = simplified.iter().clone();
let mut alternative_depth = FxHashSet::default();
let mut depth = 0_u32;
let mut is_first = true;
'outer: for token in iter {
match token {
SimpleUnit::Open => {
depth += 1;
}
SimpleUnit::Close => {
// May panic if the pattern has invalid, unbalanced parens
depth = depth.saturating_sub(1);
}
SimpleUnit::Pipe => {
if !is_first {
alternative_depth.insert(depth);
}
}
SimpleUnit::GroupName(name) => {
// Check target group name only
if *name != group_name {
continue;
}
// Skip the first one, because it is not duplicated
if is_first {
is_first = false;
continue;
}
// If left outer `|` is found, both can participate
// `|(?<n>)`
// ^ ^ depth: 1
// ^ depth: 0
for i in (0..depth).rev() {
if alternative_depth.contains(&i) {
// Remove it, next duplicates requires another `|`
alternative_depth.remove(&i);
continue 'outer;
}
}
return Err(spans);
}
}
}
}
}
Ok((num_of_left_capturing_parens, group_names.keys().cloned().collect()))
}
#[cfg(test)]
@ -123,35 +211,39 @@ mod tests {
use super::*;
#[test]
fn test_count_capturing_groups() {
fn count_capturing_groups() {
for (source_text, expected) in [
("()", (1, 0, false)),
(r"\1()", (1, 0, false)),
("(foo)", (1, 0, false)),
("(foo)(bar)", (2, 0, false)),
("(foo(bar))", (2, 0, false)),
("(foo)[(bar)]", (1, 0, false)),
(r"(foo)\(bar\)", (1, 0, false)),
("(foo)(?<n>bar)", (2, 1, false)),
("(foo)(?=...)(?!...)(?<=...)(?<!...)(?:...)", (1, 0, false)),
("(foo)(?<n>bar)(?<nn>baz)", (3, 2, false)),
("(?<n>.)(?<n>..)", (2, 1, true)),
("(?<n>.(?<n>..))", (2, 1, true)),
("()", (1, 0)),
(r"\1()", (1, 0)),
("(foo)", (1, 0)),
("(foo)(bar)", (2, 0)),
("(foo(bar))", (2, 0)),
("(foo)[(bar)]", (1, 0)),
(r"(foo)\(bar\)", (1, 0)),
("(foo)(?<n>bar)", (2, 1)),
("(foo)(?=...)(?!...)(?<=...)(?<!...)(?:...)", (1, 0)),
("(foo)(?<n>bar)(?<nn>baz)", (3, 2)),
("(?<n>.)(?<m>.)|(?<n>..)|(?<m>.)", (4, 2)),
("(?<n>.)(?<m>.)|(?:..)|(?<m>.)", (3, 2)),
] {
let mut reader = Reader::initialize(source_text, true, false).unwrap();
let (
num_of_left_capturing_parens,
capturing_group_names,
duplicated_named_capturing_groups,
) = parse_capturing_groups(&mut reader);
let (num_of_left_capturing_parens, capturing_group_names) =
parse_capturing_groups(&mut reader).unwrap();
let actual = (
num_of_left_capturing_parens,
capturing_group_names.len(),
!duplicated_named_capturing_groups.is_empty(),
);
let actual = (num_of_left_capturing_parens, capturing_group_names.len());
assert_eq!(expected, actual, "{source_text}");
}
}
#[test]
fn duplicated_named_capturing_groups() {
for source_text in
["(?<n>.)(?<n>..)", "(?<n>.(?<n>..))", "|(?<n>.(?<n>..))", "(?<m>.)|(?<n>.(?<n>..))"]
{
let mut reader = Reader::initialize(source_text, true, false).unwrap();
assert!(parse_capturing_groups(&mut reader).is_err(), "{source_text}");
}
}
}

View file

@ -1,5 +1,5 @@
commit: 06454619
codegen_test262 Summary:
AST Parsed : 43832/43832 (100.00%)
Positive Passed: 43832/43832 (100.00%)
AST Parsed : 43851/43851 (100.00%)
Positive Passed: 43851/43851 (100.00%)

View file

@ -1,5 +1,5 @@
commit: 06454619
minifier_test262 Summary:
AST Parsed : 43832/43832 (100.00%)
Positive Passed: 43832/43832 (100.00%)
AST Parsed : 43851/43851 (100.00%)
Positive Passed: 43851/43851 (100.00%)

View file

@ -1,8 +1,8 @@
commit: 06454619
parser_test262 Summary:
AST Parsed : 43832/43832 (100.00%)
Positive Passed: 43832/43832 (100.00%)
AST Parsed : 43851/43851 (100.00%)
Positive Passed: 43851/43851 (100.00%)
Negative Passed: 4320/4322 (99.95%)
Expect Syntax Error: tasks/coverage/test262/test/language/import/import-attributes/json-invalid.js
Expect Syntax Error: tasks/coverage/test262/test/language/import/import-attributes/json-named-bindings.js
@ -21345,31 +21345,31 @@ Expect Syntax Error: tasks/coverage/test262/test/language/import/import-attribut
╰────
× Invalid regular expression: Duplicated capturing group names
╭─[test262/test/language/literals/regexp/named-groups/invalid-duplicate-groupspecifier-2-u.js:20:19]
╭─[test262/test/language/literals/regexp/named-groups/invalid-duplicate-groupspecifier-2-u.js:20:5]
19 │
20 │ /(?<a>a)(?<b>b)(?<a>a)/u;
·
·
╰────
× Invalid regular expression: Duplicated capturing group names
╭─[test262/test/language/literals/regexp/named-groups/invalid-duplicate-groupspecifier-2.js:20:19]
╭─[test262/test/language/literals/regexp/named-groups/invalid-duplicate-groupspecifier-2.js:20:5]
19 │
20 │ /(?<a>a)(?<b>b)(?<a>a)/;
·
·
╰────
× Invalid regular expression: Duplicated capturing group names
╭─[test262/test/language/literals/regexp/named-groups/invalid-duplicate-groupspecifier-u.js:20:12]
╭─[test262/test/language/literals/regexp/named-groups/invalid-duplicate-groupspecifier-u.js:20:5]
19 │
20 │ /(?<a>a)(?<a>a)/u;
·
·
╰────
× Invalid regular expression: Duplicated capturing group names
╭─[test262/test/language/literals/regexp/named-groups/invalid-duplicate-groupspecifier.js:20:12]
╭─[test262/test/language/literals/regexp/named-groups/invalid-duplicate-groupspecifier.js:20:5]
19 │
20 │ /(?<a>a)(?<a>a)/;
·
·
╰────
× Invalid regular expression: Unterminated capturing group name

View file

@ -10975,19 +10975,11 @@ Expect to Parse: tasks/coverage/typescript/tests/cases/conformance/salsa/private
14 │ /(?<foo>)((?<bar>bar)bar)(?<baz>baz)|(foo(?<foo>foo))(?<baz>)/,
╰────
× Invalid regular expression: Duplicated capturing group names
╭─[typescript/tests/cases/compiler/regularExpressionScanning.ts:14:46]
13 │ /\2()(\12)(foo)\1\0[\0\1\01\123\08\8](\3\03)\5\005\9\009/u,
14 │ /(?<foo>)((?<bar>bar)bar)(?<baz>baz)|(foo(?<foo>foo))(?<baz>)/,
· ─── ───
15 │ /(\k<bar>)\k<absent>(?<foo>foo)|(?<bar>)((?<foo>)|(bar(?<bar>bar)))/,
╰────
× Invalid regular expression: Duplicated capturing group names
╭─[typescript/tests/cases/compiler/regularExpressionScanning.ts:15:59]
× Invalid regular expression: Group specifier is empty
╭─[typescript/tests/cases/compiler/regularExpressionScanning.ts:15:12]
14 │ /(?<foo>)((?<bar>bar)bar)(?<baz>baz)|(foo(?<foo>foo))(?<baz>)/,
15 │ /(\k<bar>)\k<absent>(?<foo>foo)|(?<bar>)((?<foo>)|(bar(?<bar>bar)))/,
· ───
· ──────────
16 │ // Quantifiers
╰────

View file

@ -1,8 +1,8 @@
commit: 06454619
semantic_test262 Summary:
AST Parsed : 43832/43832 (100.00%)
Positive Passed: 43632/43832 (99.54%)
AST Parsed : 43851/43851 (100.00%)
Positive Passed: 43651/43851 (99.54%)
tasks/coverage/test262/test/annexB/language/function-code/if-decl-else-decl-a-func-block-scoping.js
semantic error: Symbol scope ID mismatch for "f":
after transform: SymbolId(3): ScopeId(4294967294)

View file

@ -1,5 +1,5 @@
commit: 06454619
transformer_test262 Summary:
AST Parsed : 43832/43832 (100.00%)
Positive Passed: 43832/43832 (100.00%)
AST Parsed : 43851/43851 (100.00%)
Positive Passed: 43851/43851 (100.00%)

View file

@ -105,8 +105,6 @@ impl Case for Test262Case {
fn skip_test_case(&self) -> bool {
[
// ES2025 https://github.com/tc39/proposal-duplicate-named-capturing-groups
"regexp-duplicate-named-groups",
// stage 3 https://github.com/tc39/proposal-source-phase-imports
"source-phase-imports",
]