feat(regular_expression): Validate max quantifier value (#5218)

I've never seen but `/a{9007199254740991}/` is valid and this is the maximum value for quantifier. \+ left comment about #5210 experiment.
2026-05-19 12:19:15 +00:00 · 2024-08-26 07:11:03 +00:00 · 2024-08-26 07:11:03 +00:00 · 46b641b75d
commit 46b641b75d
parent b39c0d6122
4 changed files with 56 additions and 15 deletions
--- a/crates/oxc_regular_expression/src/ast.rs
+++ b/crates/oxc_regular_expression/src/ast.rs
@ -98,9 +98,9 @@ pub enum LookAroundAssertionKind {
 #[derive(Debug)]
 pub struct Quantifier<'a> {
    pub span: Span,
-    pub min: u32,
+    pub min: u64,
    /// `None` means no upper bound.
-    pub max: Option<u32>,
+    pub max: Option<u64>,
    pub greedy: bool,
    pub body: Term<'a>,
 }
--- a/crates/oxc_regular_expression/src/body_parser/mod.rs
+++ b/crates/oxc_regular_expression/src/body_parser/mod.rs
@ -32,6 +32,8 @@ mod test {
            ("a{1,", ParserOptions::default()),
            ("a{1,}", ParserOptions::default()),
            ("a{1,2}", ParserOptions::default()),
+            ("x{9007199254740991}", ParserOptions::default()),
+            ("x{9007199254740991,9007199254740991}", ParserOptions::default()),
            ("a|b", ParserOptions::default()),
            ("a|b|c", ParserOptions::default()),
            ("a|b+?|c", ParserOptions::default()),
@ -137,6 +139,8 @@ mod test {
            ("a{1", ParserOptions::default().with_unicode_mode()),
            ("a{1,", ParserOptions::default().with_unicode_mode()),
            ("a{,", ParserOptions::default().with_unicode_mode()),
+            ("x{9007199254740992}", ParserOptions::default()),
+            ("x{9007199254740991,9007199254740992}", ParserOptions::default()),
            ("(?=a", ParserOptions::default()),
            ("(?<!a", ParserOptions::default()),
            (r"\c0", ParserOptions::default().with_unicode_mode()),
--- a/crates/oxc_regular_expression/src/body_parser/parser.rs
+++ b/crates/oxc_regular_expression/src/body_parser/parser.rs
@ -1565,7 +1565,8 @@ impl<'a> PatternParser<'a> {
    // ```
    /// Returns: ((min, max), greedy)
    #[allow(clippy::type_complexity)]
-    fn consume_quantifier(&mut self) -> Result<Option<((u32, Option<u32>), bool)>> {
+    fn consume_quantifier(&mut self) -> Result<Option<((u64, Option<u64>), bool)>> {
+        const MAX_QUANTIFIER: u64 = 9_007_199_254_740_991; // 2^53 - 1
        let is_greedy = |reader: &mut Reader| !reader.eat('?');

        if self.reader.eat('*') {
@ -1583,11 +1584,27 @@ impl<'a> PatternParser<'a> {
        if self.reader.eat('{') {
            if let Some(min) = self.consume_decimal_digits() {
                if self.reader.eat('}') {
+                    if MAX_QUANTIFIER < min {
+                        return Err(OxcDiagnostic::error(
+                            "Number is too large in braced quantifier",
+                        )
+                        .with_label(self.span_factory.create(span_start, self.reader.offset())));
+                    }
+
                    return Ok(Some(((min, Some(min)), is_greedy(&mut self.reader))));
                }

                if self.reader.eat(',') {
                    if self.reader.eat('}') {
+                        if MAX_QUANTIFIER < min {
+                            return Err(OxcDiagnostic::error(
+                                "Number is too large in braced quantifier",
+                            )
+                            .with_label(
+                                self.span_factory.create(span_start, self.reader.offset()),
+                            ));
+                        }
+
                        return Ok(Some(((min, None), is_greedy(&mut self.reader))));
                    }

@ -1603,6 +1620,14 @@ impl<'a> PatternParser<'a> {
                                    self.span_factory.create(span_start, self.reader.offset()),
                                ));
                            }
+                            if MAX_QUANTIFIER < min || MAX_QUANTIFIER < max {
+                                return Err(OxcDiagnostic::error(
+                                    "Number is too large in braced quantifier",
+                                )
+                                .with_label(
+                                    self.span_factory.create(span_start, self.reader.offset()),
+                                ));
+                            }

                            return Ok(Some(((min, Some(max)), is_greedy(&mut self.reader))));
                        }
@ -1626,7 +1651,8 @@ impl<'a> PatternParser<'a> {
        if let Some(index) = self.consume_decimal_digits() {
            // \0 is CharacterEscape, not DecimalEscape
            if index != 0 {
-                return Some(index);
+                #[allow(clippy::cast_possible_truncation)]
+                return Some(index as u32);
            }

            self.reader.rewind(checkpoint);
@ -1642,13 +1668,15 @@ impl<'a> PatternParser<'a> {
    //   [+Sep] DecimalDigits[+Sep] NumericLiteralSeparator DecimalDigit
    // ```
    // ([Sep] is disabled for `QuantifierPrefix` and `DecimalEscape`, skip it)
-    fn consume_decimal_digits(&mut self) -> Option<u32> {
+    fn consume_decimal_digits(&mut self) -> Option<u64> {
        let checkpoint = self.reader.checkpoint();

        let mut value = 0;
        while let Some(cp) = self.reader.peek().filter(|&cp| unicode::is_decimal_digit(cp)) {
            // `- '0' as u32`: convert code point to digit
-            value = (10 * value) + (cp - '0' as u32);
+            #[allow(clippy::cast_lossless)]
+            let d = (cp - '0' as u32) as u64;
+            value = (10 * value) + d;
            self.reader.advance();
        }

--- a/crates/oxc_regular_expression/src/body_parser/reader.rs
+++ b/crates/oxc_regular_expression/src/body_parser/reader.rs
@ -1,21 +1,29 @@
+// NOTE: The current implementation switches iteration units depending on the mode.
+//
+// This is all for surrogate pairs in non-unicode mode, but it is required only in limited cases:
+// - Group names for named `CapturingGroup`: `(?<name>.)`
+// - Group names for `NamedReference`: `\k<name>`
+// Even if we skip that distinction, it seems the current test262 cases pass due to other errors.
+//
+// Therefore, it is possible to change the implementation to iterate on `char` units always,
+// assuming that change some output of AST for `Character[kind=Symbol]` to `Character[kind=SurrogatePairs]`.
+//
+// However, for the following reasons, we keep the current implementation:
+// - We want to keep the behavior closer to the specification
+//   - and also, to prevent any oversight
+// - Changing does not have a significant impact on performance
+//
+// See also: https://github.com/oxc-project/oxc/pull/5210
 pub struct Reader<'a> {
    source: &'a str,
    unicode_mode: bool,
    /// Current index for `u8_units`(unicode mode) or `u16_units`(non-unicode mode).
    index: usize,
-    // NOTE: Distinguish these 2 units looks cleaner, but it may not be necessary.
-    //
-    // If I understand correctly (and there are no unexpected factors),
-    // AST `Character[kind=Symbol]` only needs to be aware of this for surrogate pairs.
-    //
-    // Therefore, performance might be improved by:
-    // - using only `u8_units`, and
-    // - checking if each unit (char) is non-BMP, and if so, converting it into a surrogate pair and emitting 2 units.
-    // However, I'm not certain this approach is faster than current one using `encode_utf16()` all at once.
    /// Iteration units for unicode mode.
    /// Even in non-unicode mode, used for `Span` offset calculation.
    u8_units: Vec<(usize, char)>,
    /// Iteration units for non-unicode mode.
+    /// To iterate on surrogate pairs, this is needed.
    u16_units: Vec<u16>,
    /// Last offset caches for non-unicode mode.
    last_offset_indices: (usize, usize),
@ -67,6 +75,7 @@ impl<'a> Reader<'a> {
        self.index += 1;
    }

+    // We need a code point, not a char.
    fn peek_nth(&self, n: usize) -> Option<u32> {
        let nth = self.index + n;