fix(regular_expression): Keep LegacyOctalEscape raw digits for to_string (#5692)

Fixes #5690

- Update `CharacterKind` enum from `Octal` to `Octal1`, `Octal2` and `Octal3`
- Stylistic refactoring for `impl Display`
2026-05-19 12:19:15 +00:00 · 2024-09-11 07:07:00 +00:00 · 2024-09-11 07:07:00 +00:00 · 304ce25446
commit 304ce25446
parent 9e9435f03b
6 changed files with 149 additions and 117 deletions
--- a/crates/oxc_regular_expression/src/ast.rs
+++ b/crates/oxc_regular_expression/src/ast.rs
@ -174,10 +174,13 @@ pub enum CharacterKind {
    HexadecimalEscape = 1,
    Identifier = 2,
    Null = 3,
-    Octal = 4,
-    SingleEscape = 5,
-    Symbol = 6,
-    UnicodeEscape = 7,
+    // To distinguish leading 0 cases like `\00` and `\000`
+    Octal1 = 4,
+    Octal2 = 5,
+    Octal3 = 6,
+    SingleEscape = 7,
+    Symbol = 8,
+    UnicodeEscape = 9,
 }

 /// Character class.
--- a/crates/oxc_regular_expression/src/body_parser/parser.rs
+++ b/crates/oxc_regular_expression/src/body_parser/parser.rs
@ -691,12 +691,21 @@ impl<'a> PatternParser<'a> {
            }));
        }

-        // e.g. \18
+        // e.g. \1, \00, \000
        if !self.state.unicode_mode {
            if let Some(cp) = self.consume_legacy_octal_escape_sequence() {
+                let span_end = self.reader.offset();
+                // Keep original digits for `to_string()`
+                // Otherwise `\0022`(octal \002 + symbol 2) will be `\22`(octal \22)
+                let digits = span_end - span_start - 1; // -1 for '\'
+
                return Ok(Some(ast::Character {
-                    span: self.span_factory.create(span_start, self.reader.offset()),
-                    kind: ast::CharacterKind::Octal,
+                    span: self.span_factory.create(span_start, span_end),
+                    kind: (match digits {
+                        3 => ast::CharacterKind::Octal3,
+                        2 => ast::CharacterKind::Octal2,
+                        _ => ast::CharacterKind::Octal1,
+                    }),
                    value: cp,
                }));
            }
--- a/crates/oxc_regular_expression/src/display.rs
+++ b/crates/oxc_regular_expression/src/display.rs
@ -16,24 +16,23 @@ impl<'a> Display for RegularExpression<'a> {
 impl Display for Flags {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let mut flags = String::with_capacity(8);
-        macro_rules! if_true_append {
-            ($flag:ident, $char:literal) => {
-                if self.$flag {
-                    flags.push($char);
-                }
-            };
-        }

        // write flags in the order they are described in the `MDN`
        // <https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions#advanced_searching_with_flags>
-        if_true_append!(has_indices, 'd');
-        if_true_append!(global, 'g');
-        if_true_append!(ignore_case, 'i');
-        if_true_append!(multiline, 'm');
-        if_true_append!(dot_all, 's');
-        if_true_append!(unicode, 'u');
-        if_true_append!(unicode_sets, 'v');
-        if_true_append!(sticky, 'y');
+        for (v, ch) in [
+            (self.has_indices, 'd'),
+            (self.global, 'g'),
+            (self.ignore_case, 'i'),
+            (self.multiline, 'm'),
+            (self.dot_all, 's'),
+            (self.unicode, 'u'),
+            (self.unicode_sets, 'v'),
+            (self.sticky, 'y'),
+        ] {
+            if v {
+                flags.push(ch);
+            }
+        }

        write!(f, "{flags}")
    }
@ -60,14 +59,17 @@ impl<'a> Display for Alternative<'a> {
                None
            }
        }
+
        write_join_with(f, "", &self.body, |iter| {
            let next = iter.next()?;
            let Some(next) = as_character(next) else { return Some(next.to_string()) };
+
            let peek = iter.peek().and_then(|it| as_character(it));
            let (result, eat) = character_to_string(next, peek);
            if eat {
-                _ = iter.next();
+                iter.next();
            }
+
            Some(result)
        })
    }
@ -208,25 +210,30 @@ impl<'a> Display for CharacterClass<'a> {
                None
            }
        }
+
        write!(f, "[")?;

        if !self.body.is_empty() {
            if self.negative {
                write!(f, "^")?;
            }
+
            let sep = match self.kind {
                CharacterClassContentsKind::Union => "",
                CharacterClassContentsKind::Subtraction => "--",
                CharacterClassContentsKind::Intersection => "&&",
            };
+
            write_join_with(f, sep, &self.body, |iter| {
                let next = iter.next()?;
                let Some(next) = as_character(next) else { return Some(next.to_string()) };
+
                let peek = iter.peek().and_then(|it| as_character(it));
                let (result, eat) = character_to_string(next, peek);
                if eat {
-                    _ = iter.next();
+                    iter.next();
                }
+
                Some(result)
            })?;
        }
@ -270,12 +277,14 @@ impl<'a> Display for ClassString<'a> {

 impl<'a> Display for CapturingGroup<'a> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        let body = &self.body;
+        write!(f, "(")?;
+
        if let Some(name) = &self.name {
-            write!(f, "(?<{name}>{body})")
-        } else {
-            write!(f, "({body})")
+            write!(f, "?<{name}>")?;
        }
+        write!(f, "{}", &self.body)?;
+
+        write!(f, ")")
    }
 }

@ -299,12 +308,14 @@ impl<'a> Display for IgnoreGroup<'a> {
        }

        write!(f, "(?")?;
+
        if let Some(enabling) = &self.enabling_modifiers {
            write_flags(f, '\0', enabling)?;
        }
        if let Some(disabling) = &self.disabling_modifiers {
            write_flags(f, '-', disabling)?;
        }
+
        write!(f, ":{})", self.body)
    }
 }
@ -321,6 +332,88 @@ impl<'a> Display for NamedReference<'a> {
    }
 }

+// ---
+
+fn character_to_string(
+    this: &Character,
+    peek: Option<&Character>,
+) -> (/* result */ String, /* true if peek should be consumed */ bool) {
+    let cp = this.value;
+
+    if matches!(this.kind, CharacterKind::Symbol | CharacterKind::UnicodeEscape) {
+        // Trail only
+        if is_trail_surrogate(cp) {
+            return (format!(r"\u{cp:X}"), false);
+        }
+
+        if is_lead_surrogate(cp) {
+            if let Some(peek) = peek.filter(|peek| is_trail_surrogate(peek.value)) {
+                // Lead+Trail
+                let cp = combine_surrogate_pair(cp, peek.value);
+                let ch = char::from_u32(cp).expect("Invalid surrogate pair `Character`!");
+                return (format!("{ch}"), true);
+            }
+
+            // Lead only
+            return (format!(r"\u{cp:X}"), false);
+        }
+    }
+
+    let ch = char::from_u32(cp).expect("Invalid `Character`!");
+    let result = match this.kind {
+        // Not a surrogate, like BMP, or all units in unicode mode
+        CharacterKind::Symbol => format!("{ch}"),
+        CharacterKind::ControlLetter => match ch {
+            '\n' => r"\cJ".to_string(),
+            '\r' => r"\cM".to_string(),
+            '\t' => r"\cI".to_string(),
+            _ => format!(r"\c{ch}"),
+        },
+        CharacterKind::Identifier => {
+            format!(r"\{ch}")
+        }
+        CharacterKind::SingleEscape => match ch {
+            '\n' => String::from(r"\n"),
+            '\r' => String::from(r"\r"),
+            '\t' => String::from(r"\t"),
+            '\u{b}' => String::from(r"\v"),
+            '\u{c}' => String::from(r"\f"),
+            '\u{8}' => String::from(r"\b"),
+            '\u{2D}' => String::from(r"\-"),
+            _ => format!(r"\{ch}"),
+        },
+        CharacterKind::Null => String::from(r"\0"),
+        CharacterKind::UnicodeEscape => {
+            let hex = &format!("{cp:04X}");
+            if hex.len() <= 4 {
+                format!(r"\u{hex}")
+            } else {
+                format!(r"\u{{{hex}}}")
+            }
+        }
+        CharacterKind::HexadecimalEscape => {
+            let hex = &format!("{cp:02X}");
+            format!(r"\x{hex}")
+        }
+        CharacterKind::Octal1 => {
+            let octal = format!("{cp:o}");
+            format!(r"\{octal}")
+        }
+        CharacterKind::Octal2 => {
+            let octal = format!("{cp:02o}");
+            format!(r"\{octal}")
+        }
+        CharacterKind::Octal3 => {
+            let octal = format!("{cp:03o}");
+            format!(r"\{octal}")
+        }
+    };
+
+    (result, false)
+}
+
+// ---
+
 fn write_join<S, I, E>(f: &mut fmt::Formatter<'_>, sep: S, items: I) -> fmt::Result
 where
    S: AsRef<str>,
@ -351,78 +444,9 @@ where
    Ok(())
 }

-fn character_to_string(
-    this: &Character,
-    peek: Option<&Character>,
-) -> (/* result */ String, /* true of peek should be consumed */ bool) {
-    let cp = this.value;
-
-    if matches!(this.kind, CharacterKind::Symbol | CharacterKind::UnicodeEscape) {
-        // Trail only
-        if is_trail_surrogate(cp) {
-            return (format!(r"\u{cp:X}"), false);
-        }
-
-        if is_lead_surrogate(cp) {
-            if let Some(peek) = peek.filter(|peek| is_trail_surrogate(peek.value)) {
-                // Lead+Trail
-                let cp = combine_surrogate_pair(cp, peek.value);
-                let ch = char::from_u32(cp).expect("Invalid surrogate pair `Character`!");
-                return (format!("{ch}"), true);
-            }
-
-            // Lead only
-            return (format!(r"\u{cp:X}"), false);
-        }
-    }
-
-    let ch = char::from_u32(cp).expect("Invalid `Character`!");
-    let result = match this.kind {
-        CharacterKind::ControlLetter => match ch {
-            '\n' => r"\cJ".to_string(),
-            '\r' => r"\cM".to_string(),
-            '\t' => r"\cI".to_string(),
-            _ => format!(r"\c{ch}"),
-        },
-        CharacterKind::Identifier => {
-            format!(r"\{ch}")
-        }
-        // Not a surrogate, like BMP, or all units in unicode mode
-        CharacterKind::Symbol => format!("{ch}"),
-        CharacterKind::Null => String::from(r"\0"),
-        CharacterKind::UnicodeEscape => {
-            let hex = &format!("{cp:04X}");
-            if hex.len() <= 4 {
-                format!(r"\u{hex}")
-            } else {
-                format!(r"\u{{{hex}}}")
-            }
-        }
-        CharacterKind::HexadecimalEscape => {
-            let hex = &format!("{cp:02X}");
-            format!(r"\x{hex}")
-        }
-        CharacterKind::Octal => {
-            let octal = format!("{cp:o}");
-            format!(r"\{octal}")
-        }
-        CharacterKind::SingleEscape => match ch {
-            '\n' => String::from(r"\n"),
-            '\r' => String::from(r"\r"),
-            '\t' => String::from(r"\t"),
-            '\u{b}' => String::from(r"\v"),
-            '\u{c}' => String::from(r"\f"),
-            '\u{8}' => String::from(r"\b"),
-            '\u{2D}' => String::from(r"\-"),
-            _ => format!(r"\{ch}"),
-        },
-    };
-
-    (result, false)
-}
-
 #[cfg(test)]
 mod test {
+    use crate::{Parser, ParserOptions};
    use oxc_allocator::Allocator;

    type Case<'a> = (
@ -505,23 +529,24 @@ mod test {
        (r"/\5/", None),
        (r"/\6/", None),
        (r"/\7/", None),
-        // Remove leading zeroes --
-        (r"/\00/", Some(r"/\0/")),
-        (r"/\07/", Some(r"/\7/")),
-        // --
+        (r"/\00/", None),
+        (r"/\07/", None),
+        (r"/\30/", None),
+        (r"/\37/", None),
        (r"/\40/", None),
        (r"/\47/", None),
        (r"/\70/", None),
        (r"/\77/", None),
-        // Remove leading zeroes --
-        (r"/\000/", Some(r"/\0/")),
-        (r"/\007/", Some(r"/\7/")),
-        (r"/\070/", Some(r"/\70/")),
-        // --
+        (r"/\000/", None),
+        (r"/\007/", None),
+        (r"/\070/", None),
        (r"/\300/", None),
        (r"/\307/", None),
        (r"/\370/", None),
        (r"/\377/", None),
+        (r"/\0111/", None),
+        (r"/\0022/", None),
+        (r"/\0003/", None),
        (r"/(.)\1/", None),
        // Identity escape from: <https://github.com/tc39/test262/blob/d62fa93c8f9ce5e687c0bbaa5d2b59670ab2ff60/test/annexB/language/literals/regexp/identity-escape.js>
        (r"/\C/", None),
@ -553,7 +578,6 @@ mod test {
    ];

    fn test_display(allocator: &Allocator, (source, expect): &Case) {
-        use crate::{Parser, ParserOptions};
        let expect = expect.unwrap_or(source);
        let actual = Parser::new(allocator, source, ParserOptions::default()).parse().unwrap();
        assert_eq!(expect, actual.to_string());
--- a/crates/oxc_regular_expression/src/generated/derive_clone_in.rs
+++ b/crates/oxc_regular_expression/src/generated/derive_clone_in.rs
@ -171,7 +171,9 @@ impl<'alloc> CloneIn<'alloc> for CharacterKind {
            Self::HexadecimalEscape => CharacterKind::HexadecimalEscape,
            Self::Identifier => CharacterKind::Identifier,
            Self::Null => CharacterKind::Null,
-            Self::Octal => CharacterKind::Octal,
+            Self::Octal1 => CharacterKind::Octal1,
+            Self::Octal2 => CharacterKind::Octal2,
+            Self::Octal3 => CharacterKind::Octal3,
            Self::SingleEscape => CharacterKind::SingleEscape,
            Self::Symbol => CharacterKind::Symbol,
            Self::UnicodeEscape => CharacterKind::UnicodeEscape,
--- a/tasks/coverage/parser_test262.snap
+++ b/tasks/coverage/parser_test262.snap
@ -2,11 +2,8 @@ commit: d62fa93c

 parser_test262 Summary:
 AST Parsed     : 43765/43765 (100.00%)
-Positive Passed: 43764/43765 (100.00%)
+Positive Passed: 43765/43765 (100.00%)
 Negative Passed: 4237/4237 (100.00%)
-Expect to Parse: tasks/coverage/test262/test/annexB/language/literals/regexp/legacy-octal-escape.js
-
-  × Regular Expression mismatch: \03 \3

  × '0'-prefixed octal literals and octal escape sequences are deprecated
    ╭─[test262/test/annexB/language/expressions/template-literal/legacy-octal-escape-sequence-strict.js:19:4]
--- a/tasks/coverage/semantic_test262.snap
+++ b/tasks/coverage/semantic_test262.snap
@ -2,7 +2,7 @@ commit: d62fa93c

 semantic_test262 Summary:
 AST Parsed     : 43765/43765 (100.00%)
-Positive Passed: 43564/43765 (99.54%)
+Positive Passed: 43565/43765 (99.54%)
 tasks/coverage/test262/test/annexB/language/function-code/if-decl-else-decl-a-func-block-scoping.js
 semantic error: Symbol scope ID mismatch:
 after transform: SymbolId(3): ScopeId(4294967294)
@ -1119,9 +1119,6 @@ semantic error: Symbol scope ID mismatch:
 after transform: SymbolId(0): ScopeId(4294967294)
 rebuilt        : SymbolId(0): ScopeId(4294967294)

-tasks/coverage/test262/test/annexB/language/literals/regexp/legacy-octal-escape.js
-semantic error: Regular Expression mismatch: \03 \3
-
 tasks/coverage/test262/test/language/module-code/eval-rqstd-once.js
 semantic error: Bindings mismatch:
 after transform: ScopeId(0): ["dflt1", "dflt2", "dflt3", "global", "ns1", "ns3"]