fix(regular_expression): Keep LegacyOctalEscape raw digits for to_string (#5692)

Fixes #5690

- Update `CharacterKind` enum from `Octal` to `Octal1`, `Octal2` and `Octal3`
- Stylistic refactoring for `impl Display`
This commit is contained in:
leaysgur 2024-09-11 07:07:00 +00:00
parent 9e9435f03b
commit 304ce25446
6 changed files with 149 additions and 117 deletions

View file

@ -174,10 +174,13 @@ pub enum CharacterKind {
HexadecimalEscape = 1,
Identifier = 2,
Null = 3,
Octal = 4,
SingleEscape = 5,
Symbol = 6,
UnicodeEscape = 7,
// To distinguish leading 0 cases like `\00` and `\000`
Octal1 = 4,
Octal2 = 5,
Octal3 = 6,
SingleEscape = 7,
Symbol = 8,
UnicodeEscape = 9,
}
/// Character class.

View file

@ -691,12 +691,21 @@ impl<'a> PatternParser<'a> {
}));
}
// e.g. \18
// e.g. \1, \00, \000
if !self.state.unicode_mode {
if let Some(cp) = self.consume_legacy_octal_escape_sequence() {
let span_end = self.reader.offset();
// Keep original digits for `to_string()`
// Otherwise `\0022`(octal \002 + symbol 2) will be `\22`(octal \22)
let digits = span_end - span_start - 1; // -1 for '\'
return Ok(Some(ast::Character {
span: self.span_factory.create(span_start, self.reader.offset()),
kind: ast::CharacterKind::Octal,
span: self.span_factory.create(span_start, span_end),
kind: (match digits {
3 => ast::CharacterKind::Octal3,
2 => ast::CharacterKind::Octal2,
_ => ast::CharacterKind::Octal1,
}),
value: cp,
}));
}

View file

@ -16,24 +16,23 @@ impl<'a> Display for RegularExpression<'a> {
impl Display for Flags {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let mut flags = String::with_capacity(8);
macro_rules! if_true_append {
($flag:ident, $char:literal) => {
if self.$flag {
flags.push($char);
}
};
}
// write flags in the order they are described in the `MDN`
// <https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions#advanced_searching_with_flags>
if_true_append!(has_indices, 'd');
if_true_append!(global, 'g');
if_true_append!(ignore_case, 'i');
if_true_append!(multiline, 'm');
if_true_append!(dot_all, 's');
if_true_append!(unicode, 'u');
if_true_append!(unicode_sets, 'v');
if_true_append!(sticky, 'y');
for (v, ch) in [
(self.has_indices, 'd'),
(self.global, 'g'),
(self.ignore_case, 'i'),
(self.multiline, 'm'),
(self.dot_all, 's'),
(self.unicode, 'u'),
(self.unicode_sets, 'v'),
(self.sticky, 'y'),
] {
if v {
flags.push(ch);
}
}
write!(f, "{flags}")
}
@ -60,14 +59,17 @@ impl<'a> Display for Alternative<'a> {
None
}
}
write_join_with(f, "", &self.body, |iter| {
let next = iter.next()?;
let Some(next) = as_character(next) else { return Some(next.to_string()) };
let peek = iter.peek().and_then(|it| as_character(it));
let (result, eat) = character_to_string(next, peek);
if eat {
_ = iter.next();
iter.next();
}
Some(result)
})
}
@ -208,25 +210,30 @@ impl<'a> Display for CharacterClass<'a> {
None
}
}
write!(f, "[")?;
if !self.body.is_empty() {
if self.negative {
write!(f, "^")?;
}
let sep = match self.kind {
CharacterClassContentsKind::Union => "",
CharacterClassContentsKind::Subtraction => "--",
CharacterClassContentsKind::Intersection => "&&",
};
write_join_with(f, sep, &self.body, |iter| {
let next = iter.next()?;
let Some(next) = as_character(next) else { return Some(next.to_string()) };
let peek = iter.peek().and_then(|it| as_character(it));
let (result, eat) = character_to_string(next, peek);
if eat {
_ = iter.next();
iter.next();
}
Some(result)
})?;
}
@ -270,12 +277,14 @@ impl<'a> Display for ClassString<'a> {
impl<'a> Display for CapturingGroup<'a> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let body = &self.body;
write!(f, "(")?;
if let Some(name) = &self.name {
write!(f, "(?<{name}>{body})")
} else {
write!(f, "({body})")
write!(f, "?<{name}>")?;
}
write!(f, "{}", &self.body)?;
write!(f, ")")
}
}
@ -299,12 +308,14 @@ impl<'a> Display for IgnoreGroup<'a> {
}
write!(f, "(?")?;
if let Some(enabling) = &self.enabling_modifiers {
write_flags(f, '\0', enabling)?;
}
if let Some(disabling) = &self.disabling_modifiers {
write_flags(f, '-', disabling)?;
}
write!(f, ":{})", self.body)
}
}
@ -321,6 +332,88 @@ impl<'a> Display for NamedReference<'a> {
}
}
// ---
fn character_to_string(
this: &Character,
peek: Option<&Character>,
) -> (/* result */ String, /* true of peek should be consumed */ bool) {
let cp = this.value;
if matches!(this.kind, CharacterKind::Symbol | CharacterKind::UnicodeEscape) {
// Trail only
if is_trail_surrogate(cp) {
return (format!(r"\u{cp:X}"), false);
}
if is_lead_surrogate(cp) {
if let Some(peek) = peek.filter(|peek| is_trail_surrogate(peek.value)) {
// Lead+Trail
let cp = combine_surrogate_pair(cp, peek.value);
let ch = char::from_u32(cp).expect("Invalid surrogate pair `Character`!");
return (format!("{ch}"), true);
}
// Lead only
return (format!(r"\u{cp:X}"), false);
}
}
let ch = char::from_u32(cp).expect("Invalid `Character`!");
let result = match this.kind {
// Not a surrogate, like BMP, or all units in unicode mode
CharacterKind::Symbol => format!("{ch}"),
CharacterKind::ControlLetter => match ch {
'\n' => r"\cJ".to_string(),
'\r' => r"\cM".to_string(),
'\t' => r"\cI".to_string(),
_ => format!(r"\c{ch}"),
},
CharacterKind::Identifier => {
format!(r"\{ch}")
}
CharacterKind::SingleEscape => match ch {
'\n' => String::from(r"\n"),
'\r' => String::from(r"\r"),
'\t' => String::from(r"\t"),
'\u{b}' => String::from(r"\v"),
'\u{c}' => String::from(r"\f"),
'\u{8}' => String::from(r"\b"),
'\u{2D}' => String::from(r"\-"),
_ => format!(r"\{ch}"),
},
CharacterKind::Null => String::from(r"\0"),
CharacterKind::UnicodeEscape => {
let hex = &format!("{cp:04X}");
if hex.len() <= 4 {
format!(r"\u{hex}")
} else {
format!(r"\u{{{hex}}}")
}
}
CharacterKind::HexadecimalEscape => {
let hex = &format!("{cp:02X}");
format!(r"\x{hex}")
}
CharacterKind::Octal1 => {
let octal = format!("{cp:o}");
format!(r"\{octal}")
}
CharacterKind::Octal2 => {
let octal = format!("{cp:02o}");
format!(r"\{octal}")
}
CharacterKind::Octal3 => {
let octal = format!("{cp:03o}");
format!(r"\{octal}")
}
};
(result, false)
}
// ---
fn write_join<S, I, E>(f: &mut fmt::Formatter<'_>, sep: S, items: I) -> fmt::Result
where
S: AsRef<str>,
@ -351,78 +444,9 @@ where
Ok(())
}
fn character_to_string(
this: &Character,
peek: Option<&Character>,
) -> (/* result */ String, /* true of peek should be consumed */ bool) {
let cp = this.value;
if matches!(this.kind, CharacterKind::Symbol | CharacterKind::UnicodeEscape) {
// Trail only
if is_trail_surrogate(cp) {
return (format!(r"\u{cp:X}"), false);
}
if is_lead_surrogate(cp) {
if let Some(peek) = peek.filter(|peek| is_trail_surrogate(peek.value)) {
// Lead+Trail
let cp = combine_surrogate_pair(cp, peek.value);
let ch = char::from_u32(cp).expect("Invalid surrogate pair `Character`!");
return (format!("{ch}"), true);
}
// Lead only
return (format!(r"\u{cp:X}"), false);
}
}
let ch = char::from_u32(cp).expect("Invalid `Character`!");
let result = match this.kind {
CharacterKind::ControlLetter => match ch {
'\n' => r"\cJ".to_string(),
'\r' => r"\cM".to_string(),
'\t' => r"\cI".to_string(),
_ => format!(r"\c{ch}"),
},
CharacterKind::Identifier => {
format!(r"\{ch}")
}
// Not a surrogate, like BMP, or all units in unicode mode
CharacterKind::Symbol => format!("{ch}"),
CharacterKind::Null => String::from(r"\0"),
CharacterKind::UnicodeEscape => {
let hex = &format!("{cp:04X}");
if hex.len() <= 4 {
format!(r"\u{hex}")
} else {
format!(r"\u{{{hex}}}")
}
}
CharacterKind::HexadecimalEscape => {
let hex = &format!("{cp:02X}");
format!(r"\x{hex}")
}
CharacterKind::Octal => {
let octal = format!("{cp:o}");
format!(r"\{octal}")
}
CharacterKind::SingleEscape => match ch {
'\n' => String::from(r"\n"),
'\r' => String::from(r"\r"),
'\t' => String::from(r"\t"),
'\u{b}' => String::from(r"\v"),
'\u{c}' => String::from(r"\f"),
'\u{8}' => String::from(r"\b"),
'\u{2D}' => String::from(r"\-"),
_ => format!(r"\{ch}"),
},
};
(result, false)
}
#[cfg(test)]
mod test {
use crate::{Parser, ParserOptions};
use oxc_allocator::Allocator;
type Case<'a> = (
@ -505,23 +529,24 @@ mod test {
(r"/\5/", None),
(r"/\6/", None),
(r"/\7/", None),
// Remove leading zeroes --
(r"/\00/", Some(r"/\0/")),
(r"/\07/", Some(r"/\7/")),
// --
(r"/\00/", None),
(r"/\07/", None),
(r"/\30/", None),
(r"/\37/", None),
(r"/\40/", None),
(r"/\47/", None),
(r"/\70/", None),
(r"/\77/", None),
// Remove leading zeroes --
(r"/\000/", Some(r"/\0/")),
(r"/\007/", Some(r"/\7/")),
(r"/\070/", Some(r"/\70/")),
// --
(r"/\000/", None),
(r"/\007/", None),
(r"/\070/", None),
(r"/\300/", None),
(r"/\307/", None),
(r"/\370/", None),
(r"/\377/", None),
(r"/\0111/", None),
(r"/\0022/", None),
(r"/\0003/", None),
(r"/(.)\1/", None),
// Identity escape from: <https://github.com/tc39/test262/blob/d62fa93c8f9ce5e687c0bbaa5d2b59670ab2ff60/test/annexB/language/literals/regexp/identity-escape.js>
(r"/\C/", None),
@ -553,7 +578,6 @@ mod test {
];
fn test_display(allocator: &Allocator, (source, expect): &Case) {
use crate::{Parser, ParserOptions};
let expect = expect.unwrap_or(source);
let actual = Parser::new(allocator, source, ParserOptions::default()).parse().unwrap();
assert_eq!(expect, actual.to_string());

View file

@ -171,7 +171,9 @@ impl<'alloc> CloneIn<'alloc> for CharacterKind {
Self::HexadecimalEscape => CharacterKind::HexadecimalEscape,
Self::Identifier => CharacterKind::Identifier,
Self::Null => CharacterKind::Null,
Self::Octal => CharacterKind::Octal,
Self::Octal1 => CharacterKind::Octal1,
Self::Octal2 => CharacterKind::Octal2,
Self::Octal3 => CharacterKind::Octal3,
Self::SingleEscape => CharacterKind::SingleEscape,
Self::Symbol => CharacterKind::Symbol,
Self::UnicodeEscape => CharacterKind::UnicodeEscape,

View file

@ -2,11 +2,8 @@ commit: d62fa93c
parser_test262 Summary:
AST Parsed : 43765/43765 (100.00%)
Positive Passed: 43764/43765 (100.00%)
Positive Passed: 43765/43765 (100.00%)
Negative Passed: 4237/4237 (100.00%)
Expect to Parse: tasks/coverage/test262/test/annexB/language/literals/regexp/legacy-octal-escape.js
× Regular Expression mismatch: \03 \3
× '0'-prefixed octal literals and octal escape sequences are deprecated
╭─[test262/test/annexB/language/expressions/template-literal/legacy-octal-escape-sequence-strict.js:19:4]

View file

@ -2,7 +2,7 @@ commit: d62fa93c
semantic_test262 Summary:
AST Parsed : 43765/43765 (100.00%)
Positive Passed: 43564/43765 (99.54%)
Positive Passed: 43565/43765 (99.54%)
tasks/coverage/test262/test/annexB/language/function-code/if-decl-else-decl-a-func-block-scoping.js
semantic error: Symbol scope ID mismatch:
after transform: SymbolId(3): ScopeId(4294967294)
@ -1119,9 +1119,6 @@ semantic error: Symbol scope ID mismatch:
after transform: SymbolId(0): ScopeId(4294967294)
rebuilt : SymbolId(0): ScopeId(4294967294)
tasks/coverage/test262/test/annexB/language/literals/regexp/legacy-octal-escape.js
semantic error: Regular Expression mismatch: \03 \3
tasks/coverage/test262/test/language/module-code/eval-rqstd-once.js
semantic error: Bindings mismatch:
after transform: ScopeId(0): ["dflt1", "dflt2", "dflt3", "global", "ns1", "ns3"]