mirror of
https://github.com/danbulant/oxc
synced 2026-05-19 12:19:15 +00:00
feat(regular_expression): Validate max quantifier value (#5218)
I've never seen it used in practice, but `/a{9007199254740991}/` is valid, and 9007199254740991 (2^53 - 1) is the maximum value allowed for a quantifier.
Also left a comment about the #5210 experiment.
This commit is contained in:
parent
b39c0d6122
commit
46b641b75d
4 changed files with 56 additions and 15 deletions
|
|
@ -98,9 +98,9 @@ pub enum LookAroundAssertionKind {
|
|||
#[derive(Debug)]
|
||||
pub struct Quantifier<'a> {
|
||||
pub span: Span,
|
||||
pub min: u32,
|
||||
pub min: u64,
|
||||
/// `None` means no upper bound.
|
||||
pub max: Option<u32>,
|
||||
pub max: Option<u64>,
|
||||
pub greedy: bool,
|
||||
pub body: Term<'a>,
|
||||
}
|
||||
|
|
|
|||
|
|
@ -32,6 +32,8 @@ mod test {
|
|||
("a{1,", ParserOptions::default()),
|
||||
("a{1,}", ParserOptions::default()),
|
||||
("a{1,2}", ParserOptions::default()),
|
||||
("x{9007199254740991}", ParserOptions::default()),
|
||||
("x{9007199254740991,9007199254740991}", ParserOptions::default()),
|
||||
("a|b", ParserOptions::default()),
|
||||
("a|b|c", ParserOptions::default()),
|
||||
("a|b+?|c", ParserOptions::default()),
|
||||
|
|
@ -137,6 +139,8 @@ mod test {
|
|||
("a{1", ParserOptions::default().with_unicode_mode()),
|
||||
("a{1,", ParserOptions::default().with_unicode_mode()),
|
||||
("a{,", ParserOptions::default().with_unicode_mode()),
|
||||
("x{9007199254740992}", ParserOptions::default()),
|
||||
("x{9007199254740991,9007199254740992}", ParserOptions::default()),
|
||||
("(?=a", ParserOptions::default()),
|
||||
("(?<!a", ParserOptions::default()),
|
||||
(r"\c0", ParserOptions::default().with_unicode_mode()),
|
||||
|
|
|
|||
|
|
@ -1565,7 +1565,8 @@ impl<'a> PatternParser<'a> {
|
|||
// ```
|
||||
/// Returns: ((min, max), greedy)
|
||||
#[allow(clippy::type_complexity)]
|
||||
fn consume_quantifier(&mut self) -> Result<Option<((u32, Option<u32>), bool)>> {
|
||||
fn consume_quantifier(&mut self) -> Result<Option<((u64, Option<u64>), bool)>> {
|
||||
const MAX_QUANTIFIER: u64 = 9_007_199_254_740_991; // 2^53 - 1
|
||||
let is_greedy = |reader: &mut Reader| !reader.eat('?');
|
||||
|
||||
if self.reader.eat('*') {
|
||||
|
|
@ -1583,11 +1584,27 @@ impl<'a> PatternParser<'a> {
|
|||
if self.reader.eat('{') {
|
||||
if let Some(min) = self.consume_decimal_digits() {
|
||||
if self.reader.eat('}') {
|
||||
if MAX_QUANTIFIER < min {
|
||||
return Err(OxcDiagnostic::error(
|
||||
"Number is too large in braced quantifier",
|
||||
)
|
||||
.with_label(self.span_factory.create(span_start, self.reader.offset())));
|
||||
}
|
||||
|
||||
return Ok(Some(((min, Some(min)), is_greedy(&mut self.reader))));
|
||||
}
|
||||
|
||||
if self.reader.eat(',') {
|
||||
if self.reader.eat('}') {
|
||||
if MAX_QUANTIFIER < min {
|
||||
return Err(OxcDiagnostic::error(
|
||||
"Number is too large in braced quantifier",
|
||||
)
|
||||
.with_label(
|
||||
self.span_factory.create(span_start, self.reader.offset()),
|
||||
));
|
||||
}
|
||||
|
||||
return Ok(Some(((min, None), is_greedy(&mut self.reader))));
|
||||
}
|
||||
|
||||
|
|
@ -1603,6 +1620,14 @@ impl<'a> PatternParser<'a> {
|
|||
self.span_factory.create(span_start, self.reader.offset()),
|
||||
));
|
||||
}
|
||||
if MAX_QUANTIFIER < min || MAX_QUANTIFIER < max {
|
||||
return Err(OxcDiagnostic::error(
|
||||
"Number is too large in braced quantifier",
|
||||
)
|
||||
.with_label(
|
||||
self.span_factory.create(span_start, self.reader.offset()),
|
||||
));
|
||||
}
|
||||
|
||||
return Ok(Some(((min, Some(max)), is_greedy(&mut self.reader))));
|
||||
}
|
||||
|
|
@ -1626,7 +1651,8 @@ impl<'a> PatternParser<'a> {
|
|||
if let Some(index) = self.consume_decimal_digits() {
|
||||
// \0 is CharacterEscape, not DecimalEscape
|
||||
if index != 0 {
|
||||
return Some(index);
|
||||
#[allow(clippy::cast_possible_truncation)]
|
||||
return Some(index as u32);
|
||||
}
|
||||
|
||||
self.reader.rewind(checkpoint);
|
||||
|
|
@ -1642,13 +1668,15 @@ impl<'a> PatternParser<'a> {
|
|||
// [+Sep] DecimalDigits[+Sep] NumericLiteralSeparator DecimalDigit
|
||||
// ```
|
||||
// ([Sep] is disabled for `QuantifierPrefix` and `DecimalEscape`, skip it)
|
||||
fn consume_decimal_digits(&mut self) -> Option<u32> {
|
||||
fn consume_decimal_digits(&mut self) -> Option<u64> {
|
||||
let checkpoint = self.reader.checkpoint();
|
||||
|
||||
let mut value = 0;
|
||||
while let Some(cp) = self.reader.peek().filter(|&cp| unicode::is_decimal_digit(cp)) {
|
||||
// `- '0' as u32`: convert code point to digit
|
||||
value = (10 * value) + (cp - '0' as u32);
|
||||
#[allow(clippy::cast_lossless)]
|
||||
let d = (cp - '0' as u32) as u64;
|
||||
value = (10 * value) + d;
|
||||
self.reader.advance();
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,21 +1,29 @@
|
|||
// NOTE: The current implementation switches iteration units depending on the mode.
|
||||
//
|
||||
// This is all for surrogate pairs in non-unicode mode, but it is required only in limited cases:
|
||||
// - Group names for named `CapturingGroup`: `(?<name>.)`
|
||||
// - Group names for `NamedReference`: `\k<name>`
|
||||
// Even if we skip that distinction, it seems the current test262 cases pass due to other errors.
|
||||
//
|
||||
// Therefore, it is possible to change the implementation to iterate on `char` units always,
|
||||
// provided we accept that some AST output changes from `Character[kind=Symbol]` to `Character[kind=SurrogatePairs]`.
|
||||
//
|
||||
// However, for the following reasons, we keep the current implementation:
|
||||
// - We want to keep the behavior closer to the specification
|
||||
// - and also, to prevent any oversight
|
||||
// - Changing does not have a significant impact on performance
|
||||
//
|
||||
// See also: https://github.com/oxc-project/oxc/pull/5210
|
||||
pub struct Reader<'a> {
|
||||
source: &'a str,
|
||||
unicode_mode: bool,
|
||||
/// Current index for `u8_units`(unicode mode) or `u16_units`(non-unicode mode).
|
||||
index: usize,
|
||||
// NOTE: Distinguish these 2 units looks cleaner, but it may not be necessary.
|
||||
//
|
||||
// If I understand correctly (and there are no unexpected factors),
|
||||
// AST `Character[kind=Symbol]` only needs to be aware of this for surrogate pairs.
|
||||
//
|
||||
// Therefore, performance might be improved by:
|
||||
// - using only `u8_units`, and
|
||||
// - checking if each unit (char) is non-BMP, and if so, converting it into a surrogate pair and emitting 2 units.
|
||||
// However, I'm not certain this approach is faster than current one using `encode_utf16()` all at once.
|
||||
/// Iteration units for unicode mode.
|
||||
/// Even in non-unicode mode, used for `Span` offset calculation.
|
||||
u8_units: Vec<(usize, char)>,
|
||||
/// Iteration units for non-unicode mode.
|
||||
/// To iterate on surrogate pairs, this is needed.
|
||||
u16_units: Vec<u16>,
|
||||
/// Last offset caches for non-unicode mode.
|
||||
last_offset_indices: (usize, usize),
|
||||
|
|
@ -67,6 +75,7 @@ impl<'a> Reader<'a> {
|
|||
self.index += 1;
|
||||
}
|
||||
|
||||
// We need a code point, not a char.
|
||||
fn peek_nth(&self, n: usize) -> Option<u32> {
|
||||
let nth = self.index + n;
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue