refactor(parser): reduce work parsing regexps (#1999)

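#1926 introduced a small performance regression: when parsing a regexp,
the parser repeated work the lexer had already done (locating the end of
the pattern and reading the flags). The lexer now returns both to the parser.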
This commit is contained in:
overlookmotel 2024-01-12 03:36:30 +00:00 committed by GitHub
parent 74dfa3be8b
commit c7316856db
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 23 additions and 29 deletions

View file

@ -1,5 +1,6 @@
//! Code related to navigating `Token`s from the lexer //! Code related to navigating `Token`s from the lexer
use oxc_ast::ast::RegExpFlags;
use oxc_diagnostics::Result; use oxc_diagnostics::Result;
use oxc_span::Span; use oxc_span::Span;
@ -200,8 +201,10 @@ impl<'a> Parser<'a> {
} }
/// Tell lexer to read a regex /// Tell lexer to read a regex
pub(crate) fn read_regex(&mut self) { pub(crate) fn read_regex(&mut self) -> (u32, RegExpFlags) {
self.token = self.lexer.next_regex(self.cur_kind()); let (token, pattern_end, flags) = self.lexer.next_regex(self.cur_kind());
self.token = token;
(pattern_end, flags)
} }
/// Tell lexer to read a template substitution tail /// Tell lexer to read a template substitution tail

View file

@ -180,7 +180,6 @@ impl<'a> Parser<'a> {
} }
Kind::LParen => self.parse_parenthesized_expression(span), Kind::LParen => self.parse_parenthesized_expression(span),
Kind::Slash | Kind::SlashEq => { Kind::Slash | Kind::SlashEq => {
self.read_regex();
let literal = self.parse_literal_regexp(); let literal = self.parse_literal_regexp();
Ok(self.ast.literal_regexp_expression(literal)) Ok(self.ast.literal_regexp_expression(literal))
} }
@ -320,22 +319,10 @@ impl<'a> Parser<'a> {
pub(crate) fn parse_literal_regexp(&mut self) -> RegExpLiteral { pub(crate) fn parse_literal_regexp(&mut self) -> RegExpLiteral {
let span = self.start_span(); let span = self.start_span();
// split out the flag part of `/regex/flag` by looking for `/` from the end // split out pattern
let regex_src = self.cur_src(); let (pattern_end, flags) = self.read_regex();
let mut flags = RegExpFlags::empty(); let pattern_start = self.cur_token().start + 1; // +1 to exclude `/`
let pattern = &self.source_text[pattern_start as usize..pattern_end as usize];
let mut split_index = None;
for (i, c) in regex_src.char_indices().rev() {
if let Ok(flag) = RegExpFlags::try_from(c) {
flags |= flag;
} else {
split_index.replace(i);
break;
}
}
// `/` are omitted from the pattern
let pattern = split_index.map_or(regex_src, |i| regex_src.get(1..i).unwrap_or(""));
self.bump_any(); self.bump_any();
self.ast.reg_exp_literal(self.end_span(span), pattern, flags) self.ast.reg_exp_literal(self.end_span(span), pattern, flags)

View file

@ -192,16 +192,17 @@ impl<'a> Lexer<'a> {
/// where a `RegularExpressionLiteral` is permitted /// where a `RegularExpressionLiteral` is permitted
/// Which means the parser needs to re-tokenize on `PrimaryExpression`, /// Which means the parser needs to re-tokenize on `PrimaryExpression`,
/// `RegularExpressionLiteral` only appear on the right hand side of `PrimaryExpression` /// `RegularExpressionLiteral` only appear on the right hand side of `PrimaryExpression`
pub fn next_regex(&mut self, kind: Kind) -> Token { pub fn next_regex(&mut self, kind: Kind) -> (Token, u32, RegExpFlags) {
self.current.token.start = self.offset() self.current.token.start = self.offset()
- match kind { - match kind {
Kind::Slash => 1, Kind::Slash => 1,
Kind::SlashEq => 2, Kind::SlashEq => 2,
_ => unreachable!(), _ => unreachable!(),
}; };
let kind = self.read_regex(); let (pattern_end, flags) = self.read_regex();
self.lookahead.clear(); self.lookahead.clear();
self.finish_next(kind) let token = self.finish_next(Kind::RegExp);
(token, pattern_end, flags)
} }
pub fn next_right_angle(&mut self) -> Token { pub fn next_right_angle(&mut self) -> Token {
@ -828,18 +829,20 @@ impl<'a> Lexer<'a> {
} }
/// 12.9.5 Regular Expression Literals /// 12.9.5 Regular Expression Literals
fn read_regex(&mut self) -> Kind { fn read_regex(&mut self) -> (u32, RegExpFlags) {
let mut in_escape = false; let mut in_escape = false;
let mut in_character_class = false; let mut in_character_class = false;
loop { loop {
match self.current.chars.next() { match self.current.chars.next() {
None => { None => {
self.error(diagnostics::UnterminatedRegExp(self.unterminated_range())); self.error(diagnostics::UnterminatedRegExp(self.unterminated_range()));
return Kind::Undetermined; return (self.offset(), RegExpFlags::empty());
} }
Some(c) if is_line_terminator(c) => { Some(c) if is_line_terminator(c) => {
self.error(diagnostics::UnterminatedRegExp(self.unterminated_range())); self.error(diagnostics::UnterminatedRegExp(self.unterminated_range()));
return Kind::Undetermined; #[allow(clippy::cast_possible_truncation)]
let pattern_end = self.offset() - c.len_utf8() as u32;
return (pattern_end, RegExpFlags::empty());
} }
Some(c) => { Some(c) => {
if in_escape { if in_escape {
@ -857,28 +860,29 @@ impl<'a> Lexer<'a> {
} }
} }
let pattern_end = self.offset() - 1; // -1 to exclude `/`
let mut flags = RegExpFlags::empty(); let mut flags = RegExpFlags::empty();
while let Some(ch @ ('$' | '_' | 'a'..='z' | 'A'..='Z' | '0'..='9')) = self.peek() { while let Some(ch @ ('$' | '_' | 'a'..='z' | 'A'..='Z' | '0'..='9')) = self.peek() {
self.current.chars.next(); self.current.chars.next();
if !ch.is_ascii_lowercase() { if !ch.is_ascii_lowercase() {
self.error(diagnostics::RegExpFlag(ch, self.current_offset())); self.error(diagnostics::RegExpFlag(ch, self.current_offset()));
return Kind::Undetermined; break;
} }
let flag = if let Ok(flag) = RegExpFlags::try_from(ch) { let flag = if let Ok(flag) = RegExpFlags::try_from(ch) {
flag flag
} else { } else {
self.error(diagnostics::RegExpFlag(ch, self.current_offset())); self.error(diagnostics::RegExpFlag(ch, self.current_offset()));
return Kind::Undetermined; break;
}; };
if flags.contains(flag) { if flags.contains(flag) {
self.error(diagnostics::RegExpFlagTwice(ch, self.current_offset())); self.error(diagnostics::RegExpFlagTwice(ch, self.current_offset()));
return Kind::Undetermined; break;
} }
flags |= flag; flags |= flag;
} }
Kind::RegExp (pattern_end, flags)
} }
/// 12.8.6 Template Literal Lexical Components /// 12.8.6 Template Literal Lexical Components