From 08438e04ba83cd679fa5e8982a3e899382f59e8f Mon Sep 17 00:00:00 2001 From: Boshen Date: Mon, 8 Jan 2024 13:48:52 +0800 Subject: [PATCH] refactor(parser): remove TokenValue::RegExp from `Token` (#1926) This PR is part of #1880. `Token` size is reduced from 48 to 40 bytes. To reconstruct the regex pattern and flags within the parser, the regex string is re-parsed from the end by reading all valid flags. In order to make things work nicely, the lexer will no longer recover from an invalid regex. --- crates/oxc_ast/src/ast_builder.rs | 9 +++++-- crates/oxc_parser/src/js/expression.rs | 36 ++++++++++++++++---------- crates/oxc_parser/src/lexer/mod.rs | 30 ++++++--------------- crates/oxc_parser/src/lexer/token.rs | 17 +----------- tasks/coverage/parser_babel.snap | 27 +++++++++++++++++++ tasks/coverage/parser_test262.snap | 25 ++++++++++++++++++ tasks/coverage/parser_typescript.snap | 5 ++++ 7 files changed, 95 insertions(+), 54 deletions(-) diff --git a/crates/oxc_ast/src/ast_builder.rs b/crates/oxc_ast/src/ast_builder.rs index 84ac29a13..3c9b69c7c 100644 --- a/crates/oxc_ast/src/ast_builder.rs +++ b/crates/oxc_ast/src/ast_builder.rs @@ -163,8 +163,13 @@ impl<'a> AstBuilder<'a> { TemplateElementValue { raw, cooked } } - pub fn reg_exp_literal(&self, span: Span, pattern: Atom, flags: RegExpFlags) -> RegExpLiteral { - RegExpLiteral { span, value: EmptyObject, regex: RegExp { pattern, flags } } + pub fn reg_exp_literal( + &self, + span: Span, + pattern: &'a str, + flags: RegExpFlags, + ) -> RegExpLiteral { + RegExpLiteral { span, value: EmptyObject, regex: RegExp { pattern: pattern.into(), flags } } } pub fn literal_string_expression(&self, literal: StringLiteral) -> Expression<'a> { diff --git a/crates/oxc_parser/src/js/expression.rs b/crates/oxc_parser/src/js/expression.rs index 72594a12e..00d0680df 100644 --- a/crates/oxc_parser/src/js/expression.rs +++ b/crates/oxc_parser/src/js/expression.rs @@ -185,8 +185,8 @@ impl<'a> Parser<'a> { Kind::LParen => 
self.parse_parenthesized_expression(span), Kind::Slash | Kind::SlashEq => { self.read_regex(); - self.parse_literal_regexp() - .map(|literal| self.ast.literal_regexp_expression(literal)) + let literal = self.parse_literal_regexp(); + Ok(self.ast.literal_regexp_expression(literal)) } // JSXElement, JSXFragment Kind::LAngle if self.source_type.is_jsx() => self.parse_jsx_expression(), @@ -315,20 +315,28 @@ impl<'a> Parser<'a> { Ok(self.ast.bigint_literal(self.end_span(span), value, base)) } - pub(crate) fn parse_literal_regexp(&mut self) -> Result { + pub(crate) fn parse_literal_regexp(&mut self) -> RegExpLiteral { let span = self.start_span(); - let r = match self.cur_kind() { - Kind::RegExp => self.cur_token().value.as_regex(), - _ => return Err(self.unexpected()), - }; - let pattern = Atom::from(r.pattern); - let flags = r.flags; + + // split out the flag part of `/regex/flag` by looking for `/` from the end + let regex_src = self.cur_src(); + let mut flags = RegExpFlags::empty(); + + let mut split_index = None; + for (i, c) in regex_src.char_indices().rev() { + if let Ok(flag) = RegExpFlags::try_from(c) { + flags |= flag; + } else { + split_index.replace(i); + break; + } + } + + // `/` are omitted from the pattern + let pattern = split_index.map_or(regex_src, |i| regex_src.get(1..i).unwrap_or("")); + self.bump_any(); - Ok(RegExpLiteral { - span: self.end_span(span), - value: EmptyObject {}, - regex: RegExp { pattern, flags }, - }) + self.ast.reg_exp_literal(self.end_span(span), pattern, flags) } pub(crate) fn parse_literal_string(&mut self) -> Result { diff --git a/crates/oxc_parser/src/lexer/mod.rs b/crates/oxc_parser/src/lexer/mod.rs index 1b016f910..d8bf42fbc 100644 --- a/crates/oxc_parser/src/lexer/mod.rs +++ b/crates/oxc_parser/src/lexer/mod.rs @@ -24,7 +24,7 @@ use oxc_syntax::{ }, unicode_id_start::is_id_start_unicode, }; -pub use token::{RegExp, Token, TokenValue}; +pub use token::{Token, TokenValue}; pub use self::{kind::Kind, number::parse_big_int}; use 
self::{ @@ -819,7 +819,6 @@ impl<'a> Lexer<'a> { /// 12.9.5 Regular Expression Literals fn read_regex(&mut self) -> Kind { - let start = self.current.token.start + 1; // +1 to exclude `/` let mut in_escape = false; let mut in_character_class = false; loop { @@ -848,40 +847,27 @@ impl<'a> Lexer<'a> { } } - let end = self.offset() - 1; // -1 to exclude `/` - let pattern = &self.source[start as usize..end as usize]; - let mut flags = RegExpFlags::empty(); while let Some(ch @ ('$' | '_' | 'a'..='z' | 'A'..='Z' | '0'..='9')) = self.peek() { self.current.chars.next(); if !ch.is_ascii_lowercase() { self.error(diagnostics::RegExpFlag(ch, self.current_offset())); - continue; + return Kind::Undetermined; } - let flag = match ch { - 'g' => RegExpFlags::G, - 'i' => RegExpFlags::I, - 'm' => RegExpFlags::M, - 's' => RegExpFlags::S, - 'u' => RegExpFlags::U, - 'y' => RegExpFlags::Y, - 'd' => RegExpFlags::D, - 'v' => RegExpFlags::V, - _ => { - self.error(diagnostics::RegExpFlag(ch, self.current_offset())); - continue; - } + let flag = if let Ok(flag) = RegExpFlags::try_from(ch) { + flag + } else { + self.error(diagnostics::RegExpFlag(ch, self.current_offset())); + return Kind::Undetermined; }; if flags.contains(flag) { self.error(diagnostics::RegExpFlagTwice(ch, self.current_offset())); - continue; + return Kind::Undetermined; } flags |= flag; } - self.current.token.value = TokenValue::RegExp(RegExp { pattern, flags }); - Kind::RegExp } diff --git a/crates/oxc_parser/src/lexer/token.rs b/crates/oxc_parser/src/lexer/token.rs index b7ea93039..72953af77 100644 --- a/crates/oxc_parser/src/lexer/token.rs +++ b/crates/oxc_parser/src/lexer/token.rs @@ -1,6 +1,5 @@ //! 
Token -use oxc_ast::ast::RegExpFlags; use oxc_span::Span; use super::kind::Kind; @@ -29,7 +28,7 @@ pub struct Token<'a> { mod size_asserts { use oxc_index::assert_eq_size; - assert_eq_size!(super::Token, [u8; 48]); + assert_eq_size!(super::Token, [u8; 40]); } impl<'a> Token<'a> { @@ -43,13 +42,6 @@ pub enum TokenValue<'a> { None, Number(f64), String(&'a str), - RegExp(RegExp<'a>), -} - -#[derive(Debug, Copy, Clone)] -pub struct RegExp<'a> { - pub pattern: &'a str, - pub flags: RegExpFlags, } impl<'a> Default for TokenValue<'a> { @@ -66,13 +58,6 @@ impl<'a> TokenValue<'a> { } } - pub fn as_regex(&self) -> &RegExp<'a> { - match self { - Self::RegExp(regex) => regex, - _ => unreachable!("expected regex!"), - } - } - pub fn get_string(&self) -> Option<&str> { match self { Self::String(s) => Some(s), diff --git a/tasks/coverage/parser_babel.snap b/tasks/coverage/parser_babel.snap index be32a8d9b..a86329c50 100644 --- a/tasks/coverage/parser_babel.snap +++ b/tasks/coverage/parser_babel.snap @@ -1141,6 +1141,12 @@ Expect to Parse: "typescript/types/const-type-parameters-babel-7/input.ts" ╭─[core/uncategorised/380/input.js:1:1] 1 │ var x = / · ── + 2 │ / + ╰──── + + × Unexpected token + ╭─[core/uncategorised/380/input.js:1:1] + 1 │ var x = / 2 │ / ╰──── @@ -1523,6 +1529,12 @@ Expect to Parse: "typescript/types/const-type-parameters-babel-7/input.ts" ╭─[core/uncategorised/441/input.js:1:1] 1 │ /a\ · ──── + 2 │ / + ╰──── + + × Unexpected token + ╭─[core/uncategorised/441/input.js:1:1] + 1 │ /a\ 2 │ / ╰──── @@ -7967,6 +7979,11 @@ Expect to Parse: "typescript/types/const-type-parameters-babel-7/input.ts" 2 │ / ╰──── + × Unexpected token + ╭─[esprima/invalid-syntax/migrated_0040/input.js:2:1] + 2 │ / + ╰──── + × Invalid Unicode escape sequence ╭─[esprima/invalid-syntax/migrated_0041/input.js:1:1] 1 │ var x = /[a-z]/\ux @@ -8141,6 +8158,11 @@ Expect to Parse: "typescript/types/const-type-parameters-babel-7/input.ts" 2 │ / ╰──── + × Unexpected token + 
╭─[esprima/invalid-syntax/migrated_0062/input.js:2:1] + 2 │ / + ╰──── + × Unterminated string ╭─[esprima/invalid-syntax/migrated_0063/input.js:1:1] 1 │ var x = " @@ -8681,6 +8703,11 @@ Expect to Parse: "typescript/types/const-type-parameters-babel-7/input.ts" 2 │ / ╰──── + × Unexpected token + ╭─[esprima/invalid-syntax/migrated_0157/input.js:2:1] + 2 │ / + ╰──── + × Unexpected token ╭─[esprima/invalid-syntax/migrated_0158/input.js:2:1] 2 │ diff --git a/tasks/coverage/parser_test262.snap b/tasks/coverage/parser_test262.snap index 84d666282..094b8d3a2 100644 --- a/tasks/coverage/parser_test262.snap +++ b/tasks/coverage/parser_test262.snap @@ -18251,6 +18251,11 @@ Expect Syntax Error: "language/import/import-attributes/json-named-bindings.js" · ─ ╰──── + × Unexpected token + ╭─[language/line-terminators/invalid-regexp-cr.js:18:1] + 18 │ / + ╰──── + × Unterminated regular expression ╭─[language/line-terminators/invalid-regexp-lf.js:16:1] 16 │ @@ -18259,6 +18264,11 @@ Expect Syntax Error: "language/import/import-attributes/json-named-bindings.js" 18 │ / ╰──── + × Unexpected token + ╭─[language/line-terminators/invalid-regexp-lf.js:18:1] + 18 │ / + ╰──── + × Unterminated regular expression ╭─[language/line-terminators/invalid-regexp-ls.js:16:1] 16 │ @@ -18266,6 +18276,11 @@ Expect Syntax Error: "language/import/import-attributes/json-named-bindings.js" · ── ╰──── + × Unexpected token + ╭─[language/line-terminators/invalid-regexp-ls.js:17:1] + 17 │ /
/ + ╰──── + × Unterminated regular expression ╭─[language/line-terminators/invalid-regexp-ps.js:16:1] 16 │ @@ -18273,6 +18288,11 @@ Expect Syntax Error: "language/import/import-attributes/json-named-bindings.js" · ── ╰──── + × Unexpected token + ╭─[language/line-terminators/invalid-regexp-ps.js:17:1] + 17 │ /
/ + ╰──── + × Unterminated string ╭─[language/line-terminators/invalid-string-cr.js:15:1] 15 │ @@ -31537,6 +31557,11 @@ Expect Syntax Error: "language/import/import-attributes/json-named-bindings.js" · ─────── ╰──── + × Expected `}` but found `EOF` + ╭─[language/statements/function/invalid-function-body-1.js:17:1] + 17 │ function __func(){/ ABC} + ╰──── + × Unexpected token ╭─[language/statements/function/invalid-function-body-2.js:16:1] 16 │ diff --git a/tasks/coverage/parser_typescript.snap b/tasks/coverage/parser_typescript.snap index 9fcf4516b..50236e5c6 100644 --- a/tasks/coverage/parser_typescript.snap +++ b/tasks/coverage/parser_typescript.snap @@ -17136,6 +17136,11 @@ Expect to Parse: "conformance/salsa/plainJSRedeclare3.ts" · ──────────── ╰──── + × Expected `)` but found `EOF` + ╭─[conformance/parser/ecmascript5/RegularExpressions/parserRegularExpressionDivideAmbiguity4.ts:1:1] + 1 │ foo(/notregexp); + ╰──── + × Expected a semicolon or an implicit semicolon after a statement, but found none ╭─[conformance/parser/ecmascript5/RegularExpressions/parserRegularExpressionDivideAmbiguity7.ts:1:1] 1 │ (a/8