From 24ded3cb1530cb5cf69d021cb96d67318082954f Mon Sep 17 00:00:00 2001 From: overlookmotel Date: Wed, 28 Feb 2024 06:39:23 +0000 Subject: [PATCH] perf(parser): lex JSX strings with `memchr` (#2528) Simplify lexing JSX string attributes. As the search is purely for a single byte value (the closing quote), and so doesn't require a byte table, use `memchr`. This change doesn't really register on benchmarks, but it's one step closer to removing `AutoCow`, and transitioning all the searches in the lexer to byte-by-byte. --- crates/oxc_parser/src/lexer/jsx.rs | 45 +++++++++++++++------------ crates/oxc_parser/src/lexer/string.rs | 31 ++++++++---------- 2 files changed, 38 insertions(+), 38 deletions(-) diff --git a/crates/oxc_parser/src/lexer/jsx.rs b/crates/oxc_parser/src/lexer/jsx.rs index 307109c34..c1c506fef 100644 --- a/crates/oxc_parser/src/lexer/jsx.rs +++ b/crates/oxc_parser/src/lexer/jsx.rs @@ -1,6 +1,7 @@ -use super::{AutoCow, Kind, Lexer, Token}; +use super::{Kind, Lexer, Token}; use crate::diagnostics; +use memchr::memchr; use oxc_syntax::identifier::{is_identifier_part, is_identifier_start}; impl<'a> Lexer<'a> { @@ -14,25 +15,29 @@ impl<'a> Lexer<'a> { /// `JSXStringCharacter` but not ' /// `JSXStringCharacter` :: /// `SourceCharacter` but not one of `HTMLCharacterReference` - pub(super) fn read_jsx_string_literal(&mut self, delimiter: char) -> Kind { - let mut builder = AutoCow::new(self); - loop { - match self.next_char() { - Some(c @ ('"' | '\'')) => { - if c == delimiter { - self.save_string(builder.has_escape(), builder.finish_without_push(self)); - return Kind::Str; - } - builder.push_matching(c); - } - Some(other) => { - builder.push_matching(other); - } - None => { - self.error(diagnostics::UnterminatedString(self.unterminated_range())); - return Kind::Undetermined; - } - } + + /// Read JSX string literal. + /// # SAFETY + /// * `delimiter` must be an ASCII character. + /// * Next char in `lexer.source` must be ASCII. 
+ pub(super) unsafe fn read_jsx_string_literal(&mut self, delimiter: u8) -> Kind { + // Skip opening quote + debug_assert!(delimiter.is_ascii()); + // SAFETY: Caller guarantees next byte is ASCII, so `.add(1)` is a UTF-8 char boundary + let after_opening_quote = self.source.position().add(1); + let remaining = self.source.str_from_pos_to_end(after_opening_quote); + + let len = memchr(delimiter, remaining.as_bytes()); + if let Some(len) = len { + // SAFETY: `after_opening_quote` + `len` is position of delimiter. + // Caller guarantees delimiter is ASCII, so 1 byte after it is a UTF-8 char boundary. + let after_closing_quote = after_opening_quote.add(len + 1); + self.source.set_position(after_closing_quote); + Kind::Str + } else { + self.source.advance_to_end(); + self.error(diagnostics::UnterminatedString(self.unterminated_range())); + Kind::Undetermined } } diff --git a/crates/oxc_parser/src/lexer/string.rs b/crates/oxc_parser/src/lexer/string.rs index 8325d2f95..a37aee099 100644 --- a/crates/oxc_parser/src/lexer/string.rs +++ b/crates/oxc_parser/src/lexer/string.rs @@ -25,6 +25,13 @@ static SINGLE_QUOTE_STRING_END_TABLE: SafeByteMatchTable = /// `$table` must only match `$delimiter`, '\', '\r' or '\n'. macro_rules! handle_string_literal { ($lexer:ident, $delimiter:expr, $table:ident) => {{ + debug_assert!($delimiter.is_ascii()); + + if $lexer.context == LexerContext::JsxAttributeValue { + // SAFETY: Caller guarantees `$delimiter` is ASCII, and next char is ASCII + return $lexer.read_jsx_string_literal($delimiter); + } + // Skip opening quote. // SAFETY: Caller guarantees next byte is ASCII, so safe to advance past it. let after_opening_quote = $lexer.source.position().add(1); @@ -157,30 +164,18 @@ impl<'a> Lexer<'a> { /// # SAFETY /// Next character must be `"`. 
pub(super) unsafe fn read_string_literal_double_quote(&mut self) -> Kind { - if self.context == LexerContext::JsxAttributeValue { - // SAFETY: Caller guarantees next char is `"` - self.source.next_byte_unchecked(); - self.read_jsx_string_literal('"') - } else { - // SAFETY: Caller guarantees next char is `"`, which is ASCII. - // b'"' is an ASCII byte. `DOUBLE_QUOTE_STRING_END_TABLE` is a `SafeByteMatchTable`. - unsafe { handle_string_literal!(self, b'"', DOUBLE_QUOTE_STRING_END_TABLE) } - } + // SAFETY: Caller guarantees next char is `"`, which is ASCII. + // b'"' is an ASCII byte. `DOUBLE_QUOTE_STRING_END_TABLE` is a `SafeByteMatchTable`. + unsafe { handle_string_literal!(self, b'"', DOUBLE_QUOTE_STRING_END_TABLE) } } /// Read string literal delimited with `'`. /// # SAFETY /// Next character must be `'`. pub(super) unsafe fn read_string_literal_single_quote(&mut self) -> Kind { - if self.context == LexerContext::JsxAttributeValue { - // SAFETY: Caller guarantees next char is `'` - self.source.next_byte_unchecked(); - self.read_jsx_string_literal('\'') - } else { - // SAFETY: Caller guarantees next char is `"`, which is ASCII. - // b'\'' is an ASCII byte. `SINGLE_QUOTE_STRING_END_TABLE` is a `SafeByteMatchTable`. - unsafe { handle_string_literal!(self, b'\'', SINGLE_QUOTE_STRING_END_TABLE) } - } + // SAFETY: Caller guarantees next char is `'`, which is ASCII. + // b'\'' is an ASCII byte. `SINGLE_QUOTE_STRING_END_TABLE` is a `SafeByteMatchTable`. + unsafe { handle_string_literal!(self, b'\'', SINGLE_QUOTE_STRING_END_TABLE) } } /// Save the string if it is escaped