perf(parser): lex JSX strings with memchr (#2528)

Simplify lexing JSX string attributes. The search is purely for a single
byte value (the closing quote), so it doesn't require a byte table; use
`memchr` instead.

This change doesn't really register on benchmarks, but it's one step
closer to removing `AutoCow` and transitioning all the searches in the
lexer to operate byte-by-byte.
This commit is contained in:
overlookmotel 2024-02-28 06:39:23 +00:00 committed by GitHub
parent f760108094
commit 24ded3cb15
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 38 additions and 38 deletions

View file

@ -1,6 +1,7 @@
use super::{AutoCow, Kind, Lexer, Token}; use super::{Kind, Lexer, Token};
use crate::diagnostics; use crate::diagnostics;
use memchr::memchr;
use oxc_syntax::identifier::{is_identifier_part, is_identifier_start}; use oxc_syntax::identifier::{is_identifier_part, is_identifier_start};
impl<'a> Lexer<'a> { impl<'a> Lexer<'a> {
@ -14,25 +15,29 @@ impl<'a> Lexer<'a> {
/// `JSXStringCharacter` but not ' /// `JSXStringCharacter` but not '
/// `JSXStringCharacter` :: /// `JSXStringCharacter` ::
/// `SourceCharacter` but not one of `HTMLCharacterReference` /// `SourceCharacter` but not one of `HTMLCharacterReference`
pub(super) fn read_jsx_string_literal(&mut self, delimiter: char) -> Kind {
let mut builder = AutoCow::new(self); /// Read JSX string literal.
loop { /// # SAFETY
match self.next_char() { /// * `delimiter` must be an ASCII character.
Some(c @ ('"' | '\'')) => { /// * Next char in `lexer.source` must be ASCII.
if c == delimiter { pub(super) unsafe fn read_jsx_string_literal(&mut self, delimiter: u8) -> Kind {
self.save_string(builder.has_escape(), builder.finish_without_push(self)); // Skip opening quote
return Kind::Str; debug_assert!(delimiter.is_ascii());
} // SAFETY: Caller guarantees next byte is ASCII, so `.add(1)` is a UTF-8 char boundary
builder.push_matching(c); let after_opening_quote = self.source.position().add(1);
} let remaining = self.source.str_from_pos_to_end(after_opening_quote);
Some(other) => {
builder.push_matching(other); let len = memchr(delimiter, remaining.as_bytes());
} if let Some(len) = len {
None => { // SAFETY: `after_opening_quote` + `len` is position of delimiter.
self.error(diagnostics::UnterminatedString(self.unterminated_range())); // Caller guarantees delimiter is ASCII, so 1 byte after it is a UTF-8 char boundary.
return Kind::Undetermined; let after_closing_quote = after_opening_quote.add(len + 1);
} self.source.set_position(after_closing_quote);
} Kind::Str
} else {
self.source.advance_to_end();
self.error(diagnostics::UnterminatedString(self.unterminated_range()));
Kind::Undetermined
} }
} }

View file

@ -25,6 +25,13 @@ static SINGLE_QUOTE_STRING_END_TABLE: SafeByteMatchTable =
/// `$table` must only match `$delimiter`, '\', '\r' or '\n'. /// `$table` must only match `$delimiter`, '\', '\r' or '\n'.
macro_rules! handle_string_literal { macro_rules! handle_string_literal {
($lexer:ident, $delimiter:expr, $table:ident) => {{ ($lexer:ident, $delimiter:expr, $table:ident) => {{
debug_assert!($delimiter.is_ascii());
if $lexer.context == LexerContext::JsxAttributeValue {
// SAFETY: Caller guarantees `$delimiter` is ASCII, and next char is ASCII
return $lexer.read_jsx_string_literal($delimiter);
}
// Skip opening quote. // Skip opening quote.
// SAFETY: Caller guarantees next byte is ASCII, so safe to advance past it. // SAFETY: Caller guarantees next byte is ASCII, so safe to advance past it.
let after_opening_quote = $lexer.source.position().add(1); let after_opening_quote = $lexer.source.position().add(1);
@ -157,30 +164,18 @@ impl<'a> Lexer<'a> {
/// # SAFETY /// # SAFETY
/// Next character must be `"`. /// Next character must be `"`.
pub(super) unsafe fn read_string_literal_double_quote(&mut self) -> Kind { pub(super) unsafe fn read_string_literal_double_quote(&mut self) -> Kind {
if self.context == LexerContext::JsxAttributeValue { // SAFETY: Caller guarantees next char is `"`, which is ASCII.
// SAFETY: Caller guarantees next char is `"` // b'"' is an ASCII byte. `DOUBLE_QUOTE_STRING_END_TABLE` is a `SafeByteMatchTable`.
self.source.next_byte_unchecked(); unsafe { handle_string_literal!(self, b'"', DOUBLE_QUOTE_STRING_END_TABLE) }
self.read_jsx_string_literal('"')
} else {
// SAFETY: Caller guarantees next char is `"`, which is ASCII.
// b'"' is an ASCII byte. `DOUBLE_QUOTE_STRING_END_TABLE` is a `SafeByteMatchTable`.
unsafe { handle_string_literal!(self, b'"', DOUBLE_QUOTE_STRING_END_TABLE) }
}
} }
/// Read string literal delimited with `'`. /// Read string literal delimited with `'`.
/// # SAFETY /// # SAFETY
/// Next character must be `'`. /// Next character must be `'`.
pub(super) unsafe fn read_string_literal_single_quote(&mut self) -> Kind { pub(super) unsafe fn read_string_literal_single_quote(&mut self) -> Kind {
if self.context == LexerContext::JsxAttributeValue { // SAFETY: Caller guarantees next char is `'`, which is ASCII.
// SAFETY: Caller guarantees next char is `'` // b'\'' is an ASCII byte. `SINGLE_QUOTE_STRING_END_TABLE` is a `SafeByteMatchTable`.
self.source.next_byte_unchecked(); unsafe { handle_string_literal!(self, b'\'', SINGLE_QUOTE_STRING_END_TABLE) }
self.read_jsx_string_literal('\'')
} else {
// SAFETY: Caller guarantees next char is `"`, which is ASCII.
// b'\'' is an ASCII byte. `SINGLE_QUOTE_STRING_END_TABLE` is a `SafeByteMatchTable`.
unsafe { handle_string_literal!(self, b'\'', SINGLE_QUOTE_STRING_END_TABLE) }
}
} }
/// Save the string if it is escaped /// Save the string if it is escaped