mirror of
https://github.com/danbulant/oxc
synced 2026-05-25 04:42:10 +00:00
perf(parser): optimize lexing strings (#2366)
Optimize lexing strings a bit.
This commit is contained in:
parent
d6d921ea1f
commit
0be8397c77
2 changed files with 124 additions and 97 deletions
|
|
@ -394,6 +394,7 @@ macro_rules! byte_search {
|
||||||
$table.use_table();
|
$table.use_table();
|
||||||
|
|
||||||
let mut pos = $start;
|
let mut pos = $start;
|
||||||
|
#[allow(unused_unsafe)] // Silence warnings if macro called in unsafe code
|
||||||
loop {
|
loop {
|
||||||
if pos.addr() <= $lexer.source.end_for_batch_search_addr() {
|
if pos.addr() <= $lexer.source.end_for_batch_search_addr() {
|
||||||
// Search a batch of `SEARCH_BATCH_SIZE` bytes.
|
// Search a batch of `SEARCH_BATCH_SIZE` bytes.
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
use super::{
|
use super::{
|
||||||
|
cold_branch,
|
||||||
search::{byte_search, safe_byte_match_table, SafeByteMatchTable},
|
search::{byte_search, safe_byte_match_table, SafeByteMatchTable},
|
||||||
source::SourcePosition,
|
|
||||||
Kind, Lexer, LexerContext, Span, Token,
|
Kind, Lexer, LexerContext, Span, Token,
|
||||||
};
|
};
|
||||||
use crate::diagnostics;
|
use crate::diagnostics;
|
||||||
|
|
@ -16,86 +16,69 @@ static DOUBLE_QUOTE_STRING_END_TABLE: SafeByteMatchTable =
|
||||||
static SINGLE_QUOTE_STRING_END_TABLE: SafeByteMatchTable =
|
static SINGLE_QUOTE_STRING_END_TABLE: SafeByteMatchTable =
|
||||||
safe_byte_match_table!(|b| matches!(b, b'\'' | b'\r' | b'\n' | b'\\'));
|
safe_byte_match_table!(|b| matches!(b, b'\'' | b'\r' | b'\n' | b'\\'));
|
||||||
|
|
||||||
impl<'a> Lexer<'a> {
|
/// Macro to handle a string literal.
|
||||||
/// 12.9.4 String Literals
|
///
|
||||||
|
/// # SAFETY
|
||||||
/// Read string literal delimited with `"`.
|
/// `$delimiter` must be an ASCII byte.
|
||||||
/// # SAFETY
|
/// Next char in `lexer.source` must be ASCII.
|
||||||
/// Next character must be `"`.
|
/// `$table` must be a `SafeByteMatchTable`.
|
||||||
pub(super) unsafe fn read_string_literal_double_quote(&mut self) -> Kind {
|
/// `$table` must only match `$delimiter`, '\', '\r' or '\n'.
|
||||||
if self.context == LexerContext::JsxAttributeValue {
|
macro_rules! handle_string_literal {
|
||||||
self.consume_char();
|
($lexer:ident, $delimiter:expr, $table:ident) => {{
|
||||||
self.read_jsx_string_literal('"')
|
|
||||||
} else {
|
|
||||||
// SAFETY: `DOUBLE_QUOTE_STRING_END_TABLE` matches all non-ASCII bytes
|
|
||||||
self.read_string_literal(b'"', &DOUBLE_QUOTE_STRING_END_TABLE)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Read string literal delimited with `'`.
|
|
||||||
/// # SAFETY
|
|
||||||
/// Next character must be `'`.
|
|
||||||
pub(super) unsafe fn read_string_literal_single_quote(&mut self) -> Kind {
|
|
||||||
if self.context == LexerContext::JsxAttributeValue {
|
|
||||||
self.consume_char();
|
|
||||||
self.read_jsx_string_literal('\'')
|
|
||||||
} else {
|
|
||||||
// SAFETY: `SINGLE_QUOTE_STRING_END_TABLE` matches all non-ASCII bytes
|
|
||||||
self.read_string_literal(b'\'', &SINGLE_QUOTE_STRING_END_TABLE)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Read string literal.
|
|
||||||
/// # SAFETY
|
|
||||||
/// Next byte must be ASCII.
|
|
||||||
unsafe fn read_string_literal(&mut self, delimiter: u8, table: &SafeByteMatchTable) -> Kind {
|
|
||||||
// Skip opening quote.
|
// Skip opening quote.
|
||||||
// SAFETY: Caller guarantees next byte is ASCII, so safe to advance past it.
|
// SAFETY: Caller guarantees next byte is ASCII, so safe to advance past it.
|
||||||
let after_opening_quote = unsafe { self.source.position().add(1) };
|
let after_opening_quote = $lexer.source.position().add(1);
|
||||||
|
|
||||||
// Consume bytes which are part of string
|
// Consume bytes which are part of string
|
||||||
byte_search! {
|
byte_search! {
|
||||||
lexer: self,
|
lexer: $lexer,
|
||||||
table: table,
|
table: $table,
|
||||||
start: after_opening_quote,
|
start: after_opening_quote,
|
||||||
handle_match: |next_byte| {
|
handle_match: |next_byte| {
|
||||||
// Found a matching byte.
|
// Found a matching byte.
|
||||||
// Either end of string found, or a line break, or `\` escape.
|
// Either end of string found, or a line break, or `\` escape.
|
||||||
if next_byte == delimiter {
|
match next_byte {
|
||||||
self.consume_char();
|
$delimiter => {
|
||||||
return Kind::Str;
|
// SAFETY: `handle_match` is only called if there's a byte to consume,
|
||||||
|
// and `next_byte` is the next byte in `lexer.source`.
|
||||||
|
// Macro user guarantees delimiter is ASCII, so consuming it cannot move
|
||||||
|
// `lexer.source` off a UTF-8 character boundary.
|
||||||
|
$lexer.source.next_byte_unchecked();
|
||||||
|
Kind::Str
|
||||||
|
},
|
||||||
|
b'\\' => {
|
||||||
|
cold_branch(|| {
|
||||||
|
handle_string_literal_escape!($lexer, $delimiter, $table, after_opening_quote)
|
||||||
|
})
|
||||||
|
},
|
||||||
|
b'\r' | b'\n' => {
|
||||||
|
// This is impossible in valid JS, so cold path
|
||||||
|
cold_branch(|| {
|
||||||
|
$lexer.consume_char();
|
||||||
|
$lexer.error(diagnostics::UnterminatedString($lexer.unterminated_range()));
|
||||||
|
Kind::Undetermined
|
||||||
|
})
|
||||||
|
},
|
||||||
|
// SAFETY: Macro user guarantees `$table` does not match any other bytes
|
||||||
|
_ => assert_unchecked::unreachable_unchecked!()
|
||||||
}
|
}
|
||||||
|
|
||||||
if next_byte == b'\\' {
|
|
||||||
return self.string_literal_on_escape(delimiter, table, after_opening_quote);
|
|
||||||
}
|
|
||||||
|
|
||||||
debug_assert!(matches!(next_byte, b'\r' | b'\n'));
|
|
||||||
self.consume_char();
|
|
||||||
self.error(diagnostics::UnterminatedString(self.unterminated_range()));
|
|
||||||
Kind::Undetermined
|
|
||||||
},
|
},
|
||||||
handle_eof: || {
|
handle_eof: || {
|
||||||
self.error(diagnostics::UnterminatedString(self.unterminated_range()));
|
$lexer.error(diagnostics::UnterminatedString($lexer.unterminated_range()));
|
||||||
Kind::Undetermined
|
Kind::Undetermined
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
}
|
}};
|
||||||
|
}
|
||||||
|
|
||||||
/// Process string literal when `\` escape found.
|
macro_rules! handle_string_literal_escape {
|
||||||
#[cold]
|
($lexer:ident, $delimiter:expr, $table:ident, $after_opening_quote:ident) => {{
|
||||||
fn string_literal_on_escape(
|
|
||||||
&mut self,
|
|
||||||
delimiter: u8,
|
|
||||||
table: &SafeByteMatchTable,
|
|
||||||
after_opening_quote: SourcePosition,
|
|
||||||
) -> Kind {
|
|
||||||
// Create arena string to hold unescaped string.
|
// Create arena string to hold unescaped string.
|
||||||
// We don't know how long string will end up being. Take a guess that total length
|
// We don't know how long string will end up being. Take a guess that total length
|
||||||
// will be double what we've seen so far, or `MIN_ESCAPED_STR_LEN` minimum.
|
// will be double what we've seen so far, or `MIN_ESCAPED_STR_LEN` minimum.
|
||||||
let so_far = self.source.str_from_pos_to_current(after_opening_quote);
|
let so_far = $lexer.source.str_from_pos_to_current($after_opening_quote);
|
||||||
let capacity = max(so_far.len() * 2, MIN_ESCAPED_STR_LEN);
|
let capacity = max(so_far.len() * 2, MIN_ESCAPED_STR_LEN);
|
||||||
let mut str = String::with_capacity_in(capacity, self.allocator);
|
let mut str = String::with_capacity_in(capacity, $lexer.allocator);
|
||||||
|
|
||||||
// Push chunk before `\` into `str`.
|
// Push chunk before `\` into `str`.
|
||||||
// `bumpalo::collections::string::String::push_str` is currently expensive due to
|
// `bumpalo::collections::string::String::push_str` is currently expensive due to
|
||||||
|
|
@ -104,60 +87,103 @@ impl<'a> Lexer<'a> {
|
||||||
|
|
||||||
'outer: loop {
|
'outer: loop {
|
||||||
// Consume `\`
|
// Consume `\`
|
||||||
let escape_start_offset = self.offset();
|
let escape_start_offset = $lexer.offset();
|
||||||
self.consume_char();
|
$lexer.consume_char();
|
||||||
|
|
||||||
// Consume escape sequence and add char to `str`
|
// Consume escape sequence and add char to `str`
|
||||||
let mut is_valid_escape_sequence = true;
|
let mut is_valid_escape_sequence = true;
|
||||||
self.read_string_escape_sequence(&mut str, false, &mut is_valid_escape_sequence);
|
$lexer.read_string_escape_sequence(&mut str, false, &mut is_valid_escape_sequence);
|
||||||
if !is_valid_escape_sequence {
|
if !is_valid_escape_sequence {
|
||||||
let range = Span::new(escape_start_offset, self.offset());
|
let range = Span::new(escape_start_offset, $lexer.offset());
|
||||||
self.error(diagnostics::InvalidEscapeSequence(range));
|
$lexer.error(diagnostics::InvalidEscapeSequence(range));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Consume bytes until reach end of string, line break, or another escape
|
// Consume bytes until reach end of string, line break, or another escape
|
||||||
let chunk_start = self.source.position();
|
let chunk_start = $lexer.source.position();
|
||||||
while let Some(b) = self.source.peek_byte() {
|
while let Some(b) = $lexer.source.peek_byte() {
|
||||||
if !table.matches(b) {
|
match b {
|
||||||
// SAFETY: A byte is available, as we just peeked it.
|
b if !$table.matches(b) => {
|
||||||
// This may put `source`'s position on a UTF-8 continuation byte, which violates
|
// SAFETY: A byte is available, as we just peeked it.
|
||||||
// `Source`'s invariant temporarily, but the guarantees of `SafeByteMatchTable`
|
// This may put `source`'s position on a UTF-8 continuation byte, which violates
|
||||||
// mean `table.matches(b)` will always return `true` in a pattern where
|
// `Source`'s invariant temporarily, but the guarantees of `SafeByteMatchTable`
|
||||||
// we can't exit this loop without `source` being positioned on a UTF-8 character
|
// mean `!table.matches(b)` on this branch prevents exiting this loop until
|
||||||
// boundary again.
|
// `source` is positioned on a UTF-8 character boundary again.
|
||||||
unsafe { self.source.next_byte_unchecked() };
|
unsafe { $lexer.source.next_byte_unchecked() };
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
b if b == $delimiter => {
|
||||||
|
// End of string found. Push last chunk to `str`.
|
||||||
|
let chunk = $lexer.source.str_from_pos_to_current(chunk_start);
|
||||||
|
str.push_str(chunk);
|
||||||
|
|
||||||
if b == delimiter {
|
// Consume closing quote.
|
||||||
// End of string found. Push last chunk to `str`, and consume closing quote.
|
// SAFETY: Caller guarantees delimiter is ASCII, so consuming it cannot move
|
||||||
let chunk = self.source.str_from_pos_to_current(chunk_start);
|
// `lexer.source` off a UTF-8 character boundary
|
||||||
str.push_str(chunk);
|
$lexer.source.next_byte_unchecked();
|
||||||
self.consume_char();
|
break 'outer;
|
||||||
break 'outer;
|
}
|
||||||
|
b'\\' => {
|
||||||
|
// Another escape found. Push last chunk to `str`, and loop back to handle escape.
|
||||||
|
let chunk = $lexer.source.str_from_pos_to_current(chunk_start);
|
||||||
|
str.push_str(chunk);
|
||||||
|
continue 'outer;
|
||||||
|
}
|
||||||
|
b'\r' | b'\n' => {
|
||||||
|
// This is impossible in valid JS, so cold path
|
||||||
|
return cold_branch(|| {
|
||||||
|
$lexer.consume_char();
|
||||||
|
$lexer.error(diagnostics::UnterminatedString($lexer.unterminated_range()));
|
||||||
|
Kind::Undetermined
|
||||||
|
});
|
||||||
|
}
|
||||||
|
// SAFETY: Caller guarantees `table` does not match any other bytes
|
||||||
|
_ => assert_unchecked::unreachable_unchecked!(),
|
||||||
}
|
}
|
||||||
|
|
||||||
if b == b'\\' {
|
|
||||||
// Another escape found. Push last chunk to `str`, and loop back to handle escape.
|
|
||||||
let chunk = self.source.str_from_pos_to_current(chunk_start);
|
|
||||||
str.push_str(chunk);
|
|
||||||
continue 'outer;
|
|
||||||
}
|
|
||||||
|
|
||||||
debug_assert!(matches!(b, b'\r' | b'\n'));
|
|
||||||
self.consume_char();
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// EOF
|
// EOF
|
||||||
self.error(diagnostics::UnterminatedString(self.unterminated_range()));
|
$lexer.error(diagnostics::UnterminatedString($lexer.unterminated_range()));
|
||||||
return Kind::Undetermined;
|
return Kind::Undetermined;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Convert `str` to arena slice and save to `escaped_strings`
|
// Convert `str` to arena slice and save to `escaped_strings`
|
||||||
self.save_string(true, str.into_bump_str());
|
$lexer.save_string(true, str.into_bump_str());
|
||||||
|
|
||||||
Kind::Str
|
Kind::Str
|
||||||
|
}}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> Lexer<'a> {
|
||||||
|
/// 12.9.4 String Literals
|
||||||
|
|
||||||
|
/// Read string literal delimited with `"`.
|
||||||
|
/// # SAFETY
|
||||||
|
/// Next character must be `"`.
|
||||||
|
pub(super) unsafe fn read_string_literal_double_quote(&mut self) -> Kind {
|
||||||
|
if self.context == LexerContext::JsxAttributeValue {
|
||||||
|
// SAFETY: Caller guarantees next char is `"`
|
||||||
|
self.source.next_byte_unchecked();
|
||||||
|
self.read_jsx_string_literal('"')
|
||||||
|
} else {
|
||||||
|
// SAFETY: Caller guarantees next char is `"`, which is ASCII.
|
||||||
|
// b'"' is an ASCII byte. `DOUBLE_QUOTE_STRING_END_TABLE` is a `SafeByteMatchTable`.
|
||||||
|
unsafe { handle_string_literal!(self, b'"', DOUBLE_QUOTE_STRING_END_TABLE) }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Read string literal delimited with `'`.
|
||||||
|
/// # SAFETY
|
||||||
|
/// Next character must be `'`.
|
||||||
|
pub(super) unsafe fn read_string_literal_single_quote(&mut self) -> Kind {
|
||||||
|
if self.context == LexerContext::JsxAttributeValue {
|
||||||
|
// SAFETY: Caller guarantees next char is `'`
|
||||||
|
self.source.next_byte_unchecked();
|
||||||
|
self.read_jsx_string_literal('\'')
|
||||||
|
} else {
|
||||||
|
// SAFETY: Caller guarantees next char is `'`, which is ASCII.
|
||||||
|
// b'\'' is an ASCII byte. `SINGLE_QUOTE_STRING_END_TABLE` is a `SafeByteMatchTable`.
|
||||||
|
unsafe { handle_string_literal!(self, b'\'', SINGLE_QUOTE_STRING_END_TABLE) }
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Save the string if it is escaped
|
/// Save the string if it is escaped
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue