mirror of
https://github.com/danbulant/oxc
synced 2026-05-24 20:32:10 +00:00
perf(parser): faster lexing template strings (#2541)
Speed up lexing template strings. This was the last use of `AutoCow` remaining in the lexer, and it's now removed. Implementation is quite complex, to avoid repeatedly branching on whether an unescaped string is required or not (the way `AutoCow` did). I tried to simplify it down to a single function, but this hurt performance significantly. Benchmarks do not show much movement, but I believe that's because there aren't many template strings in the benchmarks. Where there are template strings, I believe this speeds up lexing them significantly.
This commit is contained in:
parent
9d7ea6b3f0
commit
5a13714a18
4 changed files with 316 additions and 121 deletions
|
|
@ -19,7 +19,6 @@ mod regex;
|
|||
mod search;
|
||||
mod source;
|
||||
mod string;
|
||||
mod string_builder;
|
||||
mod template;
|
||||
mod token;
|
||||
mod trivia_builder;
|
||||
|
|
@ -38,7 +37,6 @@ use oxc_span::{SourceType, Span};
|
|||
use self::{
|
||||
byte_handlers::handle_byte,
|
||||
source::{Source, SourcePosition},
|
||||
string_builder::AutoCow,
|
||||
trivia_builder::TriviaBuilder,
|
||||
};
|
||||
pub use self::{
|
||||
|
|
|
|||
|
|
@ -218,6 +218,20 @@ impl<'a> Source<'a> {
|
|||
self.str_between_positions_unchecked(pos, SourcePosition::new(self.ptr))
|
||||
}
|
||||
|
||||
/// Get string slice from current position of `Source` up to a `SourcePosition`, without checks.
|
||||
///
|
||||
/// # SAFETY
|
||||
/// `pos` must not be before current position of `Source`.
|
||||
/// This is always the case if both:
|
||||
/// 1. `Source::set_position` has not been called since `pos` was created.
|
||||
/// 2. `pos` has not been moved backwards with `SourcePosition::sub`.
|
||||
#[inline]
|
||||
pub(super) unsafe fn str_from_current_to_pos_unchecked(&self, pos: SourcePosition) -> &'a str {
|
||||
// SAFETY: Caller guarantees `pos` is not before current position of `Source`.
|
||||
// `self.ptr` is always a valid `SourcePosition` due to invariants of `Source`.
|
||||
self.str_between_positions_unchecked(SourcePosition::new(self.ptr), pos)
|
||||
}
|
||||
|
||||
/// Get string slice from a `SourcePosition` up to the end of `Source`.
|
||||
#[inline]
|
||||
pub(super) fn str_from_pos_to_end(&self, pos: SourcePosition) -> &'a str {
|
||||
|
|
|
|||
|
|
@ -1,74 +0,0 @@
|
|||
// Copied from https://github.com/mozilla-spidermonkey/jsparagus/blob/master/crates/parser/src/lexer.rs#L2256
|
||||
|
||||
use oxc_allocator::String;
|
||||
|
||||
use crate::lexer::Lexer;
|
||||
|
||||
pub struct AutoCow<'a> {
|
||||
pub start: &'a str,
|
||||
pub value: Option<String<'a>>,
|
||||
}
|
||||
|
||||
impl<'a> AutoCow<'a> {
|
||||
pub fn new(lexer: &Lexer<'a>) -> Self {
|
||||
let start = lexer.remaining();
|
||||
AutoCow { start, value: None }
|
||||
}
|
||||
|
||||
// Push a char that matches `lexer.next_char()`.
|
||||
pub fn push_matching(&mut self, c: char) {
|
||||
if let Some(text) = &mut self.value {
|
||||
text.push(c);
|
||||
}
|
||||
}
|
||||
|
||||
// Push a different character than `lexer.next_char()`.
|
||||
// force_allocation_without_current_ascii_char must be called before this.
|
||||
pub fn push_different(&mut self, c: char) {
|
||||
debug_assert!(self.value.is_some());
|
||||
self.value.as_mut().unwrap().push(c);
|
||||
}
|
||||
|
||||
// Force allocation of a String, excluding the current ASCII character,
|
||||
// and return the reference to it
|
||||
pub fn get_mut_string_without_current_ascii_char<'b>(
|
||||
&'b mut self,
|
||||
lexer: &Lexer<'a>,
|
||||
) -> &'b mut String<'a> {
|
||||
self.force_allocation_without_current_ascii_char(lexer);
|
||||
self.value.as_mut().unwrap()
|
||||
}
|
||||
|
||||
// Force allocation of a String, excluding the current ASCII character.
|
||||
pub fn force_allocation_without_current_ascii_char(&mut self, lexer: &Lexer<'a>) {
|
||||
if self.value.is_some() {
|
||||
return;
|
||||
}
|
||||
self.value = Some(String::from_str_in(
|
||||
&self.start[..self.start.len() - lexer.remaining().len() - 1],
|
||||
lexer.allocator,
|
||||
));
|
||||
}
|
||||
|
||||
// Check if the string contains a different character, such as an escape sequence
|
||||
pub fn has_escape(&self) -> bool {
|
||||
self.value.is_some()
|
||||
}
|
||||
|
||||
// TODO: Delete this if not using it
|
||||
#[allow(dead_code)]
|
||||
pub fn finish(mut self, lexer: &Lexer<'a>) -> &'a str {
|
||||
match self.value.take() {
|
||||
Some(s) => s.into_bump_str(),
|
||||
None => &self.start[..self.start.len() - lexer.remaining().len()],
|
||||
}
|
||||
}
|
||||
|
||||
// Just like finish, but without pushing current char.
|
||||
pub fn finish_without_push(mut self, lexer: &Lexer<'a>) -> &'a str {
|
||||
match self.value.take() {
|
||||
Some(s) => s.into_bump_str(),
|
||||
None => &self.start[..self.start.len() - lexer.remaining().len() - 1],
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,47 +1,312 @@
|
|||
use super::{AutoCow, Kind, Lexer, Token};
|
||||
use super::{
|
||||
cold_branch,
|
||||
search::{byte_search, safe_byte_match_table, SafeByteMatchTable},
|
||||
Kind, Lexer, SourcePosition, Token,
|
||||
};
|
||||
use crate::diagnostics;
|
||||
|
||||
use oxc_syntax::identifier::{CR, LF};
|
||||
use std::cmp::max;
|
||||
|
||||
use oxc_allocator::String;
|
||||
|
||||
const MIN_ESCAPED_TEMPLATE_LIT_LEN: usize = 16;
|
||||
|
||||
static TEMPLATE_LITERAL_TABLE: SafeByteMatchTable =
|
||||
safe_byte_match_table!(|b| matches!(b, b'$' | b'`' | b'\r' | b'\\'));
|
||||
|
||||
impl<'a> Lexer<'a> {
|
||||
/// 12.8.6 Template Literal Lexical Components
|
||||
|
||||
/// Read template literal component.
|
||||
///
|
||||
/// This function handles the common case where template contains no escapes or `\r` characters
|
||||
/// and so does not require saving to `lexer.escaped_templates`.
|
||||
/// If an escape or `\r` is found, control is passed to `template_literal_escaped` which builds
|
||||
/// the unescaped string. This division keeps the path for common case as fast as possible.
|
||||
pub(super) fn read_template_literal(&mut self, substitute: Kind, tail: Kind) -> Kind {
|
||||
let mut builder = AutoCow::new(self);
|
||||
let mut is_valid_escape_sequence = true;
|
||||
while let Some(c) = self.next_char() {
|
||||
match c {
|
||||
'$' if self.peek() == Some('{') => {
|
||||
self.save_template_string(
|
||||
is_valid_escape_sequence,
|
||||
builder.has_escape(),
|
||||
builder.finish_without_push(self),
|
||||
);
|
||||
self.consume_char();
|
||||
return substitute;
|
||||
}
|
||||
'`' => {
|
||||
self.save_template_string(
|
||||
is_valid_escape_sequence,
|
||||
builder.has_escape(),
|
||||
builder.finish_without_push(self),
|
||||
);
|
||||
return tail;
|
||||
}
|
||||
CR => {
|
||||
builder.force_allocation_without_current_ascii_char(self);
|
||||
if self.next_eq(LF) {
|
||||
builder.push_different(LF);
|
||||
let mut ret = substitute;
|
||||
|
||||
byte_search! {
|
||||
lexer: self,
|
||||
table: TEMPLATE_LITERAL_TABLE,
|
||||
continue_if: |next_byte, pos| {
|
||||
match next_byte {
|
||||
b'$' => {
|
||||
// SAFETY: Next byte is `$` which is ASCII, so after it is a UTF-8 char boundary
|
||||
let after_dollar = unsafe { pos.add(1) };
|
||||
if after_dollar.addr() < self.source.end_addr() {
|
||||
// If `${`, exit.
|
||||
// SAFETY: Have checked there's at least 1 further byte to read.
|
||||
if unsafe { after_dollar.read() } == b'{' {
|
||||
// Skip `${` and stop searching.
|
||||
// SAFETY: Consuming `${` leaves `pos` on a UTF-8 char boundary.
|
||||
pos = unsafe { after_dollar.add(1) };
|
||||
false
|
||||
} else {
|
||||
// Not `${`. Continue searching.
|
||||
true
|
||||
}
|
||||
} else {
|
||||
// This is last byte in file. Continue to `handle_eof`.
|
||||
// This is illegal in valid JS, so mark this branch cold.
|
||||
cold_branch(|| true)
|
||||
}
|
||||
},
|
||||
b'`' => {
|
||||
// Skip '`' and stop searching.
|
||||
// SAFETY: Char at `pos` is '`', so `pos + 1` is a UTF-8 char boundary.
|
||||
pos = unsafe { pos.add(1) };
|
||||
ret = tail;
|
||||
false
|
||||
},
|
||||
b'\r' => {
|
||||
// SAFETY: Byte at `pos` is `\r`.
|
||||
// `pos` has only been advanced relative to `self.source.position()`.
|
||||
return unsafe { self.template_literal_carriage_return(pos, substitute, tail) };
|
||||
}
|
||||
_ => {
|
||||
// `TEMPLATE_LITERAL_TABLE` only matches `$`, '`', `\r` and `\`
|
||||
debug_assert!(next_byte == b'\\');
|
||||
// SAFETY: Byte at `pos` is `\`.
|
||||
// `pos` has only been advanced relative to `self.source.position()`.
|
||||
return unsafe { self.template_literal_backslash(pos, substitute, tail) };
|
||||
}
|
||||
}
|
||||
'\\' => {
|
||||
let text = builder.get_mut_string_without_current_ascii_char(self);
|
||||
self.read_string_escape_sequence(text, true, &mut is_valid_escape_sequence);
|
||||
}
|
||||
_ => builder.push_matching(c),
|
||||
}
|
||||
},
|
||||
handle_match: |_next_byte, _start| {
|
||||
ret
|
||||
},
|
||||
handle_eof: |_start| {
|
||||
self.error(diagnostics::UnterminatedString(self.unterminated_range()));
|
||||
Kind::Undetermined
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
/// Consume rest of template literal after a `\r` is found.
|
||||
///
|
||||
/// # SAFETY
|
||||
/// * Byte at `pos` must be `\r`.
|
||||
/// * `pos` must not be before `self.source.position()`.
|
||||
unsafe fn template_literal_carriage_return(
|
||||
&mut self,
|
||||
mut pos: SourcePosition<'a>,
|
||||
substitute: Kind,
|
||||
tail: Kind,
|
||||
) -> Kind {
|
||||
// Create arena string to hold modified template literal, containing up to before `\r`.
|
||||
// SAFETY: Caller guarantees `pos` is not before `self.source.position()`.
|
||||
let str = self.template_literal_create_string(pos);
|
||||
|
||||
// Skip `\r`.
|
||||
// SAFETY: Caller guarantees byte at `pos` is `\r`, so `pos + 1` is a UTF-8 char boundary.
|
||||
pos = pos.add(1);
|
||||
|
||||
// If at EOF, exit. This illegal in valid JS, so cold branch.
|
||||
if pos.addr() == self.source.end_addr() {
|
||||
return cold_branch(|| {
|
||||
self.source.advance_to_end();
|
||||
self.error(diagnostics::UnterminatedString(self.unterminated_range()));
|
||||
Kind::Undetermined
|
||||
});
|
||||
}
|
||||
self.error(diagnostics::UnterminatedString(self.unterminated_range()));
|
||||
Kind::Undetermined
|
||||
|
||||
// Start next chunk after `\r`
|
||||
let chunk_start = pos;
|
||||
|
||||
// If next char is `\n`, start next search after it.
|
||||
// `\n` is first char of next chunk, so it'll get added to `str` when chunk is pushed.
|
||||
// SAFETY: Have checked not at EOF.
|
||||
if pos.read() == b'\n' {
|
||||
// SAFETY: `\n` is ASCII, so advancing past it leaves `pos` on a UTF-8 char boundary
|
||||
pos = pos.add(1);
|
||||
}
|
||||
|
||||
self.template_literal_escaped(str, pos, chunk_start, true, substitute, tail)
|
||||
}
|
||||
|
||||
/// Consume rest of template literal after a `\` escape is found.
|
||||
///
|
||||
/// # SAFETY
|
||||
/// * Byte at `pos` must be `\`.
|
||||
/// * `pos` must not be before `self.source.position()`.
|
||||
unsafe fn template_literal_backslash(
|
||||
&mut self,
|
||||
pos: SourcePosition<'a>,
|
||||
substitute: Kind,
|
||||
tail: Kind,
|
||||
) -> Kind {
|
||||
// Create arena string to hold modified template literal, containing up to before `\`.
|
||||
// SAFETY: Caller guarantees `pos` is not before `self.source.position()`.
|
||||
let mut str = self.template_literal_create_string(pos);
|
||||
|
||||
// Decode escape sequence into `str`.
|
||||
// `read_string_escape_sequence` expects `self.source` to be positioned after `\`.
|
||||
// SAFETY: Caller guarantees next byte is `\`, which is ASCII, so `pos + 1` is UTF-8 char boundary.
|
||||
let after_backslash = pos.add(1);
|
||||
self.source.set_position(after_backslash);
|
||||
|
||||
let mut is_valid_escape_sequence = true;
|
||||
self.read_string_escape_sequence(&mut str, true, &mut is_valid_escape_sequence);
|
||||
|
||||
// Continue search after escape
|
||||
let after_escape = self.source.position();
|
||||
// SAFETY: `pos` and `chunk_start` are the same
|
||||
self.template_literal_escaped(
|
||||
str,
|
||||
after_escape,
|
||||
after_escape,
|
||||
is_valid_escape_sequence,
|
||||
substitute,
|
||||
tail,
|
||||
)
|
||||
}
|
||||
|
||||
/// Create arena string for modified template literal, containing the template literal up to `pos`.
|
||||
/// # SAFETY
|
||||
/// `pos` must not be before `self.source.position()`
|
||||
unsafe fn template_literal_create_string(&self, pos: SourcePosition) -> String<'a> {
|
||||
// Create arena string to hold modified template literal.
|
||||
// We don't know how long template literal will end up being. Take a guess that total length
|
||||
// will be double what we've seen so far, or `MIN_ESCAPED_TEMPLATE_LIT_LEN` minimum.
|
||||
// SAFETY: Caller guarantees `pos` is not before `self.source.position()`.
|
||||
let so_far = self.source.str_from_current_to_pos_unchecked(pos);
|
||||
let capacity = max(so_far.len() * 2, MIN_ESCAPED_TEMPLATE_LIT_LEN);
|
||||
let mut str = String::with_capacity_in(capacity, self.allocator);
|
||||
str.push_str(so_far);
|
||||
str
|
||||
}
|
||||
|
||||
/// Process template literal after `\n` or `\` found.
|
||||
/// # SAFETY
|
||||
/// `chunk_start` must not be after `pos`.
|
||||
unsafe fn template_literal_escaped(
|
||||
&mut self,
|
||||
mut str: String<'a>,
|
||||
pos: SourcePosition<'a>,
|
||||
mut chunk_start: SourcePosition<'a>,
|
||||
mut is_valid_escape_sequence: bool,
|
||||
substitute: Kind,
|
||||
tail: Kind,
|
||||
) -> Kind {
|
||||
let mut ret = substitute;
|
||||
|
||||
byte_search! {
|
||||
lexer: self,
|
||||
table: TEMPLATE_LITERAL_TABLE,
|
||||
start: pos,
|
||||
continue_if: |next_byte, pos| {
|
||||
if next_byte == b'$' {
|
||||
// SAFETY: Next byte is `$` which is ASCII, so after it is a UTF-8 char boundary
|
||||
let after_dollar = pos.add(1);
|
||||
if after_dollar.addr() < self.source.end_addr() {
|
||||
// If `${`, exit.
|
||||
// SAFETY: Have checked there's at least 1 further byte to read.
|
||||
if after_dollar.read() == b'{' {
|
||||
// Add last chunk to `str`.
|
||||
// SAFETY: Caller guarantees `chunk_start` is not after `pos` at start of
|
||||
// this function. `pos` only increases during searching.
|
||||
// Where `chunk_start` is updated, it's always before or equal to `pos`.
|
||||
// So `chunk_start` cannot be after `pos`.
|
||||
let chunk = self.source.str_between_positions_unchecked(chunk_start, pos);
|
||||
str.push_str(chunk);
|
||||
|
||||
// Skip `${` and stop searching.
|
||||
// SAFETY: Consuming `${` leaves `pos` on a UTF-8 char boundary.
|
||||
pos = after_dollar.add(1);
|
||||
false
|
||||
} else {
|
||||
// Not `${`. Continue searching.
|
||||
true
|
||||
}
|
||||
} else {
|
||||
// This is last byte in file. Continue to `handle_eof`.
|
||||
// This is illegal in valid JS, so mark this branch cold.
|
||||
cold_branch(|| true)
|
||||
}
|
||||
} else {
|
||||
// Next byte is '`', `\r` or `\`. Add chunk up to before this char to `str`.
|
||||
// SAFETY: Caller guarantees `chunk_start` is not after `pos` at start of
|
||||
// this function. `pos` only increases during searching.
|
||||
// Where `chunk_start` is updated, it's always before or equal to `pos`.
|
||||
// So `chunk_start` cannot be after `pos`.
|
||||
let chunk = self.source.str_between_positions_unchecked(chunk_start, pos);
|
||||
str.push_str(chunk);
|
||||
|
||||
match next_byte {
|
||||
b'`' => {
|
||||
// Skip '`' and stop searching.
|
||||
// SAFETY: Byte at `pos` is '`' (ASCII), so `pos + 1` is a UTF-8 char boundary.
|
||||
pos = pos.add(1);
|
||||
ret = tail;
|
||||
false
|
||||
}
|
||||
b'\r' => {
|
||||
// Set next chunk to start after `\r`.
|
||||
// SAFETY: Next byte is `\r` which is ASCII, so after it is a UTF-8 char boundary.
|
||||
// This temporarily puts `chunk_start` 1 byte after `pos`, but `byte_search!` macro
|
||||
// increments `pos` when return `true` from `continue_if`, so `pos` will be
|
||||
// brought up to `chunk_start` again.
|
||||
chunk_start = pos.add(1);
|
||||
|
||||
if chunk_start.addr() < self.source.end_addr() {
|
||||
// If next char is `\n`, start next search after it.
|
||||
// NB: `byte_search!` macro already advances `pos` by 1, so only advance
|
||||
// by 1 here, so that in total we skip 2 bytes for `\r\n`.
|
||||
// No need to push `\n` to `str`, as it's 1st char of next chunk,
|
||||
// and will be added to `str` when next chunk is pushed.
|
||||
if chunk_start.read() == b'\n' {
|
||||
pos = chunk_start;
|
||||
}
|
||||
} else {
|
||||
// This is last byte in file. Continue to `handle_eof`.
|
||||
// This is illegal in valid JS, so mark this branch cold.
|
||||
cold_branch(|| {});
|
||||
}
|
||||
|
||||
// Continue searching
|
||||
true
|
||||
}
|
||||
_ => {
|
||||
// `TEMPLATE_LITERAL_TABLE` only matches `$`, '`', `\r` and `\`
|
||||
debug_assert!(next_byte == b'\\');
|
||||
|
||||
// Decode escape sequence into `str`.
|
||||
// `read_string_escape_sequence` expects `self.source` to be positioned after `\`.
|
||||
// SAFETY: Next byte is `\`, which is ASCII, so `pos + 1` is UTF-8 char boundary.
|
||||
let after_backslash = pos.add(1);
|
||||
self.source.set_position(after_backslash);
|
||||
self.read_string_escape_sequence(&mut str, true, &mut is_valid_escape_sequence);
|
||||
|
||||
// Start next chunk after escape sequence
|
||||
chunk_start = self.source.position();
|
||||
assert!(chunk_start.addr() >= after_backslash.addr());
|
||||
|
||||
// Continue search after escape sequence.
|
||||
// NB: `byte_search!` macro increments `pos` when return `true`,
|
||||
// so need to subtract 1 here to counteract that.
|
||||
// SAFETY: Added 1 to `pos` above, and checked `chunk_start` hasn't moved
|
||||
// backwards from that, so subtracting 1 again is within bounds.
|
||||
pos = chunk_start.sub(1);
|
||||
|
||||
// Continue searching
|
||||
true
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
handle_match: |_next_byte, _start| {
|
||||
self.save_template_string(
|
||||
is_valid_escape_sequence,
|
||||
str.into_bump_str(),
|
||||
);
|
||||
ret
|
||||
},
|
||||
handle_eof: |_start| {
|
||||
self.error(diagnostics::UnterminatedString(self.unterminated_range()));
|
||||
Kind::Undetermined
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
/// Re-tokenize the current `}` token for `TemplateSubstitutionTail`
|
||||
|
|
@ -53,16 +318,8 @@ impl<'a> Lexer<'a> {
|
|||
self.finish_next(kind)
|
||||
}
|
||||
|
||||
/// Save the template if it is escaped
|
||||
fn save_template_string(
|
||||
&mut self,
|
||||
is_valid_escape_sequence: bool,
|
||||
has_escape: bool,
|
||||
s: &'a str,
|
||||
) {
|
||||
if !has_escape {
|
||||
return;
|
||||
}
|
||||
/// Save escaped template string
|
||||
fn save_template_string(&mut self, is_valid_escape_sequence: bool, s: &'a str) {
|
||||
self.escaped_templates.insert(self.token.start, is_valid_escape_sequence.then_some(s));
|
||||
self.token.escaped = true;
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue