perf(parser): faster lexing template strings (#2541)

Speed up lexing template strings.

This was the last use of `AutoCow` remaining in the lexer, and it's now removed.

Implementation is quite complex, to avoid repeatedly branching on whether an unescaped string is required or not (the way `AutoCow` did). I tried to simplify it down to a single function, but this hurt performance significantly.

Benchmarks do not show much movement, but I believe that's because there aren't many template strings in the benchmarks. Where there are template strings, I believe this speeds up lexing them significantly.
This commit is contained in:
overlookmotel 2024-02-29 05:28:30 +00:00 committed by GitHub
parent 9d7ea6b3f0
commit 5a13714a18
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 316 additions and 121 deletions

View file

@ -19,7 +19,6 @@ mod regex;
mod search;
mod source;
mod string;
mod string_builder;
mod template;
mod token;
mod trivia_builder;
@ -38,7 +37,6 @@ use oxc_span::{SourceType, Span};
use self::{
byte_handlers::handle_byte,
source::{Source, SourcePosition},
string_builder::AutoCow,
trivia_builder::TriviaBuilder,
};
pub use self::{

View file

@ -218,6 +218,20 @@ impl<'a> Source<'a> {
self.str_between_positions_unchecked(pos, SourcePosition::new(self.ptr))
}
/// Borrow the source text lying between the current position of `Source` and `pos`, without checks.
///
/// # SAFETY
/// `pos` must be at or after the current position of `Source`.
/// That is always the case if both:
/// 1. `Source::set_position` has not been called since `pos` was created.
/// 2. `pos` has not been moved backwards with `SourcePosition::sub`.
#[inline]
pub(super) unsafe fn str_from_current_to_pos_unchecked(&self, pos: SourcePosition) -> &'a str {
    // `self.ptr` is always a valid `SourcePosition` due to invariants of `Source`.
    let current = SourcePosition::new(self.ptr);
    // SAFETY: Caller guarantees `pos` is not before current position of `Source`.
    self.str_between_positions_unchecked(current, pos)
}
/// Get string slice from a `SourcePosition` up to the end of `Source`.
#[inline]
pub(super) fn str_from_pos_to_end(&self, pos: SourcePosition) -> &'a str {

View file

@ -1,74 +0,0 @@
// Copied from https://github.com/mozilla-spidermonkey/jsparagus/blob/master/crates/parser/src/lexer.rs#L2256
use oxc_allocator::String;
use crate::lexer::Lexer;
/// Lazily-allocating string builder used while lexing.
///
/// As long as `value` is `None`, the text is taken to be a plain slice of `start`.
/// Once an allocation is forced, characters are accumulated in the arena string instead.
pub struct AutoCow<'a> {
    /// Remaining source text of the lexer at the moment the builder was created.
    pub start: &'a str,
    /// Arena string holding the modified text; `None` until an allocation is forced.
    pub value: Option<String<'a>>,
}

impl<'a> AutoCow<'a> {
    /// Start a builder anchored at the lexer's current remaining text. Nothing is allocated.
    pub fn new(lexer: &Lexer<'a>) -> Self {
        Self { start: lexer.remaining(), value: None }
    }

    /// Record a char that matches `lexer.next_char()`.
    /// Does nothing unless a string has already been allocated.
    pub fn push_matching(&mut self, c: char) {
        if let Some(buf) = self.value.as_mut() {
            buf.push(c);
        }
    }

    /// Record a char that differs from `lexer.next_char()`.
    /// `force_allocation_without_current_ascii_char` must have been called beforehand;
    /// panics if no string has been allocated.
    pub fn push_different(&mut self, c: char) {
        debug_assert!(self.value.is_some());
        let buf = self.value.as_mut().unwrap();
        buf.push(c);
    }

    /// Make sure a string is allocated (excluding the current ASCII character)
    /// and hand back a mutable reference to it.
    pub fn get_mut_string_without_current_ascii_char<'b>(
        &'b mut self,
        lexer: &Lexer<'a>,
    ) -> &'b mut String<'a> {
        self.force_allocation_without_current_ascii_char(lexer);
        self.value.as_mut().unwrap()
    }

    /// Allocate the string if that has not happened yet, seeding it with everything consumed
    /// since `start` except the current ASCII character (1 byte).
    pub fn force_allocation_without_current_ascii_char(&mut self, lexer: &Lexer<'a>) {
        if self.value.is_none() {
            // Bytes consumed since `start`, minus the 1-byte character just read.
            let seed_len = self.start.len() - lexer.remaining().len() - 1;
            let seeded = String::from_str_in(&self.start[..seed_len], lexer.allocator);
            self.value = Some(seeded);
        }
    }

    /// Whether the text contains a different character, such as an escape sequence,
    /// i.e. whether a string has been allocated.
    pub fn has_escape(&self) -> bool {
        self.value.is_some()
    }

    /// Finish building: the allocated string if there is one,
    /// otherwise the slice of `start` consumed so far.
    // TODO: Delete this if not using it
    #[allow(dead_code)]
    pub fn finish(self, lexer: &Lexer<'a>) -> &'a str {
        let Self { start, value } = self;
        match value {
            Some(owned) => owned.into_bump_str(),
            None => &start[..start.len() - lexer.remaining().len()],
        }
    }

    /// Just like `finish`, but the borrowed slice leaves out the current char (1 byte).
    pub fn finish_without_push(self, lexer: &Lexer<'a>) -> &'a str {
        let Self { start, value } = self;
        match value {
            Some(owned) => owned.into_bump_str(),
            None => &start[..start.len() - lexer.remaining().len() - 1],
        }
    }
}

View file

@ -1,47 +1,312 @@
use super::{AutoCow, Kind, Lexer, Token};
use super::{
cold_branch,
search::{byte_search, safe_byte_match_table, SafeByteMatchTable},
Kind, Lexer, SourcePosition, Token,
};
use crate::diagnostics;
use oxc_syntax::identifier::{CR, LF};
use std::cmp::max;
use oxc_allocator::String;
/// Minimum capacity reserved for the arena string which holds an unescaped template literal.
const MIN_ESCAPED_TEMPLATE_LIT_LEN: usize = 16;
/// Byte match table used when searching template literals: matches `$`, '`', `\r` and `\`.
static TEMPLATE_LITERAL_TABLE: SafeByteMatchTable =
safe_byte_match_table!(|b| matches!(b, b'$' | b'`' | b'\r' | b'\\'));
impl<'a> Lexer<'a> {
/// 12.8.6 Template Literal Lexical Components
/// Read template literal component.
///
/// This function handles the common case where template contains no escapes or `\r` characters
/// and so does not require saving to `lexer.escaped_templates`.
/// If an escape or `\r` is found, control is passed to `template_literal_escaped` which builds
/// the unescaped string. This division keeps the path for common case as fast as possible.
pub(super) fn read_template_literal(&mut self, substitute: Kind, tail: Kind) -> Kind {
let mut builder = AutoCow::new(self);
let mut is_valid_escape_sequence = true;
while let Some(c) = self.next_char() {
match c {
'$' if self.peek() == Some('{') => {
self.save_template_string(
is_valid_escape_sequence,
builder.has_escape(),
builder.finish_without_push(self),
);
self.consume_char();
return substitute;
}
'`' => {
self.save_template_string(
is_valid_escape_sequence,
builder.has_escape(),
builder.finish_without_push(self),
);
return tail;
}
CR => {
builder.force_allocation_without_current_ascii_char(self);
if self.next_eq(LF) {
builder.push_different(LF);
let mut ret = substitute;
byte_search! {
lexer: self,
table: TEMPLATE_LITERAL_TABLE,
continue_if: |next_byte, pos| {
match next_byte {
b'$' => {
// SAFETY: Next byte is `$` which is ASCII, so after it is a UTF-8 char boundary
let after_dollar = unsafe { pos.add(1) };
if after_dollar.addr() < self.source.end_addr() {
// If `${`, exit.
// SAFETY: Have checked there's at least 1 further byte to read.
if unsafe { after_dollar.read() } == b'{' {
// Skip `${` and stop searching.
// SAFETY: Consuming `${` leaves `pos` on a UTF-8 char boundary.
pos = unsafe { after_dollar.add(1) };
false
} else {
// Not `${`. Continue searching.
true
}
} else {
// This is last byte in file. Continue to `handle_eof`.
// This is illegal in valid JS, so mark this branch cold.
cold_branch(|| true)
}
},
b'`' => {
// Skip '`' and stop searching.
// SAFETY: Char at `pos` is '`', so `pos + 1` is a UTF-8 char boundary.
pos = unsafe { pos.add(1) };
ret = tail;
false
},
b'\r' => {
// SAFETY: Byte at `pos` is `\r`.
// `pos` has only been advanced relative to `self.source.position()`.
return unsafe { self.template_literal_carriage_return(pos, substitute, tail) };
}
_ => {
// `TEMPLATE_LITERAL_TABLE` only matches `$`, '`', `\r` and `\`
debug_assert!(next_byte == b'\\');
// SAFETY: Byte at `pos` is `\`.
// `pos` has only been advanced relative to `self.source.position()`.
return unsafe { self.template_literal_backslash(pos, substitute, tail) };
}
}
'\\' => {
let text = builder.get_mut_string_without_current_ascii_char(self);
self.read_string_escape_sequence(text, true, &mut is_valid_escape_sequence);
}
_ => builder.push_matching(c),
}
},
handle_match: |_next_byte, _start| {
ret
},
handle_eof: |_start| {
self.error(diagnostics::UnterminatedString(self.unterminated_range()));
Kind::Undetermined
},
};
}
/// Consume rest of template literal after a `\r` is found.
///
/// Copies the text seen so far (up to before the `\r`) into a new arena string, skips the `\r`,
/// and hands over to `template_literal_escaped` to process the remainder.
/// If the `\r` is the last byte of the source, an unterminated string error is reported and
/// `Kind::Undetermined` is returned.
///
/// # SAFETY
/// * Byte at `pos` must be `\r`.
/// * `pos` must not be before `self.source.position()`.
unsafe fn template_literal_carriage_return(
    &mut self,
    mut pos: SourcePosition<'a>,
    substitute: Kind,
    tail: Kind,
) -> Kind {
    // Create arena string to hold modified template literal, containing up to before `\r`.
    // SAFETY: Caller guarantees `pos` is not before `self.source.position()`.
    let str = self.template_literal_create_string(pos);

    // Skip `\r`.
    // SAFETY: Caller guarantees byte at `pos` is `\r`, so `pos + 1` is a UTF-8 char boundary.
    pos = pos.add(1);

    // If at EOF, exit. This is illegal in valid JS, so cold branch.
    if pos.addr() == self.source.end_addr() {
        return cold_branch(|| {
            self.source.advance_to_end();
            self.error(diagnostics::UnterminatedString(self.unterminated_range()));
            Kind::Undetermined
        });
    }

    // Start next chunk after `\r`
    let chunk_start = pos;

    // If next char is `\n`, start next search after it.
    // `\n` is first char of next chunk, so it'll get added to `str` when chunk is pushed.
    // SAFETY: Have checked not at EOF.
    if pos.read() == b'\n' {
        // SAFETY: `\n` is ASCII, so advancing past it leaves `pos` on a UTF-8 char boundary
        pos = pos.add(1);
    }

    // No escape sequence has been decoded yet, so pass `true` for `is_valid_escape_sequence`.
    self.template_literal_escaped(str, pos, chunk_start, true, substitute, tail)
}
/// Consume rest of template literal after a `\` escape is found.
///
/// Copies the text seen so far (up to before the `\`) into a new arena string, decodes the
/// escape sequence into it, and hands over to `template_literal_escaped` for the remainder.
///
/// # SAFETY
/// * Byte at `pos` must be `\`.
/// * `pos` must not be before `self.source.position()`.
unsafe fn template_literal_backslash(
    &mut self,
    pos: SourcePosition<'a>,
    substitute: Kind,
    tail: Kind,
) -> Kind {
    // Arena string for the modified template literal, pre-filled with everything before `\`.
    // SAFETY: Caller guarantees `pos` is not before `self.source.position()`.
    let mut unescaped = self.template_literal_create_string(pos);

    // `read_string_escape_sequence` expects `self.source` to be positioned after `\`.
    // SAFETY: Caller guarantees next byte is `\`, which is ASCII, so `pos + 1` is UTF-8 char boundary.
    self.source.set_position(pos.add(1));

    // Decode escape sequence into `unescaped`, noting whether it was a valid one.
    let mut escape_is_valid = true;
    self.read_string_escape_sequence(&mut unescaped, true, &mut escape_is_valid);

    // Both the search and the next chunk resume right after the escape.
    // SAFETY: `pos` and `chunk_start` are the same
    let resume_at = self.source.position();
    self.template_literal_escaped(
        unescaped,
        resume_at,
        resume_at,
        escape_is_valid,
        substitute,
        tail,
    )
}
/// Allocate the arena string for a modified template literal, pre-filled with the source text
/// from `self.source`'s current position up to `pos`.
/// # SAFETY
/// `pos` must not be before `self.source.position()`
unsafe fn template_literal_create_string(&self, pos: SourcePosition) -> String<'a> {
    // SAFETY: Caller guarantees `pos` is not before `self.source.position()`.
    let prefix = self.source.str_from_current_to_pos_unchecked(pos);

    // The final length of the template literal is unknown at this point. Guess that it will be
    // double what has been seen so far, but never less than `MIN_ESCAPED_TEMPLATE_LIT_LEN`.
    let capacity_guess = max(MIN_ESCAPED_TEMPLATE_LIT_LEN, prefix.len() * 2);

    let mut unescaped = String::with_capacity_in(capacity_guess, self.allocator);
    unescaped.push_str(prefix);
    unescaped
}
/// Process template literal after `\r` or `\` found.
///
/// Continues the byte search from `pos`, building the modified template literal in `str`:
/// * Unmodified source text is appended chunk by chunk, each chunk running from `chunk_start`
///   up to the next matched byte.
/// * A `\r` byte is never copied into `str`; the next chunk starts after it.
/// * A `\` escape is decoded into `str` by `read_string_escape_sequence`, which also updates
///   `is_valid_escape_sequence`.
///
/// When the component ends (at `${` or '`'), `str` is passed to `save_template_string` and
/// `substitute` (for `${`) or `tail` (for '`') is produced. On reaching end of file, an
/// unterminated string error is reported and `Kind::Undetermined` is produced.
///
/// NOTE(review): a `\r` not followed by `\n` is dropped without pushing a `\n` in its place.
/// Confirm this matches the intended line terminator normalization.
///
/// # SAFETY
/// `chunk_start` must not be after `pos`.
unsafe fn template_literal_escaped(
&mut self,
mut str: String<'a>,
pos: SourcePosition<'a>,
mut chunk_start: SourcePosition<'a>,
mut is_valid_escape_sequence: bool,
substitute: Kind,
tail: Kind,
) -> Kind {
// Assume the component ends with `${` until a closing '`' is found.
let mut ret = substitute;
byte_search! {
lexer: self,
table: TEMPLATE_LITERAL_TABLE,
start: pos,
continue_if: |next_byte, pos| {
if next_byte == b'$' {
// SAFETY: Next byte is `$` which is ASCII, so after it is a UTF-8 char boundary
let after_dollar = pos.add(1);
if after_dollar.addr() < self.source.end_addr() {
// If `${`, exit.
// SAFETY: Have checked there's at least 1 further byte to read.
if after_dollar.read() == b'{' {
// Add last chunk to `str`.
// SAFETY: Caller guarantees `chunk_start` is not after `pos` at start of
// this function. `pos` only increases during searching.
// Where `chunk_start` is updated, it's always before or equal to `pos`.
// So `chunk_start` cannot be after `pos`.
let chunk = self.source.str_between_positions_unchecked(chunk_start, pos);
str.push_str(chunk);
// Skip `${` and stop searching.
// SAFETY: Consuming `${` leaves `pos` on a UTF-8 char boundary.
pos = after_dollar.add(1);
false
} else {
// Not `${`. Continue searching.
true
}
} else {
// This is last byte in file. Continue to `handle_eof`.
// This is illegal in valid JS, so mark this branch cold.
cold_branch(|| true)
}
} else {
// Next byte is '`', `\r` or `\`. Add chunk up to before this char to `str`.
// SAFETY: Caller guarantees `chunk_start` is not after `pos` at start of
// this function. `pos` only increases during searching.
// Where `chunk_start` is updated, it's always before or equal to `pos`.
// So `chunk_start` cannot be after `pos`.
let chunk = self.source.str_between_positions_unchecked(chunk_start, pos);
str.push_str(chunk);
match next_byte {
b'`' => {
// Skip '`' and stop searching.
// SAFETY: Byte at `pos` is '`' (ASCII), so `pos + 1` is a UTF-8 char boundary.
pos = pos.add(1);
ret = tail;
false
}
b'\r' => {
// Set next chunk to start after `\r`.
// SAFETY: Next byte is `\r` which is ASCII, so after it is a UTF-8 char boundary.
// This temporarily puts `chunk_start` 1 byte after `pos`, but `byte_search!` macro
// increments `pos` when return `true` from `continue_if`, so `pos` will be
// brought up to `chunk_start` again.
chunk_start = pos.add(1);
if chunk_start.addr() < self.source.end_addr() {
// If next char is `\n`, start next search after it.
// NB: `byte_search!` macro already advances `pos` by 1, so only advance
// by 1 here, so that in total we skip 2 bytes for `\r\n`.
// No need to push `\n` to `str`, as it's 1st char of next chunk,
// and will be added to `str` when next chunk is pushed.
if chunk_start.read() == b'\n' {
pos = chunk_start;
}
} else {
// This is last byte in file. Continue to `handle_eof`.
// This is illegal in valid JS, so mark this branch cold.
cold_branch(|| {});
}
// Continue searching
true
}
_ => {
// `TEMPLATE_LITERAL_TABLE` only matches `$`, '`', `\r` and `\`
debug_assert!(next_byte == b'\\');
// Decode escape sequence into `str`.
// `read_string_escape_sequence` expects `self.source` to be positioned after `\`.
// SAFETY: Next byte is `\`, which is ASCII, so `pos + 1` is UTF-8 char boundary.
let after_backslash = pos.add(1);
self.source.set_position(after_backslash);
self.read_string_escape_sequence(&mut str, true, &mut is_valid_escape_sequence);
// Start next chunk after escape sequence
chunk_start = self.source.position();
// Checked in release builds too: the `sub(1)` below relies on it.
assert!(chunk_start.addr() >= after_backslash.addr());
// Continue search after escape sequence.
// NB: `byte_search!` macro increments `pos` when return `true`,
// so need to subtract 1 here to counteract that.
// SAFETY: Added 1 to `pos` above, and checked `chunk_start` hasn't moved
// backwards from that, so subtracting 1 again is within bounds.
pos = chunk_start.sub(1);
// Continue searching
true
}
}
}
},
handle_match: |_next_byte, _start| {
// Component is complete. Record the modified string for this token.
self.save_template_string(
is_valid_escape_sequence,
str.into_bump_str(),
);
ret
},
handle_eof: |_start| {
self.error(diagnostics::UnterminatedString(self.unterminated_range()));
Kind::Undetermined
},
};
}
/// Re-tokenize the current `}` token for `TemplateSubstitutionTail`
@ -53,16 +318,8 @@ impl<'a> Lexer<'a> {
self.finish_next(kind)
}
/// Save the template if it is escaped
fn save_template_string(
&mut self,
is_valid_escape_sequence: bool,
has_escape: bool,
s: &'a str,
) {
if !has_escape {
return;
}
/// Save escaped template string
fn save_template_string(&mut self, is_valid_escape_sequence: bool, s: &'a str) {
self.escaped_templates.insert(self.token.start, is_valid_escape_sequence.then_some(s));
self.token.escaped = true;
}