perf(parser): faster lexing template strings (#2541)

Speed up lexing template strings. This was the last use of `AutoCow` remaining in the lexer, and it's now removed. Implementation is quite complex, to avoid repeatedly branching on whether an unescaped string is required or not (the way `AutoCow` did). I tried to simplify it down to a single function, but this hurt performance significantly. Benchmarks do not show much movement, but I believe that's because there aren't many template strings in the benchmarks. Where there are template strings, I believe this speeds up lexing them significantly.
2026-05-24 20:32:10 +00:00 · 2024-02-29 05:28:30 +00:00 · 2024-02-29 05:28:30 +00:00 · 5a13714a18
commit 5a13714a18
parent 9d7ea6b3f0
4 changed files with 316 additions and 121 deletions
--- a/crates/oxc_parser/src/lexer/mod.rs
+++ b/crates/oxc_parser/src/lexer/mod.rs
@ -19,7 +19,6 @@ mod regex;
 mod search;
 mod source;
 mod string;
-mod string_builder;
 mod template;
 mod token;
 mod trivia_builder;
@ -38,7 +37,6 @@ use oxc_span::{SourceType, Span};
 use self::{
    byte_handlers::handle_byte,
    source::{Source, SourcePosition},
-    string_builder::AutoCow,
    trivia_builder::TriviaBuilder,
 };
 pub use self::{
--- a/crates/oxc_parser/src/lexer/source.rs
+++ b/crates/oxc_parser/src/lexer/source.rs
@ -218,6 +218,20 @@ impl<'a> Source<'a> {
        self.str_between_positions_unchecked(pos, SourcePosition::new(self.ptr))
    }

+    /// Get string slice from current position of `Source` up to a `SourcePosition`, without checks.
+    ///
+    /// # SAFETY
+    /// `pos` must not be before current position of `Source`.
+    /// This is always the case if `pos` was derived from the current position of `Source` and both:
+    /// 1. `Source` has not been advanced or repositioned (e.g. `Source::set_position`) since then.
+    /// 2. `pos` has not been moved backwards with `SourcePosition::sub`.
+    #[inline]
+    pub(super) unsafe fn str_from_current_to_pos_unchecked(&self, pos: SourcePosition) -> &'a str {
+        // SAFETY: Caller guarantees `pos` is not before current position of `Source`.
+        // `self.ptr` is always a valid `SourcePosition` due to invariants of `Source`.
+        self.str_between_positions_unchecked(SourcePosition::new(self.ptr), pos)
+    }
+
    /// Get string slice from a `SourcePosition` up to the end of `Source`.
    #[inline]
    pub(super) fn str_from_pos_to_end(&self, pos: SourcePosition) -> &'a str {
--- a/crates/oxc_parser/src/lexer/string_builder.rs
+++ b/crates/oxc_parser/src/lexer/string_builder.rs
@ -1,74 +0,0 @@
-// Copied from https://github.com/mozilla-spidermonkey/jsparagus/blob/master/crates/parser/src/lexer.rs#L2256
-
-use oxc_allocator::String;
-
-use crate::lexer::Lexer;
-
-pub struct AutoCow<'a> {
-    pub start: &'a str,
-    pub value: Option<String<'a>>,
-}
-
-impl<'a> AutoCow<'a> {
-    pub fn new(lexer: &Lexer<'a>) -> Self {
-        let start = lexer.remaining();
-        AutoCow { start, value: None }
-    }
-
-    // Push a char that matches `lexer.next_char()`.
-    pub fn push_matching(&mut self, c: char) {
-        if let Some(text) = &mut self.value {
-            text.push(c);
-        }
-    }
-
-    // Push a different character than `lexer.next_char()`.
-    // force_allocation_without_current_ascii_char must be called before this.
-    pub fn push_different(&mut self, c: char) {
-        debug_assert!(self.value.is_some());
-        self.value.as_mut().unwrap().push(c);
-    }
-
-    // Force allocation of a String, excluding the current ASCII character,
-    // and return the reference to it
-    pub fn get_mut_string_without_current_ascii_char<'b>(
-        &'b mut self,
-        lexer: &Lexer<'a>,
-    ) -> &'b mut String<'a> {
-        self.force_allocation_without_current_ascii_char(lexer);
-        self.value.as_mut().unwrap()
-    }
-
-    // Force allocation of a String, excluding the current ASCII character.
-    pub fn force_allocation_without_current_ascii_char(&mut self, lexer: &Lexer<'a>) {
-        if self.value.is_some() {
-            return;
-        }
-        self.value = Some(String::from_str_in(
-            &self.start[..self.start.len() - lexer.remaining().len() - 1],
-            lexer.allocator,
-        ));
-    }
-
-    // Check if the string contains a different character, such as an escape sequence
-    pub fn has_escape(&self) -> bool {
-        self.value.is_some()
-    }
-
-    // TODO: Delete this if not using it
-    #[allow(dead_code)]
-    pub fn finish(mut self, lexer: &Lexer<'a>) -> &'a str {
-        match self.value.take() {
-            Some(s) => s.into_bump_str(),
-            None => &self.start[..self.start.len() - lexer.remaining().len()],
-        }
-    }
-
-    // Just like finish, but without pushing current char.
-    pub fn finish_without_push(mut self, lexer: &Lexer<'a>) -> &'a str {
-        match self.value.take() {
-            Some(s) => s.into_bump_str(),
-            None => &self.start[..self.start.len() - lexer.remaining().len() - 1],
-        }
-    }
-}
--- a/crates/oxc_parser/src/lexer/template.rs
+++ b/crates/oxc_parser/src/lexer/template.rs
@ -1,47 +1,312 @@
-use super::{AutoCow, Kind, Lexer, Token};
+use super::{
+    cold_branch,
+    search::{byte_search, safe_byte_match_table, SafeByteMatchTable},
+    Kind, Lexer, SourcePosition, Token,
+};
 use crate::diagnostics;

-use oxc_syntax::identifier::{CR, LF};
+use std::cmp::max;
+
+use oxc_allocator::String;
+
+const MIN_ESCAPED_TEMPLATE_LIT_LEN: usize = 16;
+
+static TEMPLATE_LITERAL_TABLE: SafeByteMatchTable =
+    safe_byte_match_table!(|b| matches!(b, b'$' | b'`' | b'\r' | b'\\'));

 impl<'a> Lexer<'a> {
    /// 12.8.6 Template Literal Lexical Components
+
+    /// Read template literal component.
+    ///
+    /// This function handles the common case where template contains no escapes or `\r` characters
+    /// and so does not require saving to `lexer.escaped_templates`. If an escape or `\r` is found,
+    /// control is passed to `template_literal_backslash` or `template_literal_carriage_return`, which
+    /// build the unescaped string. This division keeps the path for common case as fast as possible.
    pub(super) fn read_template_literal(&mut self, substitute: Kind, tail: Kind) -> Kind {
-        let mut builder = AutoCow::new(self);
-        let mut is_valid_escape_sequence = true;
-        while let Some(c) = self.next_char() {
-            match c {
-                '$' if self.peek() == Some('{') => {
-                    self.save_template_string(
-                        is_valid_escape_sequence,
-                        builder.has_escape(),
-                        builder.finish_without_push(self),
-                    );
-                    self.consume_char();
-                    return substitute;
-                }
-                '`' => {
-                    self.save_template_string(
-                        is_valid_escape_sequence,
-                        builder.has_escape(),
-                        builder.finish_without_push(self),
-                    );
-                    return tail;
-                }
-                CR => {
-                    builder.force_allocation_without_current_ascii_char(self);
-                    if self.next_eq(LF) {
-                        builder.push_different(LF);
+        let mut ret = substitute;
+
+        byte_search! {
+            lexer: self,
+            table: TEMPLATE_LITERAL_TABLE,
+            continue_if: |next_byte, pos| {
+                match next_byte {
+                    b'$' => {
+                        // SAFETY: Next byte is `$` which is ASCII, so after it is a UTF-8 char boundary
+                        let after_dollar = unsafe { pos.add(1) };
+                        if after_dollar.addr() < self.source.end_addr() {
+                            // If `${`, exit.
+                            // SAFETY: Have checked there's at least 1 further byte to read.
+                            if unsafe { after_dollar.read() } == b'{' {
+                                // Skip `${` and stop searching.
+                                // SAFETY: Consuming `${` leaves `pos` on a UTF-8 char boundary.
+                                pos = unsafe { after_dollar.add(1) };
+                                false
+                            } else {
+                                // Not `${`. Continue searching.
+                                true
+                            }
+                        } else {
+                            // `$` is the last byte in file. Continue searching, which ends in `handle_eof`.
+                            // An unterminated template is not valid JS, so mark this branch cold.
+                            cold_branch(|| true)
+                        }
+                    },
+                    b'`' => {
+                        // Skip '`' and stop searching.
+                        // SAFETY: Char at `pos` is '`', so `pos + 1` is a UTF-8 char boundary.
+                        pos = unsafe { pos.add(1) };
+                        ret = tail;
+                        false
+                    },
+                    b'\r' => {
+                        // SAFETY: Byte at `pos` is `\r`.
+                        // `pos` has only been advanced relative to `self.source.position()`.
+                        return unsafe { self.template_literal_carriage_return(pos, substitute, tail) };
+                    }
+                    _ => {
+                        // `TEMPLATE_LITERAL_TABLE` only matches `$`, '`', `\r` and `\`
+                        debug_assert!(next_byte == b'\\');
+                        // SAFETY: Byte at `pos` is `\`.
+                        // `pos` has only been advanced relative to `self.source.position()`.
+                        return unsafe { self.template_literal_backslash(pos, substitute, tail) };
                    }
                }
-                '\\' => {
-                    let text = builder.get_mut_string_without_current_ascii_char(self);
-                    self.read_string_escape_sequence(text, true, &mut is_valid_escape_sequence);
-                }
-                _ => builder.push_matching(c),
-            }
+            },
+            handle_match: |_next_byte, _start| {
+                ret
+            },
+            handle_eof: |_start| {
+                self.error(diagnostics::UnterminatedString(self.unterminated_range()));
+                Kind::Undetermined
+            },
+        };
+    }
+
+    /// Consume rest of template literal after a `\r` is found.
+    ///
+    /// # SAFETY
+    /// * Byte at `pos` must be `\r`.
+    /// * `pos` must not be before `self.source.position()`.
+    unsafe fn template_literal_carriage_return(
+        &mut self,
+        mut pos: SourcePosition<'a>,
+        substitute: Kind,
+        tail: Kind,
+    ) -> Kind {
+        // Create arena string to hold modified template literal, containing up to before `\r`.
+        // SAFETY: Caller guarantees `pos` is not before `self.source.position()`.
+        let str = self.template_literal_create_string(pos);
+
+        // Skip `\r`.
+        // SAFETY: Caller guarantees byte at `pos` is `\r`, so `pos + 1` is a UTF-8 char boundary.
+        pos = pos.add(1);
+
+        // If at EOF, exit. An unterminated template is not valid JS, so this is a cold branch.
+        if pos.addr() == self.source.end_addr() {
+            return cold_branch(|| {
+                self.source.advance_to_end();
+                self.error(diagnostics::UnterminatedString(self.unterminated_range()));
+                Kind::Undetermined
+            });
+        }
+
+        // Start next chunk after `\r`. NOTE(review): a lone `\r` is thus dropped, not turned into `\n` - confirm.
+        let chunk_start = pos;
+
+        // If next char is `\n`, start next search after it.
+        // `\n` is first char of next chunk, so it'll get added to `str` when chunk is pushed.
+        // SAFETY: Have checked not at EOF.
+        if pos.read() == b'\n' {
+            // SAFETY: `\n` is ASCII, so advancing past it leaves `pos` on a UTF-8 char boundary
+            pos = pos.add(1);
+        }
+
+        self.template_literal_escaped(str, pos, chunk_start, true, substitute, tail)
+    }
+
+    /// Consume rest of template literal after a `\` escape is found.
+    ///
+    /// # SAFETY
+    /// * Byte at `pos` must be `\`.
+    /// * `pos` must not be before `self.source.position()`.
+    unsafe fn template_literal_backslash(
+        &mut self,
+        pos: SourcePosition<'a>,
+        substitute: Kind,
+        tail: Kind,
+    ) -> Kind {
+        // Create arena string to hold modified template literal, containing up to before `\`.
+        // SAFETY: Caller guarantees `pos` is not before `self.source.position()`.
+        let mut str = self.template_literal_create_string(pos);
+
+        // Decode escape sequence into `str`.
+        // `read_string_escape_sequence` expects `self.source` to be positioned after `\`.
+        // SAFETY: Caller guarantees byte at `pos` is `\`, which is ASCII, so `pos + 1` is UTF-8 char boundary.
+        let after_backslash = pos.add(1);
+        self.source.set_position(after_backslash);
+
+        let mut is_valid_escape_sequence = true;
+        self.read_string_escape_sequence(&mut str, true, &mut is_valid_escape_sequence);
+
+        // Continue search after escape
+        let after_escape = self.source.position();
+        // SAFETY: `pos` and `chunk_start` are both `after_escape`, so `chunk_start` is not after `pos`
+        self.template_literal_escaped(
+            str,
+            after_escape,
+            after_escape,
+            is_valid_escape_sequence,
+            substitute,
+            tail,
+        )
+    }
+
+    /// Create arena string for modified template literal, pre-filled with source text from current position up to `pos`.
+    /// # SAFETY
+    /// `pos` must not be before `self.source.position()`
+    unsafe fn template_literal_create_string(&self, pos: SourcePosition) -> String<'a> {
+        // Create arena string to hold modified template literal.
+        // We don't know how long template literal will end up being. Take a guess that total length
+        // will be double what we've seen so far, or `MIN_ESCAPED_TEMPLATE_LIT_LEN` minimum.
+        // SAFETY: Caller guarantees `pos` is not before `self.source.position()`.
+        let so_far = self.source.str_from_current_to_pos_unchecked(pos);
+        let capacity = max(so_far.len() * 2, MIN_ESCAPED_TEMPLATE_LIT_LEN);
+        let mut str = String::with_capacity_in(capacity, self.allocator);
+        str.push_str(so_far);
+        str
+    }
+
+    /// Process template literal after `\r` or `\` found.
+    /// # SAFETY
+    /// `chunk_start` must not be after `pos`.
+    unsafe fn template_literal_escaped(
+        &mut self,
+        mut str: String<'a>,
+        pos: SourcePosition<'a>,
+        mut chunk_start: SourcePosition<'a>,
+        mut is_valid_escape_sequence: bool,
+        substitute: Kind,
+        tail: Kind,
+    ) -> Kind {
+        let mut ret = substitute;
+
+        byte_search! {
+            lexer: self,
+            table: TEMPLATE_LITERAL_TABLE,
+            start: pos,
+            continue_if: |next_byte, pos| {
+                if next_byte == b'$' {
+                    // SAFETY: Next byte is `$` which is ASCII, so after it is a UTF-8 char boundary
+                    let after_dollar = pos.add(1);
+                    if after_dollar.addr() < self.source.end_addr() {
+                        // If `${`, exit.
+                        // SAFETY: Have checked there's at least 1 further byte to read.
+                        if after_dollar.read() == b'{' {
+                            // Add last chunk to `str`.
+                            // SAFETY: Caller guarantees `chunk_start` is not after `pos` at start of
+                            // this function. `pos` only increases during searching.
+                            // Where `chunk_start` is updated, it's always before or equal to `pos`.
+                            // So `chunk_start` cannot be after `pos`.
+                            let chunk = self.source.str_between_positions_unchecked(chunk_start, pos);
+                            str.push_str(chunk);
+
+                            // Skip `${` and stop searching.
+                            // SAFETY: Consuming `${` leaves `pos` on a UTF-8 char boundary.
+                            pos = after_dollar.add(1);
+                            false
+                        } else {
+                            // Not `${`. Continue searching.
+                            true
+                        }
+                    } else {
+                        // `$` is the last byte in file. Continue searching, which ends in `handle_eof`.
+                        // An unterminated template is not valid JS, so mark this branch cold.
+                        cold_branch(|| true)
+                    }
+                } else {
+                    // Next byte is '`', `\r` or `\`. Add chunk up to before this char to `str`.
+                    // SAFETY: Caller guarantees `chunk_start` is not after `pos` at start of
+                    // this function. `pos` only increases during searching.
+                    // Where `chunk_start` is updated, it's always before or equal to `pos`.
+                    // So `chunk_start` cannot be after `pos`.
+                    let chunk = self.source.str_between_positions_unchecked(chunk_start, pos);
+                    str.push_str(chunk);
+
+                    match next_byte {
+                        b'`' => {
+                            // Skip '`' and stop searching.
+                            // SAFETY: Byte at `pos` is '`' (ASCII), so `pos + 1` is a UTF-8 char boundary.
+                            pos = pos.add(1);
+                            ret = tail;
+                            false
+                        }
+                        b'\r' => {
+                            // Set next chunk to start after `\r`.
+                            // SAFETY: Next byte is `\r` which is ASCII, so after it is a UTF-8 char boundary.
+                            // This temporarily puts `chunk_start` 1 byte after `pos`, but `byte_search!` macro
+                            // increments `pos` when `continue_if` returns `true`, so `pos` will be
+                            // brought up to `chunk_start` again.
+                            chunk_start = pos.add(1);
+
+                            if chunk_start.addr() < self.source.end_addr() {
+                                // If next char is `\n`, start next search after it.
+                                // NB: `byte_search!` macro already advances `pos` by 1, so only advance
+                                // by 1 here, so that in total we skip 2 bytes for `\r\n`.
+                                // No need to push `\n` to `str`, as it's 1st char of next chunk,
+                                // and will be added to `str` when next chunk is pushed.
+                                if chunk_start.read() == b'\n' {
+                                    pos = chunk_start;
+                                }
+                            } else {
+                                // `\r` is the last byte in file. Continue searching, which ends in `handle_eof`.
+                                // An unterminated template is not valid JS, so mark this branch cold.
+                                cold_branch(|| {});
+                            }
+
+                            // Continue searching. NOTE(review): a lone `\r` is dropped, not turned into `\n` - confirm.
+                            true
+                        }
+                        _ => {
+                            // `TEMPLATE_LITERAL_TABLE` only matches `$`, '`', `\r` and `\`
+                            debug_assert!(next_byte == b'\\');
+
+                            // Decode escape sequence into `str`.
+                            // `read_string_escape_sequence` expects `self.source` to be positioned after `\`.
+                            // SAFETY: Next byte is `\`, which is ASCII, so `pos + 1` is UTF-8 char boundary.
+                            let after_backslash = pos.add(1);
+                            self.source.set_position(after_backslash);
+                            self.read_string_escape_sequence(&mut str, true, &mut is_valid_escape_sequence);
+
+                            // Start next chunk after escape sequence
+                            chunk_start = self.source.position();
+                            assert!(chunk_start.addr() >= after_backslash.addr());
+
+                            // Continue search after escape sequence.
+                            // NB: `byte_search!` macro increments `pos` when `continue_if` returns `true`,
+                            // so need to subtract 1 here to counteract that.
+                            // SAFETY: Added 1 to `pos` above, and checked `chunk_start` hasn't moved
+                            // backwards from that, so subtracting 1 again is within bounds.
+                            pos = chunk_start.sub(1);
+
+                            // Continue searching
+                            true
+                        }
+                    }
+                }
+            },
+            handle_match: |_next_byte, _start| {
+                self.save_template_string(
+                    is_valid_escape_sequence,
+                    str.into_bump_str(),
+                );
+                ret
+            },
+            handle_eof: |_start| {
+                self.error(diagnostics::UnterminatedString(self.unterminated_range()));
+                Kind::Undetermined
+            },
+        };
+    }

    /// Re-tokenize the current `}` token for `TemplateSubstitutionTail`
@ -53,16 +318,8 @@ impl<'a> Lexer<'a> {
        self.finish_next(kind)
    }

-    /// Save the template if it is escaped
-    fn save_template_string(
-        &mut self,
-        is_valid_escape_sequence: bool,
-        has_escape: bool,
-        s: &'a str,
-    ) {
-        if !has_escape {
-            return;
-        }
+    /// Save escaped template string
+    fn save_template_string(&mut self, is_valid_escape_sequence: bool, s: &'a str) {
        self.escaped_templates.insert(self.token.start, is_valid_escape_sequence.then_some(s));
        self.token.escaped = true;
    }