From c0d1d6b08acca74ccd86a666b1195e8d0fe840d8 Mon Sep 17 00:00:00 2001
From: overlookmotel <theoverlookmotel@gmail.com>
Date: Fri, 9 Feb 2024 13:00:27 +0000
Subject: [PATCH] perf(parser): lex strings as bytes (#2357)

Lex string literals as bytes, using same techniques as for identifiers.

Handling escapes could be optimized a bit more, and maybe I'll return to that, but as escapes are fairly rare, it wouldn't be the biggest gain.
---
 crates/oxc_parser/src/lexer/byte_handlers.rs |  22 ++-
 crates/oxc_parser/src/lexer/source.rs        |   2 +-
 crates/oxc_parser/src/lexer/string.rs        | 185 ++++++++++++++++---
 3 files changed, 168 insertions(+), 41 deletions(-)

diff --git a/crates/oxc_parser/src/lexer/byte_handlers.rs b/crates/oxc_parser/src/lexer/byte_handlers.rs
index 67beef8ee..c368f527c 100644
--- a/crates/oxc_parser/src/lexer/byte_handlers.rs
+++ b/crates/oxc_parser/src/lexer/byte_handlers.rs
@@ -1,4 +1,4 @@
-use super::{Kind, Lexer, LexerContext};
+use super::{Kind, Lexer};
 use crate::diagnostics;
 
 #[allow(clippy::unnecessary_safety_comment)]
@@ -21,7 +21,7 @@ static BYTE_HANDLERS: [ByteHandler; 256] = [
 //  0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F    //
     ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, SPS, LIN, ISP, ISP, LIN, ERR, ERR, // 0
     ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, // 1
-    SPS, EXL, QOT, HAS, IDT, PRC, AMP, QOT, PNO, PNC, ATR, PLS, COM, MIN, PRD, SLH, // 2
+    SPS, EXL, QOD, HAS, IDT, PRC, AMP, QOS, PNO, PNC, ATR, PLS, COM, MIN, PRD, SLH, // 2
     ZER, DIG, DIG, DIG, DIG, DIG, DIG, DIG, DIG, DIG, COL, SEM, LSS, EQL, GTR, QST, // 3
     AT_, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, // 4
     IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, BTO, ESC, BTC, CRT, IDT, // 5
@@ -220,14 +220,16 @@ ascii_byte_handler!(EXL(lexer) {
     }
 });
 
-// ' "
-ascii_byte_handler!(QOT(lexer) {
-    let c = lexer.consume_char();
-    if lexer.context == LexerContext::JsxAttributeValue {
-        lexer.read_jsx_string_literal(c)
-    } else {
-        lexer.read_string_literal(c)
-    }
+// "
+ascii_byte_handler!(QOD(lexer) {
+    // SAFETY: This function is only called for `"`
+    unsafe { lexer.read_string_literal_double_quote() }
+});
+
+// '
+ascii_byte_handler!(QOS(lexer) {
+    // SAFETY: This function is only called for `'`
+    unsafe { lexer.read_string_literal_single_quote() }
 });
 
 // #
diff --git a/crates/oxc_parser/src/lexer/source.rs b/crates/oxc_parser/src/lexer/source.rs
index 5beb3ab0c..1e19d0bb3 100644
--- a/crates/oxc_parser/src/lexer/source.rs
+++ b/crates/oxc_parser/src/lexer/source.rs
@@ -362,7 +362,7 @@ impl<'a> Source<'a> {
     /// In particular, safe methods `Source::next_char`, `Source::peek_char`, and `Source::remaining`
     /// are *not* safe to call until one of above conditions is satisfied.
     #[inline]
-    unsafe fn next_byte_unchecked(&mut self) -> u8 {
+    pub(super) unsafe fn next_byte_unchecked(&mut self) -> u8 {
         // SAFETY: Caller guarantees not at end of file i.e. `ptr != end`.
         // Methods of this type provide no way for `ptr` to be before `start` or after `end`.
         // Therefore always valid to read a byte from `ptr`, and incrementing `ptr` cannot result
diff --git a/crates/oxc_parser/src/lexer/string.rs b/crates/oxc_parser/src/lexer/string.rs
index 12785e0a6..21fc5d9c6 100644
--- a/crates/oxc_parser/src/lexer/string.rs
+++ b/crates/oxc_parser/src/lexer/string.rs
@@ -1,40 +1,165 @@
-use super::{AutoCow, Kind, Lexer, Span, Token};
+use super::{
+    search::{byte_search, safe_byte_match_table, SafeByteMatchTable},
+    source::SourcePosition,
+    Kind, Lexer, LexerContext, Span, Token,
+};
 use crate::diagnostics;
 
+use oxc_allocator::String;
+use std::cmp::max;
+
+const MIN_ESCAPED_STR_LEN: usize = 16;
+
+static DOUBLE_QUOTE_STRING_END_TABLE: SafeByteMatchTable =
+    safe_byte_match_table!(|b| matches!(b, b'"' | b'\r' | b'\n' | b'\\'));
+
+static SINGLE_QUOTE_STRING_END_TABLE: SafeByteMatchTable =
+    safe_byte_match_table!(|b| matches!(b, b'\'' | b'\r' | b'\n' | b'\\'));
+
 impl<'a> Lexer<'a> {
     /// 12.9.4 String Literals
-    pub(super) fn read_string_literal(&mut self, delimiter: char) -> Kind {
-        let mut builder = AutoCow::new(self);
-        loop {
-            match self.next_char() {
-                None | Some('\r' | '\n') => {
-                    self.error(diagnostics::UnterminatedString(self.unterminated_range()));
-                    return Kind::Undetermined;
-                }
-                Some(c @ ('"' | '\'')) => {
-                    if c == delimiter {
-                        self.save_string(builder.has_escape(), builder.finish_without_push(self));
-                        return Kind::Str;
-                    }
-                    builder.push_matching(c);
-                }
-                Some('\\') => {
-                    let start = self.offset() - 1;
-                    let text = builder.get_mut_string_without_current_ascii_char(self);
-                    let mut is_valid_escape_sequence = true;
-                    self.read_string_escape_sequence(text, false, &mut is_valid_escape_sequence);
-                    if !is_valid_escape_sequence {
-                        let range = Span::new(start, self.offset());
-                        self.error(diagnostics::InvalidEscapeSequence(range));
-                    }
-                }
-                Some(c) => {
-                    builder.push_matching(c);
-                }
-            }
+
+    /// Read string literal delimited with `"`.
+    /// # SAFETY
+    /// Next character must be `"`.
+    pub(super) unsafe fn read_string_literal_double_quote(&mut self) -> Kind {
+        if self.context == LexerContext::JsxAttributeValue {
+            self.consume_char();
+            self.read_jsx_string_literal('"')
+        } else {
+            // SAFETY: Caller guarantees next char is `"`, which is ASCII
+            self.read_string_literal(b'"', &DOUBLE_QUOTE_STRING_END_TABLE)
         }
     }
 
+    /// Read string literal delimited with `'`.
+    /// # SAFETY
+    /// Next character must be `'`.
+    pub(super) unsafe fn read_string_literal_single_quote(&mut self) -> Kind {
+        if self.context == LexerContext::JsxAttributeValue {
+            self.consume_char();
+            self.read_jsx_string_literal('\'')
+        } else {
+            // SAFETY: Caller guarantees next char is `'`, which is ASCII
+            self.read_string_literal(b'\'', &SINGLE_QUOTE_STRING_END_TABLE)
+        }
+    }
+
+    /// Read string literal.
+    /// # SAFETY
+    /// Next byte must be ASCII.
+    unsafe fn read_string_literal(&mut self, delimiter: u8, table: &SafeByteMatchTable) -> Kind {
+        // Skip opening quote.
+        // SAFETY: Caller guarantees next byte is ASCII, so safe to advance past it.
+        let after_opening_quote = unsafe { self.source.position().add(1) };
+
+        // Consume bytes which are part of string literal
+        byte_search! {
+            lexer: self,
+            table: table,
+            start: after_opening_quote,
+            handle_match: |next_byte| {
+                // Found a matching byte.
+                // Either end of string found, or a line break, or `\` escape.
+                if next_byte == delimiter {
+                    self.consume_char();
+                    return Kind::Str;
+                }
+
+                if next_byte == b'\\' {
+                    return self.string_literal_on_escape(delimiter, table, after_opening_quote);
+                }
+
+                debug_assert!(matches!(next_byte, b'\r' | b'\n'));
+                self.consume_char();
+                self.error(diagnostics::UnterminatedString(self.unterminated_range()));
+                Kind::Undetermined
+            },
+            handle_eof: || {
+                self.error(diagnostics::UnterminatedString(self.unterminated_range()));
+                Kind::Undetermined
+            },
+        };
+    }
+
+    /// Process string literal when `\` escape found.
+    #[cold]
+    fn string_literal_on_escape(
+        &mut self,
+        delimiter: u8,
+        table: &SafeByteMatchTable,
+        after_opening_quote: SourcePosition,
+    ) -> Kind {
+        // Create arena string to hold unescaped string.
+        // We don't know how long string will end up being. Take a guess that total length
+        // will be double what we've seen so far, or `MIN_ESCAPED_STR_LEN` minimum.
+        let so_far = self.source.str_from_pos_to_current(after_opening_quote);
+        let capacity = max(so_far.len() * 2, MIN_ESCAPED_STR_LEN);
+        let mut str = String::with_capacity_in(capacity, self.allocator);
+
+        // Push chunk before `\` into `str`.
+        // `bumpalo::collections::string::String::push_str` is currently expensive due to
+        // inefficiency in bumpalo's implementation. But best we have right now.
+        str.push_str(so_far);
+
+        'outer: loop {
+            // Consume `\`
+            let escape_start_offset = self.offset();
+            self.consume_char();
+
+            // Consume escape sequence and add char to `str`
+            let mut is_valid_escape_sequence = true;
+            self.read_string_escape_sequence(&mut str, false, &mut is_valid_escape_sequence);
+            if !is_valid_escape_sequence {
+                let range = Span::new(escape_start_offset, self.offset());
+                self.error(diagnostics::InvalidEscapeSequence(range));
+            }
+
+            // Consume bytes until reach end of string, line break, or another escape
+            let chunk_start = self.source.position();
+            while let Some(b) = self.source.peek_byte() {
+                if !table.matches(b) {
+                    // SAFETY: A byte is available, as we just peeked it.
+                    // This may put `source`'s position on a UTF-8 continuation byte, which violates
+                    // `Source`'s invariant temporarily, but the guarantees of `SafeByteMatchTable`
+                    // mean `table.matches(b)` will always return `true` in a pattern where
+                    // we can't exit this loop without `source` being positioned on a UTF-8 character
+                    // boundary again.
+                    unsafe { self.source.next_byte_unchecked() };
+                    continue;
+                }
+
+                if b == delimiter {
+                    // End of string found. Push last chunk to `str`, and consume closing quote.
+                    let chunk = self.source.str_from_pos_to_current(chunk_start);
+                    str.push_str(chunk);
+                    self.consume_char();
+                    break 'outer;
+                }
+
+                if b == b'\\' {
+                    // Another escape found. Push last chunk to `str`, and loop back to handle escape.
+                    let chunk = self.source.str_from_pos_to_current(chunk_start);
+                    str.push_str(chunk);
+                    continue 'outer;
+                }
+
+                debug_assert!(matches!(b, b'\r' | b'\n'));
+                self.consume_char();
+                break;
+            }
+
+            // EOF or line break
+            self.error(diagnostics::UnterminatedString(self.unterminated_range()));
+            return Kind::Undetermined;
+        }
+
+        // Convert `str` to arena slice and save to `escaped_strings`
+        self.save_string(true, str.into_bump_str());
+
+        Kind::Str
+    }
+
     /// Save the string if it is escaped
     /// This reduces the overall memory consumption while keeping the `Token` size small
     /// Strings without escaped values can be retrieved as is from the token span