From c0d1d6b08acca74ccd86a666b1195e8d0fe840d8 Mon Sep 17 00:00:00 2001 From: overlookmotel Date: Fri, 9 Feb 2024 13:00:27 +0000 Subject: [PATCH] perf(parser): lex strings as bytes (#2357) Lex string literals as bytes, using same techniques as for identifiers. Handling escapes could be optimized a bit more, and maybe I'll return to that, but as escapes are fairly rare, it wouldn't be the biggest gain. --- crates/oxc_parser/src/lexer/byte_handlers.rs | 22 ++- crates/oxc_parser/src/lexer/source.rs | 2 +- crates/oxc_parser/src/lexer/string.rs | 185 ++++++++++++++++--- 3 files changed, 168 insertions(+), 41 deletions(-) diff --git a/crates/oxc_parser/src/lexer/byte_handlers.rs b/crates/oxc_parser/src/lexer/byte_handlers.rs index 67beef8ee..c368f527c 100644 --- a/crates/oxc_parser/src/lexer/byte_handlers.rs +++ b/crates/oxc_parser/src/lexer/byte_handlers.rs @@ -1,4 +1,4 @@ -use super::{Kind, Lexer, LexerContext}; +use super::{Kind, Lexer}; use crate::diagnostics; #[allow(clippy::unnecessary_safety_comment)] @@ -21,7 +21,7 @@ static BYTE_HANDLERS: [ByteHandler; 256] = [ // 0 1 2 3 4 5 6 7 8 9 A B C D E F // ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, SPS, LIN, ISP, ISP, LIN, ERR, ERR, // 0 ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, // 1 - SPS, EXL, QOT, HAS, IDT, PRC, AMP, QOT, PNO, PNC, ATR, PLS, COM, MIN, PRD, SLH, // 2 + SPS, EXL, QOD, HAS, IDT, PRC, AMP, QOS, PNO, PNC, ATR, PLS, COM, MIN, PRD, SLH, // 2 ZER, DIG, DIG, DIG, DIG, DIG, DIG, DIG, DIG, DIG, COL, SEM, LSS, EQL, GTR, QST, // 3 AT_, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, // 4 IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, BTO, ESC, BTC, CRT, IDT, // 5 @@ -220,14 +220,16 @@ ascii_byte_handler!(EXL(lexer) { } }); -// ' " -ascii_byte_handler!(QOT(lexer) { - let c = lexer.consume_char(); - if lexer.context == LexerContext::JsxAttributeValue { - lexer.read_jsx_string_literal(c) - } else { - lexer.read_string_literal(c) - } +// " 
+ascii_byte_handler!(QOD(lexer) { + // SAFETY: This function is only called for `"` + unsafe { lexer.read_string_literal_double_quote() } +}); + +// ' +ascii_byte_handler!(QOS(lexer) { + // SAFETY: This function is only called for `'` + unsafe { lexer.read_string_literal_single_quote() } }); // # diff --git a/crates/oxc_parser/src/lexer/source.rs b/crates/oxc_parser/src/lexer/source.rs index 5beb3ab0c..1e19d0bb3 100644 --- a/crates/oxc_parser/src/lexer/source.rs +++ b/crates/oxc_parser/src/lexer/source.rs @@ -362,7 +362,7 @@ impl<'a> Source<'a> { /// In particular, safe methods `Source::next_char`, `Source::peek_char`, and `Source::remaining` /// are *not* safe to call until one of above conditions is satisfied. #[inline] - unsafe fn next_byte_unchecked(&mut self) -> u8 { + pub(super) unsafe fn next_byte_unchecked(&mut self) -> u8 { // SAFETY: Caller guarantees not at end of file i.e. `ptr != end`. // Methods of this type provide no way for `ptr` to be before `start` or after `end`. // Therefore always valid to read a byte from `ptr`, and incrementing `ptr` cannot result diff --git a/crates/oxc_parser/src/lexer/string.rs b/crates/oxc_parser/src/lexer/string.rs index 12785e0a6..21fc5d9c6 100644 --- a/crates/oxc_parser/src/lexer/string.rs +++ b/crates/oxc_parser/src/lexer/string.rs @@ -1,40 +1,165 @@ -use super::{AutoCow, Kind, Lexer, Span, Token}; +use super::{ + search::{byte_search, safe_byte_match_table, SafeByteMatchTable}, + source::SourcePosition, + Kind, Lexer, LexerContext, Span, Token, +}; use crate::diagnostics; +use oxc_allocator::String; +use std::cmp::max; + +const MIN_ESCAPED_STR_LEN: usize = 16; + +static DOUBLE_QUOTE_STRING_END_TABLE: SafeByteMatchTable = + safe_byte_match_table!(|b| matches!(b, b'"' | b'\r' | b'\n' | b'\\')); + +static SINGLE_QUOTE_STRING_END_TABLE: SafeByteMatchTable = + safe_byte_match_table!(|b| matches!(b, b'\'' | b'\r' | b'\n' | b'\\')); + impl<'a> Lexer<'a> { /// 12.9.4 String Literals - pub(super) fn read_string_literal(&mut 
self, delimiter: char) -> Kind { - let mut builder = AutoCow::new(self); - loop { - match self.next_char() { - None | Some('\r' | '\n') => { - self.error(diagnostics::UnterminatedString(self.unterminated_range())); - return Kind::Undetermined; - } - Some(c @ ('"' | '\'')) => { - if c == delimiter { - self.save_string(builder.has_escape(), builder.finish_without_push(self)); - return Kind::Str; - } - builder.push_matching(c); - } - Some('\\') => { - let start = self.offset() - 1; - let text = builder.get_mut_string_without_current_ascii_char(self); - let mut is_valid_escape_sequence = true; - self.read_string_escape_sequence(text, false, &mut is_valid_escape_sequence); - if !is_valid_escape_sequence { - let range = Span::new(start, self.offset()); - self.error(diagnostics::InvalidEscapeSequence(range)); - } - } - Some(c) => { - builder.push_matching(c); - } - } + + /// Read string literal delimited with `"`. + /// # SAFETY + /// Next character must be `"`. + pub(super) unsafe fn read_string_literal_double_quote(&mut self) -> Kind { + if self.context == LexerContext::JsxAttributeValue { + self.consume_char(); + self.read_jsx_string_literal('"') + } else { + // SAFETY: Caller guarantees next char is `"`, which is a single ASCII byte + self.read_string_literal(b'"', &DOUBLE_QUOTE_STRING_END_TABLE) } } + /// Read string literal delimited with `'`. + /// # SAFETY + /// Next character must be `'`. + pub(super) unsafe fn read_string_literal_single_quote(&mut self) -> Kind { + if self.context == LexerContext::JsxAttributeValue { + self.consume_char(); + self.read_jsx_string_literal('\'') + } else { + // SAFETY: Caller guarantees next char is `'`, which is a single ASCII byte + self.read_string_literal(b'\'', &SINGLE_QUOTE_STRING_END_TABLE) + } + } + + /// Read string literal. + /// # SAFETY + /// Next byte must be ASCII. + unsafe fn read_string_literal(&mut self, delimiter: u8, table: &SafeByteMatchTable) -> Kind { + // Skip opening quote. 
+ // SAFETY: Caller guarantees next byte is ASCII, so safe to advance past it. + let after_opening_quote = unsafe { self.source.position().add(1) }; + + // Consume bytes which are part of string literal + byte_search! { + lexer: self, + table: table, + start: after_opening_quote, + handle_match: |next_byte| { + // Found a matching byte. + // Either end of string found, or a line break, or `\` escape. + if next_byte == delimiter { + self.consume_char(); + return Kind::Str; + } + + if next_byte == b'\\' { + return self.string_literal_on_escape(delimiter, table, after_opening_quote); + } + + debug_assert!(matches!(next_byte, b'\r' | b'\n')); + self.consume_char(); + self.error(diagnostics::UnterminatedString(self.unterminated_range())); + Kind::Undetermined + }, + handle_eof: || { + self.error(diagnostics::UnterminatedString(self.unterminated_range())); + Kind::Undetermined + }, + }; + } + + /// Process string literal when `\` escape found. + #[cold] + fn string_literal_on_escape( + &mut self, + delimiter: u8, + table: &SafeByteMatchTable, + after_opening_quote: SourcePosition, + ) -> Kind { + // Create arena string to hold unescaped string. + // We don't know how long string will end up being. Take a guess that total length + // will be double what we've seen so far, or `MIN_ESCAPED_STR_LEN` minimum. + let so_far = self.source.str_from_pos_to_current(after_opening_quote); + let capacity = max(so_far.len() * 2, MIN_ESCAPED_STR_LEN); + let mut str = String::with_capacity_in(capacity, self.allocator); + + // Push chunk before `\` into `str`. + // `bumpalo::collections::string::String::push_str` is currently expensive due to + // inefficiency in bumpalo's implementation. But best we have right now. 
+ str.push_str(so_far); + + 'outer: loop { + // Consume `\` + let escape_start_offset = self.offset(); + self.consume_char(); + + // Consume escape sequence and add char to `str` + let mut is_valid_escape_sequence = true; + self.read_string_escape_sequence(&mut str, false, &mut is_valid_escape_sequence); + if !is_valid_escape_sequence { + let range = Span::new(escape_start_offset, self.offset()); + self.error(diagnostics::InvalidEscapeSequence(range)); + } + + // Consume bytes until reach end of string, line break, or another escape + let chunk_start = self.source.position(); + while let Some(b) = self.source.peek_byte() { + if !table.matches(b) { + // SAFETY: A byte is available, as we just peeked it. + // This may put `source`'s position on a UTF-8 continuation byte, which violates + // `Source`'s invariant temporarily, but the guarantees of `SafeByteMatchTable` + // mean `table.matches(b)` will always return `true` in a pattern where + // we can't exit this loop without `source` being positioned on a UTF-8 character + // boundary again. + unsafe { self.source.next_byte_unchecked() }; + continue; + } + + if b == delimiter { + // End of string found. Push last chunk to `str`, and consume closing quote. + let chunk = self.source.str_from_pos_to_current(chunk_start); + str.push_str(chunk); + self.consume_char(); + break 'outer; + } + + if b == b'\\' { + // Another escape found. Push last chunk to `str`, and loop back to handle escape. 
+ let chunk = self.source.str_from_pos_to_current(chunk_start); + str.push_str(chunk); + continue 'outer; + } + + debug_assert!(matches!(b, b'\r' | b'\n')); + self.consume_char(); + break; + } + + // EOF or line break: string is unterminated + self.error(diagnostics::UnterminatedString(self.unterminated_range())); + return Kind::Undetermined; + } + + // Convert `str` to arena slice and save to `escaped_strings` + self.save_string(true, str.into_bump_str()); + + Kind::Str + } + /// Save the string if it is escaped /// This reduces the overall memory consumption while keeping the `Token` size small /// Strings without escaped values can be retrieved as is from the token span