diff --git a/crates/oxc_parser/src/lexer/search.rs b/crates/oxc_parser/src/lexer/search.rs index a9d9327a6..6233b833a 100644 --- a/crates/oxc_parser/src/lexer/search.rs +++ b/crates/oxc_parser/src/lexer/search.rs @@ -394,6 +394,7 @@ macro_rules! byte_search { $table.use_table(); let mut pos = $start; + #[allow(unused_unsafe)] // Silence warnings if macro called in unsafe code loop { if pos.addr() <= $lexer.source.end_for_batch_search_addr() { // Search a batch of `SEARCH_BATCH_SIZE` bytes. diff --git a/crates/oxc_parser/src/lexer/string.rs b/crates/oxc_parser/src/lexer/string.rs index 21fc5d9c6..c8c9297a8 100644 --- a/crates/oxc_parser/src/lexer/string.rs +++ b/crates/oxc_parser/src/lexer/string.rs @@ -1,6 +1,6 @@ use super::{ + cold_branch, search::{byte_search, safe_byte_match_table, SafeByteMatchTable}, - source::SourcePosition, Kind, Lexer, LexerContext, Span, Token, }; use crate::diagnostics; @@ -16,86 +16,69 @@ static DOUBLE_QUOTE_STRING_END_TABLE: SafeByteMatchTable = static SINGLE_QUOTE_STRING_END_TABLE: SafeByteMatchTable = safe_byte_match_table!(|b| matches!(b, b'\'' | b'\r' | b'\n' | b'\\')); -impl<'a> Lexer<'a> { - /// 12.9.4 String Literals - - /// Read string literal delimited with `"`. - /// # SAFETY - /// Next character must be `"`. - pub(super) unsafe fn read_string_literal_double_quote(&mut self) -> Kind { - if self.context == LexerContext::JsxAttributeValue { - self.consume_char(); - self.read_jsx_string_literal('"') - } else { - // SAFETY: `DOUBLE_QUOTE_STRING_END_TABLE` matches all non-ASCII bytes - self.read_string_literal(b'"', &DOUBLE_QUOTE_STRING_END_TABLE) - } - } - - /// Read string literal delimited with `'`. - /// # SAFETY - /// Next character must be `'`. 
- pub(super) unsafe fn read_string_literal_single_quote(&mut self) -> Kind { - if self.context == LexerContext::JsxAttributeValue { - self.consume_char(); - self.read_jsx_string_literal('\'') - } else { - // SAFETY: `SINGLE_QUOTE_STRING_END_TABLE` matches all non-ASCII bytes - self.read_string_literal(b'\'', &SINGLE_QUOTE_STRING_END_TABLE) - } - } - - /// Read string literal. - /// # SAFETY - /// Next byte must be ASCII. - unsafe fn read_string_literal(&mut self, delimiter: u8, table: &SafeByteMatchTable) -> Kind { +/// Macro to handle a string literal. +/// +/// # SAFETY +/// `$delimiter` must be an ASCII byte. +/// Next char in `lexer.source` must be ASCII. +/// `$table` must be a `SafeByteMatchTable`. +/// `$table` must only match `$delimiter`, '\', '\r' or '\n'. +macro_rules! handle_string_literal { + ($lexer:ident, $delimiter:expr, $table:ident) => {{ // Skip opening quote. // SAFETY: Caller guarantees next byte is ASCII, so safe to advance past it. - let after_opening_quote = unsafe { self.source.position().add(1) }; + let after_opening_quote = $lexer.source.position().add(1); // Consume bytes which are part of identifier byte_search! { - lexer: self, - table: table, + lexer: $lexer, + table: $table, start: after_opening_quote, handle_match: |next_byte| { // Found a matching byte. // Either end of string found, or a line break, or `\` escape. - if next_byte == delimiter { - self.consume_char(); - return Kind::Str; + match next_byte { + $delimiter => { + // SAFETY: `handle_match` is only called if there's a byte to consume, + // and `next_byte` is the next byte in `lexer.source`. + // Macro user guarantees delimiter is ASCII, so consuming it cannot move + // `lexer.source` off a UTF-8 character boundary. 
+ $lexer.source.next_byte_unchecked(); + Kind::Str + }, + b'\\' => { + cold_branch(|| { + handle_string_literal_escape!($lexer, $delimiter, $table, after_opening_quote) + }) + }, + b'\r' | b'\n' => { + // This is impossible in valid JS, so cold path + cold_branch(|| { + $lexer.consume_char(); + $lexer.error(diagnostics::UnterminatedString($lexer.unterminated_range())); + Kind::Undetermined + }) + }, + // SAFETY: Macro user guarantees `$table` does not match any other bytes + _ => assert_unchecked::unreachable_unchecked!() } - - if next_byte == b'\\' { - return self.string_literal_on_escape(delimiter, table, after_opening_quote); - } - - debug_assert!(matches!(next_byte, b'\r' | b'\n')); - self.consume_char(); - self.error(diagnostics::UnterminatedString(self.unterminated_range())); - Kind::Undetermined }, handle_eof: || { - self.error(diagnostics::UnterminatedString(self.unterminated_range())); + $lexer.error(diagnostics::UnterminatedString($lexer.unterminated_range())); Kind::Undetermined }, }; - } + }}; +} - /// Process string literal when `\` escape found. - #[cold] - fn string_literal_on_escape( - &mut self, - delimiter: u8, - table: &SafeByteMatchTable, - after_opening_quote: SourcePosition, - ) -> Kind { +macro_rules! handle_string_literal_escape { + ($lexer:ident, $delimiter:expr, $table:ident, $after_opening_quote:ident) => {{ // Create arena string to hold unescaped string. // We don't know how long string will end up being. Take a guess that total length // will be double what we've seen so far, or `MIN_ESCAPED_STR_LEN` minimum. - let so_far = self.source.str_from_pos_to_current(after_opening_quote); + let so_far = $lexer.source.str_from_pos_to_current($after_opening_quote); let capacity = max(so_far.len() * 2, MIN_ESCAPED_STR_LEN); - let mut str = String::with_capacity_in(capacity, self.allocator); + let mut str = String::with_capacity_in(capacity, $lexer.allocator); // Push chunk before `\` into `str`. 
// `bumpalo::collections::string::String::push_str` is currently expensive due to @@ -104,60 +87,103 @@ impl<'a> Lexer<'a> { 'outer: loop { // Consume `\` - let escape_start_offset = self.offset(); - self.consume_char(); + let escape_start_offset = $lexer.offset(); + $lexer.consume_char(); // Consume escape sequence and add char to `str` let mut is_valid_escape_sequence = true; - self.read_string_escape_sequence(&mut str, false, &mut is_valid_escape_sequence); + $lexer.read_string_escape_sequence(&mut str, false, &mut is_valid_escape_sequence); if !is_valid_escape_sequence { - let range = Span::new(escape_start_offset, self.offset()); - self.error(diagnostics::InvalidEscapeSequence(range)); + let range = Span::new(escape_start_offset, $lexer.offset()); + $lexer.error(diagnostics::InvalidEscapeSequence(range)); } // Consume bytes until reach end of string, line break, or another escape - let chunk_start = self.source.position(); - while let Some(b) = self.source.peek_byte() { - if !table.matches(b) { - // SAFETY: A byte is available, as we just peeked it. - // This may put `source`'s position on a UTF-8 continuation byte, which violates - // `Source`'s invariant temporarily, but the guarantees of `SafeByteMatchTable` - // mean `table.matches(b)` will always return `true` in a pattern where - // we can't exit this loop without `source` being positioned on a UTF-8 character - // boundary again. - unsafe { self.source.next_byte_unchecked() }; - continue; - } + let chunk_start = $lexer.source.position(); + while let Some(b) = $lexer.source.peek_byte() { + match b { + b if !$table.matches(b) => { + // SAFETY: A byte is available, as we just peeked it. + // This may put `source`'s position on a UTF-8 continuation byte, which violates + // `Source`'s invariant temporarily, but the guarantees of `SafeByteMatchTable` + // mean `!table.matches(b)` on this branch prevents exiting this loop until + // `source` is positioned on a UTF-8 character boundary again. 
+ unsafe { $lexer.source.next_byte_unchecked() }; + continue; + } + b if b == $delimiter => { + // End of string found. Push last chunk to `str`. + let chunk = $lexer.source.str_from_pos_to_current(chunk_start); + str.push_str(chunk); - if b == delimiter { - // End of string found. Push last chunk to `str`, and consume closing quote. - let chunk = self.source.str_from_pos_to_current(chunk_start); - str.push_str(chunk); - self.consume_char(); - break 'outer; + // Consume closing quote. + // SAFETY: Caller guarantees delimiter is ASCII, so consuming it cannot move + // `lexer.source` off a UTF-8 character boundary + $lexer.source.next_byte_unchecked(); + break 'outer; + } + b'\\' => { + // Another escape found. Push last chunk to `str`, and loop back to handle escape. + let chunk = $lexer.source.str_from_pos_to_current(chunk_start); + str.push_str(chunk); + continue 'outer; + } + b'\r' | b'\n' => { + // This is impossible in valid JS, so cold path + return cold_branch(|| { + $lexer.consume_char(); + $lexer.error(diagnostics::UnterminatedString($lexer.unterminated_range())); + Kind::Undetermined + }); + } + // SAFETY: Caller guarantees `table` does not match any other bytes + _ => assert_unchecked::unreachable_unchecked!(), } - - if b == b'\\' { - // Another escape found. Push last chunk to `str`, and loop back to handle escape. 
- let chunk = self.source.str_from_pos_to_current(chunk_start); - str.push_str(chunk); - continue 'outer; - } - - debug_assert!(matches!(b, b'\r' | b'\n')); - self.consume_char(); - break; } // EOF - self.error(diagnostics::UnterminatedString(self.unterminated_range())); + $lexer.error(diagnostics::UnterminatedString($lexer.unterminated_range())); return Kind::Undetermined; } // Convert `str` to arena slice and save to `escaped_strings` - self.save_string(true, str.into_bump_str()); + $lexer.save_string(true, str.into_bump_str()); Kind::Str + }} +} + +impl<'a> Lexer<'a> { + /// 12.9.4 String Literals + + /// Read string literal delimited with `"`. + /// # SAFETY + /// Next character must be `"`. + pub(super) unsafe fn read_string_literal_double_quote(&mut self) -> Kind { + if self.context == LexerContext::JsxAttributeValue { + // SAFETY: Caller guarantees next char is `"` + self.source.next_byte_unchecked(); + self.read_jsx_string_literal('"') + } else { + // SAFETY: Caller guarantees next char is `"`, which is ASCII. + // b'"' is an ASCII byte. `DOUBLE_QUOTE_STRING_END_TABLE` is a `SafeByteMatchTable`. + unsafe { handle_string_literal!(self, b'"', DOUBLE_QUOTE_STRING_END_TABLE) } + } + } + + /// Read string literal delimited with `'`. + /// # SAFETY + /// Next character must be `'`. + pub(super) unsafe fn read_string_literal_single_quote(&mut self) -> Kind { + if self.context == LexerContext::JsxAttributeValue { + // SAFETY: Caller guarantees next char is `'` + self.source.next_byte_unchecked(); + self.read_jsx_string_literal('\'') + } else { + // SAFETY: Caller guarantees next char is `'`, which is ASCII. + // b'\'' is an ASCII byte. `SINGLE_QUOTE_STRING_END_TABLE` is a `SafeByteMatchTable`. + unsafe { handle_string_literal!(self, b'\'', SINGLE_QUOTE_STRING_END_TABLE) } + } } /// Save the string if it is escaped