diff --git a/crates/oxc_parser/src/lexer/mod.rs b/crates/oxc_parser/src/lexer/mod.rs index 4502aa313..79ba772cc 100644 --- a/crates/oxc_parser/src/lexer/mod.rs +++ b/crates/oxc_parser/src/lexer/mod.rs @@ -19,7 +19,6 @@ mod regex; mod search; mod source; mod string; -mod string_builder; mod template; mod token; mod trivia_builder; @@ -38,7 +37,6 @@ use oxc_span::{SourceType, Span}; use self::{ byte_handlers::handle_byte, source::{Source, SourcePosition}, - string_builder::AutoCow, trivia_builder::TriviaBuilder, }; pub use self::{ diff --git a/crates/oxc_parser/src/lexer/source.rs b/crates/oxc_parser/src/lexer/source.rs index bc832eff5..371ee3ba0 100644 --- a/crates/oxc_parser/src/lexer/source.rs +++ b/crates/oxc_parser/src/lexer/source.rs @@ -218,6 +218,20 @@ impl<'a> Source<'a> { self.str_between_positions_unchecked(pos, SourcePosition::new(self.ptr)) } + /// Get string slice from current position of `Source` up to a `SourcePosition`, without checks. + /// + /// # SAFETY + /// `pos` must not be before current position of `Source`. + /// This is always the case if both: + /// 1. `Source::set_position` has not been called since `pos` was created. + /// 2. `pos` has not been moved backwards with `SourcePosition::sub`. + #[inline] + pub(super) unsafe fn str_from_current_to_pos_unchecked(&self, pos: SourcePosition) -> &'a str { + // SAFETY: Caller guarantees `pos` is not before current position of `Source`. + // `self.ptr` is always a valid `SourcePosition` due to invariants of `Source`. + self.str_between_positions_unchecked(SourcePosition::new(self.ptr), pos) + } + /// Get string slice from a `SourcePosition` up to the end of `Source`. 
#[inline] pub(super) fn str_from_pos_to_end(&self, pos: SourcePosition) -> &'a str { diff --git a/crates/oxc_parser/src/lexer/string_builder.rs b/crates/oxc_parser/src/lexer/string_builder.rs deleted file mode 100644 index 3b6961c9f..000000000 --- a/crates/oxc_parser/src/lexer/string_builder.rs +++ /dev/null @@ -1,74 +0,0 @@ -// Copied from https://github.com/mozilla-spidermonkey/jsparagus/blob/master/crates/parser/src/lexer.rs#L2256 - -use oxc_allocator::String; - -use crate::lexer::Lexer; - -pub struct AutoCow<'a> { - pub start: &'a str, - pub value: Option<String<'a>>, -} - -impl<'a> AutoCow<'a> { - pub fn new(lexer: &Lexer<'a>) -> Self { - let start = lexer.remaining(); - AutoCow { start, value: None } - } - - // Push a char that matches `lexer.next_char()`. - pub fn push_matching(&mut self, c: char) { - if let Some(text) = &mut self.value { - text.push(c); - } - } - - // Push a different character than `lexer.next_char()`. - // force_allocation_without_current_ascii_char must be called before this. - pub fn push_different(&mut self, c: char) { - debug_assert!(self.value.is_some()); - self.value.as_mut().unwrap().push(c); - } - - // Force allocation of a String, excluding the current ASCII character, - // and return the reference to it - pub fn get_mut_string_without_current_ascii_char<'b>( - &'b mut self, - lexer: &Lexer<'a>, - ) -> &'b mut String<'a> { - self.force_allocation_without_current_ascii_char(lexer); - self.value.as_mut().unwrap() - } - - // Force allocation of a String, excluding the current ASCII character. 
- pub fn force_allocation_without_current_ascii_char(&mut self, lexer: &Lexer<'a>) { - if self.value.is_some() { - return; - } - self.value = Some(String::from_str_in( - &self.start[..self.start.len() - lexer.remaining().len() - 1], - lexer.allocator, - )); - } - - // Check if the string contains a different character, such as an escape sequence - pub fn has_escape(&self) -> bool { - self.value.is_some() - } - - // TODO: Delete this if not using it - #[allow(dead_code)] - pub fn finish(mut self, lexer: &Lexer<'a>) -> &'a str { - match self.value.take() { - Some(s) => s.into_bump_str(), - None => &self.start[..self.start.len() - lexer.remaining().len()], - } - } - - // Just like finish, but without pushing current char. - pub fn finish_without_push(mut self, lexer: &Lexer<'a>) -> &'a str { - match self.value.take() { - Some(s) => s.into_bump_str(), - None => &self.start[..self.start.len() - lexer.remaining().len() - 1], - } - } -} diff --git a/crates/oxc_parser/src/lexer/template.rs b/crates/oxc_parser/src/lexer/template.rs index 7ebb41943..550061f60 100644 --- a/crates/oxc_parser/src/lexer/template.rs +++ b/crates/oxc_parser/src/lexer/template.rs @@ -1,47 +1,318 @@ -use super::{AutoCow, Kind, Lexer, Token}; +use super::{ + cold_branch, + search::{byte_search, safe_byte_match_table, SafeByteMatchTable}, + Kind, Lexer, SourcePosition, Token, +}; use crate::diagnostics; -use oxc_syntax::identifier::{CR, LF}; +use std::cmp::max; + +use oxc_allocator::String; + +const MIN_ESCAPED_TEMPLATE_LIT_LEN: usize = 16; + +static TEMPLATE_LITERAL_TABLE: SafeByteMatchTable = + safe_byte_match_table!(|b| matches!(b, b'$' | b'`' | b'\r' | b'\\')); impl<'a> Lexer<'a> { /// 12.8.6 Template Literal Lexical Components + + /// Read template literal component. + /// + /// This function handles the common case where template contains no escapes or `\r` characters + /// and so does not require saving to `lexer.escaped_templates`. 
+ /// If an escape or `\r` is found, control is passed to `template_literal_escaped` which builds + /// the unescaped string. This division keeps the path for common case as fast as possible. pub(super) fn read_template_literal(&mut self, substitute: Kind, tail: Kind) -> Kind { - let mut builder = AutoCow::new(self); - let mut is_valid_escape_sequence = true; - while let Some(c) = self.next_char() { - match c { - '$' if self.peek() == Some('{') => { - self.save_template_string( - is_valid_escape_sequence, - builder.has_escape(), - builder.finish_without_push(self), - ); - self.consume_char(); - return substitute; - } - '`' => { - self.save_template_string( - is_valid_escape_sequence, - builder.has_escape(), - builder.finish_without_push(self), - ); - return tail; - } - CR => { - builder.force_allocation_without_current_ascii_char(self); - if self.next_eq(LF) { - builder.push_different(LF); + let mut ret = substitute; + + byte_search! { + lexer: self, + table: TEMPLATE_LITERAL_TABLE, + continue_if: |next_byte, pos| { + match next_byte { + b'$' => { + // SAFETY: Next byte is `$` which is ASCII, so after it is a UTF-8 char boundary + let after_dollar = unsafe { pos.add(1) }; + if after_dollar.addr() < self.source.end_addr() { + // If `${`, exit. + // SAFETY: Have checked there's at least 1 further byte to read. + if unsafe { after_dollar.read() } == b'{' { + // Skip `${` and stop searching. + // SAFETY: Consuming `${` leaves `pos` on a UTF-8 char boundary. + pos = unsafe { after_dollar.add(1) }; + false + } else { + // Not `${`. Continue searching. + true + } + } else { + // This is last byte in file. Continue to `handle_eof`. + // This is illegal in valid JS, so mark this branch cold. + cold_branch(|| true) + } + }, + b'`' => { + // Skip '`' and stop searching. + // SAFETY: Char at `pos` is '`', so `pos + 1` is a UTF-8 char boundary. + pos = unsafe { pos.add(1) }; + ret = tail; + false + }, + b'\r' => { + // SAFETY: Byte at `pos` is `\r`. 
+ // `pos` has only been advanced relative to `self.source.position()`. + return unsafe { self.template_literal_carriage_return(pos, substitute, tail) }; + } + _ => { + // `TEMPLATE_LITERAL_TABLE` only matches `$`, '`', `\r` and `\` + debug_assert!(next_byte == b'\\'); + // SAFETY: Byte at `pos` is `\`. + // `pos` has only been advanced relative to `self.source.position()`. + return unsafe { self.template_literal_backslash(pos, substitute, tail) }; } } - '\\' => { - let text = builder.get_mut_string_without_current_ascii_char(self); - self.read_string_escape_sequence(text, true, &mut is_valid_escape_sequence); - } - _ => builder.push_matching(c), - } + }, + handle_match: |_next_byte, _start| { + ret + }, + handle_eof: |_start| { + self.error(diagnostics::UnterminatedString(self.unterminated_range())); + Kind::Undetermined + }, + }; + } + + /// Consume rest of template literal after a `\r` is found. + /// + /// # SAFETY + /// * Byte at `pos` must be `\r`. + /// * `pos` must not be before `self.source.position()`. + unsafe fn template_literal_carriage_return( + &mut self, + mut pos: SourcePosition<'a>, + substitute: Kind, + tail: Kind, + ) -> Kind { + // Create arena string to hold modified template literal, containing up to before `\r`. + // SAFETY: Caller guarantees `pos` is not before `self.source.position()`. + let mut str = self.template_literal_create_string(pos); + + // Skip `\r`. + // SAFETY: Caller guarantees byte at `pos` is `\r`, so `pos + 1` is a UTF-8 char boundary. + pos = pos.add(1); + + // If at EOF, exit. This is illegal in valid JS, so cold branch. 
+ if pos.addr() == self.source.end_addr() { + return cold_branch(|| { + self.source.advance_to_end(); + self.error(diagnostics::UnterminatedString(self.unterminated_range())); + Kind::Undetermined + }); } - self.error(diagnostics::UnterminatedString(self.unterminated_range())); - Kind::Undetermined + + // Start next chunk after `\r` + let chunk_start = pos; + + // If next char is `\n`, start next search after it. + // `\n` is first char of next chunk, so it'll get added to `str` when chunk is pushed. + // SAFETY: Have checked not at EOF. + if pos.read() == b'\n' { + // SAFETY: `\n` is ASCII, so advancing past it leaves `pos` on a UTF-8 char boundary + pos = pos.add(1); + } else { + // Lone `\r`: normalize to `\n`. `\r` is excluded from next chunk, so push `\n` here. + str.push('\n'); + } + + self.template_literal_escaped(str, pos, chunk_start, true, substitute, tail) + } + + /// Consume rest of template literal after a `\` escape is found. + /// + /// # SAFETY + /// * Byte at `pos` must be `\`. + /// * `pos` must not be before `self.source.position()`. + unsafe fn template_literal_backslash( + &mut self, + pos: SourcePosition<'a>, + substitute: Kind, + tail: Kind, + ) -> Kind { + // Create arena string to hold modified template literal, containing up to before `\`. + // SAFETY: Caller guarantees `pos` is not before `self.source.position()`. + let mut str = self.template_literal_create_string(pos); + + // Decode escape sequence into `str`. + // `read_string_escape_sequence` expects `self.source` to be positioned after `\`. + // SAFETY: Caller guarantees next byte is `\`, which is ASCII, so `pos + 1` is UTF-8 char boundary. 
+ let after_backslash = pos.add(1); + self.source.set_position(after_backslash); + + let mut is_valid_escape_sequence = true; + self.read_string_escape_sequence(&mut str, true, &mut is_valid_escape_sequence); + + // Continue search after escape + let after_escape = self.source.position(); + // SAFETY: `pos` and `chunk_start` are the same + self.template_literal_escaped( + str, + after_escape, + after_escape, + is_valid_escape_sequence, + substitute, + tail, + ) + } + + /// Create arena string for modified template literal, containing the template literal up to `pos`. + /// # SAFETY + /// `pos` must not be before `self.source.position()` + unsafe fn template_literal_create_string(&self, pos: SourcePosition) -> String<'a> { + // Create arena string to hold modified template literal. + // We don't know how long template literal will end up being. Take a guess that total length + // will be double what we've seen so far, or `MIN_ESCAPED_TEMPLATE_LIT_LEN` minimum. + // SAFETY: Caller guarantees `pos` is not before `self.source.position()`. + let so_far = self.source.str_from_current_to_pos_unchecked(pos); + let capacity = max(so_far.len() * 2, MIN_ESCAPED_TEMPLATE_LIT_LEN); + let mut str = String::with_capacity_in(capacity, self.allocator); + str.push_str(so_far); + str + } + + /// Process template literal after `\r` or `\` found. + /// # SAFETY + /// `chunk_start` must not be after `pos`. + unsafe fn template_literal_escaped( + &mut self, + mut str: String<'a>, + pos: SourcePosition<'a>, + mut chunk_start: SourcePosition<'a>, + mut is_valid_escape_sequence: bool, + substitute: Kind, + tail: Kind, + ) -> Kind { + let mut ret = substitute; + + byte_search! { + lexer: self, + table: TEMPLATE_LITERAL_TABLE, + start: pos, + continue_if: |next_byte, pos| { + if next_byte == b'$' { + // SAFETY: Next byte is `$` which is ASCII, so after it is a UTF-8 char boundary + let after_dollar = pos.add(1); + if after_dollar.addr() < self.source.end_addr() { + // If `${`, exit. 
+ // SAFETY: Have checked there's at least 1 further byte to read. + if after_dollar.read() == b'{' { + // Add last chunk to `str`. + // SAFETY: Caller guarantees `chunk_start` is not after `pos` at start of + // this function. `pos` only increases during searching. + // Where `chunk_start` is updated, it's always before or equal to `pos`. + // So `chunk_start` cannot be after `pos`. + let chunk = self.source.str_between_positions_unchecked(chunk_start, pos); + str.push_str(chunk); + + // Skip `${` and stop searching. + // SAFETY: Consuming `${` leaves `pos` on a UTF-8 char boundary. + pos = after_dollar.add(1); + false + } else { + // Not `${`. Continue searching. + true + } + } else { + // This is last byte in file. Continue to `handle_eof`. + // This is illegal in valid JS, so mark this branch cold. + cold_branch(|| true) + } + } else { + // Next byte is '`', `\r` or `\`. Add chunk up to before this char to `str`. + // SAFETY: Caller guarantees `chunk_start` is not after `pos` at start of + // this function. `pos` only increases during searching. + // Where `chunk_start` is updated, it's always before or equal to `pos`. + // So `chunk_start` cannot be after `pos`. + let chunk = self.source.str_between_positions_unchecked(chunk_start, pos); + str.push_str(chunk); + + match next_byte { + b'`' => { + // Skip '`' and stop searching. + // SAFETY: Byte at `pos` is '`' (ASCII), so `pos + 1` is a UTF-8 char boundary. + pos = pos.add(1); + ret = tail; + false + } + b'\r' => { + // Set next chunk to start after `\r`. + // SAFETY: Next byte is `\r` which is ASCII, so after it is a UTF-8 char boundary. + // This temporarily puts `chunk_start` 1 byte after `pos`, but `byte_search!` macro + // increments `pos` when return `true` from `continue_if`, so `pos` will be + // brought up to `chunk_start` again. + chunk_start = pos.add(1); + + if chunk_start.addr() < self.source.end_addr() { + // If next char is `\n`, start next search after it. 
+ // NB: `byte_search!` macro already advances `pos` by 1, so only advance + // by 1 here, so that in total we skip 2 bytes for `\r\n`. + // No need to push `\n` to `str`, as it's 1st char of next chunk, + // and will be added to `str` when next chunk is pushed. + if chunk_start.read() == b'\n' { + pos = chunk_start; + } else { + // Lone `\r`: normalize to `\n`. `\r` is excluded from next chunk, so push `\n` here. + str.push('\n'); + } + } else { + // This is last byte in file. Continue to `handle_eof`. + // This is illegal in valid JS, so mark this branch cold. + cold_branch(|| {}); + } + + // Continue searching + true + } + _ => { + // `TEMPLATE_LITERAL_TABLE` only matches `$`, '`', `\r` and `\` + debug_assert!(next_byte == b'\\'); + + // Decode escape sequence into `str`. + // `read_string_escape_sequence` expects `self.source` to be positioned after `\`. + // SAFETY: Next byte is `\`, which is ASCII, so `pos + 1` is UTF-8 char boundary. + let after_backslash = pos.add(1); + self.source.set_position(after_backslash); + self.read_string_escape_sequence(&mut str, true, &mut is_valid_escape_sequence); + + // Start next chunk after escape sequence + chunk_start = self.source.position(); + assert!(chunk_start.addr() >= after_backslash.addr()); + + // Continue search after escape sequence. + // NB: `byte_search!` macro increments `pos` when return `true`, + // so need to subtract 1 here to counteract that. + // SAFETY: Added 1 to `pos` above, and checked `chunk_start` hasn't moved + // backwards from that, so subtracting 1 again is within bounds. 
+ pos = chunk_start.sub(1); + + // Continue searching + true + } + } + } + }, + handle_match: |_next_byte, _start| { + self.save_template_string( + is_valid_escape_sequence, + str.into_bump_str(), + ); + ret + }, + handle_eof: |_start| { + self.error(diagnostics::UnterminatedString(self.unterminated_range())); + Kind::Undetermined + }, + }; } /// Re-tokenize the current `}` token for `TemplateSubstitutionTail` @@ -53,16 +324,8 @@ impl<'a> Lexer<'a> { self.finish_next(kind) } - /// Save the template if it is escaped - fn save_template_string( - &mut self, - is_valid_escape_sequence: bool, - has_escape: bool, - s: &'a str, - ) { - if !has_escape { - return; - } + /// Save escaped template string + fn save_template_string(&mut self, is_valid_escape_sequence: bool, s: &'a str) { self.escaped_templates.insert(self.token.start, is_valid_escape_sequence.then_some(s)); self.token.escaped = true; }