mirror of
https://github.com/danbulant/oxc
synced 2026-05-24 20:32:10 +00:00
perf(parser): lex strings as bytes (#2357)
Lex string literals as bytes, using same techniques as for identifiers. Handling escapes could be optimized a bit more, and maybe I'll return to that, but as escapes are fairly rare, it wouldn't be the biggest gain.
This commit is contained in:
parent
1baf9c33d2
commit
c0d1d6b08a
3 changed files with 168 additions and 41 deletions
|
|
@ -1,4 +1,4 @@
|
||||||
use super::{Kind, Lexer, LexerContext};
|
use super::{Kind, Lexer};
|
||||||
use crate::diagnostics;
|
use crate::diagnostics;
|
||||||
|
|
||||||
#[allow(clippy::unnecessary_safety_comment)]
|
#[allow(clippy::unnecessary_safety_comment)]
|
||||||
|
|
@ -21,7 +21,7 @@ static BYTE_HANDLERS: [ByteHandler; 256] = [
|
||||||
// 0 1 2 3 4 5 6 7 8 9 A B C D E F //
|
// 0 1 2 3 4 5 6 7 8 9 A B C D E F //
|
||||||
ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, SPS, LIN, ISP, ISP, LIN, ERR, ERR, // 0
|
ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, SPS, LIN, ISP, ISP, LIN, ERR, ERR, // 0
|
||||||
ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, // 1
|
ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, // 1
|
||||||
SPS, EXL, QOT, HAS, IDT, PRC, AMP, QOT, PNO, PNC, ATR, PLS, COM, MIN, PRD, SLH, // 2
|
SPS, EXL, QOD, HAS, IDT, PRC, AMP, QOS, PNO, PNC, ATR, PLS, COM, MIN, PRD, SLH, // 2
|
||||||
ZER, DIG, DIG, DIG, DIG, DIG, DIG, DIG, DIG, DIG, COL, SEM, LSS, EQL, GTR, QST, // 3
|
ZER, DIG, DIG, DIG, DIG, DIG, DIG, DIG, DIG, DIG, COL, SEM, LSS, EQL, GTR, QST, // 3
|
||||||
AT_, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, // 4
|
AT_, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, // 4
|
||||||
IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, BTO, ESC, BTC, CRT, IDT, // 5
|
IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, BTO, ESC, BTC, CRT, IDT, // 5
|
||||||
|
|
@ -220,14 +220,16 @@ ascii_byte_handler!(EXL(lexer) {
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
// ' "
|
// "
|
||||||
ascii_byte_handler!(QOT(lexer) {
|
ascii_byte_handler!(QOD(lexer) {
|
||||||
let c = lexer.consume_char();
|
// SAFETY: This function is only called for `"`
|
||||||
if lexer.context == LexerContext::JsxAttributeValue {
|
unsafe { lexer.read_string_literal_double_quote() }
|
||||||
lexer.read_jsx_string_literal(c)
|
});
|
||||||
} else {
|
|
||||||
lexer.read_string_literal(c)
|
// '
|
||||||
}
|
ascii_byte_handler!(QOS(lexer) {
|
||||||
|
// SAFETY: This function is only called for `'`
|
||||||
|
unsafe { lexer.read_string_literal_single_quote() }
|
||||||
});
|
});
|
||||||
|
|
||||||
// #
|
// #
|
||||||
|
|
|
||||||
|
|
@ -362,7 +362,7 @@ impl<'a> Source<'a> {
|
||||||
/// In particular, safe methods `Source::next_char`, `Source::peek_char`, and `Source::remaining`
|
/// In particular, safe methods `Source::next_char`, `Source::peek_char`, and `Source::remaining`
|
||||||
/// are *not* safe to call until one of above conditions is satisfied.
|
/// are *not* safe to call until one of above conditions is satisfied.
|
||||||
#[inline]
|
#[inline]
|
||||||
unsafe fn next_byte_unchecked(&mut self) -> u8 {
|
pub(super) unsafe fn next_byte_unchecked(&mut self) -> u8 {
|
||||||
// SAFETY: Caller guarantees not at end of file i.e. `ptr != end`.
|
// SAFETY: Caller guarantees not at end of file i.e. `ptr != end`.
|
||||||
// Methods of this type provide no way for `ptr` to be before `start` or after `end`.
|
// Methods of this type provide no way for `ptr` to be before `start` or after `end`.
|
||||||
// Therefore always valid to read a byte from `ptr`, and incrementing `ptr` cannot result
|
// Therefore always valid to read a byte from `ptr`, and incrementing `ptr` cannot result
|
||||||
|
|
|
||||||
|
|
@ -1,40 +1,165 @@
|
||||||
use super::{AutoCow, Kind, Lexer, Span, Token};
|
use super::{
|
||||||
|
search::{byte_search, safe_byte_match_table, SafeByteMatchTable},
|
||||||
|
source::SourcePosition,
|
||||||
|
Kind, Lexer, LexerContext, Span, Token,
|
||||||
|
};
|
||||||
use crate::diagnostics;
|
use crate::diagnostics;
|
||||||
|
|
||||||
|
use oxc_allocator::String;
|
||||||
|
use std::cmp::max;
|
||||||
|
|
||||||
|
const MIN_ESCAPED_STR_LEN: usize = 16;
|
||||||
|
|
||||||
|
static DOUBLE_QUOTE_STRING_END_TABLE: SafeByteMatchTable =
|
||||||
|
safe_byte_match_table!(|b| matches!(b, b'"' | b'\r' | b'\n' | b'\\'));
|
||||||
|
|
||||||
|
static SINGLE_QUOTE_STRING_END_TABLE: SafeByteMatchTable =
|
||||||
|
safe_byte_match_table!(|b| matches!(b, b'\'' | b'\r' | b'\n' | b'\\'));
|
||||||
|
|
||||||
impl<'a> Lexer<'a> {
|
impl<'a> Lexer<'a> {
|
||||||
/// 12.9.4 String Literals
|
/// 12.9.4 String Literals
|
||||||
pub(super) fn read_string_literal(&mut self, delimiter: char) -> Kind {
|
|
||||||
let mut builder = AutoCow::new(self);
|
/// Read string literal delimited with `"`.
|
||||||
loop {
|
/// # SAFETY
|
||||||
match self.next_char() {
|
/// Next character must be `"`.
|
||||||
None | Some('\r' | '\n') => {
|
pub(super) unsafe fn read_string_literal_double_quote(&mut self) -> Kind {
|
||||||
self.error(diagnostics::UnterminatedString(self.unterminated_range()));
|
if self.context == LexerContext::JsxAttributeValue {
|
||||||
return Kind::Undetermined;
|
self.consume_char();
|
||||||
}
|
self.read_jsx_string_literal('"')
|
||||||
Some(c @ ('"' | '\'')) => {
|
} else {
|
||||||
if c == delimiter {
|
// SAFETY: Caller guarantees next char is `"`, which is ASCII
|
||||||
self.save_string(builder.has_escape(), builder.finish_without_push(self));
|
self.read_string_literal(b'"', &DOUBLE_QUOTE_STRING_END_TABLE)
|
||||||
return Kind::Str;
|
|
||||||
}
|
|
||||||
builder.push_matching(c);
|
|
||||||
}
|
|
||||||
Some('\\') => {
|
|
||||||
let start = self.offset() - 1;
|
|
||||||
let text = builder.get_mut_string_without_current_ascii_char(self);
|
|
||||||
let mut is_valid_escape_sequence = true;
|
|
||||||
self.read_string_escape_sequence(text, false, &mut is_valid_escape_sequence);
|
|
||||||
if !is_valid_escape_sequence {
|
|
||||||
let range = Span::new(start, self.offset());
|
|
||||||
self.error(diagnostics::InvalidEscapeSequence(range));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Some(c) => {
|
|
||||||
builder.push_matching(c);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Read string literal delimited with `'`.
|
||||||
|
/// # SAFETY
|
||||||
|
/// Next character must be `'`.
|
||||||
|
pub(super) unsafe fn read_string_literal_single_quote(&mut self) -> Kind {
|
||||||
|
if self.context == LexerContext::JsxAttributeValue {
|
||||||
|
self.consume_char();
|
||||||
|
self.read_jsx_string_literal('\'')
|
||||||
|
} else {
|
||||||
|
// SAFETY: Caller guarantees next char is `'`, which is ASCII
|
||||||
|
self.read_string_literal(b'\'', &SINGLE_QUOTE_STRING_END_TABLE)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Read string literal.
|
||||||
|
/// # SAFETY
|
||||||
|
/// Next byte must be ASCII.
|
||||||
|
unsafe fn read_string_literal(&mut self, delimiter: u8, table: &SafeByteMatchTable) -> Kind {
|
||||||
|
// Skip opening quote.
|
||||||
|
// SAFETY: Caller guarantees next byte is ASCII, so safe to advance past it.
|
||||||
|
let after_opening_quote = unsafe { self.source.position().add(1) };
|
||||||
|
|
||||||
|
// Consume bytes which are part of string literal
|
||||||
|
byte_search! {
|
||||||
|
lexer: self,
|
||||||
|
table: table,
|
||||||
|
start: after_opening_quote,
|
||||||
|
handle_match: |next_byte| {
|
||||||
|
// Found a matching byte.
|
||||||
|
// Either end of string found, or a line break, or `\` escape.
|
||||||
|
if next_byte == delimiter {
|
||||||
|
self.consume_char();
|
||||||
|
return Kind::Str;
|
||||||
|
}
|
||||||
|
|
||||||
|
if next_byte == b'\\' {
|
||||||
|
return self.string_literal_on_escape(delimiter, table, after_opening_quote);
|
||||||
|
}
|
||||||
|
|
||||||
|
debug_assert!(matches!(next_byte, b'\r' | b'\n'));
|
||||||
|
self.consume_char();
|
||||||
|
self.error(diagnostics::UnterminatedString(self.unterminated_range()));
|
||||||
|
Kind::Undetermined
|
||||||
|
},
|
||||||
|
handle_eof: || {
|
||||||
|
self.error(diagnostics::UnterminatedString(self.unterminated_range()));
|
||||||
|
Kind::Undetermined
|
||||||
|
},
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Process string literal when `\` escape found.
|
||||||
|
#[cold]
|
||||||
|
fn string_literal_on_escape(
|
||||||
|
&mut self,
|
||||||
|
delimiter: u8,
|
||||||
|
table: &SafeByteMatchTable,
|
||||||
|
after_opening_quote: SourcePosition,
|
||||||
|
) -> Kind {
|
||||||
|
// Create arena string to hold unescaped string.
|
||||||
|
// We don't know how long string will end up being. Take a guess that total length
|
||||||
|
// will be double what we've seen so far, or `MIN_ESCAPED_STR_LEN` minimum.
|
||||||
|
let so_far = self.source.str_from_pos_to_current(after_opening_quote);
|
||||||
|
let capacity = max(so_far.len() * 2, MIN_ESCAPED_STR_LEN);
|
||||||
|
let mut str = String::with_capacity_in(capacity, self.allocator);
|
||||||
|
|
||||||
|
// Push chunk before `\` into `str`.
|
||||||
|
// `bumpalo::collections::string::String::push_str` is currently expensive due to
|
||||||
|
// inefficiency in bumpalo's implementation. But best we have right now.
|
||||||
|
str.push_str(so_far);
|
||||||
|
|
||||||
|
'outer: loop {
|
||||||
|
// Consume `\`
|
||||||
|
let escape_start_offset = self.offset();
|
||||||
|
self.consume_char();
|
||||||
|
|
||||||
|
// Consume escape sequence and add char to `str`
|
||||||
|
let mut is_valid_escape_sequence = true;
|
||||||
|
self.read_string_escape_sequence(&mut str, false, &mut is_valid_escape_sequence);
|
||||||
|
if !is_valid_escape_sequence {
|
||||||
|
let range = Span::new(escape_start_offset, self.offset());
|
||||||
|
self.error(diagnostics::InvalidEscapeSequence(range));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Consume bytes until reach end of string, line break, or another escape
|
||||||
|
let chunk_start = self.source.position();
|
||||||
|
while let Some(b) = self.source.peek_byte() {
|
||||||
|
if !table.matches(b) {
|
||||||
|
// SAFETY: A byte is available, as we just peeked it.
|
||||||
|
// This may put `source`'s position on a UTF-8 continuation byte, which violates
|
||||||
|
// `Source`'s invariant temporarily, but the guarantees of `SafeByteMatchTable`
|
||||||
|
// mean `table.matches(b)` will always return `true` in a pattern where
|
||||||
|
// we can't exit this loop without `source` being positioned on a UTF-8 character
|
||||||
|
// boundary again.
|
||||||
|
unsafe { self.source.next_byte_unchecked() };
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if b == delimiter {
|
||||||
|
// End of string found. Push last chunk to `str`, and consume closing quote.
|
||||||
|
let chunk = self.source.str_from_pos_to_current(chunk_start);
|
||||||
|
str.push_str(chunk);
|
||||||
|
self.consume_char();
|
||||||
|
break 'outer;
|
||||||
|
}
|
||||||
|
|
||||||
|
if b == b'\\' {
|
||||||
|
// Another escape found. Push last chunk to `str`, and loop back to handle escape.
|
||||||
|
let chunk = self.source.str_from_pos_to_current(chunk_start);
|
||||||
|
str.push_str(chunk);
|
||||||
|
continue 'outer;
|
||||||
|
}
|
||||||
|
|
||||||
|
debug_assert!(matches!(b, b'\r' | b'\n'));
|
||||||
|
self.consume_char();
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// EOF or line break: string was not terminated
|
||||||
|
self.error(diagnostics::UnterminatedString(self.unterminated_range()));
|
||||||
|
return Kind::Undetermined;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Convert `str` to arena slice and save to `escaped_strings`
|
||||||
|
self.save_string(true, str.into_bump_str());
|
||||||
|
|
||||||
|
Kind::Str
|
||||||
|
}
|
||||||
|
|
||||||
/// Save the string if it is escaped
|
/// Save the string if it is escaped
|
||||||
/// This reduces the overall memory consumption while keeping the `Token` size small
|
/// This reduces the overall memory consumption while keeping the `Token` size small
|
||||||
/// Strings without escaped values can be retrieved as is from the token span
|
/// Strings without escaped values can be retrieved as is from the token span
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue