mirror of
https://github.com/danbulant/oxc
synced 2026-05-25 04:42:10 +00:00
part of #3213

We should only have one diagnostic struct instead of 353 copies of them, so we don't end up choking LLVM with 50k lines of the same code due to monomorphization.

If the proposed approach is good, then I'll start writing a codemod to turn all the existing structs to plain functions.

---

Background:

Using `--timings`, we see `oxc_linter` is slow on codegen (the purple part).

The crate currently contains 353 miette errors.

[cargo-llvm-lines](https://github.com/dtolnay/cargo-llvm-lines) displays

```
cargo llvm-lines -p oxc_linter --lib --release

Lines Copies Function name
----- ------ -------------
830350 33438 (TOTAL)
29252 (3.5%, 3.5%) 808 (2.4%, 2.4%) <alloc::boxed::Box<T,A> as core::ops::drop::Drop>::drop
23298 (2.8%, 6.3%) 353 (1.1%, 3.5%) miette::eyreish::error::object_downcast
19062 (2.3%, 8.6%) 706 (2.1%, 5.6%) core::error::Error::type_id
12610 (1.5%, 10.1%) 65 (0.2%, 5.8%) alloc::raw_vec::RawVec<T,A>::grow_amortized
12002 (1.4%, 11.6%) 706 (2.1%, 7.9%) miette::eyreish::ptr::Own<T>::boxed
9215 (1.1%, 12.7%) 115 (0.3%, 8.2%) core::iter::traits::iterator::Iterator::try_fold
9150 (1.1%, 13.8%) 1 (0.0%, 8.2%) oxc_linter::rules::RuleEnum::read_json
8825 (1.1%, 14.9%) 353 (1.1%, 9.3%) <miette::eyreish::error::ErrorImpl<E> as core::error::Error>::source
8822 (1.1%, 15.9%) 353 (1.1%, 10.3%) miette::eyreish::error::<impl miette::eyreish::Report>::construct
8119 (1.0%, 16.9%) 353 (1.1%, 11.4%) miette::eyreish::error::object_ref
8119 (1.0%, 17.9%) 353 (1.1%, 12.5%) miette::eyreish::error::object_ref_stderr
7413 (0.9%, 18.8%) 353 (1.1%, 13.5%) <miette::eyreish::error::ErrorImpl<E> as core::fmt::Display>::fmt
7413 (0.9%, 19.7%) 353 (1.1%, 14.6%) miette::eyreish::ptr::Own<T>::new
6669 (0.8%, 20.5%) 39 (0.1%, 14.7%) alloc::raw_vec::RawVec<T,A>::try_allocate_in
6173 (0.7%, 21.2%) 353 (1.1%, 15.7%) miette::eyreish::error::<impl miette::eyreish::Report>::from_std
6027 (0.7%, 21.9%) 70 (0.2%, 16.0%) <alloc::vec::Vec<T> as alloc::vec::spec_from_iter_nested::SpecFromIterNested<T,I>>::from_iter
6001 (0.7%, 22.7%) 353 (1.1%, 17.0%) miette::eyreish::error::object_drop
6001 (0.7%, 23.4%) 353 (1.1%, 18.1%) miette::eyreish::error::object_drop_front
5648 (0.7%, 24.1%) 353 (1.1%, 19.1%) <miette::eyreish::error::ErrorImpl<E> as core::fmt::Debug>::fmt
```

It's totalling more than 50k LLVM lines, and is putting pressure on rustc codegen (the purple part on `oxc_linter` in the image above).

---

It's pretty obvious from looking at https://github.com/zkat/miette/blob/main/src/eyreish/error.rs that the generics can expand out to lots of code.
202 lines
8.2 KiB
Rust
202 lines
8.2 KiB
Rust
use super::{
|
|
cold_branch,
|
|
search::{byte_search, safe_byte_match_table, SafeByteMatchTable},
|
|
Kind, Lexer, LexerContext, Span, Token,
|
|
};
|
|
use crate::diagnostics;
|
|
|
|
use oxc_allocator::String;
|
|
use std::cmp::max;
|
|
|
|
/// Lower bound, in bytes, on the initial capacity of the arena string that
/// `handle_string_literal_escape!` allocates to hold an unescaped string.
const MIN_ESCAPED_STR_LEN: usize = 16;
|
|
|
|
/// Byte table used when scanning a `"`-delimited string literal.
/// Matches the bytes that end the plain run of string content:
/// `\` (start of an escape), `\n`, `\r`, and the closing `"`.
static DOUBLE_QUOTE_STRING_END_TABLE: SafeByteMatchTable =
    safe_byte_match_table!(|b| matches!(b, b'\\' | b'\n' | b'\r' | b'"'));
|
|
|
|
/// Byte table used when scanning a `'`-delimited string literal.
/// Matches the bytes that end the plain run of string content:
/// `\` (start of an escape), `\n`, `\r`, and the closing `'`.
static SINGLE_QUOTE_STRING_END_TABLE: SafeByteMatchTable =
    safe_byte_match_table!(|b| matches!(b, b'\\' | b'\n' | b'\r' | b'\''));
|
|
|
|
/// Lex a string literal whose opening quote is the next byte of `lexer.source`.
///
/// If the lexer is in `LexerContext::JsxAttributeValue`, this `return`s the result of
/// `read_jsx_string_literal` instead. Otherwise it searches forward from just after the
/// opening quote for the first byte matched by `$table`, and then:
///
/// * closing `$delimiter`: consumes it and evaluates to `Kind::Str`.
/// * `\`: delegates to `handle_string_literal_escape!` on a cold path.
/// * line break: consumes it, reports an unterminated string, and evaluates to
///   `Kind::Undetermined`.
///
/// If the search hits EOF, an unterminated string is reported and `Kind::Undetermined`
/// is `return`ed.
///
/// # SAFETY
/// `$delimiter` must be an ASCII byte.
/// Next char in `lexer.source` must be ASCII.
/// `$table` must be a `SafeByteMatchTable`.
/// `$table` must only match `$delimiter`, '\', '\r' or '\n'.
macro_rules! handle_string_literal {
    ($lexer:ident, $delimiter:expr, $table:ident) => {{
        debug_assert!($delimiter.is_ascii());

        if $lexer.context == LexerContext::JsxAttributeValue {
            // SAFETY: Caller guarantees `$delimiter` is ASCII, and next char is ASCII
            return $lexer.read_jsx_string_literal($delimiter);
        }

        // Position of the first byte of string content, i.e. just past the opening quote.
        // SAFETY: Caller guarantees next byte is ASCII, so safe to advance past it.
        let content_start = $lexer.source.position().add(1);

        // Skip over bytes of plain string content until `$table` matches one.
        let matched = byte_search! {
            lexer: $lexer,
            table: $table,
            start: content_start,
            handle_eof: {
                $lexer.error(diagnostics::unterminated_string($lexer.unterminated_range()));
                return Kind::Undetermined;
            },
        };

        // `matched` is the closing quote, a `\` starting an escape, or a line break.
        if matched == $delimiter {
            // SAFETY: Macro user guarantees delimiter is ASCII, so consuming it cannot move
            // `lexer.source` off a UTF-8 character boundary.
            $lexer.source.next_byte_unchecked();
            Kind::Str
        } else if matched == b'\\' {
            cold_branch(|| {
                handle_string_literal_escape!($lexer, $delimiter, $table, content_start)
            })
        } else {
            // Line break. This is impossible in valid JS, so cold path.
            cold_branch(|| {
                debug_assert!(matches!(matched, b'\r' | b'\n'));
                $lexer.consume_char();
                $lexer.error(diagnostics::unterminated_string($lexer.unterminated_range()));
                Kind::Undetermined
            })
        }
    }};
}
|
|
|
|
/// Continue lexing a string literal once a `\` escape has been reached.
///
/// Expects `lexer.source` to be positioned on the `\`, and `$after_opening_quote` to be
/// the position just after the string's opening quote. Builds the unescaped string
/// in an arena `String`, alternating between decoding one escape sequence and copying
/// the following chunk of plain content, until the closing `$delimiter` is consumed.
/// The result is stored with `save_string` and the macro evaluates to `Kind::Str`.
///
/// An invalid escape sequence is reported as an error but lexing carries on.
/// A line break or EOF before the closing quote reports an unterminated string
/// and `return`s `Kind::Undetermined`.
///
/// Uses unchecked byte reads, so must be expanded in an `unsafe` context, under the same
/// conditions on `$delimiter` and `$table` as `handle_string_literal!`.
macro_rules! handle_string_literal_escape {
    ($lexer:ident, $delimiter:expr, $table:ident, $after_opening_quote:ident) => {{
        // Text between the opening quote and the first `\`.
        let prefix = $lexer.source.str_from_pos_to_current($after_opening_quote);

        // Final length is unknown. Guess double what has been seen so far,
        // but never less than `MIN_ESCAPED_STR_LEN`.
        let initial_capacity = max(prefix.len() * 2, MIN_ESCAPED_STR_LEN);
        let mut unescaped = String::with_capacity_in(initial_capacity, $lexer.allocator);
        unescaped.push_str(prefix);

        'escapes: loop {
            // Step over the `\`, remembering where the escape began for error reporting.
            let backslash_offset = $lexer.offset();
            $lexer.consume_char();

            // Decode the escape sequence, appending its value to `unescaped`.
            let mut escape_ok = true;
            $lexer.read_string_escape_sequence(&mut unescaped, false, &mut escape_ok);
            if !escape_ok {
                let span = Span::new(backslash_offset, $lexer.offset());
                $lexer.error(diagnostics::invalid_escape_sequence(span));
            }

            // Scan the plain chunk which follows, up to the closing quote, a line break,
            // or the next escape.
            let chunk_start = $lexer.source.position();
            while let Some(byte) = $lexer.source.peek_byte() {
                if !$table.matches(byte) {
                    // SAFETY: A byte is available, as we just peeked it.
                    // This may put `source`'s position on a UTF-8 continuation byte, which violates
                    // `Source`'s invariant temporarily, but the guarantees of `SafeByteMatchTable`
                    // mean `!table.matches(b)` on this branch prevents exiting this loop until
                    // `source` is positioned on a UTF-8 character boundary again.
                    $lexer.source.next_byte_unchecked();
                } else if byte == $delimiter {
                    // End of string. Flush the final chunk.
                    let chunk = $lexer.source.str_from_pos_to_current(chunk_start);
                    unescaped.push_str(chunk);

                    // Consume closing quote.
                    // SAFETY: Caller guarantees delimiter is ASCII, so consuming it cannot move
                    // `lexer.source` off a UTF-8 character boundary
                    $lexer.source.next_byte_unchecked();
                    break 'escapes;
                } else if byte == b'\\' {
                    // Another escape. Flush this chunk, then go round again to decode it.
                    let chunk = $lexer.source.str_from_pos_to_current(chunk_start);
                    unescaped.push_str(chunk);
                    continue 'escapes;
                } else {
                    // Line break. This is impossible in valid JS, so cold path.
                    return cold_branch(|| {
                        debug_assert!(matches!(byte, b'\r' | b'\n'));
                        $lexer.consume_char();
                        $lexer.error(diagnostics::unterminated_string($lexer.unterminated_range()));
                        Kind::Undetermined
                    });
                }
            }

            // Ran out of input before finding the closing quote.
            $lexer.error(diagnostics::unterminated_string($lexer.unterminated_range()));
            return Kind::Undetermined;
        }

        // Hand the finished string over as an arena `&str` and record it for the current token.
        $lexer.save_string(true, unescaped.into_bump_str());

        Kind::Str
    }};
}
|
|
|
|
impl<'a> Lexer<'a> {
|
|
/// 12.9.4 String Literals
|
|
|
|
/// Read string literal delimited with `"`.
|
|
/// # SAFETY
|
|
/// Next character must be `"`.
|
|
pub(super) unsafe fn read_string_literal_double_quote(&mut self) -> Kind {
|
|
// SAFETY: Caller guarantees next char is `"`, which is ASCII.
|
|
// b'"' is an ASCII byte. `DOUBLE_QUOTE_STRING_END_TABLE` is a `SafeByteMatchTable`.
|
|
unsafe { handle_string_literal!(self, b'"', DOUBLE_QUOTE_STRING_END_TABLE) }
|
|
}
|
|
|
|
/// Read string literal delimited with `'`.
|
|
/// # SAFETY
|
|
/// Next character must be `'`.
|
|
pub(super) unsafe fn read_string_literal_single_quote(&mut self) -> Kind {
|
|
// SAFETY: Caller guarantees next char is `'`, which is ASCII.
|
|
// b'\'' is an ASCII byte. `SINGLE_QUOTE_STRING_END_TABLE` is a `SafeByteMatchTable`.
|
|
unsafe { handle_string_literal!(self, b'\'', SINGLE_QUOTE_STRING_END_TABLE) }
|
|
}
|
|
|
|
/// Save the string if it is escaped
|
|
/// This reduces the overall memory consumption while keeping the `Token` size small
|
|
/// Strings without escaped values can be retrieved as is from the token span
|
|
pub(super) fn save_string(&mut self, has_escape: bool, s: &'a str) {
|
|
if !has_escape {
|
|
return;
|
|
}
|
|
self.escaped_strings.insert(self.token.start, s);
|
|
self.token.escaped = true;
|
|
}
|
|
|
|
pub(crate) fn get_string(&self, token: Token) -> &'a str {
|
|
if token.escaped {
|
|
return self.escaped_strings[&token.start];
|
|
}
|
|
|
|
let raw = &self.source.whole()[token.start as usize..token.end as usize];
|
|
match token.kind {
|
|
Kind::Str => {
|
|
&raw[1..raw.len() - 1] // omit surrounding quotes
|
|
}
|
|
Kind::PrivateIdentifier => {
|
|
&raw[1..] // omit leading `#`
|
|
}
|
|
_ => raw,
|
|
}
|
|
}
|
|
}
|