oxc/crates/oxc_parser/src/lexer/string.rs
Boshen 2064ae9e0a refactor(parser,diagnostic): one diagnostic struct to eliminate monomorphization of generic types (#3214)
part of #3213

We should only have one diagnostic struct instead of 353 copies of them, so we don't end up choking LLVM with 50k lines of the same code due to monomorphization.

If the proposed approach is good, then I'll start writing a codemod to turn all the existing structs to plain functions.

---

Background:

Using `--timings`, we see `oxc_linter` is slow on codegen (the purple part).

![image](https://github.com/zkat/miette/assets/1430279/c1df4f7d-90ef-4c0f-9956-2ec3194db7ca)

The crate currently contains 353 miette errors. [cargo-llvm-lines](https://github.com/dtolnay/cargo-llvm-lines) displays

```
cargo llvm-lines -p oxc_linter --lib --release

  Lines                 Copies               Function name
  -----                 ------               -------------
  830350                33438                (TOTAL)
   29252 (3.5%,  3.5%)    808 (2.4%,  2.4%)  <alloc::boxed::Box<T,A> as core::ops::drop::Drop>::drop
   23298 (2.8%,  6.3%)    353 (1.1%,  3.5%)  miette::eyreish::error::object_downcast
   19062 (2.3%,  8.6%)    706 (2.1%,  5.6%)  core::error::Error::type_id
   12610 (1.5%, 10.1%)     65 (0.2%,  5.8%)  alloc::raw_vec::RawVec<T,A>::grow_amortized
   12002 (1.4%, 11.6%)    706 (2.1%,  7.9%)  miette::eyreish::ptr::Own<T>::boxed
    9215 (1.1%, 12.7%)    115 (0.3%,  8.2%)  core::iter::traits::iterator::Iterator::try_fold
    9150 (1.1%, 13.8%)      1 (0.0%,  8.2%)  oxc_linter::rules::RuleEnum::read_json
    8825 (1.1%, 14.9%)    353 (1.1%,  9.3%)  <miette::eyreish::error::ErrorImpl<E> as core::error::Error>::source
    8822 (1.1%, 15.9%)    353 (1.1%, 10.3%)  miette::eyreish::error::<impl miette::eyreish::Report>::construct
    8119 (1.0%, 16.9%)    353 (1.1%, 11.4%)  miette::eyreish::error::object_ref
    8119 (1.0%, 17.9%)    353 (1.1%, 12.5%)  miette::eyreish::error::object_ref_stderr
    7413 (0.9%, 18.8%)    353 (1.1%, 13.5%)  <miette::eyreish::error::ErrorImpl<E> as core::fmt::Display>::fmt
    7413 (0.9%, 19.7%)    353 (1.1%, 14.6%)  miette::eyreish::ptr::Own<T>::new
    6669 (0.8%, 20.5%)     39 (0.1%, 14.7%)  alloc::raw_vec::RawVec<T,A>::try_allocate_in
    6173 (0.7%, 21.2%)    353 (1.1%, 15.7%)  miette::eyreish::error::<impl miette::eyreish::Report>::from_std
    6027 (0.7%, 21.9%)     70 (0.2%, 16.0%)  <alloc::vec::Vec<T> as alloc::vec::spec_from_iter_nested::SpecFromIterNested<T,I>>::from_iter
    6001 (0.7%, 22.7%)    353 (1.1%, 17.0%)  miette::eyreish::error::object_drop
    6001 (0.7%, 23.4%)    353 (1.1%, 18.1%)  miette::eyreish::error::object_drop_front
    5648 (0.7%, 24.1%)    353 (1.1%, 19.1%)  <miette::eyreish::error::ErrorImpl<E> as core::fmt::Debug>::fmt
```

It's totalling more than 50k LLVM lines, and is putting pressure on rustc codegen (the purple part on `oxc_linter` in the image above).

---

Looking at https://github.com/zkat/miette/blob/main/src/eyreish/error.rs, it's pretty obvious that the generics can expand out to lots of code.
2024-05-11 04:56:22 +00:00

202 lines
8.2 KiB
Rust

use super::{
cold_branch,
search::{byte_search, safe_byte_match_table, SafeByteMatchTable},
Kind, Lexer, LexerContext, Span, Token,
};
use crate::diagnostics;
use oxc_allocator::String;
use std::cmp::max;
/// Minimum capacity of the arena string allocated to hold the unescaped contents
/// of a string literal which contains an escape sequence.
const MIN_ESCAPED_STR_LEN: usize = 16;
/// Bytes which stop the scan of a `"`-delimited string literal:
/// the closing `"`, a line break (`\r` or `\n`), or the start of an escape (`\`).
static DOUBLE_QUOTE_STRING_END_TABLE: SafeByteMatchTable =
safe_byte_match_table!(|b| matches!(b, b'"' | b'\r' | b'\n' | b'\\'));
/// Bytes which stop the scan of a `'`-delimited string literal:
/// the closing `'`, a line break (`\r` or `\n`), or the start of an escape (`\`).
static SINGLE_QUOTE_STRING_END_TABLE: SafeByteMatchTable =
safe_byte_match_table!(|b| matches!(b, b'\'' | b'\r' | b'\n' | b'\\'));
/// Macro to handle a string literal.
///
/// Must be expanded inside a function returning `Kind`, in an `unsafe` context.
///
/// * In `LexerContext::JsxAttributeValue` context, returns the result of
///   `read_jsx_string_literal` from the enclosing function.
/// * If the closing `$delimiter` is found before any `\`, consumes it and evaluates to `Kind::Str`.
/// * If a `\` is found first, hands over to `handle_string_literal_escape!`.
/// * If a line break or EOF is found first, reports an "unterminated string" error
///   and produces `Kind::Undetermined`.
///
/// # SAFETY
/// `$delimiter` must be an ASCII byte.
/// Next char in `lexer.source` must be ASCII.
/// `$table` must be a `SafeByteMatchTable`.
/// `$table` must only match `$delimiter`, '\', '\r' or '\n'.
macro_rules! handle_string_literal {
($lexer:ident, $delimiter:expr, $table:ident) => {{
debug_assert!($delimiter.is_ascii());
if $lexer.context == LexerContext::JsxAttributeValue {
// SAFETY: Caller guarantees `$delimiter` is ASCII, and next char is ASCII
return $lexer.read_jsx_string_literal($delimiter);
}
// Skip opening quote.
// SAFETY: Caller guarantees next byte is ASCII, so safe to advance past it.
let after_opening_quote = $lexer.source.position().add(1);
// Consume bytes which are part of the string literal,
// stopping at the first byte which `$table` matches.
let next_byte = byte_search! {
lexer: $lexer,
table: $table,
start: after_opening_quote,
handle_eof: {
// Reached EOF without finding a closing quote.
// This `return` exits the function the macro is expanded in.
$lexer.error(diagnostics::unterminated_string($lexer.unterminated_range()));
return Kind::Undetermined;
},
};
// Found a matching byte.
// Either end of string found, or a line break, or `\` escape.
match next_byte {
$delimiter => {
// SAFETY: Macro user guarantees delimiter is ASCII, so consuming it cannot move
// `lexer.source` off a UTF-8 character boundary.
$lexer.source.next_byte_unchecked();
Kind::Str
}
// Escape found. The string's value now differs from its source text,
// so it has to be built up separately. Treated as the cold path.
b'\\' => cold_branch(|| {
handle_string_literal_escape!($lexer, $delimiter, $table, after_opening_quote)
}),
_ => {
// Line break. This is impossible in valid JS, so cold path.
cold_branch(|| {
debug_assert!(matches!(next_byte, b'\r' | b'\n'));
$lexer.consume_char();
$lexer.error(diagnostics::unterminated_string($lexer.unterminated_range()));
Kind::Undetermined
})
}
}
}};
}
/// Macro to handle the rest of a string literal once a `\` escape has been found.
///
/// Builds the unescaped value of the string in an arena string, starting with the text
/// between `$after_opening_quote` and the current position, then alternating between
/// decoding one escape sequence and copying the chunk of plain text which follows it.
///
/// * On reaching the closing `$delimiter`, consumes it, stores the unescaped value
///   via `save_string`, and evaluates to `Kind::Str`.
/// * On reaching a line break or EOF, reports an "unterminated string" error and `return`s
///   `Kind::Undetermined` from the enclosing function or closure.
/// * An invalid escape sequence is reported as an error, but lexing of the string continues.
///
/// # SAFETY
/// `$lexer.source` must be positioned on the `\` which starts the first escape.
/// `$delimiter` must be an ASCII byte.
/// `$table` must be a `SafeByteMatchTable` which only matches `$delimiter`, '\', '\r' or '\n'.
/// `$after_opening_quote` must be the position immediately after the opening quote.
macro_rules! handle_string_literal_escape {
($lexer:ident, $delimiter:expr, $table:ident, $after_opening_quote:ident) => {{
// Create arena string to hold unescaped string.
// We don't know how long string will end up being. Take a guess that total length
// will be double what we've seen so far, or `MIN_ESCAPED_STR_LEN` minimum.
let so_far = $lexer.source.str_from_pos_to_current($after_opening_quote);
let capacity = max(so_far.len() * 2, MIN_ESCAPED_STR_LEN);
let mut str = String::with_capacity_in(capacity, $lexer.allocator);
// Push chunk before `\` into `str`.
str.push_str(so_far);
// Each turn of this loop handles one escape sequence, plus the plain text following it.
'outer: loop {
// Consume `\`
let escape_start_offset = $lexer.offset();
$lexer.consume_char();
// Consume escape sequence and add char to `str`
let mut is_valid_escape_sequence = true;
$lexer.read_string_escape_sequence(&mut str, false, &mut is_valid_escape_sequence);
if !is_valid_escape_sequence {
// Report the error, spanning from the `\` to the current position, and carry on lexing.
let range = Span::new(escape_start_offset, $lexer.offset());
$lexer.error(diagnostics::invalid_escape_sequence(range));
}
// Consume bytes until reach end of string, line break, or another escape
let chunk_start = $lexer.source.position();
while let Some(b) = $lexer.source.peek_byte() {
match b {
b if !$table.matches(b) => {
// SAFETY: A byte is available, as we just peeked it.
// This may put `source`'s position on a UTF-8 continuation byte, which violates
// `Source`'s invariant temporarily, but the guarantees of `SafeByteMatchTable`
// mean `!table.matches(b)` on this branch prevents exiting this loop until
// `source` is positioned on a UTF-8 character boundary again.
$lexer.source.next_byte_unchecked();
continue;
}
b if b == $delimiter => {
// End of string found. Push last chunk to `str`.
let chunk = $lexer.source.str_from_pos_to_current(chunk_start);
str.push_str(chunk);
// Consume closing quote.
// SAFETY: Caller guarantees delimiter is ASCII, so consuming it cannot move
// `lexer.source` off a UTF-8 character boundary
$lexer.source.next_byte_unchecked();
break 'outer;
}
b'\\' => {
// Another escape found. Push last chunk to `str`, and loop back to handle escape.
let chunk = $lexer.source.str_from_pos_to_current(chunk_start);
str.push_str(chunk);
continue 'outer;
}
_ => {
// Line break. This is impossible in valid JS, so cold path.
// The partially built `str` is not saved.
return cold_branch(|| {
debug_assert!(matches!(b, b'\r' | b'\n'));
$lexer.consume_char();
$lexer.error(diagnostics::unterminated_string($lexer.unterminated_range()));
Kind::Undetermined
});
}
}
}
// EOF: `peek_byte` returned `None` before a closing quote was found.
$lexer.error(diagnostics::unterminated_string($lexer.unterminated_range()));
return Kind::Undetermined;
}
// Convert `str` to arena slice and save to `escaped_strings`
$lexer.save_string(true, str.into_bump_str());
Kind::Str
}};
}
impl<'a> Lexer<'a> {
    /// 12.9.4 String Literals
    ///
    /// Lex a string literal which is delimited with `"`.
    ///
    /// # SAFETY
    /// The next character in the source must be `"`.
    pub(super) unsafe fn read_string_literal_double_quote(&mut self) -> Kind {
        // SAFETY: Caller promises the next char is `"`, so it is ASCII.
        // The delimiter `b'"'` is ASCII, and `DOUBLE_QUOTE_STRING_END_TABLE`
        // is a `SafeByteMatchTable`.
        unsafe { handle_string_literal!(self, b'"', DOUBLE_QUOTE_STRING_END_TABLE) }
    }

    /// Lex a string literal which is delimited with `'`.
    ///
    /// # SAFETY
    /// The next character in the source must be `'`.
    pub(super) unsafe fn read_string_literal_single_quote(&mut self) -> Kind {
        // SAFETY: Caller promises the next char is `'`, so it is ASCII.
        // The delimiter `b'\''` is ASCII, and `SINGLE_QUOTE_STRING_END_TABLE`
        // is a `SafeByteMatchTable`.
        unsafe { handle_string_literal!(self, b'\'', SINGLE_QUOTE_STRING_END_TABLE) }
    }

    /// Record the unescaped value `s` for the current token, but only when `has_escape` is set.
    ///
    /// Storing just the escaped strings keeps memory use down and `Token` small:
    /// a string without escapes can be sliced straight out of the source using the token's span.
    pub(super) fn save_string(&mut self, has_escape: bool, s: &'a str) {
        if has_escape {
            // Keyed by the token's start offset, which is what `get_string` looks up.
            let key = self.token.start;
            self.escaped_strings.insert(key, s);
            self.token.escaped = true;
        }
    }

    /// Get the string value of `token`.
    ///
    /// For a token flagged as escaped, this is the value stored by `save_string`.
    /// Otherwise it is the token's source text, minus the surrounding quotes for `Kind::Str`,
    /// or minus the leading `#` for `Kind::PrivateIdentifier`.
    pub(crate) fn get_string(&self, token: Token) -> &'a str {
        if token.escaped {
            self.escaped_strings[&token.start]
        } else {
            let (start, end) = (token.start as usize, token.end as usize);
            let raw = &self.source.whole()[start..end];
            match token.kind {
                // Drop the opening and closing quotes.
                Kind::Str => &raw[1..raw.len() - 1],
                // Drop the leading `#`.
                Kind::PrivateIdentifier => &raw[1..],
                _ => raw,
            }
        }
    }
}