oxc/crates/oxc_parser/src/lexer/comment.rs
Boshen 2064ae9e0a refactor(parser,diagnostic): one diagnostic struct to eliminate monomorphization of generic types (#3214)
part of #3213

We should only have one diagnostic struct instead of 353 copies of them, so we don't end up choking LLVM with 50k lines of the same code due to monomorphization.

If the proposed approach is good, then I'll start writing a codemod to turn all the existing structs to plain functions.

---

Background:

Using `--timings`, we see `oxc_linter` is slow on codegen (the purple part).

![image](https://github.com/zkat/miette/assets/1430279/c1df4f7d-90ef-4c0f-9956-2ec3194db7ca)

The crate currently contains 353 miette errors. [cargo-llvm-lines](https://github.com/dtolnay/cargo-llvm-lines) displays

```
cargo llvm-lines -p oxc_linter --lib --release

  Lines                 Copies               Function name
  -----                 ------               -------------
  830350                33438                (TOTAL)
   29252 (3.5%,  3.5%)    808 (2.4%,  2.4%)  <alloc::boxed::Box<T,A> as core::ops::drop::Drop>::drop
   23298 (2.8%,  6.3%)    353 (1.1%,  3.5%)  miette::eyreish::error::object_downcast
   19062 (2.3%,  8.6%)    706 (2.1%,  5.6%)  core::error::Error::type_id
   12610 (1.5%, 10.1%)     65 (0.2%,  5.8%)  alloc::raw_vec::RawVec<T,A>::grow_amortized
   12002 (1.4%, 11.6%)    706 (2.1%,  7.9%)  miette::eyreish::ptr::Own<T>::boxed
    9215 (1.1%, 12.7%)    115 (0.3%,  8.2%)  core::iter::traits::iterator::Iterator::try_fold
    9150 (1.1%, 13.8%)      1 (0.0%,  8.2%)  oxc_linter::rules::RuleEnum::read_json
    8825 (1.1%, 14.9%)    353 (1.1%,  9.3%)  <miette::eyreish::error::ErrorImpl<E> as core::error::Error>::source
    8822 (1.1%, 15.9%)    353 (1.1%, 10.3%)  miette::eyreish::error::<impl miette::eyreish::Report>::construct
    8119 (1.0%, 16.9%)    353 (1.1%, 11.4%)  miette::eyreish::error::object_ref
    8119 (1.0%, 17.9%)    353 (1.1%, 12.5%)  miette::eyreish::error::object_ref_stderr
    7413 (0.9%, 18.8%)    353 (1.1%, 13.5%)  <miette::eyreish::error::ErrorImpl<E> as core::fmt::Display>::fmt
    7413 (0.9%, 19.7%)    353 (1.1%, 14.6%)  miette::eyreish::ptr::Own<T>::new
    6669 (0.8%, 20.5%)     39 (0.1%, 14.7%)  alloc::raw_vec::RawVec<T,A>::try_allocate_in
    6173 (0.7%, 21.2%)    353 (1.1%, 15.7%)  miette::eyreish::error::<impl miette::eyreish::Report>::from_std
    6027 (0.7%, 21.9%)     70 (0.2%, 16.0%)  <alloc::vec::Vec<T> as alloc::vec::spec_from_iter_nested::SpecFromIterNested<T,I>>::from_iter
    6001 (0.7%, 22.7%)    353 (1.1%, 17.0%)  miette::eyreish::error::object_drop
    6001 (0.7%, 23.4%)    353 (1.1%, 18.1%)  miette::eyreish::error::object_drop_front
    5648 (0.7%, 24.1%)    353 (1.1%, 19.1%)  <miette::eyreish::error::ErrorImpl<E> as core::fmt::Debug>::fmt
```

It totals more than 50k LLVM lines and puts pressure on rustc codegen (the purple part on `oxc_linter` in the image above).

---

Looking at https://github.com/zkat/miette/blob/main/src/eyreish/error.rs, it's pretty obvious that the generics can expand out to lots of code.
2024-05-11 04:56:22 +00:00

192 lines
9.4 KiB
Rust

use super::{
cold_branch,
search::{byte_search, safe_byte_match_table, SafeByteMatchTable},
source::SourcePosition,
Kind, Lexer,
};
use crate::diagnostics;
use oxc_syntax::identifier::is_line_terminator;
use memchr::memmem::Finder;
// Irregular line breaks - '\u{2028}' (LS) and '\u{2029}' (PS)
// In UTF-8, LS is the 3 bytes `E2 80 A8` and PS is the 3 bytes `E2 80 A9`.
// The constants below split that encoding into the shared first byte and the two trailing bytes,
// so a byte search can stop on `0xE2` and then check the next 2 bytes.
/// First UTF-8 byte shared by LS and PS. Other 3-byte chars can also start with this byte.
const LS_OR_PS_FIRST: u8 = 0xE2;
/// 2nd and 3rd UTF-8 bytes of LS (`\u{2028}`).
const LS_BYTES_2_AND_3: [u8; 2] = [0x80, 0xA8];
/// 2nd and 3rd UTF-8 bytes of PS (`\u{2029}`).
const PS_BYTES_2_AND_3: [u8; 2] = [0x80, 0xA9];
/// Byte table matching `\r`, `\n` and `0xE2` (possible start of LS / PS).
/// Used to find the end of a single-line comment.
static LINE_BREAK_TABLE: SafeByteMatchTable =
safe_byte_match_table!(|b| matches!(b, b'\r' | b'\n' | LS_OR_PS_FIRST));
/// Byte table matching `*` (possible start of `*/`) as well as `\r`, `\n` and `0xE2`.
/// Used for the first phase of skipping a multi-line comment, where line breaks still
/// need to be detected.
static MULTILINE_COMMENT_START_TABLE: SafeByteMatchTable =
safe_byte_match_table!(|b| matches!(b, b'*' | b'\r' | b'\n' | LS_OR_PS_FIRST));
impl<'a> Lexer<'a> {
/// Section 12.4 Single Line Comment
///
/// Skips the rest of a single-line comment and records it as trivia.
///
/// Searches forward with `byte_search!` over `LINE_BREAK_TABLE`, i.e. for `\r`, `\n` or `0xE2`
/// (first byte of the irregular line breaks LS / PS):
///
/// * `\r` or `\n`: the comment is recorded from `self.token.start` up to the line break, and
///   exactly 1 byte is consumed (for `\r\n` only the `\r` is consumed here).
/// * `0xE2` followed by the LS / PS trailing bytes: the comment is recorded the same way and the
///   3-byte line break is consumed.
/// * `0xE2` starting any other char: that char is skipped and the search continues.
/// * End of file: the comment is recorded up to `self.offset()` and the function returns
///   immediately, without setting `is_on_new_line`.
///
/// When a line break ended the comment, `self.token.is_on_new_line` is set to `true`.
///
/// Always returns `Kind::Skip`.
pub(super) fn skip_single_line_comment(&mut self) -> Kind {
// NOTE(review): `byte_search!` is defined in the `search` module (not visible here); presumably
// it commits the final `pos` as the lexer position once `continue_if` yields `false` - confirm.
byte_search! {
lexer: self,
table: LINE_BREAK_TABLE,
continue_if: (next_byte, pos) {
// Match found. Decide whether to continue searching.
// If this is end of comment, create trivia, and advance `pos` to after line break.
// Do that here rather than in `handle_match`, to avoid branching twice on value of
// the matched byte.
// The `!=` comparison keeps the ASCII line break case as the first branch; the `else`
// branch is the one explicitly marked cold below.
#[allow(clippy::if_not_else)]
if next_byte != LS_OR_PS_FIRST {
// `\r` or `\n`
// Comment spans from start of the current token to the line break (exclusive).
self.trivia_builder
.add_single_line_comment(self.token.start, self.source.offset_of(pos));
// SAFETY: Safe to consume `\r` or `\n` as both are ASCII
pos = unsafe { pos.add(1) };
// We've found the end. Do not continue searching.
false
} else {
// `0xE2`. Could be first byte of LS/PS, or could be some other Unicode char.
// Either way, Unicode is uncommon, so make this a cold branch.
cold_branch(|| {
// SAFETY: Next byte is `0xE2` which is always 1st byte of a 3-byte UTF-8 char.
// So safe to advance `pos` by 1 and read 2 bytes.
let next2 = unsafe { pos.add(1).read2() };
if matches!(next2, LS_BYTES_2_AND_3 | PS_BYTES_2_AND_3) {
// Irregular line break
// Comment ends at `pos`, i.e. before the LS / PS char.
self.trivia_builder
.add_single_line_comment(self.token.start, self.source.offset_of(pos));
// Advance `pos` to after this char.
// SAFETY: `0xE2` is always 1st byte of a 3-byte UTF-8 char,
// so consuming 3 bytes will place `pos` on next UTF-8 char boundary.
pos = unsafe { pos.add(3) };
// We've found the end. Do not continue searching.
false
} else {
// Some other Unicode char beginning with `0xE2`.
// Skip 3 bytes (macro skips 1 already, so skip 2 here), and continue searching.
// SAFETY: `0xE2` is always 1st byte of a 3-byte UTF-8 char,
// so consuming 3 bytes will place `pos` on next UTF-8 char boundary.
pos = unsafe { pos.add(2) };
true
}
})
}
},
handle_eof: {
// Comment runs to end of file. No line break was seen, so return before
// `is_on_new_line` is set below.
self.trivia_builder.add_single_line_comment(self.token.start, self.offset());
return Kind::Skip;
},
};
// Only reached when a line break terminated the comment.
self.token.is_on_new_line = true;
Kind::Skip
}
/// Section 12.4 Multi Line Comment
///
/// Skips the rest of a multi-line comment, tracking whether it contains a line break.
///
/// Works in up to two phases:
///
/// 1. While no line break has been seen, search with `byte_search!` over
///    `MULTILINE_COMMENT_START_TABLE` for `*`, `\r`, `\n` or `0xE2` (first byte of LS / PS).
/// 2. As soon as `self.token.is_on_new_line` is set (either before entry, or because a regular
///    line break was found), hand over to `skip_multi_line_comment_after_line_break`, which only
///    looks for `*/`. An irregular line break (LS / PS) sets the flag but stays in phase 1.
///
/// Returns `Kind::Skip` once `*/` has been consumed and the comment recorded as trivia.
/// If the end of file is reached first, reports an unterminated-multi-line-comment diagnostic
/// and returns `Kind::Eof`.
pub(super) fn skip_multi_line_comment(&mut self) -> Kind {
// If `is_on_new_line` is already set, go directly to faster search which only looks for `*/`
if self.token.is_on_new_line {
return self.skip_multi_line_comment_after_line_break(self.source.position());
}
// NOTE(review): `byte_search!` is defined in the `search` module (not visible here); presumably
// it commits the final `pos` as the lexer position once `continue_if` yields `false` - confirm.
byte_search! {
lexer: self,
table: MULTILINE_COMMENT_START_TABLE,
continue_if: (next_byte, pos) {
// Match found. Decide whether to continue searching.
if next_byte == b'*' {
// SAFETY: Next byte is `*` (ASCII) so after it is UTF-8 char boundary
let after_star = unsafe { pos.add(1) };
// Bounds check before reading the byte after `*`.
if after_star.addr() < self.source.end_addr() {
// If next byte isn't `/`, continue
// SAFETY: Have checked there's at least 1 further byte to read
if unsafe { after_star.read() } == b'/' {
// Consume `*/`
// SAFETY: Consuming `*/` leaves `pos` on a UTF-8 char boundary
pos = unsafe { pos.add(2) };
false
} else {
true
}
} else {
// This is last byte in file. Continue to `handle_eof`.
// This is illegal in valid JS, so mark this branch cold.
cold_branch(|| true)
}
} else if next_byte == LS_OR_PS_FIRST {
// `0xE2`. Could be first byte of LS/PS, or could be some other Unicode char.
// Either way, Unicode is uncommon, so make this a cold branch.
cold_branch(|| {
// SAFETY: Next byte is `0xE2` which is always 1st byte of a 3-byte UTF-8 char.
// So safe to advance `pos` by 1 and read 2 bytes.
let next2 = unsafe {
pos = pos.add(1);
pos.read2()
};
if matches!(next2, LS_BYTES_2_AND_3 | PS_BYTES_2_AND_3) {
// Irregular line break
self.token.is_on_new_line = true;
// Ideally we'd go on to `skip_multi_line_comment_after_line_break` here
// but can't do that easily because can't use `return` in a closure.
// But irregular line breaks are rare anyway.
}
// Either way, continue searching.
// Skip 3 bytes (skipped 1 byte above, macro skips 1 more, so skip 1 more here
// to make 3), and continue searching.
// SAFETY: `0xE2` is always 1st byte of a 3-byte UTF-8 char,
// so consuming 3 bytes will place `pos` on next UTF-8 char boundary.
pos = unsafe { pos.add(1) };
true
})
} else {
// Regular line break.
// No need to look for more line breaks, so switch to faster search just for `*/`.
self.token.is_on_new_line = true;
// SAFETY: Regular line breaks are ASCII, so skipping 1 byte is a UTF-8 char boundary.
let after_line_break = unsafe { pos.add(1) };
// This `return` is not inside a closure, so it leaves `skip_multi_line_comment`
// itself; the rest of the comment is handled by the callee.
return self.skip_multi_line_comment_after_line_break(after_line_break);
}
},
handle_eof: {
// Reached end of file without finding `*/`.
self.error(diagnostics::unterminated_multi_line_comment(self.unterminated_range()));
return Kind::Eof;
},
};
// Only reached when `*/` was found during the search above.
self.trivia_builder.add_multi_line_comment(self.token.start, self.offset());
Kind::Skip
}
/// Finish skipping a multi-line comment once line breaks no longer need to be tracked.
///
/// Searches from `pos` to the end of the source for the closing `*/`. Both call sites in this
/// file reach this function with `self.token.is_on_new_line` already set, so a plain
/// single-pattern substring search is sufficient.
///
/// * `pos` - position to start searching from.
///
/// Returns `Kind::Skip` after moving the lexer to just past `*/` and recording the comment as
/// trivia. If no `*/` is found, advances to the end of the source, reports an
/// unterminated-multi-line-comment diagnostic and returns `Kind::Eof`.
fn skip_multi_line_comment_after_line_break(&mut self, pos: SourcePosition) -> Kind {
    // Can use `memchr` here as only searching for 1 pattern.
    // Cache `Finder` instance on `Lexer` as there's a significant cost to creating it.
    // `Finder::new` isn't a const function, so can't make it a `static`, and `lazy_static!`
    // has a cost each time it's deref-ed. Creating `Finder` unconditionally in `Lexer::new`
    // would be efficient for files containing multi-line comments, but would impose pointless
    // cost on files which don't. So this is the fastest solution.
    //
    // `get_or_insert_with` creates the `Finder` on first use and hands back a reference to the
    // cached instance, so there is no separate `is_none()` check and no `unwrap()`.
    let finder = self.multi_line_comment_end_finder.get_or_insert_with(|| Finder::new("*/"));
    let remaining = self.source.str_from_pos_to_end(pos).as_bytes();
    match finder.find(remaining) {
        Some(index) => {
            // `index` is the offset of `*` relative to `pos`; `+ 2` steps over `*/`.
            // SAFETY: `pos + index + 2` is end of `*/`, so a valid `SourcePosition`
            self.source.set_position(unsafe { pos.add(index + 2) });
            self.trivia_builder.add_multi_line_comment(self.token.start, self.offset());
            Kind::Skip
        }
        None => {
            // No closing `*/` anywhere in the rest of the source.
            self.source.advance_to_end();
            self.error(diagnostics::unterminated_multi_line_comment(self.unterminated_range()));
            Kind::Eof
        }
    }
}
/// Section 12.5 Hashbang Comments
///
/// Consumes chars up to and including the first line terminator, or until `next_char` yields
/// `None`. In both cases `self.token.is_on_new_line` is then set to `true`.
///
/// Always returns `Kind::HashbangComment`.
pub(super) fn read_hashbang_comment(&mut self) -> Kind {
    // Each iteration consumes one char. The loop ends after consuming a line terminator,
    // or when there are no more chars.
    while matches!(self.next_char(), Some(ch) if !is_line_terminator(ch)) {}
    self.token.is_on_new_line = true;
    Kind::HashbangComment
}
}