mirror of
https://github.com/danbulant/oxc
synced 2026-05-22 13:48:55 +00:00
perf(parser): consume multi-line comments faster (#2377)
Consume multi-line comments faster. * Initially search for `*/`, `\r`, `\n` or `0xE2` (first byte of irregular line breaks). * Once a line break is found, switch to faster search which only looks for `*/`, as it's not relevant whether there are more line breaks or not. Using `memchr` for the 2nd simpler search, as it's efficient for a search with only one "needle". Initializing `memchr::memmem::Finder` is fairly expensive, and tried numerous ways to handle it. This is most performant way I could find. Any ideas how to avoid re-creating it for each Lexer pass? (it can't be a `static` as `Finder::new` is not a const function, and `lazy_static!` is too costly)
This commit is contained in:
parent
24a9255610
commit
383f5b3081
5 changed files with 118 additions and 10 deletions
1
Cargo.lock
generated
1
Cargo.lock
generated
|
|
@ -1593,6 +1593,7 @@ version = "0.7.0"
|
|||
dependencies = [
|
||||
"assert-unchecked",
|
||||
"bitflags 2.4.2",
|
||||
"memchr",
|
||||
"miette",
|
||||
"num-bigint",
|
||||
"ouroboros",
|
||||
|
|
|
|||
|
|
@ -31,6 +31,8 @@ rustc-hash = { workspace = true }
|
|||
num-bigint = { workspace = true }
|
||||
seq-macro = { workspace = true }
|
||||
|
||||
memchr = "2.7.1"
|
||||
|
||||
[dev-dependencies]
|
||||
oxc_ast = { workspace = true, features = ["serde"] }
|
||||
miette = { workspace = true, features = ["fancy-no-backtrace"] }
|
||||
|
|
|
|||
|
|
@ -1,12 +1,16 @@
|
|||
use super::{
|
||||
cold_branch,
|
||||
search::{byte_search, safe_byte_match_table, SafeByteMatchTable},
|
||||
source::SourcePosition,
|
||||
Kind, Lexer,
|
||||
};
|
||||
use crate::diagnostics;
|
||||
|
||||
use oxc_syntax::identifier::is_line_terminator;
|
||||
|
||||
use memchr::memmem::Finder;
|
||||
|
||||
// Irregular line breaks - '\u{2028}' (LS) and '\u{2029}' (PS)
|
||||
const LS_OR_PS_FIRST: u8 = 0xE2;
|
||||
const LS_BYTES_2_AND_3: [u8; 2] = [0x80, 0xA8];
|
||||
const PS_BYTES_2_AND_3: [u8; 2] = [0x80, 0xA9];
|
||||
|
|
@ -14,6 +18,9 @@ const PS_BYTES_2_AND_3: [u8; 2] = [0x80, 0xA9];
|
|||
static LINE_BREAK_TABLE: SafeByteMatchTable =
|
||||
safe_byte_match_table!(|b| matches!(b, b'\r' | b'\n' | LS_OR_PS_FIRST));
|
||||
|
||||
static MULTILINE_COMMENT_START_TABLE: SafeByteMatchTable =
|
||||
safe_byte_match_table!(|b| matches!(b, b'*' | b'\r' | b'\n' | LS_OR_PS_FIRST));
|
||||
|
||||
impl<'a> Lexer<'a> {
|
||||
/// Section 12.4 Single Line Comment
|
||||
pub(super) fn skip_single_line_comment(&mut self) -> Kind {
|
||||
|
|
@ -74,17 +81,94 @@ impl<'a> Lexer<'a> {
|
|||
|
||||
/// Section 12.4 Multi Line Comment
|
||||
pub(super) fn skip_multi_line_comment(&mut self) -> Kind {
|
||||
while let Some(c) = self.next_char() {
|
||||
if c == '*' && self.next_eq('/') {
|
||||
self.trivia_builder.add_multi_line_comment(self.token.start, self.offset());
|
||||
return Kind::Skip;
|
||||
}
|
||||
if is_line_terminator(c) {
|
||||
self.token.is_on_new_line = true;
|
||||
}
|
||||
// If `is_on_new_line` is already set, go directly to faster search which only looks for `*/`
|
||||
if self.token.is_on_new_line {
|
||||
return self.skip_multi_line_comment_after_line_break(self.source.position());
|
||||
}
|
||||
|
||||
// SAFETY: Requirement not to alter `pos` if return `true` from `if_continue` is satisfied
|
||||
unsafe {
|
||||
byte_search! {
|
||||
lexer: self,
|
||||
table: MULTILINE_COMMENT_START_TABLE,
|
||||
continue_if: |next_byte, pos| {
|
||||
// Match found. Decide whether to continue searching.
|
||||
if next_byte == b'*' {
|
||||
if pos.addr() < self.source.end_addr() - 1 {
|
||||
// If next byte isn't `/`, continue
|
||||
// SAFETY: Have checked there's at least 1 further byte to read
|
||||
if pos.add(1).read() == b'/' {
|
||||
// Consume `*/`
|
||||
// SAFETY: Consuming `*/` leaves `pos` on a UTF-8 char boundary
|
||||
pos = pos.add(2);
|
||||
false
|
||||
} else {
|
||||
true
|
||||
}
|
||||
} else {
|
||||
// This is last byte in file. Continue to `handle_eof`.
|
||||
// This is illegal in valid JS, so mark this branch cold.
|
||||
cold_branch(|| true)
|
||||
}
|
||||
} else if next_byte == LS_OR_PS_FIRST {
|
||||
// `0xE2`. Could be first byte of LS/PS, or could be some other Unicode char.
|
||||
// Either way, Unicode is uncommon, so make this a cold branch.
|
||||
cold_branch(|| {
|
||||
// SAFETY: Next byte is `0xE2` which is always 1st byte of a 3-byte UTF-8 char.
|
||||
// So safe to advance `pos` by 1 and read 2 bytes.
|
||||
let next2 = pos.add(1).read2();
|
||||
if next2 == LS_BYTES_2_AND_3 || next2 == PS_BYTES_2_AND_3 {
|
||||
// Irregular line break
|
||||
self.token.is_on_new_line = true;
|
||||
// Ideally we'd go on to `skip_multi_line_comment_after_line_break` here
|
||||
// but can't do that easily because can't use `return` in a closure.
|
||||
// But irregular line breaks are rare anyway.
|
||||
}
|
||||
// Either way, continue searching
|
||||
true
|
||||
})
|
||||
} else {
|
||||
// Regular line break.
|
||||
// No need to look for more line breaks, so switch to faster search just for `*/`.
|
||||
self.token.is_on_new_line = true;
|
||||
return self.skip_multi_line_comment_after_line_break(pos.add(1));
|
||||
}
|
||||
},
|
||||
handle_match: |_next_byte, _start| {
|
||||
self.trivia_builder.add_multi_line_comment(self.token.start, self.offset());
|
||||
Kind::Skip
|
||||
},
|
||||
handle_eof: |_start| {
|
||||
self.error(diagnostics::UnterminatedMultiLineComment(self.unterminated_range()));
|
||||
Kind::Eof
|
||||
},
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
fn skip_multi_line_comment_after_line_break(&mut self, pos: SourcePosition) -> Kind {
|
||||
// Can use `memchr` here as only searching for 1 pattern.
|
||||
// Cache `Finder` instance on `Lexer` as there's a significant cost to creating it.
|
||||
// `Finder::new` isn't a const function, so can't make it a `static`, and `lazy_static!`
|
||||
// has a cost each time it's deref-ed. Creating `Finder` unconditionally in `Lexer::new`
|
||||
// would be efficient for files containing multi-line comments, but would impose pointless
|
||||
// cost on files which don't. So this is the fastest solution.
|
||||
if self.multi_line_comment_end_finder.is_none() {
|
||||
self.multi_line_comment_end_finder = Some(Finder::new("*/"));
|
||||
}
|
||||
let finder = self.multi_line_comment_end_finder.as_ref().unwrap();
|
||||
|
||||
let remaining = self.source.str_from_pos_to_end(pos).as_bytes();
|
||||
if let Some(index) = finder.find(remaining) {
|
||||
// SAFETY: `pos + index + 2` is end of `*/`, so a valid `SourcePosition`
|
||||
self.source.set_position(unsafe { pos.add(index + 2) });
|
||||
self.trivia_builder.add_multi_line_comment(self.token.start, self.offset());
|
||||
Kind::Skip
|
||||
} else {
|
||||
self.source.advance_to_end();
|
||||
self.error(diagnostics::UnterminatedMultiLineComment(self.unterminated_range()));
|
||||
Kind::Eof
|
||||
}
|
||||
self.error(diagnostics::UnterminatedMultiLineComment(self.unterminated_range()));
|
||||
Kind::Eof
|
||||
}
|
||||
|
||||
/// Section 12.5 Hashbang Comments
|
||||
|
|
|
|||
|
|
@ -95,6 +95,9 @@ pub struct Lexer<'a> {
|
|||
/// Data store for escaped templates, indexed by [Token::start] when [Token::escaped] is true
|
||||
/// `None` is saved when the string contains an invalid escape sequence.
|
||||
pub escaped_templates: FxHashMap<u32, Option<&'a str>>,
|
||||
|
||||
/// `memchr` Finder for end of multi-line comments. Created lazily when first used.
|
||||
multi_line_comment_end_finder: Option<memchr::memmem::Finder<'static>>,
|
||||
}
|
||||
|
||||
#[allow(clippy::unused_self)]
|
||||
|
|
@ -124,6 +127,7 @@ impl<'a> Lexer<'a> {
|
|||
trivia_builder: TriviaBuilder::default(),
|
||||
escaped_strings: FxHashMap::default(),
|
||||
escaped_templates: FxHashMap::default(),
|
||||
multi_line_comment_end_finder: None,
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -192,6 +192,11 @@ impl<'a> Source<'a> {
|
|||
self.ptr = pos.ptr;
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(super) fn advance_to_end(&mut self) {
|
||||
self.ptr = self.end;
|
||||
}
|
||||
|
||||
/// Get string slice from a `SourcePosition` up to the current position of `Source`.
|
||||
pub(super) fn str_from_pos_to_current(&self, pos: SourcePosition) -> &'a str {
|
||||
assert!(pos.ptr <= self.ptr);
|
||||
|
|
@ -226,6 +231,18 @@ impl<'a> Source<'a> {
|
|||
std::str::from_utf8_unchecked(slice)
|
||||
}
|
||||
|
||||
/// Get string slice from a `SourcePosition` up to the end of `Source`.
|
||||
#[inline]
|
||||
pub(super) fn str_from_pos_to_end(&self, pos: SourcePosition) -> &'a str {
|
||||
// SAFETY: Invariants of `SourcePosition` is that it cannot be after end of `Source`,
|
||||
// and always on a UTF-8 character boundary
|
||||
unsafe {
|
||||
let len = self.end as usize - pos.addr();
|
||||
let slice = slice::from_raw_parts(pos.ptr, len);
|
||||
std::str::from_utf8_unchecked(slice)
|
||||
}
|
||||
}
|
||||
|
||||
/// Get current position in source, relative to start of source.
|
||||
#[allow(clippy::cast_possible_truncation)]
|
||||
#[inline]
|
||||
|
|
|
|||
Loading…
Reference in a new issue