mirror of
https://github.com/danbulant/oxc
synced 2026-05-25 04:42:10 +00:00
perf(parser): consume single-line comments faster (#2374)
Use the `byte_search!` macro to consume single-line comments. This would be a lot simpler if it didn't have to deal with irregular line breaks. Damn you, Unicode!
This commit is contained in:
parent
b29719d2df
commit
c4fa738312
2 changed files with 64 additions and 14 deletions
|
|
@ -1,24 +1,75 @@
|
||||||
use super::{Kind, Lexer};
|
use super::{
|
||||||
|
cold_branch,
|
||||||
|
search::{byte_search, safe_byte_match_table, SafeByteMatchTable},
|
||||||
|
Kind, Lexer,
|
||||||
|
};
|
||||||
use crate::diagnostics;
|
use crate::diagnostics;
|
||||||
|
|
||||||
use oxc_syntax::identifier::is_line_terminator;
|
use oxc_syntax::identifier::is_line_terminator;
|
||||||
|
|
||||||
|
const LS_OR_PS_FIRST: u8 = 0xE2;
|
||||||
|
const LS_BYTES_2_AND_3: [u8; 2] = [0x80, 0xA8];
|
||||||
|
const PS_BYTES_2_AND_3: [u8; 2] = [0x80, 0xA9];
|
||||||
|
|
||||||
|
static LINE_BREAK_TABLE: SafeByteMatchTable =
|
||||||
|
safe_byte_match_table!(|b| matches!(b, b'\r' | b'\n' | LS_OR_PS_FIRST));
|
||||||
|
|
||||||
impl<'a> Lexer<'a> {
|
impl<'a> Lexer<'a> {
|
||||||
/// Section 12.4 Single Line Comment
|
/// Section 12.4 Single Line Comment
|
||||||
#[allow(clippy::cast_possible_truncation)]
|
|
||||||
pub(super) fn skip_single_line_comment(&mut self) -> Kind {
|
pub(super) fn skip_single_line_comment(&mut self) -> Kind {
|
||||||
let start = self.token.start;
|
// SAFETY: The requirement not to alter `pos` when returning `true` from `continue_if` is satisfied
|
||||||
while let Some(c) = self.next_char() {
|
unsafe {
|
||||||
if is_line_terminator(c) {
|
byte_search! {
|
||||||
self.token.is_on_new_line = true;
|
lexer: self,
|
||||||
self.trivia_builder
|
table: LINE_BREAK_TABLE,
|
||||||
.add_single_line_comment(start, self.offset() - c.len_utf8() as u32);
|
continue_if: |next_byte, pos| {
|
||||||
return Kind::Skip;
|
// Match found. Decide whether to continue searching.
|
||||||
}
|
// If this is end of comment, create trivia, and advance `pos` to after line break.
|
||||||
|
// Do that here rather than in `handle_match`, to avoid branching twice on value of
|
||||||
|
// the matched byte.
|
||||||
|
#[allow(clippy::if_not_else)]
|
||||||
|
if next_byte != LS_OR_PS_FIRST {
|
||||||
|
// `\r` or `\n`
|
||||||
|
self.trivia_builder
|
||||||
|
.add_single_line_comment(self.token.start, self.source.offset_of(pos));
|
||||||
|
// SAFETY: Safe to consume `\r` or `\n` as both are ASCII
|
||||||
|
pos = pos.add(1);
|
||||||
|
// We've found the end. Do not continue searching.
|
||||||
|
false
|
||||||
|
} else {
|
||||||
|
// `0xE2`. Could be first byte of LS/PS, or could be some other Unicode char.
|
||||||
|
// Either way, Unicode is uncommon, so make this a cold branch.
|
||||||
|
cold_branch(|| {
|
||||||
|
// SAFETY: Next byte is `0xE2` which is always 1st byte of a 3-byte UTF-8 char.
|
||||||
|
// So safe to advance `pos` by 1 and read 2 bytes.
|
||||||
|
let next2 = pos.add(1).read2();
|
||||||
|
if next2 == LS_BYTES_2_AND_3 || next2 == PS_BYTES_2_AND_3 {
|
||||||
|
// Irregular line break
|
||||||
|
self.trivia_builder
|
||||||
|
.add_single_line_comment(self.token.start, self.source.offset_of(pos));
|
||||||
|
// Advance `pos` to after this char.
|
||||||
|
// SAFETY: `0xE2` is always 1st byte of a 3-byte UTF-8 char,
|
||||||
|
// so consuming 3 bytes will place `pos` on next UTF-8 char boundary.
|
||||||
|
pos = pos.add(3);
|
||||||
|
// We've found the end. Do not continue searching.
|
||||||
|
false
|
||||||
|
} else {
|
||||||
|
// Some other Unicode char beginning with `0xE2`. Continue searching.
|
||||||
|
true
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
},
|
||||||
|
handle_match: |_next_byte, _start| {
|
||||||
|
self.token.is_on_new_line = true;
|
||||||
|
Kind::Skip
|
||||||
|
},
|
||||||
|
handle_eof: |_start| {
|
||||||
|
self.trivia_builder.add_single_line_comment(self.token.start, self.offset());
|
||||||
|
Kind::Skip
|
||||||
|
},
|
||||||
|
};
|
||||||
}
|
}
|
||||||
// EOF
|
|
||||||
self.trivia_builder.add_single_line_comment(start, self.offset());
|
|
||||||
Kind::Skip
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Section 12.4 Multi Line Comment
|
/// Section 12.4 Multi Line Comment
|
||||||
|
|
|
||||||
|
|
@ -534,7 +534,6 @@ impl<'a> SourcePosition<'a> {
|
||||||
/// # SAFETY
|
/// # SAFETY
|
||||||
/// Caller must ensure `SourcePosition` is no later than 2 bytes before end of source text.
|
/// Caller must ensure `SourcePosition` is no later than 2 bytes before end of source text.
|
||||||
/// i.e. if source length is 10, `self` must be on position 8 max.
|
/// i.e. if source length is 10, `self` must be on position 8 max.
|
||||||
#[allow(dead_code)]
|
|
||||||
#[inline]
|
#[inline]
|
||||||
pub(super) unsafe fn read2(self) -> [u8; 2] {
|
pub(super) unsafe fn read2(self) -> [u8; 2] {
|
||||||
// SAFETY:
|
// SAFETY:
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue