mirror of
https://github.com/danbulant/oxc
synced 2026-05-24 12:21:58 +00:00
feat(lexer): use portable-SIMD to speed up multiline comment scanning
This commit is contained in:
parent
4fc112f7dc
commit
83c3f34af2
2 changed files with 146 additions and 18 deletions
|
|
@ -23,7 +23,7 @@ use number::{parse_big_int, parse_float, parse_int};
|
|||
use oxc_allocator::{Allocator, String};
|
||||
use oxc_ast::{Atom, SourceType, Span};
|
||||
use oxc_diagnostics::{Diagnostic, Diagnostics};
|
||||
use simd::SkipWhitespace;
|
||||
use simd::{SkipMultilineComment, SkipWhitespace};
|
||||
use string_builder::AutoCow;
|
||||
pub use token::{RegExp, Token, TokenValue};
|
||||
|
||||
|
|
@ -397,13 +397,19 @@ impl<'a> Lexer<'a> {
|
|||
kind
|
||||
}
|
||||
'/' => {
|
||||
if self.next_eq('/') {
|
||||
self.skip_single_line_comment()
|
||||
} else if self.next_eq('*') {
|
||||
self.skip_multi_line_comment()
|
||||
} else {
|
||||
// regex is handled separately, see `next_regex`
|
||||
self.read_slash()
|
||||
match self.peek() {
|
||||
'/' => {
|
||||
self.current.chars.next();
|
||||
self.skip_single_line_comment()
|
||||
}
|
||||
'*' => {
|
||||
self.current.chars.next();
|
||||
self.skip_multi_line_comment()
|
||||
}
|
||||
_ => {
|
||||
// regex is handled separately, see `next_regex`
|
||||
self.read_slash()
|
||||
}
|
||||
}
|
||||
}
|
||||
'`' => self.read_template_literal(Kind::TemplateHead, Kind::NoSubstitutionTemplate),
|
||||
|
|
@ -490,16 +496,24 @@ impl<'a> Lexer<'a> {
|
|||
/// Section 12.4 Multi Line Comment
|
||||
#[must_use]
|
||||
fn skip_multi_line_comment(&mut self) -> Kind {
|
||||
while let Some(c) = self.current.chars.next() {
|
||||
if c == '*' && self.next_eq('/') {
|
||||
return Kind::MultiLineComment;
|
||||
}
|
||||
if is_line_terminator(c) {
|
||||
self.current.token.is_on_new_line = true;
|
||||
}
|
||||
let remaining = self.remaining().as_bytes();
|
||||
let newline = self.current.token.is_on_new_line;
|
||||
let state = SkipMultilineComment::new(newline, remaining).simd(remaining);
|
||||
|
||||
// SAFETY: offset is computed to the boundary
|
||||
self.current.chars =
|
||||
unsafe { std::str::from_utf8_unchecked(&remaining[state.offset..]) }.chars();
|
||||
|
||||
if state.newline && !newline {
|
||||
self.current.token.is_on_new_line = true;
|
||||
}
|
||||
self.error(Diagnostic::UnterminatedMultiLineComment(self.unterminated_range()));
|
||||
Kind::Eof
|
||||
|
||||
if !state.found {
|
||||
self.error(Diagnostic::UnterminatedMultiLineComment(self.unterminated_range()));
|
||||
return Kind::Eof;
|
||||
}
|
||||
|
||||
Kind::MultiLineComment
|
||||
}
|
||||
|
||||
/// Section 12.6.1 Identifier Names
|
||||
|
|
|
|||
|
|
@ -9,7 +9,6 @@ use std::simd::{Simd, SimdPartialEq, ToBitMask};
|
|||
const ELEMENTS: usize = 16;
|
||||
type SimdVec = Simd<u8, ELEMENTS>;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct SkipWhitespace {
|
||||
/// Total offset
|
||||
pub offset: usize,
|
||||
|
|
@ -84,3 +83,118 @@ impl SkipWhitespace {
|
|||
self.offset += advance_by as usize;
|
||||
}
|
||||
}
|
||||
|
||||
pub struct SkipMultilineComment<'a> {
|
||||
/// Total offset
|
||||
pub offset: usize,
|
||||
|
||||
/// Found multiline comment end '*/'?
|
||||
pub found: bool,
|
||||
|
||||
/// Found newline inside the comment?
|
||||
pub newline: bool,
|
||||
|
||||
/// Remaining char bytes from the lexer
|
||||
remaining: &'a [u8],
|
||||
|
||||
star: SimdVec,
|
||||
slash: SimdVec,
|
||||
lf: SimdVec,
|
||||
cr: SimdVec,
|
||||
lsps: SimdVec,
|
||||
}
|
||||
|
||||
impl<'a> SkipMultilineComment<'a> {
|
||||
pub fn new(newline: bool, remaining: &'a [u8]) -> Self {
|
||||
Self {
|
||||
offset: 0,
|
||||
found: false,
|
||||
newline,
|
||||
remaining,
|
||||
star: SimdVec::splat(b'*'),
|
||||
slash: SimdVec::splat(b'/'),
|
||||
lf: SimdVec::splat(b'\n'),
|
||||
cr: SimdVec::splat(b'\r'),
|
||||
lsps: SimdVec::splat(226),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn simd(mut self, remaining: &[u8]) -> Self {
|
||||
let (chunks, remainder) = remaining.as_chunks::<ELEMENTS>();
|
||||
|
||||
for chunk in chunks {
|
||||
self.check(chunk, chunk.len());
|
||||
if self.found {
|
||||
return self;
|
||||
}
|
||||
}
|
||||
|
||||
if !remainder.is_empty() {
|
||||
// Align the last chunk for avoiding the use of a scalar version
|
||||
let mut chunk = [0; ELEMENTS];
|
||||
let len = remainder.len();
|
||||
chunk[..len].copy_from_slice(remainder);
|
||||
self.check(&chunk, len);
|
||||
}
|
||||
|
||||
self
|
||||
}
|
||||
|
||||
/// Check and compute state for a single chunk
|
||||
/// `chunk_len` can be < ELEMENTS for the last chunk
|
||||
fn check(&mut self, chunk: &[u8], chunk_len: usize) {
|
||||
let s = SimdVec::from_slice(chunk);
|
||||
|
||||
let any_star = s.simd_eq(self.star);
|
||||
let any_slash = s.simd_eq(self.slash);
|
||||
let star_mask = any_star.to_bitmask();
|
||||
let slash_mask = any_slash.to_bitmask();
|
||||
|
||||
// Get the offset of '/' if '*' is immediately followed by '/'
|
||||
let star_slash_mask = (star_mask << 1) & slash_mask;
|
||||
let star_slash_pos = star_slash_mask.trailing_zeros();
|
||||
|
||||
let chunk_offset = if star_slash_mask > 0 {
|
||||
self.found = true;
|
||||
star_slash_pos as usize + 1
|
||||
} else {
|
||||
// Is '*' at the end?
|
||||
if star_mask & 1 << (ELEMENTS - 1) > 0
|
||||
&& self.remaining.get(self.offset + ELEMENTS) == Some(&b'/')
|
||||
{
|
||||
self.found = true;
|
||||
ELEMENTS + 1
|
||||
} else {
|
||||
chunk_len
|
||||
}
|
||||
};
|
||||
|
||||
// Look for '\n' and '\r'
|
||||
if !self.newline {
|
||||
let any_newline = s.simd_eq(self.lf) | s.simd_eq(self.cr);
|
||||
let newline_mask = any_newline.to_bitmask();
|
||||
self.newline = (newline_mask.trailing_zeros() as usize) < chunk_offset;
|
||||
// Look for LS '\u{2028}' [226, 128, 168] and PS '\u{2029}' [226, 128, 169]
|
||||
if !self.newline {
|
||||
let lspf_mask = s.simd_eq(self.lsps).to_bitmask();
|
||||
if lspf_mask > 0 {
|
||||
let offset_by = lspf_mask.trailing_zeros() as usize;
|
||||
if offset_by < chunk_offset {
|
||||
let second = self.offset + offset_by + 1;
|
||||
// Using scalar version `.get` instead of simd
|
||||
// to avoid checking on the next chunk
|
||||
// because this may be on the chunk boundary
|
||||
if self.remaining.get(second) == Some(&128) {
|
||||
let third = self.remaining.get(second + 1);
|
||||
if matches!(third, Some(&168 | &169)) {
|
||||
self.newline = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
self.offset += chunk_offset;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue