feat(lexer): use portable-SIMD to speed up multiline comment scanning

This commit is contained in:
Boshen 2023-02-20 21:30:40 +08:00
parent 4fc112f7dc
commit 83c3f34af2
2 changed files with 146 additions and 18 deletions

View file

@ -23,7 +23,7 @@ use number::{parse_big_int, parse_float, parse_int};
use oxc_allocator::{Allocator, String};
use oxc_ast::{Atom, SourceType, Span};
use oxc_diagnostics::{Diagnostic, Diagnostics};
use simd::SkipWhitespace;
use simd::{SkipMultilineComment, SkipWhitespace};
use string_builder::AutoCow;
pub use token::{RegExp, Token, TokenValue};
@ -397,13 +397,19 @@ impl<'a> Lexer<'a> {
kind
}
'/' => {
if self.next_eq('/') {
self.skip_single_line_comment()
} else if self.next_eq('*') {
self.skip_multi_line_comment()
} else {
// regex is handled separately, see `next_regex`
self.read_slash()
match self.peek() {
'/' => {
self.current.chars.next();
self.skip_single_line_comment()
}
'*' => {
self.current.chars.next();
self.skip_multi_line_comment()
}
_ => {
// regex is handled separately, see `next_regex`
self.read_slash()
}
}
}
'`' => self.read_template_literal(Kind::TemplateHead, Kind::NoSubstitutionTemplate),
@ -490,16 +496,24 @@ impl<'a> Lexer<'a> {
/// Section 12.4 Multi Line Comment
///
/// Consumes the remainder of a multi-line comment from `self.current.chars`.
/// NOTE(review): presumably the opening `/*` has already been consumed by the
/// caller before this is invoked — confirm against the call site.
///
/// Returns `Kind::MultiLineComment` once the closing `*/` has been consumed.
/// If the input ends before a `*/` is seen, an
/// `UnterminatedMultiLineComment` diagnostic is reported and `Kind::Eof` is
/// returned instead.
///
/// Side effect: `self.current.token.is_on_new_line` is set to `true` when a
/// line terminator occurs inside the comment; it is never reset to `false`
/// here.
#[must_use]
fn skip_multi_line_comment(&mut self) -> Kind {
while let Some(c) = self.current.chars.next() {
if c == '*' && self.next_eq('/') {
return Kind::MultiLineComment;
}
if is_line_terminator(c) {
self.current.token.is_on_new_line = true;
}
let remaining = self.remaining().as_bytes();
let newline = self.current.token.is_on_new_line;
let state = SkipMultilineComment::new(newline, remaining).simd(remaining);
// SAFETY: `state.offset` as computed by `SkipMultilineComment` is either the
// index just past the ASCII `/` that closes the comment, or `remaining.len()`
// when no terminator was found. Both positions are UTF-8 character boundaries
// of `remaining`, so the tail slice is still valid UTF-8.
self.current.chars =
unsafe { std::str::from_utf8_unchecked(&remaining[state.offset..]) }.chars();
if state.newline && !newline {
self.current.token.is_on_new_line = true;
}
self.error(Diagnostic::UnterminatedMultiLineComment(self.unterminated_range()));
Kind::Eof
if !state.found {
self.error(Diagnostic::UnterminatedMultiLineComment(self.unterminated_range()));
return Kind::Eof;
}
Kind::MultiLineComment
}
/// Section 12.6.1 Identifier Names

View file

@ -9,7 +9,6 @@ use std::simd::{Simd, SimdPartialEq, ToBitMask};
const ELEMENTS: usize = 16;
type SimdVec = Simd<u8, ELEMENTS>;
#[derive(Debug)]
pub struct SkipWhitespace {
/// Total offset
pub offset: usize,
@ -84,3 +83,118 @@ impl SkipWhitespace {
self.offset += advance_by as usize;
}
}
/// Scanner state used to find the closing `*/` of a multi-line comment by
/// comparing `ELEMENTS` bytes at a time against pre-splatted SIMD vectors.
///
/// After scanning, `offset`, `found` and `newline` hold the result; the
/// remaining fields are inputs and constants for the scan.
pub struct SkipMultilineComment<'a> {
/// Total number of bytes consumed from `remaining` so far. When `found` is
/// true this is the index just past the closing `/`.
pub offset: usize,
/// Found multiline comment end '*/'?
pub found: bool,
/// Found newline inside the comment? Seeded by the caller through `new`, so
/// it may already be `true` before any byte is scanned.
pub newline: bool,
/// Remaining char bytes from the lexer. Used for scalar look-ahead past the
/// current chunk (a `*/` or LS/PS sequence that straddles a chunk boundary).
remaining: &'a [u8],
/// Every lane is `b'*'`.
star: SimdVec,
/// Every lane is `b'/'`.
slash: SimdVec,
/// Every lane is `b'\n'`.
lf: SimdVec,
/// Every lane is `b'\r'`.
cr: SimdVec,
/// Every lane is 226 (0xE2), the first UTF-8 byte shared by
/// LS '\u{2028}' [226, 128, 168] and PS '\u{2029}' [226, 128, 169].
lsps: SimdVec,
}
impl<'a> SkipMultilineComment<'a> {
pub fn new(newline: bool, remaining: &'a [u8]) -> Self {
Self {
offset: 0,
found: false,
newline,
remaining,
star: SimdVec::splat(b'*'),
slash: SimdVec::splat(b'/'),
lf: SimdVec::splat(b'\n'),
cr: SimdVec::splat(b'\r'),
lsps: SimdVec::splat(226),
}
}
pub fn simd(mut self, remaining: &[u8]) -> Self {
let (chunks, remainder) = remaining.as_chunks::<ELEMENTS>();
for chunk in chunks {
self.check(chunk, chunk.len());
if self.found {
return self;
}
}
if !remainder.is_empty() {
// Align the last chunk for avoiding the use of a scalar version
let mut chunk = [0; ELEMENTS];
let len = remainder.len();
chunk[..len].copy_from_slice(remainder);
self.check(&chunk, len);
}
self
}
/// Check and compute state for a single chunk
/// `chunk_len` can be < ELEMENTS for the last chunk
fn check(&mut self, chunk: &[u8], chunk_len: usize) {
let s = SimdVec::from_slice(chunk);
let any_star = s.simd_eq(self.star);
let any_slash = s.simd_eq(self.slash);
let star_mask = any_star.to_bitmask();
let slash_mask = any_slash.to_bitmask();
// Get the offset of '/' if '*' is immediately followed by '/'
let star_slash_mask = (star_mask << 1) & slash_mask;
let star_slash_pos = star_slash_mask.trailing_zeros();
let chunk_offset = if star_slash_mask > 0 {
self.found = true;
star_slash_pos as usize + 1
} else {
// Is '*' at the end?
if star_mask & 1 << (ELEMENTS - 1) > 0
&& self.remaining.get(self.offset + ELEMENTS) == Some(&b'/')
{
self.found = true;
ELEMENTS + 1
} else {
chunk_len
}
};
// Look for '\n' and '\r'
if !self.newline {
let any_newline = s.simd_eq(self.lf) | s.simd_eq(self.cr);
let newline_mask = any_newline.to_bitmask();
self.newline = (newline_mask.trailing_zeros() as usize) < chunk_offset;
// Look for LS '\u{2028}' [226, 128, 168] and PS '\u{2029}' [226, 128, 169]
if !self.newline {
let lspf_mask = s.simd_eq(self.lsps).to_bitmask();
if lspf_mask > 0 {
let offset_by = lspf_mask.trailing_zeros() as usize;
if offset_by < chunk_offset {
let second = self.offset + offset_by + 1;
// Using scalar version `.get` instead of simd
// to avoid checking on the next chunk
// because this may be on the chunk boundary
if self.remaining.get(second) == Some(&128) {
let third = self.remaining.get(second + 1);
if matches!(third, Some(&168 | &169)) {
self.newline = true;
}
}
}
}
}
}
self.offset += chunk_offset;
}
}