mirror of
https://github.com/danbulant/oxc
synced 2026-05-24 12:21:58 +00:00
perf(lexer): use portable-SIMD to speed up multiline comment scanning
This commit is contained in:
parent
3fee6a5184
commit
a51c7f9ba2
5 changed files with 142 additions and 7 deletions
1
Cargo.lock
generated
1
Cargo.lock
generated
|
|
@ -595,6 +595,7 @@ name = "oxc_parser"
|
|||
version = "0.0.0"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"lazy_static",
|
||||
"num-bigint",
|
||||
"oxc_allocator",
|
||||
"oxc_ast",
|
||||
|
|
|
|||
|
|
@ -19,3 +19,4 @@ rustc-hash = { workspace = true }
|
|||
|
||||
unicode-id-start = "1.0.3"
|
||||
num-bigint = "0.4.3"
|
||||
lazy_static = "1.4.0"
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@
|
|||
mod constants;
|
||||
mod kind;
|
||||
mod number;
|
||||
mod simd;
|
||||
mod string_builder;
|
||||
mod token;
|
||||
|
||||
|
|
@ -22,6 +23,7 @@ use number::{parse_big_int, parse_float, parse_int};
|
|||
use oxc_allocator::{Allocator, String};
|
||||
use oxc_ast::{Atom, SourceType, Span};
|
||||
use oxc_diagnostics::{Diagnostic, Diagnostics};
|
||||
use simd::MultiLineComment;
|
||||
use string_builder::AutoCow;
|
||||
pub use token::{RegExp, Token, TokenValue};
|
||||
|
||||
|
|
@ -466,14 +468,21 @@ impl<'a> Lexer<'a> {
|
|||
/// Section 12.4 Multi Line Comment
|
||||
#[must_use]
|
||||
fn skip_multi_line_comment(&mut self) -> Kind {
|
||||
while let Some(c) = self.current.chars.next() {
|
||||
if c == '*' && self.next_eq('/') {
|
||||
return Kind::MultiLineComment;
|
||||
}
|
||||
if is_line_terminator(c) {
|
||||
self.current.token.is_on_new_line = true;
|
||||
}
|
||||
let remaining = self.remaining().as_bytes();
|
||||
let state = MultiLineComment::new(remaining).simd(remaining);
|
||||
|
||||
// SAFETY: offset is computed to the boundary
|
||||
self.current.chars =
|
||||
unsafe { std::str::from_utf8_unchecked(&remaining[state.offset..]) }.chars();
|
||||
|
||||
if state.newline {
|
||||
self.current.token.is_on_new_line = state.newline;
|
||||
}
|
||||
|
||||
if state.found {
|
||||
return Kind::MultiLineComment;
|
||||
}
|
||||
|
||||
self.error(Diagnostic::UnterminatedMultiLineComment(self.unterminated_range()));
|
||||
Kind::Eof
|
||||
}
|
||||
|
|
|
|||
122
crates/oxc_parser/src/lexer/simd.rs
Normal file
122
crates/oxc_parser/src/lexer/simd.rs
Normal file
|
|
@ -0,0 +1,122 @@
|
|||
//! Lexer methods using portable-SIMD
|
||||
//! See:
|
||||
//! * <https://github.com/rust-lang/portable-simd/blob/master/beginners-guide.md>
|
||||
//! * <https://rapidjson.org/md_doc_internals.html#SkipwhitespaceWithSIMD>
|
||||
//! * <https://lemire.me/blog/2017/01/20/how-quickly-can-you-remove-spaces-from-a-string>
|
||||
|
||||
use std::simd::{Simd, SimdPartialEq, ToBitMask};
|
||||
|
||||
use lazy_static::lazy_static;
|
||||
|
||||
const ELEMENTS: usize = 16;
|
||||
type SimdVec = Simd<u8, ELEMENTS>;
|
||||
|
||||
lazy_static! {
|
||||
static ref STAR: SimdVec = SimdVec::splat(b'*');
|
||||
static ref SLASH: SimdVec = SimdVec::splat(b'/');
|
||||
static ref LF: SimdVec = SimdVec::splat(b'\n');
|
||||
static ref CR: SimdVec = SimdVec::splat(b'\r');
|
||||
static ref LSPS: SimdVec = SimdVec::splat(226);
|
||||
}
|
||||
|
||||
/// Scanner state for locating the end of a multi-line comment.
///
/// The state is created with `new`, advanced by `simd`, and then read back by
/// the lexer through the public fields.
#[derive(Debug)]
pub struct MultiLineComment<'a> {
    /// Number of bytes consumed so far, counted from the start of `remaining`.
    /// When `found` is set this is the index just past the closing `*/`.
    pub offset: usize,

    /// Was the comment terminator `*/` found?
    pub found: bool,

    /// Was a line terminator (LF, CR, LS or PS) seen inside the comment?
    pub newline: bool,

    /// Did the previous chunk have a `*` as its last byte?
    /// Needed to recognise a `*/` that is split across two chunks: the `/`
    /// is then the first byte of the current chunk.
    previous_star_at_end: bool,

    /// The bytes handed over by the lexer; used for scalar look-ahead that
    /// may cross a chunk boundary.
    remaining: &'a [u8],
}
|
||||
|
||||
impl<'a> MultiLineComment<'a> {
|
||||
pub const fn new(remaining: &'a [u8]) -> Self {
|
||||
Self { offset: 0, found: false, newline: false, previous_star_at_end: false, remaining }
|
||||
}
|
||||
|
||||
pub fn simd(mut self, remaining: &[u8]) -> Self {
|
||||
let (chunks, remainder) = remaining.as_chunks::<ELEMENTS>();
|
||||
|
||||
for chunk in chunks {
|
||||
self.check(chunk, chunk.len());
|
||||
if self.found {
|
||||
return self;
|
||||
}
|
||||
}
|
||||
|
||||
if !remainder.is_empty() {
|
||||
// Align the last chunk for avoiding the use of a scalar version
|
||||
let mut chunk = [0; ELEMENTS];
|
||||
let len = remainder.len();
|
||||
chunk[..len].copy_from_slice(remainder);
|
||||
self.check(&chunk, len);
|
||||
}
|
||||
|
||||
self
|
||||
}
|
||||
|
||||
/// Check and compute state for a single chunk
|
||||
/// `chunk_len` can be < ELEMENTS for the last chunk
|
||||
fn check(&mut self, chunk: &[u8], chunk_len: usize) {
|
||||
let s = SimdVec::from_slice(chunk);
|
||||
|
||||
let any_star = s.simd_eq(*STAR);
|
||||
let any_slash = s.simd_eq(*SLASH);
|
||||
let star_mask = any_star.to_bitmask();
|
||||
let slash_mask = any_slash.to_bitmask();
|
||||
|
||||
// Get the offset of '/' if '*' is immediately followed by '/'
|
||||
let star_slash_mask = (star_mask << 1) & slash_mask;
|
||||
let star_slash_pos = star_slash_mask.trailing_zeros();
|
||||
|
||||
let offset_total = if star_slash_mask > 0 {
|
||||
self.found = true;
|
||||
star_slash_pos as usize + 1
|
||||
} else if self.previous_star_at_end && slash_mask & 1 > 0 {
|
||||
// at boundary
|
||||
self.found = true;
|
||||
1
|
||||
} else {
|
||||
// Is '*' at the end?
|
||||
self.previous_star_at_end = star_mask & 1 << (ELEMENTS - 1) > 0;
|
||||
chunk_len
|
||||
};
|
||||
|
||||
// Look for '\n' and '\r'
|
||||
if !self.newline {
|
||||
let any_newline = s.simd_eq(*LF) | s.simd_eq(*CR);
|
||||
let newline_mask = any_newline.to_bitmask();
|
||||
self.newline = newline_mask.trailing_zeros() < star_slash_pos;
|
||||
// Look for LS '\u{2028}' [226, 128, 168] and PS '\u{2029}' [226, 128, 169]
|
||||
if !self.newline {
|
||||
let lspf_mask = s.simd_eq(*LSPS).to_bitmask();
|
||||
if lspf_mask > 0 {
|
||||
let offset_by = lspf_mask.trailing_zeros();
|
||||
if offset_by < star_slash_pos {
|
||||
let second = self.offset + offset_by as usize + 1;
|
||||
// Using scalar version `.get` instead of simd
|
||||
// to avoid checking on the next chunk
|
||||
// because this may be on the chunk boundary
|
||||
if self.remaining.get(second) == Some(&128) {
|
||||
let third = self.remaining.get(second + 1);
|
||||
if matches!(third, Some(&168 | &169)) {
|
||||
self.newline = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
self.offset += offset_total;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,6 +1,8 @@
|
|||
//! Recursive Descent Parser for ECMAScript and TypeScript
|
||||
|
||||
#![allow(clippy::wildcard_imports)] // allow for use `oxc_ast::ast::*`
|
||||
#![feature(portable_simd)]
|
||||
#![feature(slice_as_chunks)]
|
||||
|
||||
mod cursor;
|
||||
mod list;
|
||||
|
|
|
|||
Loading…
Reference in a new issue