Revert "perf(lexer): use portable-SIMD to speed up multiline comment scanning"

This reverts commit a51c7f9ba2.
This commit is contained in:
Boshen 2023-02-17 14:50:42 +08:00
parent a51c7f9ba2
commit a347e3993e
No known key found for this signature in database
GPG key ID: 6AC90C77AAAA6ABC
5 changed files with 7 additions and 142 deletions

1
Cargo.lock generated
View file

@ -595,7 +595,6 @@ name = "oxc_parser"
version = "0.0.0"
dependencies = [
"bitflags",
"lazy_static",
"num-bigint",
"oxc_allocator",
"oxc_ast",

View file

@ -19,4 +19,3 @@ rustc-hash = { workspace = true }
unicode-id-start = "1.0.3"
num-bigint = "0.4.3"
lazy_static = "1.4.0"

View file

@ -8,7 +8,6 @@
mod constants;
mod kind;
mod number;
mod simd;
mod string_builder;
mod token;
@ -23,7 +22,6 @@ use number::{parse_big_int, parse_float, parse_int};
use oxc_allocator::{Allocator, String};
use oxc_ast::{Atom, SourceType, Span};
use oxc_diagnostics::{Diagnostic, Diagnostics};
use simd::MultiLineComment;
use string_builder::AutoCow;
pub use token::{RegExp, Token, TokenValue};
@ -468,21 +466,14 @@ impl<'a> Lexer<'a> {
/// Section 12.4 Multi Line Comment
// NOTE(review): this diff hunk is rendered without its `+`/`-` markers, so the body below
// interleaves lines from both sides of the revert and does not read as one function.
// The statements using `state` / `MultiLineComment` look like the removed SIMD path and the
// `while let` loop looks like the restored scalar path — confirm against the commit itself.
#[must_use]
fn skip_multi_line_comment(&mut self) -> Kind {
let remaining = self.remaining().as_bytes();
let state = MultiLineComment::new(remaining).simd(remaining);
// SAFETY: offset is computed to the boundary
self.current.chars =
unsafe { std::str::from_utf8_unchecked(&remaining[state.offset..]) }.chars();
if state.newline {
self.current.token.is_on_new_line = state.newline;
// Scalar scan: consume characters one at a time until `*/` is seen, flagging the current
// token as being on a new line whenever a line terminator is passed on the way.
while let Some(c) = self.current.chars.next() {
if c == '*' && self.next_eq('/') {
return Kind::MultiLineComment;
}
if is_line_terminator(c) {
self.current.token.is_on_new_line = true;
}
}
if state.found {
return Kind::MultiLineComment;
}
// No closing `*/` was found: report the unterminated comment and yield `Kind::Eof`.
self.error(Diagnostic::UnterminatedMultiLineComment(self.unterminated_range()));
Kind::Eof
}

View file

@ -1,122 +0,0 @@
//! Lexer methods using portable-SIMD
//! See:
//! * <https://github.com/rust-lang/portable-simd/blob/master/beginners-guide.md>
//! * <https://rapidjson.org/md_doc_internals.html#SkipwhitespaceWithSIMD>
//! * <https://lemire.me/blog/2017/01/20/how-quickly-can-you-remove-spaces-from-a-string>
use std::simd::{Simd, SimdPartialEq, ToBitMask};
use lazy_static::lazy_static;
/// Number of bytes processed per SIMD vector (one chunk).
const ELEMENTS: usize = 16;
/// A SIMD vector holding `ELEMENTS` bytes.
type SimdVec = Simd<u8, ELEMENTS>;
// Comparison vectors: each one has a single byte value repeated in every lane, so that a
// lane-wise equality test against a chunk yields the positions of that byte.
lazy_static! {
static ref STAR: SimdVec = SimdVec::splat(b'*');
static ref SLASH: SimdVec = SimdVec::splat(b'/');
static ref LF: SimdVec = SimdVec::splat(b'\n');
static ref CR: SimdVec = SimdVec::splat(b'\r');
// 226 (0xE2) is the first UTF-8 byte shared by LS '\u{2028}' [226, 128, 168]
// and PS '\u{2029}' [226, 128, 169].
static ref LSPS: SimdVec = SimdVec::splat(226);
}
/// Scan state used while searching a byte slice, chunk by chunk, for the `*/` that closes
/// a multi-line comment.
#[derive(Debug)]
pub struct MultiLineComment<'a> {
/// Number of bytes consumed so far, counted from the start of `remaining`.
/// Once `found` is set, this is the index just past the closing `*/`.
pub offset: usize,
/// Whether the closing `*/` of the multi-line comment was found.
pub found: bool,
/// Whether a line terminator was found inside the comment.
pub newline: bool,
/// Whether the previous chunk has a `*` as its last byte.
/// Used to match a `*/` that straddles two chunks against a leading `/` in the current chunk.
previous_star_at_end: bool,
/// Remaining source bytes handed over by the lexer; indexed with `offset`.
remaining: &'a [u8],
}
impl<'a> MultiLineComment<'a> {
    /// Creates a fresh scan state over `remaining` with nothing consumed yet.
    pub const fn new(remaining: &'a [u8]) -> Self {
        Self { offset: 0, found: false, newline: false, previous_star_at_end: false, remaining }
    }

    /// Scans `remaining` in chunks of `ELEMENTS` bytes until the closing `*/` is found or
    /// the input is exhausted, and returns the resulting state.
    ///
    /// On return `offset` is the index just past the closing `*/` when `found` is set,
    /// otherwise the length of `remaining`.
    ///
    /// NOTE(review): `check` indexes `self.remaining` with offsets derived from this
    /// argument, so `remaining` is presumably the same slice that was passed to `new`.
    pub fn simd(mut self, remaining: &[u8]) -> Self {
        let (chunks, remainder) = remaining.as_chunks::<ELEMENTS>();
        for chunk in chunks {
            self.check(chunk, chunk.len());
            if self.found {
                return self;
            }
        }
        if !remainder.is_empty() {
            // Zero-pad the tail to a full chunk so it can go through the same SIMD path
            // instead of a separate scalar version. The padding byte 0 matches none of the
            // bytes being searched for.
            let mut chunk = [0; ELEMENTS];
            let len = remainder.len();
            chunk[..len].copy_from_slice(remainder);
            self.check(&chunk, len);
        }
        self
    }

    /// Checks a single chunk and updates the scan state.
    ///
    /// `chunk` must hold `ELEMENTS` bytes; `chunk_len` is the number of real (non-padding)
    /// bytes and can be smaller than `ELEMENTS` for the last chunk.
    fn check(&mut self, chunk: &[u8], chunk_len: usize) {
        let s = SimdVec::from_slice(chunk);
        let star_mask = s.simd_eq(*STAR).to_bitmask();
        let slash_mask = s.simd_eq(*SLASH).to_bitmask();

        // Bit `i` is set when byte `i` is '/' and byte `i - 1` is '*'.
        let star_slash_mask = (star_mask << 1) & slash_mask;
        // Position of the first such '/'; equals the mask width (past every lane) if none.
        let star_slash_pos = star_slash_mask.trailing_zeros();

        // `advance`: how many bytes of this chunk are consumed.
        // `limit`: bytes at positions below this one are still inside the comment.
        let (advance, limit) = if self.previous_star_at_end && slash_mask & 1 > 0 {
            // The terminator straddles the chunk boundary. This has to be tested before the
            // in-chunk match, otherwise a later `*/` in the same chunk would be taken as
            // the end of the comment. Nothing in this chunk precedes the terminator.
            self.found = true;
            (1, 0)
        } else if star_slash_mask > 0 {
            self.found = true;
            (star_slash_pos as usize + 1, star_slash_pos)
        } else {
            // No terminator here; remember whether the chunk ends with '*'.
            self.previous_star_at_end = star_mask & 1 << (ELEMENTS - 1) > 0;
            (chunk_len, star_slash_pos)
        };

        if !self.newline {
            // Look for '\n' and '\r' before the terminator.
            let newline_mask = (s.simd_eq(*LF) | s.simd_eq(*CR)).to_bitmask();
            self.newline = newline_mask.trailing_zeros() < limit;

            // Look for LS '\u{2028}' [226, 128, 168] and PS '\u{2029}' [226, 128, 169].
            if !self.newline {
                // Every 226 byte before the terminator is a candidate, not just the first
                // one: other characters (e.g. '\u{2014}') start with the same byte.
                let mut lead_mask = s.simd_eq(*LSPS).to_bitmask();
                while lead_mask > 0 {
                    let lead_pos = lead_mask.trailing_zeros();
                    if lead_pos >= limit {
                        break;
                    }
                    // Read the two following bytes with scalar `.get` on the whole input
                    // rather than from the SIMD vector, because they may lie in the next
                    // chunk.
                    let second = self.offset + lead_pos as usize + 1;
                    if self.remaining.get(second) == Some(&128)
                        && matches!(self.remaining.get(second + 1), Some(&168 | &169))
                    {
                        self.newline = true;
                        break;
                    }
                    // Clear the lowest set bit and move on to the next candidate.
                    lead_mask &= lead_mask - 1;
                }
            }
        }

        self.offset += advance;
    }
}

View file

@ -1,8 +1,6 @@
//! Recursive Descent Parser for ECMAScript and TypeScript
#![allow(clippy::wildcard_imports)] // allow for use `oxc_ast::ast::*`
#![feature(portable_simd)]
#![feature(slice_as_chunks)]
mod cursor;
mod list;