From a51c7f9ba22b6c192b6de1f2a79447869ba7c65f Mon Sep 17 00:00:00 2001
From: Boshen
Date: Tue, 14 Feb 2023 10:31:19 +0800
Subject: [PATCH] perf(lexer): use portable-SIMD to speed up multiline comment scanning

---
 Cargo.lock                          |   1 +
 crates/oxc_parser/Cargo.toml        |   1 +
 crates/oxc_parser/src/lexer/mod.rs  |  23 ++++--
 crates/oxc_parser/src/lexer/simd.rs | 141 ++++++++++++++++++++++++++++
 crates/oxc_parser/src/lib.rs        |   2 +
 5 files changed, 161 insertions(+), 7 deletions(-)
 create mode 100644 crates/oxc_parser/src/lexer/simd.rs

diff --git a/Cargo.lock b/Cargo.lock
index 8c594a41b..15d0812d9 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -595,6 +595,7 @@ name = "oxc_parser"
 version = "0.0.0"
 dependencies = [
  "bitflags",
+ "lazy_static",
  "num-bigint",
  "oxc_allocator",
  "oxc_ast",
diff --git a/crates/oxc_parser/Cargo.toml b/crates/oxc_parser/Cargo.toml
index ea9da0b08..b23c9506c 100644
--- a/crates/oxc_parser/Cargo.toml
+++ b/crates/oxc_parser/Cargo.toml
@@ -19,3 +19,4 @@
 rustc-hash = { workspace = true }
 unicode-id-start = "1.0.3"
 num-bigint = "0.4.3"
+lazy_static = "1.4.0"
diff --git a/crates/oxc_parser/src/lexer/mod.rs b/crates/oxc_parser/src/lexer/mod.rs
index cfd80a52f..123973545 100644
--- a/crates/oxc_parser/src/lexer/mod.rs
+++ b/crates/oxc_parser/src/lexer/mod.rs
@@ -8,6 +8,7 @@
 mod constants;
 mod kind;
 mod number;
+mod simd;
 mod string_builder;
 mod token;
 
@@ -22,6 +23,7 @@ use number::{parse_big_int, parse_float, parse_int};
 use oxc_allocator::{Allocator, String};
 use oxc_ast::{Atom, SourceType, Span};
 use oxc_diagnostics::{Diagnostic, Diagnostics};
+use simd::MultiLineComment;
 use string_builder::AutoCow;
 pub use token::{RegExp, Token, TokenValue};
 
@@ -466,14 +468,21 @@ impl<'a> Lexer<'a> {
     /// Section 12.4 Multi Line Comment
     #[must_use]
     fn skip_multi_line_comment(&mut self) -> Kind {
-        while let Some(c) = self.current.chars.next() {
-            if c == '*' && self.next_eq('/') {
-                return Kind::MultiLineComment;
-            }
-            if is_line_terminator(c) {
-                self.current.token.is_on_new_line = true;
-            }
+        let remaining = self.remaining().as_bytes();
+        let state = MultiLineComment::new(remaining).simd(remaining);
+
+        // SAFETY: offset is computed to the boundary
+        self.current.chars =
+            unsafe { std::str::from_utf8_unchecked(&remaining[state.offset..]) }.chars();
+
+        if state.newline {
+            self.current.token.is_on_new_line = state.newline;
         }
+
+        if state.found {
+            return Kind::MultiLineComment;
+        }
+
         self.error(Diagnostic::UnterminatedMultiLineComment(self.unterminated_range()));
         Kind::Eof
     }
diff --git a/crates/oxc_parser/src/lexer/simd.rs b/crates/oxc_parser/src/lexer/simd.rs
new file mode 100644
index 000000000..452f0d965
--- /dev/null
+++ b/crates/oxc_parser/src/lexer/simd.rs
@@ -0,0 +1,141 @@
+//! Lexer methods using portable-SIMD.
+//!
+//! Input is scanned in chunks of `ELEMENTS` bytes. Each chunk is compared lane-wise
+//! against the bytes of interest and the results are reduced to integer bitmasks,
+//! where bit `i` being set means that byte `i` of the chunk matched.
+
+use std::simd::{Simd, SimdPartialEq, ToBitMask};
+
+use lazy_static::lazy_static;
+
+/// Number of bytes processed per SIMD vector
+const ELEMENTS: usize = 16;
+type SimdVec = Simd<u8, ELEMENTS>;
+
+lazy_static! {
+    static ref STAR: SimdVec = SimdVec::splat(b'*');
+    static ref SLASH: SimdVec = SimdVec::splat(b'/');
+    static ref LF: SimdVec = SimdVec::splat(b'\n');
+    static ref CR: SimdVec = SimdVec::splat(b'\r');
+    // 226 (0xE2) is the first UTF-8 byte of both LS '\u{2028}' and PS '\u{2029}'
+    static ref LSPS: SimdVec = SimdVec::splat(226);
+}
+
+/// Scanner state for locating the end of a multi-line comment
+#[derive(Debug)]
+pub struct MultiLineComment<'a> {
+    /// Total offset: bytes consumed so far, counted from the start of `remaining`
+    pub offset: usize,
+
+    /// Found multiline comment end '*/'?
+    pub found: bool,
+
+    /// Found newline inside the comment?
+    pub newline: bool,
+
+    /// Does the previous chunk have a '*' at the end?
+    /// For checking against the first '/' on the current chunk.
+    previous_star_at_end: bool,
+
+    /// Remaining char bytes from the lexer
+    remaining: &'a [u8],
+}
+
+impl<'a> MultiLineComment<'a> {
+    /// Create the initial state for scanning `remaining`
+    pub const fn new(remaining: &'a [u8]) -> Self {
+        Self { offset: 0, found: false, newline: false, previous_star_at_end: false, remaining }
+    }
+
+    /// Scan `remaining` chunk by chunk, stopping at the first '*/'.
+    ///
+    /// `remaining` must be the slice that was given to `new`: offsets are counted from
+    /// its start, while the LS / PS lookahead reads from the slice stored by `new`.
+    pub fn simd(mut self, remaining: &[u8]) -> Self {
+        debug_assert!(std::ptr::eq(self.remaining, remaining));
+
+        let (chunks, remainder) = remaining.as_chunks::<ELEMENTS>();
+
+        for chunk in chunks {
+            self.check(chunk, ELEMENTS);
+            if self.found {
+                return self;
+            }
+        }
+
+        if !remainder.is_empty() {
+            // Pad the last chunk with zeros to avoid the use of a scalar version
+            let mut chunk = [0; ELEMENTS];
+            let len = remainder.len();
+            chunk[..len].copy_from_slice(remainder);
+            self.check(&chunk, len);
+        }
+
+        self
+    }
+
+    /// Check and compute state for a single chunk
+    /// `chunk_len` can be < ELEMENTS for the last chunk
+    fn check(&mut self, chunk: &[u8; ELEMENTS], chunk_len: usize) {
+        let s = SimdVec::from_array(*chunk);
+
+        let star_mask = s.simd_eq(*STAR).to_bitmask();
+        let slash_mask = s.simd_eq(*SLASH).to_bitmask();
+
+        // Bit `i` is set when byte `i` is '/' and byte `i - 1` is '*'
+        let star_slash_mask = (star_mask << 1) & slash_mask;
+
+        // Number of bytes of this chunk that belong to the comment
+        let consumed = if self.previous_star_at_end && slash_mask & 1 > 0 {
+            // '*' ended the previous chunk and '/' starts this one. This is the
+            // earliest possible end, so it is tested before any '*/' inside the chunk
+            self.found = true;
+            1
+        } else if star_slash_mask > 0 {
+            self.found = true;
+            star_slash_mask.trailing_zeros() as usize + 1
+        } else {
+            // Is '*' at the end?
+            self.previous_star_at_end = star_mask & (1 << (ELEMENTS - 1)) > 0;
+            chunk_len
+        };
+
+        if !self.newline {
+            self.newline = self.has_newline(s, consumed);
+        }
+
+        self.offset += consumed;
+    }
+
+    /// Is there a line terminator within the first `consumed` bytes of the chunk `s`?
+    /// Must be called before `self.offset` is advanced past the chunk.
+    fn has_newline(&self, s: SimdVec, consumed: usize) -> bool {
+        // Look for '\n' and '\r'
+        let newline_mask = (s.simd_eq(*LF) | s.simd_eq(*CR)).to_bitmask();
+        if (newline_mask.trailing_zeros() as usize) < consumed {
+            return true;
+        }
+
+        // Look for LS '\u{2028}' [226, 128, 168] and PS '\u{2029}' [226, 128, 169].
+        // Other characters also start with 226, so every candidate is examined
+        let mut lead_mask = s.simd_eq(*LSPS).to_bitmask();
+        while lead_mask > 0 {
+            let pos = lead_mask.trailing_zeros() as usize;
+            if pos >= consumed {
+                break;
+            }
+            // Using scalar version `.get` instead of simd
+            // because the trailing bytes may be on the next chunk
+            let second = self.offset + pos + 1;
+            if self.remaining.get(second) == Some(&128)
+                && matches!(self.remaining.get(second + 1), Some(&168 | &169))
+            {
+                return true;
+            }
+            // Clear the lowest set bit to move on to the next candidate
+            lead_mask &= lead_mask - 1;
+        }
+
+        false
+    }
+}
diff --git a/crates/oxc_parser/src/lib.rs b/crates/oxc_parser/src/lib.rs
index e52e5091d..d0459cb0a 100644
--- a/crates/oxc_parser/src/lib.rs
+++ b/crates/oxc_parser/src/lib.rs
@@ -1,6 +1,8 @@
 //! Recursive Descent Parser for ECMAScript and TypeScript
 
 #![allow(clippy::wildcard_imports)] // allow for use `oxc_ast::ast::*`
+#![feature(portable_simd)]
+#![feature(slice_as_chunks)]
 
 mod cursor;
 mod list;