diff --git a/crates/oxc_parser/src/lexer/mod.rs b/crates/oxc_parser/src/lexer/mod.rs index bf3ba193b..23e8c28d4 100644 --- a/crates/oxc_parser/src/lexer/mod.rs +++ b/crates/oxc_parser/src/lexer/mod.rs @@ -23,7 +23,7 @@ use number::{parse_big_int, parse_float, parse_int}; use oxc_allocator::{Allocator, String}; use oxc_ast::{Atom, SourceType, Span}; use oxc_diagnostics::{Diagnostic, Diagnostics}; -use simd::SkipWhitespace; +use simd::{SkipMultilineComment, SkipWhitespace}; use string_builder::AutoCow; pub use token::{RegExp, Token, TokenValue}; @@ -397,13 +397,19 @@ impl<'a> Lexer<'a> { kind } '/' => { - if self.next_eq('/') { - self.skip_single_line_comment() - } else if self.next_eq('*') { - self.skip_multi_line_comment() - } else { - // regex is handled separately, see `next_regex` - self.read_slash() + match self.peek() { + '/' => { + self.current.chars.next(); + self.skip_single_line_comment() + } + '*' => { + self.current.chars.next(); + self.skip_multi_line_comment() + } + _ => { + // regex is handled separately, see `next_regex` + self.read_slash() + } } } '`' => self.read_template_literal(Kind::TemplateHead, Kind::NoSubstitutionTemplate), @@ -490,16 +496,24 @@ impl<'a> Lexer<'a> { /// Section 12.4 Multi Line Comment #[must_use] fn skip_multi_line_comment(&mut self) -> Kind { - while let Some(c) = self.current.chars.next() { - if c == '*' && self.next_eq('/') { - return Kind::MultiLineComment; - } - if is_line_terminator(c) { - self.current.token.is_on_new_line = true; - } + let remaining = self.remaining().as_bytes(); + let newline = self.current.token.is_on_new_line; + let state = SkipMultilineComment::new(newline, remaining).simd(remaining); + + // SAFETY: offset is computed to the boundary + self.current.chars = + unsafe { std::str::from_utf8_unchecked(&remaining[state.offset..]) }.chars(); + + if state.newline && !newline { + self.current.token.is_on_new_line = true; } - self.error(Diagnostic::UnterminatedMultiLineComment(self.unterminated_range())); - Kind::Eof + + if !state.found { + self.error(Diagnostic::UnterminatedMultiLineComment(self.unterminated_range())); + return Kind::Eof; + } + + Kind::MultiLineComment } /// Section 12.6.1 Identifier Names diff --git a/crates/oxc_parser/src/lexer/simd.rs b/crates/oxc_parser/src/lexer/simd.rs index a10d45542..fd046922b 100644 --- a/crates/oxc_parser/src/lexer/simd.rs +++ b/crates/oxc_parser/src/lexer/simd.rs @@ -9,7 +9,6 @@ use std::simd::{Simd, SimdPartialEq, ToBitMask}; const ELEMENTS: usize = 16; type SimdVec = Simd; -#[derive(Debug)] pub struct SkipWhitespace { /// Total offset pub offset: usize, @@ -84,3 +83,118 @@ impl SkipWhitespace { self.offset += advance_by as usize; } } + +pub struct SkipMultilineComment<'a> { + /// Total offset + pub offset: usize, + + /// Found multiline comment end '*/'? + pub found: bool, + + /// Found newline inside the comment? + pub newline: bool, + + /// Remaining char bytes from the lexer + remaining: &'a [u8], + + star: SimdVec, + slash: SimdVec, + lf: SimdVec, + cr: SimdVec, + lsps: SimdVec, +} + +impl<'a> SkipMultilineComment<'a> { + pub fn new(newline: bool, remaining: &'a [u8]) -> Self { + Self { + offset: 0, + found: false, + newline, + remaining, + star: SimdVec::splat(b'*'), + slash: SimdVec::splat(b'/'), + lf: SimdVec::splat(b'\n'), + cr: SimdVec::splat(b'\r'), + lsps: SimdVec::splat(226), + } + } + + pub fn simd(mut self, remaining: &[u8]) -> Self { + let (chunks, remainder) = remaining.as_chunks::(); + + for chunk in chunks { + self.check(chunk, chunk.len()); + if self.found { + return self; + } + } + + if !remainder.is_empty() { + // Align the last chunk for avoiding the use of a scalar version + let mut chunk = [0; ELEMENTS]; + let len = remainder.len(); + chunk[..len].copy_from_slice(remainder); + self.check(&chunk, len); + } + + self + } + + /// Check and compute state for a single chunk + /// `chunk_len` can be < ELEMENTS for the last chunk + fn check(&mut self, chunk: &[u8], chunk_len: usize) { + let s = SimdVec::from_slice(chunk); + + let any_star = s.simd_eq(self.star); + let any_slash = s.simd_eq(self.slash); + let star_mask = any_star.to_bitmask(); + let slash_mask = any_slash.to_bitmask(); + + // Get the offset of '/' if '*' is immediately followed by '/' + let star_slash_mask = (star_mask << 1) & slash_mask; + let star_slash_pos = star_slash_mask.trailing_zeros(); + + let chunk_offset = if star_slash_mask > 0 { + self.found = true; + star_slash_pos as usize + 1 + } else { + // Is '*' at the end? + if star_mask & 1 << (ELEMENTS - 1) > 0 + && self.remaining.get(self.offset + ELEMENTS) == Some(&b'/') + { + self.found = true; + ELEMENTS + 1 + } else { + chunk_len + } + }; + + // Look for '\n' and '\r' + if !self.newline { + let any_newline = s.simd_eq(self.lf) | s.simd_eq(self.cr); + let newline_mask = any_newline.to_bitmask(); + self.newline = (newline_mask.trailing_zeros() as usize) < chunk_offset; + // Look for LS '\u{2028}' [226, 128, 168] and PS '\u{2029}' [226, 128, 169] + if !self.newline { + let lspf_mask = s.simd_eq(self.lsps).to_bitmask(); + if lspf_mask > 0 { + let offset_by = lspf_mask.trailing_zeros() as usize; + if offset_by < chunk_offset { + let second = self.offset + offset_by + 1; + // Using scalar version `.get` instead of simd + // to avoid checking on the next chunk + // because this may be on the chunk boundary + if self.remaining.get(second) == Some(&128) { + let third = self.remaining.get(second + 1); + if matches!(third, Some(&168 | &169)) { + self.newline = true; + } + } + } + } + } + } + + self.offset += chunk_offset; + } +}