feat(lexer): use portable-SIMD to speed up multiline comment scanning

2026-05-24 12:21:58 +00:00 · 2023-02-20 21:30:40 +08:00 · 2023-02-20 21:30:40 +08:00 · 83c3f34af2
commit 83c3f34af2
parent 4fc112f7dc
2 changed files with 146 additions and 18 deletions
--- a/crates/oxc_parser/src/lexer/mod.rs
+++ b/crates/oxc_parser/src/lexer/mod.rs
@ -23,7 +23,7 @@ use number::{parse_big_int, parse_float, parse_int};
 use oxc_allocator::{Allocator, String};
 use oxc_ast::{Atom, SourceType, Span};
 use oxc_diagnostics::{Diagnostic, Diagnostics};
-use simd::SkipWhitespace;
+use simd::{SkipMultilineComment, SkipWhitespace};
 use string_builder::AutoCow;
 pub use token::{RegExp, Token, TokenValue};

@ -397,13 +397,19 @@ impl<'a> Lexer<'a> {
                kind
            }
            '/' => {
-                if self.next_eq('/') {
-                    self.skip_single_line_comment()
-                } else if self.next_eq('*') {
-                    self.skip_multi_line_comment()
-                } else {
-                    // regex is handled separately, see `next_regex`
-                    self.read_slash()
+                match self.peek() {
+                    '/' => {
+                        self.current.chars.next();
+                        self.skip_single_line_comment()
+                    }
+                    '*' => {
+                        self.current.chars.next();
+                        self.skip_multi_line_comment()
+                    }
+                    _ => {
+                        // regex is handled separately, see `next_regex`
+                        self.read_slash()
+                    }
                }
            }
            '`' => self.read_template_literal(Kind::TemplateHead, Kind::NoSubstitutionTemplate),
@ -490,16 +496,24 @@ impl<'a> Lexer<'a> {
    /// Section 12.4 Multi Line Comment
    #[must_use]
    fn skip_multi_line_comment(&mut self) -> Kind {
-        while let Some(c) = self.current.chars.next() {
-            if c == '*' && self.next_eq('/') {
-                return Kind::MultiLineComment;
-            }
-            if is_line_terminator(c) {
-                self.current.token.is_on_new_line = true;
-            }
+        let remaining = self.remaining().as_bytes();
+        let newline = self.current.token.is_on_new_line;
+        let state = SkipMultilineComment::new(newline, remaining).simd(remaining);
+
+        // SAFETY: offset is `remaining.len()` or just past an ASCII `/`, so it is a char boundary
+        self.current.chars =
+            unsafe { std::str::from_utf8_unchecked(&remaining[state.offset..]) }.chars();
+
+        if state.newline && !newline {
+            self.current.token.is_on_new_line = true;
        }
-        self.error(Diagnostic::UnterminatedMultiLineComment(self.unterminated_range()));
-        Kind::Eof
+
+        if !state.found {
+            self.error(Diagnostic::UnterminatedMultiLineComment(self.unterminated_range()));
+            return Kind::Eof;
+        }
+
+        Kind::MultiLineComment
    }

    /// Section 12.6.1 Identifier Names
--- a/crates/oxc_parser/src/lexer/simd.rs
+++ b/crates/oxc_parser/src/lexer/simd.rs
@ -9,7 +9,6 @@ use std::simd::{Simd, SimdPartialEq, ToBitMask};
 const ELEMENTS: usize = 16;
 type SimdVec = Simd<u8, ELEMENTS>;

-#[derive(Debug)]
 pub struct SkipWhitespace {
    /// Total offset
    pub offset: usize,
@ -84,3 +83,118 @@ impl SkipWhitespace {
        self.offset += advance_by as usize;
    }
 }
+
+/// Scanner state for skipping a multi-line comment in `ELEMENTS`-byte SIMD chunks.
+pub struct SkipMultilineComment<'a> {
+    /// Number of bytes consumed so far
+    pub offset: usize,
+
+    /// Whether the closing `*/` has been seen
+    pub found: bool,
+
+    /// Whether a line terminator has been seen (seeded by the caller through `new`)
+    pub newline: bool,
+
+    /// The byte slice handed over by the lexer, used for look-ahead across chunks
+    remaining: &'a [u8],
+
+    /// Every lane holds `b'\n'`
+    lf: SimdVec,
+    /// Every lane holds `b'\r'`
+    cr: SimdVec,
+    /// Every lane holds 226, the first UTF-8 byte of LS `'\u{2028}'` and PS `'\u{2029}'`
+    lsps: SimdVec,
+    /// Every lane holds `b'*'`
+    star: SimdVec,
+    /// Every lane holds `b'/'`
+    slash: SimdVec,
+}
+
+impl<'a> SkipMultilineComment<'a> {
+    /// Creates a scanner over `remaining`.
+    ///
+    /// `newline` seeds the `newline` flag; when it is already `true` the search
+    /// for line terminators is skipped.
+    pub fn new(newline: bool, remaining: &'a [u8]) -> Self {
+        Self {
+            offset: 0,
+            found: false,
+            newline,
+            remaining,
+            star: SimdVec::splat(b'*'),
+            slash: SimdVec::splat(b'/'),
+            lf: SimdVec::splat(b'\n'),
+            cr: SimdVec::splat(b'\r'),
+            // 226 is the first UTF-8 byte shared by LS and PS
+            lsps: SimdVec::splat(226),
+        }
+    }
+
+    /// Scans `remaining` chunk by chunk until `*/` is found or the input ends.
+    ///
+    /// `remaining` must be the slice that was passed to `new`: the look-ahead in
+    /// `check` indexes the stored slice with offsets accumulated while walking
+    /// this one.
+    pub fn simd(mut self, remaining: &[u8]) -> Self {
+        let (chunks, remainder) = remaining.as_chunks::<ELEMENTS>();
+
+        for chunk in chunks {
+            self.check(chunk, chunk.len());
+            if self.found {
+                return self;
+            }
+        }
+
+        if !remainder.is_empty() {
+            // Zero-pad the tail to a full vector to avoid a scalar version;
+            // zero bytes match none of the searched characters.
+            let mut chunk = [0; ELEMENTS];
+            let len = remainder.len();
+            chunk[..len].copy_from_slice(remainder);
+            self.check(&chunk, len);
+        }
+
+        self
+    }
+
+    /// Check and compute state for a single chunk.
+    /// `chunk_len` can be < ELEMENTS for the last (zero-padded) chunk.
+    ///
+    /// Advances `offset` past the closing `*/` when it is found, otherwise by
+    /// `chunk_len`.
+    fn check(&mut self, chunk: &[u8], chunk_len: usize) {
+        let s = SimdVec::from_slice(chunk);
+
+        let star_mask = s.simd_eq(self.star).to_bitmask();
+        let slash_mask = s.simd_eq(self.slash).to_bitmask();
+
+        // Bit `i` is set when byte `i` is '/' and byte `i - 1` is '*'
+        let star_slash_mask = (star_mask << 1) & slash_mask;
+
+        let chunk_offset = if star_slash_mask != 0 {
+            self.found = true;
+            star_slash_mask.trailing_zeros() as usize + 1
+        } else if star_mask & (1 << (ELEMENTS - 1)) != 0
+            && self.remaining.get(self.offset + ELEMENTS) == Some(&b'/')
+        {
+            // '*' ends this chunk and '/' starts the next one
+            self.found = true;
+            ELEMENTS + 1
+        } else {
+            chunk_len
+        };
+
+        if !self.newline {
+            self.newline = self.has_line_terminator(s, chunk_offset);
+        }
+
+        self.offset += chunk_offset;
+    }
+
+    /// Returns `true` if a line terminator starts within the first `limit` bytes of `s`.
+    ///
+    /// Must be called before `offset` is advanced past the chunk `s`.
+    fn has_line_terminator(&self, s: SimdVec, limit: usize) -> bool {
+        // Look for '\n' and '\r'
+        let newline_mask = (s.simd_eq(self.lf) | s.simd_eq(self.cr)).to_bitmask();
+        // `trailing_zeros` of an empty mask is its bit width, which is below
+        // `limit` when `limit == ELEMENTS + 1`; so test for emptiness explicitly.
+        if newline_mask != 0 && (newline_mask.trailing_zeros() as usize) < limit {
+            return true;
+        }
+
+        // Look for LS '\u{2028}' [226, 128, 168] and PS '\u{2029}' [226, 128, 169].
+        // Every 226 byte is a candidate: an earlier one may start another character.
+        let mut lsps_mask = s.simd_eq(self.lsps).to_bitmask();
+        while lsps_mask != 0 {
+            let pos = lsps_mask.trailing_zeros() as usize;
+            if pos >= limit {
+                break;
+            }
+            // Using scalar `.get` instead of simd because the second and third
+            // bytes may lie beyond the chunk boundary
+            let second = self.offset + pos + 1;
+            if self.remaining.get(second) == Some(&128)
+                && matches!(self.remaining.get(second + 1), Some(&168 | &169))
+            {
+                return true;
+            }
+            // Clear the lowest set bit and try the next candidate
+            lsps_mask &= lsps_mask - 1;
+        }
+
+        false
+    }
+}