perf(lexer): use portable-SIMD to speed up whitespace scanning

closes #13
2026-05-24 12:21:58 +00:00 · 2023-02-14 10:31:19 +08:00 · 2023-02-14 10:31:19 +08:00 · ab68cea0b7
commit ab68cea0b7
parent 4edd3f75ce
4 changed files with 229 additions and 121 deletions
--- a/crates/oxc_parser/src/lexer/constants.rs
+++ b/crates/oxc_parser/src/lexer/constants.rs
@ -31,10 +31,6 @@ pub const FF: char = '\u{c}';
 /// U+00A0 NON-BREAKING SPACE, abbreviated <NBSP>.
 pub const NBSP: char = '\u{a0}';

-pub const fn is_regular_whitespace(c: char) -> bool {
-    matches!(c, ' ' | '\t')
-}
-
 pub const fn is_irregular_whitespace(c: char) -> bool {
    matches!(
        c,
--- a/crates/oxc_parser/src/lexer/mod.rs
+++ b/crates/oxc_parser/src/lexer/mod.rs
@ -8,6 +8,7 @@
 mod constants;
 mod kind;
 mod number;
+mod simd;
 mod string_builder;
 mod token;

@ -15,13 +16,14 @@ use std::{collections::VecDeque, str::Chars};

 use constants::{
    is_identifier_part, is_identifier_start, is_irregular_line_terminator, is_irregular_whitespace,
-    is_line_terminator, is_regular_line_terminator, is_regular_whitespace, EOF, SINGLE_CHAR_TOKENS,
+    is_line_terminator, EOF, SINGLE_CHAR_TOKENS,
 };
 pub use kind::Kind;
 use number::{parse_big_int, parse_float, parse_int};
 use oxc_allocator::{Allocator, String};
 use oxc_ast::{Atom, SourceType, Span};
 use oxc_diagnostics::{Diagnostic, Diagnostics};
+use simd::SkipWhitespace;
 use string_builder::AutoCow;
 pub use token::{RegExp, Token, TokenValue};

@ -321,7 +323,6 @@ impl<'a> Lexer<'a> {

    /// Read each char and set the current token
    /// Whitespace and line terminators are skipped
-    #[allow(clippy::too_many_lines)]
    fn read_next_token(&mut self) -> Kind {
        self.current.token.start = self.offset();

@ -329,126 +330,149 @@ impl<'a> Lexer<'a> {
            return self.read_jsx_child();
        }

-        let mut builder = AutoCow::new(self);
-
-        while let Some(c) = self.current.chars.next() {
-            // fast path for single character tokens
-            // '{'  '}'  '('  ')'  '['  ']'  ';' ',' ':' '~'
-            let size = c as usize;
-            if size <= 127 {
-                let kind = SINGLE_CHAR_TOKENS[size];
-                if kind != Kind::Undetermined {
-                    return kind;
-                }
-            }
-
-            // NOTE: matching order is significant here, by real world occurrences
-            // see https://blog.mozilla.org/nnethercote/2011/07/01/faster-javascript-parsing/
-            // > the rough order of frequency for different token kinds is as follows:
-            // identifiers/keywords, ‘.’, ‘=’, strings, decimal numbers, ‘:’, ‘+’, hex/octal numbers, and then everything else
-            let kind = match c {
-                // fast path for white space
-                c if is_regular_whitespace(c) => Kind::WhiteSpace,
-                // fast path for identifiers
-                c if c.is_ascii_alphabetic() => {
-                    builder.push_matching(c);
-                    self.identifier_name_or_keyword(builder)
-                }
-                '.' => {
-                    let kind = self.read_dot(&mut builder);
-                    if kind.is_number() {
-                        self.set_numeric_value(kind, builder.finish(self));
-                    }
-                    kind
-                }
-                '=' => self.read_equal(),
-                '"' | '\'' => {
-                    if self.context == LexerContext::JsxAttributeValue {
-                        self.read_jsx_string_literal(c)
-                    } else {
-                        self.read_string_literal(c)
-                    }
-                }
-                '1'..='9' => {
-                    let kind = self.decimal_literal_after_first_digit(&mut builder);
-                    self.set_numeric_value(kind, builder.finish(self));
-                    kind
-                }
-                '+' => self.read_plus(),
-                '-' => {
-                    self.read_minus().map_or_else(|| self.skip_single_line_comment(), |kind| kind)
-                }
-                '0' => {
-                    let kind = self.read_zero(&mut builder);
-                    self.set_numeric_value(kind, builder.finish(self));
-                    kind
-                }
-                c if is_regular_line_terminator(c) => {
-                    self.current.token.is_on_new_line = true;
-                    Kind::NewLine
-                }
-                '/' => {
-                    if self.next_eq('/') {
-                        self.skip_single_line_comment()
-                    } else if self.next_eq('*') {
-                        self.skip_multi_line_comment()
-                    } else {
-                        // regex is handled separately, see `next_regex`
-                        self.read_slash()
-                    }
-                }
-                '`' => self.read_template_literal(Kind::TemplateHead, Kind::NoSubstitutionTemplate),
-                '!' => self.read_exclamation(),
-                '%' => self.read_percent(),
-                '*' => self.read_star(),
-                '&' => self.read_ampersand(),
-                '|' => self.read_pipe(),
-                '?' => self.read_question(),
-                '<' => self
-                    .read_left_angle()
-                    .map_or_else(|| self.skip_single_line_comment(), |kind| kind),
-                '^' => self.read_caret(),
-                '#' => {
-                    // https://tc39.es/proposal-hashbang/out.html
-                    // HashbangComment ::
-                    //     `#!` SingleLineCommentChars?
-                    if self.current.token.start == 0 && self.next_eq('!') {
-                        self.skip_single_line_comment()
-                    } else {
-                        builder.get_mut_string_without_current_ascii_char(self);
-                        self.private_identifier(builder)
-                    }
-                }
-                '\\' => {
-                    builder.force_allocation_without_current_ascii_char(self);
-                    self.identifier_unicode_escape_sequence(&mut builder, true);
-                    self.identifier_name_or_keyword(builder)
-                }
-                c if is_identifier_start(c) => {
-                    builder.push_matching(c);
-                    self.identifier_name_or_keyword(builder)
-                }
-                c if is_irregular_whitespace(c) => Kind::WhiteSpace,
-                c if is_irregular_line_terminator(c) => {
-                    self.current.token.is_on_new_line = true;
-                    Kind::NewLine
-                }
-                _ => {
-                    self.error(Diagnostic::InvalidCharacter(c, self.unterminated_range()));
-                    Kind::Undetermined
-                }
-            };
-
-            if !kind.is_trivia() {
-                return kind;
-            }
+        loop {
+            self.skip_whitespace();

            let offset = self.offset();
-            builder = AutoCow::new(self);
            self.current.token.start = offset;
+            let builder = AutoCow::new(self);
+
+            if let Some(c) = self.current.chars.next() {
+                let kind = self.match_char(c, builder);
+                if !kind.is_trivia() {
+                    return kind;
+                }
+            } else {
+                return Kind::Eof;
+            }
+        }
+    }
+
+    #[allow(clippy::too_many_lines)]
+    fn match_char(&mut self, c: char, mut builder: AutoCow<'a>) -> Kind {
+        // Fast path: an ASCII char that is a complete token by itself is resolved with one table lookup
+        // '{'  '}'  '('  ')'  '['  ']'  ';' ',' ':' '~'
+        let size = c as usize;
+        if size <= 127 {
+            let kind = SINGLE_CHAR_TOKENS[size];
+            if kind != Kind::Undetermined {
+                return kind;
+            }
+        }
+        // NOTE: the order of the match arms is significant; they are sorted by real-world frequency
+        // see https://blog.mozilla.org/nnethercote/2011/07/01/faster-javascript-parsing/
+        // > the rough order of frequency for different token kinds is as follows:
+        // identifiers/keywords, ‘.’, ‘=’, strings, decimal numbers, ‘:’, ‘+’, hex/octal numbers, and then everything else
+        match c {
+            // fast path for identifiers starting with an ASCII letter; other starts hit `is_identifier_start` below
+            c if c.is_ascii_alphabetic() => {
+                builder.push_matching(c);
+                self.identifier_name_or_keyword(builder)
+            }
+            '.' => {
+                let kind = self.read_dot(&mut builder);
+                if kind.is_number() {
+                    self.set_numeric_value(kind, builder.finish(self));
+                }
+                kind
+            }
+            '=' => self.read_equal(),
+            '"' | '\'' => {
+                if self.context == LexerContext::JsxAttributeValue {
+                    self.read_jsx_string_literal(c)
+                } else {
+                    self.read_string_literal(c)
+                }
+            }
+            '1'..='9' => {
+                let kind = self.decimal_literal_after_first_digit(&mut builder);
+                self.set_numeric_value(kind, builder.finish(self));
+                kind
+            }
+            '+' => self.read_plus(),
+            '-' => self.read_minus().map_or_else(|| self.skip_single_line_comment(), |kind| kind),
+            '0' => {
+                let kind = self.read_zero(&mut builder);
+                self.set_numeric_value(kind, builder.finish(self));
+                kind
+            }
+            '/' => {
+                if self.next_eq('/') {
+                    self.skip_single_line_comment()
+                } else if self.next_eq('*') {
+                    self.skip_multi_line_comment()
+                } else {
+                    // regex is handled separately, see `next_regex`
+                    self.read_slash()
+                }
+            }
+            '`' => self.read_template_literal(Kind::TemplateHead, Kind::NoSubstitutionTemplate),
+            '!' => self.read_exclamation(),
+            '%' => self.read_percent(),
+            '*' => self.read_star(),
+            '&' => self.read_ampersand(),
+            '|' => self.read_pipe(),
+            '?' => self.read_question(),
+            '<' => {
+                self.read_left_angle().map_or_else(|| self.skip_single_line_comment(), |kind| kind)
+            }
+            '^' => self.read_caret(),
+            '#' => {
+                // https://tc39.es/proposal-hashbang/out.html
+                // HashbangComment ::
+                //     `#!` SingleLineCommentChars?
+                if self.current.token.start == 0 && self.next_eq('!') {
+                    self.skip_single_line_comment()
+                } else {
+                    builder.get_mut_string_without_current_ascii_char(self);
+                    self.private_identifier(builder)
+                }
+            }
+            '\\' => {
+                builder.force_allocation_without_current_ascii_char(self);
+                self.identifier_unicode_escape_sequence(&mut builder, true);
+                self.identifier_name_or_keyword(builder)
+            }
+            c if is_identifier_start(c) => {
+                builder.push_matching(c);
+                self.identifier_name_or_keyword(builder)
+            }
+            c if is_irregular_whitespace(c) => Kind::WhiteSpace,
+            c if is_irregular_line_terminator(c) => {
+                self.current.token.is_on_new_line = true;
+                Kind::NewLine
+            }
+            _ => {
+                self.error(Diagnostic::InvalidCharacter(c, self.unterminated_range()));
+                Kind::Undetermined
+            }
+        }
+    }
+
+    fn skip_whitespace(&mut self) {
+        let c = self.peek();
+        let any_newline = c == '\r' || c == '\n';
+        let any_white = c == ' ' || c == '\t' || any_newline;
+        // Fast path: return at once unless the next char is ' ', '\t', '\r' or '\n'; otherwise consume it
+        if any_white {
+            self.current.chars.next();
+            if any_newline {
+                self.current.token.is_on_new_line = true;
+            }
+        } else {
+            return;
        }

-        Kind::Eof
+        let remaining = self.remaining().as_bytes();
+        let state = SkipWhitespace::new(self.current.token.is_on_new_line).simd(remaining);
+
+        // SAFETY: the scan only steps over single-byte ASCII whitespace, so `state.offset` is a char boundary
+        self.current.chars =
+            unsafe { std::str::from_utf8_unchecked(&remaining[state.offset..]) }.chars();
+
+        if state.newline {
+            self.current.token.is_on_new_line = true;
+        }
    }

    /// Section 12.4 Single Line Comment
--- a/crates/oxc_parser/src/lexer/simd.rs
+++ b/crates/oxc_parser/src/lexer/simd.rs
@ -0,0 +1,86 @@
+//! Lexer methods using portable-SIMD
+//! See:
+//!   * <https://github.com/rust-lang/portable-simd/blob/master/beginners-guide.md>
+//!   * <https://rapidjson.org/md_doc_internals.html#SkipwhitespaceWithSIMD>
+//!   * <https://lemire.me/blog/2017/01/20/how-quickly-can-you-remove-spaces-from-a-string>
+
+use std::simd::{Simd, SimdPartialEq, ToBitMask};
+
+const ELEMENTS: usize = 16;
+type SimdVec = Simd<u8, ELEMENTS>;
+
+#[derive(Debug)]
+pub struct SkipWhitespace {
+    /// Number of leading whitespace bytes skipped so far (byte offset into the scanned slice)
+    pub offset: usize,
+
+    /// Found the end of the whitespace run (a non-whitespace byte, or the zero padding of the last partial chunk)?
+    pub found: bool,
+
+    /// Found a newline ('\n' or '\r') among the skipped bytes, or was seeded as `true` through `new`?
+    pub newline: bool,
+
+    lf: SimdVec,
+    cr: SimdVec,
+    space: SimdVec,
+    tab: SimdVec,
+}
+
+impl SkipWhitespace {
+    pub fn new(newline: bool) -> Self {
+        Self {
+            offset: 0,
+            found: false,
+            newline,
+            lf: SimdVec::splat(b'\n'),
+            cr: SimdVec::splat(b'\r'),
+            space: SimdVec::splat(b' '),
+            tab: SimdVec::splat(b'\t'),
+        }
+    }
+
+    pub fn simd(mut self, bytes: &[u8]) -> Self {
+        let (chunks, remainder) = bytes.as_chunks::<ELEMENTS>();
+
+        for chunk in chunks {
+            self.check_chunk(chunk);
+            if self.found {
+                return self;
+            }
+        }
+
+        if !remainder.is_empty() {
+            // Zero-pad the last partial chunk to a full vector (0 is not whitespace), avoiding a scalar tail loop
+            let mut chunk = [0; ELEMENTS];
+            let len = remainder.len();
+            chunk[..len].copy_from_slice(remainder);
+            self.check_chunk(&chunk);
+        }
+
+        self
+    }
+
+    fn check_chunk(&mut self, chunk: &[u8]) {
+        let s = SimdVec::from_slice(chunk);
+
+        let any_newline = s.simd_eq(self.lf) | s.simd_eq(self.cr);
+        let any_white = s.simd_eq(self.space) | s.simd_eq(self.tab) | any_newline;
+
+        let advance_by = (!any_white.to_bitmask()).trailing_zeros();
+
+        // Record a newline if any of the first `advance_by` (skipped) bytes is '\n' or '\r'
+        if !self.newline
+            && advance_by > 0
+            && any_newline.to_bitmask() & (1u16.checked_shl(advance_by).map_or(u16::MAX, |c| c - 1))
+                > 0
+        {
+            self.newline = true;
+        }
+
+        if (advance_by as usize) < ELEMENTS {
+            self.found = true;
+        }
+
+        self.offset += advance_by as usize;
+    }
+}
--- a/crates/oxc_parser/src/lib.rs
+++ b/crates/oxc_parser/src/lib.rs
@ -1,6 +1,8 @@
 //! Recursive Descent Parser for ECMAScript and TypeScript

 #![allow(clippy::wildcard_imports)] // allow for use `oxc_ast::ast::*`
+#![feature(portable_simd)]
+#![feature(slice_as_chunks)]

 mod cursor;
 mod list;