From ab68cea0b7196b482c3ef0bef87687bd95b7a0f1 Mon Sep 17 00:00:00 2001 From: Boshen Date: Tue, 14 Feb 2023 10:31:19 +0800 Subject: [PATCH] perf(lexer): use portable-SIMD to speed up whitespace scanning closes #13 --- crates/oxc_parser/src/lexer/constants.rs | 4 - crates/oxc_parser/src/lexer/mod.rs | 258 +++++++++++++---------- crates/oxc_parser/src/lexer/simd.rs | 86 ++++++++ crates/oxc_parser/src/lib.rs | 2 + 4 files changed, 229 insertions(+), 121 deletions(-) create mode 100644 crates/oxc_parser/src/lexer/simd.rs diff --git a/crates/oxc_parser/src/lexer/constants.rs b/crates/oxc_parser/src/lexer/constants.rs index ce4f874e0..edbedb37d 100644 --- a/crates/oxc_parser/src/lexer/constants.rs +++ b/crates/oxc_parser/src/lexer/constants.rs @@ -31,10 +31,6 @@ pub const FF: char = '\u{c}'; /// U+00A0 NON-BREAKING SPACE, abbreviated <NBSP>. pub const NBSP: char = '\u{a0}'; -pub const fn is_regular_whitespace(c: char) -> bool { - matches!(c, ' ' | '\t') -} - pub const fn is_irregular_whitespace(c: char) -> bool { matches!( c, diff --git a/crates/oxc_parser/src/lexer/mod.rs b/crates/oxc_parser/src/lexer/mod.rs index cfd80a52f..bf3ba193b 100644 --- a/crates/oxc_parser/src/lexer/mod.rs +++ b/crates/oxc_parser/src/lexer/mod.rs @@ -8,6 +8,7 @@ mod constants; mod kind; mod number; +mod simd; mod string_builder; mod token; @@ -15,13 +16,14 @@ use std::{collections::VecDeque, str::Chars}; use constants::{ is_identifier_part, is_identifier_start, is_irregular_line_terminator, is_irregular_whitespace, - is_line_terminator, is_regular_line_terminator, is_regular_whitespace, EOF, SINGLE_CHAR_TOKENS, + is_line_terminator, EOF, SINGLE_CHAR_TOKENS, }; pub use kind::Kind; use number::{parse_big_int, parse_float, parse_int}; use oxc_allocator::{Allocator, String}; use oxc_ast::{Atom, SourceType, Span}; use oxc_diagnostics::{Diagnostic, Diagnostics}; +use simd::SkipWhitespace; use string_builder::AutoCow; pub use token::{RegExp, Token, TokenValue}; @@ -321,7 +323,6 @@ impl<'a> Lexer<'a> { /// 
Read each char and set the current token /// Whitespace and line terminators are skipped - #[allow(clippy::too_many_lines)] fn read_next_token(&mut self) -> Kind { self.current.token.start = self.offset(); @@ -329,126 +330,149 @@ impl<'a> Lexer<'a> { return self.read_jsx_child(); } - let mut builder = AutoCow::new(self); - - while let Some(c) = self.current.chars.next() { - // fast path for single character tokens - // '{' '}' '(' ')' '[' ']' ';' ',' ':' '~' - let size = c as usize; - if size <= 127 { - let kind = SINGLE_CHAR_TOKENS[size]; - if kind != Kind::Undetermined { - return kind; - } - } - - // NOTE: matching order is significant here, by real world occurrences - // see https://blog.mozilla.org/nnethercote/2011/07/01/faster-javascript-parsing/ - // > the rough order of frequency for different token kinds is as follows: - // identifiers/keywords, ‘.’, ‘=’, strings, decimal numbers, ‘:’, ‘+’, hex/octal numbers, and then everything else - let kind = match c { - // fast path for white space - c if is_regular_whitespace(c) => Kind::WhiteSpace, - // fast path for identifiers - c if c.is_ascii_alphabetic() => { - builder.push_matching(c); - self.identifier_name_or_keyword(builder) - } - '.' 
=> { - let kind = self.read_dot(&mut builder); - if kind.is_number() { - self.set_numeric_value(kind, builder.finish(self)); - } - kind - } - '=' => self.read_equal(), - '"' | '\'' => { - if self.context == LexerContext::JsxAttributeValue { - self.read_jsx_string_literal(c) - } else { - self.read_string_literal(c) - } - } - '1'..='9' => { - let kind = self.decimal_literal_after_first_digit(&mut builder); - self.set_numeric_value(kind, builder.finish(self)); - kind - } - '+' => self.read_plus(), - '-' => { - self.read_minus().map_or_else(|| self.skip_single_line_comment(), |kind| kind) - } - '0' => { - let kind = self.read_zero(&mut builder); - self.set_numeric_value(kind, builder.finish(self)); - kind - } - c if is_regular_line_terminator(c) => { - self.current.token.is_on_new_line = true; - Kind::NewLine - } - '/' => { - if self.next_eq('/') { - self.skip_single_line_comment() - } else if self.next_eq('*') { - self.skip_multi_line_comment() - } else { - // regex is handled separately, see `next_regex` - self.read_slash() - } - } - '`' => self.read_template_literal(Kind::TemplateHead, Kind::NoSubstitutionTemplate), - '!' => self.read_exclamation(), - '%' => self.read_percent(), - '*' => self.read_star(), - '&' => self.read_ampersand(), - '|' => self.read_pipe(), - '?' => self.read_question(), - '<' => self - .read_left_angle() - .map_or_else(|| self.skip_single_line_comment(), |kind| kind), - '^' => self.read_caret(), - '#' => { - // https://tc39.es/proposal-hashbang/out.html - // HashbangComment :: - // `#!` SingleLineCommentChars? 
- if self.current.token.start == 0 && self.next_eq('!') { - self.skip_single_line_comment() - } else { - builder.get_mut_string_without_current_ascii_char(self); - self.private_identifier(builder) - } - } - '\\' => { - builder.force_allocation_without_current_ascii_char(self); - self.identifier_unicode_escape_sequence(&mut builder, true); - self.identifier_name_or_keyword(builder) - } - c if is_identifier_start(c) => { - builder.push_matching(c); - self.identifier_name_or_keyword(builder) - } - c if is_irregular_whitespace(c) => Kind::WhiteSpace, - c if is_irregular_line_terminator(c) => { - self.current.token.is_on_new_line = true; - Kind::NewLine - } - _ => { - self.error(Diagnostic::InvalidCharacter(c, self.unterminated_range())); - Kind::Undetermined - } - }; - - if !kind.is_trivia() { - return kind; - } + loop { + self.skip_whitespace(); let offset = self.offset(); - builder = AutoCow::new(self); self.current.token.start = offset; + let builder = AutoCow::new(self); + + if let Some(c) = self.current.chars.next() { + let kind = self.match_char(c, builder); + if !kind.is_trivia() { + return kind; + } + } else { + return Kind::Eof; + } + } + } + + #[allow(clippy::too_many_lines)] + fn match_char(&mut self, c: char, mut builder: AutoCow<'a>) -> Kind { + // fast path for single character tokens + // '{' '}' '(' ')' '[' ']' ';' ',' ':' '~' + let size = c as usize; + if size <= 127 { + let kind = SINGLE_CHAR_TOKENS[size]; + if kind != Kind::Undetermined { + return kind; + } + } + // NOTE: matching order is significant here, by real world occurrences + // see https://blog.mozilla.org/nnethercote/2011/07/01/faster-javascript-parsing/ + // > the rough order of frequency for different token kinds is as follows: + // identifiers/keywords, ‘.’, ‘=’, strings, decimal numbers, ‘:’, ‘+’, hex/octal numbers, and then everything else + match c { + // fast path for identifiers + c if c.is_ascii_alphabetic() => { + builder.push_matching(c); + 
self.identifier_name_or_keyword(builder) + } + '.' => { + let kind = self.read_dot(&mut builder); + if kind.is_number() { + self.set_numeric_value(kind, builder.finish(self)); + } + kind + } + '=' => self.read_equal(), + '"' | '\'' => { + if self.context == LexerContext::JsxAttributeValue { + self.read_jsx_string_literal(c) + } else { + self.read_string_literal(c) + } + } + '1'..='9' => { + let kind = self.decimal_literal_after_first_digit(&mut builder); + self.set_numeric_value(kind, builder.finish(self)); + kind + } + '+' => self.read_plus(), + '-' => self.read_minus().map_or_else(|| self.skip_single_line_comment(), |kind| kind), + '0' => { + let kind = self.read_zero(&mut builder); + self.set_numeric_value(kind, builder.finish(self)); + kind + } + '/' => { + if self.next_eq('/') { + self.skip_single_line_comment() + } else if self.next_eq('*') { + self.skip_multi_line_comment() + } else { + // regex is handled separately, see `next_regex` + self.read_slash() + } + } + '`' => self.read_template_literal(Kind::TemplateHead, Kind::NoSubstitutionTemplate), + '!' => self.read_exclamation(), + '%' => self.read_percent(), + '*' => self.read_star(), + '&' => self.read_ampersand(), + '|' => self.read_pipe(), + '?' => self.read_question(), + '<' => { + self.read_left_angle().map_or_else(|| self.skip_single_line_comment(), |kind| kind) + } + '^' => self.read_caret(), + '#' => { + // https://tc39.es/proposal-hashbang/out.html + // HashbangComment :: + // `#!` SingleLineCommentChars? 
+ if self.current.token.start == 0 && self.next_eq('!') { + self.skip_single_line_comment() + } else { + builder.get_mut_string_without_current_ascii_char(self); + self.private_identifier(builder) + } + } + '\\' => { + builder.force_allocation_without_current_ascii_char(self); + self.identifier_unicode_escape_sequence(&mut builder, true); + self.identifier_name_or_keyword(builder) + } + c if is_identifier_start(c) => { + builder.push_matching(c); + self.identifier_name_or_keyword(builder) + } + c if is_irregular_whitespace(c) => Kind::WhiteSpace, + c if is_irregular_line_terminator(c) => { + self.current.token.is_on_new_line = true; + Kind::NewLine + } + _ => { + self.error(Diagnostic::InvalidCharacter(c, self.unterminated_range())); + Kind::Undetermined + } + } + } + + fn skip_whitespace(&mut self) { + let c = self.peek(); + let any_newline = c == '\r' || c == '\n'; + let any_white = c == ' ' || c == '\t' || any_newline; + // Fast path for single non-whitespace + if any_white { + self.current.chars.next(); + if any_newline { + self.current.token.is_on_new_line = true; + } + } else { + return; } - Kind::Eof + let remaining = self.remaining().as_bytes(); + let state = SkipWhitespace::new(self.current.token.is_on_new_line).simd(remaining); + + // SAFETY: offset is computed to the boundary + self.current.chars = + unsafe { std::str::from_utf8_unchecked(&remaining[state.offset..]) }.chars(); + + if state.newline { + self.current.token.is_on_new_line = true; + } } /// Section 12.4 Single Line Comment diff --git a/crates/oxc_parser/src/lexer/simd.rs b/crates/oxc_parser/src/lexer/simd.rs new file mode 100644 index 000000000..a10d45542 --- /dev/null +++ b/crates/oxc_parser/src/lexer/simd.rs @@ -0,0 +1,86 @@ +//! Lexer methods using portable-SIMD +//! See: +//! * +//! * +//! 
* + +use std::simd::{Simd, SimdPartialEq, ToBitMask}; + +const ELEMENTS: usize = 16; +type SimdVec = Simd<u8, ELEMENTS>; + +#[derive(Debug)] +pub struct SkipWhitespace { + /// Total offset + pub offset: usize, + + /// Reached the end of the whitespace run (scanning can stop)? + pub found: bool, + + /// Seen a line terminator ('\n' or '\r') in the skipped whitespace? + pub newline: bool, + + lf: SimdVec, + cr: SimdVec, + space: SimdVec, + tab: SimdVec, +} + +impl SkipWhitespace { + pub fn new(newline: bool) -> Self { + Self { + offset: 0, + found: false, + newline, + lf: SimdVec::splat(b'\n'), + cr: SimdVec::splat(b'\r'), + space: SimdVec::splat(b' '), + tab: SimdVec::splat(b'\t'), + } + } + + pub fn simd(mut self, bytes: &[u8]) -> Self { + let (chunks, remainder) = bytes.as_chunks::<ELEMENTS>(); + + for chunk in chunks { + self.check_chunk(chunk); + if self.found { + return self; + } + } + + if !remainder.is_empty() { + // Align the last chunk for avoiding the use of a scalar version + let mut chunk = [0; ELEMENTS]; + let len = remainder.len(); + chunk[..len].copy_from_slice(remainder); + self.check_chunk(&chunk); + } + + self + } + + fn check_chunk(&mut self, chunk: &[u8]) { + let s = SimdVec::from_slice(chunk); + + let any_newline = s.simd_eq(self.lf) | s.simd_eq(self.cr); + let any_white = s.simd_eq(self.space) | s.simd_eq(self.tab) | any_newline; + + let advance_by = (!any_white.to_bitmask()).trailing_zeros(); + + // If the advanced offset contains a newline + if !self.newline + && advance_by > 0 + && any_newline.to_bitmask() & (1u16.checked_shl(advance_by).map_or(u16::MAX, |c| c - 1)) + > 0 + { + self.newline = true; + } + + if (advance_by as usize) < ELEMENTS { + self.found = true; + } + + self.offset += advance_by as usize; + } +} diff --git a/crates/oxc_parser/src/lib.rs b/crates/oxc_parser/src/lib.rs index 46e960a47..74a489286 100644 --- a/crates/oxc_parser/src/lib.rs +++ b/crates/oxc_parser/src/lib.rs @@ -1,6 +1,8 @@ //! 
Recursive Descent Parser for ECMAScript and TypeScript #![allow(clippy::wildcard_imports)] // allow for use `oxc_ast::ast::*` +#![feature(portable_simd)] +#![feature(slice_as_chunks)] mod cursor; mod list;