perf(lexer): use portable-SIMD to speed up whitespace scanning

closes #13
This commit is contained in:
Boshen 2023-02-14 10:31:19 +08:00
parent 4edd3f75ce
commit ab68cea0b7
4 changed files with 229 additions and 121 deletions

View file

@ -31,10 +31,6 @@ pub const FF: char = '\u{c}';
/// U+00A0 NON-BREAKING SPACE, abbreviated <NBSP>.
pub const NBSP: char = '\u{a0}';
/// Returns `true` when `c` is one of the two "regular" whitespace characters:
/// a space (U+0020) or a horizontal tab (U+0009).
pub const fn is_regular_whitespace(c: char) -> bool {
    c == ' ' || c == '\t'
}
pub const fn is_irregular_whitespace(c: char) -> bool {
matches!(
c,

View file

@ -8,6 +8,7 @@
mod constants;
mod kind;
mod number;
mod simd;
mod string_builder;
mod token;
@ -15,13 +16,14 @@ use std::{collections::VecDeque, str::Chars};
use constants::{
is_identifier_part, is_identifier_start, is_irregular_line_terminator, is_irregular_whitespace,
is_line_terminator, is_regular_line_terminator, is_regular_whitespace, EOF, SINGLE_CHAR_TOKENS,
is_line_terminator, EOF, SINGLE_CHAR_TOKENS,
};
pub use kind::Kind;
use number::{parse_big_int, parse_float, parse_int};
use oxc_allocator::{Allocator, String};
use oxc_ast::{Atom, SourceType, Span};
use oxc_diagnostics::{Diagnostic, Diagnostics};
use simd::SkipWhitespace;
use string_builder::AutoCow;
pub use token::{RegExp, Token, TokenValue};
@ -321,7 +323,6 @@ impl<'a> Lexer<'a> {
/// Read each char and set the current token
/// Whitespace and line terminators are skipped
#[allow(clippy::too_many_lines)]
fn read_next_token(&mut self) -> Kind {
self.current.token.start = self.offset();
@ -329,126 +330,149 @@ impl<'a> Lexer<'a> {
return self.read_jsx_child();
}
let mut builder = AutoCow::new(self);
while let Some(c) = self.current.chars.next() {
// fast path for single character tokens
// '{' '}' '(' ')' '[' ']' ';' ',' ':' '~'
let size = c as usize;
if size <= 127 {
let kind = SINGLE_CHAR_TOKENS[size];
if kind != Kind::Undetermined {
return kind;
}
}
// NOTE: matching order is significant here, by real world occurrences
// see https://blog.mozilla.org/nnethercote/2011/07/01/faster-javascript-parsing/
// > the rough order of frequency for different token kinds is as follows:
// identifiers/keywords, ., =, strings, decimal numbers, :, +, hex/octal numbers, and then everything else
let kind = match c {
// fast path for white space
c if is_regular_whitespace(c) => Kind::WhiteSpace,
// fast path for identifiers
c if c.is_ascii_alphabetic() => {
builder.push_matching(c);
self.identifier_name_or_keyword(builder)
}
'.' => {
let kind = self.read_dot(&mut builder);
if kind.is_number() {
self.set_numeric_value(kind, builder.finish(self));
}
kind
}
'=' => self.read_equal(),
'"' | '\'' => {
if self.context == LexerContext::JsxAttributeValue {
self.read_jsx_string_literal(c)
} else {
self.read_string_literal(c)
}
}
'1'..='9' => {
let kind = self.decimal_literal_after_first_digit(&mut builder);
self.set_numeric_value(kind, builder.finish(self));
kind
}
'+' => self.read_plus(),
'-' => {
self.read_minus().map_or_else(|| self.skip_single_line_comment(), |kind| kind)
}
'0' => {
let kind = self.read_zero(&mut builder);
self.set_numeric_value(kind, builder.finish(self));
kind
}
c if is_regular_line_terminator(c) => {
self.current.token.is_on_new_line = true;
Kind::NewLine
}
'/' => {
if self.next_eq('/') {
self.skip_single_line_comment()
} else if self.next_eq('*') {
self.skip_multi_line_comment()
} else {
// regex is handled separately, see `next_regex`
self.read_slash()
}
}
'`' => self.read_template_literal(Kind::TemplateHead, Kind::NoSubstitutionTemplate),
'!' => self.read_exclamation(),
'%' => self.read_percent(),
'*' => self.read_star(),
'&' => self.read_ampersand(),
'|' => self.read_pipe(),
'?' => self.read_question(),
'<' => self
.read_left_angle()
.map_or_else(|| self.skip_single_line_comment(), |kind| kind),
'^' => self.read_caret(),
'#' => {
// https://tc39.es/proposal-hashbang/out.html
// HashbangComment ::
// `#!` SingleLineCommentChars?
if self.current.token.start == 0 && self.next_eq('!') {
self.skip_single_line_comment()
} else {
builder.get_mut_string_without_current_ascii_char(self);
self.private_identifier(builder)
}
}
'\\' => {
builder.force_allocation_without_current_ascii_char(self);
self.identifier_unicode_escape_sequence(&mut builder, true);
self.identifier_name_or_keyword(builder)
}
c if is_identifier_start(c) => {
builder.push_matching(c);
self.identifier_name_or_keyword(builder)
}
c if is_irregular_whitespace(c) => Kind::WhiteSpace,
c if is_irregular_line_terminator(c) => {
self.current.token.is_on_new_line = true;
Kind::NewLine
}
_ => {
self.error(Diagnostic::InvalidCharacter(c, self.unterminated_range()));
Kind::Undetermined
}
};
if !kind.is_trivia() {
return kind;
}
loop {
self.skip_whitespace();
let offset = self.offset();
builder = AutoCow::new(self);
self.current.token.start = offset;
let builder = AutoCow::new(self);
if let Some(c) = self.current.chars.next() {
let kind = self.match_char(c, builder);
if !kind.is_trivia() {
return kind;
}
} else {
return Kind::Eof;
}
}
}
/// Classifies the token that begins with `c`, a character the caller has already
/// consumed from the input, and returns its `Kind`.
///
/// `builder` is the `AutoCow` the caller created just before consuming `c`; it is
/// used by the identifier and numeric branches.
// One flat dispatch table is easier to audit than many tiny helpers.
#[allow(clippy::too_many_lines)]
fn match_char(&mut self, c: char, mut builder: AutoCow<'a>) -> Kind {
    // Fast path: ASCII characters that always form a token by themselves
    // ('{' '}' '(' ')' '[' ']' ';' ',' ':' '~') are resolved by a table lookup.
    if c.is_ascii() {
        let single = SINGLE_CHAR_TOKENS[c as usize];
        if single != Kind::Undetermined {
            return single;
        }
    }

    // NOTE: the arm order below is deliberate and follows real world frequency,
    // see https://blog.mozilla.org/nnethercote/2011/07/01/faster-javascript-parsing/
    // > the rough order of frequency for different token kinds is as follows:
    // identifiers/keywords, ., =, strings, decimal numbers, :, +, hex/octal numbers, and then everything else
    match c {
        // Fast path for identifiers that start with an ASCII letter.
        ch if ch.is_ascii_alphabetic() => {
            builder.push_matching(ch);
            self.identifier_name_or_keyword(builder)
        }
        '.' => {
            let dot_kind = self.read_dot(&mut builder);
            // A leading dot may turn out to start a numeric literal.
            if dot_kind.is_number() {
                self.set_numeric_value(dot_kind, builder.finish(self));
            }
            dot_kind
        }
        '=' => self.read_equal(),
        '"' | '\'' => {
            if self.context == LexerContext::JsxAttributeValue {
                self.read_jsx_string_literal(c)
            } else {
                self.read_string_literal(c)
            }
        }
        '1'..='9' => {
            let number_kind = self.decimal_literal_after_first_digit(&mut builder);
            self.set_numeric_value(number_kind, builder.finish(self));
            number_kind
        }
        '+' => self.read_plus(),
        // When `read_minus` yields nothing, the rest of the line is skipped as a comment.
        '-' => self.read_minus().unwrap_or_else(|| self.skip_single_line_comment()),
        '0' => {
            let number_kind = self.read_zero(&mut builder);
            self.set_numeric_value(number_kind, builder.finish(self));
            number_kind
        }
        '/' => {
            if self.next_eq('/') {
                self.skip_single_line_comment()
            } else if self.next_eq('*') {
                self.skip_multi_line_comment()
            } else {
                // regex is handled separately, see `next_regex`
                self.read_slash()
            }
        }
        '`' => self.read_template_literal(Kind::TemplateHead, Kind::NoSubstitutionTemplate),
        '!' => self.read_exclamation(),
        '%' => self.read_percent(),
        '*' => self.read_star(),
        '&' => self.read_ampersand(),
        '|' => self.read_pipe(),
        '?' => self.read_question(),
        // When `read_left_angle` yields nothing, the rest of the line is skipped as a comment.
        '<' => self.read_left_angle().unwrap_or_else(|| self.skip_single_line_comment()),
        '^' => self.read_caret(),
        '#' => {
            // https://tc39.es/proposal-hashbang/out.html
            // HashbangComment ::
            //     `#!` SingleLineCommentChars?
            // Only recognised when the token starts at offset 0.
            if self.current.token.start == 0 && self.next_eq('!') {
                self.skip_single_line_comment()
            } else {
                builder.get_mut_string_without_current_ascii_char(self);
                self.private_identifier(builder)
            }
        }
        '\\' => {
            // An identifier that begins with a unicode escape sequence.
            builder.force_allocation_without_current_ascii_char(self);
            self.identifier_unicode_escape_sequence(&mut builder, true);
            self.identifier_name_or_keyword(builder)
        }
        ch if is_identifier_start(ch) => {
            builder.push_matching(ch);
            self.identifier_name_or_keyword(builder)
        }
        ch if is_irregular_whitespace(ch) => Kind::WhiteSpace,
        ch if is_irregular_line_terminator(ch) => {
            self.current.token.is_on_new_line = true;
            Kind::NewLine
        }
        _ => {
            self.error(Diagnostic::InvalidCharacter(c, self.unterminated_range()));
            Kind::Undetermined
        }
    }
}
/// Skips a leading run of ASCII whitespace (space, tab, `\r`, `\n`) in the input.
///
/// Sets `self.current.token.is_on_new_line` when the skipped run contains a
/// `\r` or `\n`. Returns immediately, consuming nothing, when the next character
/// is not one of those four.
fn skip_whitespace(&mut self) {
let c = self.peek();
let any_newline = c == '\r' || c == '\n';
let any_white = c == ' ' || c == '\t' || any_newline;
// Fast path for single non-whitespace
// (the common case: bail out before setting up any SIMD state).
if any_white {
// Consume the whitespace character that was just peeked.
self.current.chars.next();
if any_newline {
self.current.token.is_on_new_line = true;
}
} else {
return;
}
// NOTE(review): the `Kind::Eof` line below does not fit this function (it has no
// return value); it looks like residue from a diff rendering — confirm and remove.
Kind::Eof
// Scan the rest of the input in SIMD chunks, seeding the scanner with the current
// new-line flag.
let remaining = self.remaining().as_bytes();
let state = SkipWhitespace::new(self.current.token.is_on_new_line).simd(remaining);
// SAFETY: `SkipWhitespace` only advances over the single-byte ASCII characters
// space, tab, CR and LF, so `state.offset` lies on a UTF-8 character boundary of
// `remaining` and the tail is still valid UTF-8.
self.current.chars =
unsafe { std::str::from_utf8_unchecked(&remaining[state.offset..]) }.chars();
if state.newline {
self.current.token.is_on_new_line = true;
}
}
/// Section 12.4 Single Line Comment

View file

@ -0,0 +1,86 @@
//! Lexer methods using portable-SIMD
//! See:
//! * <https://github.com/rust-lang/portable-simd/blob/master/beginners-guide.md>
//! * <https://rapidjson.org/md_doc_internals.html#SkipwhitespaceWithSIMD>
//! * <https://lemire.me/blog/2017/01/20/how-quickly-can-you-remove-spaces-from-a-string>
use std::simd::{Simd, SimdPartialEq, ToBitMask};
const ELEMENTS: usize = 16;
type SimdVec = Simd<u8, ELEMENTS>;
/// Scanner that measures a leading run of ASCII whitespace
/// (space, tab, `\n`, `\r`) by examining `ELEMENTS` bytes per step.
#[derive(Debug)]
pub struct SkipWhitespace {
    /// Number of leading whitespace bytes skipped so far.
    pub offset: usize,

    /// Set once a chunk contained a non-whitespace byte, i.e. the run has ended.
    pub found: bool,

    /// Whether the skipped run contains `\n` or `\r` (or the flag was already
    /// set when the scanner was created).
    pub newline: bool,

    /// Every lane holds `\n`.
    lf: SimdVec,
    /// Every lane holds `\r`.
    cr: SimdVec,
    /// Every lane holds a space.
    space: SimdVec,
    /// Every lane holds a tab.
    tab: SimdVec,
}

impl SkipWhitespace {
    /// Creates a scanner positioned at offset zero.
    ///
    /// `newline` seeds the new-line flag, so a line break seen before this scan
    /// is carried through to the result.
    pub fn new(newline: bool) -> Self {
        Self {
            offset: 0,
            found: false,
            newline,
            lf: SimdVec::splat(b'\n'),
            cr: SimdVec::splat(b'\r'),
            space: SimdVec::splat(b' '),
            tab: SimdVec::splat(b'\t'),
        }
    }

    /// Consumes the scanner, measures the whitespace run at the start of `bytes`
    /// and returns the final state (`offset`, `found`, `newline`).
    pub fn simd(mut self, bytes: &[u8]) -> Self {
        let (full_chunks, tail) = bytes.as_chunks::<ELEMENTS>();

        for block in full_chunks {
            self.check_chunk(block);
            if self.found {
                return self;
            }
        }

        if tail.is_empty() {
            return self;
        }

        // Pad the tail with zero bytes so it can go through the same vector path
        // instead of a scalar loop. Zero is not whitespace, so the padding always
        // ends the run and is never counted in `offset`.
        let mut padded = [0u8; ELEMENTS];
        padded[..tail.len()].copy_from_slice(tail);
        self.check_chunk(&padded);
        self
    }

    /// Processes one chunk of exactly `ELEMENTS` bytes, updating `offset`,
    /// `newline` and `found`.
    fn check_chunk(&mut self, chunk: &[u8]) {
        let lanes = SimdVec::from_slice(chunk);
        let newline_mask = lanes.simd_eq(self.lf) | lanes.simd_eq(self.cr);
        let white_mask = lanes.simd_eq(self.space) | lanes.simd_eq(self.tab) | newline_mask;

        // Bit i of the bitmask is set when byte i is whitespace, so the trailing
        // zeros of the inverted mask give the length of the leading run.
        let run_len = (!white_mask.to_bitmask()).trailing_zeros();

        // Only line breaks inside the skipped run count.
        if run_len > 0 && !self.newline {
            // Low `run_len` bits set; shifting by the full width overflows, which
            // means the whole chunk was whitespace.
            let run_bits = 1u16.checked_shl(run_len).map_or(u16::MAX, |bit| bit - 1);
            if newline_mask.to_bitmask() & run_bits != 0 {
                self.newline = true;
            }
        }

        let run_len = run_len as usize;
        self.found |= run_len < ELEMENTS;
        self.offset += run_len;
    }
}

View file

@ -1,6 +1,8 @@
//! Recursive Descent Parser for ECMAScript and TypeScript
#![allow(clippy::wildcard_imports)] // allow for use `oxc_ast::ast::*`
#![feature(portable_simd)]
#![feature(slice_as_chunks)]
mod cursor;
mod list;