perf(parser): lexer byte handlers consume ASCII chars faster (#2046)

In the lexer, most `BYTE_HANDLER`s immediately consume the current char
with `lexer.consume_char()`.

Byte handlers are only called when the next char has a certain value (or
falls within a certain range of values). This is their entire purpose. So
in all cases we know for sure that we're not at EOF, and that the next char
is a single-byte ASCII character.

The compiler, however, doesn't seem to be able to "see through" the
`BYTE_HANDLERS[byte](self)` call and understand these invariants. So it
produces very verbose ASM for `lexer.consume_char()`.

This PR replaces `lexer.consume_char()` in the byte handlers with an
unsafe `lexer.consume_ascii_char()` which skips on to the next char with a
single `inc` instruction.

The difference in codegen can be seen here:
https://godbolt.org/z/1ha3cr9W5 (compare the 2 x
`core::ops::function::FnOnce::call_once` handlers).

The downside is that this introduces a lot of unsafe blocks, but in my
opinion they're all pretty trivial to validate.

---------

Co-authored-by: Boshen <boshenc@gmail.com>
This commit is contained in:
overlookmotel 2024-01-16 04:31:45 +00:00 committed by GitHub
parent 09c7570560
commit 66a7a68f9f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 101 additions and 39 deletions

7
Cargo.lock generated
View file

@ -53,6 +53,12 @@ version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7079075b41f533b8c61d2a4d073c4676e1f8b249ff94a393b0595db304e0dd87" checksum = "7079075b41f533b8c61d2a4d073c4676e1f8b249ff94a393b0595db304e0dd87"
[[package]]
name = "assert-unchecked"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7330592adf847ee2e3513587b4db2db410a0d751378654e7e993d9adcbe5c795"
[[package]] [[package]]
name = "async-trait" name = "async-trait"
version = "0.1.77" version = "0.1.77"
@ -1735,6 +1741,7 @@ dependencies = [
name = "oxc_parser" name = "oxc_parser"
version = "0.5.0" version = "0.5.0"
dependencies = [ dependencies = [
"assert-unchecked",
"bitflags 2.4.1", "bitflags 2.4.1",
"miette", "miette",
"num-bigint", "num-bigint",

View file

@ -83,6 +83,7 @@ oxc_prettier = { path = "crates/oxc_prettier" }
oxc_tasks_common = { path = "tasks/common" } oxc_tasks_common = { path = "tasks/common" }
oxc_language_server = { path = "crates/oxc_language_server" } oxc_language_server = { path = "crates/oxc_language_server" }
assert-unchecked = { version = "0.1.2" }
bpaf = { version = "0.9.8" } bpaf = { version = "0.9.8" }
bitflags = { version = "2.4.1" } bitflags = { version = "2.4.1" }
bumpalo = { version = "3.14.0" } bumpalo = { version = "3.14.0" }

View file

@ -25,9 +25,10 @@ oxc_syntax = { workspace = true }
oxc_diagnostics = { workspace = true } oxc_diagnostics = { workspace = true }
oxc_index = { workspace = true } oxc_index = { workspace = true }
bitflags = { workspace = true } assert-unchecked = { workspace = true }
rustc-hash = { workspace = true } bitflags = { workspace = true }
num-bigint = { workspace = true } rustc-hash = { workspace = true }
num-bigint = { workspace = true }
[dev-dependencies] [dev-dependencies]
oxc_ast = { workspace = true, features = ["serde"] } oxc_ast = { workspace = true, features = ["serde"] }

View file

@ -11,6 +11,7 @@ mod string_builder;
mod token; mod token;
mod trivia_builder; mod trivia_builder;
use assert_unchecked::assert_unchecked;
use rustc_hash::FxHashMap; use rustc_hash::FxHashMap;
use std::{collections::VecDeque, str::Chars}; use std::{collections::VecDeque, str::Chars};
@ -270,6 +271,20 @@ impl<'a> Lexer<'a> {
self.current.chars.next().unwrap() self.current.chars.next().unwrap()
} }
/// Consume the current char when it's known to be ASCII.
/// This compiles down to a single instruction, just incrementing `chars` iterator's pointer.
/// NOTE: Caller must ensure not at EOF and current char is ASCII.
#[inline]
fn consume_ascii_char(&mut self) -> char {
let s = self.current.chars.as_str();
// SAFETY: Caller must ensure not at EOF and current char is ASCII.
unsafe {
assert_unchecked!(!s.is_empty());
assert_unchecked!(s.as_bytes()[0] < 128);
}
self.current.chars.next().unwrap()
}
/// Peek the next char without advancing the position /// Peek the next char without advancing the position
#[inline] #[inline]
fn peek(&self) -> Option<char> { fn peek(&self) -> Option<char> {
@ -1315,28 +1330,33 @@ static BYTE_HANDLERS: [ByteHandler; 128] = [
L_P, IDT, L_R, L_S, L_T, L_U, L_V, L_W, IDT, L_Y, IDT, BEO, PIP, BEC, TLD, ERR, // 7 L_P, IDT, L_R, L_S, L_T, L_U, L_V, L_W, IDT, L_Y, IDT, BEO, PIP, BEC, TLD, ERR, // 7
]; ];
// `\0` `\1` etc
const ERR: ByteHandler = |lexer| { const ERR: ByteHandler = |lexer| {
let c = lexer.consume_char(); // Next char is an ASCII char e.g. `\0`
let c = lexer.consume_ascii_char();
lexer.error(diagnostics::InvalidCharacter(c, lexer.unterminated_range())); lexer.error(diagnostics::InvalidCharacter(c, lexer.unterminated_range()));
Kind::Undetermined Kind::Undetermined
}; };
// <TAB> <VT> <FF> // <SPACE> <TAB> <VT> <FF>
const SPS: ByteHandler = |lexer| { const SPS: ByteHandler = |lexer| {
lexer.consume_char(); // Next char is an ASCII space character
lexer.consume_ascii_char();
Kind::WhiteSpace Kind::WhiteSpace
}; };
// '\r' '\n' // '\r' '\n'
const LIN: ByteHandler = |lexer| { const LIN: ByteHandler = |lexer| {
lexer.consume_char(); // Next char is `\r` or `\n`, which are both ASCII
lexer.consume_ascii_char();
lexer.current.token.is_on_new_line = true; lexer.current.token.is_on_new_line = true;
Kind::NewLine Kind::NewLine
}; };
// ! // !
const EXL: ByteHandler = |lexer| { const EXL: ByteHandler = |lexer| {
lexer.consume_char(); // Next char is `!`, which is ASCII
lexer.consume_ascii_char();
if lexer.next_eq('=') { if lexer.next_eq('=') {
if lexer.next_eq('=') { if lexer.next_eq('=') {
Kind::Neq2 Kind::Neq2
@ -1350,7 +1370,8 @@ const EXL: ByteHandler = |lexer| {
// ' " // ' "
const QOT: ByteHandler = |lexer| { const QOT: ByteHandler = |lexer| {
let c = lexer.consume_char(); // Next char is `'` or `"`, which are both ASCII
let c = lexer.consume_ascii_char();
if lexer.context == LexerContext::JsxAttributeValue { if lexer.context == LexerContext::JsxAttributeValue {
lexer.read_jsx_string_literal(c) lexer.read_jsx_string_literal(c)
} else { } else {
@ -1360,7 +1381,8 @@ const QOT: ByteHandler = |lexer| {
// # // #
const HAS: ByteHandler = |lexer| { const HAS: ByteHandler = |lexer| {
lexer.consume_char(); // Next char is `#`, which is ASCII
lexer.consume_ascii_char();
// HashbangComment :: // HashbangComment ::
// `#!` SingleLineCommentChars? // `#!` SingleLineCommentChars?
if lexer.current.token.start == 0 && lexer.next_eq('!') { if lexer.current.token.start == 0 && lexer.next_eq('!') {
@ -1377,7 +1399,8 @@ const IDT: ByteHandler = |lexer| {
// % // %
const PRC: ByteHandler = |lexer| { const PRC: ByteHandler = |lexer| {
lexer.consume_char(); // Next char is `%`, which is ASCII
lexer.consume_ascii_char();
if lexer.next_eq('=') { if lexer.next_eq('=') {
Kind::PercentEq Kind::PercentEq
} else { } else {
@ -1387,7 +1410,8 @@ const PRC: ByteHandler = |lexer| {
// & // &
const AMP: ByteHandler = |lexer| { const AMP: ByteHandler = |lexer| {
lexer.consume_char(); // Next char is `&`, which is ASCII
lexer.consume_ascii_char();
if lexer.next_eq('&') { if lexer.next_eq('&') {
if lexer.next_eq('=') { if lexer.next_eq('=') {
Kind::Amp2Eq Kind::Amp2Eq
@ -1403,19 +1427,22 @@ const AMP: ByteHandler = |lexer| {
// ( // (
const PNO: ByteHandler = |lexer| { const PNO: ByteHandler = |lexer| {
lexer.consume_char(); // Next char is `(`, which is ASCII
lexer.consume_ascii_char();
Kind::LParen Kind::LParen
}; };
// ) // )
const PNC: ByteHandler = |lexer| { const PNC: ByteHandler = |lexer| {
lexer.consume_char(); // Next char is `)`, which is ASCII
lexer.consume_ascii_char();
Kind::RParen Kind::RParen
}; };
// * // *
const ATR: ByteHandler = |lexer| { const ATR: ByteHandler = |lexer| {
lexer.consume_char(); // Next char is `*`, which is ASCII
lexer.consume_ascii_char();
if lexer.next_eq('*') { if lexer.next_eq('*') {
if lexer.next_eq('=') { if lexer.next_eq('=') {
Kind::Star2Eq Kind::Star2Eq
@ -1431,7 +1458,8 @@ const ATR: ByteHandler = |lexer| {
// + // +
const PLS: ByteHandler = |lexer| { const PLS: ByteHandler = |lexer| {
lexer.consume_char(); // Next char is `+`, which is ASCII
lexer.consume_ascii_char();
if lexer.next_eq('+') { if lexer.next_eq('+') {
Kind::Plus2 Kind::Plus2
} else if lexer.next_eq('=') { } else if lexer.next_eq('=') {
@ -1443,25 +1471,29 @@ const PLS: ByteHandler = |lexer| {
// , // ,
const COM: ByteHandler = |lexer| { const COM: ByteHandler = |lexer| {
lexer.consume_char(); // Next char is `,`, which is ASCII
lexer.consume_ascii_char();
Kind::Comma Kind::Comma
}; };
// - // -
const MIN: ByteHandler = |lexer| { const MIN: ByteHandler = |lexer| {
lexer.consume_char(); // Next char is `-`, which is ASCII
lexer.consume_ascii_char();
lexer.read_minus().unwrap_or_else(|| lexer.skip_single_line_comment()) lexer.read_minus().unwrap_or_else(|| lexer.skip_single_line_comment())
}; };
// . // .
const PRD: ByteHandler = |lexer| { const PRD: ByteHandler = |lexer| {
lexer.consume_char(); // Next char is `.`, which is ASCII
lexer.consume_ascii_char();
lexer.read_dot() lexer.read_dot()
}; };
// / // /
const SLH: ByteHandler = |lexer| { const SLH: ByteHandler = |lexer| {
lexer.consume_char(); // Next char is `/`, which is ASCII
lexer.consume_ascii_char();
match lexer.peek() { match lexer.peek() {
Some('/') => { Some('/') => {
lexer.current.chars.next(); lexer.current.chars.next();
@ -1484,37 +1516,43 @@ const SLH: ByteHandler = |lexer| {
// 0 // 0
const ZER: ByteHandler = |lexer| { const ZER: ByteHandler = |lexer| {
lexer.consume_char(); // Next char is `0`, which is ASCII
lexer.consume_ascii_char();
lexer.read_zero() lexer.read_zero()
}; };
// 1 to 9 // 1 to 9
const DIG: ByteHandler = |lexer| { const DIG: ByteHandler = |lexer| {
lexer.consume_char(); // Next char is an ASCII digit
lexer.consume_ascii_char();
lexer.decimal_literal_after_first_digit() lexer.decimal_literal_after_first_digit()
}; };
// : // :
const COL: ByteHandler = |lexer| { const COL: ByteHandler = |lexer| {
lexer.consume_char(); // Next char is `:`, which is ASCII
lexer.consume_ascii_char();
Kind::Colon Kind::Colon
}; };
// ; // ;
const SEM: ByteHandler = |lexer| { const SEM: ByteHandler = |lexer| {
lexer.consume_char(); // Next char is `;`, which is ASCII
lexer.consume_ascii_char();
Kind::Semicolon Kind::Semicolon
}; };
// < // <
const LSS: ByteHandler = |lexer| { const LSS: ByteHandler = |lexer| {
lexer.consume_char(); // Next char is `<`, which is ASCII
lexer.consume_ascii_char();
lexer.read_left_angle().unwrap_or_else(|| lexer.skip_single_line_comment()) lexer.read_left_angle().unwrap_or_else(|| lexer.skip_single_line_comment())
}; };
// = // =
const EQL: ByteHandler = |lexer| { const EQL: ByteHandler = |lexer| {
lexer.consume_char(); // Next char is `=`, which is ASCII
lexer.consume_ascii_char();
if lexer.next_eq('=') { if lexer.next_eq('=') {
if lexer.next_eq('=') { if lexer.next_eq('=') {
Kind::Eq3 Kind::Eq3
@ -1530,14 +1568,16 @@ const EQL: ByteHandler = |lexer| {
// > // >
const GTR: ByteHandler = |lexer| { const GTR: ByteHandler = |lexer| {
lexer.consume_char(); // Next char is `>`, which is ASCII
lexer.consume_ascii_char();
// `>=` is re-lexed with [Lexer::next_jsx_child] // `>=` is re-lexed with [Lexer::next_jsx_child]
Kind::RAngle Kind::RAngle
}; };
// ? // ?
const QST: ByteHandler = |lexer| { const QST: ByteHandler = |lexer| {
lexer.consume_char(); // Next char is `?`, which is ASCII
lexer.consume_ascii_char();
if lexer.next_eq('?') { if lexer.next_eq('?') {
if lexer.next_eq('=') { if lexer.next_eq('=') {
Kind::Question2Eq Kind::Question2Eq
@ -1559,20 +1599,26 @@ const QST: ByteHandler = |lexer| {
// @ // @
const AT_: ByteHandler = |lexer| { const AT_: ByteHandler = |lexer| {
lexer.consume_char(); // Next char is `@`, which is ASCII
lexer.consume_ascii_char();
Kind::At Kind::At
}; };
// [ // [
const BTO: ByteHandler = |lexer| { const BTO: ByteHandler = |lexer| {
lexer.consume_char(); // Next char is `[`, which is ASCII
lexer.consume_ascii_char();
Kind::LBrack Kind::LBrack
}; };
// \ // \
const ESC: ByteHandler = |lexer| { const ESC: ByteHandler = |lexer| {
let mut builder = AutoCow::new(lexer); let lexer_ref = lexer as &Lexer<'_>;
lexer.consume_char(); let mut builder = AutoCow::new(lexer_ref);
// Next char at start of this function was `\`, which is ASCII.
// `AutoCow::new` cannot have changed the state of `lexer.current.chars` iterator,
// as we explicitly passed it only an immutable reference.
lexer.consume_ascii_char();
builder.force_allocation_without_current_ascii_char(lexer); builder.force_allocation_without_current_ascii_char(lexer);
lexer.identifier_unicode_escape_sequence(&mut builder, true); lexer.identifier_unicode_escape_sequence(&mut builder, true);
let text = lexer.identifier_name(builder); let text = lexer.identifier_name(builder);
@ -1581,13 +1627,15 @@ const ESC: ByteHandler = |lexer| {
// ] // ]
const BTC: ByteHandler = |lexer| { const BTC: ByteHandler = |lexer| {
lexer.consume_char(); // Next char is `]`, which is ASCII
lexer.consume_ascii_char();
Kind::RBrack Kind::RBrack
}; };
// ^ // ^
const CRT: ByteHandler = |lexer| { const CRT: ByteHandler = |lexer| {
lexer.consume_char(); // Next char is `^`, which is ASCII
lexer.consume_ascii_char();
if lexer.next_eq('=') { if lexer.next_eq('=') {
Kind::CaretEq Kind::CaretEq
} else { } else {
@ -1597,19 +1645,22 @@ const CRT: ByteHandler = |lexer| {
// ` // `
const TPL: ByteHandler = |lexer| { const TPL: ByteHandler = |lexer| {
lexer.consume_char(); // Next char is '`', which is ASCII
lexer.consume_ascii_char();
lexer.read_template_literal(Kind::TemplateHead, Kind::NoSubstitutionTemplate) lexer.read_template_literal(Kind::TemplateHead, Kind::NoSubstitutionTemplate)
}; };
// { // {
const BEO: ByteHandler = |lexer| { const BEO: ByteHandler = |lexer| {
lexer.consume_char(); // Next char is `{`, which is ASCII
lexer.consume_ascii_char();
Kind::LCurly Kind::LCurly
}; };
// | // |
const PIP: ByteHandler = |lexer| { const PIP: ByteHandler = |lexer| {
lexer.consume_char(); // Next char is `|`, which is ASCII
lexer.consume_ascii_char();
if lexer.next_eq('|') { if lexer.next_eq('|') {
if lexer.next_eq('=') { if lexer.next_eq('=') {
Kind::Pipe2Eq Kind::Pipe2Eq
@ -1625,13 +1676,15 @@ const PIP: ByteHandler = |lexer| {
// } // }
const BEC: ByteHandler = |lexer| { const BEC: ByteHandler = |lexer| {
lexer.consume_char(); // Next char is `}`, which is ASCII
lexer.consume_ascii_char();
Kind::RCurly Kind::RCurly
}; };
// ~ // ~
const TLD: ByteHandler = |lexer| { const TLD: ByteHandler = |lexer| {
lexer.consume_char(); // Next char is `~`, which is ASCII
lexer.consume_ascii_char();
Kind::Tilde Kind::Tilde
}; };