perf(parser): lexer match byte not char (#2025)

2 related changes to lexer's `read_next_token()`:

1. Hint to the branch predictor that Unicode identifiers and non-standard
whitespace are rare by marking that branch `#[cold]`.

2. The branch is on whether the next character is ASCII or not. This check
only requires reading 1 byte, as ASCII characters are always a single byte
in UTF-8. So only do the work of getting a `char` in the cold path, once
it's established that the character is not ASCII and this work is required.
This commit is contained in:
overlookmotel 2024-01-14 10:50:11 +00:00 committed by GitHub
parent a356918d83
commit 60a927d8f5
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -374,28 +374,31 @@ impl<'a> Lexer<'a> {
let offset = self.offset(); let offset = self.offset();
self.current.token.start = offset; self.current.token.start = offset;
if let Some(c) = self.current.chars.clone().next() { let remaining = self.current.chars.as_str();
let kind = self.match_char(c); if remaining.is_empty() {
if !matches!(
kind,
Kind::WhiteSpace | Kind::NewLine | Kind::Comment | Kind::MultiLineComment
) {
return kind;
}
} else {
return Kind::Eof; return Kind::Eof;
} }
let byte = remaining.as_bytes()[0];
let kind = if byte < 128 {
BYTE_HANDLERS[byte as usize](self)
} else {
self.match_unicode_char()
};
if !matches!(
kind,
Kind::WhiteSpace | Kind::NewLine | Kind::Comment | Kind::MultiLineComment
) {
return kind;
}
} }
} }
#[inline] // `#[cold]` to hint to branch predictor that unicode identifiers and irregular whitespace are rare
fn match_char(&mut self, c: char) -> Kind { #[cold]
let size = c as usize; fn match_unicode_char(&mut self) -> Kind {
let c = self.current.chars.clone().next().unwrap();
if size < 128 {
return BYTE_HANDLERS[size](self);
}
match c { match c {
c if is_id_start_unicode(c) => { c if is_id_start_unicode(c) => {
let mut builder = AutoCow::new(self); let mut builder = AutoCow::new(self);