perf(parser): support peeking over bytes (#4304)

Closes https://github.com/oxc-project/oxc/issues/3291
This commit is contained in:
lucab 2024-07-30 17:53:13 +00:00
parent 732f4e2591
commit c9c38a187c
11 changed files with 116 additions and 76 deletions

View file

@ -108,6 +108,24 @@ impl TryFrom<char> for RegExpFlags {
}
}
/// Converts a single byte into the regular expression flag it denotes.
///
/// The recognised bytes are `g`, `i`, `m`, `s`, `u`, `y`, `d` and `v`.
/// Any other byte is handed back unchanged as the error value.
impl TryFrom<u8> for RegExpFlags {
    type Error = u8;

    fn try_from(value: u8) -> Result<Self, Self::Error> {
        // Resolve the flag first; unknown bytes bail out early with the byte itself.
        let flag = match value {
            b'g' => Self::G,
            b'i' => Self::I,
            b'm' => Self::M,
            b's' => Self::S,
            b'u' => Self::U,
            b'y' => Self::Y,
            b'd' => Self::D,
            b'v' => Self::V,
            unknown => return Err(unknown),
        };
        Ok(flag)
    }
}
impl fmt::Display for RegExpFlags {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
if self.contains(Self::G) {

View file

@ -336,12 +336,12 @@ ascii_byte_handler!(PRD(lexer) {
// /
ascii_byte_handler!(SLH(lexer) {
lexer.consume_char();
match lexer.peek() {
Some('/') => {
match lexer.peek_byte() {
Some(b'/') => {
lexer.consume_char();
lexer.skip_single_line_comment()
}
Some('*') => {
Some(b'*') => {
lexer.consume_char();
lexer.skip_multi_line_comment()
}
@ -418,9 +418,9 @@ ascii_byte_handler!(QST(lexer) {
} else {
Kind::Question2
}
} else if lexer.peek() == Some('.') {
} else if lexer.peek_byte() == Some(b'.') {
// parse `?.1` as `?` `.1`
if lexer.peek2().is_some_and(|c| c.is_ascii_digit()) {
if lexer.peek_char2().is_some_and(|c| c.is_ascii_digit()) {
Kind::Question
} else {
lexer.consume_char();

View file

@ -98,7 +98,7 @@ impl<'a> Lexer<'a> {
/// Any number of characters can have already been consumed from `self.source` prior to it.
/// `self.source` should be positioned at start of Unicode character.
fn identifier_tail_unicode(&mut self, start_pos: SourcePosition) -> &'a str {
let c = self.peek().unwrap();
let c = self.peek_char().unwrap();
if is_identifier_part_unicode(c) {
self.consume_char();
self.identifier_tail_after_unicode(start_pos)
@ -115,7 +115,7 @@ impl<'a> Lexer<'a> {
pub(super) fn identifier_tail_after_unicode(&mut self, start_pos: SourcePosition) -> &'a str {
// Identifier contains a Unicode char, so it probably contains more.
// So just iterate over chars now, instead of bytes.
while let Some(c) = self.peek() {
while let Some(c) = self.peek_char() {
if is_identifier_part(c) {
self.consume_char();
} else if c == '\\' {
@ -177,7 +177,7 @@ impl<'a> Lexer<'a> {
// Consume chars until reach end of identifier or another escape
let chunk_start = self.source.position();
loop {
let maybe_char = self.peek();
let maybe_char = self.peek_char();
if maybe_char.is_some_and(is_identifier_part) {
self.consume_char();
continue;
@ -272,7 +272,7 @@ impl<'a> Lexer<'a> {
fn private_identifier_not_ascii_id(&mut self) -> Kind {
let b = self.source.peek_byte().unwrap();
if !b.is_ascii() {
let c = self.peek().unwrap();
let c = self.peek_char().unwrap();
if is_identifier_start_unicode(c) {
let start_pos = self.source.position();
self.consume_char();

View file

@ -61,12 +61,12 @@ impl<'a> Lexer<'a> {
/// `JSXFragment`
/// { `JSXChildExpressionopt` }
fn read_jsx_child(&mut self) -> Kind {
match self.peek() {
Some('<') => {
match self.peek_byte() {
Some(b'<') => {
self.consume_char();
Kind::LAngle
}
Some('{') => {
Some(b'{') => {
self.consume_char();
Kind::LCurly
}
@ -122,7 +122,7 @@ impl<'a> Lexer<'a> {
// Unicode chars are rare in identifiers, so cold branch to keep common path for ASCII
// as fast as possible
cold_branch(|| {
while let Some(c) = self.peek() {
while let Some(c) = self.peek_char() {
if c == '-' || is_identifier_part(c) {
self.consume_char();
} else {

View file

@ -206,11 +206,11 @@ impl Kind {
)
}
pub fn matches_number_char(self, c: char) -> bool {
pub fn matches_number_char(self, c: u8) -> bool {
match self {
Decimal => c.is_ascii_digit(),
Binary => matches!(c, '0'..='1'),
Octal => matches!(c, '0'..='7'),
Binary => matches!(c, b'0'..=b'1'),
Octal => matches!(c, b'0'..=b'7'),
Hex => c.is_ascii_hexdigit(),
_ => unreachable!(),
}

View file

@ -251,15 +251,27 @@ impl<'a> Lexer<'a> {
self.source.next_char().unwrap()
}
/// Peek the next byte of the source without advancing the position.
///
/// This is a thin wrapper that forwards to `self.source.peek_byte()` and returns
/// its result as-is (`None` presumably means end of input — confirm against
/// `Source::peek_byte`).
///
/// The returned byte is not guaranteed to be ASCII, so callers should only
/// compare it against ASCII byte values.
#[inline]
fn peek_byte(&self) -> Option<u8> {
self.source.peek_byte()
}
/// Peek the next two bytes of the source without advancing the position.
///
/// This is a thin wrapper that forwards to `self.source.peek_2_bytes()`, which
/// returns `None` when fewer than two bytes remain in the source.
#[inline]
fn peek_2_bytes(&self) -> Option<[u8; 2]> {
self.source.peek_2_bytes()
}
/// Peek the next char without advancing the position
#[inline]
fn peek(&self) -> Option<char> {
fn peek_char(&self) -> Option<char> {
self.source.peek_char()
}
/// Peek the char after the next one without advancing the position
#[inline]
fn peek2(&self) -> Option<char> {
fn peek_char2(&self) -> Option<char> {
self.source.peek_char2()
}
@ -284,7 +296,7 @@ impl<'a> Lexer<'a> {
/// Return `IllegalCharacter` Error or `UnexpectedEnd` if EOF
fn unexpected_err(&mut self) {
let offset = self.current_offset();
match self.peek() {
match self.peek_char() {
Some(c) => self.error(diagnostics::invalid_character(c, offset)),
None => self.error(diagnostics::unexpected_end(offset)),
}

View file

@ -6,19 +6,19 @@ use crate::diagnostics;
impl<'a> Lexer<'a> {
/// 12.9.3 Numeric Literals with `0` prefix
pub(super) fn read_zero(&mut self) -> Kind {
match self.peek() {
Some('b' | 'B') => self.read_non_decimal(Kind::Binary),
Some('o' | 'O') => self.read_non_decimal(Kind::Octal),
Some('x' | 'X') => self.read_non_decimal(Kind::Hex),
Some('e' | 'E') => {
match self.peek_byte() {
Some(b'b' | b'B') => self.read_non_decimal(Kind::Binary),
Some(b'o' | b'O') => self.read_non_decimal(Kind::Octal),
Some(b'x' | b'X') => self.read_non_decimal(Kind::Hex),
Some(b'e' | b'E') => {
self.consume_char();
self.read_decimal_exponent()
}
Some('.') => {
Some(b'.') => {
self.consume_char();
self.decimal_literal_after_decimal_point_after_digits()
}
Some('n') => {
Some(b'n') => {
self.consume_char();
self.check_after_numeric_literal(Kind::Decimal)
}
@ -42,23 +42,23 @@ impl<'a> Lexer<'a> {
fn read_non_decimal(&mut self, kind: Kind) -> Kind {
self.consume_char();
if self.peek().is_some_and(|c| kind.matches_number_char(c)) {
if self.peek_byte().is_some_and(|c| kind.matches_number_char(c)) {
self.consume_char();
} else {
self.unexpected_err();
return Kind::Undetermined;
}
while let Some(c) = self.peek() {
while let Some(c) = self.peek_byte() {
match c {
'_' => {
b'_' => {
self.consume_char();
// NOTE: it looks like invalid numeric tokens are still parsed.
// This seems to be a waste. It also requires us to put this
// call here instead of after we ensure the next character
// is a number character
self.token.set_has_separator();
if self.peek().is_some_and(|c| kind.matches_number_char(c)) {
if self.peek_byte().is_some_and(|c| kind.matches_number_char(c)) {
self.consume_char();
} else {
self.unexpected_err();
@ -71,20 +71,18 @@ impl<'a> Lexer<'a> {
_ => break,
}
}
if self.peek() == Some('n') {
self.consume_char();
}
self.next_ascii_char_eq(b'n');
self.check_after_numeric_literal(kind)
}
fn read_legacy_octal(&mut self) -> Kind {
let mut kind = Kind::Octal;
loop {
match self.peek() {
Some('0'..='7') => {
match self.peek_byte() {
Some(b'0'..=b'7') => {
self.consume_char();
}
Some('8'..='9') => {
Some(b'8'..=b'9') => {
self.consume_char();
kind = Kind::Decimal;
}
@ -92,14 +90,14 @@ impl<'a> Lexer<'a> {
}
}
match self.peek() {
match self.peek_byte() {
// allow 08.5 and 09.5
Some('.') if kind == Kind::Decimal => {
Some(b'.') if kind == Kind::Decimal => {
self.consume_char();
self.decimal_literal_after_decimal_point_after_digits()
}
// allow 08e1 and 09e1
Some('e') if kind == Kind::Decimal => {
Some(b'e') if kind == Kind::Decimal => {
self.consume_char();
self.read_decimal_exponent()
}
@ -108,12 +106,12 @@ impl<'a> Lexer<'a> {
}
fn read_decimal_exponent(&mut self) -> Kind {
let kind = match self.peek() {
Some('-') => {
let kind = match self.peek_byte() {
Some(b'-') => {
self.consume_char();
Kind::NegativeExponential
}
Some('+') => {
Some(b'+') => {
self.consume_char();
Kind::PositiveExponential
}
@ -124,7 +122,7 @@ impl<'a> Lexer<'a> {
}
fn read_decimal_digits(&mut self) {
if self.peek().is_some_and(|c| c.is_ascii_digit()) {
if self.peek_byte().is_some_and(|b| b.is_ascii_digit()) {
self.consume_char();
} else {
self.unexpected_err();
@ -135,23 +133,23 @@ impl<'a> Lexer<'a> {
}
fn read_decimal_digits_after_first_digit(&mut self) {
while let Some(c) = self.peek() {
match c {
'_' => {
while let Some(b) = self.peek_byte() {
match b {
b'_' => {
self.consume_char();
// NOTE: it looks like invalid numeric tokens are still parsed.
// This seems to be a waste. It also requires us to put this
// call here instead of after we ensure the next character
// is an ASCII digit
self.token.set_has_separator();
if self.peek().is_some_and(|c| c.is_ascii_digit()) {
if self.peek_byte().is_some_and(|b| b.is_ascii_digit()) {
self.consume_char();
} else {
self.unexpected_err();
return;
}
}
'0'..='9' => {
b'0'..=b'9' => {
self.consume_char();
}
_ => break,
@ -172,16 +170,14 @@ impl<'a> Lexer<'a> {
}
fn optional_decimal_digits(&mut self) {
if self.peek().is_some_and(|c| c.is_ascii_digit()) {
if self.peek_byte().is_some_and(|b| b.is_ascii_digit()) {
self.consume_char();
} else {
return;
self.read_decimal_digits_after_first_digit();
}
self.read_decimal_digits_after_first_digit();
}
fn optional_exponent(&mut self) -> Option<Kind> {
if matches!(self.peek(), Some('e' | 'E')) {
if matches!(self.peek_byte(), Some(b'e' | b'E')) {
self.consume_char();
return Some(self.read_decimal_exponent());
}
@ -191,12 +187,12 @@ impl<'a> Lexer<'a> {
fn check_after_numeric_literal(&mut self, kind: Kind) -> Kind {
let offset = self.offset();
// The SourceCharacter immediately following a NumericLiteral must not be an IdentifierStart or DecimalDigit.
let c = self.peek();
let c = self.peek_char();
if c.is_none() || c.is_some_and(|ch| !ch.is_ascii_digit() && !is_identifier_start(ch)) {
return kind;
}
self.consume_char();
while let Some(c) = self.peek() {
while let Some(c) = self.peek_char() {
if is_identifier_start(c) {
self.consume_char();
} else {

View file

@ -3,12 +3,12 @@ use super::{Kind, Lexer, Token};
impl<'a> Lexer<'a> {
/// Section 12.8 Punctuators
pub(super) fn read_dot(&mut self) -> Kind {
if self.peek() == Some('.') && self.peek2() == Some('.') {
if self.peek_2_bytes() == Some([b'.', b'.']) {
self.consume_char();
self.consume_char();
return Kind::Dot3;
}
if self.peek().is_some_and(|c| c.is_ascii_digit()) {
if self.peek_byte().is_some_and(|b| b.is_ascii_digit()) {
self.decimal_literal_after_decimal_point()
} else {
Kind::Dot
@ -25,7 +25,7 @@ impl<'a> Lexer<'a> {
}
} else if self.next_ascii_char_eq(b'=') {
Some(Kind::LtEq)
} else if self.peek() == Some('!')
} else if self.peek_byte() == Some(b'!')
// SingleLineHTMLOpenComment `<!--` in script mode
&& self.source_type.is_script()
&& self.remaining().starts_with("!--")

View file

@ -58,14 +58,16 @@ impl<'a> Lexer<'a> {
let pattern_end = self.offset() - 1; // -1 to exclude `/`
let mut flags = RegExpFlags::empty();
while let Some(ch @ ('$' | '_' | 'a'..='z' | 'A'..='Z' | '0'..='9')) = self.peek() {
while let Some(ch @ (b'$' | b'_' | b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9')) =
self.peek_byte()
{
self.consume_char();
let Ok(flag) = RegExpFlags::try_from(ch) else {
self.error(diagnostics::reg_exp_flag(ch, self.current_offset()));
self.error(diagnostics::reg_exp_flag(ch as char, self.current_offset()));
continue;
};
if flags.contains(flag) {
self.error(diagnostics::reg_exp_flag_twice(ch, self.current_offset()));
self.error(diagnostics::reg_exp_flag_twice(ch as char, self.current_offset()));
continue;
}
flags |= flag;

View file

@ -498,6 +498,19 @@ impl<'a> Source<'a> {
}
}
/// Peek the next two bytes of source without consuming them.
///
/// Returns `None` when fewer than two bytes lie between the current position
/// and the end of the source.
#[inline]
pub(super) fn peek_2_bytes(&self) -> Option<[u8; 2]> {
    // Number of bytes between the current position and the end;
    // `saturating_sub` keeps this at 0 rather than wrapping.
    let remaining = (self.end as usize).saturating_sub(self.ptr as usize);
    if remaining < 2 {
        return None;
    }

    // SAFETY: `remaining >= 2` was just established, so reading 2 bytes starting
    // at `self.ptr` cannot run past `self.end`.
    Some(unsafe { self.position().read2() })
}
/// Peek next byte of source without consuming it, without EOF bounds-check.
///
/// # SAFETY

View file

@ -18,7 +18,7 @@ enum SurrogatePair {
impl<'a> Lexer<'a> {
pub(super) fn unicode_char_handler(&mut self) -> Kind {
let c = self.peek().unwrap();
let c = self.peek_char().unwrap();
match c {
c if is_identifier_start_unicode(c) => {
let start_pos = self.source.position();
@ -60,7 +60,7 @@ impl<'a> Lexer<'a> {
return;
}
let value = match self.peek() {
let value = match self.peek_char() {
Some('{') => self.unicode_code_point(),
_ => self.surrogate_pair(),
};
@ -109,7 +109,7 @@ impl<'a> Lexer<'a> {
text: &mut String<'a>,
is_valid_escape_sequence: &mut bool,
) {
let value = match self.peek() {
let value = match self.peek_char() {
Some('{') => self.unicode_code_point(),
_ => self.surrogate_pair(),
};
@ -160,10 +160,10 @@ impl<'a> Lexer<'a> {
}
fn hex_digit(&mut self) -> Option<u32> {
let value = match self.peek() {
Some(c @ '0'..='9') => c as u32 - '0' as u32,
Some(c @ 'a'..='f') => 10 + (c as u32 - 'a' as u32),
Some(c @ 'A'..='F') => 10 + (c as u32 - 'A' as u32),
let value = match self.peek_byte() {
Some(c @ b'0'..=b'9') => u32::from(c) - '0' as u32,
Some(c @ b'a'..=b'f') => 10 + (u32::from(c) - 'a' as u32),
Some(c @ b'A'..=b'F') => 10 + (u32::from(c) - 'A' as u32),
_ => return None,
};
self.consume_char();
@ -188,9 +188,8 @@ impl<'a> Lexer<'a> {
fn surrogate_pair(&mut self) -> Option<SurrogatePair> {
let high = self.hex_4_digits()?;
// The first code unit of a surrogate pair is always in the range from 0xD800 to 0xDBFF, and is called a high surrogate or a lead surrogate.
let is_pair = (0xD800..=0xDBFF).contains(&high)
&& self.peek() == Some('\\')
&& self.peek2() == Some('u');
let is_pair =
(0xD800..=0xDBFF).contains(&high) && self.peek_2_bytes() == Some([b'\\', b'u']);
if !is_pair {
return Some(SurrogatePair::CodePoint(high));
}
@ -266,7 +265,7 @@ impl<'a> Lexer<'a> {
self.string_unicode_escape_sequence(text, is_valid_escape_sequence);
}
// 0 [lookahead ∉ DecimalDigit]
'0' if !self.peek().is_some_and(|c| c.is_ascii_digit()) => text.push('\0'),
'0' if !self.peek_byte().is_some_and(|b| b.is_ascii_digit()) => text.push('\0'),
// Section 12.9.4 String Literals
// LegacyOctalEscapeSequence
// NonOctalDecimalEscapeSequence
@ -275,16 +274,16 @@ impl<'a> Lexer<'a> {
num.push(a);
match a {
'4'..='7' => {
if matches!(self.peek(), Some('0'..='7')) {
if matches!(self.peek_byte(), Some(b'0'..=b'7')) {
let b = self.consume_char();
num.push(b);
}
}
'0'..='3' => {
if matches!(self.peek(), Some('0'..='7')) {
if matches!(self.peek_byte(), Some(b'0'..=b'7')) {
let b = self.consume_char();
num.push(b);
if matches!(self.peek(), Some('0'..='7')) {
if matches!(self.peek_byte(), Some(b'0'..=b'7')) {
let c = self.consume_char();
num.push(c);
}
@ -297,7 +296,7 @@ impl<'a> Lexer<'a> {
char::from_u32(u32::from_str_radix(num.as_str(), 8).unwrap()).unwrap();
text.push(value);
}
'0' if in_template && self.peek().is_some_and(|c| c.is_ascii_digit()) => {
'0' if in_template && self.peek_byte().is_some_and(|b| b.is_ascii_digit()) => {
self.consume_char();
// error raised within the parser by `diagnostics::template_literal`
*is_valid_escape_sequence = false;