diff --git a/crates/oxc_parser/src/lexer/mod.rs b/crates/oxc_parser/src/lexer/mod.rs index dd6c6bdb2..c3d698eda 100644 --- a/crates/oxc_parser/src/lexer/mod.rs +++ b/crates/oxc_parser/src/lexer/mod.rs @@ -11,7 +11,6 @@ mod string_builder; mod token; mod trivia_builder; -use assert_unchecked::assert_unchecked; use rustc_hash::FxHashMap; use std::{collections::VecDeque, str::Chars}; @@ -271,20 +270,6 @@ impl<'a> Lexer<'a> { self.current.chars.next().unwrap() } - /// Consume the current char when it's known to be ASCII. - /// This compiles down to a single instruction, just incrementing `chars` iterator's pointer. - /// NOTE: Caller must ensure not at EOF and current char is ASCII. - #[inline] - fn consume_ascii_char(&mut self) -> char { - let s = self.current.chars.as_str(); - // SAFETY: Caller must ensure not at EOF and current char is ASCII. - unsafe { - assert_unchecked!(!s.is_empty()); - assert_unchecked!(s.as_bytes()[0] < 128); - } - self.current.chars.next().unwrap() - } - /// Peek the next char without advancing the position #[inline] fn peek(&self) -> Option { @@ -395,7 +380,9 @@ impl<'a> Lexer<'a> { } let byte = remaining.as_bytes()[0]; - let kind = BYTE_HANDLERS[byte as usize](self); + // SAFETY: Check for `remaining.is_empty()` ensures not at end of file, + // and `byte` is the byte at current position of `self.current.chars`. + let kind = unsafe { handle_byte(byte, self) }; if !matches!( kind, @@ -1307,6 +1294,17 @@ enum SurrogatePair { HighLow(u32, u32), } +#[allow(clippy::unnecessary_safety_comment)] +/// Handle next byte of source. +/// SAFETY: +/// * Lexer must not be at end of file. +/// * `byte` must be next byte of source code, corresponding to current position +/// of `lexer.current.chars`. +/// * Only `BYTE_HANDLERS` for ASCII characters may use the `ascii_byte_handler!()` macro. +unsafe fn handle_byte(byte: u8, lexer: &mut Lexer) -> Kind { + BYTE_HANDLERS[byte as usize](lexer) +} + type ByteHandler = fn(&mut Lexer<'_>) -> Kind; /// Lookup table mapping any incoming byte to a handler function defined below. @@ -1332,33 +1330,82 @@ static BYTE_HANDLERS: [ByteHandler; 256] = [ UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // F ]; +#[allow(clippy::unnecessary_safety_comment)] +/// Macro for defining byte handler for an ASCII character. +/// +/// In addition to defining a `const` for the handler, it also asserts that lexer +/// is not at end of file, and that next char is ASCII. +/// Where the handler is for an ASCII character, these assertions are self-evidently true. +/// +/// These assertions produce no runtime code, but hint to the compiler that it can assume that +/// next char is ASCII, and it uses that information to optimize the rest of the handler. +/// e.g. `lexer.current.chars.next()` becomes just a single assembler instruction. +/// Without the assertions, the compiler is unable to deduce the next char is ASCII, due to +/// the indirection of the `BYTE_HANDLERS` jump table. +/// +/// These assertions are unchecked (i.e. won't panic) and will cause UB if they're incorrect. +/// +/// SAFETY: Only use this macro to define byte handlers for ASCII characters. +/// +/// ``` +/// ascii_byte_handler!(SPS(lexer) { +/// lexer.consume_char(); +/// Kind::WhiteSpace +/// }); +/// ``` +/// +/// expands to: +/// +/// ``` +/// const SPS: ByteHandler = |lexer| { +/// unsafe { +/// use ::assert_unchecked::assert_unchecked; +/// let s = lexer.current.chars.as_str(); +/// assert_unchecked!(!s.is_empty()); +/// assert_unchecked!(s.as_bytes()[0] < 128); +/// } +/// lexer.consume_char(); +/// Kind::WhiteSpace +/// }; +/// ``` +macro_rules! ascii_byte_handler { + ($id:ident($lex:ident) $body:expr) => { + const $id: ByteHandler = |$lex| { + // SAFETY: This macro is only used for ASCII characters + unsafe { + use ::assert_unchecked::assert_unchecked; + let s = $lex.current.chars.as_str(); + assert_unchecked!(!s.is_empty()); + assert_unchecked!(s.as_bytes()[0] < 128); + } + $body + }; + }; +} + // `\0` `\1` etc -const ERR: ByteHandler = |lexer| { - // Next char is an ASCII char e.g. `\0` - let c = lexer.consume_ascii_char(); +ascii_byte_handler!(ERR(lexer) { + let c = lexer.consume_char(); lexer.error(diagnostics::InvalidCharacter(c, lexer.unterminated_range())); Kind::Undetermined -}; +}); // -const SPS: ByteHandler = |lexer| { - // Next char is an ASCII space character - lexer.consume_ascii_char(); +ascii_byte_handler!(SPS(lexer) { + lexer.consume_char(); Kind::WhiteSpace -}; +}); // '\r' '\n' -const LIN: ByteHandler = |lexer| { - // Next char is `\r` or `\n`, which are both ASCII - lexer.consume_ascii_char(); +ascii_byte_handler!(LIN(lexer) { + lexer.consume_char(); lexer.current.token.is_on_new_line = true; Kind::NewLine -}; +}); // ! -const EXL: ByteHandler = |lexer| { - // Next char is `!`, which is ASCII - lexer.consume_ascii_char(); +ascii_byte_handler!(EXL(lexer) { + lexer.consume_char(); if lexer.next_eq('=') { if lexer.next_eq('=') { Kind::Neq2 @@ -1368,23 +1415,21 @@ const EXL: ByteHandler = |lexer| { } else { Kind::Bang } -}; +}); // ' " -const QOT: ByteHandler = |lexer| { - // Next char is `'` or `"`, which are both ASCII - let c = lexer.consume_ascii_char(); +ascii_byte_handler!(QOT(lexer) { + let c = lexer.consume_char(); if lexer.context == LexerContext::JsxAttributeValue { lexer.read_jsx_string_literal(c) } else { lexer.read_string_literal(c) } -}; +}); // # -const HAS: ByteHandler = |lexer| { - // Next char is `#`, which is ASCII - lexer.consume_ascii_char(); +ascii_byte_handler!(HAS(lexer) { + lexer.consume_char(); // HashbangComment :: // `#!` SingleLineCommentChars? if lexer.current.token.start == 0 && lexer.next_eq('!') { @@ -1392,28 +1437,27 @@ const HAS: ByteHandler = |lexer| { } else { lexer.private_identifier() } -}; +}); -const IDT: ByteHandler = |lexer| { +// `A..=Z`, `a..=z` (except special cases below), `_`, `$` +ascii_byte_handler!(IDT(lexer) { lexer.identifier_name_handler(); Kind::Ident -}; +}); // % -const PRC: ByteHandler = |lexer| { - // Next char is `%`, which is ASCII - lexer.consume_ascii_char(); +ascii_byte_handler!(PRC(lexer) { + lexer.consume_char(); if lexer.next_eq('=') { Kind::PercentEq } else { Kind::Percent } -}; +}); // & -const AMP: ByteHandler = |lexer| { - // Next char is `&`, which is ASCII - lexer.consume_ascii_char(); +ascii_byte_handler!(AMP(lexer) { + lexer.consume_char(); if lexer.next_eq('&') { if lexer.next_eq('=') { Kind::Amp2Eq @@ -1425,26 +1469,23 @@ const AMP: ByteHandler = |lexer| { } else { Kind::Amp } -}; +}); // ( -const PNO: ByteHandler = |lexer| { - // Next char is `(`, which is ASCII - lexer.consume_ascii_char(); +ascii_byte_handler!(PNO(lexer) { + lexer.consume_char(); Kind::LParen -}; +}); // ) -const PNC: ByteHandler = |lexer| { - // Next char is `)`, which is ASCII - lexer.consume_ascii_char(); +ascii_byte_handler!(PNC(lexer) { + lexer.consume_char(); Kind::RParen -}; +}); // * -const ATR: ByteHandler = |lexer| { - // Next char is `*`, which is ASCII - lexer.consume_ascii_char(); +ascii_byte_handler!(ATR(lexer) { + lexer.consume_char(); if lexer.next_eq('*') { if lexer.next_eq('=') { Kind::Star2Eq @@ -1456,12 +1497,11 @@ const ATR: ByteHandler = |lexer| { } else { Kind::Star } -}; +}); // + -const PLS: ByteHandler = |lexer| { - // Next char is `+`, which is ASCII - lexer.consume_ascii_char(); +ascii_byte_handler!(PLS(lexer) { + lexer.consume_char(); if lexer.next_eq('+') { Kind::Plus2 } else if lexer.next_eq('=') { @@ -1469,33 +1509,29 @@ const PLS: ByteHandler = |lexer| { } else { Kind::Plus } -}; +}); // , -const COM: ByteHandler = |lexer| { - // Next char is `,`, which is ASCII - lexer.consume_ascii_char(); +ascii_byte_handler!(COM(lexer) { + lexer.consume_char(); Kind::Comma -}; +}); // - -const MIN: ByteHandler = |lexer| { - // Next char is `-`, which is ASCII - lexer.consume_ascii_char(); +ascii_byte_handler!(MIN(lexer) { + lexer.consume_char(); lexer.read_minus().unwrap_or_else(|| lexer.skip_single_line_comment()) -}; +}); // . -const PRD: ByteHandler = |lexer| { - // Next char is `.`, which is ASCII - lexer.consume_ascii_char(); +ascii_byte_handler!(PRD(lexer) { + lexer.consume_char(); lexer.read_dot() -}; +}); // / -const SLH: ByteHandler = |lexer| { - // Next char is `/`, which is ASCII - lexer.consume_ascii_char(); +ascii_byte_handler!(SLH(lexer) { + lexer.consume_char(); match lexer.peek() { Some('/') => { lexer.current.chars.next(); @@ -1514,47 +1550,41 @@ const SLH: ByteHandler = |lexer| { } } } -}; +}); // 0 -const ZER: ByteHandler = |lexer| { - // Next char is `0`, which is ASCII - lexer.consume_ascii_char(); +ascii_byte_handler!(ZER(lexer) { + lexer.consume_char(); lexer.read_zero() -}; +}); // 1 to 9 -const DIG: ByteHandler = |lexer| { - // Next char is an ASCII digit - lexer.consume_ascii_char(); +ascii_byte_handler!(DIG(lexer) { + lexer.consume_char(); lexer.decimal_literal_after_first_digit() -}; +}); // : -const COL: ByteHandler = |lexer| { - // Next char is `:`, which is ASCII - lexer.consume_ascii_char(); +ascii_byte_handler!(COL(lexer) { + lexer.consume_char(); Kind::Colon -}; +}); // ; -const SEM: ByteHandler = |lexer| { - // Next char is `;`, which is ASCII - lexer.consume_ascii_char(); +ascii_byte_handler!(SEM(lexer) { + lexer.consume_char(); Kind::Semicolon -}; +}); // < -const LSS: ByteHandler = |lexer| { - // Next char is `<`, which is ASCII - lexer.consume_ascii_char(); +ascii_byte_handler!(LSS(lexer) { + lexer.consume_char(); lexer.read_left_angle().unwrap_or_else(|| lexer.skip_single_line_comment()) -}; +}); // = -const EQL: ByteHandler = |lexer| { - // Next char is `=`, which is ASCII - lexer.consume_ascii_char(); +ascii_byte_handler!(EQL(lexer) { + lexer.consume_char(); if lexer.next_eq('=') { if lexer.next_eq('=') { Kind::Eq3 @@ -1566,20 +1596,18 @@ const EQL: ByteHandler = |lexer| { } else { Kind::Eq } -}; +}); // > -const GTR: ByteHandler = |lexer| { - // Next char is `>`, which is ASCII - lexer.consume_ascii_char(); +ascii_byte_handler!(GTR(lexer) { + lexer.consume_char(); // `>=` is re-lexed with [Lexer::next_jsx_child] Kind::RAngle -}; +}); // ? -const QST: ByteHandler = |lexer| { - // Next char is `?`, which is ASCII - lexer.consume_ascii_char(); +ascii_byte_handler!(QST(lexer) { + lexer.consume_char(); if lexer.next_eq('?') { if lexer.next_eq('=') { Kind::Question2Eq @@ -1597,72 +1625,61 @@ const QST: ByteHandler = |lexer| { } else { Kind::Question } -}; +}); // @ -const AT_: ByteHandler = |lexer| { - // Next char is `@`, which is ASCII - lexer.consume_ascii_char(); +ascii_byte_handler!(AT_(lexer) { + lexer.consume_char(); Kind::At -}; +}); // [ -const BTO: ByteHandler = |lexer| { - // Next char is `[`, which is ASCII - lexer.consume_ascii_char(); +ascii_byte_handler!(BTO(lexer) { + lexer.consume_char(); Kind::LBrack -}; +}); // \ -const ESC: ByteHandler = |lexer| { - let lexer_ref = lexer as &Lexer<'_>; - let mut builder = AutoCow::new(lexer_ref); - // Next char at start of this function was `\`, which is ASCII. - // `AutoCow::new` cannot have changed the state of `lexer.current.chars` iterator, - // as we explicitly passed it only an immutable reference. - lexer.consume_ascii_char(); +ascii_byte_handler!(ESC(lexer) { + let mut builder = AutoCow::new(lexer); + lexer.consume_char(); builder.force_allocation_without_current_ascii_char(lexer); lexer.identifier_unicode_escape_sequence(&mut builder, true); let text = lexer.identifier_name(builder); Kind::match_keyword(text) -}; +}); // ] -const BTC: ByteHandler = |lexer| { - // Next char is `]`, which is ASCII - lexer.consume_ascii_char(); +ascii_byte_handler!(BTC(lexer) { + lexer.consume_char(); Kind::RBrack -}; +}); // ^ -const CRT: ByteHandler = |lexer| { - // Next char is `^`, which is ASCII - lexer.consume_ascii_char(); +ascii_byte_handler!(CRT(lexer) { + lexer.consume_char(); if lexer.next_eq('=') { Kind::CaretEq } else { Kind::Caret } -}; +}); // ` -const TPL: ByteHandler = |lexer| { - // Next char is '`', which is ASCII - lexer.consume_ascii_char(); +ascii_byte_handler!(TPL(lexer) { + lexer.consume_char(); lexer.read_template_literal(Kind::TemplateHead, Kind::NoSubstitutionTemplate) -}; +}); // { -const BEO: ByteHandler = |lexer| { - // Next char is `{`, which is ASCII - lexer.consume_ascii_char(); +ascii_byte_handler!(BEO(lexer) { + lexer.consume_char(); Kind::LCurly -}; +}); // | -const PIP: ByteHandler = |lexer| { - // Next char is `|`, which is ASCII - lexer.consume_ascii_char(); +ascii_byte_handler!(PIP(lexer) { + lexer.consume_char(); if lexer.next_eq('|') { if lexer.next_eq('=') { Kind::Pipe2Eq @@ -1674,23 +1691,21 @@ const PIP: ByteHandler = |lexer| { } else { Kind::Pipe } -}; +}); // } -const BEC: ByteHandler = |lexer| { - // Next char is `}`, which is ASCII - lexer.consume_ascii_char(); +ascii_byte_handler!(BEC(lexer) { + lexer.consume_char(); Kind::RCurly -}; +}); // ~ -const TLD: ByteHandler = |lexer| { - // Next char is `~`, which is ASCII - lexer.consume_ascii_char(); +ascii_byte_handler!(TLD(lexer) { + lexer.consume_char(); Kind::Tilde -}; +}); -const L_A: ByteHandler = |lexer| match &lexer.identifier_name_handler()[1..] { +ascii_byte_handler!(L_A(lexer) match &lexer.identifier_name_handler()[1..] { "wait" => Kind::Await, "sync" => Kind::Async, "bstract" => Kind::Abstract, @@ -1700,16 +1715,16 @@ const L_A: ByteHandler = |lexer| match &lexer.identifier_name_handler()[1..] { "ssert" => Kind::Assert, "sserts" => Kind::Asserts, _ => Kind::Ident, -}; +}); -const L_B: ByteHandler = |lexer| match &lexer.identifier_name_handler()[1..] { +ascii_byte_handler!(L_B(lexer) match &lexer.identifier_name_handler()[1..] { "reak" => Kind::Break, "oolean" => Kind::Boolean, "igint" => Kind::BigInt, _ => Kind::Ident, -}; +}); -const L_C: ByteHandler = |lexer| match &lexer.identifier_name_handler()[1..] { +ascii_byte_handler!(L_C(lexer) match &lexer.identifier_name_handler()[1..] { "onst" => Kind::Const, "lass" => Kind::Class, "ontinue" => Kind::Continue, @@ -1717,41 +1732,41 @@ const L_C: ByteHandler = |lexer| match &lexer.identifier_name_handler()[1..] { "ase" => Kind::Case, "onstructor" => Kind::Constructor, _ => Kind::Ident, -}; +}); -const L_D: ByteHandler = |lexer| match &lexer.identifier_name_handler()[1..] { +ascii_byte_handler!(L_D(lexer) match &lexer.identifier_name_handler()[1..] { "o" => Kind::Do, "elete" => Kind::Delete, "eclare" => Kind::Declare, "efault" => Kind::Default, "ebugger" => Kind::Debugger, _ => Kind::Ident, -}; +}); -const L_E: ByteHandler = |lexer| match &lexer.identifier_name_handler()[1..] { +ascii_byte_handler!(L_E(lexer) match &lexer.identifier_name_handler()[1..] { "lse" => Kind::Else, "num" => Kind::Enum, "xport" => Kind::Export, "xtends" => Kind::Extends, _ => Kind::Ident, -}; +}); -const L_F: ByteHandler = |lexer| match &lexer.identifier_name_handler()[1..] { +ascii_byte_handler!(L_F(lexer) match &lexer.identifier_name_handler()[1..] { "unction" => Kind::Function, "alse" => Kind::False, "or" => Kind::For, "inally" => Kind::Finally, "rom" => Kind::From, _ => Kind::Ident, -}; +}); -const L_G: ByteHandler = |lexer| match &lexer.identifier_name_handler()[1..] { +ascii_byte_handler!(L_G(lexer) match &lexer.identifier_name_handler()[1..] { "et" => Kind::Get, "lobal" => Kind::Global, _ => Kind::Ident, -}; +}); -const L_I: ByteHandler = |lexer| match &lexer.identifier_name_handler()[1..] { +ascii_byte_handler!(L_I(lexer) match &lexer.identifier_name_handler()[1..] { "f" => Kind::If, "nstanceof" => Kind::Instanceof, "n" => Kind::In, @@ -1762,57 +1777,57 @@ const L_I: ByteHandler = |lexer| match &lexer.identifier_name_handler()[1..] { "ntrinsic" => Kind::Intrinsic, "s" => Kind::Is, _ => Kind::Ident, -}; +}); -const L_K: ByteHandler = |lexer| match &lexer.identifier_name_handler()[1..] { +ascii_byte_handler!(L_K(lexer) match &lexer.identifier_name_handler()[1..] { "eyof" => Kind::KeyOf, _ => Kind::Ident, -}; +}); -const L_L: ByteHandler = |lexer| match &lexer.identifier_name_handler()[1..] { +ascii_byte_handler!(L_L(lexer) match &lexer.identifier_name_handler()[1..] { "et" => Kind::Let, _ => Kind::Ident, -}; +}); -const L_M: ByteHandler = |lexer| match &lexer.identifier_name_handler()[1..] { +ascii_byte_handler!(L_M(lexer) match &lexer.identifier_name_handler()[1..] { "eta" => Kind::Meta, "odule" => Kind::Module, _ => Kind::Ident, -}; +}); -const L_N: ByteHandler = |lexer| match &lexer.identifier_name_handler()[1..] { +ascii_byte_handler!(L_N(lexer) match &lexer.identifier_name_handler()[1..] { "ull" => Kind::Null, "ew" => Kind::New, "umber" => Kind::Number, "amespace" => Kind::Namespace, "ever" => Kind::Never, _ => Kind::Ident, -}; +}); -const L_O: ByteHandler = |lexer| match &lexer.identifier_name_handler()[1..] { +ascii_byte_handler!(L_O(lexer) match &lexer.identifier_name_handler()[1..] { "f" => Kind::Of, "bject" => Kind::Object, "ut" => Kind::Out, "verride" => Kind::Override, _ => Kind::Ident, -}; +}); -const L_P: ByteHandler = |lexer| match &lexer.identifier_name_handler()[1..] { +ascii_byte_handler!(L_P(lexer) match &lexer.identifier_name_handler()[1..] { "ackage" => Kind::Package, "rivate" => Kind::Private, "rotected" => Kind::Protected, "ublic" => Kind::Public, _ => Kind::Ident, -}; +}); -const L_R: ByteHandler = |lexer| match &lexer.identifier_name_handler()[1..] { +ascii_byte_handler!(L_R(lexer) match &lexer.identifier_name_handler()[1..] { "eturn" => Kind::Return, "equire" => Kind::Require, "eadonly" => Kind::Readonly, _ => Kind::Ident, -}; +}); -const L_S: ByteHandler = |lexer| match &lexer.identifier_name_handler()[1..] { +ascii_byte_handler!(L_S(lexer) match &lexer.identifier_name_handler()[1..] { "et" => Kind::Set, "uper" => Kind::Super, "witch" => Kind::Switch, @@ -1821,9 +1836,9 @@ const L_S: ByteHandler = |lexer| match &lexer.identifier_name_handler()[1..] { "tring" => Kind::String, "atisfies" => Kind::Satisfies, _ => Kind::Ident, -}; +}); -const L_T: ByteHandler = |lexer| match &lexer.identifier_name_handler()[1..] { +ascii_byte_handler!(L_T(lexer) match &lexer.identifier_name_handler()[1..] { "his" => Kind::This, "rue" => Kind::True, "hrow" => Kind::Throw, @@ -1832,33 +1847,34 @@ const L_T: ByteHandler = |lexer| match &lexer.identifier_name_handler()[1..] { "arget" => Kind::Target, "ype" => Kind::Type, _ => Kind::Ident, -}; +}); -const L_U: ByteHandler = |lexer| match &lexer.identifier_name_handler()[1..] { +ascii_byte_handler!(L_U(lexer) match &lexer.identifier_name_handler()[1..] { "ndefined" => Kind::Undefined, "sing" => Kind::Using, "nique" => Kind::Unique, "nknown" => Kind::Unknown, _ => Kind::Ident, -}; +}); -const L_V: ByteHandler = |lexer| match &lexer.identifier_name_handler()[1..] { +ascii_byte_handler!(L_V(lexer) match &lexer.identifier_name_handler()[1..] { "ar" => Kind::Var, "oid" => Kind::Void, _ => Kind::Ident, -}; +}); -const L_W: ByteHandler = |lexer| match &lexer.identifier_name_handler()[1..] { +ascii_byte_handler!(L_W(lexer) match &lexer.identifier_name_handler()[1..] { "hile" => Kind::While, "ith" => Kind::With, _ => Kind::Ident, -}; +}); -const L_Y: ByteHandler = |lexer| match &lexer.identifier_name_handler()[1..] { +ascii_byte_handler!(L_Y(lexer) match &lexer.identifier_name_handler()[1..] { "ield" => Kind::Yield, _ => Kind::Ident, -}; +}); -// Non-ASCII characters +// Non-ASCII characters. +// NB: Must not use `ascii_byte_handler!()` macro, as this handler is for non-ASCII chars. #[allow(clippy::redundant_closure_for_method_calls)] const UNI: ByteHandler = |lexer| lexer.unicode_char_handler();