From 3d79d77b409c22669687f4007b891c78b78ff51b Mon Sep 17 00:00:00 2001 From: overlookmotel Date: Wed, 31 Jan 2024 03:43:53 +0000 Subject: [PATCH] refactor(parser): split lexer into multiple files (#2228) This PR has a large diff, but it contains no substantive changes whatsoever. It purely breaks up the lexer into multiple smaller files. I've been working quite intensively on the lexer over the past few weeks, but have still been finding it hard to make sense of, due to most of the logic currently being contained in [a single 1800-line file](https://github.com/oxc-project/oxc/blob/018675ceb1a7e3442d1aeaa69e0256964675d207/crates/oxc_parser/src/lexer/mod.rs). I feel that breaking it up into multiple files makes it much easier to navigate and understand. An additional benefit is that many functions can have their visibility reduced to module scope, so sub-systems for e.g. lexing numbers have fewer exposed functions. This makes it clearer what the entry points are, and makes it harder to make mistakes when working on the lexer. I intend to later make changes to the lexer for performance which will introduce unsafe code. Keeping that unsafe code encapsulated in modules will make it more viable to validate the workings of that code, and avoid accidental UB. There is one downside to this change. Previously [`lexer/mod.rs`](https://github.com/oxc-project/oxc/blob/018675ceb1a7e3442d1aeaa69e0256964675d207/crates/oxc_parser/src/lexer/mod.rs) was laid out in the same order as the JS spec. If you were trying to validate the lexer against the spec, this would make it easier. However, as OXC's parser is fairly mature at this point, and I imagine most spec-compliance issues have been flushed out by now, in my opinion this advantage is less compelling than it probably used to be. So in my view it's outweighed by the benefit of more readable code. Reviewing this could be a bit of a battle due to the size of the diff. 
I do have further changes I'd like to make, but I've intentionally kept this PR as 100% just: 1. Moving code around. 2. Reducing visibility of functions to module/super scope where that's possible to do without changing anything else. Aside from that, not even a single comment has changed. If you're willing to trust me on that promise, I think it can be merged without poring through it line by line. --- .typos.toml | 2 +- crates/oxc_parser/src/lexer/byte_handlers.rs | 588 +++++++ crates/oxc_parser/src/lexer/comment.rs | 49 + crates/oxc_parser/src/lexer/identifier.rs | 66 + crates/oxc_parser/src/lexer/jsx.rs | 108 ++ crates/oxc_parser/src/lexer/mod.rs | 1620 +----------------- crates/oxc_parser/src/lexer/numeric.rs | 199 +++ crates/oxc_parser/src/lexer/punctuation.rs | 83 + crates/oxc_parser/src/lexer/regex.rs | 78 + crates/oxc_parser/src/lexer/string.rs | 65 + crates/oxc_parser/src/lexer/template.rs | 86 + crates/oxc_parser/src/lexer/typescript.rs | 17 + crates/oxc_parser/src/lexer/unicode.rs | 318 ++++ 13 files changed, 1671 insertions(+), 1608 deletions(-) create mode 100644 crates/oxc_parser/src/lexer/byte_handlers.rs create mode 100644 crates/oxc_parser/src/lexer/comment.rs create mode 100644 crates/oxc_parser/src/lexer/identifier.rs create mode 100644 crates/oxc_parser/src/lexer/jsx.rs create mode 100644 crates/oxc_parser/src/lexer/numeric.rs create mode 100644 crates/oxc_parser/src/lexer/punctuation.rs create mode 100644 crates/oxc_parser/src/lexer/regex.rs create mode 100644 crates/oxc_parser/src/lexer/string.rs create mode 100644 crates/oxc_parser/src/lexer/template.rs create mode 100644 crates/oxc_parser/src/lexer/typescript.rs create mode 100644 crates/oxc_parser/src/lexer/unicode.rs diff --git a/.typos.toml b/.typos.toml index 0850c991c..ab2db2e38 100644 --- a/.typos.toml +++ b/.typos.toml @@ -8,7 +8,7 @@ extend-exclude = [ "tasks/coverage/babel", "tasks/coverage/typescript", "tasks/prettier_conformance/prettier", - "crates/oxc_parser/src/lexer/mod.rs", 
+ "crates/oxc_parser/src/lexer/byte_handlers.rs", "crates/oxc_linter/fixtures", "crates/oxc_linter/src/rules/jsx_a11y/img_redundant_alt.rs", "crates/oxc_syntax/src/xml_entities.rs", diff --git a/crates/oxc_parser/src/lexer/byte_handlers.rs b/crates/oxc_parser/src/lexer/byte_handlers.rs new file mode 100644 index 000000000..09efc66bc --- /dev/null +++ b/crates/oxc_parser/src/lexer/byte_handlers.rs @@ -0,0 +1,588 @@ +use super::{AutoCow, Kind, Lexer, LexerContext}; +use crate::diagnostics; + +#[allow(clippy::unnecessary_safety_comment)] +/// Handle next byte of source. +/// +/// SAFETY: +/// * Lexer must not be at end of file. +/// * `byte` must be next byte of source code, corresponding to current position +/// of `lexer.current.chars`. +/// * Only `BYTE_HANDLERS` for ASCII characters may use the `ascii_byte_handler!()` macro. +pub(super) unsafe fn handle_byte(byte: u8, lexer: &mut Lexer) -> Kind { + BYTE_HANDLERS[byte as usize](lexer) +} + +type ByteHandler = unsafe fn(&mut Lexer<'_>) -> Kind; + +/// Lookup table mapping any incoming byte to a handler function defined below. 
+/// +#[rustfmt::skip] +static BYTE_HANDLERS: [ByteHandler; 256] = [ +// 0 1 2 3 4 5 6 7 8 9 A B C D E F // + ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, SPS, LIN, SPS, SPS, LIN, ERR, ERR, // 0 + ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, // 1 + SPS, EXL, QOT, HAS, IDT, PRC, AMP, QOT, PNO, PNC, ATR, PLS, COM, MIN, PRD, SLH, // 2 + ZER, DIG, DIG, DIG, DIG, DIG, DIG, DIG, DIG, DIG, COL, SEM, LSS, EQL, GTR, QST, // 3 + AT_, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, // 4 + IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, BTO, ESC, BTC, CRT, IDT, // 5 + TPL, L_A, L_B, L_C, L_D, L_E, L_F, L_G, IDT, L_I, IDT, L_K, L_L, L_M, L_N, L_O, // 6 + L_P, IDT, L_R, L_S, L_T, L_U, L_V, L_W, IDT, L_Y, IDT, BEO, PIP, BEC, TLD, ERR, // 7 + UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // 8 + UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // 9 + UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // A + UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // B + UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // C + UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // D + UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // E + UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // F +]; + +#[allow(clippy::unnecessary_safety_comment)] +/// Macro for defining byte handler for an ASCII character. +/// +/// In addition to defining a `const` for the handler, it also asserts that lexer +/// is not at end of file, and that next char is ASCII. +/// Where the handler is for an ASCII character, these assertions are self-evidently true. 
+/// +/// These assertions produce no runtime code, but hint to the compiler that it can assume that +/// next char is ASCII, and it uses that information to optimize the rest of the handler. +/// e.g. `lexer.current.chars.next()` becomes just a single assembler instruction. +/// Without the assertions, the compiler is unable to deduce the next char is ASCII, due to +/// the indirection of the `BYTE_HANDLERS` jump table. +/// +/// These assertions are unchecked (i.e. won't panic) and will cause UB if they're incorrect. +/// +/// SAFETY: Only use this macro to define byte handlers for ASCII characters. +/// +/// ``` +/// ascii_byte_handler!(SPS(lexer) { +/// lexer.consume_char(); +/// Kind::WhiteSpace +/// }); +/// ``` +/// +/// expands to: +/// +/// ``` +/// const SPS: ByteHandler = |lexer| { +/// unsafe { +/// use assert_unchecked::assert_unchecked; +/// let s = lexer.current.chars.as_str(); +/// assert_unchecked!(!s.is_empty()); +/// assert_unchecked!(s.as_bytes()[0] < 128); +/// } +/// lexer.consume_char(); +/// Kind::WhiteSpace +/// }; +/// ``` +macro_rules! ascii_byte_handler { + ($id:ident($lex:ident) $body:expr) => { + const $id: ByteHandler = |$lex| { + // SAFETY: This macro is only used for ASCII characters + unsafe { + use assert_unchecked::assert_unchecked; + let s = $lex.current.chars.as_str(); + assert_unchecked!(!s.is_empty()); + assert_unchecked!(s.as_bytes()[0] < 128); + } + $body + }; + }; +} + +// `\0` `\1` etc +ascii_byte_handler!(ERR(lexer) { + let c = lexer.consume_char(); + lexer.error(diagnostics::InvalidCharacter(c, lexer.unterminated_range())); + Kind::Undetermined +}); + +// +ascii_byte_handler!(SPS(lexer) { + lexer.consume_char(); + Kind::Skip +}); + +// '\r' '\n' +ascii_byte_handler!(LIN(lexer) { + lexer.consume_char(); + lexer.current.token.is_on_new_line = true; + Kind::Skip +}); + +// ! 
+ascii_byte_handler!(EXL(lexer) { + lexer.consume_char(); + if lexer.next_eq('=') { + if lexer.next_eq('=') { + Kind::Neq2 + } else { + Kind::Neq + } + } else { + Kind::Bang + } +}); + +// ' " +ascii_byte_handler!(QOT(lexer) { + let c = lexer.consume_char(); + if lexer.context == LexerContext::JsxAttributeValue { + lexer.read_jsx_string_literal(c) + } else { + lexer.read_string_literal(c) + } +}); + +// # +ascii_byte_handler!(HAS(lexer) { + lexer.consume_char(); + // HashbangComment :: + // `#!` SingleLineCommentChars? + if lexer.current.token.start == 0 && lexer.next_eq('!') { + lexer.read_hashbang_comment() + } else { + lexer.private_identifier() + } +}); + +// `A..=Z`, `a..=z` (except special cases below), `_`, `$` +ascii_byte_handler!(IDT(lexer) { + lexer.identifier_name_handler(); + Kind::Ident +}); + +// % +ascii_byte_handler!(PRC(lexer) { + lexer.consume_char(); + if lexer.next_eq('=') { + Kind::PercentEq + } else { + Kind::Percent + } +}); + +// & +ascii_byte_handler!(AMP(lexer) { + lexer.consume_char(); + if lexer.next_eq('&') { + if lexer.next_eq('=') { + Kind::Amp2Eq + } else { + Kind::Amp2 + } + } else if lexer.next_eq('=') { + Kind::AmpEq + } else { + Kind::Amp + } +}); + +// ( +ascii_byte_handler!(PNO(lexer) { + lexer.consume_char(); + Kind::LParen +}); + +// ) +ascii_byte_handler!(PNC(lexer) { + lexer.consume_char(); + Kind::RParen +}); + +// * +ascii_byte_handler!(ATR(lexer) { + lexer.consume_char(); + if lexer.next_eq('*') { + if lexer.next_eq('=') { + Kind::Star2Eq + } else { + Kind::Star2 + } + } else if lexer.next_eq('=') { + Kind::StarEq + } else { + Kind::Star + } +}); + +// + +ascii_byte_handler!(PLS(lexer) { + lexer.consume_char(); + if lexer.next_eq('+') { + Kind::Plus2 + } else if lexer.next_eq('=') { + Kind::PlusEq + } else { + Kind::Plus + } +}); + +// , +ascii_byte_handler!(COM(lexer) { + lexer.consume_char(); + Kind::Comma +}); + +// - +ascii_byte_handler!(MIN(lexer) { + lexer.consume_char(); + lexer.read_minus().unwrap_or_else(|| 
lexer.skip_single_line_comment()) +}); + +// . +ascii_byte_handler!(PRD(lexer) { + lexer.consume_char(); + lexer.read_dot() +}); + +// / +ascii_byte_handler!(SLH(lexer) { + lexer.consume_char(); + match lexer.peek() { + Some('/') => { + lexer.current.chars.next(); + lexer.skip_single_line_comment() + } + Some('*') => { + lexer.current.chars.next(); + lexer.skip_multi_line_comment() + } + _ => { + // regex is handled separately, see `next_regex` + if lexer.next_eq('=') { + Kind::SlashEq + } else { + Kind::Slash + } + } + } +}); + +// 0 +ascii_byte_handler!(ZER(lexer) { + lexer.consume_char(); + lexer.read_zero() +}); + +// 1 to 9 +ascii_byte_handler!(DIG(lexer) { + lexer.consume_char(); + lexer.decimal_literal_after_first_digit() +}); + +// : +ascii_byte_handler!(COL(lexer) { + lexer.consume_char(); + Kind::Colon +}); + +// ; +ascii_byte_handler!(SEM(lexer) { + lexer.consume_char(); + Kind::Semicolon +}); + +// < +ascii_byte_handler!(LSS(lexer) { + lexer.consume_char(); + lexer.read_left_angle().unwrap_or_else(|| lexer.skip_single_line_comment()) +}); + +// = +ascii_byte_handler!(EQL(lexer) { + lexer.consume_char(); + if lexer.next_eq('=') { + if lexer.next_eq('=') { + Kind::Eq3 + } else { + Kind::Eq2 + } + } else if lexer.next_eq('>') { + Kind::Arrow + } else { + Kind::Eq + } +}); + +// > +ascii_byte_handler!(GTR(lexer) { + lexer.consume_char(); + // `>=` is re-lexed with [Lexer::next_jsx_child] + Kind::RAngle +}); + +// ? 
+ascii_byte_handler!(QST(lexer) { + lexer.consume_char(); + if lexer.next_eq('?') { + if lexer.next_eq('=') { + Kind::Question2Eq + } else { + Kind::Question2 + } + } else if lexer.peek() == Some('.') { + // parse `?.1` as `?` `.1` + if lexer.peek2().is_some_and(|c| c.is_ascii_digit()) { + Kind::Question + } else { + lexer.current.chars.next(); + Kind::QuestionDot + } + } else { + Kind::Question + } +}); + +// @ +ascii_byte_handler!(AT_(lexer) { + lexer.consume_char(); + Kind::At +}); + +// [ +ascii_byte_handler!(BTO(lexer) { + lexer.consume_char(); + Kind::LBrack +}); + +// \ +ascii_byte_handler!(ESC(lexer) { + let mut builder = AutoCow::new(lexer); + lexer.consume_char(); + builder.force_allocation_without_current_ascii_char(lexer); + lexer.identifier_unicode_escape_sequence(&mut builder, true); + let text = lexer.identifier_name(builder); + Kind::match_keyword(text) +}); + +// ] +ascii_byte_handler!(BTC(lexer) { + lexer.consume_char(); + Kind::RBrack +}); + +// ^ +ascii_byte_handler!(CRT(lexer) { + lexer.consume_char(); + if lexer.next_eq('=') { + Kind::CaretEq + } else { + Kind::Caret + } +}); + +// ` +ascii_byte_handler!(TPL(lexer) { + lexer.consume_char(); + lexer.read_template_literal(Kind::TemplateHead, Kind::NoSubstitutionTemplate) +}); + +// { +ascii_byte_handler!(BEO(lexer) { + lexer.consume_char(); + Kind::LCurly +}); + +// | +ascii_byte_handler!(PIP(lexer) { + lexer.consume_char(); + if lexer.next_eq('|') { + if lexer.next_eq('=') { + Kind::Pipe2Eq + } else { + Kind::Pipe2 + } + } else if lexer.next_eq('=') { + Kind::PipeEq + } else { + Kind::Pipe + } +}); + +// } +ascii_byte_handler!(BEC(lexer) { + lexer.consume_char(); + Kind::RCurly +}); + +// ~ +ascii_byte_handler!(TLD(lexer) { + lexer.consume_char(); + Kind::Tilde +}); + +ascii_byte_handler!(L_A(lexer) match &lexer.identifier_name_handler()[1..] 
{ + "wait" => Kind::Await, + "sync" => Kind::Async, + "bstract" => Kind::Abstract, + "ccessor" => Kind::Accessor, + "ny" => Kind::Any, + "s" => Kind::As, + "ssert" => Kind::Assert, + "sserts" => Kind::Asserts, + _ => Kind::Ident, +}); + +ascii_byte_handler!(L_B(lexer) match &lexer.identifier_name_handler()[1..] { + "reak" => Kind::Break, + "oolean" => Kind::Boolean, + "igint" => Kind::BigInt, + _ => Kind::Ident, +}); + +ascii_byte_handler!(L_C(lexer) match &lexer.identifier_name_handler()[1..] { + "onst" => Kind::Const, + "lass" => Kind::Class, + "ontinue" => Kind::Continue, + "atch" => Kind::Catch, + "ase" => Kind::Case, + "onstructor" => Kind::Constructor, + _ => Kind::Ident, +}); + +ascii_byte_handler!(L_D(lexer) match &lexer.identifier_name_handler()[1..] { + "o" => Kind::Do, + "elete" => Kind::Delete, + "eclare" => Kind::Declare, + "efault" => Kind::Default, + "ebugger" => Kind::Debugger, + _ => Kind::Ident, +}); + +ascii_byte_handler!(L_E(lexer) match &lexer.identifier_name_handler()[1..] { + "lse" => Kind::Else, + "num" => Kind::Enum, + "xport" => Kind::Export, + "xtends" => Kind::Extends, + _ => Kind::Ident, +}); + +ascii_byte_handler!(L_F(lexer) match &lexer.identifier_name_handler()[1..] { + "unction" => Kind::Function, + "alse" => Kind::False, + "or" => Kind::For, + "inally" => Kind::Finally, + "rom" => Kind::From, + _ => Kind::Ident, +}); + +ascii_byte_handler!(L_G(lexer) match &lexer.identifier_name_handler()[1..] { + "et" => Kind::Get, + "lobal" => Kind::Global, + _ => Kind::Ident, +}); + +ascii_byte_handler!(L_I(lexer) match &lexer.identifier_name_handler()[1..] { + "f" => Kind::If, + "nstanceof" => Kind::Instanceof, + "n" => Kind::In, + "mplements" => Kind::Implements, + "mport" => Kind::Import, + "nfer" => Kind::Infer, + "nterface" => Kind::Interface, + "ntrinsic" => Kind::Intrinsic, + "s" => Kind::Is, + _ => Kind::Ident, +}); + +ascii_byte_handler!(L_K(lexer) match &lexer.identifier_name_handler()[1..] 
{ + "eyof" => Kind::KeyOf, + _ => Kind::Ident, +}); + +ascii_byte_handler!(L_L(lexer) match &lexer.identifier_name_handler()[1..] { + "et" => Kind::Let, + _ => Kind::Ident, +}); + +ascii_byte_handler!(L_M(lexer) match &lexer.identifier_name_handler()[1..] { + "eta" => Kind::Meta, + "odule" => Kind::Module, + _ => Kind::Ident, +}); + +ascii_byte_handler!(L_N(lexer) match &lexer.identifier_name_handler()[1..] { + "ull" => Kind::Null, + "ew" => Kind::New, + "umber" => Kind::Number, + "amespace" => Kind::Namespace, + "ever" => Kind::Never, + _ => Kind::Ident, +}); + +ascii_byte_handler!(L_O(lexer) match &lexer.identifier_name_handler()[1..] { + "f" => Kind::Of, + "bject" => Kind::Object, + "ut" => Kind::Out, + "verride" => Kind::Override, + _ => Kind::Ident, +}); + +ascii_byte_handler!(L_P(lexer) match &lexer.identifier_name_handler()[1..] { + "ackage" => Kind::Package, + "rivate" => Kind::Private, + "rotected" => Kind::Protected, + "ublic" => Kind::Public, + _ => Kind::Ident, +}); + +ascii_byte_handler!(L_R(lexer) match &lexer.identifier_name_handler()[1..] { + "eturn" => Kind::Return, + "equire" => Kind::Require, + "eadonly" => Kind::Readonly, + _ => Kind::Ident, +}); + +ascii_byte_handler!(L_S(lexer) match &lexer.identifier_name_handler()[1..] { + "et" => Kind::Set, + "uper" => Kind::Super, + "witch" => Kind::Switch, + "tatic" => Kind::Static, + "ymbol" => Kind::Symbol, + "tring" => Kind::String, + "atisfies" => Kind::Satisfies, + _ => Kind::Ident, +}); + +ascii_byte_handler!(L_T(lexer) match &lexer.identifier_name_handler()[1..] { + "his" => Kind::This, + "rue" => Kind::True, + "hrow" => Kind::Throw, + "ry" => Kind::Try, + "ypeof" => Kind::Typeof, + "arget" => Kind::Target, + "ype" => Kind::Type, + _ => Kind::Ident, +}); + +ascii_byte_handler!(L_U(lexer) match &lexer.identifier_name_handler()[1..] 
{ + "ndefined" => Kind::Undefined, + "sing" => Kind::Using, + "nique" => Kind::Unique, + "nknown" => Kind::Unknown, + _ => Kind::Ident, +}); + +ascii_byte_handler!(L_V(lexer) match &lexer.identifier_name_handler()[1..] { + "ar" => Kind::Var, + "oid" => Kind::Void, + _ => Kind::Ident, +}); + +ascii_byte_handler!(L_W(lexer) match &lexer.identifier_name_handler()[1..] { + "hile" => Kind::While, + "ith" => Kind::With, + _ => Kind::Ident, +}); + +ascii_byte_handler!(L_Y(lexer) match &lexer.identifier_name_handler()[1..] { + "ield" => Kind::Yield, + _ => Kind::Ident, +}); + +// Non-ASCII characters. +// NB: Must not use `ascii_byte_handler!()` macro, as this handler is for non-ASCII chars. +#[allow(clippy::redundant_closure_for_method_calls)] +const UNI: ByteHandler = |lexer| lexer.unicode_char_handler(); diff --git a/crates/oxc_parser/src/lexer/comment.rs b/crates/oxc_parser/src/lexer/comment.rs new file mode 100644 index 000000000..f195796ba --- /dev/null +++ b/crates/oxc_parser/src/lexer/comment.rs @@ -0,0 +1,49 @@ +use super::{Kind, Lexer}; +use crate::diagnostics; + +use oxc_syntax::identifier::is_line_terminator; + +impl<'a> Lexer<'a> { + /// Section 12.4 Single Line Comment + #[allow(clippy::cast_possible_truncation)] + pub(super) fn skip_single_line_comment(&mut self) -> Kind { + let start = self.current.token.start; + while let Some(c) = self.current.chars.next() { + if is_line_terminator(c) { + self.current.token.is_on_new_line = true; + self.trivia_builder + .add_single_line_comment(start, self.offset() - c.len_utf8() as u32); + return Kind::Skip; + } + } + // EOF + self.trivia_builder.add_single_line_comment(start, self.offset()); + Kind::Skip + } + + /// Section 12.4 Multi Line Comment + pub(super) fn skip_multi_line_comment(&mut self) -> Kind { + while let Some(c) = self.current.chars.next() { + if c == '*' && self.next_eq('/') { + self.trivia_builder.add_multi_line_comment(self.current.token.start, self.offset()); + return Kind::Skip; + } + if 
is_line_terminator(c) { + self.current.token.is_on_new_line = true; + } + } + self.error(diagnostics::UnterminatedMultiLineComment(self.unterminated_range())); + Kind::Eof + } + + /// Section 12.5 Hashbang Comments + pub(super) fn read_hashbang_comment(&mut self) -> Kind { + while let Some(c) = self.current.chars.next().as_ref() { + if is_line_terminator(*c) { + break; + } + } + self.current.token.is_on_new_line = true; + Kind::HashbangComment + } +} diff --git a/crates/oxc_parser/src/lexer/identifier.rs b/crates/oxc_parser/src/lexer/identifier.rs new file mode 100644 index 000000000..272dd32f8 --- /dev/null +++ b/crates/oxc_parser/src/lexer/identifier.rs @@ -0,0 +1,66 @@ +use super::{AutoCow, Kind, Lexer, Span}; +use crate::diagnostics; + +use oxc_syntax::identifier::{is_identifier_part, is_identifier_start}; + +impl<'a> Lexer<'a> { + /// Section 12.7.1 Identifier Names + pub(super) fn identifier_name_handler(&mut self) -> &'a str { + let builder = AutoCow::new(self); + self.consume_char(); + self.identifier_name(builder) + } + + pub(super) fn identifier_name(&mut self, builder: AutoCow<'a>) -> &'a str { + self.identifier_tail(builder) + } + + pub(super) fn private_identifier(&mut self) -> Kind { + let mut builder = AutoCow::new(self); + let start = self.offset(); + match self.current.chars.next() { + Some(c) if is_identifier_start(c) => { + builder.push_matching(c); + } + Some('\\') => { + builder.force_allocation_without_current_ascii_char(self); + self.identifier_unicode_escape_sequence(&mut builder, true); + } + Some(c) => { + #[allow(clippy::cast_possible_truncation)] + self.error(diagnostics::InvalidCharacter( + c, + Span::new(start, start + c.len_utf8() as u32), + )); + return Kind::Undetermined; + } + None => { + self.error(diagnostics::UnexpectedEnd(Span::new(start, start))); + return Kind::Undetermined; + } + } + self.identifier_tail(builder); + Kind::PrivateIdentifier + } + + fn identifier_tail(&mut self, mut builder: AutoCow<'a>) -> &'a str { + // 
ident tail + while let Some(c) = self.peek() { + if !is_identifier_part(c) { + if c == '\\' { + self.current.chars.next(); + builder.force_allocation_without_current_ascii_char(self); + self.identifier_unicode_escape_sequence(&mut builder, false); + continue; + } + break; + } + self.current.chars.next(); + builder.push_matching(c); + } + let has_escape = builder.has_escape(); + let text = builder.finish(self); + self.save_string(has_escape, text); + text + } +} diff --git a/crates/oxc_parser/src/lexer/jsx.rs b/crates/oxc_parser/src/lexer/jsx.rs new file mode 100644 index 000000000..94b4d0e7e --- /dev/null +++ b/crates/oxc_parser/src/lexer/jsx.rs @@ -0,0 +1,108 @@ +use super::{AutoCow, Kind, Lexer, Token}; +use crate::diagnostics; + +use oxc_syntax::identifier::{is_identifier_part, is_identifier_start}; + +impl<'a> Lexer<'a> { + /// `JSXDoubleStringCharacters` :: + /// `JSXDoubleStringCharacter` `JSXDoubleStringCharactersopt` + /// `JSXDoubleStringCharacter` :: + /// `JSXStringCharacter` but not " + /// `JSXSingleStringCharacters` :: + /// `JSXSingleStringCharacter` `JSXSingleStringCharactersopt` + /// `JSXSingleStringCharacter` :: + /// `JSXStringCharacter` but not ' + /// `JSXStringCharacter` :: + /// `SourceCharacter` but not one of `HTMLCharacterReference` + pub(super) fn read_jsx_string_literal(&mut self, delimiter: char) -> Kind { + let mut builder = AutoCow::new(self); + loop { + match self.current.chars.next() { + Some(c @ ('"' | '\'')) => { + if c == delimiter { + self.save_string(builder.has_escape(), builder.finish_without_push(self)); + return Kind::Str; + } + builder.push_matching(c); + } + Some(other) => { + builder.push_matching(other); + } + None => { + self.error(diagnostics::UnterminatedString(self.unterminated_range())); + return Kind::Undetermined; + } + } + } + } + + pub(crate) fn next_jsx_child(&mut self) -> Token { + self.current.token.start = self.offset(); + let kind = self.read_jsx_child(); + self.finish_next(kind) + } + + /// Expand the 
current token for `JSXIdentifier` + pub(crate) fn next_jsx_identifier(&mut self, start_offset: u32) -> Token { + let kind = self.read_jsx_identifier(start_offset); + self.lookahead.clear(); + self.finish_next(kind) + } + + /// [`JSXChild`](https://facebook.github.io/jsx/#prod-JSXChild) + /// `JSXChild` : + /// `JSXText` + /// `JSXElement` + /// `JSXFragment` + /// { `JSXChildExpressionopt` } + fn read_jsx_child(&mut self) -> Kind { + match self.peek() { + Some('<') => { + self.current.chars.next(); + Kind::LAngle + } + Some('{') => { + self.current.chars.next(); + Kind::LCurly + } + Some(_) => { + loop { + // The tokens `{`, `<`, `>` and `}` cannot appear in a jsx text. + // The TypeScript compiler raises the error "Unexpected token. Did you mean `{'>'}` or `>`?". + // Where as the Babel compiler does not raise any errors. + // The following check omits `>` and `}` so that more Babel tests can be passed. + if self.peek().is_some_and(|c| c == '{' || c == '<') { + break; + } + if self.current.chars.next().is_none() { + break; + } + } + Kind::JSXText + } + None => Kind::Eof, + } + } + + /// `JSXIdentifier` : + /// `IdentifierStart` + /// `JSXIdentifier` `IdentifierPart` + /// `JSXIdentifier` [no `WhiteSpace` or Comment here] - + fn read_jsx_identifier(&mut self, _start_offset: u32) -> Kind { + while let Some(c) = self.peek() { + if c == '-' || is_identifier_start(c) { + self.current.chars.next(); + while let Some(c) = self.peek() { + if is_identifier_part(c) { + self.current.chars.next(); + } else { + break; + } + } + } else { + break; + } + } + Kind::Ident + } +} diff --git a/crates/oxc_parser/src/lexer/mod.rs b/crates/oxc_parser/src/lexer/mod.rs index 3cb8f585e..2c14e2d85 100644 --- a/crates/oxc_parser/src/lexer/mod.rs +++ b/crates/oxc_parser/src/lexer/mod.rs @@ -5,31 +5,37 @@ //! * [rustc](https://github.com/rust-lang/rust/blob/master/compiler/rustc_lexer/src) //! 
* [v8](https://v8.dev/blog/scanner) +mod byte_handlers; +mod comment; +mod identifier; +mod jsx; mod kind; mod number; +mod numeric; +mod punctuation; +mod regex; +mod string; mod string_builder; +mod template; mod token; mod trivia_builder; +mod typescript; +mod unicode; use rustc_hash::FxHashMap; use std::{collections::VecDeque, str::Chars}; -use oxc_allocator::{Allocator, String}; +use oxc_allocator::Allocator; use oxc_ast::ast::RegExpFlags; use oxc_diagnostics::Error; use oxc_span::{SourceType, Span}; -use oxc_syntax::identifier::{ - is_identifier_part, is_identifier_start, is_identifier_start_unicode, - is_irregular_line_terminator, is_irregular_whitespace, is_line_terminator, CR, FF, LF, LS, PS, - TAB, VT, -}; +use self::{byte_handlers::handle_byte, string_builder::AutoCow, trivia_builder::TriviaBuilder}; pub use self::{ kind::Kind, number::{parse_big_int, parse_float, parse_int}, token::Token, }; -use self::{string_builder::AutoCow, trivia_builder::TriviaBuilder}; use crate::{diagnostics, MAX_LEN}; #[derive(Debug, Clone)] @@ -173,12 +179,6 @@ impl<'a> Lexer<'a> { self.finish_next(kind) } - pub fn next_jsx_child(&mut self) -> Token { - self.current.token.start = self.offset(); - let kind = self.read_jsx_child(); - self.finish_next(kind) - } - fn finish_next(&mut self, kind: Kind) -> Token { self.current.token.kind = kind; self.current.token.end = self.offset(); @@ -188,61 +188,6 @@ impl<'a> Lexer<'a> { token } - /// Re-tokenize the current `/` or `/=` and return `RegExp` - /// See Section 12: - /// The `InputElementRegExp` goal symbol is used in all syntactic grammar contexts - /// where a `RegularExpressionLiteral` is permitted - /// Which means the parser needs to re-tokenize on `PrimaryExpression`, - /// `RegularExpressionLiteral` only appear on the right hand side of `PrimaryExpression` - pub fn next_regex(&mut self, kind: Kind) -> (Token, u32, RegExpFlags) { - self.current.token.start = self.offset() - - match kind { - Kind::Slash => 1, - Kind::SlashEq 
=> 2, - _ => unreachable!(), - }; - let (pattern_end, flags) = self.read_regex(); - self.lookahead.clear(); - let token = self.finish_next(Kind::RegExp); - (token, pattern_end, flags) - } - - pub fn next_right_angle(&mut self) -> Token { - let kind = self.read_right_angle(); - self.lookahead.clear(); - self.finish_next(kind) - } - - /// Re-tokenize the current `}` token for `TemplateSubstitutionTail` - /// See Section 12, the parser needs to re-tokenize on `TemplateSubstitutionTail`, - pub fn next_template_substitution_tail(&mut self) -> Token { - self.current.token.start = self.offset() - 1; - let kind = self.read_template_literal(Kind::TemplateMiddle, Kind::TemplateTail); - self.lookahead.clear(); - self.finish_next(kind) - } - - /// Expand the current token for `JSXIdentifier` - pub fn next_jsx_identifier(&mut self, start_offset: u32) -> Token { - let kind = self.read_jsx_identifier(start_offset); - self.lookahead.clear(); - self.finish_next(kind) - } - - /// Re-tokenize '<<' or '<=' or '<<=' to '<' - pub fn re_lex_as_typescript_l_angle(&mut self, kind: Kind) -> Token { - let offset = match kind { - Kind::ShiftLeft | Kind::LtEq => 2, - Kind::ShiftLeftEq => 3, - _ => unreachable!(), - }; - self.current.token.start = self.offset() - offset; - self.current.chars = self.source[self.current.token.start as usize + 1..].chars(); - let kind = Kind::LAngle; - self.lookahead.clear(); - self.finish_next(kind) - } - // ---------- Private Methods ---------- // fn error>(&mut self, error: T) { self.errors.push(error.into()); @@ -311,65 +256,6 @@ impl<'a> Lexer<'a> { } } - /// Save the string if it is escaped - /// This reduces the overall memory consumption while keeping the `Token` size small - /// Strings without escaped values can be retrieved as is from the token span - fn save_string(&mut self, has_escape: bool, s: &'a str) { - if !has_escape { - return; - } - self.escaped_strings.insert(self.current.token.start, s); - self.current.token.escaped = true; - } - - 
pub(crate) fn get_string(&self, token: Token) -> &'a str { - if token.escaped { - return self.escaped_strings[&token.start]; - } - - let raw = &self.source[token.start as usize..token.end as usize]; - match token.kind { - Kind::Str => { - &raw[1..raw.len() - 1] // omit surrounding quotes - } - Kind::PrivateIdentifier => { - &raw[1..] // omit leading `#` - } - _ => raw, - } - } - - /// Save the template if it is escaped - fn save_template_string( - &mut self, - is_valid_escape_sequence: bool, - has_escape: bool, - s: &'a str, - ) { - if !has_escape { - return; - } - self.escaped_templates - .insert(self.current.token.start, is_valid_escape_sequence.then(|| s)); - self.current.token.escaped = true; - } - - pub(crate) fn get_template_string(&self, token: Token) -> Option<&'a str> { - if token.escaped { - return self.escaped_templates[&token.start]; - } - let raw = &self.source[token.start as usize..token.end as usize]; - Some(match token.kind { - Kind::NoSubstitutionTemplate | Kind::TemplateTail => { - &raw[1..raw.len() - 1] // omit surrounding quotes or leading "}" and trailing "`" - } - Kind::TemplateHead | Kind::TemplateMiddle => { - &raw[1..raw.len() - 2] // omit leading "`" or "}" and trailing "${" - } - _ => raw, - }) - } - /// Read each char and set the current token /// Whitespace and line terminators are skipped fn read_next_token(&mut self) -> Kind { @@ -391,1484 +277,4 @@ impl<'a> Lexer<'a> { } } } - - fn unicode_char_handler(&mut self) -> Kind { - let c = self.current.chars.clone().next().unwrap(); - match c { - c if is_identifier_start_unicode(c) => { - let mut builder = AutoCow::new(self); - let c = self.consume_char(); - builder.push_matching(c); - self.identifier_name(builder); - Kind::Ident - } - c if is_irregular_whitespace(c) => { - self.trivia_builder - .add_irregular_whitespace(self.current.token.start, self.offset()); - self.consume_char(); - Kind::Skip - } - c if is_irregular_line_terminator(c) => { - self.consume_char(); - 
self.current.token.is_on_new_line = true; - Kind::Skip - } - _ => { - self.consume_char(); - self.error(diagnostics::InvalidCharacter(c, self.unterminated_range())); - Kind::Undetermined - } - } - } - - /// Section 12.4 Single Line Comment - #[allow(clippy::cast_possible_truncation)] - fn skip_single_line_comment(&mut self) -> Kind { - let start = self.current.token.start; - while let Some(c) = self.current.chars.next() { - if is_line_terminator(c) { - self.current.token.is_on_new_line = true; - self.trivia_builder - .add_single_line_comment(start, self.offset() - c.len_utf8() as u32); - return Kind::Skip; - } - } - // EOF - self.trivia_builder.add_single_line_comment(start, self.offset()); - Kind::Skip - } - - /// Section 12.4 Multi Line Comment - fn skip_multi_line_comment(&mut self) -> Kind { - while let Some(c) = self.current.chars.next() { - if c == '*' && self.next_eq('/') { - self.trivia_builder.add_multi_line_comment(self.current.token.start, self.offset()); - return Kind::Skip; - } - if is_line_terminator(c) { - self.current.token.is_on_new_line = true; - } - } - self.error(diagnostics::UnterminatedMultiLineComment(self.unterminated_range())); - Kind::Eof - } - - /// Section 12.5 Hashbang Comments - fn read_hashbang_comment(&mut self) -> Kind { - while let Some(c) = self.current.chars.next().as_ref() { - if is_line_terminator(*c) { - break; - } - } - self.current.token.is_on_new_line = true; - Kind::HashbangComment - } - - /// Section 12.7.1 Identifier Names - fn identifier_tail(&mut self, mut builder: AutoCow<'a>) -> &'a str { - // ident tail - while let Some(c) = self.peek() { - if !is_identifier_part(c) { - if c == '\\' { - self.current.chars.next(); - builder.force_allocation_without_current_ascii_char(self); - self.identifier_unicode_escape_sequence(&mut builder, false); - continue; - } - break; - } - self.current.chars.next(); - builder.push_matching(c); - } - let has_escape = builder.has_escape(); - let text = builder.finish(self); - 
self.save_string(has_escape, text); - text - } - - fn identifier_name(&mut self, builder: AutoCow<'a>) -> &'a str { - self.identifier_tail(builder) - } - - fn identifier_name_handler(&mut self) -> &'a str { - let builder = AutoCow::new(self); - self.consume_char(); - self.identifier_name(builder) - } - - /// Section 12.8 Punctuators - fn read_dot(&mut self) -> Kind { - if self.peek() == Some('.') && self.peek2() == Some('.') { - self.current.chars.next(); - self.current.chars.next(); - return Kind::Dot3; - } - if self.peek().is_some_and(|c| c.is_ascii_digit()) { - self.decimal_literal_after_decimal_point() - } else { - Kind::Dot - } - } - - /// returns None for `SingleLineHTMLOpenComment` `` in script mode - fn read_minus(&mut self) -> Option { - if self.next_eq('-') { - // SingleLineHTMLCloseComment `-->` in script mode - if self.current.token.is_on_new_line - && self.source_type.is_script() - && self.next_eq('>') - { - None - } else { - Some(Kind::Minus2) - } - } else if self.next_eq('=') { - Some(Kind::MinusEq) - } else { - Some(Kind::Minus) - } - } - - fn private_identifier(&mut self) -> Kind { - let mut builder = AutoCow::new(self); - let start = self.offset(); - match self.current.chars.next() { - Some(c) if is_identifier_start(c) => { - builder.push_matching(c); - } - Some('\\') => { - builder.force_allocation_without_current_ascii_char(self); - self.identifier_unicode_escape_sequence(&mut builder, true); - } - Some(c) => { - #[allow(clippy::cast_possible_truncation)] - self.error(diagnostics::InvalidCharacter( - c, - Span::new(start, start + c.len_utf8() as u32), - )); - return Kind::Undetermined; - } - None => { - self.error(diagnostics::UnexpectedEnd(Span::new(start, start))); - return Kind::Undetermined; - } - } - self.identifier_tail(builder); - Kind::PrivateIdentifier - } - - /// 12.9.3 Numeric Literals with `0` prefix - fn read_zero(&mut self) -> Kind { - match self.peek() { - Some('b' | 'B') => self.read_non_decimal(Kind::Binary), - Some('o' | 'O') 
=> self.read_non_decimal(Kind::Octal), - Some('x' | 'X') => self.read_non_decimal(Kind::Hex), - Some('e' | 'E') => { - self.current.chars.next(); - self.read_decimal_exponent() - } - Some('.') => { - self.current.chars.next(); - self.decimal_literal_after_decimal_point_after_digits() - } - Some('n') => { - self.current.chars.next(); - self.check_after_numeric_literal(Kind::Decimal) - } - Some(n) if n.is_ascii_digit() => self.read_legacy_octal(), - _ => self.check_after_numeric_literal(Kind::Decimal), - } - } - - fn read_non_decimal(&mut self, kind: Kind) -> Kind { - self.current.chars.next(); - - if self.peek().is_some_and(|c| kind.matches_number_char(c)) { - self.current.chars.next(); - } else { - self.unexpected_err(); - return Kind::Undetermined; - } - - while let Some(c) = self.peek() { - match c { - '_' => { - self.current.chars.next(); - if self.peek().is_some_and(|c| kind.matches_number_char(c)) { - self.current.chars.next(); - } else { - self.unexpected_err(); - return Kind::Undetermined; - } - } - c if kind.matches_number_char(c) => { - self.current.chars.next(); - } - _ => break, - } - } - if self.peek() == Some('n') { - self.current.chars.next(); - } - self.check_after_numeric_literal(kind) - } - - fn read_legacy_octal(&mut self) -> Kind { - let mut kind = Kind::Octal; - loop { - match self.peek() { - Some('0'..='7') => { - self.current.chars.next(); - } - Some('8'..='9') => { - self.current.chars.next(); - kind = Kind::Decimal; - } - _ => break, - } - } - - match self.peek() { - // allow 08.5 and 09.5 - Some('.') if kind == Kind::Decimal => { - self.current.chars.next(); - self.decimal_literal_after_decimal_point_after_digits() - } - // allow 08e1 and 09e1 - Some('e') if kind == Kind::Decimal => { - self.current.chars.next(); - self.read_decimal_exponent() - } - _ => self.check_after_numeric_literal(kind), - } - } - - fn decimal_literal_after_first_digit(&mut self) -> Kind { - self.read_decimal_digits_after_first_digit(); - if self.next_eq('.') { - 
return self.decimal_literal_after_decimal_point_after_digits(); - } else if self.next_eq('n') { - return self.check_after_numeric_literal(Kind::Decimal); - } - - let kind = self.optional_exponent().map_or(Kind::Decimal, |kind| kind); - self.check_after_numeric_literal(kind) - } - - fn read_decimal_exponent(&mut self) -> Kind { - let kind = match self.peek() { - Some('-') => { - self.current.chars.next(); - Kind::NegativeExponential - } - Some('+') => { - self.current.chars.next(); - Kind::PositiveExponential - } - _ => Kind::PositiveExponential, - }; - self.read_decimal_digits(); - kind - } - - fn read_decimal_digits(&mut self) { - if self.peek().is_some_and(|c| c.is_ascii_digit()) { - self.current.chars.next(); - } else { - self.unexpected_err(); - return; - } - - self.read_decimal_digits_after_first_digit(); - } - - fn read_decimal_digits_after_first_digit(&mut self) { - while let Some(c) = self.peek() { - match c { - '_' => { - self.current.chars.next(); - if self.peek().is_some_and(|c| c.is_ascii_digit()) { - self.current.chars.next(); - } else { - self.unexpected_err(); - return; - } - } - '0'..='9' => { - self.current.chars.next(); - } - _ => break, - } - } - } - - fn decimal_literal_after_decimal_point(&mut self) -> Kind { - self.read_decimal_digits(); - self.optional_exponent(); - self.check_after_numeric_literal(Kind::Float) - } - - fn decimal_literal_after_decimal_point_after_digits(&mut self) -> Kind { - self.optional_decimal_digits(); - self.optional_exponent(); - self.check_after_numeric_literal(Kind::Float) - } - - fn optional_decimal_digits(&mut self) { - if self.peek().is_some_and(|c| c.is_ascii_digit()) { - self.current.chars.next(); - } else { - return; - } - self.read_decimal_digits_after_first_digit(); - } - - fn optional_exponent(&mut self) -> Option { - if matches!(self.peek(), Some('e' | 'E')) { - self.current.chars.next(); - return Some(self.read_decimal_exponent()); - } - None - } - - fn check_after_numeric_literal(&mut self, kind: Kind) -> 
Kind { - let offset = self.offset(); - // The SourceCharacter immediately following a NumericLiteral must not be an IdentifierStart or DecimalDigit. - let c = self.peek(); - if c.is_none() || c.is_some_and(|ch| !ch.is_ascii_digit() && !is_identifier_start(ch)) { - return kind; - } - self.current.chars.next(); - while let Some(c) = self.peek() { - if is_identifier_start(c) { - self.current.chars.next(); - } else { - break; - } - } - self.error(diagnostics::InvalidNumberEnd(Span::new(offset, self.offset()))); - Kind::Undetermined - } - - /// 12.9.4 String Literals - fn read_string_literal(&mut self, delimiter: char) -> Kind { - let mut builder = AutoCow::new(self); - loop { - match self.current.chars.next() { - None | Some('\r' | '\n') => { - self.error(diagnostics::UnterminatedString(self.unterminated_range())); - return Kind::Undetermined; - } - Some(c @ ('"' | '\'')) => { - if c == delimiter { - self.save_string(builder.has_escape(), builder.finish_without_push(self)); - return Kind::Str; - } - builder.push_matching(c); - } - Some('\\') => { - let start = self.offset() - 1; - let text = builder.get_mut_string_without_current_ascii_char(self); - let mut is_valid_escape_sequence = true; - self.read_string_escape_sequence(text, false, &mut is_valid_escape_sequence); - if !is_valid_escape_sequence { - let range = Span::new(start, self.offset()); - self.error(diagnostics::InvalidEscapeSequence(range)); - } - } - Some(c) => { - builder.push_matching(c); - } - } - } - } - - /// 12.9.5 Regular Expression Literals - fn read_regex(&mut self) -> (u32, RegExpFlags) { - let mut in_escape = false; - let mut in_character_class = false; - loop { - match self.current.chars.next() { - None => { - self.error(diagnostics::UnterminatedRegExp(self.unterminated_range())); - return (self.offset(), RegExpFlags::empty()); - } - Some(c) if is_line_terminator(c) => { - self.error(diagnostics::UnterminatedRegExp(self.unterminated_range())); - #[allow(clippy::cast_possible_truncation)] - let 
pattern_end = self.offset() - c.len_utf8() as u32; - return (pattern_end, RegExpFlags::empty()); - } - Some(c) => { - if in_escape { - in_escape = false; - } else if c == '/' && !in_character_class { - break; - } else if c == '[' { - in_character_class = true; - } else if c == '\\' { - in_escape = true; - } else if c == ']' { - in_character_class = false; - } - } - } - } - - let pattern_end = self.offset() - 1; // -1 to exclude `/` - let mut flags = RegExpFlags::empty(); - - while let Some(ch @ ('$' | '_' | 'a'..='z' | 'A'..='Z' | '0'..='9')) = self.peek() { - self.current.chars.next(); - let flag = if let Ok(flag) = RegExpFlags::try_from(ch) { - flag - } else { - self.error(diagnostics::RegExpFlag(ch, self.current_offset())); - continue; - }; - if flags.contains(flag) { - self.error(diagnostics::RegExpFlagTwice(ch, self.current_offset())); - continue; - } - flags |= flag; - } - - (pattern_end, flags) - } - - /// 12.8.6 Template Literal Lexical Components - fn read_template_literal(&mut self, substitute: Kind, tail: Kind) -> Kind { - let mut builder = AutoCow::new(self); - let mut is_valid_escape_sequence = true; - while let Some(c) = self.current.chars.next() { - match c { - '$' if self.peek() == Some('{') => { - self.save_template_string( - is_valid_escape_sequence, - builder.has_escape(), - builder.finish_without_push(self), - ); - self.current.chars.next(); - return substitute; - } - '`' => { - self.save_template_string( - is_valid_escape_sequence, - builder.has_escape(), - builder.finish_without_push(self), - ); - return tail; - } - CR => { - builder.force_allocation_without_current_ascii_char(self); - if self.next_eq(LF) { - builder.push_different(LF); - } - } - '\\' => { - let text = builder.get_mut_string_without_current_ascii_char(self); - self.read_string_escape_sequence(text, true, &mut is_valid_escape_sequence); - } - _ => builder.push_matching(c), - } - } - self.error(diagnostics::UnterminatedString(self.unterminated_range())); - Kind::Undetermined - } 
- - /// `JSXIdentifier` : - /// `IdentifierStart` - /// `JSXIdentifier` `IdentifierPart` - /// `JSXIdentifier` [no `WhiteSpace` or Comment here] - - fn read_jsx_identifier(&mut self, _start_offset: u32) -> Kind { - while let Some(c) = self.peek() { - if c == '-' || is_identifier_start(c) { - self.current.chars.next(); - while let Some(c) = self.peek() { - if is_identifier_part(c) { - self.current.chars.next(); - } else { - break; - } - } - } else { - break; - } - } - Kind::Ident - } - - /// [`JSXChild`](https://facebook.github.io/jsx/#prod-JSXChild) - /// `JSXChild` : - /// `JSXText` - /// `JSXElement` - /// `JSXFragment` - /// { `JSXChildExpressionopt` } - fn read_jsx_child(&mut self) -> Kind { - match self.peek() { - Some('<') => { - self.current.chars.next(); - Kind::LAngle - } - Some('{') => { - self.current.chars.next(); - Kind::LCurly - } - Some(_) => { - loop { - // The tokens `{`, `<`, `>` and `}` cannot appear in a jsx text. - // The TypeScript compiler raises the error "Unexpected token. Did you mean `{'>'}` or `>`?". - // Where as the Babel compiler does not raise any errors. - // The following check omits `>` and `}` so that more Babel tests can be passed. 
- if self.peek().is_some_and(|c| c == '{' || c == '<') { - break; - } - if self.current.chars.next().is_none() { - break; - } - } - Kind::JSXText - } - None => Kind::Eof, - } - } - - /// `JSXDoubleStringCharacters` :: - /// `JSXDoubleStringCharacter` `JSXDoubleStringCharactersopt` - /// `JSXDoubleStringCharacter` :: - /// `JSXStringCharacter` but not " - /// `JSXSingleStringCharacters` :: - /// `JSXSingleStringCharacter` `JSXSingleStringCharactersopt` - /// `JSXSingleStringCharacter` :: - /// `JSXStringCharacter` but not ' - /// `JSXStringCharacter` :: - /// `SourceCharacter` but not one of `HTMLCharacterReference` - fn read_jsx_string_literal(&mut self, delimiter: char) -> Kind { - let mut builder = AutoCow::new(self); - loop { - match self.current.chars.next() { - Some(c @ ('"' | '\'')) => { - if c == delimiter { - self.save_string(builder.has_escape(), builder.finish_without_push(self)); - return Kind::Str; - } - builder.push_matching(c); - } - Some(other) => { - builder.push_matching(other); - } - None => { - self.error(diagnostics::UnterminatedString(self.unterminated_range())); - return Kind::Undetermined; - } - } - } - } - - /* ---------- utils ---------- */ - - /// Identifier `UnicodeEscapeSequence` - /// \u `Hex4Digits` - /// \u{ `CodePoint` } - fn identifier_unicode_escape_sequence( - &mut self, - builder: &mut AutoCow<'a>, - check_identifier_start: bool, - ) { - let start = self.offset(); - if self.current.chars.next() != Some('u') { - let range = Span::new(start, self.offset()); - self.error(diagnostics::UnicodeEscapeSequence(range)); - return; - } - - let value = match self.peek() { - Some('{') => self.unicode_code_point(), - _ => self.surrogate_pair(), - }; - - let Some(value) = value else { - let range = Span::new(start, self.offset()); - self.error(diagnostics::UnicodeEscapeSequence(range)); - return; - }; - - // For Identifiers, surrogate pair is an invalid grammar, e.g. `var \uD800\uDEA7`. - let ch = match value { - SurrogatePair::Astral(..) 
| SurrogatePair::HighLow(..) => { - let range = Span::new(start, self.offset()); - self.error(diagnostics::UnicodeEscapeSequence(range)); - return; - } - SurrogatePair::CodePoint(code_point) => { - if let Ok(ch) = char::try_from(code_point) { - ch - } else { - let range = Span::new(start, self.offset()); - self.error(diagnostics::UnicodeEscapeSequence(range)); - return; - } - } - }; - - let is_valid = - if check_identifier_start { is_identifier_start(ch) } else { is_identifier_part(ch) }; - - if !is_valid { - self.error(diagnostics::InvalidCharacter(ch, self.current_offset())); - return; - } - - builder.push_different(ch); - } - - /// String `UnicodeEscapeSequence` - /// \u `Hex4Digits` - /// \u `Hex4Digits` \u `Hex4Digits` - /// \u{ `CodePoint` } - fn string_unicode_escape_sequence( - &mut self, - text: &mut String<'a>, - is_valid_escape_sequence: &mut bool, - ) { - let value = match self.peek() { - Some('{') => self.unicode_code_point(), - _ => self.surrogate_pair(), - }; - - let Some(value) = value else { - // error raised within the parser by `diagnostics::TemplateLiteral` - *is_valid_escape_sequence = false; - return; - }; - - // For strings and templates, surrogate pairs are valid grammar, e.g. 
`"\uD83D\uDE00" === 😀` - // values are interpreted as is if they fall out of range - match value { - SurrogatePair::CodePoint(code_point) | SurrogatePair::Astral(code_point) => { - if let Ok(ch) = char::try_from(code_point) { - text.push(ch); - } else { - text.push_str("\\u"); - text.push_str(format!("{code_point:x}").as_str()); - } - } - SurrogatePair::HighLow(high, low) => { - text.push_str("\\u"); - text.push_str(format!("{high:x}").as_str()); - text.push_str("\\u"); - text.push_str(format!("{low:x}").as_str()); - } - } - } - - fn unicode_code_point(&mut self) -> Option { - if !self.next_eq('{') { - return None; - } - let value = self.code_point()?; - if !self.next_eq('}') { - return None; - } - Some(SurrogatePair::CodePoint(value)) - } - - fn hex_4_digits(&mut self) -> Option { - let mut value = 0; - for _ in 0..4 { - value = (value << 4) | self.hex_digit()?; - } - Some(value) - } - - fn hex_digit(&mut self) -> Option { - let value = match self.peek() { - Some(c @ '0'..='9') => c as u32 - '0' as u32, - Some(c @ 'a'..='f') => 10 + (c as u32 - 'a' as u32), - Some(c @ 'A'..='F') => 10 + (c as u32 - 'A' as u32), - _ => return None, - }; - self.current.chars.next(); - Some(value) - } - - fn code_point(&mut self) -> Option { - let mut value = self.hex_digit()?; - while let Some(next) = self.hex_digit() { - value = (value << 4) | next; - if value > 0x0010_FFFF { - return None; - } - } - Some(value) - } - - /// Surrogate pairs - /// See background info: - /// * `https://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae` - /// * `https://mathiasbynens.be/notes/javascript-identifiers-es6` - fn surrogate_pair(&mut self) -> Option { - let high = self.hex_4_digits()?; - // The first code unit of a surrogate pair is always in the range from 0xD800 to 0xDBFF, and is called a high surrogate or a lead surrogate. 
- if !((0xD800..=0xDBFF).contains(&high) - && self.peek() == Some('\\') - && self.peek2() == Some('u')) - { - return Some(SurrogatePair::CodePoint(high)); - } - - self.current.chars.next(); - self.current.chars.next(); - - let low = self.hex_4_digits()?; - - // The second code unit of a surrogate pair is always in the range from 0xDC00 to 0xDFFF, and is called a low surrogate or a trail surrogate. - if !(0xDC00..=0xDFFF).contains(&low) { - return Some(SurrogatePair::HighLow(high, low)); - } - - // `https://tc39.es/ecma262/#sec-utf16decodesurrogatepair` - let astral_code_point = (high - 0xD800) * 0x400 + low - 0xDC00 + 0x10000; - - Some(SurrogatePair::Astral(astral_code_point)) - } - - // EscapeSequence :: - fn read_string_escape_sequence( - &mut self, - text: &mut String<'a>, - in_template: bool, - is_valid_escape_sequence: &mut bool, - ) { - match self.current.chars.next() { - None => { - self.error(diagnostics::UnterminatedString(self.unterminated_range())); - } - Some(c) => match c { - // \ LineTerminatorSequence - // LineTerminatorSequence :: - // - // [lookahead ≠ ] - // - // - // - LF | LS | PS => {} - CR => { - self.next_eq(LF); - } - // SingleEscapeCharacter :: one of - // ' " \ b f n r t v - '\'' | '"' | '\\' => text.push(c), - 'b' => text.push('\u{8}'), - 'f' => text.push(FF), - 'n' => text.push(LF), - 'r' => text.push(CR), - 't' => text.push(TAB), - 'v' => text.push(VT), - // HexEscapeSequence - 'x' => { - self.hex_digit() - .and_then(|value1| { - let value2 = self.hex_digit()?; - Some((value1, value2)) - }) - .map(|(value1, value2)| (value1 << 4) | value2) - .and_then(|value| char::try_from(value).ok()) - .map_or_else( - || { - *is_valid_escape_sequence = false; - }, - |c| { - text.push(c); - }, - ); - } - // UnicodeEscapeSequence - 'u' => { - self.string_unicode_escape_sequence(text, is_valid_escape_sequence); - } - // 0 [lookahead ∉ DecimalDigit] - '0' if !self.peek().is_some_and(|c| c.is_ascii_digit()) => text.push('\0'), - // Section 12.9.4 String 
Literals - // LegacyOctalEscapeSequence - // NonOctalDecimalEscapeSequence - a @ '0'..='7' if !in_template => { - let mut num = String::new_in(self.allocator); - num.push(a); - match a { - '4'..='7' => { - if matches!(self.peek(), Some('0'..='7')) { - let b = self.consume_char(); - num.push(b); - } - } - '0'..='3' => { - if matches!(self.peek(), Some('0'..='7')) { - let b = self.consume_char(); - num.push(b); - if matches!(self.peek(), Some('0'..='7')) { - let c = self.consume_char(); - num.push(c); - } - } - } - _ => {} - } - - let value = - char::from_u32(u32::from_str_radix(num.as_str(), 8).unwrap()).unwrap(); - text.push(value); - } - '0' if in_template && self.peek().is_some_and(|c| c.is_ascii_digit()) => { - self.current.chars.next(); - // error raised within the parser by `diagnostics::TemplateLiteral` - *is_valid_escape_sequence = false; - } - // NotEscapeSequence :: DecimalDigit but not 0 - '1'..='9' if in_template => { - // error raised within the parser by `diagnostics::TemplateLiteral` - *is_valid_escape_sequence = false; - } - other => { - // NonOctalDecimalEscapeSequence \8 \9 in strict mode - text.push(other); - } - }, - } - } } - -enum SurrogatePair { - // valid \u Hex4Digits \u Hex4Digits - Astral(u32), - // valid \u Hex4Digits - CodePoint(u32), - // invalid \u Hex4Digits \u Hex4Digits - HighLow(u32, u32), -} - -#[allow(clippy::unnecessary_safety_comment)] -/// Handle next byte of source. -/// SAFETY: -/// * Lexer must not be at end of file. -/// * `byte` must be next byte of source code, corresponding to current position -/// of `lexer.current.chars`. -/// * Only `BYTE_HANDLERS` for ASCII characters may use the `ascii_byte_handler!()` macro. -unsafe fn handle_byte(byte: u8, lexer: &mut Lexer) -> Kind { - BYTE_HANDLERS[byte as usize](lexer) -} - -type ByteHandler = unsafe fn(&mut Lexer<'_>) -> Kind; - -/// Lookup table mapping any incoming byte to a handler function defined below. 
-/// -#[rustfmt::skip] -static BYTE_HANDLERS: [ByteHandler; 256] = [ -// 0 1 2 3 4 5 6 7 8 9 A B C D E F // - ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, SPS, LIN, SPS, SPS, LIN, ERR, ERR, // 0 - ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, // 1 - SPS, EXL, QOT, HAS, IDT, PRC, AMP, QOT, PNO, PNC, ATR, PLS, COM, MIN, PRD, SLH, // 2 - ZER, DIG, DIG, DIG, DIG, DIG, DIG, DIG, DIG, DIG, COL, SEM, LSS, EQL, GTR, QST, // 3 - AT_, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, // 4 - IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, BTO, ESC, BTC, CRT, IDT, // 5 - TPL, L_A, L_B, L_C, L_D, L_E, L_F, L_G, IDT, L_I, IDT, L_K, L_L, L_M, L_N, L_O, // 6 - L_P, IDT, L_R, L_S, L_T, L_U, L_V, L_W, IDT, L_Y, IDT, BEO, PIP, BEC, TLD, ERR, // 7 - UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // 8 - UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // 9 - UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // A - UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // B - UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // C - UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // D - UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // E - UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // F -]; - -#[allow(clippy::unnecessary_safety_comment)] -/// Macro for defining byte handler for an ASCII character. -/// -/// In addition to defining a `const` for the handler, it also asserts that lexer -/// is not at end of file, and that next char is ASCII. -/// Where the handler is for an ASCII character, these assertions are self-evidently true. 
-/// -/// These assertions produce no runtime code, but hint to the compiler that it can assume that -/// next char is ASCII, and it uses that information to optimize the rest of the handler. -/// e.g. `lexer.current.chars.next()` becomes just a single assembler instruction. -/// Without the assertions, the compiler is unable to deduce the next char is ASCII, due to -/// the indirection of the `BYTE_HANDLERS` jump table. -/// -/// These assertions are unchecked (i.e. won't panic) and will cause UB if they're incorrect. -/// -/// SAFETY: Only use this macro to define byte handlers for ASCII characters. -/// -/// ``` -/// ascii_byte_handler!(SPS(lexer) { -/// lexer.consume_char(); -/// Kind::WhiteSpace -/// }); -/// ``` -/// -/// expands to: -/// -/// ``` -/// const SPS: ByteHandler = |lexer| { -/// unsafe { -/// use ::assert_unchecked::assert_unchecked; -/// let s = lexer.current.chars.as_str(); -/// assert_unchecked!(!s.is_empty()); -/// assert_unchecked!(s.as_bytes()[0] < 128); -/// } -/// lexer.consume_char(); -/// Kind::WhiteSpace -/// }; -/// ``` -macro_rules! ascii_byte_handler { - ($id:ident($lex:ident) $body:expr) => { - const $id: ByteHandler = |$lex| { - // SAFETY: This macro is only used for ASCII characters - unsafe { - use assert_unchecked::assert_unchecked; - let s = $lex.current.chars.as_str(); - assert_unchecked!(!s.is_empty()); - assert_unchecked!(s.as_bytes()[0] < 128); - } - $body - }; - }; -} - -// `\0` `\1` etc -ascii_byte_handler!(ERR(lexer) { - let c = lexer.consume_char(); - lexer.error(diagnostics::InvalidCharacter(c, lexer.unterminated_range())); - Kind::Undetermined -}); - -// -ascii_byte_handler!(SPS(lexer) { - lexer.consume_char(); - Kind::Skip -}); - -// '\r' '\n' -ascii_byte_handler!(LIN(lexer) { - lexer.consume_char(); - lexer.current.token.is_on_new_line = true; - Kind::Skip -}); - -// ! 
-ascii_byte_handler!(EXL(lexer) { - lexer.consume_char(); - if lexer.next_eq('=') { - if lexer.next_eq('=') { - Kind::Neq2 - } else { - Kind::Neq - } - } else { - Kind::Bang - } -}); - -// ' " -ascii_byte_handler!(QOT(lexer) { - let c = lexer.consume_char(); - if lexer.context == LexerContext::JsxAttributeValue { - lexer.read_jsx_string_literal(c) - } else { - lexer.read_string_literal(c) - } -}); - -// # -ascii_byte_handler!(HAS(lexer) { - lexer.consume_char(); - // HashbangComment :: - // `#!` SingleLineCommentChars? - if lexer.current.token.start == 0 && lexer.next_eq('!') { - lexer.read_hashbang_comment() - } else { - lexer.private_identifier() - } -}); - -// `A..=Z`, `a..=z` (except special cases below), `_`, `$` -ascii_byte_handler!(IDT(lexer) { - lexer.identifier_name_handler(); - Kind::Ident -}); - -// % -ascii_byte_handler!(PRC(lexer) { - lexer.consume_char(); - if lexer.next_eq('=') { - Kind::PercentEq - } else { - Kind::Percent - } -}); - -// & -ascii_byte_handler!(AMP(lexer) { - lexer.consume_char(); - if lexer.next_eq('&') { - if lexer.next_eq('=') { - Kind::Amp2Eq - } else { - Kind::Amp2 - } - } else if lexer.next_eq('=') { - Kind::AmpEq - } else { - Kind::Amp - } -}); - -// ( -ascii_byte_handler!(PNO(lexer) { - lexer.consume_char(); - Kind::LParen -}); - -// ) -ascii_byte_handler!(PNC(lexer) { - lexer.consume_char(); - Kind::RParen -}); - -// * -ascii_byte_handler!(ATR(lexer) { - lexer.consume_char(); - if lexer.next_eq('*') { - if lexer.next_eq('=') { - Kind::Star2Eq - } else { - Kind::Star2 - } - } else if lexer.next_eq('=') { - Kind::StarEq - } else { - Kind::Star - } -}); - -// + -ascii_byte_handler!(PLS(lexer) { - lexer.consume_char(); - if lexer.next_eq('+') { - Kind::Plus2 - } else if lexer.next_eq('=') { - Kind::PlusEq - } else { - Kind::Plus - } -}); - -// , -ascii_byte_handler!(COM(lexer) { - lexer.consume_char(); - Kind::Comma -}); - -// - -ascii_byte_handler!(MIN(lexer) { - lexer.consume_char(); - lexer.read_minus().unwrap_or_else(|| 
lexer.skip_single_line_comment()) -}); - -// . -ascii_byte_handler!(PRD(lexer) { - lexer.consume_char(); - lexer.read_dot() -}); - -// / -ascii_byte_handler!(SLH(lexer) { - lexer.consume_char(); - match lexer.peek() { - Some('/') => { - lexer.current.chars.next(); - lexer.skip_single_line_comment() - } - Some('*') => { - lexer.current.chars.next(); - lexer.skip_multi_line_comment() - } - _ => { - // regex is handled separately, see `next_regex` - if lexer.next_eq('=') { - Kind::SlashEq - } else { - Kind::Slash - } - } - } -}); - -// 0 -ascii_byte_handler!(ZER(lexer) { - lexer.consume_char(); - lexer.read_zero() -}); - -// 1 to 9 -ascii_byte_handler!(DIG(lexer) { - lexer.consume_char(); - lexer.decimal_literal_after_first_digit() -}); - -// : -ascii_byte_handler!(COL(lexer) { - lexer.consume_char(); - Kind::Colon -}); - -// ; -ascii_byte_handler!(SEM(lexer) { - lexer.consume_char(); - Kind::Semicolon -}); - -// < -ascii_byte_handler!(LSS(lexer) { - lexer.consume_char(); - lexer.read_left_angle().unwrap_or_else(|| lexer.skip_single_line_comment()) -}); - -// = -ascii_byte_handler!(EQL(lexer) { - lexer.consume_char(); - if lexer.next_eq('=') { - if lexer.next_eq('=') { - Kind::Eq3 - } else { - Kind::Eq2 - } - } else if lexer.next_eq('>') { - Kind::Arrow - } else { - Kind::Eq - } -}); - -// > -ascii_byte_handler!(GTR(lexer) { - lexer.consume_char(); - // `>=` is re-lexed with [Lexer::next_jsx_child] - Kind::RAngle -}); - -// ? 
-ascii_byte_handler!(QST(lexer) { - lexer.consume_char(); - if lexer.next_eq('?') { - if lexer.next_eq('=') { - Kind::Question2Eq - } else { - Kind::Question2 - } - } else if lexer.peek() == Some('.') { - // parse `?.1` as `?` `.1` - if lexer.peek2().is_some_and(|c| c.is_ascii_digit()) { - Kind::Question - } else { - lexer.current.chars.next(); - Kind::QuestionDot - } - } else { - Kind::Question - } -}); - -// @ -ascii_byte_handler!(AT_(lexer) { - lexer.consume_char(); - Kind::At -}); - -// [ -ascii_byte_handler!(BTO(lexer) { - lexer.consume_char(); - Kind::LBrack -}); - -// \ -ascii_byte_handler!(ESC(lexer) { - let mut builder = AutoCow::new(lexer); - lexer.consume_char(); - builder.force_allocation_without_current_ascii_char(lexer); - lexer.identifier_unicode_escape_sequence(&mut builder, true); - let text = lexer.identifier_name(builder); - Kind::match_keyword(text) -}); - -// ] -ascii_byte_handler!(BTC(lexer) { - lexer.consume_char(); - Kind::RBrack -}); - -// ^ -ascii_byte_handler!(CRT(lexer) { - lexer.consume_char(); - if lexer.next_eq('=') { - Kind::CaretEq - } else { - Kind::Caret - } -}); - -// ` -ascii_byte_handler!(TPL(lexer) { - lexer.consume_char(); - lexer.read_template_literal(Kind::TemplateHead, Kind::NoSubstitutionTemplate) -}); - -// { -ascii_byte_handler!(BEO(lexer) { - lexer.consume_char(); - Kind::LCurly -}); - -// | -ascii_byte_handler!(PIP(lexer) { - lexer.consume_char(); - if lexer.next_eq('|') { - if lexer.next_eq('=') { - Kind::Pipe2Eq - } else { - Kind::Pipe2 - } - } else if lexer.next_eq('=') { - Kind::PipeEq - } else { - Kind::Pipe - } -}); - -// } -ascii_byte_handler!(BEC(lexer) { - lexer.consume_char(); - Kind::RCurly -}); - -// ~ -ascii_byte_handler!(TLD(lexer) { - lexer.consume_char(); - Kind::Tilde -}); - -ascii_byte_handler!(L_A(lexer) match &lexer.identifier_name_handler()[1..] 
{ - "wait" => Kind::Await, - "sync" => Kind::Async, - "bstract" => Kind::Abstract, - "ccessor" => Kind::Accessor, - "ny" => Kind::Any, - "s" => Kind::As, - "ssert" => Kind::Assert, - "sserts" => Kind::Asserts, - _ => Kind::Ident, -}); - -ascii_byte_handler!(L_B(lexer) match &lexer.identifier_name_handler()[1..] { - "reak" => Kind::Break, - "oolean" => Kind::Boolean, - "igint" => Kind::BigInt, - _ => Kind::Ident, -}); - -ascii_byte_handler!(L_C(lexer) match &lexer.identifier_name_handler()[1..] { - "onst" => Kind::Const, - "lass" => Kind::Class, - "ontinue" => Kind::Continue, - "atch" => Kind::Catch, - "ase" => Kind::Case, - "onstructor" => Kind::Constructor, - _ => Kind::Ident, -}); - -ascii_byte_handler!(L_D(lexer) match &lexer.identifier_name_handler()[1..] { - "o" => Kind::Do, - "elete" => Kind::Delete, - "eclare" => Kind::Declare, - "efault" => Kind::Default, - "ebugger" => Kind::Debugger, - _ => Kind::Ident, -}); - -ascii_byte_handler!(L_E(lexer) match &lexer.identifier_name_handler()[1..] { - "lse" => Kind::Else, - "num" => Kind::Enum, - "xport" => Kind::Export, - "xtends" => Kind::Extends, - _ => Kind::Ident, -}); - -ascii_byte_handler!(L_F(lexer) match &lexer.identifier_name_handler()[1..] { - "unction" => Kind::Function, - "alse" => Kind::False, - "or" => Kind::For, - "inally" => Kind::Finally, - "rom" => Kind::From, - _ => Kind::Ident, -}); - -ascii_byte_handler!(L_G(lexer) match &lexer.identifier_name_handler()[1..] { - "et" => Kind::Get, - "lobal" => Kind::Global, - _ => Kind::Ident, -}); - -ascii_byte_handler!(L_I(lexer) match &lexer.identifier_name_handler()[1..] { - "f" => Kind::If, - "nstanceof" => Kind::Instanceof, - "n" => Kind::In, - "mplements" => Kind::Implements, - "mport" => Kind::Import, - "nfer" => Kind::Infer, - "nterface" => Kind::Interface, - "ntrinsic" => Kind::Intrinsic, - "s" => Kind::Is, - _ => Kind::Ident, -}); - -ascii_byte_handler!(L_K(lexer) match &lexer.identifier_name_handler()[1..] 
{ - "eyof" => Kind::KeyOf, - _ => Kind::Ident, -}); - -ascii_byte_handler!(L_L(lexer) match &lexer.identifier_name_handler()[1..] { - "et" => Kind::Let, - _ => Kind::Ident, -}); - -ascii_byte_handler!(L_M(lexer) match &lexer.identifier_name_handler()[1..] { - "eta" => Kind::Meta, - "odule" => Kind::Module, - _ => Kind::Ident, -}); - -ascii_byte_handler!(L_N(lexer) match &lexer.identifier_name_handler()[1..] { - "ull" => Kind::Null, - "ew" => Kind::New, - "umber" => Kind::Number, - "amespace" => Kind::Namespace, - "ever" => Kind::Never, - _ => Kind::Ident, -}); - -ascii_byte_handler!(L_O(lexer) match &lexer.identifier_name_handler()[1..] { - "f" => Kind::Of, - "bject" => Kind::Object, - "ut" => Kind::Out, - "verride" => Kind::Override, - _ => Kind::Ident, -}); - -ascii_byte_handler!(L_P(lexer) match &lexer.identifier_name_handler()[1..] { - "ackage" => Kind::Package, - "rivate" => Kind::Private, - "rotected" => Kind::Protected, - "ublic" => Kind::Public, - _ => Kind::Ident, -}); - -ascii_byte_handler!(L_R(lexer) match &lexer.identifier_name_handler()[1..] { - "eturn" => Kind::Return, - "equire" => Kind::Require, - "eadonly" => Kind::Readonly, - _ => Kind::Ident, -}); - -ascii_byte_handler!(L_S(lexer) match &lexer.identifier_name_handler()[1..] { - "et" => Kind::Set, - "uper" => Kind::Super, - "witch" => Kind::Switch, - "tatic" => Kind::Static, - "ymbol" => Kind::Symbol, - "tring" => Kind::String, - "atisfies" => Kind::Satisfies, - _ => Kind::Ident, -}); - -ascii_byte_handler!(L_T(lexer) match &lexer.identifier_name_handler()[1..] { - "his" => Kind::This, - "rue" => Kind::True, - "hrow" => Kind::Throw, - "ry" => Kind::Try, - "ypeof" => Kind::Typeof, - "arget" => Kind::Target, - "ype" => Kind::Type, - _ => Kind::Ident, -}); - -ascii_byte_handler!(L_U(lexer) match &lexer.identifier_name_handler()[1..] 
{ - "ndefined" => Kind::Undefined, - "sing" => Kind::Using, - "nique" => Kind::Unique, - "nknown" => Kind::Unknown, - _ => Kind::Ident, -}); - -ascii_byte_handler!(L_V(lexer) match &lexer.identifier_name_handler()[1..] { - "ar" => Kind::Var, - "oid" => Kind::Void, - _ => Kind::Ident, -}); - -ascii_byte_handler!(L_W(lexer) match &lexer.identifier_name_handler()[1..] { - "hile" => Kind::While, - "ith" => Kind::With, - _ => Kind::Ident, -}); - -ascii_byte_handler!(L_Y(lexer) match &lexer.identifier_name_handler()[1..] { - "ield" => Kind::Yield, - _ => Kind::Ident, -}); - -// Non-ASCII characters. -// NB: Must not use `ascii_byte_handler!()` macro, as this handler is for non-ASCII chars. -#[allow(clippy::redundant_closure_for_method_calls)] -const UNI: ByteHandler = |lexer| lexer.unicode_char_handler(); diff --git a/crates/oxc_parser/src/lexer/numeric.rs b/crates/oxc_parser/src/lexer/numeric.rs new file mode 100644 index 000000000..8dcc27d05 --- /dev/null +++ b/crates/oxc_parser/src/lexer/numeric.rs @@ -0,0 +1,199 @@ +use super::{Kind, Lexer, Span}; +use crate::diagnostics; + +use oxc_syntax::identifier::is_identifier_start; + +impl<'a> Lexer<'a> { + /// 12.9.3 Numeric Literals with `0` prefix + pub(super) fn read_zero(&mut self) -> Kind { + match self.peek() { + Some('b' | 'B') => self.read_non_decimal(Kind::Binary), + Some('o' | 'O') => self.read_non_decimal(Kind::Octal), + Some('x' | 'X') => self.read_non_decimal(Kind::Hex), + Some('e' | 'E') => { + self.current.chars.next(); + self.read_decimal_exponent() + } + Some('.') => { + self.current.chars.next(); + self.decimal_literal_after_decimal_point_after_digits() + } + Some('n') => { + self.current.chars.next(); + self.check_after_numeric_literal(Kind::Decimal) + } + Some(n) if n.is_ascii_digit() => self.read_legacy_octal(), + _ => self.check_after_numeric_literal(Kind::Decimal), + } + } + + pub(super) fn decimal_literal_after_first_digit(&mut self) -> Kind { + self.read_decimal_digits_after_first_digit(); + if 
self.next_eq('.') { + return self.decimal_literal_after_decimal_point_after_digits(); + } else if self.next_eq('n') { + return self.check_after_numeric_literal(Kind::Decimal); + } + + let kind = self.optional_exponent().map_or(Kind::Decimal, |kind| kind); + self.check_after_numeric_literal(kind) + } + + fn read_non_decimal(&mut self, kind: Kind) -> Kind { + self.current.chars.next(); + + if self.peek().is_some_and(|c| kind.matches_number_char(c)) { + self.current.chars.next(); + } else { + self.unexpected_err(); + return Kind::Undetermined; + } + + while let Some(c) = self.peek() { + match c { + '_' => { + self.current.chars.next(); + if self.peek().is_some_and(|c| kind.matches_number_char(c)) { + self.current.chars.next(); + } else { + self.unexpected_err(); + return Kind::Undetermined; + } + } + c if kind.matches_number_char(c) => { + self.current.chars.next(); + } + _ => break, + } + } + if self.peek() == Some('n') { + self.current.chars.next(); + } + self.check_after_numeric_literal(kind) + } + + fn read_legacy_octal(&mut self) -> Kind { + let mut kind = Kind::Octal; + loop { + match self.peek() { + Some('0'..='7') => { + self.current.chars.next(); + } + Some('8'..='9') => { + self.current.chars.next(); + kind = Kind::Decimal; + } + _ => break, + } + } + + match self.peek() { + // allow 08.5 and 09.5 + Some('.') if kind == Kind::Decimal => { + self.current.chars.next(); + self.decimal_literal_after_decimal_point_after_digits() + } + // allow 08e1 and 09e1 + Some('e') if kind == Kind::Decimal => { + self.current.chars.next(); + self.read_decimal_exponent() + } + _ => self.check_after_numeric_literal(kind), + } + } + + fn read_decimal_exponent(&mut self) -> Kind { + let kind = match self.peek() { + Some('-') => { + self.current.chars.next(); + Kind::NegativeExponential + } + Some('+') => { + self.current.chars.next(); + Kind::PositiveExponential + } + _ => Kind::PositiveExponential, + }; + self.read_decimal_digits(); + kind + } + + fn read_decimal_digits(&mut 
self) { + if self.peek().is_some_and(|c| c.is_ascii_digit()) { + self.current.chars.next(); + } else { + self.unexpected_err(); + return; + } + + self.read_decimal_digits_after_first_digit(); + } + + fn read_decimal_digits_after_first_digit(&mut self) { + while let Some(c) = self.peek() { + match c { + '_' => { + self.current.chars.next(); + if self.peek().is_some_and(|c| c.is_ascii_digit()) { + self.current.chars.next(); + } else { + self.unexpected_err(); + return; + } + } + '0'..='9' => { + self.current.chars.next(); + } + _ => break, + } + } + } + + pub(super) fn decimal_literal_after_decimal_point(&mut self) -> Kind { + self.read_decimal_digits(); + self.optional_exponent(); + self.check_after_numeric_literal(Kind::Float) + } + + fn decimal_literal_after_decimal_point_after_digits(&mut self) -> Kind { + self.optional_decimal_digits(); + self.optional_exponent(); + self.check_after_numeric_literal(Kind::Float) + } + + fn optional_decimal_digits(&mut self) { + if self.peek().is_some_and(|c| c.is_ascii_digit()) { + self.current.chars.next(); + } else { + return; + } + self.read_decimal_digits_after_first_digit(); + } + + fn optional_exponent(&mut self) -> Option { + if matches!(self.peek(), Some('e' | 'E')) { + self.current.chars.next(); + return Some(self.read_decimal_exponent()); + } + None + } + + fn check_after_numeric_literal(&mut self, kind: Kind) -> Kind { + let offset = self.offset(); + // The SourceCharacter immediately following a NumericLiteral must not be an IdentifierStart or DecimalDigit. 
+ let c = self.peek(); + if c.is_none() || c.is_some_and(|ch| !ch.is_ascii_digit() && !is_identifier_start(ch)) { + return kind; + } + self.current.chars.next(); + while let Some(c) = self.peek() { + if is_identifier_start(c) { + self.current.chars.next(); + } else { + break; + } + } + self.error(diagnostics::InvalidNumberEnd(Span::new(offset, self.offset()))); + Kind::Undetermined + } +} diff --git a/crates/oxc_parser/src/lexer/punctuation.rs b/crates/oxc_parser/src/lexer/punctuation.rs new file mode 100644 index 000000000..e119a45b5 --- /dev/null +++ b/crates/oxc_parser/src/lexer/punctuation.rs @@ -0,0 +1,83 @@ +use super::{Kind, Lexer, Token}; + +impl<'a> Lexer<'a> { + /// Section 12.8 Punctuators + pub(super) fn read_dot(&mut self) -> Kind { + if self.peek() == Some('.') && self.peek2() == Some('.') { + self.current.chars.next(); + self.current.chars.next(); + return Kind::Dot3; + } + if self.peek().is_some_and(|c| c.is_ascii_digit()) { + self.decimal_literal_after_decimal_point() + } else { + Kind::Dot + } + } + + /// returns None for `SingleLineHTMLOpenComment` `` in script mode + pub(super) fn read_minus(&mut self) -> Option { + if self.next_eq('-') { + // SingleLineHTMLCloseComment `-->` in script mode + if self.current.token.is_on_new_line + && self.source_type.is_script() + && self.next_eq('>') + { + None + } else { + Some(Kind::Minus2) + } + } else if self.next_eq('=') { + Some(Kind::MinusEq) + } else { + Some(Kind::Minus) + } + } + + pub(crate) fn next_right_angle(&mut self) -> Token { + let kind = self.read_right_angle(); + self.lookahead.clear(); + self.finish_next(kind) + } + + fn read_right_angle(&mut self) -> Kind { + if self.next_eq('>') { + if self.next_eq('>') { + if self.next_eq('=') { + Kind::ShiftRight3Eq + } else { + Kind::ShiftRight3 + } + } else if self.next_eq('=') { + Kind::ShiftRightEq + } else { + Kind::ShiftRight + } + } else if self.next_eq('=') { + Kind::GtEq + } else { + Kind::RAngle + } + } +} diff --git 
a/crates/oxc_parser/src/lexer/regex.rs b/crates/oxc_parser/src/lexer/regex.rs new file mode 100644 index 000000000..084e1175d --- /dev/null +++ b/crates/oxc_parser/src/lexer/regex.rs @@ -0,0 +1,78 @@ +use super::{Kind, Lexer, RegExpFlags, Token}; +use crate::diagnostics; + +use oxc_syntax::identifier::is_line_terminator; + +impl<'a> Lexer<'a> { + /// Re-tokenize the current `/` or `/=` and return `RegExp` + /// See Section 12: + /// The `InputElementRegExp` goal symbol is used in all syntactic grammar contexts + /// where a `RegularExpressionLiteral` is permitted + /// Which means the parser needs to re-tokenize on `PrimaryExpression`, + /// `RegularExpressionLiteral` only appear on the right hand side of `PrimaryExpression` + pub(crate) fn next_regex(&mut self, kind: Kind) -> (Token, u32, RegExpFlags) { + self.current.token.start = self.offset() + - match kind { + Kind::Slash => 1, + Kind::SlashEq => 2, + _ => unreachable!(), + }; + let (pattern_end, flags) = self.read_regex(); + self.lookahead.clear(); + let token = self.finish_next(Kind::RegExp); + (token, pattern_end, flags) + } + + /// 12.9.5 Regular Expression Literals + fn read_regex(&mut self) -> (u32, RegExpFlags) { + let mut in_escape = false; + let mut in_character_class = false; + loop { + match self.current.chars.next() { + None => { + self.error(diagnostics::UnterminatedRegExp(self.unterminated_range())); + return (self.offset(), RegExpFlags::empty()); + } + Some(c) if is_line_terminator(c) => { + self.error(diagnostics::UnterminatedRegExp(self.unterminated_range())); + #[allow(clippy::cast_possible_truncation)] + let pattern_end = self.offset() - c.len_utf8() as u32; + return (pattern_end, RegExpFlags::empty()); + } + Some(c) => { + if in_escape { + in_escape = false; + } else if c == '/' && !in_character_class { + break; + } else if c == '[' { + in_character_class = true; + } else if c == '\\' { + in_escape = true; + } else if c == ']' { + in_character_class = false; + } + } + } + } + + let 
pattern_end = self.offset() - 1; // -1 to exclude `/` + let mut flags = RegExpFlags::empty(); + + while let Some(ch @ ('$' | '_' | 'a'..='z' | 'A'..='Z' | '0'..='9')) = self.peek() { + self.current.chars.next(); + let flag = if let Ok(flag) = RegExpFlags::try_from(ch) { + flag + } else { + self.error(diagnostics::RegExpFlag(ch, self.current_offset())); + continue; + }; + if flags.contains(flag) { + self.error(diagnostics::RegExpFlagTwice(ch, self.current_offset())); + continue; + } + flags |= flag; + } + + (pattern_end, flags) + } +} diff --git a/crates/oxc_parser/src/lexer/string.rs b/crates/oxc_parser/src/lexer/string.rs new file mode 100644 index 000000000..f2f0c14b0 --- /dev/null +++ b/crates/oxc_parser/src/lexer/string.rs @@ -0,0 +1,65 @@ +use super::{AutoCow, Kind, Lexer, Span, Token}; +use crate::diagnostics; + +impl<'a> Lexer<'a> { + /// 12.9.4 String Literals + pub(super) fn read_string_literal(&mut self, delimiter: char) -> Kind { + let mut builder = AutoCow::new(self); + loop { + match self.current.chars.next() { + None | Some('\r' | '\n') => { + self.error(diagnostics::UnterminatedString(self.unterminated_range())); + return Kind::Undetermined; + } + Some(c @ ('"' | '\'')) => { + if c == delimiter { + self.save_string(builder.has_escape(), builder.finish_without_push(self)); + return Kind::Str; + } + builder.push_matching(c); + } + Some('\\') => { + let start = self.offset() - 1; + let text = builder.get_mut_string_without_current_ascii_char(self); + let mut is_valid_escape_sequence = true; + self.read_string_escape_sequence(text, false, &mut is_valid_escape_sequence); + if !is_valid_escape_sequence { + let range = Span::new(start, self.offset()); + self.error(diagnostics::InvalidEscapeSequence(range)); + } + } + Some(c) => { + builder.push_matching(c); + } + } + } + } + + /// Save the string if it is escaped + /// This reduces the overall memory consumption while keeping the `Token` size small + /// Strings without escaped values can be retrieved as is 
from the token span + pub(super) fn save_string(&mut self, has_escape: bool, s: &'a str) { + if !has_escape { + return; + } + self.escaped_strings.insert(self.current.token.start, s); + self.current.token.escaped = true; + } + + pub(crate) fn get_string(&self, token: Token) -> &'a str { + if token.escaped { + return self.escaped_strings[&token.start]; + } + + let raw = &self.source[token.start as usize..token.end as usize]; + match token.kind { + Kind::Str => { + &raw[1..raw.len() - 1] // omit surrounding quotes + } + Kind::PrivateIdentifier => { + &raw[1..] // omit leading `#` + } + _ => raw, + } + } +} diff --git a/crates/oxc_parser/src/lexer/template.rs b/crates/oxc_parser/src/lexer/template.rs new file mode 100644 index 000000000..661bfda4f --- /dev/null +++ b/crates/oxc_parser/src/lexer/template.rs @@ -0,0 +1,86 @@ +use super::{AutoCow, Kind, Lexer, Token}; +use crate::diagnostics; + +use oxc_syntax::identifier::{CR, LF}; + +impl<'a> Lexer<'a> { + /// 12.8.6 Template Literal Lexical Components + pub(super) fn read_template_literal(&mut self, substitute: Kind, tail: Kind) -> Kind { + let mut builder = AutoCow::new(self); + let mut is_valid_escape_sequence = true; + while let Some(c) = self.current.chars.next() { + match c { + '$' if self.peek() == Some('{') => { + self.save_template_string( + is_valid_escape_sequence, + builder.has_escape(), + builder.finish_without_push(self), + ); + self.current.chars.next(); + return substitute; + } + '`' => { + self.save_template_string( + is_valid_escape_sequence, + builder.has_escape(), + builder.finish_without_push(self), + ); + return tail; + } + CR => { + builder.force_allocation_without_current_ascii_char(self); + if self.next_eq(LF) { + builder.push_different(LF); + } + } + '\\' => { + let text = builder.get_mut_string_without_current_ascii_char(self); + self.read_string_escape_sequence(text, true, &mut is_valid_escape_sequence); + } + _ => builder.push_matching(c), + } + } + 
self.error(diagnostics::UnterminatedString(self.unterminated_range())); + Kind::Undetermined + } + + /// Re-tokenize the current `}` token for `TemplateSubstitutionTail` + /// See Section 12, the parser needs to re-tokenize on `TemplateSubstitutionTail`, + pub(crate) fn next_template_substitution_tail(&mut self) -> Token { + self.current.token.start = self.offset() - 1; + let kind = self.read_template_literal(Kind::TemplateMiddle, Kind::TemplateTail); + self.lookahead.clear(); + self.finish_next(kind) + } + + /// Save the template if it is escaped + fn save_template_string( + &mut self, + is_valid_escape_sequence: bool, + has_escape: bool, + s: &'a str, + ) { + if !has_escape { + return; + } + self.escaped_templates + .insert(self.current.token.start, is_valid_escape_sequence.then(|| s)); + self.current.token.escaped = true; + } + + pub(crate) fn get_template_string(&self, token: Token) -> Option<&'a str> { + if token.escaped { + return self.escaped_templates[&token.start]; + } + let raw = &self.source[token.start as usize..token.end as usize]; + Some(match token.kind { + Kind::NoSubstitutionTemplate | Kind::TemplateTail => { + &raw[1..raw.len() - 1] // omit surrounding quotes or leading "}" and trailing "`" + } + Kind::TemplateHead | Kind::TemplateMiddle => { + &raw[1..raw.len() - 2] // omit leading "`" or "}" and trailing "${" + } + _ => raw, + }) + } +} diff --git a/crates/oxc_parser/src/lexer/typescript.rs b/crates/oxc_parser/src/lexer/typescript.rs new file mode 100644 index 000000000..e2c781969 --- /dev/null +++ b/crates/oxc_parser/src/lexer/typescript.rs @@ -0,0 +1,17 @@ +use super::{Kind, Lexer, Token}; + +impl<'a> Lexer<'a> { + /// Re-tokenize '<<' or '<=' or '<<=' to '<' + pub(crate) fn re_lex_as_typescript_l_angle(&mut self, kind: Kind) -> Token { + let offset = match kind { + Kind::ShiftLeft | Kind::LtEq => 2, + Kind::ShiftLeftEq => 3, + _ => unreachable!(), + }; + self.current.token.start = self.offset() - offset; + self.current.chars = 
self.source[self.current.token.start as usize + 1..].chars(); + let kind = Kind::LAngle; + self.lookahead.clear(); + self.finish_next(kind) + } +} diff --git a/crates/oxc_parser/src/lexer/unicode.rs b/crates/oxc_parser/src/lexer/unicode.rs new file mode 100644 index 000000000..fe8f08f49 --- /dev/null +++ b/crates/oxc_parser/src/lexer/unicode.rs @@ -0,0 +1,318 @@ +use super::{AutoCow, Kind, Lexer, Span}; +use crate::diagnostics; + +use oxc_allocator::String; +use oxc_syntax::identifier::{ + is_identifier_part, is_identifier_start, is_identifier_start_unicode, + is_irregular_line_terminator, is_irregular_whitespace, CR, FF, LF, LS, PS, TAB, VT, +}; + +enum SurrogatePair { + // valid \u Hex4Digits \u Hex4Digits + Astral(u32), + // valid \u Hex4Digits + CodePoint(u32), + // invalid \u Hex4Digits \u Hex4Digits + HighLow(u32, u32), +} + +impl<'a> Lexer<'a> { + pub(super) fn unicode_char_handler(&mut self) -> Kind { + let c = self.current.chars.clone().next().unwrap(); + match c { + c if is_identifier_start_unicode(c) => { + let mut builder = AutoCow::new(self); + let c = self.consume_char(); + builder.push_matching(c); + self.identifier_name(builder); + Kind::Ident + } + c if is_irregular_whitespace(c) => { + self.trivia_builder + .add_irregular_whitespace(self.current.token.start, self.offset()); + self.consume_char(); + Kind::Skip + } + c if is_irregular_line_terminator(c) => { + self.consume_char(); + self.current.token.is_on_new_line = true; + Kind::Skip + } + _ => { + self.consume_char(); + self.error(diagnostics::InvalidCharacter(c, self.unterminated_range())); + Kind::Undetermined + } + } + } + + /// Identifier `UnicodeEscapeSequence` + /// \u `Hex4Digits` + /// \u{ `CodePoint` } + pub(super) fn identifier_unicode_escape_sequence( + &mut self, + builder: &mut AutoCow<'a>, + check_identifier_start: bool, + ) { + let start = self.offset(); + if self.current.chars.next() != Some('u') { + let range = Span::new(start, self.offset()); + 
self.error(diagnostics::UnicodeEscapeSequence(range)); + return; + } + + let value = match self.peek() { + Some('{') => self.unicode_code_point(), + _ => self.surrogate_pair(), + }; + + let Some(value) = value else { + let range = Span::new(start, self.offset()); + self.error(diagnostics::UnicodeEscapeSequence(range)); + return; + }; + + // For Identifiers, surrogate pair is an invalid grammar, e.g. `var \uD800\uDEA7`. + let ch = match value { + SurrogatePair::Astral(..) | SurrogatePair::HighLow(..) => { + let range = Span::new(start, self.offset()); + self.error(diagnostics::UnicodeEscapeSequence(range)); + return; + } + SurrogatePair::CodePoint(code_point) => { + if let Ok(ch) = char::try_from(code_point) { + ch + } else { + let range = Span::new(start, self.offset()); + self.error(diagnostics::UnicodeEscapeSequence(range)); + return; + } + } + }; + + let is_valid = + if check_identifier_start { is_identifier_start(ch) } else { is_identifier_part(ch) }; + + if !is_valid { + self.error(diagnostics::InvalidCharacter(ch, self.current_offset())); + return; + } + + builder.push_different(ch); + } + + /// String `UnicodeEscapeSequence` + /// \u `Hex4Digits` + /// \u `Hex4Digits` \u `Hex4Digits` + /// \u{ `CodePoint` } + fn string_unicode_escape_sequence( + &mut self, + text: &mut String<'a>, + is_valid_escape_sequence: &mut bool, + ) { + let value = match self.peek() { + Some('{') => self.unicode_code_point(), + _ => self.surrogate_pair(), + }; + + let Some(value) = value else { + // error raised within the parser by `diagnostics::TemplateLiteral` + *is_valid_escape_sequence = false; + return; + }; + + // For strings and templates, surrogate pairs are valid grammar, e.g. 
`"\uD83D\uDE00" === 😀` + // values are interpreted as is if they fall out of range + match value { + SurrogatePair::CodePoint(code_point) | SurrogatePair::Astral(code_point) => { + if let Ok(ch) = char::try_from(code_point) { + text.push(ch); + } else { + text.push_str("\\u"); + text.push_str(format!("{code_point:x}").as_str()); + } + } + SurrogatePair::HighLow(high, low) => { + text.push_str("\\u"); + text.push_str(format!("{high:x}").as_str()); + text.push_str("\\u"); + text.push_str(format!("{low:x}").as_str()); + } + } + } + + fn unicode_code_point(&mut self) -> Option { + if !self.next_eq('{') { + return None; + } + let value = self.code_point()?; + if !self.next_eq('}') { + return None; + } + Some(SurrogatePair::CodePoint(value)) + } + + fn hex_4_digits(&mut self) -> Option { + let mut value = 0; + for _ in 0..4 { + value = (value << 4) | self.hex_digit()?; + } + Some(value) + } + + fn hex_digit(&mut self) -> Option { + let value = match self.peek() { + Some(c @ '0'..='9') => c as u32 - '0' as u32, + Some(c @ 'a'..='f') => 10 + (c as u32 - 'a' as u32), + Some(c @ 'A'..='F') => 10 + (c as u32 - 'A' as u32), + _ => return None, + }; + self.current.chars.next(); + Some(value) + } + + fn code_point(&mut self) -> Option { + let mut value = self.hex_digit()?; + while let Some(next) = self.hex_digit() { + value = (value << 4) | next; + if value > 0x0010_FFFF { + return None; + } + } + Some(value) + } + + /// Surrogate pairs + /// See background info: + /// * `https://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae` + /// * `https://mathiasbynens.be/notes/javascript-identifiers-es6` + fn surrogate_pair(&mut self) -> Option { + let high = self.hex_4_digits()?; + // The first code unit of a surrogate pair is always in the range from 0xD800 to 0xDBFF, and is called a high surrogate or a lead surrogate. 
+ if !((0xD800..=0xDBFF).contains(&high) + && self.peek() == Some('\\') + && self.peek2() == Some('u')) + { + return Some(SurrogatePair::CodePoint(high)); + } + + self.current.chars.next(); + self.current.chars.next(); + + let low = self.hex_4_digits()?; + + // The second code unit of a surrogate pair is always in the range from 0xDC00 to 0xDFFF, and is called a low surrogate or a trail surrogate. + if !(0xDC00..=0xDFFF).contains(&low) { + return Some(SurrogatePair::HighLow(high, low)); + } + + // `https://tc39.es/ecma262/#sec-utf16decodesurrogatepair` + let astral_code_point = (high - 0xD800) * 0x400 + low - 0xDC00 + 0x10000; + + Some(SurrogatePair::Astral(astral_code_point)) + } + + // EscapeSequence :: + pub(super) fn read_string_escape_sequence( + &mut self, + text: &mut String<'a>, + in_template: bool, + is_valid_escape_sequence: &mut bool, + ) { + match self.current.chars.next() { + None => { + self.error(diagnostics::UnterminatedString(self.unterminated_range())); + } + Some(c) => match c { + // \ LineTerminatorSequence + // LineTerminatorSequence :: + // + // [lookahead ≠ ] + // + // + // + LF | LS | PS => {} + CR => { + self.next_eq(LF); + } + // SingleEscapeCharacter :: one of + // ' " \ b f n r t v + '\'' | '"' | '\\' => text.push(c), + 'b' => text.push('\u{8}'), + 'f' => text.push(FF), + 'n' => text.push(LF), + 'r' => text.push(CR), + 't' => text.push(TAB), + 'v' => text.push(VT), + // HexEscapeSequence + 'x' => { + self.hex_digit() + .and_then(|value1| { + let value2 = self.hex_digit()?; + Some((value1, value2)) + }) + .map(|(value1, value2)| (value1 << 4) | value2) + .and_then(|value| char::try_from(value).ok()) + .map_or_else( + || { + *is_valid_escape_sequence = false; + }, + |c| { + text.push(c); + }, + ); + } + // UnicodeEscapeSequence + 'u' => { + self.string_unicode_escape_sequence(text, is_valid_escape_sequence); + } + // 0 [lookahead ∉ DecimalDigit] + '0' if !self.peek().is_some_and(|c| c.is_ascii_digit()) => text.push('\0'), + // Section 
12.9.4 String Literals + // LegacyOctalEscapeSequence + // NonOctalDecimalEscapeSequence + a @ '0'..='7' if !in_template => { + let mut num = String::new_in(self.allocator); + num.push(a); + match a { + '4'..='7' => { + if matches!(self.peek(), Some('0'..='7')) { + let b = self.consume_char(); + num.push(b); + } + } + '0'..='3' => { + if matches!(self.peek(), Some('0'..='7')) { + let b = self.consume_char(); + num.push(b); + if matches!(self.peek(), Some('0'..='7')) { + let c = self.consume_char(); + num.push(c); + } + } + } + _ => {} + } + + let value = + char::from_u32(u32::from_str_radix(num.as_str(), 8).unwrap()).unwrap(); + text.push(value); + } + '0' if in_template && self.peek().is_some_and(|c| c.is_ascii_digit()) => { + self.current.chars.next(); + // error raised within the parser by `diagnostics::TemplateLiteral` + *is_valid_escape_sequence = false; + } + // NotEscapeSequence :: DecimalDigit but not 0 + '1'..='9' if in_template => { + // error raised within the parser by `diagnostics::TemplateLiteral` + *is_valid_escape_sequence = false; + } + other => { + // NonOctalDecimalEscapeSequence \8 \9 in strict mode + text.push(other); + } + }, + } + } +}