diff --git a/.typos.toml b/.typos.toml index 0850c991c..ab2db2e38 100644 --- a/.typos.toml +++ b/.typos.toml @@ -8,7 +8,7 @@ extend-exclude = [ "tasks/coverage/babel", "tasks/coverage/typescript", "tasks/prettier_conformance/prettier", - "crates/oxc_parser/src/lexer/mod.rs", + "crates/oxc_parser/src/lexer/byte_handlers.rs", "crates/oxc_linter/fixtures", "crates/oxc_linter/src/rules/jsx_a11y/img_redundant_alt.rs", "crates/oxc_syntax/src/xml_entities.rs", diff --git a/crates/oxc_parser/src/lexer/byte_handlers.rs b/crates/oxc_parser/src/lexer/byte_handlers.rs new file mode 100644 index 000000000..09efc66bc --- /dev/null +++ b/crates/oxc_parser/src/lexer/byte_handlers.rs @@ -0,0 +1,588 @@ +use super::{AutoCow, Kind, Lexer, LexerContext}; +use crate::diagnostics; + +#[allow(clippy::unnecessary_safety_comment)] +/// Handle next byte of source. +/// +/// SAFETY: +/// * Lexer must not be at end of file. +/// * `byte` must be next byte of source code, corresponding to current position +/// of `lexer.current.chars`. +/// * Only `BYTE_HANDLERS` for ASCII characters may use the `ascii_byte_handler!()` macro. +pub(super) unsafe fn handle_byte(byte: u8, lexer: &mut Lexer) -> Kind { + BYTE_HANDLERS[byte as usize](lexer) +} + +type ByteHandler = unsafe fn(&mut Lexer<'_>) -> Kind; + +/// Lookup table mapping any incoming byte to a handler function defined below. 
+/// +#[rustfmt::skip] +static BYTE_HANDLERS: [ByteHandler; 256] = [ +// 0 1 2 3 4 5 6 7 8 9 A B C D E F // + ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, SPS, LIN, SPS, SPS, LIN, ERR, ERR, // 0 + ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, // 1 + SPS, EXL, QOT, HAS, IDT, PRC, AMP, QOT, PNO, PNC, ATR, PLS, COM, MIN, PRD, SLH, // 2 + ZER, DIG, DIG, DIG, DIG, DIG, DIG, DIG, DIG, DIG, COL, SEM, LSS, EQL, GTR, QST, // 3 + AT_, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, // 4 + IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, BTO, ESC, BTC, CRT, IDT, // 5 + TPL, L_A, L_B, L_C, L_D, L_E, L_F, L_G, IDT, L_I, IDT, L_K, L_L, L_M, L_N, L_O, // 6 + L_P, IDT, L_R, L_S, L_T, L_U, L_V, L_W, IDT, L_Y, IDT, BEO, PIP, BEC, TLD, ERR, // 7 + UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // 8 + UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // 9 + UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // A + UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // B + UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // C + UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // D + UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // E + UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // F +]; + +#[allow(clippy::unnecessary_safety_comment)] +/// Macro for defining byte handler for an ASCII character. +/// +/// In addition to defining a `const` for the handler, it also asserts that lexer +/// is not at end of file, and that next char is ASCII. +/// Where the handler is for an ASCII character, these assertions are self-evidently true. 
+/// +/// These assertions produce no runtime code, but hint to the compiler that it can assume that +/// next char is ASCII, and it uses that information to optimize the rest of the handler. +/// e.g. `lexer.current.chars.next()` becomes just a single assembler instruction. +/// Without the assertions, the compiler is unable to deduce the next char is ASCII, due to +/// the indirection of the `BYTE_HANDLERS` jump table. +/// +/// These assertions are unchecked (i.e. won't panic) and will cause UB if they're incorrect. +/// +/// SAFETY: Only use this macro to define byte handlers for ASCII characters. +/// +/// ``` +/// ascii_byte_handler!(SPS(lexer) { +/// lexer.consume_char(); +/// Kind::WhiteSpace +/// }); +/// ``` +/// +/// expands to: +/// +/// ``` +/// const SPS: ByteHandler = |lexer| { +/// unsafe { +/// use assert_unchecked::assert_unchecked; +/// let s = lexer.current.chars.as_str(); +/// assert_unchecked!(!s.is_empty()); +/// assert_unchecked!(s.as_bytes()[0] < 128); +/// } +/// lexer.consume_char(); +/// Kind::WhiteSpace +/// }; +/// ``` +macro_rules! ascii_byte_handler { + ($id:ident($lex:ident) $body:expr) => { + const $id: ByteHandler = |$lex| { + // SAFETY: This macro is only used for ASCII characters + unsafe { + use assert_unchecked::assert_unchecked; + let s = $lex.current.chars.as_str(); + assert_unchecked!(!s.is_empty()); + assert_unchecked!(s.as_bytes()[0] < 128); + } + $body + }; + }; +} + +// `\0` `\1` etc +ascii_byte_handler!(ERR(lexer) { + let c = lexer.consume_char(); + lexer.error(diagnostics::InvalidCharacter(c, lexer.unterminated_range())); + Kind::Undetermined +}); + +// +ascii_byte_handler!(SPS(lexer) { + lexer.consume_char(); + Kind::Skip +}); + +// '\r' '\n' +ascii_byte_handler!(LIN(lexer) { + lexer.consume_char(); + lexer.current.token.is_on_new_line = true; + Kind::Skip +}); + +// ! 
+ascii_byte_handler!(EXL(lexer) { + lexer.consume_char(); + if lexer.next_eq('=') { + if lexer.next_eq('=') { + Kind::Neq2 + } else { + Kind::Neq + } + } else { + Kind::Bang + } +}); + +// ' " +ascii_byte_handler!(QOT(lexer) { + let c = lexer.consume_char(); + if lexer.context == LexerContext::JsxAttributeValue { + lexer.read_jsx_string_literal(c) + } else { + lexer.read_string_literal(c) + } +}); + +// # +ascii_byte_handler!(HAS(lexer) { + lexer.consume_char(); + // HashbangComment :: + // `#!` SingleLineCommentChars? + if lexer.current.token.start == 0 && lexer.next_eq('!') { + lexer.read_hashbang_comment() + } else { + lexer.private_identifier() + } +}); + +// `A..=Z`, `a..=z` (except special cases below), `_`, `$` +ascii_byte_handler!(IDT(lexer) { + lexer.identifier_name_handler(); + Kind::Ident +}); + +// % +ascii_byte_handler!(PRC(lexer) { + lexer.consume_char(); + if lexer.next_eq('=') { + Kind::PercentEq + } else { + Kind::Percent + } +}); + +// & +ascii_byte_handler!(AMP(lexer) { + lexer.consume_char(); + if lexer.next_eq('&') { + if lexer.next_eq('=') { + Kind::Amp2Eq + } else { + Kind::Amp2 + } + } else if lexer.next_eq('=') { + Kind::AmpEq + } else { + Kind::Amp + } +}); + +// ( +ascii_byte_handler!(PNO(lexer) { + lexer.consume_char(); + Kind::LParen +}); + +// ) +ascii_byte_handler!(PNC(lexer) { + lexer.consume_char(); + Kind::RParen +}); + +// * +ascii_byte_handler!(ATR(lexer) { + lexer.consume_char(); + if lexer.next_eq('*') { + if lexer.next_eq('=') { + Kind::Star2Eq + } else { + Kind::Star2 + } + } else if lexer.next_eq('=') { + Kind::StarEq + } else { + Kind::Star + } +}); + +// + +ascii_byte_handler!(PLS(lexer) { + lexer.consume_char(); + if lexer.next_eq('+') { + Kind::Plus2 + } else if lexer.next_eq('=') { + Kind::PlusEq + } else { + Kind::Plus + } +}); + +// , +ascii_byte_handler!(COM(lexer) { + lexer.consume_char(); + Kind::Comma +}); + +// - +ascii_byte_handler!(MIN(lexer) { + lexer.consume_char(); + lexer.read_minus().unwrap_or_else(|| 
lexer.skip_single_line_comment()) +}); + +// . +ascii_byte_handler!(PRD(lexer) { + lexer.consume_char(); + lexer.read_dot() +}); + +// / +ascii_byte_handler!(SLH(lexer) { + lexer.consume_char(); + match lexer.peek() { + Some('/') => { + lexer.current.chars.next(); + lexer.skip_single_line_comment() + } + Some('*') => { + lexer.current.chars.next(); + lexer.skip_multi_line_comment() + } + _ => { + // regex is handled separately, see `next_regex` + if lexer.next_eq('=') { + Kind::SlashEq + } else { + Kind::Slash + } + } + } +}); + +// 0 +ascii_byte_handler!(ZER(lexer) { + lexer.consume_char(); + lexer.read_zero() +}); + +// 1 to 9 +ascii_byte_handler!(DIG(lexer) { + lexer.consume_char(); + lexer.decimal_literal_after_first_digit() +}); + +// : +ascii_byte_handler!(COL(lexer) { + lexer.consume_char(); + Kind::Colon +}); + +// ; +ascii_byte_handler!(SEM(lexer) { + lexer.consume_char(); + Kind::Semicolon +}); + +// < +ascii_byte_handler!(LSS(lexer) { + lexer.consume_char(); + lexer.read_left_angle().unwrap_or_else(|| lexer.skip_single_line_comment()) +}); + +// = +ascii_byte_handler!(EQL(lexer) { + lexer.consume_char(); + if lexer.next_eq('=') { + if lexer.next_eq('=') { + Kind::Eq3 + } else { + Kind::Eq2 + } + } else if lexer.next_eq('>') { + Kind::Arrow + } else { + Kind::Eq + } +}); + +// > +ascii_byte_handler!(GTR(lexer) { + lexer.consume_char(); + // `>=` is re-lexed with [Lexer::next_jsx_child] + Kind::RAngle +}); + +// ? 
+ascii_byte_handler!(QST(lexer) { + lexer.consume_char(); + if lexer.next_eq('?') { + if lexer.next_eq('=') { + Kind::Question2Eq + } else { + Kind::Question2 + } + } else if lexer.peek() == Some('.') { + // parse `?.1` as `?` `.1` + if lexer.peek2().is_some_and(|c| c.is_ascii_digit()) { + Kind::Question + } else { + lexer.current.chars.next(); + Kind::QuestionDot + } + } else { + Kind::Question + } +}); + +// @ +ascii_byte_handler!(AT_(lexer) { + lexer.consume_char(); + Kind::At +}); + +// [ +ascii_byte_handler!(BTO(lexer) { + lexer.consume_char(); + Kind::LBrack +}); + +// \ +ascii_byte_handler!(ESC(lexer) { + let mut builder = AutoCow::new(lexer); + lexer.consume_char(); + builder.force_allocation_without_current_ascii_char(lexer); + lexer.identifier_unicode_escape_sequence(&mut builder, true); + let text = lexer.identifier_name(builder); + Kind::match_keyword(text) +}); + +// ] +ascii_byte_handler!(BTC(lexer) { + lexer.consume_char(); + Kind::RBrack +}); + +// ^ +ascii_byte_handler!(CRT(lexer) { + lexer.consume_char(); + if lexer.next_eq('=') { + Kind::CaretEq + } else { + Kind::Caret + } +}); + +// ` +ascii_byte_handler!(TPL(lexer) { + lexer.consume_char(); + lexer.read_template_literal(Kind::TemplateHead, Kind::NoSubstitutionTemplate) +}); + +// { +ascii_byte_handler!(BEO(lexer) { + lexer.consume_char(); + Kind::LCurly +}); + +// | +ascii_byte_handler!(PIP(lexer) { + lexer.consume_char(); + if lexer.next_eq('|') { + if lexer.next_eq('=') { + Kind::Pipe2Eq + } else { + Kind::Pipe2 + } + } else if lexer.next_eq('=') { + Kind::PipeEq + } else { + Kind::Pipe + } +}); + +// } +ascii_byte_handler!(BEC(lexer) { + lexer.consume_char(); + Kind::RCurly +}); + +// ~ +ascii_byte_handler!(TLD(lexer) { + lexer.consume_char(); + Kind::Tilde +}); + +ascii_byte_handler!(L_A(lexer) match &lexer.identifier_name_handler()[1..] 
{ + "wait" => Kind::Await, + "sync" => Kind::Async, + "bstract" => Kind::Abstract, + "ccessor" => Kind::Accessor, + "ny" => Kind::Any, + "s" => Kind::As, + "ssert" => Kind::Assert, + "sserts" => Kind::Asserts, + _ => Kind::Ident, +}); + +ascii_byte_handler!(L_B(lexer) match &lexer.identifier_name_handler()[1..] { + "reak" => Kind::Break, + "oolean" => Kind::Boolean, + "igint" => Kind::BigInt, + _ => Kind::Ident, +}); + +ascii_byte_handler!(L_C(lexer) match &lexer.identifier_name_handler()[1..] { + "onst" => Kind::Const, + "lass" => Kind::Class, + "ontinue" => Kind::Continue, + "atch" => Kind::Catch, + "ase" => Kind::Case, + "onstructor" => Kind::Constructor, + _ => Kind::Ident, +}); + +ascii_byte_handler!(L_D(lexer) match &lexer.identifier_name_handler()[1..] { + "o" => Kind::Do, + "elete" => Kind::Delete, + "eclare" => Kind::Declare, + "efault" => Kind::Default, + "ebugger" => Kind::Debugger, + _ => Kind::Ident, +}); + +ascii_byte_handler!(L_E(lexer) match &lexer.identifier_name_handler()[1..] { + "lse" => Kind::Else, + "num" => Kind::Enum, + "xport" => Kind::Export, + "xtends" => Kind::Extends, + _ => Kind::Ident, +}); + +ascii_byte_handler!(L_F(lexer) match &lexer.identifier_name_handler()[1..] { + "unction" => Kind::Function, + "alse" => Kind::False, + "or" => Kind::For, + "inally" => Kind::Finally, + "rom" => Kind::From, + _ => Kind::Ident, +}); + +ascii_byte_handler!(L_G(lexer) match &lexer.identifier_name_handler()[1..] { + "et" => Kind::Get, + "lobal" => Kind::Global, + _ => Kind::Ident, +}); + +ascii_byte_handler!(L_I(lexer) match &lexer.identifier_name_handler()[1..] { + "f" => Kind::If, + "nstanceof" => Kind::Instanceof, + "n" => Kind::In, + "mplements" => Kind::Implements, + "mport" => Kind::Import, + "nfer" => Kind::Infer, + "nterface" => Kind::Interface, + "ntrinsic" => Kind::Intrinsic, + "s" => Kind::Is, + _ => Kind::Ident, +}); + +ascii_byte_handler!(L_K(lexer) match &lexer.identifier_name_handler()[1..] 
{ + "eyof" => Kind::KeyOf, + _ => Kind::Ident, +}); + +ascii_byte_handler!(L_L(lexer) match &lexer.identifier_name_handler()[1..] { + "et" => Kind::Let, + _ => Kind::Ident, +}); + +ascii_byte_handler!(L_M(lexer) match &lexer.identifier_name_handler()[1..] { + "eta" => Kind::Meta, + "odule" => Kind::Module, + _ => Kind::Ident, +}); + +ascii_byte_handler!(L_N(lexer) match &lexer.identifier_name_handler()[1..] { + "ull" => Kind::Null, + "ew" => Kind::New, + "umber" => Kind::Number, + "amespace" => Kind::Namespace, + "ever" => Kind::Never, + _ => Kind::Ident, +}); + +ascii_byte_handler!(L_O(lexer) match &lexer.identifier_name_handler()[1..] { + "f" => Kind::Of, + "bject" => Kind::Object, + "ut" => Kind::Out, + "verride" => Kind::Override, + _ => Kind::Ident, +}); + +ascii_byte_handler!(L_P(lexer) match &lexer.identifier_name_handler()[1..] { + "ackage" => Kind::Package, + "rivate" => Kind::Private, + "rotected" => Kind::Protected, + "ublic" => Kind::Public, + _ => Kind::Ident, +}); + +ascii_byte_handler!(L_R(lexer) match &lexer.identifier_name_handler()[1..] { + "eturn" => Kind::Return, + "equire" => Kind::Require, + "eadonly" => Kind::Readonly, + _ => Kind::Ident, +}); + +ascii_byte_handler!(L_S(lexer) match &lexer.identifier_name_handler()[1..] { + "et" => Kind::Set, + "uper" => Kind::Super, + "witch" => Kind::Switch, + "tatic" => Kind::Static, + "ymbol" => Kind::Symbol, + "tring" => Kind::String, + "atisfies" => Kind::Satisfies, + _ => Kind::Ident, +}); + +ascii_byte_handler!(L_T(lexer) match &lexer.identifier_name_handler()[1..] { + "his" => Kind::This, + "rue" => Kind::True, + "hrow" => Kind::Throw, + "ry" => Kind::Try, + "ypeof" => Kind::Typeof, + "arget" => Kind::Target, + "ype" => Kind::Type, + _ => Kind::Ident, +}); + +ascii_byte_handler!(L_U(lexer) match &lexer.identifier_name_handler()[1..] 
{ + "ndefined" => Kind::Undefined, + "sing" => Kind::Using, + "nique" => Kind::Unique, + "nknown" => Kind::Unknown, + _ => Kind::Ident, +}); + +ascii_byte_handler!(L_V(lexer) match &lexer.identifier_name_handler()[1..] { + "ar" => Kind::Var, + "oid" => Kind::Void, + _ => Kind::Ident, +}); + +ascii_byte_handler!(L_W(lexer) match &lexer.identifier_name_handler()[1..] { + "hile" => Kind::While, + "ith" => Kind::With, + _ => Kind::Ident, +}); + +ascii_byte_handler!(L_Y(lexer) match &lexer.identifier_name_handler()[1..] { + "ield" => Kind::Yield, + _ => Kind::Ident, +}); + +// Non-ASCII characters. +// NB: Must not use `ascii_byte_handler!()` macro, as this handler is for non-ASCII chars. +#[allow(clippy::redundant_closure_for_method_calls)] +const UNI: ByteHandler = |lexer| lexer.unicode_char_handler(); diff --git a/crates/oxc_parser/src/lexer/comment.rs b/crates/oxc_parser/src/lexer/comment.rs new file mode 100644 index 000000000..f195796ba --- /dev/null +++ b/crates/oxc_parser/src/lexer/comment.rs @@ -0,0 +1,49 @@ +use super::{Kind, Lexer}; +use crate::diagnostics; + +use oxc_syntax::identifier::is_line_terminator; + +impl<'a> Lexer<'a> { + /// Section 12.4 Single Line Comment + #[allow(clippy::cast_possible_truncation)] + pub(super) fn skip_single_line_comment(&mut self) -> Kind { + let start = self.current.token.start; + while let Some(c) = self.current.chars.next() { + if is_line_terminator(c) { + self.current.token.is_on_new_line = true; + self.trivia_builder + .add_single_line_comment(start, self.offset() - c.len_utf8() as u32); + return Kind::Skip; + } + } + // EOF + self.trivia_builder.add_single_line_comment(start, self.offset()); + Kind::Skip + } + + /// Section 12.4 Multi Line Comment + pub(super) fn skip_multi_line_comment(&mut self) -> Kind { + while let Some(c) = self.current.chars.next() { + if c == '*' && self.next_eq('/') { + self.trivia_builder.add_multi_line_comment(self.current.token.start, self.offset()); + return Kind::Skip; + } + if 
is_line_terminator(c) { + self.current.token.is_on_new_line = true; + } + } + self.error(diagnostics::UnterminatedMultiLineComment(self.unterminated_range())); + Kind::Eof + } + + /// Section 12.5 Hashbang Comments + pub(super) fn read_hashbang_comment(&mut self) -> Kind { + while let Some(c) = self.current.chars.next().as_ref() { + if is_line_terminator(*c) { + break; + } + } + self.current.token.is_on_new_line = true; + Kind::HashbangComment + } +} diff --git a/crates/oxc_parser/src/lexer/identifier.rs b/crates/oxc_parser/src/lexer/identifier.rs new file mode 100644 index 000000000..272dd32f8 --- /dev/null +++ b/crates/oxc_parser/src/lexer/identifier.rs @@ -0,0 +1,66 @@ +use super::{AutoCow, Kind, Lexer, Span}; +use crate::diagnostics; + +use oxc_syntax::identifier::{is_identifier_part, is_identifier_start}; + +impl<'a> Lexer<'a> { + /// Section 12.7.1 Identifier Names + pub(super) fn identifier_name_handler(&mut self) -> &'a str { + let builder = AutoCow::new(self); + self.consume_char(); + self.identifier_name(builder) + } + + pub(super) fn identifier_name(&mut self, builder: AutoCow<'a>) -> &'a str { + self.identifier_tail(builder) + } + + pub(super) fn private_identifier(&mut self) -> Kind { + let mut builder = AutoCow::new(self); + let start = self.offset(); + match self.current.chars.next() { + Some(c) if is_identifier_start(c) => { + builder.push_matching(c); + } + Some('\\') => { + builder.force_allocation_without_current_ascii_char(self); + self.identifier_unicode_escape_sequence(&mut builder, true); + } + Some(c) => { + #[allow(clippy::cast_possible_truncation)] + self.error(diagnostics::InvalidCharacter( + c, + Span::new(start, start + c.len_utf8() as u32), + )); + return Kind::Undetermined; + } + None => { + self.error(diagnostics::UnexpectedEnd(Span::new(start, start))); + return Kind::Undetermined; + } + } + self.identifier_tail(builder); + Kind::PrivateIdentifier + } + + fn identifier_tail(&mut self, mut builder: AutoCow<'a>) -> &'a str { + // 
ident tail + while let Some(c) = self.peek() { + if !is_identifier_part(c) { + if c == '\\' { + self.current.chars.next(); + builder.force_allocation_without_current_ascii_char(self); + self.identifier_unicode_escape_sequence(&mut builder, false); + continue; + } + break; + } + self.current.chars.next(); + builder.push_matching(c); + } + let has_escape = builder.has_escape(); + let text = builder.finish(self); + self.save_string(has_escape, text); + text + } +} diff --git a/crates/oxc_parser/src/lexer/jsx.rs b/crates/oxc_parser/src/lexer/jsx.rs new file mode 100644 index 000000000..94b4d0e7e --- /dev/null +++ b/crates/oxc_parser/src/lexer/jsx.rs @@ -0,0 +1,108 @@ +use super::{AutoCow, Kind, Lexer, Token}; +use crate::diagnostics; + +use oxc_syntax::identifier::{is_identifier_part, is_identifier_start}; + +impl<'a> Lexer<'a> { + /// `JSXDoubleStringCharacters` :: + /// `JSXDoubleStringCharacter` `JSXDoubleStringCharactersopt` + /// `JSXDoubleStringCharacter` :: + /// `JSXStringCharacter` but not " + /// `JSXSingleStringCharacters` :: + /// `JSXSingleStringCharacter` `JSXSingleStringCharactersopt` + /// `JSXSingleStringCharacter` :: + /// `JSXStringCharacter` but not ' + /// `JSXStringCharacter` :: + /// `SourceCharacter` but not one of `HTMLCharacterReference` + pub(super) fn read_jsx_string_literal(&mut self, delimiter: char) -> Kind { + let mut builder = AutoCow::new(self); + loop { + match self.current.chars.next() { + Some(c @ ('"' | '\'')) => { + if c == delimiter { + self.save_string(builder.has_escape(), builder.finish_without_push(self)); + return Kind::Str; + } + builder.push_matching(c); + } + Some(other) => { + builder.push_matching(other); + } + None => { + self.error(diagnostics::UnterminatedString(self.unterminated_range())); + return Kind::Undetermined; + } + } + } + } + + pub(crate) fn next_jsx_child(&mut self) -> Token { + self.current.token.start = self.offset(); + let kind = self.read_jsx_child(); + self.finish_next(kind) + } + + /// Expand the 
current token for `JSXIdentifier` + pub(crate) fn next_jsx_identifier(&mut self, start_offset: u32) -> Token { + let kind = self.read_jsx_identifier(start_offset); + self.lookahead.clear(); + self.finish_next(kind) + } + + /// [`JSXChild`](https://facebook.github.io/jsx/#prod-JSXChild) + /// `JSXChild` : + /// `JSXText` + /// `JSXElement` + /// `JSXFragment` + /// { `JSXChildExpressionopt` } + fn read_jsx_child(&mut self) -> Kind { + match self.peek() { + Some('<') => { + self.current.chars.next(); + Kind::LAngle + } + Some('{') => { + self.current.chars.next(); + Kind::LCurly + } + Some(_) => { + loop { + // The tokens `{`, `<`, `>` and `}` cannot appear in a jsx text. + // The TypeScript compiler raises the error "Unexpected token. Did you mean `{'>'}` or `&gt;`?". + // Whereas the Babel compiler does not raise any errors. + // The following check omits `>` and `}` so that more Babel tests can be passed. + if self.peek().is_some_and(|c| c == '{' || c == '<') { + break; + } + if self.current.chars.next().is_none() { + break; + } + } + Kind::JSXText + } + None => Kind::Eof, + } + } + + /// `JSXIdentifier` : + /// `IdentifierStart` + /// `JSXIdentifier` `IdentifierPart` + /// `JSXIdentifier` [no `WhiteSpace` or Comment here] - + fn read_jsx_identifier(&mut self, _start_offset: u32) -> Kind { + while let Some(c) = self.peek() { + if c == '-' || is_identifier_start(c) { + self.current.chars.next(); + while let Some(c) = self.peek() { + if is_identifier_part(c) { + self.current.chars.next(); + } else { + break; + } + } + } else { + break; + } + } + Kind::Ident + } +} diff --git a/crates/oxc_parser/src/lexer/mod.rs b/crates/oxc_parser/src/lexer/mod.rs index 3cb8f585e..2c14e2d85 100644 --- a/crates/oxc_parser/src/lexer/mod.rs +++ b/crates/oxc_parser/src/lexer/mod.rs @@ -5,31 +5,37 @@ //! * [rustc](https://github.com/rust-lang/rust/blob/master/compiler/rustc_lexer/src) //! 
* [v8](https://v8.dev/blog/scanner) +mod byte_handlers; +mod comment; +mod identifier; +mod jsx; mod kind; mod number; +mod numeric; +mod punctuation; +mod regex; +mod string; mod string_builder; +mod template; mod token; mod trivia_builder; +mod typescript; +mod unicode; use rustc_hash::FxHashMap; use std::{collections::VecDeque, str::Chars}; -use oxc_allocator::{Allocator, String}; +use oxc_allocator::Allocator; use oxc_ast::ast::RegExpFlags; use oxc_diagnostics::Error; use oxc_span::{SourceType, Span}; -use oxc_syntax::identifier::{ - is_identifier_part, is_identifier_start, is_identifier_start_unicode, - is_irregular_line_terminator, is_irregular_whitespace, is_line_terminator, CR, FF, LF, LS, PS, - TAB, VT, -}; +use self::{byte_handlers::handle_byte, string_builder::AutoCow, trivia_builder::TriviaBuilder}; pub use self::{ kind::Kind, number::{parse_big_int, parse_float, parse_int}, token::Token, }; -use self::{string_builder::AutoCow, trivia_builder::TriviaBuilder}; use crate::{diagnostics, MAX_LEN}; #[derive(Debug, Clone)] @@ -173,12 +179,6 @@ impl<'a> Lexer<'a> { self.finish_next(kind) } - pub fn next_jsx_child(&mut self) -> Token { - self.current.token.start = self.offset(); - let kind = self.read_jsx_child(); - self.finish_next(kind) - } - fn finish_next(&mut self, kind: Kind) -> Token { self.current.token.kind = kind; self.current.token.end = self.offset(); @@ -188,61 +188,6 @@ impl<'a> Lexer<'a> { token } - /// Re-tokenize the current `/` or `/=` and return `RegExp` - /// See Section 12: - /// The `InputElementRegExp` goal symbol is used in all syntactic grammar contexts - /// where a `RegularExpressionLiteral` is permitted - /// Which means the parser needs to re-tokenize on `PrimaryExpression`, - /// `RegularExpressionLiteral` only appear on the right hand side of `PrimaryExpression` - pub fn next_regex(&mut self, kind: Kind) -> (Token, u32, RegExpFlags) { - self.current.token.start = self.offset() - - match kind { - Kind::Slash => 1, - Kind::SlashEq 
=> 2, - _ => unreachable!(), - }; - let (pattern_end, flags) = self.read_regex(); - self.lookahead.clear(); - let token = self.finish_next(Kind::RegExp); - (token, pattern_end, flags) - } - - pub fn next_right_angle(&mut self) -> Token { - let kind = self.read_right_angle(); - self.lookahead.clear(); - self.finish_next(kind) - } - - /// Re-tokenize the current `}` token for `TemplateSubstitutionTail` - /// See Section 12, the parser needs to re-tokenize on `TemplateSubstitutionTail`, - pub fn next_template_substitution_tail(&mut self) -> Token { - self.current.token.start = self.offset() - 1; - let kind = self.read_template_literal(Kind::TemplateMiddle, Kind::TemplateTail); - self.lookahead.clear(); - self.finish_next(kind) - } - - /// Expand the current token for `JSXIdentifier` - pub fn next_jsx_identifier(&mut self, start_offset: u32) -> Token { - let kind = self.read_jsx_identifier(start_offset); - self.lookahead.clear(); - self.finish_next(kind) - } - - /// Re-tokenize '<<' or '<=' or '<<=' to '<' - pub fn re_lex_as_typescript_l_angle(&mut self, kind: Kind) -> Token { - let offset = match kind { - Kind::ShiftLeft | Kind::LtEq => 2, - Kind::ShiftLeftEq => 3, - _ => unreachable!(), - }; - self.current.token.start = self.offset() - offset; - self.current.chars = self.source[self.current.token.start as usize + 1..].chars(); - let kind = Kind::LAngle; - self.lookahead.clear(); - self.finish_next(kind) - } - // ---------- Private Methods ---------- // fn error<T: Into<Error>>(&mut self, error: T) { self.errors.push(error.into()); @@ -311,65 +256,6 @@ impl<'a> Lexer<'a> { } } - /// Save the string if it is escaped - /// This reduces the overall memory consumption while keeping the `Token` size small - /// Strings without escaped values can be retrieved as is from the token span - fn save_string(&mut self, has_escape: bool, s: &'a str) { - if !has_escape { - return; - } - self.escaped_strings.insert(self.current.token.start, s); - self.current.token.escaped = true; - } - - 
pub(crate) fn get_string(&self, token: Token) -> &'a str { - if token.escaped { - return self.escaped_strings[&token.start]; - } - - let raw = &self.source[token.start as usize..token.end as usize]; - match token.kind { - Kind::Str => { - &raw[1..raw.len() - 1] // omit surrounding quotes - } - Kind::PrivateIdentifier => { - &raw[1..] // omit leading `#` - } - _ => raw, - } - } - - /// Save the template if it is escaped - fn save_template_string( - &mut self, - is_valid_escape_sequence: bool, - has_escape: bool, - s: &'a str, - ) { - if !has_escape { - return; - } - self.escaped_templates - .insert(self.current.token.start, is_valid_escape_sequence.then(|| s)); - self.current.token.escaped = true; - } - - pub(crate) fn get_template_string(&self, token: Token) -> Option<&'a str> { - if token.escaped { - return self.escaped_templates[&token.start]; - } - let raw = &self.source[token.start as usize..token.end as usize]; - Some(match token.kind { - Kind::NoSubstitutionTemplate | Kind::TemplateTail => { - &raw[1..raw.len() - 1] // omit surrounding quotes or leading "}" and trailing "`" - } - Kind::TemplateHead | Kind::TemplateMiddle => { - &raw[1..raw.len() - 2] // omit leading "`" or "}" and trailing "${" - } - _ => raw, - }) - } - /// Read each char and set the current token /// Whitespace and line terminators are skipped fn read_next_token(&mut self) -> Kind { @@ -391,1484 +277,4 @@ impl<'a> Lexer<'a> { } } } - - fn unicode_char_handler(&mut self) -> Kind { - let c = self.current.chars.clone().next().unwrap(); - match c { - c if is_identifier_start_unicode(c) => { - let mut builder = AutoCow::new(self); - let c = self.consume_char(); - builder.push_matching(c); - self.identifier_name(builder); - Kind::Ident - } - c if is_irregular_whitespace(c) => { - self.trivia_builder - .add_irregular_whitespace(self.current.token.start, self.offset()); - self.consume_char(); - Kind::Skip - } - c if is_irregular_line_terminator(c) => { - self.consume_char(); - 
self.current.token.is_on_new_line = true; - Kind::Skip - } - _ => { - self.consume_char(); - self.error(diagnostics::InvalidCharacter(c, self.unterminated_range())); - Kind::Undetermined - } - } - } - - /// Section 12.4 Single Line Comment - #[allow(clippy::cast_possible_truncation)] - fn skip_single_line_comment(&mut self) -> Kind { - let start = self.current.token.start; - while let Some(c) = self.current.chars.next() { - if is_line_terminator(c) { - self.current.token.is_on_new_line = true; - self.trivia_builder - .add_single_line_comment(start, self.offset() - c.len_utf8() as u32); - return Kind::Skip; - } - } - // EOF - self.trivia_builder.add_single_line_comment(start, self.offset()); - Kind::Skip - } - - /// Section 12.4 Multi Line Comment - fn skip_multi_line_comment(&mut self) -> Kind { - while let Some(c) = self.current.chars.next() { - if c == '*' && self.next_eq('/') { - self.trivia_builder.add_multi_line_comment(self.current.token.start, self.offset()); - return Kind::Skip; - } - if is_line_terminator(c) { - self.current.token.is_on_new_line = true; - } - } - self.error(diagnostics::UnterminatedMultiLineComment(self.unterminated_range())); - Kind::Eof - } - - /// Section 12.5 Hashbang Comments - fn read_hashbang_comment(&mut self) -> Kind { - while let Some(c) = self.current.chars.next().as_ref() { - if is_line_terminator(*c) { - break; - } - } - self.current.token.is_on_new_line = true; - Kind::HashbangComment - } - - /// Section 12.7.1 Identifier Names - fn identifier_tail(&mut self, mut builder: AutoCow<'a>) -> &'a str { - // ident tail - while let Some(c) = self.peek() { - if !is_identifier_part(c) { - if c == '\\' { - self.current.chars.next(); - builder.force_allocation_without_current_ascii_char(self); - self.identifier_unicode_escape_sequence(&mut builder, false); - continue; - } - break; - } - self.current.chars.next(); - builder.push_matching(c); - } - let has_escape = builder.has_escape(); - let text = builder.finish(self); - 
self.save_string(has_escape, text); - text - } - - fn identifier_name(&mut self, builder: AutoCow<'a>) -> &'a str { - self.identifier_tail(builder) - } - - fn identifier_name_handler(&mut self) -> &'a str { - let builder = AutoCow::new(self); - self.consume_char(); - self.identifier_name(builder) - } - - /// Section 12.8 Punctuators - fn read_dot(&mut self) -> Kind { - if self.peek() == Some('.') && self.peek2() == Some('.') { - self.current.chars.next(); - self.current.chars.next(); - return Kind::Dot3; - } - if self.peek().is_some_and(|c| c.is_ascii_digit()) { - self.decimal_literal_after_decimal_point() - } else { - Kind::Dot - } - } - - /// returns None for `SingleLineHTMLCloseComment` `-->` in script mode - fn read_minus(&mut self) -> Option<Kind> { - if self.next_eq('-') { - // SingleLineHTMLCloseComment `-->` in script mode - if self.current.token.is_on_new_line - && self.source_type.is_script() - && self.next_eq('>') - { - None - } else { - Some(Kind::Minus2) - } - } else if self.next_eq('=') { - Some(Kind::MinusEq) - } else { - Some(Kind::Minus) - } - } - - fn private_identifier(&mut self) -> Kind { - let mut builder = AutoCow::new(self); - let start = self.offset(); - match self.current.chars.next() { - Some(c) if is_identifier_start(c) => { - builder.push_matching(c); - } - Some('\\') => { - builder.force_allocation_without_current_ascii_char(self); - self.identifier_unicode_escape_sequence(&mut builder, true); - } - Some(c) => { - #[allow(clippy::cast_possible_truncation)] - self.error(diagnostics::InvalidCharacter( - c, - Span::new(start, start + c.len_utf8() as u32), - )); - return Kind::Undetermined; - } - None => { - self.error(diagnostics::UnexpectedEnd(Span::new(start, start))); - return Kind::Undetermined; - } - } - self.identifier_tail(builder); - Kind::PrivateIdentifier - } - - /// 12.9.3 Numeric Literals with `0` prefix - fn read_zero(&mut self) -> Kind { - match self.peek() { - Some('b' | 'B') => self.read_non_decimal(Kind::Binary), - Some('o' | 'O') 
=> self.read_non_decimal(Kind::Octal), - Some('x' | 'X') => self.read_non_decimal(Kind::Hex), - Some('e' | 'E') => { - self.current.chars.next(); - self.read_decimal_exponent() - } - Some('.') => { - self.current.chars.next(); - self.decimal_literal_after_decimal_point_after_digits() - } - Some('n') => { - self.current.chars.next(); - self.check_after_numeric_literal(Kind::Decimal) - } - Some(n) if n.is_ascii_digit() => self.read_legacy_octal(), - _ => self.check_after_numeric_literal(Kind::Decimal), - } - } - - fn read_non_decimal(&mut self, kind: Kind) -> Kind { - self.current.chars.next(); - - if self.peek().is_some_and(|c| kind.matches_number_char(c)) { - self.current.chars.next(); - } else { - self.unexpected_err(); - return Kind::Undetermined; - } - - while let Some(c) = self.peek() { - match c { - '_' => { - self.current.chars.next(); - if self.peek().is_some_and(|c| kind.matches_number_char(c)) { - self.current.chars.next(); - } else { - self.unexpected_err(); - return Kind::Undetermined; - } - } - c if kind.matches_number_char(c) => { - self.current.chars.next(); - } - _ => break, - } - } - if self.peek() == Some('n') { - self.current.chars.next(); - } - self.check_after_numeric_literal(kind) - } - - fn read_legacy_octal(&mut self) -> Kind { - let mut kind = Kind::Octal; - loop { - match self.peek() { - Some('0'..='7') => { - self.current.chars.next(); - } - Some('8'..='9') => { - self.current.chars.next(); - kind = Kind::Decimal; - } - _ => break, - } - } - - match self.peek() { - // allow 08.5 and 09.5 - Some('.') if kind == Kind::Decimal => { - self.current.chars.next(); - self.decimal_literal_after_decimal_point_after_digits() - } - // allow 08e1 and 09e1 - Some('e') if kind == Kind::Decimal => { - self.current.chars.next(); - self.read_decimal_exponent() - } - _ => self.check_after_numeric_literal(kind), - } - } - - fn decimal_literal_after_first_digit(&mut self) -> Kind { - self.read_decimal_digits_after_first_digit(); - if self.next_eq('.') { - 
return self.decimal_literal_after_decimal_point_after_digits(); - } else if self.next_eq('n') { - return self.check_after_numeric_literal(Kind::Decimal); - } - - let kind = self.optional_exponent().map_or(Kind::Decimal, |kind| kind); - self.check_after_numeric_literal(kind) - } - - fn read_decimal_exponent(&mut self) -> Kind { - let kind = match self.peek() { - Some('-') => { - self.current.chars.next(); - Kind::NegativeExponential - } - Some('+') => { - self.current.chars.next(); - Kind::PositiveExponential - } - _ => Kind::PositiveExponential, - }; - self.read_decimal_digits(); - kind - } - - fn read_decimal_digits(&mut self) { - if self.peek().is_some_and(|c| c.is_ascii_digit()) { - self.current.chars.next(); - } else { - self.unexpected_err(); - return; - } - - self.read_decimal_digits_after_first_digit(); - } - - fn read_decimal_digits_after_first_digit(&mut self) { - while let Some(c) = self.peek() { - match c { - '_' => { - self.current.chars.next(); - if self.peek().is_some_and(|c| c.is_ascii_digit()) { - self.current.chars.next(); - } else { - self.unexpected_err(); - return; - } - } - '0'..='9' => { - self.current.chars.next(); - } - _ => break, - } - } - } - - fn decimal_literal_after_decimal_point(&mut self) -> Kind { - self.read_decimal_digits(); - self.optional_exponent(); - self.check_after_numeric_literal(Kind::Float) - } - - fn decimal_literal_after_decimal_point_after_digits(&mut self) -> Kind { - self.optional_decimal_digits(); - self.optional_exponent(); - self.check_after_numeric_literal(Kind::Float) - } - - fn optional_decimal_digits(&mut self) { - if self.peek().is_some_and(|c| c.is_ascii_digit()) { - self.current.chars.next(); - } else { - return; - } - self.read_decimal_digits_after_first_digit(); - } - - fn optional_exponent(&mut self) -> Option { - if matches!(self.peek(), Some('e' | 'E')) { - self.current.chars.next(); - return Some(self.read_decimal_exponent()); - } - None - } - - fn check_after_numeric_literal(&mut self, kind: Kind) -> 
Kind { - let offset = self.offset(); - // The SourceCharacter immediately following a NumericLiteral must not be an IdentifierStart or DecimalDigit. - let c = self.peek(); - if c.is_none() || c.is_some_and(|ch| !ch.is_ascii_digit() && !is_identifier_start(ch)) { - return kind; - } - self.current.chars.next(); - while let Some(c) = self.peek() { - if is_identifier_start(c) { - self.current.chars.next(); - } else { - break; - } - } - self.error(diagnostics::InvalidNumberEnd(Span::new(offset, self.offset()))); - Kind::Undetermined - } - - /// 12.9.4 String Literals - fn read_string_literal(&mut self, delimiter: char) -> Kind { - let mut builder = AutoCow::new(self); - loop { - match self.current.chars.next() { - None | Some('\r' | '\n') => { - self.error(diagnostics::UnterminatedString(self.unterminated_range())); - return Kind::Undetermined; - } - Some(c @ ('"' | '\'')) => { - if c == delimiter { - self.save_string(builder.has_escape(), builder.finish_without_push(self)); - return Kind::Str; - } - builder.push_matching(c); - } - Some('\\') => { - let start = self.offset() - 1; - let text = builder.get_mut_string_without_current_ascii_char(self); - let mut is_valid_escape_sequence = true; - self.read_string_escape_sequence(text, false, &mut is_valid_escape_sequence); - if !is_valid_escape_sequence { - let range = Span::new(start, self.offset()); - self.error(diagnostics::InvalidEscapeSequence(range)); - } - } - Some(c) => { - builder.push_matching(c); - } - } - } - } - - /// 12.9.5 Regular Expression Literals - fn read_regex(&mut self) -> (u32, RegExpFlags) { - let mut in_escape = false; - let mut in_character_class = false; - loop { - match self.current.chars.next() { - None => { - self.error(diagnostics::UnterminatedRegExp(self.unterminated_range())); - return (self.offset(), RegExpFlags::empty()); - } - Some(c) if is_line_terminator(c) => { - self.error(diagnostics::UnterminatedRegExp(self.unterminated_range())); - #[allow(clippy::cast_possible_truncation)] - let 
pattern_end = self.offset() - c.len_utf8() as u32; - return (pattern_end, RegExpFlags::empty()); - } - Some(c) => { - if in_escape { - in_escape = false; - } else if c == '/' && !in_character_class { - break; - } else if c == '[' { - in_character_class = true; - } else if c == '\\' { - in_escape = true; - } else if c == ']' { - in_character_class = false; - } - } - } - } - - let pattern_end = self.offset() - 1; // -1 to exclude `/` - let mut flags = RegExpFlags::empty(); - - while let Some(ch @ ('$' | '_' | 'a'..='z' | 'A'..='Z' | '0'..='9')) = self.peek() { - self.current.chars.next(); - let flag = if let Ok(flag) = RegExpFlags::try_from(ch) { - flag - } else { - self.error(diagnostics::RegExpFlag(ch, self.current_offset())); - continue; - }; - if flags.contains(flag) { - self.error(diagnostics::RegExpFlagTwice(ch, self.current_offset())); - continue; - } - flags |= flag; - } - - (pattern_end, flags) - } - - /// 12.8.6 Template Literal Lexical Components - fn read_template_literal(&mut self, substitute: Kind, tail: Kind) -> Kind { - let mut builder = AutoCow::new(self); - let mut is_valid_escape_sequence = true; - while let Some(c) = self.current.chars.next() { - match c { - '$' if self.peek() == Some('{') => { - self.save_template_string( - is_valid_escape_sequence, - builder.has_escape(), - builder.finish_without_push(self), - ); - self.current.chars.next(); - return substitute; - } - '`' => { - self.save_template_string( - is_valid_escape_sequence, - builder.has_escape(), - builder.finish_without_push(self), - ); - return tail; - } - CR => { - builder.force_allocation_without_current_ascii_char(self); - if self.next_eq(LF) { - builder.push_different(LF); - } - } - '\\' => { - let text = builder.get_mut_string_without_current_ascii_char(self); - self.read_string_escape_sequence(text, true, &mut is_valid_escape_sequence); - } - _ => builder.push_matching(c), - } - } - self.error(diagnostics::UnterminatedString(self.unterminated_range())); - Kind::Undetermined - } 
- - /// `JSXIdentifier` : - /// `IdentifierStart` - /// `JSXIdentifier` `IdentifierPart` - /// `JSXIdentifier` [no `WhiteSpace` or Comment here] - - fn read_jsx_identifier(&mut self, _start_offset: u32) -> Kind { - while let Some(c) = self.peek() { - if c == '-' || is_identifier_start(c) { - self.current.chars.next(); - while let Some(c) = self.peek() { - if is_identifier_part(c) { - self.current.chars.next(); - } else { - break; - } - } - } else { - break; - } - } - Kind::Ident - } - - /// [`JSXChild`](https://facebook.github.io/jsx/#prod-JSXChild) - /// `JSXChild` : - /// `JSXText` - /// `JSXElement` - /// `JSXFragment` - /// { `JSXChildExpressionopt` } - fn read_jsx_child(&mut self) -> Kind { - match self.peek() { - Some('<') => { - self.current.chars.next(); - Kind::LAngle - } - Some('{') => { - self.current.chars.next(); - Kind::LCurly - } - Some(_) => { - loop { - // The tokens `{`, `<`, `>` and `}` cannot appear in a jsx text. - // The TypeScript compiler raises the error "Unexpected token. Did you mean `{'>'}` or `>`?". - // Where as the Babel compiler does not raise any errors. - // The following check omits `>` and `}` so that more Babel tests can be passed. 
- if self.peek().is_some_and(|c| c == '{' || c == '<') { - break; - } - if self.current.chars.next().is_none() { - break; - } - } - Kind::JSXText - } - None => Kind::Eof, - } - } - - /// `JSXDoubleStringCharacters` :: - /// `JSXDoubleStringCharacter` `JSXDoubleStringCharactersopt` - /// `JSXDoubleStringCharacter` :: - /// `JSXStringCharacter` but not " - /// `JSXSingleStringCharacters` :: - /// `JSXSingleStringCharacter` `JSXSingleStringCharactersopt` - /// `JSXSingleStringCharacter` :: - /// `JSXStringCharacter` but not ' - /// `JSXStringCharacter` :: - /// `SourceCharacter` but not one of `HTMLCharacterReference` - fn read_jsx_string_literal(&mut self, delimiter: char) -> Kind { - let mut builder = AutoCow::new(self); - loop { - match self.current.chars.next() { - Some(c @ ('"' | '\'')) => { - if c == delimiter { - self.save_string(builder.has_escape(), builder.finish_without_push(self)); - return Kind::Str; - } - builder.push_matching(c); - } - Some(other) => { - builder.push_matching(other); - } - None => { - self.error(diagnostics::UnterminatedString(self.unterminated_range())); - return Kind::Undetermined; - } - } - } - } - - /* ---------- utils ---------- */ - - /// Identifier `UnicodeEscapeSequence` - /// \u `Hex4Digits` - /// \u{ `CodePoint` } - fn identifier_unicode_escape_sequence( - &mut self, - builder: &mut AutoCow<'a>, - check_identifier_start: bool, - ) { - let start = self.offset(); - if self.current.chars.next() != Some('u') { - let range = Span::new(start, self.offset()); - self.error(diagnostics::UnicodeEscapeSequence(range)); - return; - } - - let value = match self.peek() { - Some('{') => self.unicode_code_point(), - _ => self.surrogate_pair(), - }; - - let Some(value) = value else { - let range = Span::new(start, self.offset()); - self.error(diagnostics::UnicodeEscapeSequence(range)); - return; - }; - - // For Identifiers, surrogate pair is an invalid grammar, e.g. `var \uD800\uDEA7`. - let ch = match value { - SurrogatePair::Astral(..) 
| SurrogatePair::HighLow(..) => { - let range = Span::new(start, self.offset()); - self.error(diagnostics::UnicodeEscapeSequence(range)); - return; - } - SurrogatePair::CodePoint(code_point) => { - if let Ok(ch) = char::try_from(code_point) { - ch - } else { - let range = Span::new(start, self.offset()); - self.error(diagnostics::UnicodeEscapeSequence(range)); - return; - } - } - }; - - let is_valid = - if check_identifier_start { is_identifier_start(ch) } else { is_identifier_part(ch) }; - - if !is_valid { - self.error(diagnostics::InvalidCharacter(ch, self.current_offset())); - return; - } - - builder.push_different(ch); - } - - /// String `UnicodeEscapeSequence` - /// \u `Hex4Digits` - /// \u `Hex4Digits` \u `Hex4Digits` - /// \u{ `CodePoint` } - fn string_unicode_escape_sequence( - &mut self, - text: &mut String<'a>, - is_valid_escape_sequence: &mut bool, - ) { - let value = match self.peek() { - Some('{') => self.unicode_code_point(), - _ => self.surrogate_pair(), - }; - - let Some(value) = value else { - // error raised within the parser by `diagnostics::TemplateLiteral` - *is_valid_escape_sequence = false; - return; - }; - - // For strings and templates, surrogate pairs are valid grammar, e.g. 
`"\uD83D\uDE00" === 😀` - // values are interpreted as is if they fall out of range - match value { - SurrogatePair::CodePoint(code_point) | SurrogatePair::Astral(code_point) => { - if let Ok(ch) = char::try_from(code_point) { - text.push(ch); - } else { - text.push_str("\\u"); - text.push_str(format!("{code_point:x}").as_str()); - } - } - SurrogatePair::HighLow(high, low) => { - text.push_str("\\u"); - text.push_str(format!("{high:x}").as_str()); - text.push_str("\\u"); - text.push_str(format!("{low:x}").as_str()); - } - } - } - - fn unicode_code_point(&mut self) -> Option { - if !self.next_eq('{') { - return None; - } - let value = self.code_point()?; - if !self.next_eq('}') { - return None; - } - Some(SurrogatePair::CodePoint(value)) - } - - fn hex_4_digits(&mut self) -> Option { - let mut value = 0; - for _ in 0..4 { - value = (value << 4) | self.hex_digit()?; - } - Some(value) - } - - fn hex_digit(&mut self) -> Option { - let value = match self.peek() { - Some(c @ '0'..='9') => c as u32 - '0' as u32, - Some(c @ 'a'..='f') => 10 + (c as u32 - 'a' as u32), - Some(c @ 'A'..='F') => 10 + (c as u32 - 'A' as u32), - _ => return None, - }; - self.current.chars.next(); - Some(value) - } - - fn code_point(&mut self) -> Option { - let mut value = self.hex_digit()?; - while let Some(next) = self.hex_digit() { - value = (value << 4) | next; - if value > 0x0010_FFFF { - return None; - } - } - Some(value) - } - - /// Surrogate pairs - /// See background info: - /// * `https://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae` - /// * `https://mathiasbynens.be/notes/javascript-identifiers-es6` - fn surrogate_pair(&mut self) -> Option { - let high = self.hex_4_digits()?; - // The first code unit of a surrogate pair is always in the range from 0xD800 to 0xDBFF, and is called a high surrogate or a lead surrogate. 
- if !((0xD800..=0xDBFF).contains(&high) - && self.peek() == Some('\\') - && self.peek2() == Some('u')) - { - return Some(SurrogatePair::CodePoint(high)); - } - - self.current.chars.next(); - self.current.chars.next(); - - let low = self.hex_4_digits()?; - - // The second code unit of a surrogate pair is always in the range from 0xDC00 to 0xDFFF, and is called a low surrogate or a trail surrogate. - if !(0xDC00..=0xDFFF).contains(&low) { - return Some(SurrogatePair::HighLow(high, low)); - } - - // `https://tc39.es/ecma262/#sec-utf16decodesurrogatepair` - let astral_code_point = (high - 0xD800) * 0x400 + low - 0xDC00 + 0x10000; - - Some(SurrogatePair::Astral(astral_code_point)) - } - - // EscapeSequence :: - fn read_string_escape_sequence( - &mut self, - text: &mut String<'a>, - in_template: bool, - is_valid_escape_sequence: &mut bool, - ) { - match self.current.chars.next() { - None => { - self.error(diagnostics::UnterminatedString(self.unterminated_range())); - } - Some(c) => match c { - // \ LineTerminatorSequence - // LineTerminatorSequence :: - // - // [lookahead ≠ ] - // - // - // - LF | LS | PS => {} - CR => { - self.next_eq(LF); - } - // SingleEscapeCharacter :: one of - // ' " \ b f n r t v - '\'' | '"' | '\\' => text.push(c), - 'b' => text.push('\u{8}'), - 'f' => text.push(FF), - 'n' => text.push(LF), - 'r' => text.push(CR), - 't' => text.push(TAB), - 'v' => text.push(VT), - // HexEscapeSequence - 'x' => { - self.hex_digit() - .and_then(|value1| { - let value2 = self.hex_digit()?; - Some((value1, value2)) - }) - .map(|(value1, value2)| (value1 << 4) | value2) - .and_then(|value| char::try_from(value).ok()) - .map_or_else( - || { - *is_valid_escape_sequence = false; - }, - |c| { - text.push(c); - }, - ); - } - // UnicodeEscapeSequence - 'u' => { - self.string_unicode_escape_sequence(text, is_valid_escape_sequence); - } - // 0 [lookahead ∉ DecimalDigit] - '0' if !self.peek().is_some_and(|c| c.is_ascii_digit()) => text.push('\0'), - // Section 12.9.4 String 
Literals - // LegacyOctalEscapeSequence - // NonOctalDecimalEscapeSequence - a @ '0'..='7' if !in_template => { - let mut num = String::new_in(self.allocator); - num.push(a); - match a { - '4'..='7' => { - if matches!(self.peek(), Some('0'..='7')) { - let b = self.consume_char(); - num.push(b); - } - } - '0'..='3' => { - if matches!(self.peek(), Some('0'..='7')) { - let b = self.consume_char(); - num.push(b); - if matches!(self.peek(), Some('0'..='7')) { - let c = self.consume_char(); - num.push(c); - } - } - } - _ => {} - } - - let value = - char::from_u32(u32::from_str_radix(num.as_str(), 8).unwrap()).unwrap(); - text.push(value); - } - '0' if in_template && self.peek().is_some_and(|c| c.is_ascii_digit()) => { - self.current.chars.next(); - // error raised within the parser by `diagnostics::TemplateLiteral` - *is_valid_escape_sequence = false; - } - // NotEscapeSequence :: DecimalDigit but not 0 - '1'..='9' if in_template => { - // error raised within the parser by `diagnostics::TemplateLiteral` - *is_valid_escape_sequence = false; - } - other => { - // NonOctalDecimalEscapeSequence \8 \9 in strict mode - text.push(other); - } - }, - } - } } - -enum SurrogatePair { - // valid \u Hex4Digits \u Hex4Digits - Astral(u32), - // valid \u Hex4Digits - CodePoint(u32), - // invalid \u Hex4Digits \u Hex4Digits - HighLow(u32, u32), -} - -#[allow(clippy::unnecessary_safety_comment)] -/// Handle next byte of source. -/// SAFETY: -/// * Lexer must not be at end of file. -/// * `byte` must be next byte of source code, corresponding to current position -/// of `lexer.current.chars`. -/// * Only `BYTE_HANDLERS` for ASCII characters may use the `ascii_byte_handler!()` macro. -unsafe fn handle_byte(byte: u8, lexer: &mut Lexer) -> Kind { - BYTE_HANDLERS[byte as usize](lexer) -} - -type ByteHandler = unsafe fn(&mut Lexer<'_>) -> Kind; - -/// Lookup table mapping any incoming byte to a handler function defined below. 
-/// -#[rustfmt::skip] -static BYTE_HANDLERS: [ByteHandler; 256] = [ -// 0 1 2 3 4 5 6 7 8 9 A B C D E F // - ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, SPS, LIN, SPS, SPS, LIN, ERR, ERR, // 0 - ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, // 1 - SPS, EXL, QOT, HAS, IDT, PRC, AMP, QOT, PNO, PNC, ATR, PLS, COM, MIN, PRD, SLH, // 2 - ZER, DIG, DIG, DIG, DIG, DIG, DIG, DIG, DIG, DIG, COL, SEM, LSS, EQL, GTR, QST, // 3 - AT_, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, // 4 - IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, BTO, ESC, BTC, CRT, IDT, // 5 - TPL, L_A, L_B, L_C, L_D, L_E, L_F, L_G, IDT, L_I, IDT, L_K, L_L, L_M, L_N, L_O, // 6 - L_P, IDT, L_R, L_S, L_T, L_U, L_V, L_W, IDT, L_Y, IDT, BEO, PIP, BEC, TLD, ERR, // 7 - UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // 8 - UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // 9 - UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // A - UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // B - UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // C - UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // D - UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // E - UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // F -]; - -#[allow(clippy::unnecessary_safety_comment)] -/// Macro for defining byte handler for an ASCII character. -/// -/// In addition to defining a `const` for the handler, it also asserts that lexer -/// is not at end of file, and that next char is ASCII. -/// Where the handler is for an ASCII character, these assertions are self-evidently true. 
-/// -/// These assertions produce no runtime code, but hint to the compiler that it can assume that -/// next char is ASCII, and it uses that information to optimize the rest of the handler. -/// e.g. `lexer.current.chars.next()` becomes just a single assembler instruction. -/// Without the assertions, the compiler is unable to deduce the next char is ASCII, due to -/// the indirection of the `BYTE_HANDLERS` jump table. -/// -/// These assertions are unchecked (i.e. won't panic) and will cause UB if they're incorrect. -/// -/// SAFETY: Only use this macro to define byte handlers for ASCII characters. -/// -/// ``` -/// ascii_byte_handler!(SPS(lexer) { -/// lexer.consume_char(); -/// Kind::WhiteSpace -/// }); -/// ``` -/// -/// expands to: -/// -/// ``` -/// const SPS: ByteHandler = |lexer| { -/// unsafe { -/// use ::assert_unchecked::assert_unchecked; -/// let s = lexer.current.chars.as_str(); -/// assert_unchecked!(!s.is_empty()); -/// assert_unchecked!(s.as_bytes()[0] < 128); -/// } -/// lexer.consume_char(); -/// Kind::WhiteSpace -/// }; -/// ``` -macro_rules! ascii_byte_handler { - ($id:ident($lex:ident) $body:expr) => { - const $id: ByteHandler = |$lex| { - // SAFETY: This macro is only used for ASCII characters - unsafe { - use assert_unchecked::assert_unchecked; - let s = $lex.current.chars.as_str(); - assert_unchecked!(!s.is_empty()); - assert_unchecked!(s.as_bytes()[0] < 128); - } - $body - }; - }; -} - -// `\0` `\1` etc -ascii_byte_handler!(ERR(lexer) { - let c = lexer.consume_char(); - lexer.error(diagnostics::InvalidCharacter(c, lexer.unterminated_range())); - Kind::Undetermined -}); - -// -ascii_byte_handler!(SPS(lexer) { - lexer.consume_char(); - Kind::Skip -}); - -// '\r' '\n' -ascii_byte_handler!(LIN(lexer) { - lexer.consume_char(); - lexer.current.token.is_on_new_line = true; - Kind::Skip -}); - -// ! 
-ascii_byte_handler!(EXL(lexer) { - lexer.consume_char(); - if lexer.next_eq('=') { - if lexer.next_eq('=') { - Kind::Neq2 - } else { - Kind::Neq - } - } else { - Kind::Bang - } -}); - -// ' " -ascii_byte_handler!(QOT(lexer) { - let c = lexer.consume_char(); - if lexer.context == LexerContext::JsxAttributeValue { - lexer.read_jsx_string_literal(c) - } else { - lexer.read_string_literal(c) - } -}); - -// # -ascii_byte_handler!(HAS(lexer) { - lexer.consume_char(); - // HashbangComment :: - // `#!` SingleLineCommentChars? - if lexer.current.token.start == 0 && lexer.next_eq('!') { - lexer.read_hashbang_comment() - } else { - lexer.private_identifier() - } -}); - -// `A..=Z`, `a..=z` (except special cases below), `_`, `$` -ascii_byte_handler!(IDT(lexer) { - lexer.identifier_name_handler(); - Kind::Ident -}); - -// % -ascii_byte_handler!(PRC(lexer) { - lexer.consume_char(); - if lexer.next_eq('=') { - Kind::PercentEq - } else { - Kind::Percent - } -}); - -// & -ascii_byte_handler!(AMP(lexer) { - lexer.consume_char(); - if lexer.next_eq('&') { - if lexer.next_eq('=') { - Kind::Amp2Eq - } else { - Kind::Amp2 - } - } else if lexer.next_eq('=') { - Kind::AmpEq - } else { - Kind::Amp - } -}); - -// ( -ascii_byte_handler!(PNO(lexer) { - lexer.consume_char(); - Kind::LParen -}); - -// ) -ascii_byte_handler!(PNC(lexer) { - lexer.consume_char(); - Kind::RParen -}); - -// * -ascii_byte_handler!(ATR(lexer) { - lexer.consume_char(); - if lexer.next_eq('*') { - if lexer.next_eq('=') { - Kind::Star2Eq - } else { - Kind::Star2 - } - } else if lexer.next_eq('=') { - Kind::StarEq - } else { - Kind::Star - } -}); - -// + -ascii_byte_handler!(PLS(lexer) { - lexer.consume_char(); - if lexer.next_eq('+') { - Kind::Plus2 - } else if lexer.next_eq('=') { - Kind::PlusEq - } else { - Kind::Plus - } -}); - -// , -ascii_byte_handler!(COM(lexer) { - lexer.consume_char(); - Kind::Comma -}); - -// - -ascii_byte_handler!(MIN(lexer) { - lexer.consume_char(); - lexer.read_minus().unwrap_or_else(|| 
lexer.skip_single_line_comment()) -}); - -// . -ascii_byte_handler!(PRD(lexer) { - lexer.consume_char(); - lexer.read_dot() -}); - -// / -ascii_byte_handler!(SLH(lexer) { - lexer.consume_char(); - match lexer.peek() { - Some('/') => { - lexer.current.chars.next(); - lexer.skip_single_line_comment() - } - Some('*') => { - lexer.current.chars.next(); - lexer.skip_multi_line_comment() - } - _ => { - // regex is handled separately, see `next_regex` - if lexer.next_eq('=') { - Kind::SlashEq - } else { - Kind::Slash - } - } - } -}); - -// 0 -ascii_byte_handler!(ZER(lexer) { - lexer.consume_char(); - lexer.read_zero() -}); - -// 1 to 9 -ascii_byte_handler!(DIG(lexer) { - lexer.consume_char(); - lexer.decimal_literal_after_first_digit() -}); - -// : -ascii_byte_handler!(COL(lexer) { - lexer.consume_char(); - Kind::Colon -}); - -// ; -ascii_byte_handler!(SEM(lexer) { - lexer.consume_char(); - Kind::Semicolon -}); - -// < -ascii_byte_handler!(LSS(lexer) { - lexer.consume_char(); - lexer.read_left_angle().unwrap_or_else(|| lexer.skip_single_line_comment()) -}); - -// = -ascii_byte_handler!(EQL(lexer) { - lexer.consume_char(); - if lexer.next_eq('=') { - if lexer.next_eq('=') { - Kind::Eq3 - } else { - Kind::Eq2 - } - } else if lexer.next_eq('>') { - Kind::Arrow - } else { - Kind::Eq - } -}); - -// > -ascii_byte_handler!(GTR(lexer) { - lexer.consume_char(); - // `>=` is re-lexed with [Lexer::next_jsx_child] - Kind::RAngle -}); - -// ? 
-ascii_byte_handler!(QST(lexer) { - lexer.consume_char(); - if lexer.next_eq('?') { - if lexer.next_eq('=') { - Kind::Question2Eq - } else { - Kind::Question2 - } - } else if lexer.peek() == Some('.') { - // parse `?.1` as `?` `.1` - if lexer.peek2().is_some_and(|c| c.is_ascii_digit()) { - Kind::Question - } else { - lexer.current.chars.next(); - Kind::QuestionDot - } - } else { - Kind::Question - } -}); - -// @ -ascii_byte_handler!(AT_(lexer) { - lexer.consume_char(); - Kind::At -}); - -// [ -ascii_byte_handler!(BTO(lexer) { - lexer.consume_char(); - Kind::LBrack -}); - -// \ -ascii_byte_handler!(ESC(lexer) { - let mut builder = AutoCow::new(lexer); - lexer.consume_char(); - builder.force_allocation_without_current_ascii_char(lexer); - lexer.identifier_unicode_escape_sequence(&mut builder, true); - let text = lexer.identifier_name(builder); - Kind::match_keyword(text) -}); - -// ] -ascii_byte_handler!(BTC(lexer) { - lexer.consume_char(); - Kind::RBrack -}); - -// ^ -ascii_byte_handler!(CRT(lexer) { - lexer.consume_char(); - if lexer.next_eq('=') { - Kind::CaretEq - } else { - Kind::Caret - } -}); - -// ` -ascii_byte_handler!(TPL(lexer) { - lexer.consume_char(); - lexer.read_template_literal(Kind::TemplateHead, Kind::NoSubstitutionTemplate) -}); - -// { -ascii_byte_handler!(BEO(lexer) { - lexer.consume_char(); - Kind::LCurly -}); - -// | -ascii_byte_handler!(PIP(lexer) { - lexer.consume_char(); - if lexer.next_eq('|') { - if lexer.next_eq('=') { - Kind::Pipe2Eq - } else { - Kind::Pipe2 - } - } else if lexer.next_eq('=') { - Kind::PipeEq - } else { - Kind::Pipe - } -}); - -// } -ascii_byte_handler!(BEC(lexer) { - lexer.consume_char(); - Kind::RCurly -}); - -// ~ -ascii_byte_handler!(TLD(lexer) { - lexer.consume_char(); - Kind::Tilde -}); - -ascii_byte_handler!(L_A(lexer) match &lexer.identifier_name_handler()[1..] 
{ - "wait" => Kind::Await, - "sync" => Kind::Async, - "bstract" => Kind::Abstract, - "ccessor" => Kind::Accessor, - "ny" => Kind::Any, - "s" => Kind::As, - "ssert" => Kind::Assert, - "sserts" => Kind::Asserts, - _ => Kind::Ident, -}); - -ascii_byte_handler!(L_B(lexer) match &lexer.identifier_name_handler()[1..] { - "reak" => Kind::Break, - "oolean" => Kind::Boolean, - "igint" => Kind::BigInt, - _ => Kind::Ident, -}); - -ascii_byte_handler!(L_C(lexer) match &lexer.identifier_name_handler()[1..] { - "onst" => Kind::Const, - "lass" => Kind::Class, - "ontinue" => Kind::Continue, - "atch" => Kind::Catch, - "ase" => Kind::Case, - "onstructor" => Kind::Constructor, - _ => Kind::Ident, -}); - -ascii_byte_handler!(L_D(lexer) match &lexer.identifier_name_handler()[1..] { - "o" => Kind::Do, - "elete" => Kind::Delete, - "eclare" => Kind::Declare, - "efault" => Kind::Default, - "ebugger" => Kind::Debugger, - _ => Kind::Ident, -}); - -ascii_byte_handler!(L_E(lexer) match &lexer.identifier_name_handler()[1..] { - "lse" => Kind::Else, - "num" => Kind::Enum, - "xport" => Kind::Export, - "xtends" => Kind::Extends, - _ => Kind::Ident, -}); - -ascii_byte_handler!(L_F(lexer) match &lexer.identifier_name_handler()[1..] { - "unction" => Kind::Function, - "alse" => Kind::False, - "or" => Kind::For, - "inally" => Kind::Finally, - "rom" => Kind::From, - _ => Kind::Ident, -}); - -ascii_byte_handler!(L_G(lexer) match &lexer.identifier_name_handler()[1..] { - "et" => Kind::Get, - "lobal" => Kind::Global, - _ => Kind::Ident, -}); - -ascii_byte_handler!(L_I(lexer) match &lexer.identifier_name_handler()[1..] { - "f" => Kind::If, - "nstanceof" => Kind::Instanceof, - "n" => Kind::In, - "mplements" => Kind::Implements, - "mport" => Kind::Import, - "nfer" => Kind::Infer, - "nterface" => Kind::Interface, - "ntrinsic" => Kind::Intrinsic, - "s" => Kind::Is, - _ => Kind::Ident, -}); - -ascii_byte_handler!(L_K(lexer) match &lexer.identifier_name_handler()[1..] 
{ - "eyof" => Kind::KeyOf, - _ => Kind::Ident, -}); - -ascii_byte_handler!(L_L(lexer) match &lexer.identifier_name_handler()[1..] { - "et" => Kind::Let, - _ => Kind::Ident, -}); - -ascii_byte_handler!(L_M(lexer) match &lexer.identifier_name_handler()[1..] { - "eta" => Kind::Meta, - "odule" => Kind::Module, - _ => Kind::Ident, -}); - -ascii_byte_handler!(L_N(lexer) match &lexer.identifier_name_handler()[1..] { - "ull" => Kind::Null, - "ew" => Kind::New, - "umber" => Kind::Number, - "amespace" => Kind::Namespace, - "ever" => Kind::Never, - _ => Kind::Ident, -}); - -ascii_byte_handler!(L_O(lexer) match &lexer.identifier_name_handler()[1..] { - "f" => Kind::Of, - "bject" => Kind::Object, - "ut" => Kind::Out, - "verride" => Kind::Override, - _ => Kind::Ident, -}); - -ascii_byte_handler!(L_P(lexer) match &lexer.identifier_name_handler()[1..] { - "ackage" => Kind::Package, - "rivate" => Kind::Private, - "rotected" => Kind::Protected, - "ublic" => Kind::Public, - _ => Kind::Ident, -}); - -ascii_byte_handler!(L_R(lexer) match &lexer.identifier_name_handler()[1..] { - "eturn" => Kind::Return, - "equire" => Kind::Require, - "eadonly" => Kind::Readonly, - _ => Kind::Ident, -}); - -ascii_byte_handler!(L_S(lexer) match &lexer.identifier_name_handler()[1..] { - "et" => Kind::Set, - "uper" => Kind::Super, - "witch" => Kind::Switch, - "tatic" => Kind::Static, - "ymbol" => Kind::Symbol, - "tring" => Kind::String, - "atisfies" => Kind::Satisfies, - _ => Kind::Ident, -}); - -ascii_byte_handler!(L_T(lexer) match &lexer.identifier_name_handler()[1..] { - "his" => Kind::This, - "rue" => Kind::True, - "hrow" => Kind::Throw, - "ry" => Kind::Try, - "ypeof" => Kind::Typeof, - "arget" => Kind::Target, - "ype" => Kind::Type, - _ => Kind::Ident, -}); - -ascii_byte_handler!(L_U(lexer) match &lexer.identifier_name_handler()[1..] 
{
-    "ndefined" => Kind::Undefined,
-    "sing" => Kind::Using,
-    "nique" => Kind::Unique,
-    "nknown" => Kind::Unknown,
-    _ => Kind::Ident,
-});
-
-ascii_byte_handler!(L_V(lexer) match &lexer.identifier_name_handler()[1..] {
-    "ar" => Kind::Var,
-    "oid" => Kind::Void,
-    _ => Kind::Ident,
-});
-
-ascii_byte_handler!(L_W(lexer) match &lexer.identifier_name_handler()[1..] {
-    "hile" => Kind::While,
-    "ith" => Kind::With,
-    _ => Kind::Ident,
-});
-
-ascii_byte_handler!(L_Y(lexer) match &lexer.identifier_name_handler()[1..] {
-    "ield" => Kind::Yield,
-    _ => Kind::Ident,
-});
-
-// Non-ASCII characters.
-// NB: Must not use `ascii_byte_handler!()` macro, as this handler is for non-ASCII chars.
-#[allow(clippy::redundant_closure_for_method_calls)]
-const UNI: ByteHandler = |lexer| lexer.unicode_char_handler();
diff --git a/crates/oxc_parser/src/lexer/numeric.rs b/crates/oxc_parser/src/lexer/numeric.rs
new file mode 100644
index 000000000..8dcc27d05
--- /dev/null
+++ b/crates/oxc_parser/src/lexer/numeric.rs
@@ -0,0 +1,199 @@
+use super::{Kind, Lexer, Span};
+use crate::diagnostics;
+
+use oxc_syntax::identifier::is_identifier_start;
+
+impl<'a> Lexer<'a> {
+    /// 12.9.3 Numeric Literals with `0` prefix
+    ///
+    /// Dispatches on the character that follows the `0`: a radix prefix (`b`, `o`, `x`),
+    /// an exponent, a decimal point, an `n` suffix, or another digit (legacy octal form).
+    /// Presumably the leading `0` itself has already been consumed by the caller.
+    pub(super) fn read_zero(&mut self) -> Kind {
+        match self.peek() {
+            Some('b' | 'B') => self.read_non_decimal(Kind::Binary),
+            Some('o' | 'O') => self.read_non_decimal(Kind::Octal),
+            Some('x' | 'X') => self.read_non_decimal(Kind::Hex),
+            Some('e' | 'E') => {
+                self.current.chars.next();
+                self.read_decimal_exponent()
+            }
+            Some('.') => {
+                self.current.chars.next();
+                self.decimal_literal_after_decimal_point_after_digits()
+            }
+            Some('n') => {
+                self.current.chars.next();
+                self.check_after_numeric_literal(Kind::Decimal)
+            }
+            Some(n) if n.is_ascii_digit() => self.read_legacy_octal(),
+            _ => self.check_after_numeric_literal(Kind::Decimal),
+        }
+    }
+
+    /// Lex the remainder of a decimal literal whose first digit is already consumed:
+    /// the remaining digits, then either a fraction, an `n` suffix, or an optional exponent.
+    pub(super) fn decimal_literal_after_first_digit(&mut self) -> Kind {
+        self.read_decimal_digits_after_first_digit();
+        if self.next_eq('.') {
+            return self.decimal_literal_after_decimal_point_after_digits();
+        } else if self.next_eq('n') {
+            return self.check_after_numeric_literal(Kind::Decimal);
+        }
+
+        // Without an exponent the literal is a plain decimal.
+        let kind = self.optional_exponent().unwrap_or(Kind::Decimal);
+        self.check_after_numeric_literal(kind)
+    }
+
+    /// Lex the body of a binary / octal / hex literal of the given `kind`.
+    ///
+    /// Consumes the radix prefix character, requires at least one digit accepted by
+    /// `kind.matches_number_char`, allows `_` separators as long as each one is followed
+    /// by a digit, and accepts an optional trailing `n`.
+    /// Reports an unexpected-token error and returns `Kind::Undetermined` otherwise.
+    fn read_non_decimal(&mut self, kind: Kind) -> Kind {
+        // Skip the radix prefix character (`b`, `o`, `x` or upper-case variant).
+        self.current.chars.next();
+
+        if self.peek().is_some_and(|c| kind.matches_number_char(c)) {
+            self.current.chars.next();
+        } else {
+            self.unexpected_err();
+            return Kind::Undetermined;
+        }
+
+        while let Some(c) = self.peek() {
+            match c {
+                '_' => {
+                    self.current.chars.next();
+                    // A separator must be followed by a digit.
+                    if self.peek().is_some_and(|c| kind.matches_number_char(c)) {
+                        self.current.chars.next();
+                    } else {
+                        self.unexpected_err();
+                        return Kind::Undetermined;
+                    }
+                }
+                c if kind.matches_number_char(c) => {
+                    self.current.chars.next();
+                }
+                _ => break,
+            }
+        }
+        if self.peek() == Some('n') {
+            self.current.chars.next();
+        }
+        self.check_after_numeric_literal(kind)
+    }
+
+    /// Lex a literal that starts with `0` followed by another decimal digit.
+    ///
+    /// The literal stays `Kind::Octal` while only `0`..=`7` are seen; an `8` or `9`
+    /// turns it into `Kind::Decimal` (e.g. `08`, `09`), in which case a fraction or
+    /// an exponent may follow.
+    fn read_legacy_octal(&mut self) -> Kind {
+        let mut kind = Kind::Octal;
+        loop {
+            match self.peek() {
+                Some('0'..='7') => {
+                    self.current.chars.next();
+                }
+                Some('8'..='9') => {
+                    self.current.chars.next();
+                    kind = Kind::Decimal;
+                }
+                _ => break,
+            }
+        }
+
+        match self.peek() {
+            // allow 08.5 and 09.5
+            Some('.') if kind == Kind::Decimal => {
+                self.current.chars.next();
+                self.decimal_literal_after_decimal_point_after_digits()
+            }
+            // allow 08e1 and 09e1, as well as 08E1 and 09E1
+            // (`ExponentIndicator` is either `e` or `E`)
+            Some('e' | 'E') if kind == Kind::Decimal => {
+                self.current.chars.next();
+                self.read_decimal_exponent()
+            }
+            _ => self.check_after_numeric_literal(kind),
+        }
+    }
+
+    /// Lex the optional sign and the digits of an exponent.
+    /// Every caller consumes the `e` / `E` before calling this.
+    ///
+    /// Returns `Kind::NegativeExponential` for a `-` sign,
+    /// `Kind::PositiveExponential` otherwise.
+    fn read_decimal_exponent(&mut self) -> Kind {
+        let kind = match self.peek() {
+            Some('-') => {
+                self.current.chars.next();
+                Kind::NegativeExponential
+            }
+            Some('+') => {
+                self.current.chars.next();
+                Kind::PositiveExponential
+            }
+            _ => Kind::PositiveExponential,
+        };
+        self.read_decimal_digits();
+        kind
+    }
+
+    /// Lex one or more decimal digits (with `_` separators).
+    /// Reports an unexpected-token error if the next character is not a digit.
+    fn read_decimal_digits(&mut self) {
+        if self.peek().is_some_and(|c| c.is_ascii_digit()) {
+            self.current.chars.next();
+        } else {
+            self.unexpected_err();
+            return;
+        }
+
+        self.read_decimal_digits_after_first_digit();
+    }
+
+    /// Lex zero or more further decimal digits; each `_` separator must be
+    /// followed by a digit, otherwise an unexpected-token error is reported.
+    fn read_decimal_digits_after_first_digit(&mut self) {
+        while let Some(c) = self.peek() {
+            match c {
+                '_' => {
+                    self.current.chars.next();
+                    if self.peek().is_some_and(|c| c.is_ascii_digit()) {
+                        self.current.chars.next();
+                    } else {
+                        self.unexpected_err();
+                        return;
+                    }
+                }
+                '0'..='9' => {
+                    self.current.chars.next();
+                }
+                _ => break,
+            }
+        }
+    }
+
+    /// Lex the fraction of a literal that starts with `.` (e.g. `.5`):
+    /// at least one digit is required, an exponent is optional. Yields `Kind::Float`.
+    pub(super) fn decimal_literal_after_decimal_point(&mut self) -> Kind {
+        self.read_decimal_digits();
+        self.optional_exponent();
+        self.check_after_numeric_literal(Kind::Float)
+    }
+
+    /// Lex the fraction of a literal that already has integer digits (e.g. `1.` or `1.5`):
+    /// both the fraction digits and the exponent are optional. Yields `Kind::Float`.
+    fn decimal_literal_after_decimal_point_after_digits(&mut self) -> Kind {
+        self.optional_decimal_digits();
+        self.optional_exponent();
+        self.check_after_numeric_literal(Kind::Float)
+    }
+
+    /// Lex decimal digits if the next character is a digit; do nothing otherwise.
+    fn optional_decimal_digits(&mut self) {
+        if self.peek().is_some_and(|c| c.is_ascii_digit()) {
+            self.current.chars.next();
+        } else {
+            return;
+        }
+        self.read_decimal_digits_after_first_digit();
+    }
+
+    /// Lex an exponent if the next character is `e` / `E`.
+    /// Returns the exponent kind, or `None` when there is no exponent.
+    fn optional_exponent(&mut self) -> Option<Kind> {
+        if matches!(self.peek(), Some('e' | 'E')) {
+            self.current.chars.next();
+            return Some(self.read_decimal_exponent());
+        }
+        None
+    }
+
+    /// Validate the character that follows a numeric literal.
+    ///
+    /// Returns `kind` unchanged when the literal is followed by end of input or by a
+    /// character that is neither a decimal digit nor an identifier start. Otherwise the
+    /// offending character and any following identifier-start characters are consumed,
+    /// `InvalidNumberEnd` is reported for that span, and `Kind::Undetermined` is returned.
+    fn check_after_numeric_literal(&mut self, kind: Kind) -> Kind {
+        let offset = self.offset();
+        // The SourceCharacter immediately following a NumericLiteral must not be an IdentifierStart or DecimalDigit.
+        let c = self.peek();
+        if c.is_none() || c.is_some_and(|ch| !ch.is_ascii_digit() && !is_identifier_start(ch)) {
+            return kind;
+        }
+        self.current.chars.next();
+        while let Some(c) = self.peek() {
+            if is_identifier_start(c) {
+                self.current.chars.next();
+            } else {
+                break;
+            }
+        }
+        self.error(diagnostics::InvalidNumberEnd(Span::new(offset, self.offset())));
+        Kind::Undetermined
+    }
+}
diff --git a/crates/oxc_parser/src/lexer/punctuation.rs b/crates/oxc_parser/src/lexer/punctuation.rs
new file mode 100644
index 000000000..e119a45b5
--- /dev/null
+++ b/crates/oxc_parser/src/lexer/punctuation.rs
@@ -0,0 +1,83 @@
+use super::{Kind, Lexer, Token};
+
+impl<'a> Lexer<'a> {
+    /// Section 12.8 Punctuators
+    ///
+    /// Lexes `...`, a numeric literal starting with a decimal point (e.g. `.5`),
+    /// or a single `.`.
+    pub(super) fn read_dot(&mut self) -> Kind {
+        if self.peek() == Some('.') && self.peek2() == Some('.') {
+            self.current.chars.next();
+            self.current.chars.next();
+            return Kind::Dot3;
+        }
+        if self.peek().is_some_and(|c| c.is_ascii_digit()) {
+            self.decimal_literal_after_decimal_point()
+        } else {
+            Kind::Dot
+        }
+    }
+
+    /// returns None for `SingleLineHTMLCloseComment` `-->` in script mode
+    /// (only when the current token is on a new line);
+    /// otherwise returns `--`, `-=` or `-`.
+    pub(super) fn read_minus(&mut self) -> Option<Kind> {
+        if self.next_eq('-') {
+            // SingleLineHTMLCloseComment `-->` in script mode
+            if self.current.token.is_on_new_line
+                && self.source_type.is_script()
+                && self.next_eq('>')
+            {
+                None
+            } else {
+                Some(Kind::Minus2)
+            }
+        } else if self.next_eq('=') {
+            Some(Kind::MinusEq)
+        } else {
+            Some(Kind::Minus)
+        }
+    }
+
+    /// Lex the longest punctuator starting with `>`, drop any lookahead tokens
+    /// and return the finished token.
+    pub(crate) fn next_right_angle(&mut self) -> Token {
+        let kind = self.read_right_angle();
+        self.lookahead.clear();
+        self.finish_next(kind)
+    }
+
+    /// Distinguish `>`, `>=`, `>>`, `>>=`, `>>>` and `>>>=`.
+    /// Presumably the leading `>` is already consumed, as one further `>` yields `ShiftRight`.
+    fn read_right_angle(&mut self) -> Kind {
+        if self.next_eq('>') {
+            if self.next_eq('>') {
+                if self.next_eq('=') {
+                    Kind::ShiftRight3Eq
+                } else {
+                    Kind::ShiftRight3
+                }
+            } else if self.next_eq('=') {
+                Kind::ShiftRightEq
+            } else {
+                Kind::ShiftRight
+            }
+        } else if self.next_eq('=') {
+            Kind::GtEq
+        } else {
+            Kind::RAngle
+        }
+    }
+}
diff --git
a/crates/oxc_parser/src/lexer/regex.rs b/crates/oxc_parser/src/lexer/regex.rs
new file mode 100644
index 000000000..084e1175d
--- /dev/null
+++ b/crates/oxc_parser/src/lexer/regex.rs
@@ -0,0 +1,78 @@
+use super::{Kind, Lexer, RegExpFlags, Token};
+use crate::diagnostics;
+
+use oxc_syntax::identifier::is_line_terminator;
+
+impl<'a> Lexer<'a> {
+    /// Re-tokenize the current `/` or `/=` and return `RegExp`
+    /// See Section 12:
+    ///   The `InputElementRegExp` goal symbol is used in all syntactic grammar contexts
+    ///   where a `RegularExpressionLiteral` is permitted
+    /// Which means the parser needs to re-tokenize on `PrimaryExpression`,
+    /// `RegularExpressionLiteral` only appear on the right hand side of `PrimaryExpression`
+    ///
+    /// Returns the finished token, the offset where the pattern ends, and the parsed flags.
+    /// `kind` must be `Kind::Slash` or `Kind::SlashEq`; anything else is a caller bug.
+    pub(crate) fn next_regex(&mut self, kind: Kind) -> (Token, u32, RegExpFlags) {
+        // Move the token start back over the already lexed `/` or `/=`.
+        self.current.token.start = self.offset()
+            - match kind {
+                Kind::Slash => 1,
+                Kind::SlashEq => 2,
+                _ => unreachable!(),
+            };
+        let (pattern_end, flags) = self.read_regex();
+        self.lookahead.clear();
+        let token = self.finish_next(Kind::RegExp);
+        (token, pattern_end, flags)
+    }
+
+    /// 12.9.5 Regular Expression Literals
+    ///
+    /// Scans up to the closing `/` (a `/` inside a character class or right after a
+    /// backslash does not terminate the pattern), then reads the flags.
+    /// Returns the offset of the end of the pattern and the flags.
+    /// On end of input or a line terminator, `UnterminatedRegExp` is reported and
+    /// empty flags are returned.
+    fn read_regex(&mut self) -> (u32, RegExpFlags) {
+        let mut in_escape = false;
+        let mut in_character_class = false;
+        loop {
+            match self.current.chars.next() {
+                None => {
+                    self.error(diagnostics::UnterminatedRegExp(self.unterminated_range()));
+                    return (self.offset(), RegExpFlags::empty());
+                }
+                Some(c) if is_line_terminator(c) => {
+                    self.error(diagnostics::UnterminatedRegExp(self.unterminated_range()));
+                    // Exclude the line terminator itself from the pattern.
+                    #[allow(clippy::cast_possible_truncation)]
+                    let pattern_end = self.offset() - c.len_utf8() as u32;
+                    return (pattern_end, RegExpFlags::empty());
+                }
+                Some(c) => {
+                    if in_escape {
+                        in_escape = false;
+                    } else if c == '/' && !in_character_class {
+                        break;
+                    } else if c == '[' {
+                        in_character_class = true;
+                    } else if c == '\\' {
+                        in_escape = true;
+                    } else if c == ']' {
+                        in_character_class = false;
+                    }
+                }
+            }
+        }
+
+        let pattern_end = self.offset() - 1; // -1 to exclude `/`
+        let mut flags = RegExpFlags::empty();
+
+        // Every identifier-like ASCII character is consumed as a flag candidate,
+        // so that unknown and duplicated flags can be reported individually.
+        while let Some(ch @ ('$' | '_' | 'a'..='z' | 'A'..='Z' | '0'..='9')) = self.peek() {
+            self.current.chars.next();
+            let flag = if let Ok(flag) = RegExpFlags::try_from(ch) {
+                flag
+            } else {
+                self.error(diagnostics::RegExpFlag(ch, self.current_offset()));
+                continue;
+            };
+            if flags.contains(flag) {
+                self.error(diagnostics::RegExpFlagTwice(ch, self.current_offset()));
+                continue;
+            }
+            flags |= flag;
+        }
+
+        (pattern_end, flags)
+    }
+}
diff --git a/crates/oxc_parser/src/lexer/string.rs b/crates/oxc_parser/src/lexer/string.rs
new file mode 100644
index 000000000..f2f0c14b0
--- /dev/null
+++ b/crates/oxc_parser/src/lexer/string.rs
@@ -0,0 +1,65 @@
+use super::{AutoCow, Kind, Lexer, Span, Token};
+use crate::diagnostics;
+
+impl<'a> Lexer<'a> {
+    /// 12.9.4 String Literals
+    ///
+    /// `delimiter` is the quote character that closes the literal
+    /// (presumably the opening quote has already been consumed by the caller).
+    /// Returns `Kind::Str` on success. On end of input or an unescaped `\r` / `\n`,
+    /// `UnterminatedString` is reported and `Kind::Undetermined` is returned.
+    pub(super) fn read_string_literal(&mut self, delimiter: char) -> Kind {
+        let mut builder = AutoCow::new(self);
+        loop {
+            match self.current.chars.next() {
+                None | Some('\r' | '\n') => {
+                    self.error(diagnostics::UnterminatedString(self.unterminated_range()));
+                    return Kind::Undetermined;
+                }
+                Some(c @ ('"' | '\'')) => {
+                    if c == delimiter {
+                        self.save_string(builder.has_escape(), builder.finish_without_push(self));
+                        return Kind::Str;
+                    }
+                    // The other kind of quote is ordinary string content.
+                    builder.push_matching(c);
+                }
+                Some('\\') => {
+                    // Start of the escape sequence, i.e. the position of the backslash.
+                    let start = self.offset() - 1;
+                    let text = builder.get_mut_string_without_current_ascii_char(self);
+                    let mut is_valid_escape_sequence = true;
+                    self.read_string_escape_sequence(text, false, &mut is_valid_escape_sequence);
+                    if !is_valid_escape_sequence {
+                        let range = Span::new(start, self.offset());
+                        self.error(diagnostics::InvalidEscapeSequence(range));
+                    }
+                }
+                Some(c) => {
+                    builder.push_matching(c);
+                }
+            }
+        }
+    }
+
+    /// Save the string if it is escaped
+    /// This reduces the overall memory consumption while keeping the `Token` size small
+    /// Strings without escaped values can be retrieved as is from the token span
+    pub(super) fn save_string(&mut self, has_escape: bool, s: &'a str) {
+        if !has_escape {
+            return;
+        }
+        self.escaped_strings.insert(self.current.token.start, s);
+        self.current.token.escaped = true;
+    }
+
+    /// Return the string value of `token`: the saved unescaped string when the token
+    /// is marked as escaped, otherwise the matching slice of the source text
+    /// (without the quotes of a string or the leading `#` of a private identifier).
+    pub(crate) fn get_string(&self, token: Token) -> &'a str {
+        if token.escaped {
+            return self.escaped_strings[&token.start];
+        }
+
+        let raw = &self.source[token.start as usize..token.end as usize];
+        match token.kind {
+            Kind::Str => {
+                &raw[1..raw.len() - 1] // omit surrounding quotes
+            }
+            Kind::PrivateIdentifier => {
+                &raw[1..] // omit leading `#`
+            }
+            _ => raw,
+        }
+    }
+}
diff --git a/crates/oxc_parser/src/lexer/template.rs b/crates/oxc_parser/src/lexer/template.rs
new file mode 100644
index 000000000..661bfda4f
--- /dev/null
+++ b/crates/oxc_parser/src/lexer/template.rs
@@ -0,0 +1,86 @@
+use super::{AutoCow, Kind, Lexer, Token};
+use crate::diagnostics;
+
+use oxc_syntax::identifier::{CR, LF};
+
+impl<'a> Lexer<'a> {
+    /// 12.8.6 Template Literal Lexical Components
+    ///
+    /// Reads template characters until `${` (returns `substitute`) or the closing
+    /// backtick (returns `tail`). On end of input, `UnterminatedString` is reported
+    /// and `Kind::Undetermined` is returned.
+    pub(super) fn read_template_literal(&mut self, substitute: Kind, tail: Kind) -> Kind {
+        let mut builder = AutoCow::new(self);
+        let mut is_valid_escape_sequence = true;
+        while let Some(c) = self.current.chars.next() {
+            match c {
+                '$' if self.peek() == Some('{') => {
+                    self.save_template_string(
+                        is_valid_escape_sequence,
+                        builder.has_escape(),
+                        builder.finish_without_push(self),
+                    );
+                    // Consume the `{` of `${`.
+                    self.current.chars.next();
+                    return substitute;
+                }
+                '`' => {
+                    self.save_template_string(
+                        is_valid_escape_sequence,
+                        builder.has_escape(),
+                        builder.finish_without_push(self),
+                    );
+                    return tail;
+                }
+                CR => {
+                    // NOTE(review): only a CR LF pair pushes an LF here; a lone CR pushes
+                    // nothing in this branch - confirm against `AutoCow` semantics.
+                    builder.force_allocation_without_current_ascii_char(self);
+                    if self.next_eq(LF) {
+                        builder.push_different(LF);
+                    }
+                }
+                '\\' => {
+                    let text = builder.get_mut_string_without_current_ascii_char(self);
+                    self.read_string_escape_sequence(text, true, &mut is_valid_escape_sequence);
+                }
+                _ => builder.push_matching(c),
+            }
+        }
+        self.error(diagnostics::UnterminatedString(self.unterminated_range()));
+        Kind::Undetermined
+    }
+
+    /// Re-tokenize the current `}` token for `TemplateSubstitutionTail`
+    /// See Section 12, the parser needs to re-tokenize on `TemplateSubstitutionTail`,
+    pub(crate) fn next_template_substitution_tail(&mut self) -> Token {
+        self.current.token.start = self.offset() - 1;
+        let kind = self.read_template_literal(Kind::TemplateMiddle, Kind::TemplateTail);
+        self.lookahead.clear();
+        self.finish_next(kind)
+    }
+
+    /// Save the template if it is escaped
+    ///
+    /// `None` is stored instead of the string when the template contains an
+    /// invalid escape sequence.
+    fn save_template_string(
+        &mut self,
+        is_valid_escape_sequence: bool,
+        has_escape: bool,
+        s: &'a str,
+    ) {
+        if !has_escape {
+            return;
+        }
+        self.escaped_templates
+            .insert(self.current.token.start, is_valid_escape_sequence.then_some(s));
+        self.current.token.escaped = true;
+    }
+
+    /// Return the string value of a template `token`: the saved value when the token
+    /// is marked as escaped (`None` for an invalid escape sequence), otherwise the
+    /// matching slice of the source text without its delimiters.
+    pub(crate) fn get_template_string(&self, token: Token) -> Option<&'a str> {
+        if token.escaped {
+            return self.escaped_templates[&token.start];
+        }
+        let raw = &self.source[token.start as usize..token.end as usize];
+        Some(match token.kind {
+            Kind::NoSubstitutionTemplate | Kind::TemplateTail => {
+                &raw[1..raw.len() - 1] // omit surrounding quotes or leading "}" and trailing "`"
+            }
+            Kind::TemplateHead | Kind::TemplateMiddle => {
+                &raw[1..raw.len() - 2] // omit leading "`" or "}" and trailing "${"
+            }
+            _ => raw,
+        })
+    }
+}
diff --git a/crates/oxc_parser/src/lexer/typescript.rs b/crates/oxc_parser/src/lexer/typescript.rs
new file mode 100644
index 000000000..e2c781969
--- /dev/null
+++ b/crates/oxc_parser/src/lexer/typescript.rs
@@ -0,0 +1,17 @@
+use super::{Kind, Lexer, Token};
+
+impl<'a> Lexer<'a> {
+    /// Re-tokenize '<<' or '<=' or '<<=' to '<'
+    ///
+    /// The token start is moved back to the first `<` and lexing resumes right after it.
+    /// `kind` must be one of the three kinds above; anything else is a caller bug.
+    pub(crate) fn re_lex_as_typescript_l_angle(&mut self, kind: Kind) -> Token {
+        // Length of the token that was lexed originally.
+        let offset = match kind {
+            Kind::ShiftLeft | Kind::LtEq => 2,
+            Kind::ShiftLeftEq => 3,
+            _ => unreachable!(),
+        };
+        self.current.token.start = self.offset() - offset;
+        self.current.chars = self.source[self.current.token.start as usize + 1..].chars();
+        let kind = Kind::LAngle;
+        self.lookahead.clear();
+        self.finish_next(kind)
+    }
+}
diff --git a/crates/oxc_parser/src/lexer/unicode.rs b/crates/oxc_parser/src/lexer/unicode.rs
new file mode 100644
index 000000000..fe8f08f49
--- /dev/null
+++ b/crates/oxc_parser/src/lexer/unicode.rs
@@ -0,0 +1,318 @@
+use super::{AutoCow, Kind, Lexer, Span};
+use crate::diagnostics;
+
+use oxc_allocator::String;
+use oxc_syntax::identifier::{
+    is_identifier_part, is_identifier_start, is_identifier_start_unicode,
+    is_irregular_line_terminator, is_irregular_whitespace, CR, FF, LF, LS, PS, TAB, VT,
+};
+
+enum SurrogatePair {
+    // valid \u Hex4Digits \u Hex4Digits
+    Astral(u32),
+    // valid \u Hex4Digits
+    CodePoint(u32),
+    // invalid \u Hex4Digits \u Hex4Digits
+    HighLow(u32, u32),
+}
+
+impl<'a> Lexer<'a> {
+    /// Handle the character at the current position: an identifier start, irregular
+    /// whitespace, an irregular line terminator, or an invalid character.
+    ///
+    /// Panics if the lexer is at end of input.
+    pub(super) fn unicode_char_handler(&mut self) -> Kind {
+        // Peek at the next character without consuming it.
+        let c = self.current.chars.clone().next().unwrap();
+        match c {
+            c if is_identifier_start_unicode(c) => {
+                let mut builder = AutoCow::new(self);
+                let c = self.consume_char();
+                builder.push_matching(c);
+                self.identifier_name(builder);
+                Kind::Ident
+            }
+            c if is_irregular_whitespace(c) => {
+                self.trivia_builder
+                    .add_irregular_whitespace(self.current.token.start, self.offset());
+                self.consume_char();
+                Kind::Skip
+            }
+            c if is_irregular_line_terminator(c) => {
+                self.consume_char();
+                self.current.token.is_on_new_line = true;
+                Kind::Skip
+            }
+            _ => {
+                self.consume_char();
+                self.error(diagnostics::InvalidCharacter(c, self.unterminated_range()));
+                Kind::Undetermined
+            }
+        }
+    }
+
+    /// Identifier `UnicodeEscapeSequence`
+    ///   \u `Hex4Digits`
+    ///   \u{ `CodePoint` }
+    ///
+    /// On success the decoded character is pushed to `builder`. `check_identifier_start`
+    /// selects whether it must be a valid identifier start or a valid identifier part.
+    /// All failures are reported as errors and nothing is pushed.
+    pub(super) fn identifier_unicode_escape_sequence(
+        &mut self,
+        builder: &mut AutoCow<'a>,
+        check_identifier_start: bool,
+    ) {
+        let start = self.offset();
+        if self.current.chars.next() != Some('u') {
+            let range = Span::new(start, self.offset());
+            self.error(diagnostics::UnicodeEscapeSequence(range));
+            return;
+        }
+
+        let value = match self.peek() {
+            Some('{') => self.unicode_code_point(),
+            _ => self.surrogate_pair(),
+        };
+
+        let Some(value) = value else {
+            let range = Span::new(start, self.offset());
+            self.error(diagnostics::UnicodeEscapeSequence(range));
+            return;
+        };
+
+        // For Identifiers, surrogate pair is an invalid grammar, e.g. `var \uD800\uDEA7`.
+        let ch = match value {
+            SurrogatePair::Astral(..) | SurrogatePair::HighLow(..) => {
+                let range = Span::new(start, self.offset());
+                self.error(diagnostics::UnicodeEscapeSequence(range));
+                return;
+            }
+            SurrogatePair::CodePoint(code_point) => {
+                if let Ok(ch) = char::try_from(code_point) {
+                    ch
+                } else {
+                    let range = Span::new(start, self.offset());
+                    self.error(diagnostics::UnicodeEscapeSequence(range));
+                    return;
+                }
+            }
+        };
+
+        let is_valid =
+            if check_identifier_start { is_identifier_start(ch) } else { is_identifier_part(ch) };
+
+        if !is_valid {
+            self.error(diagnostics::InvalidCharacter(ch, self.current_offset()));
+            return;
+        }
+
+        builder.push_different(ch);
+    }
+
+    /// String `UnicodeEscapeSequence`
+    ///   \u `Hex4Digits`
+    ///   \u `Hex4Digits` \u `Hex4Digits`
+    ///   \u{ `CodePoint` }
+    ///
+    /// Every caller consumes the `\u` before calling this. The decoded text is appended
+    /// to `text`; `is_valid_escape_sequence` is set to `false` for a malformed sequence.
+    fn string_unicode_escape_sequence(
+        &mut self,
+        text: &mut String<'a>,
+        is_valid_escape_sequence: &mut bool,
+    ) {
+        let value = match self.peek() {
+            Some('{') => self.unicode_code_point(),
+            _ => self.surrogate_pair(),
+        };
+
+        let Some(value) = value else {
+            // error raised within the parser by `diagnostics::TemplateLiteral`
+            *is_valid_escape_sequence = false;
+            return;
+        };
+
+        // For strings and templates, surrogate pairs are valid grammar, e.g. `"\uD83D\uDE00" === 😀`
+        // values are interpreted as is if they fall out of range
+        match value {
+            SurrogatePair::CodePoint(code_point) | SurrogatePair::Astral(code_point) => {
+                if let Ok(ch) = char::try_from(code_point) {
+                    text.push(ch);
+                } else {
+                    text.push_str("\\u");
+                    text.push_str(format!("{code_point:x}").as_str());
+                }
+            }
+            SurrogatePair::HighLow(high, low) => {
+                text.push_str("\\u");
+                text.push_str(format!("{high:x}").as_str());
+                text.push_str("\\u");
+                text.push_str(format!("{low:x}").as_str());
+            }
+        }
+    }
+
+    /// Lex `{ CodePoint }`; returns `None` if a brace or the code point is missing or invalid.
+    fn unicode_code_point(&mut self) -> Option<SurrogatePair> {
+        if !self.next_eq('{') {
+            return None;
+        }
+        let value = self.code_point()?;
+        if !self.next_eq('}') {
+            return None;
+        }
+        Some(SurrogatePair::CodePoint(value))
+    }
+
+    /// Lex exactly four hex digits and return their value; `None` if fewer are present.
+    fn hex_4_digits(&mut self) -> Option<u32> {
+        let mut value = 0;
+        for _ in 0..4 {
+            value = (value << 4) | self.hex_digit()?;
+        }
+        Some(value)
+    }
+
+    /// Consume one hex digit and return its value.
+    /// Returns `None`, consuming nothing, if the next character is not a hex digit.
+    fn hex_digit(&mut self) -> Option<u32> {
+        let value = match self.peek() {
+            Some(c @ '0'..='9') => c as u32 - '0' as u32,
+            Some(c @ 'a'..='f') => 10 + (c as u32 - 'a' as u32),
+            Some(c @ 'A'..='F') => 10 + (c as u32 - 'A' as u32),
+            _ => return None,
+        };
+        self.current.chars.next();
+        Some(value)
+    }
+
+    /// Lex one or more hex digits and return their value.
+    /// Returns `None` if there is no digit or the value exceeds `0x10FFFF`.
+    fn code_point(&mut self) -> Option<u32> {
+        let mut value = self.hex_digit()?;
+        while let Some(next) = self.hex_digit() {
+            value = (value << 4) | next;
+            if value > 0x0010_FFFF {
+                return None;
+            }
+        }
+        Some(value)
+    }
+
+    /// Surrogate pairs
+    /// See background info:
+    ///   * `https://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae`
+    ///   * `https://mathiasbynens.be/notes/javascript-identifiers-es6`
+    fn surrogate_pair(&mut self) -> Option<SurrogatePair> {
+        let high = self.hex_4_digits()?;
+        // The first code unit of a surrogate pair is always in the range from 0xD800 to 0xDBFF, and is called a high surrogate or a lead surrogate.
+        if !((0xD800..=0xDBFF).contains(&high)
+            && self.peek() == Some('\\')
+            && self.peek2() == Some('u'))
+        {
+            return Some(SurrogatePair::CodePoint(high));
+        }
+
+        // Consume the `\u` of the second escape.
+        self.current.chars.next();
+        self.current.chars.next();
+
+        let low = self.hex_4_digits()?;
+
+        // The second code unit of a surrogate pair is always in the range from 0xDC00 to 0xDFFF, and is called a low surrogate or a trail surrogate.
+        if !(0xDC00..=0xDFFF).contains(&low) {
+            return Some(SurrogatePair::HighLow(high, low));
+        }
+
+        // `https://tc39.es/ecma262/#sec-utf16decodesurrogatepair`
+        let astral_code_point = (high - 0xD800) * 0x400 + low - 0xDC00 + 0x10000;
+
+        Some(SurrogatePair::Astral(astral_code_point))
+    }
+
+    // EscapeSequence ::
+    /// Lex the part of an escape sequence that follows the backslash and append the
+    /// decoded text to `text`.
+    ///
+    /// `in_template` disables the legacy octal escapes. `is_valid_escape_sequence` is set
+    /// to `false` for a malformed `\x` / `\u` escape, and in a template for a `\0`
+    /// followed by a digit or a `\1`..`\9`.
+    pub(super) fn read_string_escape_sequence(
+        &mut self,
+        text: &mut String<'a>,
+        in_template: bool,
+        is_valid_escape_sequence: &mut bool,
+    ) {
+        match self.current.chars.next() {
+            None => {
+                self.error(diagnostics::UnterminatedString(self.unterminated_range()));
+            }
+            Some(c) => match c {
+                // \ LineTerminatorSequence
+                // LineTerminatorSequence ::
+                // <LF>
+                // <CR> [lookahead ≠ <LF>]
+                // <LS>
+                // <PS>
+                // <CR> <LF>
+                LF | LS | PS => {}
+                CR => {
+                    self.next_eq(LF);
+                }
+                // SingleEscapeCharacter :: one of
+                //   ' " \ b f n r t v
+                '\'' | '"' | '\\' => text.push(c),
+                'b' => text.push('\u{8}'),
+                'f' => text.push(FF),
+                'n' => text.push(LF),
+                'r' => text.push(CR),
+                't' => text.push(TAB),
+                'v' => text.push(VT),
+                // HexEscapeSequence
+                'x' => {
+                    self.hex_digit()
+                        .and_then(|value1| {
+                            let value2 = self.hex_digit()?;
+                            Some((value1, value2))
+                        })
+                        .map(|(value1, value2)| (value1 << 4) | value2)
+                        .and_then(|value| char::try_from(value).ok())
+                        .map_or_else(
+                            || {
+                                *is_valid_escape_sequence = false;
+                            },
+                            |c| {
+                                text.push(c);
+                            },
+                        );
+                }
+                // UnicodeEscapeSequence
+                'u' => {
+                    self.string_unicode_escape_sequence(text, is_valid_escape_sequence);
+                }
+                // 0 [lookahead ∉ DecimalDigit]
+                '0' if !self.peek().is_some_and(|c| c.is_ascii_digit()) => text.push('\0'),
+                // Section 12.9.4 String Literals
+                // LegacyOctalEscapeSequence
+                // NonOctalDecimalEscapeSequence
+                a @ '0'..='7' if !in_template => {
+                    let mut num = String::new_in(self.allocator);
+                    num.push(a);
+                    match a {
+                        // `4`..`7` may be followed by one more octal digit ...
+                        '4'..='7' => {
+                            if matches!(self.peek(), Some('0'..='7')) {
+                                let b = self.consume_char();
+                                num.push(b);
+                            }
+                        }
+                        // ... `0`..`3` by up to two more.
+                        '0'..='3' => {
+                            if matches!(self.peek(), Some('0'..='7')) {
+                                let b = self.consume_char();
+                                num.push(b);
+                                if matches!(self.peek(), Some('0'..='7')) {
+                                    let c = self.consume_char();
+                                    num.push(c);
+                                }
+                            }
+                        }
+                        _ => {}
+                    }
+
+                    // Cannot fail: `num` holds 1 to 3 octal digits with a value of at most 0o377.
+                    let value =
+                        char::from_u32(u32::from_str_radix(num.as_str(), 8).unwrap()).unwrap();
+                    text.push(value);
+                }
+                '0' if in_template && self.peek().is_some_and(|c| c.is_ascii_digit()) => {
+                    self.current.chars.next();
+                    // error raised within the parser by `diagnostics::TemplateLiteral`
+                    *is_valid_escape_sequence = false;
+                }
+                // NotEscapeSequence :: DecimalDigit but not 0
+                '1'..='9' if in_template => {
+                    // error raised within the parser by `diagnostics::TemplateLiteral`
+                    *is_valid_escape_sequence = false;
+                }
+                other => {
+                    // NonOctalDecimalEscapeSequence \8 \9 in strict mode
+                    text.push(other);
+                }
+            },
+        }
+    }
+}