diff --git a/Cargo.lock b/Cargo.lock index 593cbad02..48ba903ba 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1201,7 +1201,6 @@ dependencies = [ "oxc_syntax", "rustc-hash", "serde_json", - "unicode-id-start", ] [[package]] @@ -1247,6 +1246,7 @@ name = "oxc_syntax" version = "0.0.0" dependencies = [ "serde", + "unicode-id-start", ] [[package]] diff --git a/crates/oxc_parser/Cargo.toml b/crates/oxc_parser/Cargo.toml index 032bf7a03..b8a6680e9 100644 --- a/crates/oxc_parser/Cargo.toml +++ b/crates/oxc_parser/Cargo.toml @@ -23,7 +23,6 @@ oxc_index = { workspace = true } bitflags = { workspace = true } rustc-hash = { workspace = true } -unicode-id-start = { workspace = true } num-bigint = { workspace = true } [dev-dependencies] diff --git a/crates/oxc_parser/src/lexer/constants.rs b/crates/oxc_parser/src/lexer/constants.rs index 3f1ed2c14..b9a23c158 100644 --- a/crates/oxc_parser/src/lexer/constants.rs +++ b/crates/oxc_parser/src/lexer/constants.rs @@ -1,114 +1,5 @@ use super::Kind; -pub const EOF: char = '\0'; - -// 11.1 Unicode Format-Control Characters - -/// U+200C ZERO WIDTH NON-JOINER, abbreviated in the spec as <ZWNJ>. -/// Specially permitted in identifiers. -pub const ZWNJ: char = '\u{200c}'; - -/// U+200D ZERO WIDTH JOINER, abbreviated as <ZWJ>. -/// Specially permitted in identifiers. -pub const ZWJ: char = '\u{200d}'; - -/// U+FEFF ZERO WIDTH NO-BREAK SPACE, abbreviated <ZWNBSP>. -/// Considered a whitespace character in JS. -pub const ZWNBSP: char = '\u{feff}'; - -// 11.2 White Space -/// U+0009 CHARACTER TABULATION, abbreviated <TAB>. -pub const TAB: char = '\u{9}'; - -/// U+000B VERTICAL TAB, abbreviated <VT>. -pub const VT: char = '\u{b}'; - -/// U+000C FORM FEED, abbreviated <FF>. -pub const FF: char = '\u{c}'; - -/// U+00A0 NON-BREAKING SPACE, abbreviated <NBSP>. 
-pub const NBSP: char = '\u{a0}'; - -pub fn is_irregular_whitespace(c: char) -> bool { - matches!( - c, - VT | FF | NBSP | ZWNBSP | '\u{85}' | '\u{1680}' | '\u{2000}' - ..='\u{200a}' | '\u{202f}' | '\u{205f}' | '\u{3000}' - ) -} - -// 11.3 Line Terminators - -/// U+000A LINE FEED, abbreviated in the spec as <LF>. -pub const LF: char = '\u{a}'; - -/// U+000D CARRIAGE RETURN, abbreviated in the spec as <CR>. -pub const CR: char = '\u{d}'; - -/// U+2028 LINE SEPARATOR, abbreviated <LS>. -pub const LS: char = '\u{2028}'; - -/// U+2029 PARAGRAPH SEPARATOR, abbreviated <PS>. -pub const PS: char = '\u{2029}'; - -pub fn is_regular_line_terminator(c: char) -> bool { - matches!(c, LF | CR) -} - -pub fn is_irregular_line_terminator(c: char) -> bool { - matches!(c, LS | PS) -} - -pub fn is_line_terminator(c: char) -> bool { - is_regular_line_terminator(c) || is_irregular_line_terminator(c) -} - -const T: bool = true; -const F: bool = false; - -#[repr(C, align(64))] -pub struct Align64<T>(pub(crate) T); - -// This contains `$` (36) and `_` (95) -pub const ASCII_START: Align64<[bool; 128]> = Align64([ - F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - F, F, F, F, T, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, T, - F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, F, -]); - -// This contains `$` (36) -pub const ASCII_CONTINUE: Align64<[bool; 128]> = Align64([ - F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, - F, F, F, F, T, F, F, F, F, F, F, F, F, F, F, F, T, T, T, T, T, T, T, T, T, T, F, F, F, F, F, F, - F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, T, - F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, F, -]); - -#[inline] -pub fn 
is_identifier_start_ascii(c: char) -> bool { - ASCII_START.0[c as usize] -} - -/// Section 12.6 Detect `IdentifierStartChar` -#[inline] -pub fn is_identifier_start_all(c: char) -> bool { - if c.is_ascii() { - return is_identifier_start_ascii(c); - } - unicode_id_start::is_id_start_unicode(c) -} - -/// Section 12.6 Detect `IdentifierPartChar` -/// NOTE 2: The nonterminal `IdentifierPart` derives _ via `UnicodeIDContinue`. -#[inline] -pub fn is_identifier_part(c: char) -> bool { - if c.is_ascii() { - return ASCII_CONTINUE.0[c as usize]; - } - unicode_id_start::is_id_continue_unicode(c) || c == ZWNJ || c == ZWJ -} - pub const SINGLE_CHAR_TOKENS: &[Kind; 128] = &[ /* 0 */ Kind::Undetermined, /* 1 */ Kind::Undetermined, diff --git a/crates/oxc_parser/src/lexer/mod.rs b/crates/oxc_parser/src/lexer/mod.rs index 132204f65..a1661410a 100644 --- a/crates/oxc_parser/src/lexer/mod.rs +++ b/crates/oxc_parser/src/lexer/mod.rs @@ -19,16 +19,20 @@ use oxc_allocator::{Allocator, String}; use oxc_ast::ast::RegExpFlags; use oxc_diagnostics::Error; use oxc_span::{SourceType, Span}; +use oxc_syntax::{ + identifier::{ + is_identifier_part, is_identifier_start_all, is_identifier_start_ascii, + is_irregular_line_terminator, is_irregular_whitespace, is_line_terminator, CR, EOF, FF, LF, + LS, PS, TAB, VT, + }, + unicode_id_start::is_id_start_unicode, +}; use simd::{SkipMultilineComment, SkipWhitespace}; pub use token::{RegExp, Token, TokenValue}; pub use self::kind::Kind; use self::{ - constants::{ - is_identifier_part, is_identifier_start_all, is_identifier_start_ascii, - is_irregular_line_terminator, is_irregular_whitespace, is_line_terminator, EOF, - SINGLE_CHAR_TOKENS, - }, + constants::SINGLE_CHAR_TOKENS, number::{parse_big_int, parse_float, parse_int}, string_builder::AutoCow, trivia_builder::TriviaBuilder, @@ -429,7 +433,7 @@ impl<'a> Lexer<'a> { self.identifier_unicode_escape_sequence(&mut builder, true); self.identifier_name_or_keyword(builder) } - c if 
unicode_id_start::is_id_start_unicode(c) => { + c if is_id_start_unicode(c) => { builder.push_matching(c); self.identifier_name_or_keyword(builder) } @@ -1053,10 +1057,10 @@ impl<'a> Lexer<'a> { } return tail; } - constants::CR => { + CR => { builder.force_allocation_without_current_ascii_char(self); - if self.next_eq(constants::LF) { - builder.push_different(constants::LF); + if self.next_eq(LF) { + builder.push_different(LF); } } '\\' => { @@ -1354,17 +1358,17 @@ impl<'a> Lexer<'a> { } Some(c) => match c { // CharacterEscapeSequence - constants::LF | constants::LS | constants::PS => {} - constants::CR => { - self.next_eq(constants::LF); + LF | LS | PS => {} + CR => { + self.next_eq(LF); } '\'' | '"' | '\\' => text.push(c), 'b' => text.push('\u{8}'), - 'f' => text.push(constants::FF), - 'n' => text.push(constants::LF), - 'r' => text.push(constants::CR), - 't' => text.push(constants::TAB), - 'v' => text.push(constants::VT), + 'f' => text.push(FF), + 'n' => text.push(LF), + 'r' => text.push(CR), + 't' => text.push(TAB), + 'v' => text.push(VT), // HexEscapeSequence 'x' => { self.hex_digit() diff --git a/crates/oxc_syntax/Cargo.toml b/crates/oxc_syntax/Cargo.toml index 624fc3e52..e9ef3e9f8 100644 --- a/crates/oxc_syntax/Cargo.toml +++ b/crates/oxc_syntax/Cargo.toml @@ -15,4 +15,5 @@ default = [] serde = ["dep:serde"] [dependencies] +unicode-id-start = { workspace = true } serde = { workspace = true, features = ["derive"], optional = true } diff --git a/crates/oxc_syntax/src/identifier.rs b/crates/oxc_syntax/src/identifier.rs new file mode 100644 index 000000000..008b9a1a8 --- /dev/null +++ b/crates/oxc_syntax/src/identifier.rs @@ -0,0 +1,110 @@ +use unicode_id_start::{is_id_continue_unicode, is_id_start_unicode}; + +pub const EOF: char = '\0'; + +// 11.1 Unicode Format-Control Characters + +/// U+200C ZERO WIDTH NON-JOINER, abbreviated in the spec as <ZWNJ>. +/// Specially permitted in identifiers. 
+pub const ZWNJ: char = '\u{200c}'; + +/// U+200D ZERO WIDTH JOINER, abbreviated as <ZWJ>. +/// Specially permitted in identifiers. +pub const ZWJ: char = '\u{200d}'; + +/// U+FEFF ZERO WIDTH NO-BREAK SPACE, abbreviated <ZWNBSP>. +/// Considered a whitespace character in JS. +pub const ZWNBSP: char = '\u{feff}'; + +// 11.2 White Space +/// U+0009 CHARACTER TABULATION, abbreviated <TAB>. +pub const TAB: char = '\u{9}'; + +/// U+000B VERTICAL TAB, abbreviated <VT>. +pub const VT: char = '\u{b}'; + +/// U+000C FORM FEED, abbreviated <FF>. +pub const FF: char = '\u{c}'; + +/// U+00A0 NON-BREAKING SPACE, abbreviated <NBSP>. +pub const NBSP: char = '\u{a0}'; + +pub fn is_irregular_whitespace(c: char) -> bool { + matches!( + c, + VT | FF | NBSP | ZWNBSP | '\u{85}' | '\u{1680}' | '\u{2000}' + ..='\u{200a}' | '\u{202f}' | '\u{205f}' | '\u{3000}' + ) +} + +// 11.3 Line Terminators + +/// U+000A LINE FEED, abbreviated in the spec as <LF>. +pub const LF: char = '\u{a}'; + +/// U+000D CARRIAGE RETURN, abbreviated in the spec as <CR>. +pub const CR: char = '\u{d}'; + +/// U+2028 LINE SEPARATOR, abbreviated <LS>. +pub const LS: char = '\u{2028}'; + +/// U+2029 PARAGRAPH SEPARATOR, abbreviated <PS>. 
+pub const PS: char = '\u{2029}'; + +pub fn is_regular_line_terminator(c: char) -> bool { + matches!(c, LF | CR) +} + +pub fn is_irregular_line_terminator(c: char) -> bool { + matches!(c, LS | PS) +} + +pub fn is_line_terminator(c: char) -> bool { + is_regular_line_terminator(c) || is_irregular_line_terminator(c) +} + +const T: bool = true; +const F: bool = false; + +#[repr(C, align(64))] +pub struct Align64<T>(pub(crate) T); + +// This contains `$` (36) and `_` (95) +pub const ASCII_START: Align64<[bool; 128]> = Align64([ + F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + F, F, F, F, T, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, T, + F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, F, +]); + +// This contains `$` (36) +pub const ASCII_CONTINUE: Align64<[bool; 128]> = Align64([ + F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + F, F, F, F, T, F, F, F, F, F, F, F, F, F, F, F, T, T, T, T, T, T, T, T, T, T, F, F, F, F, F, F, + F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, T, + F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, F, +]); + +#[inline] +pub fn is_identifier_start_ascii(c: char) -> bool { + ASCII_START.0[c as usize] +} + +/// Section 12.6 Detect `IdentifierStartChar` +#[inline] +pub fn is_identifier_start_all(c: char) -> bool { + if c.is_ascii() { + return is_identifier_start_ascii(c); + } + is_id_start_unicode(c) +} + +/// Section 12.6 Detect `IdentifierPartChar` +/// NOTE 2: The nonterminal `IdentifierPart` derives _ via `UnicodeIDContinue`. 
+#[inline] +pub fn is_identifier_part(c: char) -> bool { + if c.is_ascii() { + return ASCII_CONTINUE.0[c as usize]; + } + is_id_continue_unicode(c) || c == ZWNJ || c == ZWJ +} diff --git a/crates/oxc_syntax/src/lib.rs b/crates/oxc_syntax/src/lib.rs index b05df8ffa..2e3d61967 100644 --- a/crates/oxc_syntax/src/lib.rs +++ b/crates/oxc_syntax/src/lib.rs @@ -1,7 +1,10 @@ //! Common code for JavaScript Syntax +pub mod identifier; pub mod operator; +pub use unicode_id_start; + #[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)] pub enum NumberBase { Float,