refactor(syntax): move identifier related code from lexer to syntax

2026-05-24 20:32:10 +00:00 · 2023-05-27 11:00:02 +08:00 · 2023-05-27 11:00:02 +08:00 · 24f11a4ea8
commit 24f11a4ea8
parent 8ea9e38ee5
7 changed files with 136 additions and 128 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -1201,7 +1201,6 @@ dependencies = [
 "oxc_syntax",
 "rustc-hash",
 "serde_json",
- "unicode-id-start",
 ]

 [[package]]
@ -1247,6 +1246,7 @@ name = "oxc_syntax"
 version = "0.0.0"
 dependencies = [
 "serde",
+ "unicode-id-start",
 ]

 [[package]]
--- a/crates/oxc_parser/Cargo.toml
+++ b/crates/oxc_parser/Cargo.toml
@ -23,7 +23,6 @@ oxc_index       = { workspace = true }

 bitflags         = { workspace = true }
 rustc-hash       = { workspace = true }
-unicode-id-start = { workspace = true }
 num-bigint       = { workspace = true }

 [dev-dependencies]
--- a/crates/oxc_parser/src/lexer/constants.rs
+++ b/crates/oxc_parser/src/lexer/constants.rs
@ -1,114 +1,5 @@
 use super::Kind;

-pub const EOF: char = '\0';
-
-// 11.1 Unicode Format-Control Characters
-
-/// U+200C ZERO WIDTH NON-JOINER, abbreviated in the spec as <ZWNJ>.
-/// Specially permitted in identifiers.
-pub const ZWNJ: char = '\u{200c}';
-
-/// U+200D ZERO WIDTH JOINER, abbreviated as <ZWJ>.
-/// Specially permitted in identifiers.
-pub const ZWJ: char = '\u{200d}';
-
-/// U+FEFF ZERO WIDTH NO-BREAK SPACE, abbreviated <ZWNBSP>.
-/// Considered a whitespace character in JS.
-pub const ZWNBSP: char = '\u{feff}';
-
-// 11.2 White Space
-/// U+0009 CHARACTER TABULATION, abbreviated <TAB>.
-pub const TAB: char = '\u{9}';
-
-/// U+000B VERTICAL TAB, abbreviated <VT>.
-pub const VT: char = '\u{b}';
-
-/// U+000C FORM FEED, abbreviated <FF>.
-pub const FF: char = '\u{c}';
-
-/// U+00A0 NON-BREAKING SPACE, abbreviated <NBSP>.
-pub const NBSP: char = '\u{a0}';
-
-pub fn is_irregular_whitespace(c: char) -> bool {
-    matches!(
-        c,
-        VT | FF | NBSP | ZWNBSP | '\u{85}' | '\u{1680}' | '\u{2000}'
-            ..='\u{200a}' | '\u{202f}' | '\u{205f}' | '\u{3000}'
-    )
-}
-
-// 11.3 Line Terminators
-
-///  U+000A LINE FEED, abbreviated in the spec as <LF>.
-pub const LF: char = '\u{a}';
-
-/// U+000D CARRIAGE RETURN, abbreviated in the spec as <CR>.
-pub const CR: char = '\u{d}';
-
-/// U+2028 LINE SEPARATOR, abbreviated <LS>.
-pub const LS: char = '\u{2028}';
-
-/// U+2029 PARAGRAPH SEPARATOR, abbreviated <PS>.
-pub const PS: char = '\u{2029}';
-
-pub fn is_regular_line_terminator(c: char) -> bool {
-    matches!(c, LF | CR)
-}
-
-pub fn is_irregular_line_terminator(c: char) -> bool {
-    matches!(c, LS | PS)
-}
-
-pub fn is_line_terminator(c: char) -> bool {
-    is_regular_line_terminator(c) || is_irregular_line_terminator(c)
-}
-
-const T: bool = true;
-const F: bool = false;
-
-#[repr(C, align(64))]
-pub struct Align64<T>(pub(crate) T);
-
-// This contains `$` (36) and `_` (95)
-pub const ASCII_START: Align64<[bool; 128]> = Align64([
-    F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
-    F, F, F, F, T, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
-    F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, T,
-    F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, F,
-]);
-
-// This contains `$` (36)
-pub const ASCII_CONTINUE: Align64<[bool; 128]> = Align64([
-    F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
-    F, F, F, F, T, F, F, F, F, F, F, F, F, F, F, F, T, T, T, T, T, T, T, T, T, T, F, F, F, F, F, F,
-    F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, T,
-    F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, F,
-]);
-
-#[inline]
-pub fn is_identifier_start_ascii(c: char) -> bool {
-    ASCII_START.0[c as usize]
-}
-
-/// Section 12.6 Detect `IdentifierStartChar`
-#[inline]
-pub fn is_identifier_start_all(c: char) -> bool {
-    if c.is_ascii() {
-        return is_identifier_start_ascii(c);
-    }
-    unicode_id_start::is_id_start_unicode(c)
-}
-
-/// Section 12.6 Detect `IdentifierPartChar`
-/// NOTE 2: The nonterminal `IdentifierPart` derives _ via `UnicodeIDContinue`.
-#[inline]
-pub fn is_identifier_part(c: char) -> bool {
-    if c.is_ascii() {
-        return ASCII_CONTINUE.0[c as usize];
-    }
-    unicode_id_start::is_id_continue_unicode(c) || c == ZWNJ || c == ZWJ
-}
-
 pub const SINGLE_CHAR_TOKENS: &[Kind; 128] = &[
    /*   0 */ Kind::Undetermined,
    /*   1 */ Kind::Undetermined,
--- a/crates/oxc_parser/src/lexer/mod.rs
+++ b/crates/oxc_parser/src/lexer/mod.rs
@ -19,16 +19,20 @@ use oxc_allocator::{Allocator, String};
 use oxc_ast::ast::RegExpFlags;
 use oxc_diagnostics::Error;
 use oxc_span::{SourceType, Span};
+use oxc_syntax::{
+    identifier::{
+        is_identifier_part, is_identifier_start_all, is_identifier_start_ascii,
+        is_irregular_line_terminator, is_irregular_whitespace, is_line_terminator, CR, EOF, FF, LF,
+        LS, PS, TAB, VT,
+    },
+    unicode_id_start::is_id_start_unicode,
+};
 use simd::{SkipMultilineComment, SkipWhitespace};
 pub use token::{RegExp, Token, TokenValue};

 pub use self::kind::Kind;
 use self::{
-    constants::{
-        is_identifier_part, is_identifier_start_all, is_identifier_start_ascii,
-        is_irregular_line_terminator, is_irregular_whitespace, is_line_terminator, EOF,
-        SINGLE_CHAR_TOKENS,
-    },
+    constants::SINGLE_CHAR_TOKENS,
    number::{parse_big_int, parse_float, parse_int},
    string_builder::AutoCow,
    trivia_builder::TriviaBuilder,
@ -429,7 +433,7 @@ impl<'a> Lexer<'a> {
                self.identifier_unicode_escape_sequence(&mut builder, true);
                self.identifier_name_or_keyword(builder)
            }
-            c if unicode_id_start::is_id_start_unicode(c) => {
+            c if is_id_start_unicode(c) => {
                builder.push_matching(c);
                self.identifier_name_or_keyword(builder)
            }
@ -1053,10 +1057,10 @@ impl<'a> Lexer<'a> {
                    }
                    return tail;
                }
-                constants::CR => {
+                CR => {
                    builder.force_allocation_without_current_ascii_char(self);
-                    if self.next_eq(constants::LF) {
-                        builder.push_different(constants::LF);
+                    if self.next_eq(LF) {
+                        builder.push_different(LF);
                    }
                }
                '\\' => {
@ -1354,17 +1358,17 @@ impl<'a> Lexer<'a> {
            }
            Some(c) => match c {
                // CharacterEscapeSequence
-                constants::LF | constants::LS | constants::PS => {}
-                constants::CR => {
-                    self.next_eq(constants::LF);
+                LF | LS | PS => {}
+                CR => {
+                    self.next_eq(LF);
                }
                '\'' | '"' | '\\' => text.push(c),
                'b' => text.push('\u{8}'),
-                'f' => text.push(constants::FF),
-                'n' => text.push(constants::LF),
-                'r' => text.push(constants::CR),
-                't' => text.push(constants::TAB),
-                'v' => text.push(constants::VT),
+                'f' => text.push(FF),
+                'n' => text.push(LF),
+                'r' => text.push(CR),
+                't' => text.push(TAB),
+                'v' => text.push(VT),
                // HexEscapeSequence
                'x' => {
                    self.hex_digit()
--- a/crates/oxc_syntax/Cargo.toml
+++ b/crates/oxc_syntax/Cargo.toml
@ -15,4 +15,5 @@ default = []
 serde   = ["dep:serde"]

 [dependencies]
+unicode-id-start = { workspace = true }
 serde = { workspace = true, features = ["derive"], optional = true }
--- a/crates/oxc_syntax/src/identifier.rs
+++ b/crates/oxc_syntax/src/identifier.rs
@ -0,0 +1,110 @@
+use unicode_id_start::{is_id_continue_unicode, is_id_start_unicode};
+
+pub const EOF: char = '\0';
+
+// 11.1 Unicode Format-Control Characters
+
+/// U+200C ZERO WIDTH NON-JOINER, abbreviated in the spec as <ZWNJ>.
+/// Specially permitted in identifiers.
+pub const ZWNJ: char = '\u{200c}';
+
+/// U+200D ZERO WIDTH JOINER, abbreviated as <ZWJ>.
+/// Specially permitted in identifiers.
+pub const ZWJ: char = '\u{200d}';
+
+/// U+FEFF ZERO WIDTH NO-BREAK SPACE, abbreviated <ZWNBSP>.
+/// Considered a whitespace character in JS.
+pub const ZWNBSP: char = '\u{feff}';
+
+// 11.2 White Space
+/// U+0009 CHARACTER TABULATION, abbreviated <TAB>.
+pub const TAB: char = '\u{9}';
+
+/// U+000B VERTICAL TAB, abbreviated <VT>.
+pub const VT: char = '\u{b}';
+
+/// U+000C FORM FEED, abbreviated <FF>.
+pub const FF: char = '\u{c}';
+
+/// U+00A0 NON-BREAKING SPACE, abbreviated <NBSP>.
+pub const NBSP: char = '\u{a0}';
+
+/// Reports whether `c` belongs to the set this module treats as irregular whitespace.
+///
+/// The set is `VT`, `FF`, `NBSP`, `ZWNBSP`, U+0085, U+1680, U+2000 through U+200A,
+/// U+202F, U+205F and U+3000.
+pub fn is_irregular_whitespace(c: char) -> bool {
+    // U+2000..=U+200A is one contiguous run, so test it as a range.
+    let in_contiguous_run = ('\u{2000}'..='\u{200a}').contains(&c);
+    in_contiguous_run
+        || matches!(
+            c,
+            VT | FF | NBSP | ZWNBSP | '\u{85}' | '\u{1680}' | '\u{202f}' | '\u{205f}' | '\u{3000}'
+        )
+}
+
+// 11.3 Line Terminators
+
+/// U+000A LINE FEED, abbreviated in the spec as <LF>.
+pub const LF: char = '\u{a}';
+
+/// U+000D CARRIAGE RETURN, abbreviated in the spec as <CR>.
+pub const CR: char = '\u{d}';
+
+/// U+2028 LINE SEPARATOR, abbreviated <LS>.
+pub const LS: char = '\u{2028}';
+
+/// U+2029 PARAGRAPH SEPARATOR, abbreviated <PS>.
+pub const PS: char = '\u{2029}';
+
+/// Returns `true` when `c` is `LF` or `CR`.
+pub fn is_regular_line_terminator(c: char) -> bool {
+    c == LF || c == CR
+}
+
+/// Returns `true` when `c` is `LS` or `PS`.
+pub fn is_irregular_line_terminator(c: char) -> bool {
+    c == LS || c == PS
+}
+
+/// Returns `true` when `c` is any of the four line terminators: `LF`, `CR`, `LS` or `PS`.
+pub fn is_line_terminator(c: char) -> bool {
+    matches!(c, LF | CR | LS | PS)
+}
+
+const T: bool = true;
+const F: bool = false;
+
+#[repr(C, align(64))] // forces the wrapped value onto a 64-byte boundary
+pub struct Align64<T>(pub(crate) T); // wrapper type of the ASCII lookup tables in this module
+
+// Indexed by ASCII code point; `true` for `$` (36), `A`-`Z`, `_` (95) and `a`-`z`.
+pub const ASCII_START: Align64<[bool; 128]> = Align64([
+    F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    F, F, F, F, T, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, T,
+    F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, F,
+]);
+
+// Indexed by ASCII code point; `true` for `$` (36), `0`-`9` (48-57), `A`-`Z`, `_` (95), `a`-`z`.
+pub const ASCII_CONTINUE: Align64<[bool; 128]> = Align64([
+    F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    F, F, F, F, T, F, F, F, F, F, F, F, F, F, F, F, T, T, T, T, T, T, T, T, T, T, F, F, F, F, F, F,
+    F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, T,
+    F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, F,
+]);
+
+/// Reports whether `c` may start an identifier, consulting only the ASCII table.
+///
+/// Looks `c` up in `ASCII_START`. A character whose code point lies outside that
+/// 128-entry table (i.e. any non-ASCII `char`) yields `false` rather than panicking
+/// on an out-of-bounds index, so the function is safe to call with arbitrary input.
+#[inline]
+pub fn is_identifier_start_ascii(c: char) -> bool {
+    // Checked lookup: same single bounds test as indexing, but without the panic path.
+    ASCII_START.0.get(c as usize).copied().unwrap_or(false)
+}
+
+/// Section 12.6 Detect `IdentifierStartChar`
+///
+/// ASCII input is answered by `is_identifier_start_ascii`; everything else is
+/// delegated to `is_id_start_unicode`.
+#[inline]
+pub fn is_identifier_start_all(c: char) -> bool {
+    if c.is_ascii() {
+        is_identifier_start_ascii(c)
+    } else {
+        is_id_start_unicode(c)
+    }
+}
+
+/// Section 12.6 Detect `IdentifierPartChar`
+/// NOTE 2: The nonterminal `IdentifierPart` derives _ via `UnicodeIDContinue`.
+///
+/// ASCII input is answered from the `ASCII_CONTINUE` table. Non-ASCII input is
+/// accepted when `is_id_continue_unicode` accepts it or when it is `ZWNJ` / `ZWJ`.
+#[inline]
+pub fn is_identifier_part(c: char) -> bool {
+    if !c.is_ascii() {
+        return is_id_continue_unicode(c) || matches!(c, ZWNJ | ZWJ);
+    }
+    // `c` is ASCII here, so the index is always within the 128-entry table.
+    ASCII_CONTINUE.0[c as usize]
+}
--- a/crates/oxc_syntax/src/lib.rs
+++ b/crates/oxc_syntax/src/lib.rs
@ -1,7 +1,10 @@
 //! Common code for JavaScript Syntax

+pub mod identifier;
 pub mod operator;

+pub use unicode_id_start;
+
 #[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)]
 pub enum NumberBase {
    Float,