mirror of
https://github.com/danbulant/oxc
synced 2026-05-24 20:32:10 +00:00
refactor(syntax): move identifier related code from lexer to syntax
This commit is contained in:
parent
8ea9e38ee5
commit
24f11a4ea8
7 changed files with 136 additions and 128 deletions
2
Cargo.lock
generated
2
Cargo.lock
generated
|
|
@ -1201,7 +1201,6 @@ dependencies = [
|
|||
"oxc_syntax",
|
||||
"rustc-hash",
|
||||
"serde_json",
|
||||
"unicode-id-start",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
@ -1247,6 +1246,7 @@ name = "oxc_syntax"
|
|||
version = "0.0.0"
|
||||
dependencies = [
|
||||
"serde",
|
||||
"unicode-id-start",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
|
|||
|
|
@ -23,7 +23,6 @@ oxc_index = { workspace = true }
|
|||
|
||||
bitflags = { workspace = true }
|
||||
rustc-hash = { workspace = true }
|
||||
unicode-id-start = { workspace = true }
|
||||
num-bigint = { workspace = true }
|
||||
|
||||
[dev-dependencies]
|
||||
|
|
|
|||
|
|
@ -1,114 +1,5 @@
|
|||
use super::Kind;
|
||||
|
||||
/// U+0000 NULL, exported under the name `EOF`.
// NOTE(review): presumably the lexer's end-of-input sentinel — confirm against callers.
pub const EOF: char = '\u{0}';

// 11.1 Unicode Format-Control Characters

/// U+200C ZERO WIDTH NON-JOINER (`<ZWNJ>` in the spec).
/// Specially permitted in identifiers.
pub const ZWNJ: char = '\u{200C}';

/// U+200D ZERO WIDTH JOINER (`<ZWJ>` in the spec).
/// Specially permitted in identifiers.
pub const ZWJ: char = '\u{200D}';

/// U+FEFF ZERO WIDTH NO-BREAK SPACE (`<ZWNBSP>` in the spec).
/// Considered a whitespace character in JS.
pub const ZWNBSP: char = '\u{FEFF}';

// 11.2 White Space

/// U+0009 CHARACTER TABULATION (`<TAB>`).
pub const TAB: char = '\u{09}';

/// U+000B VERTICAL TAB (`<VT>`).
pub const VT: char = '\u{0B}';

/// U+000C FORM FEED (`<FF>`).
pub const FF: char = '\u{0C}';

/// U+00A0 NON-BREAKING SPACE (`<NBSP>`).
pub const NBSP: char = '\u{A0}';
|
||||
|
||||
pub fn is_irregular_whitespace(c: char) -> bool {
|
||||
matches!(
|
||||
c,
|
||||
VT | FF | NBSP | ZWNBSP | '\u{85}' | '\u{1680}' | '\u{2000}'
|
||||
..='\u{200a}' | '\u{202f}' | '\u{205f}' | '\u{3000}'
|
||||
)
|
||||
}
|
||||
|
||||
// 11.3 Line Terminators

/// U+000A LINE FEED (`<LF>` in the spec).
pub const LF: char = '\u{0A}';

/// U+000D CARRIAGE RETURN (`<CR>` in the spec).
pub const CR: char = '\u{0D}';

/// U+2028 LINE SEPARATOR (`<LS>`).
pub const LS: char = '\u{2028}';

/// U+2029 PARAGRAPH SEPARATOR (`<PS>`).
pub const PS: char = '\u{2029}';
|
||||
|
||||
pub fn is_regular_line_terminator(c: char) -> bool {
|
||||
matches!(c, LF | CR)
|
||||
}
|
||||
|
||||
pub fn is_irregular_line_terminator(c: char) -> bool {
|
||||
matches!(c, LS | PS)
|
||||
}
|
||||
|
||||
pub fn is_line_terminator(c: char) -> bool {
|
||||
is_regular_line_terminator(c) || is_irregular_line_terminator(c)
|
||||
}
|
||||
|
||||
/// Wrapper that forces 64-byte alignment on the value it holds.
#[repr(C, align(64))]
pub struct Align64<T>(pub(crate) T);

/// Builds a 128-entry ASCII lookup table at compile time.
///
/// Entry `i` is `true` when byte `i` is an ASCII letter, `$` (36) or `_` (95).
/// ASCII digits are accepted as well when `allow_digits` is set.
const fn ascii_identifier_table(allow_digits: bool) -> [bool; 128] {
    let mut table = [false; 128];
    let mut i = 0;
    while i < table.len() {
        // `i` is always below 128 here, so the cast cannot truncate.
        let byte = i as u8;
        table[i] = byte.is_ascii_alphabetic()
            || byte == b'$'
            || byte == b'_'
            || (allow_digits && byte.is_ascii_digit());
        i += 1;
    }
    table
}

/// ASCII characters accepted at the start of an identifier:
/// `A`-`Z`, `a`-`z`, `$` (36) and `_` (95).
pub const ASCII_START: Align64<[bool; 128]> = Align64(ascii_identifier_table(false));

/// ASCII characters accepted after the first character of an identifier:
/// everything in [`ASCII_START`] plus `0`-`9`.
pub const ASCII_CONTINUE: Align64<[bool; 128]> = Align64(ascii_identifier_table(true));

/// Looks `c` up in [`ASCII_START`].
///
/// Returns `false` for any non-ASCII character instead of indexing past the end
/// of the 128-entry table (which would panic).
#[inline]
pub fn is_identifier_start_ascii(c: char) -> bool {
    c.is_ascii() && ASCII_START.0[c as usize]
}
|
||||
|
||||
/// Section 12.6 Detect `IdentifierStartChar`
|
||||
#[inline]
|
||||
pub fn is_identifier_start_all(c: char) -> bool {
|
||||
if c.is_ascii() {
|
||||
return is_identifier_start_ascii(c);
|
||||
}
|
||||
unicode_id_start::is_id_start_unicode(c)
|
||||
}
|
||||
|
||||
/// Section 12.6 Detect `IdentifierPartChar`
|
||||
/// NOTE 2: The nonterminal `IdentifierPart` derives _ via `UnicodeIDContinue`.
|
||||
#[inline]
|
||||
pub fn is_identifier_part(c: char) -> bool {
|
||||
if c.is_ascii() {
|
||||
return ASCII_CONTINUE.0[c as usize];
|
||||
}
|
||||
unicode_id_start::is_id_continue_unicode(c) || c == ZWNJ || c == ZWJ
|
||||
}
|
||||
|
||||
pub const SINGLE_CHAR_TOKENS: &[Kind; 128] = &[
|
||||
/* 0 */ Kind::Undetermined,
|
||||
/* 1 */ Kind::Undetermined,
|
||||
|
|
|
|||
|
|
@ -19,16 +19,20 @@ use oxc_allocator::{Allocator, String};
|
|||
use oxc_ast::ast::RegExpFlags;
|
||||
use oxc_diagnostics::Error;
|
||||
use oxc_span::{SourceType, Span};
|
||||
use oxc_syntax::{
|
||||
identifier::{
|
||||
is_identifier_part, is_identifier_start_all, is_identifier_start_ascii,
|
||||
is_irregular_line_terminator, is_irregular_whitespace, is_line_terminator, CR, EOF, FF, LF,
|
||||
LS, PS, TAB, VT,
|
||||
},
|
||||
unicode_id_start::is_id_start_unicode,
|
||||
};
|
||||
use simd::{SkipMultilineComment, SkipWhitespace};
|
||||
pub use token::{RegExp, Token, TokenValue};
|
||||
|
||||
pub use self::kind::Kind;
|
||||
use self::{
|
||||
constants::{
|
||||
is_identifier_part, is_identifier_start_all, is_identifier_start_ascii,
|
||||
is_irregular_line_terminator, is_irregular_whitespace, is_line_terminator, EOF,
|
||||
SINGLE_CHAR_TOKENS,
|
||||
},
|
||||
constants::SINGLE_CHAR_TOKENS,
|
||||
number::{parse_big_int, parse_float, parse_int},
|
||||
string_builder::AutoCow,
|
||||
trivia_builder::TriviaBuilder,
|
||||
|
|
@ -429,7 +433,7 @@ impl<'a> Lexer<'a> {
|
|||
self.identifier_unicode_escape_sequence(&mut builder, true);
|
||||
self.identifier_name_or_keyword(builder)
|
||||
}
|
||||
c if unicode_id_start::is_id_start_unicode(c) => {
|
||||
c if is_id_start_unicode(c) => {
|
||||
builder.push_matching(c);
|
||||
self.identifier_name_or_keyword(builder)
|
||||
}
|
||||
|
|
@ -1053,10 +1057,10 @@ impl<'a> Lexer<'a> {
|
|||
}
|
||||
return tail;
|
||||
}
|
||||
constants::CR => {
|
||||
CR => {
|
||||
builder.force_allocation_without_current_ascii_char(self);
|
||||
if self.next_eq(constants::LF) {
|
||||
builder.push_different(constants::LF);
|
||||
if self.next_eq(LF) {
|
||||
builder.push_different(LF);
|
||||
}
|
||||
}
|
||||
'\\' => {
|
||||
|
|
@ -1354,17 +1358,17 @@ impl<'a> Lexer<'a> {
|
|||
}
|
||||
Some(c) => match c {
|
||||
// CharacterEscapeSequence
|
||||
constants::LF | constants::LS | constants::PS => {}
|
||||
constants::CR => {
|
||||
self.next_eq(constants::LF);
|
||||
LF | LS | PS => {}
|
||||
CR => {
|
||||
self.next_eq(LF);
|
||||
}
|
||||
'\'' | '"' | '\\' => text.push(c),
|
||||
'b' => text.push('\u{8}'),
|
||||
'f' => text.push(constants::FF),
|
||||
'n' => text.push(constants::LF),
|
||||
'r' => text.push(constants::CR),
|
||||
't' => text.push(constants::TAB),
|
||||
'v' => text.push(constants::VT),
|
||||
'f' => text.push(FF),
|
||||
'n' => text.push(LF),
|
||||
'r' => text.push(CR),
|
||||
't' => text.push(TAB),
|
||||
'v' => text.push(VT),
|
||||
// HexEscapeSequence
|
||||
'x' => {
|
||||
self.hex_digit()
|
||||
|
|
|
|||
|
|
@ -15,4 +15,5 @@ default = []
|
|||
serde = ["dep:serde"]
|
||||
|
||||
[dependencies]
|
||||
unicode-id-start = { workspace = true }
|
||||
serde = { workspace = true, features = ["derive"], optional = true }
|
||||
|
|
|
|||
110
crates/oxc_syntax/src/identifier.rs
Normal file
110
crates/oxc_syntax/src/identifier.rs
Normal file
|
|
@ -0,0 +1,110 @@
|
|||
use unicode_id_start::{is_id_continue_unicode, is_id_start_unicode};
|
||||
|
||||
/// U+0000 NULL, exported under the name `EOF`.
// NOTE(review): presumably the lexer's end-of-input sentinel — confirm against callers.
pub const EOF: char = '\u{0}';

// 11.1 Unicode Format-Control Characters

/// U+200C ZERO WIDTH NON-JOINER (`<ZWNJ>` in the spec).
/// Specially permitted in identifiers.
pub const ZWNJ: char = '\u{200C}';

/// U+200D ZERO WIDTH JOINER (`<ZWJ>` in the spec).
/// Specially permitted in identifiers.
pub const ZWJ: char = '\u{200D}';

/// U+FEFF ZERO WIDTH NO-BREAK SPACE (`<ZWNBSP>` in the spec).
/// Considered a whitespace character in JS.
pub const ZWNBSP: char = '\u{FEFF}';

// 11.2 White Space

/// U+0009 CHARACTER TABULATION (`<TAB>`).
pub const TAB: char = '\u{09}';

/// U+000B VERTICAL TAB (`<VT>`).
pub const VT: char = '\u{0B}';

/// U+000C FORM FEED (`<FF>`).
pub const FF: char = '\u{0C}';

/// U+00A0 NON-BREAKING SPACE (`<NBSP>`).
pub const NBSP: char = '\u{A0}';
|
||||
|
||||
pub fn is_irregular_whitespace(c: char) -> bool {
|
||||
matches!(
|
||||
c,
|
||||
VT | FF | NBSP | ZWNBSP | '\u{85}' | '\u{1680}' | '\u{2000}'
|
||||
..='\u{200a}' | '\u{202f}' | '\u{205f}' | '\u{3000}'
|
||||
)
|
||||
}
|
||||
|
||||
// 11.3 Line Terminators

/// U+000A LINE FEED (`<LF>` in the spec).
pub const LF: char = '\u{0A}';

/// U+000D CARRIAGE RETURN (`<CR>` in the spec).
pub const CR: char = '\u{0D}';

/// U+2028 LINE SEPARATOR (`<LS>`).
pub const LS: char = '\u{2028}';

/// U+2029 PARAGRAPH SEPARATOR (`<PS>`).
pub const PS: char = '\u{2029}';
|
||||
|
||||
pub fn is_regular_line_terminator(c: char) -> bool {
|
||||
matches!(c, LF | CR)
|
||||
}
|
||||
|
||||
pub fn is_irregular_line_terminator(c: char) -> bool {
|
||||
matches!(c, LS | PS)
|
||||
}
|
||||
|
||||
pub fn is_line_terminator(c: char) -> bool {
|
||||
is_regular_line_terminator(c) || is_irregular_line_terminator(c)
|
||||
}
|
||||
|
||||
/// Wrapper that forces 64-byte alignment on the value it holds.
#[repr(C, align(64))]
pub struct Align64<T>(pub(crate) T);

/// Builds a 128-entry ASCII lookup table at compile time.
///
/// Entry `i` is `true` when byte `i` is an ASCII letter, `$` (36) or `_` (95).
/// ASCII digits are accepted as well when `allow_digits` is set.
const fn ascii_identifier_table(allow_digits: bool) -> [bool; 128] {
    let mut table = [false; 128];
    let mut i = 0;
    while i < table.len() {
        // `i` is always below 128 here, so the cast cannot truncate.
        let byte = i as u8;
        table[i] = byte.is_ascii_alphabetic()
            || byte == b'$'
            || byte == b'_'
            || (allow_digits && byte.is_ascii_digit());
        i += 1;
    }
    table
}

/// ASCII characters accepted at the start of an identifier:
/// `A`-`Z`, `a`-`z`, `$` (36) and `_` (95).
pub const ASCII_START: Align64<[bool; 128]> = Align64(ascii_identifier_table(false));

/// ASCII characters accepted after the first character of an identifier:
/// everything in [`ASCII_START`] plus `0`-`9`.
pub const ASCII_CONTINUE: Align64<[bool; 128]> = Align64(ascii_identifier_table(true));

/// Looks `c` up in [`ASCII_START`].
///
/// Returns `false` for any non-ASCII character instead of indexing past the end
/// of the 128-entry table (which would panic).
#[inline]
pub fn is_identifier_start_ascii(c: char) -> bool {
    c.is_ascii() && ASCII_START.0[c as usize]
}
|
||||
|
||||
/// Section 12.6 Detect `IdentifierStartChar`
|
||||
#[inline]
|
||||
pub fn is_identifier_start_all(c: char) -> bool {
|
||||
if c.is_ascii() {
|
||||
return is_identifier_start_ascii(c);
|
||||
}
|
||||
is_id_start_unicode(c)
|
||||
}
|
||||
|
||||
/// Section 12.6 Detect `IdentifierPartChar`
|
||||
/// NOTE 2: The nonterminal `IdentifierPart` derives _ via `UnicodeIDContinue`.
|
||||
#[inline]
|
||||
pub fn is_identifier_part(c: char) -> bool {
|
||||
if c.is_ascii() {
|
||||
return ASCII_CONTINUE.0[c as usize];
|
||||
}
|
||||
is_id_continue_unicode(c) || c == ZWNJ || c == ZWJ
|
||||
}
|
||||
|
|
@ -1,7 +1,10 @@
|
|||
//! Common code for JavaScript Syntax
|
||||
|
||||
pub mod identifier;
|
||||
pub mod operator;
|
||||
|
||||
pub use unicode_id_start;
|
||||
|
||||
#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)]
|
||||
pub enum NumberBase {
|
||||
Float,
|
||||
|
|
|
|||
Loading…
Reference in a new issue