refactor(syntax): move identifier related code from lexer to syntax

This commit is contained in:
Boshen 2023-05-27 11:00:02 +08:00
parent 8ea9e38ee5
commit 24f11a4ea8
No known key found for this signature in database
GPG key ID: 9C7A8C8AB22BEBD1
7 changed files with 136 additions and 128 deletions

2
Cargo.lock generated
View file

@ -1201,7 +1201,6 @@ dependencies = [
"oxc_syntax",
"rustc-hash",
"serde_json",
"unicode-id-start",
]
[[package]]
@ -1247,6 +1246,7 @@ name = "oxc_syntax"
version = "0.0.0"
dependencies = [
"serde",
"unicode-id-start",
]
[[package]]

View file

@ -23,7 +23,6 @@ oxc_index = { workspace = true }
bitflags = { workspace = true }
rustc-hash = { workspace = true }
unicode-id-start = { workspace = true }
num-bigint = { workspace = true }
[dev-dependencies]

View file

@ -1,114 +1,5 @@
use super::Kind;
/// Sentinel character U+0000 (NUL).
/// NOTE(review): presumably what the lexer yields once the input is exhausted — confirm at the call sites.
pub const EOF: char = '\u{0}';

// 11.1 Unicode Format-Control Characters

/// `<ZWNJ>`: U+200C ZERO WIDTH NON-JOINER.
/// One of the two format-control characters specially permitted in identifiers.
pub const ZWNJ: char = '\u{200C}';

/// `<ZWJ>`: U+200D ZERO WIDTH JOINER.
/// The other format-control character specially permitted in identifiers.
pub const ZWJ: char = '\u{200D}';

/// `<ZWNBSP>`: U+FEFF ZERO WIDTH NO-BREAK SPACE.
/// JavaScript considers it a whitespace character.
pub const ZWNBSP: char = '\u{FEFF}';

// 11.2 White Space

/// `<TAB>`: U+0009 CHARACTER TABULATION.
pub const TAB: char = '\t';

/// `<VT>`: U+000B VERTICAL TAB.
pub const VT: char = '\x0B';

/// `<FF>`: U+000C FORM FEED.
pub const FF: char = '\x0C';

/// `<NBSP>`: U+00A0 NON-BREAKING SPACE.
pub const NBSP: char = '\u{00A0}';
/// Returns `true` when `c` belongs to the "irregular" whitespace set:
/// `<VT>`, `<FF>`, `<NBSP>`, `<ZWNBSP>`, U+0085, U+1680, U+2000..=U+200A,
/// U+202F, U+205F or U+3000.
///
/// The plain space (U+0020) and `<TAB>` are deliberately not part of the set.
pub fn is_irregular_whitespace(c: char) -> bool {
    // The named constants declared earlier in this module.
    matches!(c, VT | FF | NBSP | ZWNBSP)
        // U+2000 EN QUAD through U+200A HAIR SPACE.
        || ('\u{2000}'..='\u{200a}').contains(&c)
        // NEXT LINE, OGHAM SPACE MARK, NARROW NO-BREAK SPACE,
        // MEDIUM MATHEMATICAL SPACE, IDEOGRAPHIC SPACE.
        || matches!(c, '\u{85}' | '\u{1680}' | '\u{202f}' | '\u{205f}' | '\u{3000}')
}
// 11.3 Line Terminators

/// `<LF>`: U+000A LINE FEED.
pub const LF: char = '\n';

/// `<CR>`: U+000D CARRIAGE RETURN.
pub const CR: char = '\r';

/// `<LS>`: U+2028 LINE SEPARATOR.
pub const LS: char = '\u{2028}';

/// `<PS>`: U+2029 PARAGRAPH SEPARATOR.
pub const PS: char = '\u{2029}';
pub fn is_regular_line_terminator(c: char) -> bool {
matches!(c, LF | CR)
}
pub fn is_irregular_line_terminator(c: char) -> bool {
matches!(c, LS | PS)
}
/// Returns `true` when `c` is any line terminator, regular (`<LF>`, `<CR>`)
/// or irregular (`<LS>`, `<PS>`).
pub fn is_line_terminator(c: char) -> bool {
    // Try the regular pair first, then fall back to the irregular pair.
    if is_regular_line_terminator(c) {
        return true;
    }
    is_irregular_line_terminator(c)
}
// One-letter aliases so that each row of the lookup tables below holds 32 entries.
const T: bool = true;
const F: bool = false;
/// Wrapper that gives its contents a 64-byte alignment via `#[repr(C, align(64))]`.
/// The generic parameter `T` is a type and is unrelated to the `T` constant above.
/// NOTE(review): 64 is presumably chosen to match a cache line — confirm the intent.
#[repr(C, align(64))]
pub struct Align64<T>(pub(crate) T);
/// Lookup table indexed by ASCII code point (0..=127); an entry is `true` when the
/// character may start an identifier: `A`-`Z`, `a`-`z`, `$` (36) and `_` (95).
pub const ASCII_START: Align64<[bool; 128]> = Align64([
// 0..=31
F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
// 32..=63: only `$` (36)
F, F, F, F, T, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
// 64..=95: `A`-`Z` (65..=90) and `_` (95)
F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, T,
// 96..=127: `a`-`z` (97..=122)
F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, F,
]);
/// Lookup table indexed by ASCII code point (0..=127); an entry is `true` when the
/// character may continue an identifier: everything in `ASCII_START` (letters,
/// `$` (36), `_` (95)) plus the digits `0`-`9`.
pub const ASCII_CONTINUE: Align64<[bool; 128]> = Align64([
// 0..=31
F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
// 32..=63: `$` (36) and `0`-`9` (48..=57)
F, F, F, F, T, F, F, F, F, F, F, F, F, F, F, F, T, T, T, T, T, T, T, T, T, T, F, F, F, F, F, F,
// 64..=95: `A`-`Z` (65..=90) and `_` (95)
F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, T,
// 96..=127: `a`-`z` (97..=122)
F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, F,
]);
/// Returns `true` when `c` is an ASCII character that may start an identifier,
/// as recorded in the `ASCII_START` lookup table.
///
/// Non-ASCII input yields `false` rather than indexing past the end of the
/// 128-entry table and panicking; use [`is_identifier_start_all`] to classify
/// arbitrary characters.
#[inline]
pub fn is_identifier_start_ascii(c: char) -> bool {
    // `is_ascii` guarantees `c as usize < 128`, so the index is always in bounds.
    c.is_ascii() && ASCII_START.0[c as usize]
}
/// Section 12.6 Detect `IdentifierStartChar`
///
/// ASCII characters are answered by [`is_identifier_start_ascii`]; every other
/// character is delegated to `unicode_id_start::is_id_start_unicode`.
#[inline]
pub fn is_identifier_start_all(c: char) -> bool {
    if c.is_ascii() {
        is_identifier_start_ascii(c)
    } else {
        unicode_id_start::is_id_start_unicode(c)
    }
}
/// Section 12.6 Detect `IdentifierPartChar`
/// NOTE 2: The nonterminal `IdentifierPart` derives _ via `UnicodeIDContinue`.
#[inline]
pub fn is_identifier_part(c: char) -> bool {
if c.is_ascii() {
return ASCII_CONTINUE.0[c as usize];
}
unicode_id_start::is_id_continue_unicode(c) || c == ZWNJ || c == ZWJ
}
pub const SINGLE_CHAR_TOKENS: &[Kind; 128] = &[
/* 0 */ Kind::Undetermined,
/* 1 */ Kind::Undetermined,

View file

@ -19,16 +19,20 @@ use oxc_allocator::{Allocator, String};
use oxc_ast::ast::RegExpFlags;
use oxc_diagnostics::Error;
use oxc_span::{SourceType, Span};
use oxc_syntax::{
identifier::{
is_identifier_part, is_identifier_start_all, is_identifier_start_ascii,
is_irregular_line_terminator, is_irregular_whitespace, is_line_terminator, CR, EOF, FF, LF,
LS, PS, TAB, VT,
},
unicode_id_start::is_id_start_unicode,
};
use simd::{SkipMultilineComment, SkipWhitespace};
pub use token::{RegExp, Token, TokenValue};
pub use self::kind::Kind;
use self::{
constants::{
is_identifier_part, is_identifier_start_all, is_identifier_start_ascii,
is_irregular_line_terminator, is_irregular_whitespace, is_line_terminator, EOF,
SINGLE_CHAR_TOKENS,
},
constants::SINGLE_CHAR_TOKENS,
number::{parse_big_int, parse_float, parse_int},
string_builder::AutoCow,
trivia_builder::TriviaBuilder,
@ -429,7 +433,7 @@ impl<'a> Lexer<'a> {
self.identifier_unicode_escape_sequence(&mut builder, true);
self.identifier_name_or_keyword(builder)
}
c if unicode_id_start::is_id_start_unicode(c) => {
c if is_id_start_unicode(c) => {
builder.push_matching(c);
self.identifier_name_or_keyword(builder)
}
@ -1053,10 +1057,10 @@ impl<'a> Lexer<'a> {
}
return tail;
}
constants::CR => {
CR => {
builder.force_allocation_without_current_ascii_char(self);
if self.next_eq(constants::LF) {
builder.push_different(constants::LF);
if self.next_eq(LF) {
builder.push_different(LF);
}
}
'\\' => {
@ -1354,17 +1358,17 @@ impl<'a> Lexer<'a> {
}
Some(c) => match c {
// CharacterEscapeSequence
constants::LF | constants::LS | constants::PS => {}
constants::CR => {
self.next_eq(constants::LF);
LF | LS | PS => {}
CR => {
self.next_eq(LF);
}
'\'' | '"' | '\\' => text.push(c),
'b' => text.push('\u{8}'),
'f' => text.push(constants::FF),
'n' => text.push(constants::LF),
'r' => text.push(constants::CR),
't' => text.push(constants::TAB),
'v' => text.push(constants::VT),
'f' => text.push(FF),
'n' => text.push(LF),
'r' => text.push(CR),
't' => text.push(TAB),
'v' => text.push(VT),
// HexEscapeSequence
'x' => {
self.hex_digit()

View file

@ -15,4 +15,5 @@ default = []
serde = ["dep:serde"]
[dependencies]
unicode-id-start = { workspace = true }
serde = { workspace = true, features = ["derive"], optional = true }

View file

@ -0,0 +1,110 @@
use unicode_id_start::{is_id_continue_unicode, is_id_start_unicode};
/// Sentinel character U+0000 (NUL).
/// NOTE(review): presumably what the lexer yields once the input is exhausted — confirm at the call sites.
pub const EOF: char = '\u{0}';

// 11.1 Unicode Format-Control Characters

/// `<ZWNJ>`: U+200C ZERO WIDTH NON-JOINER.
/// One of the two format-control characters specially permitted in identifiers.
pub const ZWNJ: char = '\u{200C}';

/// `<ZWJ>`: U+200D ZERO WIDTH JOINER.
/// The other format-control character specially permitted in identifiers.
pub const ZWJ: char = '\u{200D}';

/// `<ZWNBSP>`: U+FEFF ZERO WIDTH NO-BREAK SPACE.
/// JavaScript considers it a whitespace character.
pub const ZWNBSP: char = '\u{FEFF}';

// 11.2 White Space

/// `<TAB>`: U+0009 CHARACTER TABULATION.
pub const TAB: char = '\t';

/// `<VT>`: U+000B VERTICAL TAB.
pub const VT: char = '\x0B';

/// `<FF>`: U+000C FORM FEED.
pub const FF: char = '\x0C';

/// `<NBSP>`: U+00A0 NON-BREAKING SPACE.
pub const NBSP: char = '\u{00A0}';
/// Returns `true` when `c` belongs to the "irregular" whitespace set:
/// `<VT>`, `<FF>`, `<NBSP>`, `<ZWNBSP>`, U+0085, U+1680, U+2000..=U+200A,
/// U+202F, U+205F or U+3000.
///
/// The plain space (U+0020) and `<TAB>` are deliberately not part of the set.
pub fn is_irregular_whitespace(c: char) -> bool {
    // The named constants declared earlier in this module.
    matches!(c, VT | FF | NBSP | ZWNBSP)
        // U+2000 EN QUAD through U+200A HAIR SPACE.
        || ('\u{2000}'..='\u{200a}').contains(&c)
        // NEXT LINE, OGHAM SPACE MARK, NARROW NO-BREAK SPACE,
        // MEDIUM MATHEMATICAL SPACE, IDEOGRAPHIC SPACE.
        || matches!(c, '\u{85}' | '\u{1680}' | '\u{202f}' | '\u{205f}' | '\u{3000}')
}
// 11.3 Line Terminators

/// `<LF>`: U+000A LINE FEED.
pub const LF: char = '\n';

/// `<CR>`: U+000D CARRIAGE RETURN.
pub const CR: char = '\r';

/// `<LS>`: U+2028 LINE SEPARATOR.
pub const LS: char = '\u{2028}';

/// `<PS>`: U+2029 PARAGRAPH SEPARATOR.
pub const PS: char = '\u{2029}';
pub fn is_regular_line_terminator(c: char) -> bool {
matches!(c, LF | CR)
}
pub fn is_irregular_line_terminator(c: char) -> bool {
matches!(c, LS | PS)
}
/// Returns `true` when `c` is any line terminator, regular (`<LF>`, `<CR>`)
/// or irregular (`<LS>`, `<PS>`).
pub fn is_line_terminator(c: char) -> bool {
    // Try the regular pair first, then fall back to the irregular pair.
    if is_regular_line_terminator(c) {
        return true;
    }
    is_irregular_line_terminator(c)
}
// One-letter aliases so that each row of the lookup tables below holds 32 entries.
const T: bool = true;
const F: bool = false;
/// Wrapper that gives its contents a 64-byte alignment via `#[repr(C, align(64))]`.
/// The generic parameter `T` is a type and is unrelated to the `T` constant above.
/// NOTE(review): 64 is presumably chosen to match a cache line — confirm the intent.
#[repr(C, align(64))]
pub struct Align64<T>(pub(crate) T);
/// Lookup table indexed by ASCII code point (0..=127); an entry is `true` when the
/// character may start an identifier: `A`-`Z`, `a`-`z`, `$` (36) and `_` (95).
pub const ASCII_START: Align64<[bool; 128]> = Align64([
// 0..=31
F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
// 32..=63: only `$` (36)
F, F, F, F, T, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
// 64..=95: `A`-`Z` (65..=90) and `_` (95)
F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, T,
// 96..=127: `a`-`z` (97..=122)
F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, F,
]);
/// Lookup table indexed by ASCII code point (0..=127); an entry is `true` when the
/// character may continue an identifier: everything in `ASCII_START` (letters,
/// `$` (36), `_` (95)) plus the digits `0`-`9`.
pub const ASCII_CONTINUE: Align64<[bool; 128]> = Align64([
// 0..=31
F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
// 32..=63: `$` (36) and `0`-`9` (48..=57)
F, F, F, F, T, F, F, F, F, F, F, F, F, F, F, F, T, T, T, T, T, T, T, T, T, T, F, F, F, F, F, F,
// 64..=95: `A`-`Z` (65..=90) and `_` (95)
F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, T,
// 96..=127: `a`-`z` (97..=122)
F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, F,
]);
/// Returns `true` when `c` is an ASCII character that may start an identifier,
/// as recorded in the `ASCII_START` lookup table.
///
/// Non-ASCII input yields `false` rather than indexing past the end of the
/// 128-entry table and panicking; use [`is_identifier_start_all`] to classify
/// arbitrary characters.
#[inline]
pub fn is_identifier_start_ascii(c: char) -> bool {
    // `is_ascii` guarantees `c as usize < 128`, so the index is always in bounds.
    c.is_ascii() && ASCII_START.0[c as usize]
}
/// Section 12.6 Detect `IdentifierStartChar`
///
/// ASCII characters are answered by [`is_identifier_start_ascii`]; every other
/// character is delegated to `is_id_start_unicode`.
#[inline]
pub fn is_identifier_start_all(c: char) -> bool {
    if c.is_ascii() {
        is_identifier_start_ascii(c)
    } else {
        is_id_start_unicode(c)
    }
}
/// Section 12.6 Detect `IdentifierPartChar`
/// NOTE 2: The nonterminal `IdentifierPart` derives _ via `UnicodeIDContinue`.
#[inline]
pub fn is_identifier_part(c: char) -> bool {
if c.is_ascii() {
return ASCII_CONTINUE.0[c as usize];
}
is_id_continue_unicode(c) || c == ZWNJ || c == ZWJ
}

View file

@ -1,7 +1,10 @@
//! Common code for JavaScript Syntax
pub mod identifier;
pub mod operator;
pub use unicode_id_start;
#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)]
pub enum NumberBase {
Float,