refactor(regular_expression): Misc fixes (#6234)

Preparation for #6141

- Keep `enum` sizes small + add size assertion tests
- Arrange AST related directories
- Renaming
This commit is contained in:
leaysgur 2024-10-02 13:32:29 +00:00
parent 1c31932f03
commit acab777c0a
11 changed files with 165 additions and 105 deletions

View file

@ -1445,7 +1445,7 @@ const _: () = {
assert!(offset_of!(Alternative, span) == 0usize);
assert!(offset_of!(Alternative, body) == 8usize);
assert!(size_of::<Term>() == 24usize);
assert!(size_of::<Term>() == 16usize);
assert!(align_of::<Term>() == 8usize);
assert!(size_of::<BoundaryAssertion>() == 12usize);
@ -1465,7 +1465,7 @@ const _: () = {
assert!(size_of::<LookAroundAssertionKind>() == 1usize);
assert!(align_of::<LookAroundAssertionKind>() == 1usize);
assert!(size_of::<Quantifier>() == 64usize);
assert!(size_of::<Quantifier>() == 56usize);
assert!(align_of::<Quantifier>() == 8usize);
assert!(offset_of!(Quantifier, span) == 0usize);
assert!(offset_of!(Quantifier, min) == 8usize);
@ -1513,7 +1513,7 @@ const _: () = {
assert!(size_of::<CharacterClassContentsKind>() == 1usize);
assert!(align_of::<CharacterClassContentsKind>() == 1usize);
assert!(size_of::<CharacterClassContents>() == 24usize);
assert!(size_of::<CharacterClassContents>() == 16usize);
assert!(align_of::<CharacterClassContents>() == 8usize);
assert!(size_of::<CharacterClassRange>() == 40usize);
@ -3000,7 +3000,7 @@ const _: () = {
assert!(offset_of!(Alternative, span) == 0usize);
assert!(offset_of!(Alternative, body) == 8usize);
assert!(size_of::<Term>() == 20usize);
assert!(size_of::<Term>() == 12usize);
assert!(align_of::<Term>() == 4usize);
assert!(size_of::<BoundaryAssertion>() == 12usize);
@ -3020,7 +3020,7 @@ const _: () = {
assert!(size_of::<LookAroundAssertionKind>() == 1usize);
assert!(align_of::<LookAroundAssertionKind>() == 1usize);
assert!(size_of::<Quantifier>() == 56usize);
assert!(size_of::<Quantifier>() == 48usize);
assert!(align_of::<Quantifier>() == 8usize);
assert!(offset_of!(Quantifier, span) == 0usize);
assert!(offset_of!(Quantifier, min) == 8usize);
@ -3068,7 +3068,7 @@ const _: () = {
assert!(size_of::<CharacterClassContentsKind>() == 1usize);
assert!(align_of::<CharacterClassContentsKind>() == 1usize);
assert!(size_of::<CharacterClassContents>() == 20usize);
assert!(size_of::<CharacterClassContents>() == 8usize);
assert!(align_of::<CharacterClassContents>() == 4usize);
assert!(size_of::<CharacterClassRange>() == 40usize);

View file

@ -1,7 +1,3 @@
// NB: `#[span]`, `#[scope(...)]`,`#[visit(...)]` and `#[generate_derive(...)]` do NOT do anything to the code.
// They are purely markers for codegen used in `tasks/ast_tools` and `crates/oxc_traverse/scripts`. See docs in those crates.
// Read [`macro@oxc_ast_macros::ast`] for more information.
// Silence erroneous warnings from Rust Analyser for `#[derive(Tsify)]`
#![allow(non_snake_case)]
@ -76,19 +72,19 @@ pub struct Alternative<'a> {
#[cfg_attr(feature = "serialize", derive(Serialize, Tsify))]
pub enum Term<'a> {
// Assertion, QuantifiableAssertion
BoundaryAssertion(BoundaryAssertion) = 0,
BoundaryAssertion(Box<'a, BoundaryAssertion>) = 0,
LookAroundAssertion(Box<'a, LookAroundAssertion<'a>>) = 1,
// Quantifier
Quantifier(Box<'a, Quantifier<'a>>) = 2,
// Atom, ExtendedAtom
Character(Character) = 3,
Character(Box<'a, Character>) = 3,
Dot(Dot) = 4,
CharacterClassEscape(CharacterClassEscape) = 5,
CharacterClassEscape(Box<'a, CharacterClassEscape>) = 5,
UnicodePropertyEscape(Box<'a, UnicodePropertyEscape<'a>>) = 6,
CharacterClass(Box<'a, CharacterClass<'a>>) = 7,
CapturingGroup(Box<'a, CapturingGroup<'a>>) = 8,
IgnoreGroup(Box<'a, IgnoreGroup<'a>>) = 9,
IndexedReference(IndexedReference) = 10,
IndexedReference(Box<'a, IndexedReference>) = 10,
NamedReference(Box<'a, NamedReference<'a>>) = 11,
}
@ -286,9 +282,9 @@ pub enum CharacterClassContentsKind {
#[cfg_attr(feature = "serialize", derive(Serialize, Tsify))]
pub enum CharacterClassContents<'a> {
CharacterClassRange(Box<'a, CharacterClassRange>) = 0,
CharacterClassEscape(CharacterClassEscape) = 1,
CharacterClassEscape(Box<'a, CharacterClassEscape>) = 1,
UnicodePropertyEscape(Box<'a, UnicodePropertyEscape<'a>>) = 2,
Character(Character) = 3,
Character(Box<'a, Character>) = 3,
/// `UnicodeSetsMode` only
NestedCharacterClass(Box<'a, CharacterClass<'a>>) = 4,
/// `UnicodeSetsMode` only
@ -404,3 +400,13 @@ pub struct NamedReference<'a> {
pub span: Span,
pub name: Atom<'a>,
}
// Pins the in-memory size of `Term` and `CharacterClassContents` on 64-bit targets,
// so a layout regression (e.g. a large variant stored inline) fails this test.
// See `oxc_ast/src/lib.rs` for the details
#[cfg(target_pointer_width = "64")]
#[test]
fn size_asserts() {
    use std::mem::size_of;

    // `assert_eq!` prints the actual size on failure, unlike a bare `assert!(a == b)`.
    assert_eq!(size_of::<Term>(), 16);
    assert_eq!(size_of::<CharacterClassContents>(), 16);
}

View file

@ -78,17 +78,17 @@ impl<'a> Display for Alternative<'a> {
impl<'a> Display for Term<'a> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
Self::BoundaryAssertion(it) => write!(f, "{it}"),
Self::BoundaryAssertion(it) => write!(f, "{}", it.as_ref()),
Self::LookAroundAssertion(it) => write!(f, "{}", it.as_ref()),
Self::Quantifier(it) => write!(f, "{}", it.as_ref()),
Self::Character(it) => write!(f, "{it}"),
Self::Character(it) => write!(f, "{}", it.as_ref()),
Self::Dot(it) => write!(f, "{it}"),
Self::CharacterClassEscape(it) => write!(f, "{it}"),
Self::CharacterClassEscape(it) => write!(f, "{}", it.as_ref()),
Self::UnicodePropertyEscape(it) => write!(f, "{}", it.as_ref()),
Self::CharacterClass(it) => write!(f, "{}", it.as_ref()),
Self::CapturingGroup(it) => write!(f, "{}", it.as_ref()),
Self::IgnoreGroup(it) => write!(f, "{}", it.as_ref()),
Self::IndexedReference(it) => write!(f, "{it}"),
Self::IndexedReference(it) => write!(f, "{}", it.as_ref()),
Self::NamedReference(it) => write!(f, "{}", it.as_ref()),
}
}
@ -246,9 +246,9 @@ impl<'a> Display for CharacterClassContents<'a> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
Self::CharacterClassRange(it) => write!(f, "{}", it.as_ref()),
Self::CharacterClassEscape(it) => write!(f, "{it}"),
Self::CharacterClassEscape(it) => write!(f, "{}", it.as_ref()),
Self::UnicodePropertyEscape(it) => write!(f, "{}", it.as_ref()),
Self::Character(it) => write!(f, "{it}"),
Self::Character(it) => write!(f, "{}", it.as_ref()),
Self::NestedCharacterClass(it) => write!(f, "{}", it.as_ref()),
Self::ClassStringDisjunction(it) => write!(f, "{}", it.as_ref()),
}

View file

@ -0,0 +1,2 @@
mod display;
pub mod visit;

View file

@ -1,3 +1,6 @@
// NOTE: For now, this file is implemented by hand for convenience.
// But like `oxc_ast`, this should be generated by `tasks/ast_tools` in the future.
#![allow(unused_variables, clippy::wildcard_imports)]
use oxc_span::{GetSpan, Span};
use walk::walk_pattern;

View file

@ -7,7 +7,7 @@ use crate::{
body_parser::{reader::Reader, state::State, unicode, unicode_property},
diagnostics,
options::ParserOptions,
span::SpanFactory,
span_factory::SpanFactory,
surrogate_pair,
};
@ -255,10 +255,13 @@ impl<'a> PatternParser<'a> {
};
if let Some(kind) = kind {
return Ok(Some(ast::Term::BoundaryAssertion(ast::BoundaryAssertion {
span: self.span_factory.create(span_start, self.reader.offset()),
kind,
})));
return Ok(Some(ast::Term::BoundaryAssertion(Box::new_in(
ast::BoundaryAssertion {
span: self.span_factory.create(span_start, self.reader.offset()),
kind,
},
self.allocator,
))));
}
let kind = if self.reader.eat3('(', '?', '=') {
@ -312,11 +315,14 @@ impl<'a> PatternParser<'a> {
if let Some(cp) = self.reader.peek().filter(|&cp| !unicode::is_syntax_character(cp)) {
self.reader.advance();
return Ok(Some(ast::Term::Character(ast::Character {
span: self.span_factory.create(span_start, self.reader.offset()),
kind: ast::CharacterKind::Symbol,
value: cp,
})));
return Ok(Some(ast::Term::Character(Box::new_in(
ast::Character {
span: self.span_factory.create(span_start, self.reader.offset()),
kind: ast::CharacterKind::Symbol,
value: cp,
},
self.allocator,
))));
}
// .
@ -387,11 +393,14 @@ impl<'a> PatternParser<'a> {
// \ [lookahead = c]
if self.reader.peek().filter(|&cp| cp == 'c' as u32).is_some() {
return Ok(Some(ast::Term::Character(ast::Character {
span: self.span_factory.create(span_start, self.reader.offset()),
kind: ast::CharacterKind::Symbol,
value: '\\' as u32,
})));
return Ok(Some(ast::Term::Character(Box::new_in(
ast::Character {
span: self.span_factory.create(span_start, self.reader.offset()),
kind: ast::CharacterKind::Symbol,
value: '\\' as u32,
},
self.allocator,
))));
}
return Err(diagnostics::invalid_extended_atom_escape(
@ -434,11 +443,14 @@ impl<'a> PatternParser<'a> {
// ExtendedPatternCharacter
if let Some(cp) = self.consume_extended_pattern_character() {
return Ok(Some(ast::Term::Character(ast::Character {
span: self.span_factory.create(span_start, self.reader.offset()),
kind: ast::CharacterKind::Symbol,
value: cp,
})));
return Ok(Some(ast::Term::Character(Box::new_in(
ast::Character {
span: self.span_factory.create(span_start, self.reader.offset()),
kind: ast::CharacterKind::Symbol,
value: cp,
},
self.allocator,
))));
}
Ok(None)
@ -467,17 +479,23 @@ impl<'a> PatternParser<'a> {
));
}
return Ok(Some(ast::Term::IndexedReference(ast::IndexedReference {
span: self.span_factory.create(span_start, self.reader.offset()),
index,
})));
return Ok(Some(ast::Term::IndexedReference(Box::new_in(
ast::IndexedReference {
span: self.span_factory.create(span_start, self.reader.offset()),
index,
},
self.allocator,
))));
}
if index <= self.state.num_of_capturing_groups {
return Ok(Some(ast::Term::IndexedReference(ast::IndexedReference {
span: self.span_factory.create(span_start, self.reader.offset()),
index,
})));
return Ok(Some(ast::Term::IndexedReference(Box::new_in(
ast::IndexedReference {
span: self.span_factory.create(span_start, self.reader.offset()),
index,
},
self.allocator,
))));
}
self.reader.rewind(checkpoint);
@ -485,7 +503,10 @@ impl<'a> PatternParser<'a> {
// CharacterClassEscape: \d, \p{...}
if let Some(character_class_escape) = self.parse_character_class_escape(span_start) {
return Ok(Some(ast::Term::CharacterClassEscape(character_class_escape)));
return Ok(Some(ast::Term::CharacterClassEscape(Box::new_in(
character_class_escape,
self.allocator,
))));
}
if let Some(unicode_property_escape) =
self.parse_character_class_escape_unicode(span_start)?
@ -498,7 +519,7 @@ impl<'a> PatternParser<'a> {
// CharacterEscape: \n, \cM, \0, etc...
if let Some(character_escape) = self.parse_character_escape(span_start)? {
return Ok(Some(ast::Term::Character(character_escape)));
return Ok(Some(ast::Term::Character(Box::new_in(character_escape, self.allocator))));
}
// k GroupName: \k<name> means named reference
@ -820,11 +841,14 @@ impl<'a> PatternParser<'a> {
continue;
}
let dash = ast::CharacterClassContents::Character(ast::Character {
span: self.span_factory.create(span_start, self.reader.offset()),
kind: ast::CharacterKind::Symbol,
value: '-' as u32,
});
let dash = ast::CharacterClassContents::Character(Box::new_in(
ast::Character {
span: self.span_factory.create(span_start, self.reader.offset()),
kind: ast::CharacterKind::Symbol,
value: '-' as u32,
},
self.allocator,
));
let Some(class_atom_to) = self.parse_class_atom()? else {
// ClassAtom[?UnicodeMode] NonemptyClassRangesNoDash[?UnicodeMode]
@ -855,8 +879,8 @@ impl<'a> PatternParser<'a> {
body.push(ast::CharacterClassContents::CharacterClassRange(Box::new_in(
ast::CharacterClassRange {
span: from.span.merge(&to.span),
min: *from,
max: *to,
min: **from,
max: **to,
},
self.allocator,
)));
@ -895,11 +919,14 @@ impl<'a> PatternParser<'a> {
let span_start = self.reader.offset();
if self.reader.eat('-') {
return Ok(Some(ast::CharacterClassContents::Character(ast::Character {
span: self.span_factory.create(span_start, self.reader.offset()),
kind: ast::CharacterKind::Symbol,
value: '-' as u32,
})));
return Ok(Some(ast::CharacterClassContents::Character(Box::new_in(
ast::Character {
span: self.span_factory.create(span_start, self.reader.offset()),
kind: ast::CharacterKind::Symbol,
value: '-' as u32,
},
self.allocator,
))));
}
self.parse_class_atom_no_dash()
@ -922,20 +949,26 @@ impl<'a> PatternParser<'a> {
{
self.reader.advance();
return Ok(Some(ast::CharacterClassContents::Character(ast::Character {
span: self.span_factory.create(span_start, self.reader.offset()),
kind: ast::CharacterKind::Symbol,
value: cp,
})));
return Ok(Some(ast::CharacterClassContents::Character(Box::new_in(
ast::Character {
span: self.span_factory.create(span_start, self.reader.offset()),
kind: ast::CharacterKind::Symbol,
value: cp,
},
self.allocator,
))));
}
if self.reader.eat('\\') {
if self.reader.peek().filter(|&cp| cp == 'c' as u32).is_some() {
return Ok(Some(ast::CharacterClassContents::Character(ast::Character {
span: self.span_factory.create(span_start, self.reader.offset()),
kind: ast::CharacterKind::Symbol,
value: '\\' as u32,
})));
return Ok(Some(ast::CharacterClassContents::Character(Box::new_in(
ast::Character {
span: self.span_factory.create(span_start, self.reader.offset()),
kind: ast::CharacterKind::Symbol,
value: '\\' as u32,
},
self.allocator,
))));
}
if let Some(class_escape) = self.parse_class_escape(span_start)? {
@ -969,20 +1002,26 @@ impl<'a> PatternParser<'a> {
) -> Result<Option<ast::CharacterClassContents<'a>>> {
// b
if self.reader.eat('b') {
return Ok(Some(ast::CharacterClassContents::Character(ast::Character {
span: self.span_factory.create(span_start, self.reader.offset()),
kind: ast::CharacterKind::SingleEscape,
value: 0x08,
})));
return Ok(Some(ast::CharacterClassContents::Character(Box::new_in(
ast::Character {
span: self.span_factory.create(span_start, self.reader.offset()),
kind: ast::CharacterKind::SingleEscape,
value: 0x08,
},
self.allocator,
))));
}
// [+UnicodeMode] -
if self.state.unicode_mode && self.reader.eat('-') {
return Ok(Some(ast::CharacterClassContents::Character(ast::Character {
span: self.span_factory.create(span_start, self.reader.offset()),
kind: ast::CharacterKind::SingleEscape,
value: '-' as u32,
})));
return Ok(Some(ast::CharacterClassContents::Character(Box::new_in(
ast::Character {
span: self.span_factory.create(span_start, self.reader.offset()),
kind: ast::CharacterKind::SingleEscape,
value: '-' as u32,
},
self.allocator,
))));
}
// [~UnicodeMode] c ClassControlLetter
@ -997,11 +1036,14 @@ impl<'a> PatternParser<'a> {
{
self.reader.advance();
return Ok(Some(ast::CharacterClassContents::Character(ast::Character {
span: self.span_factory.create(span_start, self.reader.offset()),
kind: ast::CharacterKind::ControlLetter,
value: cp,
})));
return Ok(Some(ast::CharacterClassContents::Character(Box::new_in(
ast::Character {
span: self.span_factory.create(span_start, self.reader.offset()),
kind: ast::CharacterKind::ControlLetter,
value: cp,
},
self.allocator,
))));
}
self.reader.rewind(checkpoint);
@ -1010,9 +1052,10 @@ impl<'a> PatternParser<'a> {
// CharacterClassEscape[?UnicodeMode]
if let Some(character_class_escape) = self.parse_character_class_escape(span_start) {
return Ok(Some(ast::CharacterClassContents::CharacterClassEscape(
return Ok(Some(ast::CharacterClassContents::CharacterClassEscape(Box::new_in(
character_class_escape,
)));
self.allocator,
))));
}
if let Some(unicode_property_escape) =
self.parse_character_class_escape_unicode(span_start)?
@ -1025,7 +1068,10 @@ impl<'a> PatternParser<'a> {
// CharacterEscape[?UnicodeMode, ?NamedCaptureGroups]
if let Some(character_escape) = self.parse_character_escape(span_start)? {
return Ok(Some(ast::CharacterClassContents::Character(character_escape)));
return Ok(Some(ast::CharacterClassContents::Character(Box::new_in(
character_escape,
self.allocator,
))));
}
Ok(None)
@ -1246,7 +1292,10 @@ impl<'a> PatternParser<'a> {
}
if let Some(class_set_character) = self.parse_class_set_character()? {
return Ok(Some(ast::CharacterClassContents::Character(class_set_character)));
return Ok(Some(ast::CharacterClassContents::Character(Box::new_in(
class_set_character,
self.allocator,
))));
}
Ok(None)
@ -1301,9 +1350,10 @@ impl<'a> PatternParser<'a> {
let checkpoint = self.reader.checkpoint();
if self.reader.eat('\\') {
if let Some(character_class_escape) = self.parse_character_class_escape(span_start) {
return Ok(Some(ast::CharacterClassContents::CharacterClassEscape(
return Ok(Some(ast::CharacterClassContents::CharacterClassEscape(Box::new_in(
character_class_escape,
)));
self.allocator,
))));
}
if let Some(unicode_property_escape) =
self.parse_character_class_escape_unicode(span_start)?

View file

@ -1,6 +1,6 @@
use rustc_hash::FxHashSet;
use super::reader::Reader;
use crate::body_parser::reader::Reader;
/// Currently, all properties are read-only from outside of this module.
/// Even inside this module, they are not changed after initialization.

View file

@ -2,7 +2,7 @@ use oxc_allocator::Allocator;
use oxc_diagnostics::Result;
use rustc_hash::FxHashSet;
use crate::{ast, diagnostics, options::ParserOptions, span::SpanFactory};
use crate::{ast, diagnostics, options::ParserOptions, span_factory::SpanFactory};
pub struct FlagsParser<'a> {
source_text: &'a str,

View file

@ -1,15 +1,13 @@
#![allow(clippy::missing_errors_doc)]
pub mod ast;
mod ast_impl;
mod body_parser;
mod diagnostics;
mod display;
mod flag_parser;
mod flags_parser;
mod literal_parser;
mod options;
mod span;
mod span_factory;
mod surrogate_pair;
pub mod visit;
mod generated {
mod derive_clone_in;
@ -17,7 +15,8 @@ mod generated {
mod derive_content_hash;
}
pub mod ast;
pub use crate::{
body_parser::PatternParser, flag_parser::FlagsParser, literal_parser::Parser,
ast_impl::visit, body_parser::PatternParser, flags_parser::FlagsParser, literal_parser::Parser,
options::ParserOptions,
};

View file

@ -2,8 +2,8 @@ use oxc_allocator::Allocator;
use oxc_diagnostics::Result;
use crate::{
ast, body_parser::PatternParser, diagnostics, flag_parser::FlagsParser, options::ParserOptions,
span::SpanFactory,
ast, body_parser::PatternParser, diagnostics, flags_parser::FlagsParser,
options::ParserOptions, span_factory::SpanFactory,
};
/// LiteralParser