diff --git a/Cargo.lock b/Cargo.lock index 3f0fdf03e..9312e7d7b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1602,6 +1602,7 @@ version = "0.0.0" dependencies = [ "criterion2", "oxc_allocator", + "oxc_ast", "oxc_codegen", "oxc_isolated_declarations", "oxc_linter", diff --git a/crates/oxc_parser/src/lexer/mod.rs b/crates/oxc_parser/src/lexer/mod.rs index 4527e3a9f..02397fdd2 100644 --- a/crates/oxc_parser/src/lexer/mod.rs +++ b/crates/oxc_parser/src/lexer/mod.rs @@ -136,6 +136,13 @@ impl<'a> Lexer<'a> { Self::new(allocator, source_text, source_type, unique) } + /// Get errors. + /// Only used in benchmarks. + #[cfg(feature = "benchmarking")] + pub fn errors(&self) -> &[OxcDiagnostic] { + &self.errors + } + /// Remaining string from `Source` pub fn remaining(&self) -> &'a str { self.source.remaining() diff --git a/tasks/benchmark/Cargo.toml b/tasks/benchmark/Cargo.toml index a3eceb789..d9e330f05 100644 --- a/tasks/benchmark/Cargo.toml +++ b/tasks/benchmark/Cargo.toml @@ -65,6 +65,7 @@ bench = false # with only the crates it needs, to speed up the builds [dependencies] oxc_allocator = { workspace = true, optional = true } +oxc_ast = { workspace = true, optional = true } oxc_codegen = { workspace = true, optional = true } oxc_isolated_declarations = { workspace = true, optional = true } oxc_linter = { workspace = true, optional = true } @@ -86,6 +87,7 @@ serde_json = { workspace = true, optional = true } [features] default = [ "dep:oxc_allocator", + "dep:oxc_ast", "dep:oxc_codegen", "dep:oxc_isolated_declarations", "dep:oxc_linter", @@ -103,7 +105,7 @@ codspeed_napi = ["criterion2/codspeed", "dep:serde", "dep:serde_json"] # Features for running each benchmark separately with minimum dependencies that benchmark needs. # e.g. `cargo build --release -p oxc_benchmark --bench parser --no-default-features --features parser` -lexer = ["dep:oxc_allocator", "dep:oxc_parser", "dep:oxc_span", "dep:oxc_tasks_common"] +lexer = ["dep:oxc_allocator", "dep:oxc_ast", "dep:oxc_parser", "dep:oxc_span", "dep:oxc_tasks_common"] parser = ["dep:oxc_allocator", "dep:oxc_parser", "dep:oxc_span", "dep:oxc_tasks_common"] transformer = [ "dep:oxc_allocator", diff --git a/tasks/benchmark/benches/lexer.rs b/tasks/benchmark/benches/lexer.rs index 234da608b..0ef9dcb7c 100644 --- a/tasks/benchmark/benches/lexer.rs +++ b/tasks/benchmark/benches/lexer.rs @@ -1,7 +1,11 @@ #![allow(clippy::disallowed_methods)] use oxc_allocator::Allocator; +use oxc_ast::{ast::*, Visit}; use oxc_benchmark::{criterion_group, criterion_main, BenchmarkId, Criterion}; -use oxc_parser::lexer::{Kind, Lexer}; +use oxc_parser::{ + lexer::{Kind, Lexer}, + Parser, +}; use oxc_span::SourceType; use oxc_tasks_common::{TestFile, TestFiles}; @@ -9,16 +13,25 @@ fn bench_lexer(criterion: &mut Criterion) { let mut group = criterion.benchmark_group("lexer"); // Lexer lacks awareness of JS grammar, so it gets confused by a few things without the parser - // driving it, notably escapes in regexps and template strings. - // So simplify the input for it, by removing backslashes and converting template strings to - // normal string literals. + // driving it. So simplify the input for it, by replacing these syntaxes with plain strings. + // This ensures lexing completes without generating any errors, which is more realistic. + // + // It's unfortunate that this benchmark doesn't exercise the code paths for these syntaxes, + // but this is the closest we can get to a realistic benchmark of lexer in isolation. + let mut allocator = Allocator::default(); let files = TestFiles::complicated() .files() .iter() - .map(|file| TestFile { - url: file.url.clone(), - file_name: file.file_name.clone(), - source_text: file.source_text.replace('\\', " ").replace('`', "'"), + .map(|file| { + let source_type = SourceType::from_path(&file.file_name).unwrap(); + + let mut cleaner = SourceCleaner::new(&file.source_text); + cleaner.clean(source_type, &allocator); + let source_text = cleaner.source_text; + + allocator.reset(); + + TestFile { url: file.url.clone(), file_name: file.file_name.clone(), source_text } }) .collect::>(); @@ -43,3 +56,96 @@ fn bench_lexer(criterion: &mut Criterion) { criterion_group!(lexer, bench_lexer); criterion_main!(lexer); + +/// Cleaner of source text. +/// +/// Purpose is to allow lexer to complete without any errors. +/// Usually sources Oxc is asked to parse will not produce lexer errors, and generating diagnostics is +/// fairly expensive, so is unrealistic for benchmarking purposes. +/// +/// Certain syntax will parse without error, but the lexer alone does not have the context to understand +/// they're fine. Notably this includes syntax where the lexer only consumes the first character and +/// parser would then call back into lexer to complete the job. +/// +/// So replace these syntaxes with strings so that lexer can complete without error: +/// * `RegExpLiteral` +/// * `TemplateLiteral` +/// * `JSXText` +struct SourceCleaner { + source_text: String, + replacements: Vec, +} + +struct Replacement { + span: Span, + text: String, +} + +impl SourceCleaner { + fn new(source_text: &str) -> Self { + Self { source_text: source_text.to_string(), replacements: vec![] } + } + + fn clean(&mut self, source_type: SourceType, allocator: &Allocator) { + // Parse + let source_text = self.source_text.clone(); + let parser_ret = Parser::new(allocator, &source_text, source_type).parse(); + assert!(parser_ret.errors.is_empty()); + let program = parser_ret.program; + + // Visit AST and compile list of replacements + self.visit_program(&program); + + // Make replacements + self.replacements.sort_unstable_by_key(|replacement| replacement.span); + + for replacement in self.replacements.iter().rev() { + let span = replacement.span; + self.source_text + .replace_range(span.start as usize..span.end as usize, &replacement.text); + } + + // Check lexer can lex it without any errors + let mut lexer = Lexer::new_for_benchmarks(allocator, &self.source_text, source_type); + while lexer.next_token().kind != Kind::Eof {} + assert!(lexer.errors().is_empty()); + } + + fn replace(&mut self, span: Span, text: String) { + self.replacements.push(Replacement { span, text }); + } +} + +impl<'a> Visit<'a> for SourceCleaner { + fn visit_reg_exp_literal(&mut self, regexp: &RegExpLiteral<'a>) { + let RegExpPattern::Raw(pattern) = regexp.regex.pattern else { unreachable!() }; + let span = Span::sized(regexp.span.start, u32::try_from(pattern.len()).unwrap() + 2); + let text = convert_to_string(pattern); + self.replace(span, text); + } + + fn visit_template_literal(&mut self, lit: &TemplateLiteral<'a>) { + let span = lit.span; + let text = span.shrink(1).source_text(&self.source_text); + let text = convert_to_string(text).replace('\n', " "); + self.replace(span, text); + } + + fn visit_jsx_text(&mut self, jsx_text: &JSXText<'a>) { + let span = jsx_text.span; + let text = span.source_text(&self.source_text); + let text = convert_to_string(text).replace('\n', " "); + self.replace(span, text); + } +} + +#[expect(clippy::naive_bytecount)] +fn convert_to_string(text: &str) -> String { + let single_quote_count = text.as_bytes().iter().filter(|&&b| b == b'\'').count(); + let double_quote_count = text.as_bytes().iter().filter(|&&b| b == b'"').count(); + + let (quote, other_quote) = + if single_quote_count <= double_quote_count { ('\'', "\"") } else { ('"', "'") }; + let text = text.replace(quote, other_quote); + format!("{quote}{text}{quote}") +}