oxc/Cargo.toml
overlookmotel d3a59f27f7
perf(parser): lex identifiers as bytes not chars (#2352)
This PR re-implements lexing identifiers with a fast path for the most common case - identifiers which are pure ASCII characters, using the new `Source` / `SourcePosition` APIs.

Lexing identifiers is a hot path, and accounts for the majority of the time the Lexer spends. The performance bump from this change is (if I do say so myself!) quite decent.

I've spent a lot of time tuning the implementation, which gained a further 10-15% on the Lexer benchmarks compared to my first, simpler attempt. Some of the design decisions, if they look odd, are likely motivated by gains in performance.

### Techniques

This implementation uses a few different strategies for performance:

* Search byte-by-byte, not char-by-char.
* Process batches of 32 bytes at a time to reduce bounds checks.
* Mark uncommon paths `#[cold]`.

### Structure

The implementation is built in 3 layers:

1. ASCII characters only.
2. ASCII and Unicode characters.
3. `\` escape sequences (and all the above).

`identifier_name_handler` starts at the top layer, and is optimized for consuming ASCII as fast as possible. Each "layer" is considered more uncommon than the previous, and dropping down a layer is a de-opt.

I'm assuming that 95%+ of JavaScript code does not include either Unicode characters or escapes in identifiers, so the speed of the fast path is prioritised.

That said, once a Unicode character is encountered, the next layer does expect to find further Unicode characters, rather than de-opting over and over again. If an identifier *starts* with a Unicode character, it enters the code straight on the 2nd layer, so is not penalised by going through a `#[cold]` boundary. Lexing Unicode is never going to be as fast as ASCII, but still I felt it was important not to penalise it unnecessarily, so as not to be Anglo-centric.

### ASCII search macro

The main ASCII search is implemented as a macro. I found that, for reasons I don't understand, it's significantly faster to have all the code in a single function, even compared to multiple functions marked `#[inline]` or `#[inline(always)]`. The fastest implementation also requires some code to be repeated twice, which is nicer to do with a macro.

This macro, and the `ByteMatchTable` types that go with it, are designed to be re-usable. Next step will be to apply them for whitespace and strings, which should be fairly simple.

Searching in batches of 32 bytes is also designed to be forward-compatible with SIMD.

### Bye bye `AutoCow`

`AutoCow` is removed. Instead, a string-builder is only created if it's needed, when a `\` escape is first encountered. The string builder is also more efficient than `AutoCow` was, as it copies bytes in chunks, rather than 1-by-1.

This won't make much difference for identifiers, as escapes are so rare anyway, but this same technique can be used for strings, where they're more common.
2024-02-09 12:01:30 +08:00

158 lines
7.1 KiB
TOML

[workspace]
# Use version 2 of Cargo's feature resolver.
resolver = "2"
# Every directory matched by these globs is a workspace member.
members = [
    "crates/*",
    "tasks/*",
    "napi/*",
    "wasm/*",
]
# Matched by the `tasks/*` glob above, so it has to be excluded explicitly.
exclude = ["tasks/lint_rules"]
[workspace.package]
# Package metadata declared once here; member crates can inherit a key
# with `<key>.workspace = true`.
authors = [
    "Boshen <boshenc@gmail.com>",
    "Oxc contributors",
]
categories = [
    "development-tools",
    "web-programming",
    "compilers",
]
description = "A collection of JavaScript tools written in Rust."
edition = "2021"
homepage = "https://oxc-project.github.io"
keywords = [
    "JavaScript",
    "TypeScript",
    "parser",
    "linter",
    "minifier",
]
license = "MIT"
repository = "https://github.com/oxc-project/oxc"
# NOTE(review): workspace inheritance needs Cargo 1.64+ and the
# `[workspace.lints]` tables in this file are only honoured by Cargo 1.74+,
# so 1.60 is probably stale - confirm the real MSRV before changing it.
rust-version = "1.60"
[workspace.lints.rust]
[workspace.lints.clippy]
# Lint groups are given `priority = -1` so that the individual lint levels in this
# table always override them; entries sharing a priority are applied in an
# unspecified order because the order of keys in this table is ignored.
all = { level = "warn", priority = -1 }
# restriction
dbg_macro = "warn"
todo = "warn"
unimplemented = "warn"
# The explicitness of this rule removes confusion around `clone`.
# It improves readability and discourages mindless `clone` calls and accidental heap allocations.
clone_on_ref_ptr = "warn"
# `self_named_module_files` and `mod_module_files` are mutually exclusive;
# `mod.rs` files are preferred here for better fuzzy searches on module entries.
self_named_module_files = "warn" # "-Wclippy::mod_module_files"
empty_drop = "warn"
empty_structs_with_brackets = "warn"
exit = "warn"
filetype_is_file = "warn"
get_unwrap = "warn"
impl_trait_in_params = "warn"
rc_buffer = "warn"
rc_mutex = "warn"
rest_pat_in_fully_bound_structs = "warn"
unnecessary_safety_comment = "warn"
undocumented_unsafe_blocks = "warn"
# I want to write the best Rust code, so pedantic is enabled.
# We should only disable rules globally if they are false positives, chaotic, or do not make sense.
pedantic = { level = "warn", priority = -1 }
# Allowed rules
# pedantic
# This rule is too pedantic; I don't want to force it because naming things is hard.
module_name_repetitions = "allow"
# All triggers are mostly ignored in our codebase, so this is ignored globally.
struct_excessive_bools = "allow"
too_many_lines = "allow"
# #[must_use] creates too much noise for this codebase; it does not add much value except nagging
# the programmer to add a #[must_use] after clippy has been run.
# Having #[must_use] everywhere also hinders readability.
must_use_candidate = "allow"
# used_underscore_binding= "allow"
doc_markdown = "allow"
# nursery
# `const` functions do not make sense for our project because this is not a `const` library.
# This rule also confuses newcomers and forces them to add `const` blindly without any reason.
missing_const_for_fn = "allow"
[workspace.dependencies]
# Workspace crates with `publish = true`: each carries a version next to its path.
oxc = { version = "0.6.0", path = "crates/oxc" }
oxc_allocator = { version = "0.6.0", path = "crates/oxc_allocator" }
oxc_ast = { version = "0.6.0", path = "crates/oxc_ast" }
oxc_codegen = { version = "0.6.0", path = "crates/oxc_codegen" }
oxc_diagnostics = { version = "0.6.0", path = "crates/oxc_diagnostics" }
oxc_index = { version = "0.6.0", path = "crates/oxc_index" }
oxc_minifier = { version = "0.6.0", path = "crates/oxc_minifier" }
oxc_parser = { version = "0.6.0", path = "crates/oxc_parser" }
oxc_semantic = { version = "0.6.0", path = "crates/oxc_semantic" }
oxc_span = { version = "0.6.0", path = "crates/oxc_span" }
oxc_syntax = { version = "0.6.0", path = "crates/oxc_syntax" }
oxc_transformer = { version = "0.6.0", path = "crates/oxc_transformer" }
# Workspace crates with `publish = false`: referenced by path only.
oxc_language_server = { path = "crates/oxc_language_server" }
oxc_linter = { path = "crates/oxc_linter" }
oxc_macros = { path = "crates/oxc_macros" }
oxc_prettier = { path = "crates/oxc_prettier" }
oxc_tasks_common = { path = "tasks/common" }
oxc_type_synthesis = { path = "crates/oxc_type_synthesis" }
# Third-party crates, sorted alphabetically.
assert-unchecked = { version = "0.1.2" }
bitflags = { version = "2.4.2" }
bpaf = { version = "0.9.9" }
bumpalo = { version = "3.14.0" }
codspeed-criterion-compat = { version = "2.3.3", default-features = false }
convert_case = { version = "0.6.0" }
criterion = { version = "0.5.1", default-features = false }
crossbeam-channel = { version = "0.5.11" }
dashmap = { version = "5.5.3" }
flate2 = { version = "1.0.28" }
futures = { version = "0.3.30" }
glob = { version = "0.3.1" }
ignore = { version = "0.4.22" }
index_vec = { version = "0.1.3" }
indexmap = { version = "2.2.2" }
insta = { version = "1.34.0", features = ["glob"] }
itertools = { version = "0.12.1" }
jemallocator = { version = "0.5.4" }
language-tags = { version = "0.3.2" }
lazy_static = { version = "1.4.0" }
miette = { version = "5.10.0", features = ["fancy-no-backtrace"] }
mimalloc = { version = "0.1.39" }
mime_guess = { version = "2.0.4" }
num-bigint = { version = "0.4.4" }
num-traits = { version = "0.2.17" }
phf = { version = "0.11" }
pico-args = { version = "0.5.0" }
proc-macro2 = { version = "1.0.78" }
project-root = { version = "0.2.2" }
quote = { version = "1.0.35" }
rayon = { version = "1.8.1" }
regex = { version = "1.10.3" }
ropey = { version = "1.6.1" }
rustc-hash = { version = "1.1.0", default-features = false, features = ["std"] }
ryu-js = { version = "1.0.0" }
seq-macro = { version = "0.3.5" }
serde = { version = "1.0.196" }
serde-wasm-bindgen = { version = "0.6.3" }
serde_json = { version = "1.0.113" }
stacker = { version = "0.1.15" }
static_assertions = { version = "1.1.0" }
# Pinned to an exact version with `=`.
syn = { version = "=1.0.109" }
thiserror = { version = "1.0.56" }
tokio = { version = "1" }
tower-lsp = { version = "0.20.0", features = ["proposed"] }
tracing = { version = "0.1" }
tracing-subscriber = { version = "0.3" }
tsify = { version = "0.4.5" }
unicode-id-start = { version = "1.1.2" }
ureq = { version = "2.9.1", default-features = false, features = ["tls", "json"] }
url = { version = "2.5.0" }
walkdir = { version = "2.4.0" }
wasm-bindgen = { version = "0.2" }
[profile.release]
# Every setting is spelled out explicitly for clarity,
# picking the best option for performance each time.
opt-level = 3
lto = "fat"
codegen-units = 1
strip = "symbols"
debug = false
# Let it crash and force ourselves to write safe Rust.
panic = "abort"
# Per-package override: `oxc_wasm` uses opt-level "z" (optimize for binary size)
# instead of the speed-oriented level set for the rest of the release profile.
[profile.release.package.oxc_wasm]
opt-level = "z"
# Same settings as `release`, except that symbols are not stripped and debug info is emitted.
# Select it with the `--profile release-debug` flag,
# e.g. `cargo build --profile release-debug`.
[profile.release-debug]
inherits = "release"
debug = true
strip = false