mirror of
https://github.com/danbulant/oxc
synced 2026-05-24 20:32:10 +00:00
perf(lexer): use bitshifting when parsing known integers (#3296)
## What This PR Does

- perf(lexer): use bit shifting when parsing hex, octal, and binary integers instead of `mul_add`-ing on `f64`s. Check out the difference in the generated assembly [here](https://godbolt.org/z/zMEKaeYzh)
- perf(lexer): skip redundant utf8 check when parsing BigInts
- refactor(lexer): remove `unsafe` usage (as per @overlookmotel's request [here](https://github.com/oxc-project/oxc/pull/3283#issuecomment-2111814598))
- test(lexer): add numeric parsing unit tests

I don't expect this PR to yield a large performance improvement, since the most common case (`Kind::Decimal`) is not affected. We could speed that case up as well, however, by splitting `Kind::Decimal` into `Kind::DecimalFloat` and `Kind::DecimalInt` when the lexer encounters a `.`
This commit is contained in:
parent
712ee0dde8
commit
27030b9eb4
4 changed files with 249 additions and 42 deletions
1
Cargo.lock
generated
1
Cargo.lock
generated
|
|
@ -1506,6 +1506,7 @@ dependencies = [
|
||||||
"bitflags 2.5.0",
|
"bitflags 2.5.0",
|
||||||
"memchr",
|
"memchr",
|
||||||
"num-bigint",
|
"num-bigint",
|
||||||
|
"num-traits",
|
||||||
"ouroboros",
|
"ouroboros",
|
||||||
"oxc_allocator",
|
"oxc_allocator",
|
||||||
"oxc_ast",
|
"oxc_ast",
|
||||||
|
|
|
||||||
|
|
@ -24,12 +24,13 @@ oxc_ast = { workspace = true }
|
||||||
oxc_syntax = { workspace = true }
|
oxc_syntax = { workspace = true }
|
||||||
oxc_diagnostics = { workspace = true }
|
oxc_diagnostics = { workspace = true }
|
||||||
|
|
||||||
static_assertions = { workspace = true }
|
|
||||||
assert-unchecked = { workspace = true }
|
assert-unchecked = { workspace = true }
|
||||||
bitflags = { workspace = true }
|
bitflags = { workspace = true }
|
||||||
rustc-hash = { workspace = true }
|
|
||||||
num-bigint = { workspace = true }
|
num-bigint = { workspace = true }
|
||||||
|
num-traits = { workspace = true }
|
||||||
|
rustc-hash = { workspace = true }
|
||||||
seq-macro = { workspace = true }
|
seq-macro = { workspace = true }
|
||||||
|
static_assertions = { workspace = true }
|
||||||
|
|
||||||
memchr = { workspace = true }
|
memchr = { workspace = true }
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -326,7 +326,7 @@ impl<'a> ParserImpl<'a> {
|
||||||
let token = self.cur_token();
|
let token = self.cur_token();
|
||||||
let raw = self.cur_src();
|
let raw = self.cur_src();
|
||||||
let src = raw.strip_suffix('n').unwrap();
|
let src = raw.strip_suffix('n').unwrap();
|
||||||
let _value = parse_big_int(src, token.kind)
|
let _value = parse_big_int(src, token.kind, token.has_separator())
|
||||||
.map_err(|err| diagnostics::invalid_number(err, token.span()))?;
|
.map_err(|err| diagnostics::invalid_number(err, token.span()))?;
|
||||||
self.bump_any();
|
self.bump_any();
|
||||||
Ok(self.ast.bigint_literal(self.end_span(span), Atom::from(raw), base))
|
Ok(self.ast.bigint_literal(self.end_span(span), Atom::from(raw), base))
|
||||||
|
|
|
||||||
|
|
@ -2,38 +2,30 @@
|
||||||
//! code copied from [jsparagus](https://github.com/mozilla-spidermonkey/jsparagus/blob/master/crates/parser/src/numeric_value.rs)
|
//! code copied from [jsparagus](https://github.com/mozilla-spidermonkey/jsparagus/blob/master/crates/parser/src/numeric_value.rs)
|
||||||
|
|
||||||
use num_bigint::BigInt;
|
use num_bigint::BigInt;
|
||||||
|
use num_traits::Num as _;
|
||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
|
|
||||||
use super::kind::Kind;
|
use super::kind::Kind;
|
||||||
|
|
||||||
pub fn parse_int(s: &str, kind: Kind, has_sep: bool) -> Result<f64, &'static str> {
|
pub fn parse_int(s: &str, kind: Kind, has_sep: bool) -> Result<f64, &'static str> {
|
||||||
let s = if has_sep { Cow::Owned(s.replace('_', "")) } else { Cow::Borrowed(s) };
|
let s = if has_sep { Cow::Owned(s.replace('_', "")) } else { Cow::Borrowed(s) };
|
||||||
let s = s.as_ref();
|
|
||||||
debug_assert!(!s.contains('_'));
|
debug_assert!(!s.contains('_'));
|
||||||
|
|
||||||
// SAFETY: we just checked that `s` has no `_` characters
|
parse_int_without_underscores(&s, kind)
|
||||||
unsafe { parse_int_without_underscores_unchecked(s, kind) }
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn parse_float(s: &str, has_sep: bool) -> Result<f64, &'static str> {
|
pub fn parse_float(s: &str, has_sep: bool) -> Result<f64, &'static str> {
|
||||||
let s = if has_sep { Cow::Owned(s.replace('_', "")) } else { Cow::Borrowed(s) };
|
let s = if has_sep { Cow::Owned(s.replace('_', "")) } else { Cow::Borrowed(s) };
|
||||||
debug_assert!(!s.contains('_'));
|
debug_assert!(!s.contains('_'));
|
||||||
|
|
||||||
// SAFETY: we just checked that `s` has no `_` characters
|
parse_float_without_underscores(&s)
|
||||||
unsafe { parse_float_without_underscores_unchecked(&s) }
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// # Safety
|
/// This function assumes `s` has had all numeric separators (`_`) removed.
|
||||||
///
|
/// Parsing will fail if this assumption is violated.
|
||||||
/// This function assumes that all `_` characters have been stripped from `s`.
|
fn parse_int_without_underscores(s: &str, kind: Kind) -> Result<f64, &'static str> {
|
||||||
/// Violating this assumption does _not_ cause UB. However, this function is
|
|
||||||
/// marked as unsafe to ensure consumers are aware of the assumption.
|
|
||||||
unsafe fn parse_int_without_underscores_unchecked(
|
|
||||||
s: &str,
|
|
||||||
kind: Kind,
|
|
||||||
) -> Result<f64, &'static str> {
|
|
||||||
if kind == Kind::Decimal {
|
if kind == Kind::Decimal {
|
||||||
return parse_float_without_underscores_unchecked(s);
|
return parse_float_without_underscores(s);
|
||||||
}
|
}
|
||||||
match kind {
|
match kind {
|
||||||
Kind::Binary => Ok(parse_binary(&s[2..])),
|
Kind::Binary => Ok(parse_binary(&s[2..])),
|
||||||
|
|
@ -50,64 +42,84 @@ unsafe fn parse_int_without_underscores_unchecked(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// # Safety
|
/// This function assumes `s` has had all numeric separators (`_`) removed.
|
||||||
///
|
/// Parsing will fail if this assumption is violated.
|
||||||
/// This function assumes that all `_` characters have been stripped from `s`.
|
fn parse_float_without_underscores(s: &str) -> Result<f64, &'static str> {
|
||||||
/// Violating this assumption does _not_ cause UB. However, this function is
|
|
||||||
/// marked as unsafe to ensure consumers are aware of the assumption.
|
|
||||||
unsafe fn parse_float_without_underscores_unchecked(s: &str) -> Result<f64, &'static str> {
|
|
||||||
s.parse::<f64>().map_err(|_| "invalid float")
|
s.parse::<f64>().map_err(|_| "invalid float")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// NOTE: bit shifting here is safe and much faster than f64::mul_add.
|
||||||
|
/// It's safe because we're sure this number is an integer - if it wasn't, it
|
||||||
|
/// would be a [`Kind::Float`] instead. It's fast because shifting usually takes
|
||||||
|
/// 1 clock cycle on the ALU, while multiplication+addition uses the FPU and is
|
||||||
|
/// much slower. Additionally, this loop often gets unrolled by rustc since
|
||||||
|
/// these numbers are usually not long. On x86_64, FMUL has a latency of 4 clock
|
||||||
|
/// cycles, which doesn't include addition. Some platforms support mul + add in a
|
||||||
|
/// single instruction, but many others do not.
|
||||||
|
#[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
|
||||||
fn parse_binary(s: &str) -> f64 {
|
fn parse_binary(s: &str) -> f64 {
|
||||||
debug_assert!(!s.is_empty());
|
debug_assert!(!s.is_empty());
|
||||||
|
|
||||||
let mut result = 0_f64;
|
let mut result = 0_u64;
|
||||||
|
|
||||||
for c in s.as_bytes().iter().filter(|s| s != &&b'_') {
|
for c in s.as_bytes() {
|
||||||
|
debug_assert!(c != &b'_');
|
||||||
#[allow(clippy::cast_lossless)]
|
#[allow(clippy::cast_lossless)]
|
||||||
let value = (c - b'0') as f64;
|
let value = (c - b'0') as u64;
|
||||||
result = result.mul_add(2.0, value);
|
result <<= 1;
|
||||||
|
result |= value;
|
||||||
}
|
}
|
||||||
|
|
||||||
result
|
result as f64
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[allow(clippy::cast_precision_loss)]
|
||||||
fn parse_octal(s: &str) -> f64 {
|
fn parse_octal(s: &str) -> f64 {
|
||||||
debug_assert!(!s.is_empty());
|
debug_assert!(!s.is_empty());
|
||||||
|
|
||||||
let mut result = 0_f64;
|
let mut result = 0_u64;
|
||||||
|
|
||||||
for c in s.as_bytes().iter().filter(|s| s != &&b'_') {
|
for c in s.as_bytes() {
|
||||||
|
debug_assert!(c != &b'_');
|
||||||
#[allow(clippy::cast_lossless)]
|
#[allow(clippy::cast_lossless)]
|
||||||
let value = (c - b'0') as f64;
|
let value = (c - b'0') as u64;
|
||||||
result = result.mul_add(8.0, value);
|
result <<= 3;
|
||||||
|
result |= value;
|
||||||
}
|
}
|
||||||
|
|
||||||
result
|
result as f64
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[allow(clippy::cast_precision_loss, clippy::cast_lossless)]
|
||||||
fn parse_hex(s: &str) -> f64 {
|
fn parse_hex(s: &str) -> f64 {
|
||||||
debug_assert!(!s.is_empty());
|
debug_assert!(!s.is_empty());
|
||||||
|
|
||||||
let mut result = 0_f64;
|
let mut result = 0_u64;
|
||||||
|
|
||||||
for c in s.as_bytes().iter().filter(|s| s != &&b'_') {
|
for c in s.as_bytes() {
|
||||||
|
debug_assert!(c != &b'_');
|
||||||
let value = match c {
|
let value = match c {
|
||||||
b'0'..=b'9' => c - b'0',
|
b'0'..=b'9' => c - b'0',
|
||||||
b'A'..=b'F' => c - b'A' + 10,
|
b'A'..=b'F' => c - b'A' + 10,
|
||||||
b'a'..=b'f' => c - b'a' + 10,
|
b'a'..=b'f' => c - b'a' + 10,
|
||||||
_ => unreachable!("invalid hex syntax {}", s),
|
_ => unreachable!("invalid hex syntax {}", s),
|
||||||
};
|
};
|
||||||
#[allow(clippy::cast_lossless)]
|
result <<= 4;
|
||||||
let value = value as f64;
|
result |= value as u64;
|
||||||
result = result.mul_add(16.0, value);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
result
|
result as f64
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn parse_big_int(s: &str, kind: Kind) -> Result<BigInt, &'static str> {
|
pub fn parse_big_int(s: &str, kind: Kind, has_sep: bool) -> Result<BigInt, &'static str> {
|
||||||
|
let s = if has_sep { Cow::Owned(s.replace('_', "")) } else { Cow::Borrowed(s) };
|
||||||
|
debug_assert!(!s.contains('_'));
|
||||||
|
parse_big_int_without_underscores(&s, kind)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// This function assumes `s` has had all numeric separators (`_`) removed.
|
||||||
|
/// Parsing will fail if this assumption is violated.
|
||||||
|
fn parse_big_int_without_underscores(s: &str, kind: Kind) -> Result<BigInt, &'static str> {
|
||||||
let s = match kind {
|
let s = match kind {
|
||||||
Kind::Decimal => s,
|
Kind::Decimal => s,
|
||||||
Kind::Binary | Kind::Octal | Kind::Hex => &s[2..],
|
Kind::Binary | Kind::Octal | Kind::Hex => &s[2..],
|
||||||
|
|
@ -120,5 +132,198 @@ pub fn parse_big_int(s: &str, kind: Kind) -> Result<BigInt, &'static str> {
|
||||||
Kind::Hex => 16,
|
Kind::Hex => 16,
|
||||||
_ => unreachable!(),
|
_ => unreachable!(),
|
||||||
};
|
};
|
||||||
BigInt::parse_bytes(s.as_bytes(), radix).ok_or("invalid bigint")
|
// NOTE: BigInt::parse_bytes does a utf8 check, then uses from_str_radix
|
||||||
|
// under the hood. We already have a string, so we can just use that
|
||||||
|
// directly.
|
||||||
|
BigInt::from_str_radix(s, radix).map_err(|_| "invalid bigint")
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
#[allow(clippy::unreadable_literal, clippy::mixed_case_hex_literals)]
|
||||||
|
mod test {
|
||||||
|
|
||||||
|
use super::{parse_float, parse_int, Kind};
|
||||||
|
|
||||||
|
#[allow(clippy::cast_precision_loss)]
|
||||||
|
fn assert_all_ints_eq<I>(test_cases: I, kind: Kind, has_sep: bool)
|
||||||
|
where
|
||||||
|
I: IntoIterator<Item = (&'static str, i64)>,
|
||||||
|
{
|
||||||
|
for (s, expected) in test_cases {
|
||||||
|
let parsed = parse_int(s, kind, has_sep);
|
||||||
|
assert_eq!(
|
||||||
|
parsed,
|
||||||
|
Ok(expected as f64),
|
||||||
|
"expected {s} to parse to {expected}, but got {parsed:?}"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fn assert_all_floats_eq<I>(test_cases: I, has_sep: bool)
|
||||||
|
where
|
||||||
|
I: IntoIterator<Item = (&'static str, f64)>,
|
||||||
|
{
|
||||||
|
for (s, expected) in test_cases {
|
||||||
|
let parsed = parse_float(s, has_sep);
|
||||||
|
assert_eq!(
|
||||||
|
parsed,
|
||||||
|
Ok(expected),
|
||||||
|
"expected {s} to parse to {expected}, but got {parsed:?}"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
#[allow(clippy::excessive_precision)]
|
||||||
|
fn test_int_precision() {
|
||||||
|
assert_eq!(parse_int("9007199254740991", Kind::Decimal, false), Ok(9007199254740991.0));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
#[allow(clippy::excessive_precision)]
|
||||||
|
fn test_float_precision() {
|
||||||
|
let cases = vec![
|
||||||
|
("1.7976931348623157e+308", 1.7976931348623157e+308),
|
||||||
|
("0.000000001", 0.000_000_001),
|
||||||
|
];
|
||||||
|
assert_all_floats_eq(cases, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_parse_int_no_sep() {
|
||||||
|
let decimal: Vec<(&str, i64)> = vec![
|
||||||
|
// normal
|
||||||
|
("0", 0),
|
||||||
|
("-0", 0),
|
||||||
|
("1", 1),
|
||||||
|
("-1", -1),
|
||||||
|
("000000000000", 0),
|
||||||
|
("-000000000000", 0),
|
||||||
|
("9007199254740991", 9007199254740991), // max safe integer, 2^53 - 1
|
||||||
|
("-9007199254740990", -9007199254740990), // one above the min safe integer, -(2^53 - 1)
|
||||||
|
];
|
||||||
|
let binary = vec![
|
||||||
|
("0b0", 0b0),
|
||||||
|
("0b1", 0b1),
|
||||||
|
("0b10", 0b10),
|
||||||
|
("0b110001001000100", 0b110001001000100),
|
||||||
|
("0b110001001000100", 0b110001001000100),
|
||||||
|
];
|
||||||
|
let octal = vec![("0o0", 0o0), ("0o1", 0o1), ("0o10", 0o10), ("0o777", 0o777)];
|
||||||
|
let hex: Vec<(&str, i64)> = vec![
|
||||||
|
("0x0", 0x0),
|
||||||
|
("0X0", 0x0),
|
||||||
|
("0xFF", 0xFF),
|
||||||
|
("0xc", 0xc), // :)
|
||||||
|
("0xdeadbeef", 0xdeadbeef),
|
||||||
|
("0xFfEeDdCcBbAa", 0xFfEeDdCcBbAa),
|
||||||
|
];
|
||||||
|
|
||||||
|
assert_all_ints_eq(decimal, Kind::Decimal, false);
|
||||||
|
assert_all_ints_eq(binary, Kind::Binary, false);
|
||||||
|
assert_all_ints_eq(octal, Kind::Octal, false);
|
||||||
|
assert_all_ints_eq(hex, Kind::Hex, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_parse_int_with_sep() {
|
||||||
|
let decimal: Vec<(&str, i64)> = vec![
|
||||||
|
// still works without separators
|
||||||
|
("0", 0),
|
||||||
|
("-0", 0),
|
||||||
|
("1", 1),
|
||||||
|
("-1", -1),
|
||||||
|
("1_000_000", 1_000_000),
|
||||||
|
("-1_000_000", -1_000_000),
|
||||||
|
("000000000000", 0),
|
||||||
|
("-000000000000", 0),
|
||||||
|
("9_007_199_254_740_991", 9_007_199_254_740_991), // max safe integer, 2^53 - 1
|
||||||
|
("-9_007_199_254_740_990", -9_007_199_254_740_990), // one above the min safe integer, -(2^53 - 1)
|
||||||
|
// still works for illegal tokens
|
||||||
|
("1___000_000", 1_000_000),
|
||||||
|
("1_", 1),
|
||||||
|
("_1", 1),
|
||||||
|
];
|
||||||
|
|
||||||
|
let binary = vec![
|
||||||
|
("0b0", 0b0),
|
||||||
|
("0b1", 0b1),
|
||||||
|
("0b10", 0b10),
|
||||||
|
("0b110001001000100", 0b110001001000100),
|
||||||
|
("0b110001001000100", 0b110001001000100),
|
||||||
|
("0b1_1000_1001_0001_0000", 0b1_1000_1001_0001_0000),
|
||||||
|
// still works for illegal tokens
|
||||||
|
("0b1_0000__0000", 0b1_0000_0000),
|
||||||
|
("0b1_", 0b1),
|
||||||
|
("0b_0", 0b0),
|
||||||
|
];
|
||||||
|
|
||||||
|
let octal = vec![
|
||||||
|
("0o0", 0o0),
|
||||||
|
("0o1", 0o1),
|
||||||
|
("0o10", 0o10),
|
||||||
|
("0o777", 0o777),
|
||||||
|
("0o7_7_7", 0o777),
|
||||||
|
("0o77_73_72", 0o77_73_72),
|
||||||
|
// still works for illegal tokens
|
||||||
|
("0o1_0000__0000", 0o100_000_000),
|
||||||
|
("0o1_", 0o1),
|
||||||
|
("0o_0", 0o0),
|
||||||
|
];
|
||||||
|
|
||||||
|
let hex: Vec<(&str, i64)> = vec![
|
||||||
|
// still works without separators
|
||||||
|
("0x0", 0x0),
|
||||||
|
("0X0", 0x0),
|
||||||
|
("0xFF", 0xFF),
|
||||||
|
("0xFF_AA_11", 0xFFAA11),
|
||||||
|
("0xdead_beef", 0xdead_beef),
|
||||||
|
("0xFf_Ee_Dd_Cc_Bb_Aa", 0xFfEe_DdCc_BbAa),
|
||||||
|
("0xFfEe_DdCc_BbAa", 0xFfEe_DdCc_BbAa),
|
||||||
|
// still works for illegal tokens
|
||||||
|
("0x1_0000__0000", 0x100_000_000),
|
||||||
|
("0x1_", 0x1),
|
||||||
|
("0x_0", 0x0),
|
||||||
|
];
|
||||||
|
|
||||||
|
assert_all_ints_eq(decimal, Kind::Decimal, true);
|
||||||
|
assert_all_ints_eq(binary, Kind::Binary, true);
|
||||||
|
assert_all_ints_eq(octal, Kind::Octal, true);
|
||||||
|
assert_all_ints_eq(hex, Kind::Hex, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_decimal() {
|
||||||
|
let no_sep: Vec<(&'static str, f64)> =
|
||||||
|
vec![("0", 0.0), ("1.0", 1.0), ("1.1", 1.1), ("25.125", 25.125)];
|
||||||
|
|
||||||
|
let sep: Vec<(&'static str, f64)> = vec![
|
||||||
|
("1_000.0", 1000.0),
|
||||||
|
("1.5_000", 1.5),
|
||||||
|
// works on invalid tokens
|
||||||
|
("_0._5", 0.5),
|
||||||
|
("0._5", 0.5),
|
||||||
|
("0.5_", 0.5),
|
||||||
|
];
|
||||||
|
|
||||||
|
// parse_int() handles Kind::Decimal as a float. Should we check if
|
||||||
|
// a '.' is encountered during lexing and pick which parser to use?
|
||||||
|
assert_all_floats_eq(no_sep.clone(), false);
|
||||||
|
assert_all_floats_eq(sep.clone(), true);
|
||||||
|
for (s, expected) in no_sep {
|
||||||
|
let parsed = parse_int(s, Kind::Decimal, false);
|
||||||
|
assert_eq!(
|
||||||
|
parsed,
|
||||||
|
Ok(expected),
|
||||||
|
"expected {s} to parse to {expected}, but got {parsed:?}"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
for (s, expected) in sep {
|
||||||
|
let parsed = parse_int(s, Kind::Decimal, true);
|
||||||
|
assert_eq!(
|
||||||
|
parsed,
|
||||||
|
Ok(expected),
|
||||||
|
"expected {s} to parse to {expected}, but got {parsed:?}"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue