fix(lexer): incorrect lexing of large hex/octal/binary literals (#4072)

Closes #3347. Implementation follows the approach described by @overlookmotel [here](https://github.com/oxc-project/oxc/issues/3347#issuecomment-2119004288).
2026-05-19 12:19:15 +00:00 · 2024-07-10 16:39:09 +00:00 · 2024-07-10 16:39:09 +00:00 · 4a656c3a18
commit 4a656c3a18
parent bd69571965
5 changed files with 184 additions and 70 deletions
--- a/crates/oxc_parser/src/lexer/number.rs
+++ b/crates/oxc_parser/src/lexer/number.rs
@ -25,10 +25,8 @@ pub fn parse_float(s: &str, has_sep: bool) -> Result<f64, &'static str> {
 /// This function assumes `s` has had all numeric separators (`_`) removed.
 /// Parsing will fail if this assumption is violated.
 fn parse_int_without_underscores(s: &str, kind: Kind) -> Result<f64, &'static str> {
-    if kind == Kind::Decimal {
-        return parse_float_without_underscores(s);
-    }
    match kind {
+        Kind::Decimal => parse_float_without_underscores(s),
        Kind::Binary => Ok(parse_binary(&s[2..])),
        Kind::Octal => {
            let s = if s.starts_with("0o") || s.starts_with("0O") {
@ -49,6 +47,17 @@ fn parse_float_without_underscores(s: &str) -> Result<f64, &'static str> {
    s.parse::<f64>().map_err(|_| "invalid float")
 }

+/// b'0' is 0x30 and b'1' is 0x31.
+///
+/// So we can convert from binary digit to its value with `c & 1`.
+/// This is produces more compact assembly than `c - b'0'`.
+///
+/// <https://godbolt.org/z/1vvrK78jf>
+const fn binary_byte_to_value(c: u8) -> u8 {
+    debug_assert!(c == b'0' || c == b'1');
+    c & 1
+}
+
 /// NOTE: bit shifting here is is safe and much faster than f64::mul_add.
 /// It's safe because we're sure this number is an integer - if it wasn't, it
 /// would be a [`Kind::Float`] instead. It's fast because shifting usually takes
@ -57,100 +66,148 @@ fn parse_float_without_underscores(s: &str) -> Result<f64, &'static str> {
 /// these numbers are usually not long. On x84_64, FMUL has a latency of 4 clock
 /// cycles, which doesn't include addition. Some platorms support mul + add in a
 /// single instruction, but many others do not.
+///
+/// Unfortunately, this approach has the chance to overflow for excessively
+/// large numbers. In such cases, we fall back to mul_add. Note that right now
+/// we consider leading zeros as part of that length. Right now it doesn't seem
+/// worth it performance-wise to check and strip them. Further experimentation
+/// could be useful.
 #[allow(clippy::cast_precision_loss, clippy::cast_lossless)]
 fn parse_binary(s: &str) -> f64 {
-    // b'0' is 0x30 and b'1' is 0x31.
-    // So we can convert from binary digit to its value with `c & 1`.
-    // This is produces more compact assembly than `c - b'0'`.
-    // https://godbolt.org/z/1vvrK78jf
-    const fn byte_to_value(c: u8) -> u8 {
-        debug_assert!(c == b'0' || c == b'1');
-        c & 1
-    }
-    #[cfg(test)]
-    {
-        static_assertions::const_assert_eq!(byte_to_value(b'0'), 0);
-        static_assertions::const_assert_eq!(byte_to_value(b'1'), 1);
-    }
+    /// binary literals longer than this many characters have the chance to
+    /// overflow a u64, forcing us to take the slow path.
+    const MAX_FAST_BINARY_LEN: usize = 64;

    debug_assert!(!s.is_empty());
+    debug_assert!(!s.starts_with("0b") && !s.starts_with("0B"));
+
+    if s.len() > MAX_FAST_BINARY_LEN {
+        return parse_binary_slow(s);
+    }

    let mut result = 0_u64;
    for c in s.as_bytes() {
        result <<= 1;
-        result |= byte_to_value(*c) as u64;
+        result |= binary_byte_to_value(*c) as u64;
    }
    result as f64
 }

+#[cold]
+#[inline(never)]
+fn parse_binary_slow(s: &str) -> f64 {
+    let mut result = 0_f64;
+    for c in s.as_bytes() {
+        let value = f64::from(binary_byte_to_value(*c));
+        result = result.mul_add(2.0, value);
+    }
+
+    result
+}
+
+// ==================================== OCTAL ====================================
+
+/// b'0' is 0x30 and b'7' is 0x37.
+///
+/// So we can convert from any octal digit to its value with `c & 7`.
+/// This is produces more compact assembly than `c - b'0'`.
+///
+/// <https://godbolt.org/z/9rYTsMoMM>
+const fn octal_byte_to_value(c: u8) -> u8 {
+    debug_assert!(c >= b'0' && c <= b'7');
+    c & 7
+}
+
 #[allow(clippy::cast_precision_loss, clippy::cast_lossless)]
 fn parse_octal(s: &str) -> f64 {
-    // b'0' is 0x30 and b'7' is 0x37.
-    // So we can convert from any octal digit to its value with `c & 7`.
-    // This is produces more compact assembly than `c - b'0'`.
-    // https://godbolt.org/z/9rYTsMoMM
-    const fn byte_to_value(c: u8) -> u8 {
-        debug_assert!(c >= b'0' && c <= b'7');
-        c & 7
-    }
-    #[cfg(test)]
-    {
-        static_assertions::const_assert_eq!(byte_to_value(b'0'), 0);
-        static_assertions::const_assert_eq!(byte_to_value(b'7'), 7);
-    }
+    /// Numeric strings longer than this have the chance to overflow u64.
+    const MAX_FAST_OCTAL_LEN: usize = 21;

    debug_assert!(!s.is_empty());
+    debug_assert!(!s.starts_with("0o") && !s.starts_with("0O"));
+    if s.len() > MAX_FAST_OCTAL_LEN {
+        return parse_octal_slow(s);
+    }

    let mut result = 0_u64;
    for c in s.as_bytes() {
+        let n = octal_byte_to_value(*c);
        result <<= 3;
-        result |= byte_to_value(*c) as u64;
+        result |= n as u64;
    }
    result as f64
 }

+#[cold]
+#[inline(never)]
 #[allow(clippy::cast_precision_loss, clippy::cast_lossless)]
-fn parse_hex(s: &str) -> f64 {
-    // b'0' is 0x30 and b'9' is 0x39.
-    // b'A' is 0x41 and b'F' is 0x46.
-    // b'a' is 0x61 and b'f' is 0x66.
-    // So we can convert from a digit to its value with `c & 15`,
-    // and from `A-F` or `a-f` to its value with `(c & 15) + 9`.
-    // We could use `(c & 7) + 9` for `A-F`, but we use `(c & 15) + 9`
-    // so that both branches share the same `c & 15` operation.
-    // This is produces more slightly more assembly than explicitly matching all possibilities,
-    // but only because compiler unrolls the loop.
-    // https://godbolt.org/z/5fsdv8rGo
-    const fn byte_to_value(c: u8) -> u8 {
-        debug_assert!(
-            (c >= b'0' && c <= b'9') || (c >= b'A' && c <= b'F') || (c >= b'a' && c <= b'f')
-        );
-        if c < b'A' {
-            c & 15 // 0-9
-        } else {
-            (c & 15) + 9 // A-F or a-f
-        }
-    }
-    #[cfg(test)]
-    {
-        static_assertions::const_assert_eq!(byte_to_value(b'0'), 0);
-        static_assertions::const_assert_eq!(byte_to_value(b'9'), 9);
-        static_assertions::const_assert_eq!(byte_to_value(b'A'), 10);
-        static_assertions::const_assert_eq!(byte_to_value(b'F'), 15);
-        static_assertions::const_assert_eq!(byte_to_value(b'a'), 10);
-        static_assertions::const_assert_eq!(byte_to_value(b'f'), 15);
+fn parse_octal_slow(s: &str) -> f64 {
+    let mut result = 0_f64;
+    for c in s.as_bytes() {
+        let value = f64::from(octal_byte_to_value(*c));
+        result = result.mul_add(8.0, value);
    }

+    result
+}
+
+// ==================================== HEX ====================================
+
+/// - b'0' is 0x30 and b'9' is 0x39.
+/// - b'A' is 0x41 and b'F' is 0x46.
+/// - b'a' is 0x61 and b'f' is 0x66.
+///
+/// So we can convert from a digit to its value with `c & 15`,
+/// and from `A-F` or `a-f` to its value with `(c & 15) + 9`.
+/// We could use `(c & 7) + 9` for `A-F`, but we use `(c & 15) + 9`
+/// so that both branches share the same `c & 15` operation.
+///
+/// This is produces more slightly more assembly than explicitly matching all possibilities,
+/// but only because compiler unrolls the loop.
+///
+/// <https://godbolt.org/z/5fsdv8rGo>
+const fn hex_byte_to_value(c: u8) -> u8 {
+    debug_assert!((c >= b'0' && c <= b'9') || (c >= b'A' && c <= b'F') || (c >= b'a' && c <= b'f'));
+    if c < b'A' {
+        c & 15 // 0-9
+    } else {
+        (c & 15) + 9 // A-F or a-f
+    }
+}
+
+#[allow(clippy::cast_precision_loss, clippy::cast_lossless)]
+fn parse_hex(s: &str) -> f64 {
+    /// Hex strings longer than this have the chance to overflow u64.
+    const MAX_FAST_HEX_LEN: usize = 16;
+
    debug_assert!(!s.is_empty());
+    debug_assert!(!s.starts_with("0x"));
+
+    if s.len() > MAX_FAST_HEX_LEN {
+        return parse_hex_slow(s);
+    }

    let mut result = 0_u64;
    for c in s.as_bytes() {
+        let n = hex_byte_to_value(*c);
        result <<= 4;
-        result |= byte_to_value(*c) as u64;
+        result |= n as u64;
    }
    result as f64
 }

+#[cold]
+#[inline(never)]
+fn parse_hex_slow(s: &str) -> f64 {
+    let mut result = 0_f64;
+    for c in s.as_bytes() {
+        let value = f64::from(hex_byte_to_value(*c));
+        result = result.mul_add(16.0, value);
+    }
+
+    result
+}
+
 pub fn parse_big_int(s: &str, kind: Kind, has_sep: bool) -> Result<BigInt, &'static str> {
    let s = if has_sep { Cow::Owned(s.replace('_', "")) } else { Cow::Borrowed(s) };
    debug_assert!(!s.contains('_'));
@ -181,8 +238,9 @@ fn parse_big_int_without_underscores(s: &str, kind: Kind) -> Result<BigInt, &'st
 #[cfg(test)]
 #[allow(clippy::unreadable_literal, clippy::mixed_case_hex_literals)]
 mod test {
-
-    use super::{parse_float, parse_int, Kind};
+    use super::{
+        binary_byte_to_value, hex_byte_to_value, octal_byte_to_value, parse_float, parse_int, Kind,
+    };

    #[allow(clippy::cast_precision_loss)]
    fn assert_all_ints_eq<I>(test_cases: I, kind: Kind, has_sep: bool)
@ -212,10 +270,52 @@ mod test {
        }
    }

+    // octal
+    static_assertions::const_assert_eq!(octal_byte_to_value(b'0'), 0);
+    static_assertions::const_assert_eq!(octal_byte_to_value(b'7'), 7);
+
+    // hex
+    static_assertions::const_assert_eq!(hex_byte_to_value(b'0'), 0);
+    static_assertions::const_assert_eq!(hex_byte_to_value(b'9'), 9);
+    static_assertions::const_assert_eq!(hex_byte_to_value(b'A'), 10);
+    static_assertions::const_assert_eq!(hex_byte_to_value(b'F'), 15);
+    static_assertions::const_assert_eq!(hex_byte_to_value(b'a'), 10);
+    static_assertions::const_assert_eq!(hex_byte_to_value(b'f'), 15);
+
+    // binary
+    static_assertions::const_assert_eq!(binary_byte_to_value(b'0'), 0);
+    static_assertions::const_assert_eq!(binary_byte_to_value(b'1'), 1);
+
    #[test]
-    #[allow(clippy::excessive_precision)]
+    #[allow(clippy::excessive_precision, clippy::cast_precision_loss)]
    fn test_int_precision() {
        assert_eq!(parse_int("9007199254740991", Kind::Decimal, false), Ok(9007199254740991.0));
+        assert_eq!(
+            parse_int("0x10000000000000000", Kind::Hex, false),
+            Ok(0x10000000000000000_i128 as f64)
+        );
+        assert_eq!(
+            parse_int("0o40000000000000000", Kind::Octal, false),
+            Ok(0o40000000000000000_i128 as f64)
+        );
+        assert_eq!(
+            parse_int(
+                "0b00010000000000000000000000000000000000000000000000000000000000000000",
+                Kind::Binary,
+                false
+            ),
+            Ok(0b00010000000000000000000000000000000000000000000000000000000000000000_i128 as f64)
+        );
+    }
+
+    #[test]
+    #[allow(clippy::excessive_precision)]
+    fn test_large_number_of_leading_zeros() {
+        assert_all_ints_eq(
+            vec![("000000000000000000000000000000000000000000000000000000001", 1)],
+            Kind::Decimal,
+            false,
+        );
    }

    #[test]
--- a/tasks/coverage/codegen_misc.snap
+++ b/tasks/coverage/codegen_misc.snap
@ -1,3 +1,3 @@
 codegen_misc Summary:
-AST Parsed     : 23/23 (100.00%)
-Positive Passed: 23/23 (100.00%)
+AST Parsed     : 24/24 (100.00%)
+Positive Passed: 24/24 (100.00%)
--- a/tasks/coverage/misc/pass/oxc-4072.ts
+++ b/tasks/coverage/misc/pass/oxc-4072.ts
@ -0,0 +1,14 @@
+// Tests large numbers that may overflow integer parsing, but are all small
+// enough to be valid f64s. Related to PR #4072 and issue #3347.
+
+let HEX_INT_LARGER_THAN_MAX_SAFE_INTEGER: number = 0x10000000000000000
+let OCT_INT_LARGER_THAN_MAX_SAFE_INTEGER: number = 0o40000000000000000
+let BIN_INT_LARGER_THAN_MAX_SAFE_INTEGER: number = 0b00010000000000000000000000000000000000000000000000000000000000000000;
+
+if (
+    HEX_INT_LARGER_THAN_MAX_SAFE_INTEGER === 0 ||
+    OCT_INT_LARGER_THAN_MAX_SAFE_INTEGER === 0 ||
+    BIN_INT_LARGER_THAN_MAX_SAFE_INTEGER === 0
+) {
+    throw new Error('Large numeric literals are overflowing when parsed')
+}
--- a/tasks/coverage/parser_misc.snap
+++ b/tasks/coverage/parser_misc.snap
@ -1,6 +1,6 @@
 parser_misc Summary:
-AST Parsed     : 23/23 (100.00%)
-Positive Passed: 23/23 (100.00%)
+AST Parsed     : 24/24 (100.00%)
+Positive Passed: 24/24 (100.00%)
 Negative Passed: 12/12 (100.00%)

  × Unexpected token
--- a/tasks/coverage/transformer_misc.snap
+++ b/tasks/coverage/transformer_misc.snap
@ -1,3 +1,3 @@
 transformer_misc Summary:
-AST Parsed     : 23/23 (100.00%)
-Positive Passed: 23/23 (100.00%)
+AST Parsed     : 24/24 (100.00%)
+Positive Passed: 24/24 (100.00%)