refactor(parser): reduce Token size from 32 to 16 bytes (#1962)

Part of #1880. `Token` size is reduced from 32 to 16 bytes by changing the previous token value `Option<&'a str>` to a `u32` index handle. It would be nice if this handle could be eliminated entirely, because in the normal case the string is simply `&source_text[token.span.start..token.span.end]`. Unfortunately, JavaScript allows escaped characters to appear in identifiers, strings and templates. These strings need to be unescaped for equality checks, i.e. `"\a" === "a"`. This leads us to add an `escaped_strings[]` vec for storing these unescaped and allocated strings. The performance regression from adding this vec should be minimal because escaped strings are rare. Background reading: https://floooh.github.io/2018/06/17/handles-vs-pointers.html
2026-05-24 20:32:10 +00:00 · 2024-01-09 15:17:02 +08:00 · 2024-01-09 15:17:02 +08:00 · 4706765d2a
commit 4706765d2a
parent 66e95a5968
10 changed files with 111 additions and 104 deletions
--- a/crates/oxc_parser/src/cursor.rs
+++ b/crates/oxc_parser/src/cursor.rs
@ -11,7 +11,7 @@ use crate::{

 pub struct ParserCheckpoint<'a> {
    lexer: LexerCheckpoint<'a>,
-    cur_token: Token<'a>,
+    cur_token: Token,
    prev_span_end: u32,
    errors_pos: usize,
 }
@ -29,8 +29,8 @@ impl<'a> Parser<'a> {
    }

    /// Get current token
-    pub(crate) fn cur_token(&self) -> &Token<'a> {
-        &self.token
+    pub(crate) fn cur_token(&self) -> Token {
+        self.token
    }

    /// Get current Kind
@ -47,12 +47,12 @@ impl<'a> Parser<'a> {
    }

    /// Get current string
-    pub(crate) fn cur_string(&self) -> Option<&str> {
-        self.cur_token().value.get_string()
+    pub(crate) fn cur_string(&self) -> &'a str {
+        self.lexer.get_string(self.token)
    }

    /// Peek next token, returns EOF for final peek
-    pub(crate) fn peek_token(&mut self) -> &Token {
+    pub(crate) fn peek_token(&mut self) -> Token {
        self.lexer.lookahead(1)
    }

@ -67,7 +67,7 @@ impl<'a> Parser<'a> {
    }

    /// Peek nth token
-    pub(crate) fn nth(&mut self, n: u8) -> &Token {
+    pub(crate) fn nth(&mut self, n: u8) -> Token {
        if n == 0 {
            return self.cur_token();
        }
@ -94,7 +94,7 @@ impl<'a> Parser<'a> {
    /// whose code point sequence is the same as a `ReservedWord`.
    #[inline]
    fn test_escaped_keyword(&mut self, kind: Kind) {
-        if self.cur_token().escaped && kind.is_all_keyword() {
+        if self.cur_token().escaped() && kind.is_all_keyword() {
            let span = self.cur_token().span();
            self.error(diagnostics::EscapedKeyword(span));
        }
--- a/crates/oxc_parser/src/js/expression.rs
+++ b/crates/oxc_parser/src/js/expression.rs
@ -17,8 +17,7 @@ use super::{
 };
 use crate::{
    diagnostics,
-    lexer::{parse_big_int, parse_float, parse_int},
-    lexer::{Kind, TokenValue},
+    lexer::{parse_big_int, parse_float, parse_int, Kind},
    list::SeparatedList,
    Context, Parser,
 };
@ -96,10 +95,7 @@ impl<'a> Parser<'a> {

    pub(crate) fn parse_identifier_kind(&mut self, kind: Kind) -> (Span, Atom) {
        let span = self.start_span();
-        let name = match std::mem::take(&mut self.token.value) {
-            TokenValue::String(value) => value,
-            TokenValue::None => "",
-        };
+        let name = self.cur_string();
        self.bump_remap(kind);
        (self.end_span(span), Atom::from(name))
    }
@ -121,7 +117,7 @@ impl<'a> Parser<'a> {
    /// # Panics
    pub(crate) fn parse_private_identifier(&mut self) -> PrivateIdentifier {
        let span = self.start_span();
-        let name = Atom::from(self.cur_string().unwrap());
+        let name = Atom::from(self.cur_string());
        self.bump_any();
        PrivateIdentifier { span: self.end_span(span), name }
    }
@ -349,9 +345,7 @@ impl<'a> Parser<'a> {
        if !self.at(Kind::Str) {
            return Err(self.unexpected());
        }
-        let TokenValue::String(value) = std::mem::take(&mut self.token.value) else {
-            unreachable!()
-        };
+        let value = self.cur_string();
        let span = self.start_span();
        self.bump_any();
        Ok(StringLiteral { span: self.end_span(span), value: value.into() })
@ -454,8 +448,9 @@ impl<'a> Parser<'a> {
            _ => unreachable!(),
        };

-        // cooked = None when template literal has invalid escape sequence
-        let cooked = self.cur_string().map(Atom::from);
+        // `cooked = None` when template literal has invalid escape sequence
+        // This is matched by `is_valid_escape_sequence` in `Lexer::read_template_literal`
+        let cooked = self.cur_token().escaped_string_id.map(|_| self.cur_string());

        let raw = &self.cur_src()[1..self.cur_src().len() - end_offset as usize];
        let raw = Atom::from(if cooked.is_some() && raw.contains('\r') {
@ -475,7 +470,11 @@ impl<'a> Parser<'a> {
        }

        let tail = matches!(cur_kind, Kind::TemplateTail | Kind::NoSubstitutionTemplate);
-        TemplateElement { span, tail, value: TemplateElementValue { raw, cooked } }
+        TemplateElement {
+            span,
+            tail,
+            value: TemplateElementValue { raw, cooked: cooked.map(Atom::from) },
+        }
    }

    /// Section 13.3 Meta Property
--- a/crates/oxc_parser/src/js/function.rs
+++ b/crates/oxc_parser/src/js/function.rs
@ -50,7 +50,7 @@ impl<'a> Parser<'a> {
    }

    pub(crate) fn at_async_no_new_line(&mut self) -> bool {
-        self.at(Kind::Async) && !self.cur_token().escaped && !self.peek_token().is_on_new_line
+        self.at(Kind::Async) && !self.cur_token().escaped() && !self.peek_token().is_on_new_line
    }

    pub(crate) fn parse_function_body(&mut self) -> Result<Box<'a, FunctionBody<'a>>> {
--- a/crates/oxc_parser/src/js/statement.rs
+++ b/crates/oxc_parser/src/js/statement.rs
@ -127,7 +127,7 @@ impl<'a> Parser<'a> {
            Kind::Const if !(self.ts_enabled() && self.is_at_enum_declaration()) => {
                self.parse_variable_statement(stmt_ctx)
            }
-            Kind::Let if !self.cur_token().escaped => self.parse_let(stmt_ctx),
+            Kind::Let if !self.cur_token().escaped() => self.parse_let(stmt_ctx),
            Kind::Await
                if self.peek_kind() == Kind::Using && self.nth_kind(2).is_binding_identifier() =>
            {
@ -276,7 +276,7 @@ impl<'a> Parser<'a> {

        let is_let_of = self.at(Kind::Let) && self.peek_at(Kind::Of);
        let is_async_of =
-            self.at(Kind::Async) && !self.cur_token().escaped && self.peek_at(Kind::Of);
+            self.at(Kind::Async) && !self.cur_token().escaped() && self.peek_at(Kind::Of);
        let expr_span = self.start_span();

        if self.at(Kind::RParen) {
--- a/crates/oxc_parser/src/jsx/mod.rs
+++ b/crates/oxc_parser/src/jsx/mod.rs
@ -360,14 +360,15 @@ impl<'a> Parser<'a> {
        }
        // we are at a valid normal Ident or Keyword, let's keep on lexing for `-`
        self.re_lex_jsx_identifier();
-        let name = Atom::from(self.cur_string().unwrap());
        self.bump_any();
-        Ok(self.ast.jsx_identifier(self.end_span(span), name))
+        let span = self.end_span(span);
+        let name = span.source_text(self.source_text);
+        Ok(self.ast.jsx_identifier(span, name.into()))
    }

    fn parse_jsx_text(&mut self) -> JSXText {
        let span = self.start_span();
-        let value = Atom::from(self.cur_string().unwrap());
+        let value = Atom::from(self.cur_string());
        self.bump_any();
        self.ast.jsx_text(self.end_span(span), value)
    }
--- a/crates/oxc_parser/src/lexer/mod.rs
+++ b/crates/oxc_parser/src/lexer/mod.rs
@ -24,13 +24,13 @@ use oxc_syntax::{
    },
    unicode_id_start::is_id_start_unicode,
 };
-pub use token::{Token, TokenValue};

 pub use self::{
    kind::Kind,
    number::{parse_big_int, parse_float, parse_int},
+    token::Token,
 };
-use self::{string_builder::AutoCow, trivia_builder::TriviaBuilder};
+use self::{string_builder::AutoCow, token::EscapedStringId, trivia_builder::TriviaBuilder};
 use crate::{diagnostics, MAX_LEN};

 #[derive(Debug, Clone)]
@ -38,7 +38,7 @@ pub struct LexerCheckpoint<'a> {
    /// Remaining chars to be tokenized
    chars: Chars<'a>,

-    token: Token<'a>,
+    token: Token,

    errors_pos: usize,
 }
@ -66,6 +66,9 @@ pub struct Lexer<'a> {
    context: LexerContext,

    pub(crate) trivia_builder: TriviaBuilder,
+
+    /// Data store for escaped strings, indexed by `Token.escaped_string_id`
+    escaped_strings: Vec<&'a str>,
 }

 #[allow(clippy::unused_self)]
@ -91,6 +94,7 @@ impl<'a> Lexer<'a> {
            lookahead: VecDeque::with_capacity(4), // 4 is the maximum lookahead for TypeScript
            context: LexerContext::Regular,
            trivia_builder: TriviaBuilder::default(),
+            escaped_strings: vec![],
        }
    }

@ -117,12 +121,12 @@ impl<'a> Lexer<'a> {
    }

    /// Find the nth lookahead token lazily
-    pub fn lookahead(&mut self, n: u8) -> &Token<'a> {
+    pub fn lookahead(&mut self, n: u8) -> Token {
        let n = n as usize;
        debug_assert!(n > 0);

        if self.lookahead.len() > n - 1 {
-            return &self.lookahead[n - 1].token;
+            return self.lookahead[n - 1].token;
        }

        let checkpoint = self.checkpoint();
@ -148,7 +152,7 @@ impl<'a> Lexer<'a> {

        self.current = checkpoint;

-        &self.lookahead[n - 1].token
+        self.lookahead[n - 1].token
    }

    /// Set context
@ -157,7 +161,7 @@ impl<'a> Lexer<'a> {
    }

    /// Main entry point
-    pub fn next_token(&mut self) -> Token<'a> {
+    pub fn next_token(&mut self) -> Token {
        if let Some(checkpoint) = self.lookahead.pop_front() {
            self.current.chars = checkpoint.chars;
            self.current.errors_pos = checkpoint.errors_pos;
@ -167,13 +171,13 @@ impl<'a> Lexer<'a> {
        self.finish_next(kind)
    }

-    pub fn next_jsx_child(&mut self) -> Token<'a> {
+    pub fn next_jsx_child(&mut self) -> Token {
        self.current.token.start = self.offset();
        let kind = self.read_jsx_child();
        self.finish_next(kind)
    }

-    fn finish_next(&mut self, kind: Kind) -> Token<'a> {
+    fn finish_next(&mut self, kind: Kind) -> Token {
        self.current.token.kind = kind;
        self.current.token.end = self.offset();
        debug_assert!(self.current.token.start <= self.current.token.end);
@ -188,7 +192,7 @@ impl<'a> Lexer<'a> {
    ///   where a `RegularExpressionLiteral` is permitted
    /// Which means the parser needs to re-tokenize on `PrimaryExpression`,
    /// `RegularExpressionLiteral` only appear on the right hand side of `PrimaryExpression`
-    pub fn next_regex(&mut self, kind: Kind) -> Token<'a> {
+    pub fn next_regex(&mut self, kind: Kind) -> Token {
        self.current.token.start = self.offset()
            - match kind {
                Kind::Slash => 1,
@ -200,7 +204,7 @@ impl<'a> Lexer<'a> {
        self.finish_next(kind)
    }

-    pub fn next_right_angle(&mut self) -> Token<'a> {
+    pub fn next_right_angle(&mut self) -> Token {
        let kind = self.read_right_angle();
        self.lookahead.clear();
        self.finish_next(kind)
@ -208,7 +212,7 @@ impl<'a> Lexer<'a> {

    /// Re-tokenize the current `}` token for `TemplateSubstitutionTail`
    /// See Section 12, the parser needs to re-tokenize on `TemplateSubstitutionTail`,
-    pub fn next_template_substitution_tail(&mut self) -> Token<'a> {
+    pub fn next_template_substitution_tail(&mut self) -> Token {
        self.current.token.start = self.offset() - 1;
        let kind = self.read_template_literal(Kind::TemplateMiddle, Kind::TemplateTail);
        self.lookahead.clear();
@ -216,14 +220,14 @@ impl<'a> Lexer<'a> {
    }

    /// Expand the current token for `JSXIdentifier`
-    pub fn next_jsx_identifier(&mut self, start_offset: u32) -> Token<'a> {
+    pub fn next_jsx_identifier(&mut self, start_offset: u32) -> Token {
        let kind = self.read_jsx_identifier(start_offset);
        self.lookahead.clear();
        self.finish_next(kind)
    }

    /// Re-tokenize '<<' or '<=' or '<<=' to '<'
-    pub fn re_lex_as_typescript_l_angle(&mut self, kind: Kind) -> Token<'a> {
+    pub fn re_lex_as_typescript_l_angle(&mut self, kind: Kind) -> Token {
        let offset = match kind {
            Kind::ShiftLeft | Kind::LtEq => 2,
            Kind::ShiftLeftEq => 3,
@ -297,6 +301,44 @@ impl<'a> Lexer<'a> {
        }
    }

+    /// Save the string if it is escaped
+    /// This reduces the overall memory consumption while keeping the `Token` size small
+    /// Strings without escaped values can be retrieved as is from the token span
+    #[allow(clippy::cast_possible_truncation)]
+    fn save_string(&mut self, has_escape: bool, s: &'a str) {
+        if !has_escape {
+            return;
+        }
+        self.escaped_strings.push(s);
+        let escaped_string_id = self.escaped_strings.len() as u32;
+        // SAFETY: escaped_string_id is the length of `self.escaped_strings` after an item is pushed, which can never be 0
+        let escaped_string_id = unsafe { EscapedStringId::new_unchecked(escaped_string_id) };
+        self.current.token.escaped_string_id.replace(escaped_string_id);
+    }
+
+    pub(crate) fn get_string(&self, token: Token) -> &'a str {
+        if let Some(escaped_string_id) = token.escaped_string_id {
+            return self.escaped_strings[escaped_string_id.get() as usize - 1];
+        }
+
+        let raw = &self.source[token.start as usize..token.end as usize];
+        match token.kind {
+            Kind::Str | Kind::NoSubstitutionTemplate => {
+                // omit surrounding quotes
+                &raw[1..raw.len() - 1]
+            }
+            Kind::TemplateHead => {
+                // omit leading "`${"
+                &raw[3..]
+            }
+            Kind::TemplateTail => {
+                // omit trailing "$`"
+                &raw[..raw.len() - 2]
+            }
+            _ => raw,
+        }
+    }
+
    /// Read each char and set the current token
    /// Whitespace and line terminators are skipped
    fn read_next_token(&mut self) -> Kind {
@ -402,7 +444,7 @@ impl<'a> Lexer<'a> {
    }

    /// Section 12.7.1 Identifier Names
-    fn identifier_tail(&mut self, mut builder: AutoCow<'a>) -> (bool, &'a str) {
+    fn identifier_tail(&mut self, mut builder: AutoCow<'a>) -> &'a str {
        // ident tail
        while let Some(c) = self.peek() {
            if !is_identifier_part(c) {
@ -418,14 +460,13 @@ impl<'a> Lexer<'a> {
            builder.push_matching(c);
        }
        let has_escape = builder.has_escape();
-        (has_escape, builder.finish(self))
+        let text = builder.finish(self);
+        self.save_string(has_escape, text);
+        text
    }

    fn identifier_name(&mut self, builder: AutoCow<'a>) -> &'a str {
-        let (has_escape, text) = self.identifier_tail(builder);
-        self.current.token.escaped = has_escape;
-        self.current.token.value = TokenValue::String(text);
-        text
+        self.identifier_tail(builder)
    }

    fn identifier_name_handler(&mut self) -> &'a str {
@ -532,8 +573,7 @@ impl<'a> Lexer<'a> {
                return Kind::Undetermined;
            }
        }
-        let (_, name) = self.identifier_tail(builder);
-        self.current.token.value = TokenValue::String(name);
+        self.identifier_tail(builder);
        Kind::PrivateIdentifier
    }

@ -765,8 +805,7 @@ impl<'a> Lexer<'a> {
                }
                Some(c @ ('"' | '\'')) => {
                    if c == delimiter {
-                        self.current.token.value =
-                            TokenValue::String(builder.finish_without_push(self));
+                        self.save_string(builder.has_escape(), builder.finish_without_push(self));
                        return Kind::Str;
                    }
                    builder.push_matching(c);
@ -850,16 +889,14 @@ impl<'a> Lexer<'a> {
            match c {
                '$' if self.peek() == Some('{') => {
                    if is_valid_escape_sequence {
-                        self.current.token.value =
-                            TokenValue::String(builder.finish_without_push(self));
+                        self.save_string(true, builder.finish_without_push(self));
                    }
                    self.current.chars.next();
                    return substitute;
                }
                '`' => {
                    if is_valid_escape_sequence {
-                        self.current.token.value =
-                            TokenValue::String(builder.finish_without_push(self));
+                        self.save_string(true, builder.finish_without_push(self));
                    }
                    return tail;
                }
@ -872,6 +909,7 @@ impl<'a> Lexer<'a> {
                '\\' => {
                    let text = builder.get_mut_string_without_current_ascii_char(self);
                    self.read_string_escape_sequence(text, true, &mut is_valid_escape_sequence);
+                    if !is_valid_escape_sequence {}
                }
                _ => builder.push_matching(c),
            }
@ -884,18 +922,13 @@ impl<'a> Lexer<'a> {
    ///   `IdentifierStart`
    ///   `JSXIdentifier` `IdentifierPart`
    ///   `JSXIdentifier` [no `WhiteSpace` or Comment here] -
-    fn read_jsx_identifier(&mut self, start_offset: u32) -> Kind {
-        let prev_str = &self.source[start_offset as usize..self.offset() as usize];
-
-        let mut builder = AutoCow::new(self);
+    fn read_jsx_identifier(&mut self, _start_offset: u32) -> Kind {
        while let Some(c) = self.peek() {
            if c == '-' || is_identifier_start_all(c) {
                self.current.chars.next();
-                builder.push_matching(c);
                while let Some(c) = self.peek() {
                    if is_identifier_part(c) {
-                        let c = self.current.chars.next().unwrap();
-                        builder.push_matching(c);
+                        self.current.chars.next().unwrap();
                    } else {
                        break;
                    }
@ -904,9 +937,6 @@ impl<'a> Lexer<'a> {
                break;
            }
        }
-        let mut s = String::from_str_in(prev_str, self.allocator);
-        s.push_str(builder.finish(self));
-        self.current.token.value = TokenValue::String(s.into_bump_str());
        Kind::Ident
    }

@ -941,7 +971,6 @@ impl<'a> Lexer<'a> {
                        break;
                    }
                }
-                self.current.token.value = TokenValue::String(builder.finish(self));
                Kind::JSXText
            }
            None => Kind::Eof,
@ -964,8 +993,7 @@ impl<'a> Lexer<'a> {
            match self.current.chars.next() {
                Some(c @ ('"' | '\'')) => {
                    if c == delimiter {
-                        self.current.token.value =
-                            TokenValue::String(builder.finish_without_push(self));
+                        self.save_string(builder.has_escape(), builder.finish_without_push(self));
                        return Kind::Str;
                    }
                    builder.push_matching(c);
--- a/crates/oxc_parser/src/lexer/string_builder.rs
+++ b/crates/oxc_parser/src/lexer/string_builder.rs
@ -33,14 +33,14 @@ impl<'a> AutoCow<'a> {
    // and return the reference to it
    pub fn get_mut_string_without_current_ascii_char<'b>(
        &'b mut self,
-        lexer: &'_ Lexer<'a>,
+        lexer: &Lexer<'a>,
    ) -> &'b mut String<'a> {
        self.force_allocation_without_current_ascii_char(lexer);
        self.value.as_mut().unwrap()
    }

    // Force allocation of a String, excluding the current ASCII character.
-    pub fn force_allocation_without_current_ascii_char(&mut self, lexer: &'_ Lexer<'a>) {
+    pub fn force_allocation_without_current_ascii_char(&mut self, lexer: &Lexer<'a>) {
        if self.value.is_some() {
            return;
        }
--- a/crates/oxc_parser/src/lexer/token.rs
+++ b/crates/oxc_parser/src/lexer/token.rs
@ -4,8 +4,10 @@ use oxc_span::Span;

 use super::kind::Kind;

+pub type EscapedStringId = std::num::NonZeroU32;
+
 #[derive(Debug, Clone, Copy, Default)]
-pub struct Token<'a> {
+pub struct Token {
    /// Token Kind
    pub kind: Kind,

@ -18,40 +20,22 @@ pub struct Token<'a> {
    /// Indicates the token is on a newline
    pub is_on_new_line: bool,

-    /// Is the original string escaped?
-    pub escaped: bool,
-
-    pub value: TokenValue<'a>,
+    /// A index handle to `Lexer::escaped_strings`
+    /// See https://floooh.github.io/2018/06/17/handles-vs-pointers.html for some background reading
+    pub escaped_string_id: Option<EscapedStringId>,
 }

 #[cfg(target_pointer_width = "64")]
 mod size_asserts {
-    oxc_index::assert_eq_size!(super::Token, [u8; 32]);
+    oxc_index::assert_eq_size!(super::Token, [u8; 16]);
 }

-impl<'a> Token<'a> {
+impl Token {
    pub fn span(&self) -> Span {
        Span::new(self.start, self.end)
    }
-}

-#[derive(Debug, Copy, Clone)]
-pub enum TokenValue<'a> {
-    None,
-    String(&'a str),
-}
-
-impl<'a> Default for TokenValue<'a> {
-    fn default() -> Self {
-        Self::None
-    }
-}
-
-impl<'a> TokenValue<'a> {
-    pub fn get_string(&self) -> Option<&str> {
-        match self {
-            Self::String(s) => Some(s),
-            Self::None => None,
-        }
+    pub fn escaped(&self) -> bool {
+        self.escaped_string_id.is_some()
    }
 }
--- a/crates/oxc_parser/src/lib.rs
+++ b/crates/oxc_parser/src/lib.rs
@ -117,7 +117,7 @@ pub struct Parser<'a> {
    errors: Vec<Error>,

    /// The current parsing token
-    token: Token<'a>,
+    token: Token,

    /// The end range of the previous token
    prev_token_end: u32,
--- a/crates/oxc_parser/src/ts/types.rs
+++ b/crates/oxc_parser/src/ts/types.rs
@ -302,13 +302,8 @@ impl<'a> Parser<'a> {
            return self.parse_ts_infer_type();
        }

-        let mut operator = None;
-
-        if !self.at(Kind::Str) {
-            if let Some(atom) = self.cur_string() {
-                operator = TSTypeOperator::from_src(atom);
-            }
-        }
+        let operator =
+            if self.at(Kind::Str) { None } else { TSTypeOperator::from_src(self.cur_string()) };

        // test ts ts_type_operator
        // type B = keyof A;