refactor(parser): reduce Token size from 32 to 16 bytes (#1962)

Part of #1880

`Token` size is reduced from 32 to 16 bytes by changing the previous
token value `Option<&'a str>` to a u32 index handle.

It would be nice if this handle could be eliminated entirely, because
in the normal case the string value is simply
`&source_text[token.span.start..token.span.end]`

Unfortunately, JavaScript allows escaped characters to appear in
identifiers, strings and templates. These strings need to be unescaped
for equality checks, e.g. `"\a" === "a"`.

This leads us to add an `escaped_strings[]` vec for storing these
unescaped, allocated strings.

The performance regression from adding this vec should be minimal because
escaped strings are rare.

Background Reading:

* https://floooh.github.io/2018/06/17/handles-vs-pointers.html
This commit is contained in:
Boshen 2024-01-09 15:17:02 +08:00 committed by GitHub
parent 66e95a5968
commit 4706765d2a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 111 additions and 104 deletions

View file

@ -11,7 +11,7 @@ use crate::{
pub struct ParserCheckpoint<'a> {
lexer: LexerCheckpoint<'a>,
cur_token: Token<'a>,
cur_token: Token,
prev_span_end: u32,
errors_pos: usize,
}
@ -29,8 +29,8 @@ impl<'a> Parser<'a> {
}
/// Get current token
pub(crate) fn cur_token(&self) -> &Token<'a> {
&self.token
pub(crate) fn cur_token(&self) -> Token {
self.token
}
/// Get current Kind
@ -47,12 +47,12 @@ impl<'a> Parser<'a> {
}
/// Get current string
pub(crate) fn cur_string(&self) -> Option<&str> {
self.cur_token().value.get_string()
pub(crate) fn cur_string(&self) -> &'a str {
self.lexer.get_string(self.token)
}
/// Peek next token, returns EOF for final peek
pub(crate) fn peek_token(&mut self) -> &Token {
pub(crate) fn peek_token(&mut self) -> Token {
self.lexer.lookahead(1)
}
@ -67,7 +67,7 @@ impl<'a> Parser<'a> {
}
/// Peek nth token
pub(crate) fn nth(&mut self, n: u8) -> &Token {
pub(crate) fn nth(&mut self, n: u8) -> Token {
if n == 0 {
return self.cur_token();
}
@ -94,7 +94,7 @@ impl<'a> Parser<'a> {
/// whose code point sequence is the same as a `ReservedWord`.
#[inline]
fn test_escaped_keyword(&mut self, kind: Kind) {
if self.cur_token().escaped && kind.is_all_keyword() {
if self.cur_token().escaped() && kind.is_all_keyword() {
let span = self.cur_token().span();
self.error(diagnostics::EscapedKeyword(span));
}

View file

@ -17,8 +17,7 @@ use super::{
};
use crate::{
diagnostics,
lexer::{parse_big_int, parse_float, parse_int},
lexer::{Kind, TokenValue},
lexer::{parse_big_int, parse_float, parse_int, Kind},
list::SeparatedList,
Context, Parser,
};
@ -96,10 +95,7 @@ impl<'a> Parser<'a> {
pub(crate) fn parse_identifier_kind(&mut self, kind: Kind) -> (Span, Atom) {
let span = self.start_span();
let name = match std::mem::take(&mut self.token.value) {
TokenValue::String(value) => value,
TokenValue::None => "",
};
let name = self.cur_string();
self.bump_remap(kind);
(self.end_span(span), Atom::from(name))
}
@ -121,7 +117,7 @@ impl<'a> Parser<'a> {
/// # Panics
pub(crate) fn parse_private_identifier(&mut self) -> PrivateIdentifier {
let span = self.start_span();
let name = Atom::from(self.cur_string().unwrap());
let name = Atom::from(self.cur_string());
self.bump_any();
PrivateIdentifier { span: self.end_span(span), name }
}
@ -349,9 +345,7 @@ impl<'a> Parser<'a> {
if !self.at(Kind::Str) {
return Err(self.unexpected());
}
let TokenValue::String(value) = std::mem::take(&mut self.token.value) else {
unreachable!()
};
let value = self.cur_string();
let span = self.start_span();
self.bump_any();
Ok(StringLiteral { span: self.end_span(span), value: value.into() })
@ -454,8 +448,9 @@ impl<'a> Parser<'a> {
_ => unreachable!(),
};
// cooked = None when template literal has invalid escape sequence
let cooked = self.cur_string().map(Atom::from);
// `cooked = None` when template literal has invalid escape sequence
// This is matched by `is_valid_escape_sequence` in `Lexer::read_template_literal`
let cooked = self.cur_token().escaped_string_id.map(|_| self.cur_string());
let raw = &self.cur_src()[1..self.cur_src().len() - end_offset as usize];
let raw = Atom::from(if cooked.is_some() && raw.contains('\r') {
@ -475,7 +470,11 @@ impl<'a> Parser<'a> {
}
let tail = matches!(cur_kind, Kind::TemplateTail | Kind::NoSubstitutionTemplate);
TemplateElement { span, tail, value: TemplateElementValue { raw, cooked } }
TemplateElement {
span,
tail,
value: TemplateElementValue { raw, cooked: cooked.map(Atom::from) },
}
}
/// Section 13.3 Meta Property

View file

@ -50,7 +50,7 @@ impl<'a> Parser<'a> {
}
pub(crate) fn at_async_no_new_line(&mut self) -> bool {
self.at(Kind::Async) && !self.cur_token().escaped && !self.peek_token().is_on_new_line
self.at(Kind::Async) && !self.cur_token().escaped() && !self.peek_token().is_on_new_line
}
pub(crate) fn parse_function_body(&mut self) -> Result<Box<'a, FunctionBody<'a>>> {

View file

@ -127,7 +127,7 @@ impl<'a> Parser<'a> {
Kind::Const if !(self.ts_enabled() && self.is_at_enum_declaration()) => {
self.parse_variable_statement(stmt_ctx)
}
Kind::Let if !self.cur_token().escaped => self.parse_let(stmt_ctx),
Kind::Let if !self.cur_token().escaped() => self.parse_let(stmt_ctx),
Kind::Await
if self.peek_kind() == Kind::Using && self.nth_kind(2).is_binding_identifier() =>
{
@ -276,7 +276,7 @@ impl<'a> Parser<'a> {
let is_let_of = self.at(Kind::Let) && self.peek_at(Kind::Of);
let is_async_of =
self.at(Kind::Async) && !self.cur_token().escaped && self.peek_at(Kind::Of);
self.at(Kind::Async) && !self.cur_token().escaped() && self.peek_at(Kind::Of);
let expr_span = self.start_span();
if self.at(Kind::RParen) {

View file

@ -360,14 +360,15 @@ impl<'a> Parser<'a> {
}
// we are at a valid normal Ident or Keyword, let's keep on lexing for `-`
self.re_lex_jsx_identifier();
let name = Atom::from(self.cur_string().unwrap());
self.bump_any();
Ok(self.ast.jsx_identifier(self.end_span(span), name))
let span = self.end_span(span);
let name = span.source_text(self.source_text);
Ok(self.ast.jsx_identifier(span, name.into()))
}
fn parse_jsx_text(&mut self) -> JSXText {
let span = self.start_span();
let value = Atom::from(self.cur_string().unwrap());
let value = Atom::from(self.cur_string());
self.bump_any();
self.ast.jsx_text(self.end_span(span), value)
}

View file

@ -24,13 +24,13 @@ use oxc_syntax::{
},
unicode_id_start::is_id_start_unicode,
};
pub use token::{Token, TokenValue};
pub use self::{
kind::Kind,
number::{parse_big_int, parse_float, parse_int},
token::Token,
};
use self::{string_builder::AutoCow, trivia_builder::TriviaBuilder};
use self::{string_builder::AutoCow, token::EscapedStringId, trivia_builder::TriviaBuilder};
use crate::{diagnostics, MAX_LEN};
#[derive(Debug, Clone)]
@ -38,7 +38,7 @@ pub struct LexerCheckpoint<'a> {
/// Remaining chars to be tokenized
chars: Chars<'a>,
token: Token<'a>,
token: Token,
errors_pos: usize,
}
@ -66,6 +66,9 @@ pub struct Lexer<'a> {
context: LexerContext,
pub(crate) trivia_builder: TriviaBuilder,
/// Data store for escaped strings, indexed by `Token.escaped_string_id`
escaped_strings: Vec<&'a str>,
}
#[allow(clippy::unused_self)]
@ -91,6 +94,7 @@ impl<'a> Lexer<'a> {
lookahead: VecDeque::with_capacity(4), // 4 is the maximum lookahead for TypeScript
context: LexerContext::Regular,
trivia_builder: TriviaBuilder::default(),
escaped_strings: vec![],
}
}
@ -117,12 +121,12 @@ impl<'a> Lexer<'a> {
}
/// Find the nth lookahead token lazily
pub fn lookahead(&mut self, n: u8) -> &Token<'a> {
pub fn lookahead(&mut self, n: u8) -> Token {
let n = n as usize;
debug_assert!(n > 0);
if self.lookahead.len() > n - 1 {
return &self.lookahead[n - 1].token;
return self.lookahead[n - 1].token;
}
let checkpoint = self.checkpoint();
@ -148,7 +152,7 @@ impl<'a> Lexer<'a> {
self.current = checkpoint;
&self.lookahead[n - 1].token
self.lookahead[n - 1].token
}
/// Set context
@ -157,7 +161,7 @@ impl<'a> Lexer<'a> {
}
/// Main entry point
pub fn next_token(&mut self) -> Token<'a> {
pub fn next_token(&mut self) -> Token {
if let Some(checkpoint) = self.lookahead.pop_front() {
self.current.chars = checkpoint.chars;
self.current.errors_pos = checkpoint.errors_pos;
@ -167,13 +171,13 @@ impl<'a> Lexer<'a> {
self.finish_next(kind)
}
pub fn next_jsx_child(&mut self) -> Token<'a> {
pub fn next_jsx_child(&mut self) -> Token {
self.current.token.start = self.offset();
let kind = self.read_jsx_child();
self.finish_next(kind)
}
fn finish_next(&mut self, kind: Kind) -> Token<'a> {
fn finish_next(&mut self, kind: Kind) -> Token {
self.current.token.kind = kind;
self.current.token.end = self.offset();
debug_assert!(self.current.token.start <= self.current.token.end);
@ -188,7 +192,7 @@ impl<'a> Lexer<'a> {
/// where a `RegularExpressionLiteral` is permitted
/// Which means the parser needs to re-tokenize on `PrimaryExpression`,
/// `RegularExpressionLiteral` only appear on the right hand side of `PrimaryExpression`
pub fn next_regex(&mut self, kind: Kind) -> Token<'a> {
pub fn next_regex(&mut self, kind: Kind) -> Token {
self.current.token.start = self.offset()
- match kind {
Kind::Slash => 1,
@ -200,7 +204,7 @@ impl<'a> Lexer<'a> {
self.finish_next(kind)
}
pub fn next_right_angle(&mut self) -> Token<'a> {
pub fn next_right_angle(&mut self) -> Token {
let kind = self.read_right_angle();
self.lookahead.clear();
self.finish_next(kind)
@ -208,7 +212,7 @@ impl<'a> Lexer<'a> {
/// Re-tokenize the current `}` token for `TemplateSubstitutionTail`
/// See Section 12, the parser needs to re-tokenize on `TemplateSubstitutionTail`,
pub fn next_template_substitution_tail(&mut self) -> Token<'a> {
pub fn next_template_substitution_tail(&mut self) -> Token {
self.current.token.start = self.offset() - 1;
let kind = self.read_template_literal(Kind::TemplateMiddle, Kind::TemplateTail);
self.lookahead.clear();
@ -216,14 +220,14 @@ impl<'a> Lexer<'a> {
}
/// Expand the current token for `JSXIdentifier`
pub fn next_jsx_identifier(&mut self, start_offset: u32) -> Token<'a> {
pub fn next_jsx_identifier(&mut self, start_offset: u32) -> Token {
let kind = self.read_jsx_identifier(start_offset);
self.lookahead.clear();
self.finish_next(kind)
}
/// Re-tokenize '<<' or '<=' or '<<=' to '<'
pub fn re_lex_as_typescript_l_angle(&mut self, kind: Kind) -> Token<'a> {
pub fn re_lex_as_typescript_l_angle(&mut self, kind: Kind) -> Token {
let offset = match kind {
Kind::ShiftLeft | Kind::LtEq => 2,
Kind::ShiftLeftEq => 3,
@ -297,6 +301,44 @@ impl<'a> Lexer<'a> {
}
}
/// Save the string if it is escaped.
///
/// Only escaped strings are stored; this reduces the overall memory consumption while
/// keeping the `Token` size small. Strings without escapes are not saved because they can
/// be retrieved as is from the token span.
///
/// * `has_escape` - whether the string contained an escape; when `false` nothing is stored.
/// * `s` - the unescaped string to store.
fn save_string(&mut self, has_escape: bool, s: &'a str) {
    if !has_escape {
        return;
    }
    self.escaped_strings.push(s);
    // The handle is the 1-based position of the string just pushed (the length of the vec
    // after the push); `get_string` subtracts 1 again to index into `escaped_strings`.
    // The checked conversions replace an `as u32` cast plus `new_unchecked`: a length that
    // does not fit in `u32`, or that truncates to 0, can no longer produce an invalid id.
    let escaped_string_id =
        u32::try_from(self.escaped_strings.len()).ok().and_then(EscapedStringId::new);
    debug_assert!(escaped_string_id.is_some(), "too many escaped strings for a u32 handle");
    self.current.token.escaped_string_id = escaped_string_id;
}
/// Get the string value of `token`.
///
/// If the token carries an `escaped_string_id`, the unescaped string stored by
/// `save_string` is returned. Otherwise the value is sliced directly out of the source
/// text covered by the token, with the delimiters belonging to the token kind removed.
pub(crate) fn get_string(&self, token: Token) -> &'a str {
    if let Some(escaped_string_id) = token.escaped_string_id {
        // ids are 1-based (length of the vec after the push), so shift back to an index
        return self.escaped_strings[escaped_string_id.get() as usize - 1];
    }

    let raw = &self.source[token.start as usize..token.end as usize];
    match token.kind {
        // `"text"` / `'text'` / `` `text` ``: omit the surrounding quotes;
        // `` }text` ``: omit the leading "}" and the trailing "`"
        Kind::Str | Kind::NoSubstitutionTemplate | Kind::TemplateTail => &raw[1..raw.len() - 1],
        // `` `text${ `` / `}text${`: omit the leading "`" or "}" and the trailing "${"
        Kind::TemplateHead | Kind::TemplateMiddle => &raw[1..raw.len() - 2],
        _ => raw,
    }
}
/// Read each char and set the current token
/// Whitespace and line terminators are skipped
fn read_next_token(&mut self) -> Kind {
@ -402,7 +444,7 @@ impl<'a> Lexer<'a> {
}
/// Section 12.7.1 Identifier Names
fn identifier_tail(&mut self, mut builder: AutoCow<'a>) -> (bool, &'a str) {
fn identifier_tail(&mut self, mut builder: AutoCow<'a>) -> &'a str {
// ident tail
while let Some(c) = self.peek() {
if !is_identifier_part(c) {
@ -418,14 +460,13 @@ impl<'a> Lexer<'a> {
builder.push_matching(c);
}
let has_escape = builder.has_escape();
(has_escape, builder.finish(self))
let text = builder.finish(self);
self.save_string(has_escape, text);
text
}
fn identifier_name(&mut self, builder: AutoCow<'a>) -> &'a str {
let (has_escape, text) = self.identifier_tail(builder);
self.current.token.escaped = has_escape;
self.current.token.value = TokenValue::String(text);
text
self.identifier_tail(builder)
}
fn identifier_name_handler(&mut self) -> &'a str {
@ -532,8 +573,7 @@ impl<'a> Lexer<'a> {
return Kind::Undetermined;
}
}
let (_, name) = self.identifier_tail(builder);
self.current.token.value = TokenValue::String(name);
self.identifier_tail(builder);
Kind::PrivateIdentifier
}
@ -765,8 +805,7 @@ impl<'a> Lexer<'a> {
}
Some(c @ ('"' | '\'')) => {
if c == delimiter {
self.current.token.value =
TokenValue::String(builder.finish_without_push(self));
self.save_string(builder.has_escape(), builder.finish_without_push(self));
return Kind::Str;
}
builder.push_matching(c);
@ -850,16 +889,14 @@ impl<'a> Lexer<'a> {
match c {
'$' if self.peek() == Some('{') => {
if is_valid_escape_sequence {
self.current.token.value =
TokenValue::String(builder.finish_without_push(self));
self.save_string(true, builder.finish_without_push(self));
}
self.current.chars.next();
return substitute;
}
'`' => {
if is_valid_escape_sequence {
self.current.token.value =
TokenValue::String(builder.finish_without_push(self));
self.save_string(true, builder.finish_without_push(self));
}
return tail;
}
@ -872,6 +909,7 @@ impl<'a> Lexer<'a> {
'\\' => {
let text = builder.get_mut_string_without_current_ascii_char(self);
self.read_string_escape_sequence(text, true, &mut is_valid_escape_sequence);
if !is_valid_escape_sequence {}
}
_ => builder.push_matching(c),
}
@ -884,18 +922,13 @@ impl<'a> Lexer<'a> {
/// `IdentifierStart`
/// `JSXIdentifier` `IdentifierPart`
/// `JSXIdentifier` [no `WhiteSpace` or Comment here] -
fn read_jsx_identifier(&mut self, start_offset: u32) -> Kind {
let prev_str = &self.source[start_offset as usize..self.offset() as usize];
let mut builder = AutoCow::new(self);
fn read_jsx_identifier(&mut self, _start_offset: u32) -> Kind {
while let Some(c) = self.peek() {
if c == '-' || is_identifier_start_all(c) {
self.current.chars.next();
builder.push_matching(c);
while let Some(c) = self.peek() {
if is_identifier_part(c) {
let c = self.current.chars.next().unwrap();
builder.push_matching(c);
self.current.chars.next().unwrap();
} else {
break;
}
@ -904,9 +937,6 @@ impl<'a> Lexer<'a> {
break;
}
}
let mut s = String::from_str_in(prev_str, self.allocator);
s.push_str(builder.finish(self));
self.current.token.value = TokenValue::String(s.into_bump_str());
Kind::Ident
}
@ -941,7 +971,6 @@ impl<'a> Lexer<'a> {
break;
}
}
self.current.token.value = TokenValue::String(builder.finish(self));
Kind::JSXText
}
None => Kind::Eof,
@ -964,8 +993,7 @@ impl<'a> Lexer<'a> {
match self.current.chars.next() {
Some(c @ ('"' | '\'')) => {
if c == delimiter {
self.current.token.value =
TokenValue::String(builder.finish_without_push(self));
self.save_string(builder.has_escape(), builder.finish_without_push(self));
return Kind::Str;
}
builder.push_matching(c);

View file

@ -33,14 +33,14 @@ impl<'a> AutoCow<'a> {
// and return the reference to it
pub fn get_mut_string_without_current_ascii_char<'b>(
&'b mut self,
lexer: &'_ Lexer<'a>,
lexer: &Lexer<'a>,
) -> &'b mut String<'a> {
self.force_allocation_without_current_ascii_char(lexer);
self.value.as_mut().unwrap()
}
// Force allocation of a String, excluding the current ASCII character.
pub fn force_allocation_without_current_ascii_char(&mut self, lexer: &'_ Lexer<'a>) {
pub fn force_allocation_without_current_ascii_char(&mut self, lexer: &Lexer<'a>) {
if self.value.is_some() {
return;
}

View file

@ -4,8 +4,10 @@ use oxc_span::Span;
use super::kind::Kind;
pub type EscapedStringId = std::num::NonZeroU32;
#[derive(Debug, Clone, Copy, Default)]
pub struct Token<'a> {
pub struct Token {
/// Token Kind
pub kind: Kind,
@ -18,40 +20,22 @@ pub struct Token<'a> {
/// Indicates the token is on a newline
pub is_on_new_line: bool,
/// Is the original string escaped?
pub escaped: bool,
pub value: TokenValue<'a>,
/// An index handle to `Lexer::escaped_strings`
/// See https://floooh.github.io/2018/06/17/handles-vs-pointers.html for some background reading
pub escaped_string_id: Option<EscapedStringId>,
}
#[cfg(target_pointer_width = "64")]
mod size_asserts {
oxc_index::assert_eq_size!(super::Token, [u8; 32]);
oxc_index::assert_eq_size!(super::Token, [u8; 16]);
}
impl<'a> Token<'a> {
impl Token {
pub fn span(&self) -> Span {
Span::new(self.start, self.end)
}
}
#[derive(Debug, Copy, Clone)]
pub enum TokenValue<'a> {
None,
String(&'a str),
}
impl<'a> Default for TokenValue<'a> {
fn default() -> Self {
Self::None
}
}
impl<'a> TokenValue<'a> {
pub fn get_string(&self) -> Option<&str> {
match self {
Self::String(s) => Some(s),
Self::None => None,
}
pub fn escaped(&self) -> bool {
self.escaped_string_id.is_some()
}
}

View file

@ -117,7 +117,7 @@ pub struct Parser<'a> {
errors: Vec<Error>,
/// The current parsing token
token: Token<'a>,
token: Token,
/// The end range of the previous token
prev_token_end: u32,

View file

@ -302,13 +302,8 @@ impl<'a> Parser<'a> {
return self.parse_ts_infer_type();
}
let mut operator = None;
if !self.at(Kind::Str) {
if let Some(atom) = self.cur_string() {
operator = TSTypeOperator::from_src(atom);
}
}
let operator =
if self.at(Kind::Str) { None } else { TSTypeOperator::from_src(self.cur_string()) };
// test ts ts_type_operator
// type B = keyof A;