diff --git a/crates/oxc_parser/src/cursor.rs b/crates/oxc_parser/src/cursor.rs index 12ee4bdee..821cc2bf5 100644 --- a/crates/oxc_parser/src/cursor.rs +++ b/crates/oxc_parser/src/cursor.rs @@ -10,6 +10,7 @@ use crate::{ Context, Parser, }; +#[derive(Clone, Copy)] pub struct ParserCheckpoint<'a> { lexer: LexerCheckpoint<'a>, cur_token: Token, @@ -254,7 +255,9 @@ impl<'a> Parser<'a> { let ParserCheckpoint { lexer, cur_token, prev_span_end, errors_pos: errors_lens } = checkpoint; - self.lexer.rewind(lexer); + // SAFETY: Parser only ever creates a single `Lexer`, + // therefore all checkpoints must be created from it. + unsafe { self.lexer.rewind(lexer) }; self.token = cur_token; self.prev_token_end = prev_span_end; self.errors.truncate(errors_lens); diff --git a/crates/oxc_parser/src/lexer/byte_handlers.rs b/crates/oxc_parser/src/lexer/byte_handlers.rs index fb54cd367..95166beb7 100644 --- a/crates/oxc_parser/src/lexer/byte_handlers.rs +++ b/crates/oxc_parser/src/lexer/byte_handlers.rs @@ -6,9 +6,8 @@ use crate::diagnostics; /// /// SAFETY: /// * Lexer must not be at end of file. -/// * `byte` must be next byte of source code, corresponding to current position -/// of `lexer.current.chars`. -/// * Only `BYTE_HANDLERS` for ASCII characters may use the `ascii_byte_handler!` macro. +/// * `byte` must be next byte of source code, corresponding to current position of `lexer.source`. +/// * Only `BYTE_HANDLERS` for ASCII characters may use the `ascii_byte_handler!()` macro. pub(super) unsafe fn handle_byte(byte: u8, lexer: &mut Lexer) -> Kind { BYTE_HANDLERS[byte as usize](lexer) } @@ -82,7 +81,7 @@ macro_rules! byte_handler { /// /// These assertions produce no runtime code, but hint to the compiler that it can assume that /// next char is ASCII, and it uses that information to optimize the rest of the handler. -/// e.g. `lexer.current.chars.next()` becomes just a single assembler instruction. +/// e.g. `lexer.consume_char()` becomes just a single assembler instruction. 
/// Without the assertions, the compiler is unable to deduce the next char is ASCII, due to /// the indirection of the `BYTE_HANDLERS` jump table. /// @@ -108,8 +107,8 @@ macro_rules! byte_handler { /// unsafe { /// use assert_unchecked::assert_unchecked; /// let s = lexer.current.chars.as_str(); -/// assert_unchecked!(!s.is_empty()); -/// assert_unchecked!(s.as_bytes()[0] < 128); +/// assert_unchecked!(!lexer.source.is_eof()); +/// assert_unchecked!(lexer.source.peek_byte_unchecked() < 128); /// } /// { /// lexer.consume_char(); @@ -125,9 +124,8 @@ macro_rules! ascii_byte_handler { // SAFETY: This macro is only used for ASCII characters unsafe { use assert_unchecked::assert_unchecked; - let s = $lex.current.chars.as_str(); - assert_unchecked!(!s.is_empty()); - assert_unchecked!(s.as_bytes()[0] < 128); + assert_unchecked!(!$lex.source.is_eof()); + assert_unchecked!($lex.source.peek_byte_unchecked() < 128); } $body }); @@ -150,14 +148,14 @@ ascii_byte_handler!(SPS(lexer) { // Irregular Whitespace ascii_byte_handler!(ISP(lexer) { lexer.consume_char(); - lexer.trivia_builder.add_irregular_whitespace(lexer.current.token.start, lexer.offset()); + lexer.trivia_builder.add_irregular_whitespace(lexer.token.start, lexer.offset()); Kind::Skip }); // '\r' '\n' ascii_byte_handler!(LIN(lexer) { lexer.consume_char(); - lexer.current.token.is_on_new_line = true; + lexer.token.is_on_new_line = true; Kind::Skip }); @@ -190,7 +188,7 @@ ascii_byte_handler!(HAS(lexer) { lexer.consume_char(); // HashbangComment :: // `#!` SingleLineCommentChars? 
- if lexer.current.token.start == 0 && lexer.next_eq('!') { + if lexer.token.start == 0 && lexer.next_eq('!') { lexer.read_hashbang_comment() } else { lexer.private_identifier() diff --git a/crates/oxc_parser/src/lexer/comment.rs b/crates/oxc_parser/src/lexer/comment.rs index 5ac3ef5f0..f57b75f6c 100644 --- a/crates/oxc_parser/src/lexer/comment.rs +++ b/crates/oxc_parser/src/lexer/comment.rs @@ -7,10 +7,10 @@ impl<'a> Lexer<'a> { /// Section 12.4 Single Line Comment #[allow(clippy::cast_possible_truncation)] pub(super) fn skip_single_line_comment(&mut self) -> Kind { - let start = self.current.token.start; + let start = self.token.start; while let Some(c) = self.next_char() { if is_line_terminator(c) { - self.current.token.is_on_new_line = true; + self.token.is_on_new_line = true; self.trivia_builder .add_single_line_comment(start, self.offset() - c.len_utf8() as u32); return Kind::Skip; @@ -25,11 +25,11 @@ impl<'a> Lexer<'a> { pub(super) fn skip_multi_line_comment(&mut self) -> Kind { while let Some(c) = self.next_char() { if c == '*' && self.next_eq('/') { - self.trivia_builder.add_multi_line_comment(self.current.token.start, self.offset()); + self.trivia_builder.add_multi_line_comment(self.token.start, self.offset()); return Kind::Skip; } if is_line_terminator(c) { - self.current.token.is_on_new_line = true; + self.token.is_on_new_line = true; } } self.error(diagnostics::UnterminatedMultiLineComment(self.unterminated_range())); @@ -43,7 +43,7 @@ impl<'a> Lexer<'a> { break; } } - self.current.token.is_on_new_line = true; + self.token.is_on_new_line = true; Kind::HashbangComment } } diff --git a/crates/oxc_parser/src/lexer/jsx.rs b/crates/oxc_parser/src/lexer/jsx.rs index d04275f53..307109c34 100644 --- a/crates/oxc_parser/src/lexer/jsx.rs +++ b/crates/oxc_parser/src/lexer/jsx.rs @@ -37,7 +37,7 @@ impl<'a> Lexer<'a> { } pub(crate) fn next_jsx_child(&mut self) -> Token { - self.current.token.start = self.offset(); + self.token.start = self.offset(); let kind = 
self.read_jsx_child(); self.finish_next(kind) } diff --git a/crates/oxc_parser/src/lexer/mod.rs b/crates/oxc_parser/src/lexer/mod.rs index 3051a6244..3134f3422 100644 --- a/crates/oxc_parser/src/lexer/mod.rs +++ b/crates/oxc_parser/src/lexer/mod.rs @@ -1,3 +1,5 @@ +#![allow(clippy::unnecessary_safety_comment)] + //! An Ecma-262 Lexer / Tokenizer //! Prior Arts: //! * [jsparagus](https://github.com/mozilla-spidermonkey/jsparagus/blob/master/crates/parser/src) @@ -14,6 +16,7 @@ mod number; mod numeric; mod punctuation; mod regex; +mod source; mod string; mod string_builder; mod template; @@ -23,25 +26,30 @@ mod typescript; mod unicode; use rustc_hash::FxHashMap; -use std::{collections::VecDeque, str::Chars}; +use std::collections::VecDeque; use oxc_allocator::Allocator; use oxc_ast::ast::RegExpFlags; use oxc_diagnostics::Error; use oxc_span::{SourceType, Span}; -use self::{byte_handlers::handle_byte, string_builder::AutoCow, trivia_builder::TriviaBuilder}; +use self::{ + byte_handlers::handle_byte, + source::{Source, SourcePosition}, + string_builder::AutoCow, + trivia_builder::TriviaBuilder, +}; pub use self::{ kind::Kind, number::{parse_big_int, parse_float, parse_int}, token::Token, }; -use crate::{diagnostics, MAX_LEN}; +use crate::diagnostics; -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Copy)] pub struct LexerCheckpoint<'a> { - /// Remaining chars to be tokenized - chars: Chars<'a>, + /// Current position in source + position: SourcePosition<'a>, token: Token, @@ -55,18 +63,25 @@ pub enum LexerContext { JsxAttributeValue, } +#[derive(Debug, Clone, Copy)] +struct Lookahead<'a> { + position: SourcePosition<'a>, + token: Token, +} + pub struct Lexer<'a> { allocator: &'a Allocator, - source: &'a str, + // Wrapper around source text. Must not be changed after initialization. 
+ source: Source<'a>, source_type: SourceType, - current: LexerCheckpoint<'a>, + token: Token, pub(crate) errors: Vec, - lookahead: VecDeque>, + lookahead: VecDeque>, context: LexerContext, @@ -82,21 +97,16 @@ pub struct Lexer<'a> { #[allow(clippy::unused_self)] impl<'a> Lexer<'a> { - pub fn new(allocator: &'a Allocator, mut source: &'a str, source_type: SourceType) -> Self { - // If source exceeds size limit, substitute a short source which will fail to parse. - // `Parser::parse` will convert error to `diagnostics::OverlongSource`. - if source.len() > MAX_LEN { - source = "\0"; - } + pub fn new(allocator: &'a Allocator, source_text: &'a str, source_type: SourceType) -> Self { + let source = Source::new(source_text); // The first token is at the start of file, so is allows on a new line let token = Token::new_on_new_line(); - let current = LexerCheckpoint { chars: source.chars(), token, errors_pos: 0 }; Self { allocator, source, source_type, - current, + token, errors: vec![], lookahead: VecDeque::with_capacity(4), // 4 is the maximum lookahead for TypeScript context: LexerContext::Regular, @@ -108,23 +118,30 @@ impl<'a> Lexer<'a> { /// Remaining string from `Chars` pub fn remaining(&self) -> &'a str { - self.current.chars.as_str() + self.source.remaining() } /// Creates a checkpoint storing the current lexer state. /// Use `rewind` to restore the lexer to the state stored in the checkpoint. pub fn checkpoint(&self) -> LexerCheckpoint<'a> { LexerCheckpoint { - chars: self.current.chars.clone(), - token: self.current.token, + position: self.source.position(), + token: self.token, errors_pos: self.errors.len(), } } /// Rewinds the lexer to the same state as when the passed in `checkpoint` was created. - pub fn rewind(&mut self, checkpoint: LexerCheckpoint<'a>) { + /// + /// # SAFETY + /// `checkpoint` must have been created from this `Lexer`. + #[allow(clippy::missing_safety_doc)] // Clippy is wrong! 
+ pub unsafe fn rewind(&mut self, checkpoint: LexerCheckpoint<'a>) { self.errors.truncate(checkpoint.errors_pos); - self.current = checkpoint; + // SAFETY: Caller guarantees `checkpoint` was created from this `Lexer`, + // and therefore `checkpoint.position` was created from `self.source`. + self.source.set_position(checkpoint.position); + self.token = checkpoint.token; self.lookahead.clear(); } @@ -137,28 +154,30 @@ impl<'a> Lexer<'a> { return self.lookahead[n - 1].token; } - let checkpoint = self.checkpoint(); + let position = self.source.position(); - if let Some(checkpoint) = self.lookahead.back() { - self.current = checkpoint.clone(); + if let Some(lookahead) = self.lookahead.back() { + // SAFETY: `self.lookahead` only contains lookaheads created by this `Lexer`. + // `self.source` never changes, so `lookahead.position` must have been created + // from `self.source`. + unsafe { self.source.set_position(lookahead.position) }; } - // reset the current token for `read_next_token`, - // otherwise it will contain the token from - // `self.current = checkpoint` - self.current.token = Token::default(); - for _i in self.lookahead.len()..n { let kind = self.read_next_token(); let peeked = self.finish_next(kind); - self.lookahead.push_back(LexerCheckpoint { - chars: self.current.chars.clone(), - token: peeked, - errors_pos: self.errors.len(), - }); + self.lookahead.push_back(Lookahead { position: self.source.position(), token: peeked }); } - self.current = checkpoint; + // Call to `finish_next` in loop above leaves `self.token = Token::default()`. + // Only circumstance in which `self.token` wouldn't have been default at start of this + // function is if we were at very start of file, before any tokens have been read, when + // `token.is_on_new_line` is `true`. But `lookahead` isn't called before the first token is + // read, so that's not possible. So no need to restore `self.token` here. + // It's already in same state as it was at start of this function. 
+ + // SAFETY: `position` was created above from `self.source`. `self.source` never changes. + unsafe { self.source.set_position(position) }; self.lookahead[n - 1].token } @@ -170,21 +189,23 @@ impl<'a> Lexer<'a> { /// Main entry point pub fn next_token(&mut self) -> Token { - if let Some(checkpoint) = self.lookahead.pop_front() { - self.current.chars = checkpoint.chars; - self.current.errors_pos = checkpoint.errors_pos; - return checkpoint.token; + if let Some(lookahead) = self.lookahead.pop_front() { + // SAFETY: `self.lookahead` only contains lookaheads created by this `Lexer`. + // `self.source` never changes, so `lookahead.position` must have been created + // from `self.source`. + unsafe { self.source.set_position(lookahead.position) }; + return lookahead.token; } let kind = self.read_next_token(); self.finish_next(kind) } fn finish_next(&mut self, kind: Kind) -> Token { - self.current.token.kind = kind; - self.current.token.end = self.offset(); - debug_assert!(self.current.token.start <= self.current.token.end); - let token = self.current.token; - self.current.token = Token::default(); + self.token.kind = kind; + self.token.end = self.offset(); + debug_assert!(self.token.start <= self.token.end); + let token = self.token; + self.token = Token::default(); token } @@ -197,45 +218,36 @@ impl<'a> Lexer<'a> { #[inline] #[allow(clippy::cast_possible_truncation)] fn offset(&self) -> u32 { - // Offset = current position of `chars` relative to start of `source`. - // Previously was `self.source.len() - self.current.chars.as_str().len()`, - // but that was slower because `std::str::Chars` internally is a current pointer + end pointer, - // whereas `&str` internally is a start pointer and len. - // So comparing `len()` of the two requires an extra memory read, and addition operation. - // https://godbolt.org/z/v46MWddTM - // This function is on hot path, so saving even a single instruction makes a measurable difference. 
- (self.current.chars.as_str().as_ptr() as usize - self.source.as_ptr() as usize) as u32 + self.source.offset() } /// Get the current unterminated token range fn unterminated_range(&self) -> Span { - Span::new(self.current.token.start, self.offset()) + Span::new(self.token.start, self.offset()) } /// Consume the current char if not at EOF #[inline] fn next_char(&mut self) -> Option { - self.current.chars.next() + self.source.next_char() } /// Consume the current char #[inline] fn consume_char(&mut self) -> char { - self.current.chars.next().unwrap() + self.source.next_char().unwrap() } /// Peek the next char without advancing the position #[inline] fn peek(&self) -> Option { - self.current.chars.clone().next() + self.source.peek_char() } /// Peek the next next char without advancing the position #[inline] fn peek2(&self) -> Option { - let mut chars = self.current.chars.clone(); - chars.next(); - chars.next() + self.source.peek_char2() } /// Peek the next character, and advance the current position if it matches @@ -243,7 +255,7 @@ impl<'a> Lexer<'a> { fn next_eq(&mut self, c: char) -> bool { let matched = self.peek() == Some(c); if matched { - self.current.chars.next(); + self.source.next_char().unwrap(); } matched } @@ -267,16 +279,15 @@ impl<'a> Lexer<'a> { fn read_next_token(&mut self) -> Kind { loop { let offset = self.offset(); - self.current.token.start = offset; + self.token.start = offset; - let remaining = self.current.chars.as_str(); - if remaining.is_empty() { + let byte = if let Some(byte) = self.source.peek_byte() { + byte + } else { return Kind::Eof; - } + }; - let byte = remaining.as_bytes()[0]; - // SAFETY: Check for `remaining.is_empty()` ensures not at end of file, - // and `byte` is the byte at current position of `self.current.chars`. 
+ // SAFETY: `byte` is byte value at current position in source let kind = unsafe { handle_byte(byte, self) }; if kind != Kind::Skip { return kind; diff --git a/crates/oxc_parser/src/lexer/punctuation.rs b/crates/oxc_parser/src/lexer/punctuation.rs index 067f41d35..9d6d71dba 100644 --- a/crates/oxc_parser/src/lexer/punctuation.rs +++ b/crates/oxc_parser/src/lexer/punctuation.rs @@ -40,10 +40,7 @@ impl<'a> Lexer<'a> { pub(super) fn read_minus(&mut self) -> Option { if self.next_eq('-') { // SingleLineHTMLCloseComment `-->` in script mode - if self.current.token.is_on_new_line - && self.source_type.is_script() - && self.next_eq('>') - { + if self.token.is_on_new_line && self.source_type.is_script() && self.next_eq('>') { None } else { Some(Kind::Minus2) diff --git a/crates/oxc_parser/src/lexer/regex.rs b/crates/oxc_parser/src/lexer/regex.rs index 96159296a..93cea826f 100644 --- a/crates/oxc_parser/src/lexer/regex.rs +++ b/crates/oxc_parser/src/lexer/regex.rs @@ -11,7 +11,7 @@ impl<'a> Lexer<'a> { /// Which means the parser needs to re-tokenize on `PrimaryExpression`, /// `RegularExpressionLiteral` only appear on the right hand side of `PrimaryExpression` pub(crate) fn next_regex(&mut self, kind: Kind) -> (Token, u32, RegExpFlags) { - self.current.token.start = self.offset() + self.token.start = self.offset() - match kind { Kind::Slash => 1, Kind::SlashEq => 2, diff --git a/crates/oxc_parser/src/lexer/source.rs b/crates/oxc_parser/src/lexer/source.rs new file mode 100644 index 000000000..25a492288 --- /dev/null +++ b/crates/oxc_parser/src/lexer/source.rs @@ -0,0 +1,410 @@ +#![allow(clippy::unnecessary_safety_comment)] + +use crate::MAX_LEN; + +use std::{marker::PhantomData, slice, str}; + +/// `Source` holds the source text for the lexer, and provides APIs to read it. +/// +/// It provides a cursor which allows consuming source text either as `char`s, or as bytes. 
+/// It replaces `std::str::Chars` iterator which performed the same function previously, +/// but was less flexible as only allowed consuming source char by char. +/// +/// Consuming source text byte-by-byte is often more performant than char-by-char. +/// +/// `Source` provides: +/// +/// * Safe API for consuming source char-by-char (`Source::next_char`, `Source::peek_char`). +/// * Safe API for peeking next source byte (`Source::peek_byte`). +/// * Unsafe API for consuming source byte-by-byte (`Source::next_byte`). +/// * Mostly-safe API for rewinding to a previous position in source +/// (`Source::position`, `Source::set_position`). +/// +/// # Composition of `Source` +/// +/// * `start` is pointer to start of source text. +/// * `end` is pointer to end of source text. +/// * `ptr` is cursor for current position in source text. +/// +/// # Invariants of `Source` +/// +/// 1. `start` <= `end` +/// 2. The region of memory bounded between `start` and `end` must be initialized, +/// a single allocation, and contain the bytes of a valid UTF-8 string. +/// 3. `ptr` must always be >= `start` and <= `end`. +/// i.e. cursor always within bounds of source text `&str`, or 1 byte after last byte +/// of source text (positioned on EOF). +/// 4. `ptr` must always point to a UTF-8 character boundary, or EOF. +/// i.e. pointing to *1st* byte of a UTF-8 character. +/// +/// These invariants are the same as `std::str::Chars`, except `Source` allows temporarily +/// breaking invariant (4) to step through source text byte-by-byte. +/// +/// Invariants (1), (2) and (3) must be upheld at all times. +/// Invariant (4) can be temporarily broken, as long as caller ensures it's satisfied again. +/// +/// Invariants (1) and (2) are enforced by initializing `start` and `end` from a valid `&str`, +/// and they are never modified after initialization. +/// +/// Safe methods of `Source` enforce invariant (3) i.e. they do not allow reading past EOF. +/// Unsafe methods e.g. 
`Source::next_byte_unchecked` and `Source::peek_byte_unchecked` +/// require caller to uphold this invariant. +/// +/// Invariant (4) is the most difficult to satisfy. +/// `Source::next_char` relies on source text being valid UTF-8 to provide a safe API which +/// upholds this invariant. +/// `Source::next_byte` requires very careful use as it may violate invariant (4). +/// That is fine temporarily, but caller *must* ensure the safety conditions of `Source::next_byte` +/// are satisfied, to restore this invariant before passing control back to other code. +/// It will often be preferable to instead use `Source::peek_byte`, followed by `Source::next_char`, +/// which are safe methods, and compiler will often reduce to equally efficient code. +#[derive(Clone)] +pub(super) struct Source<'a> { + /// Pointer to start of source string. Never altered after initialization. + start: *const u8, + /// Pointer to end of source string. Never altered after initialization. + end: *const u8, + /// Pointer to current position in source string + ptr: *const u8, + /// Marker for immutable borrow of source string + _marker: PhantomData<&'a str>, +} + +impl<'a> Source<'a> { + /// Create `Source` from `&str`. + pub(super) fn new(mut source_text: &'a str) -> Self { + // If source text exceeds size limit, substitute a short source text which will fail to parse. + // `Parser::parse` will convert error to `diagnostics::OverlongSource`. + if source_text.len() > MAX_LEN { + source_text = "\0"; + } + + let start = source_text.as_ptr(); + // SAFETY: Adding `source_text.len()` to the starting pointer gives a pointer + // at the end of `source_text`. `end` will never be dereferenced, only checked + // for direct pointer equality with `ptr` to check if at end of file. + let end = unsafe { start.add(source_text.len()) }; + + Self { start, end, ptr: start, _marker: PhantomData } + } + + /// Get entire source text as `&str`. 
+ #[inline] + pub(super) fn whole(&self) -> &'a str { + // SAFETY: `start` and `end` are created from a `&str` in `Source::new`, + // so guaranteed to be start and end of a valid UTF-8 string + unsafe { + let len = self.end as usize - self.start as usize; + let slice = slice::from_raw_parts(self.start, len); + str::from_utf8_unchecked(slice) + } + } + + /// Get remaining source text as `&str`. + #[inline] + pub(super) fn remaining(&self) -> &'a str { + // SAFETY: + // `start` and `end` are created from a `&str` in `Source::new` so span a single allocation. + // Invariant of `Source` is that `ptr` is always >= `start` and <= `end`, + // so a slice spanning `ptr` to `end` will always be part of a single allocation. + // Invariant of `Source` is that `ptr` is always on a UTF-8 character boundary, + // so slice from `ptr` to `end` will always be a valid UTF-8 string. + unsafe { + let len = self.end as usize - self.ptr as usize; + let slice = slice::from_raw_parts(self.ptr, len); + debug_assert!(slice.is_empty() || !is_utf8_cont_byte(slice[0])); + str::from_utf8_unchecked(slice) + } + } + + /// Return whether at end of source. + #[inline] + pub(super) fn is_eof(&self) -> bool { + self.ptr == self.end + } + + /// Get current position. + /// + /// The `SourcePosition` returned is guaranteed to be within bounds of `&str` that `Source` + /// was created from, and on a UTF-8 character boundary, so can be used by caller + /// to later move current position of this `Source` using `Source::set_position`. + /// + /// `SourcePosition` lives as long as the source text `&str` that `Source` was created from. + #[inline] + pub(super) fn position(&self) -> SourcePosition<'a> { + SourcePosition { ptr: self.ptr, _marker: PhantomData } + } + + /// Move current position. + /// + /// # SAFETY + /// `pos` must be created from this `Source`, not another `Source`. + /// If this is the case, the invariants of `Source` are guaranteed to be upheld. 
+ #[inline] + pub(super) unsafe fn set_position(&mut self, pos: SourcePosition) { + // `SourcePosition` always upholds the invariants of `Source`, + // as long as it's created from this `Source`. + // SAFETY: `read_u8`'s contract is upheld by: + // * The preceding checks that `pos.ptr` >= `self.start` and < `self.end`. + // * `Source`'s invariants guarantee that `self.start` - `self.end` contains allocated memory. + // * `Source::new` takes an immutable ref `&str`, guaranteeing that the memory `pos.ptr` + // addresses cannot be aliased by a `&mut` ref as long as `Source` exists. + // * `SourcePosition` can only live as long as the `&str` underlying `Source`. + debug_assert!( + pos.ptr >= self.start + && pos.ptr <= self.end + && (pos.ptr == self.end || !is_utf8_cont_byte(read_u8(pos.ptr))) + ); + self.ptr = pos.ptr; + } + + /// Get current position in source, relative to start of source. + #[allow(clippy::cast_possible_truncation)] + #[inline] + pub(super) fn offset(&self) -> u32 { + // Cannot overflow `u32` because of `MAX_LEN` check in `Source::new` + (self.ptr as usize - self.start as usize) as u32 + } + + /// Move current position back by `n` bytes. + /// + /// # Panics + /// Panics if: + /// * `n` is 0. + /// * `n` is greater than current offset in source. + /// * Moving back `n` bytes would not place current position on a UTF-8 character boundary. + #[inline] + pub(super) fn back(&mut self, n: usize) { + // This assertion is essential to ensure safety of `read_u8()` call below. + // Without this check, calling `back(0)` on an empty `Source` would cause reading + // out of bounds. + // Compiler should remove this assertion when inlining this function, + // as long as it can deduce from calling code that `n` is non-zero. 
+ assert!(n > 0, "Cannot call `Source::back` with 0"); + + // Ensure not attempting to go back to before start of source + let offset = self.ptr as usize - self.start as usize; + assert!(n <= offset, "Cannot go back {n} bytes - only {offset} bytes consumed"); + + // SAFETY: We have checked that `n` is <= distance between `start` and `ptr`, + // so `new_ptr` cannot be outside of allocation of original `&str` + let new_ptr = unsafe { self.ptr.sub(n) }; + + // Enforce invariant that `ptr` must be positioned on a UTF-8 character boundary. + // SAFETY: `new_ptr` is in bounds of original `&str`, and `n > 0` assertion ensures + // not at the end, so valid to read a byte. + // `Source`'s invariants guarantee that `self.start` - `self.end` contains allocated memory. + // `Source::new` takes an immutable ref `&str`, guaranteeing that the memory `new_ptr` + // addresses cannot be aliased by a `&mut` ref as long as `Source` exists. + let byte = unsafe { read_u8(new_ptr) }; + assert!(!is_utf8_cont_byte(byte), "Offset is not on a UTF-8 character boundary"); + + // Move current position. The checks above satisfy `Source`'s invariants. + self.ptr = new_ptr; + } + + /// Get next char of source, and advance position to after it. + #[inline] + pub(super) fn next_char(&mut self) -> Option { + // Check not at EOF and handle ASCII bytes + let byte = self.peek_byte()?; + if byte.is_ascii() { + // SAFETY: We already exited if at EOF, so `ptr < end`. + // So incrementing `ptr` cannot result in `ptr > end`. + // Current byte is ASCII, so incremented `ptr` must be on a UTF-8 character boundary. + unsafe { self.ptr = self.ptr.add(1) }; + return Some(byte as char); + } + + // Multi-byte Unicode character. + // Check invariant that `ptr` is on a UTF-8 character boundary. + debug_assert!(!is_utf8_cont_byte(byte)); + + // Create a `Chars` iterator, get next char from it, and then update `self.ptr` + // to match `Chars` iterator's updated pointer afterwards. 
+ // `Chars` iterator upholds same invariants as `Source`, so its pointer is guaranteed + // to be valid as `self.ptr`. + let mut chars = self.remaining().chars(); + // SAFETY: We know that there's a byte to be consumed, so `chars.next()` must return `Some(_)` + let c = unsafe { chars.next().unwrap_unchecked() }; + self.ptr = chars.as_str().as_ptr(); + Some(c) + } + + /// Get next byte of source, and advance position to after it. + /// + /// # SAFETY + /// This function may leave `Source` positioned in middle of a UTF-8 character sequence, + /// which would violate one of `Source`'s invariants. + /// + /// This is OK temporarily, but caller *must* ensure the invariant is restored again. + /// + /// Caller must ensure one of: + /// + /// 1. No byte is returned (end of file). + /// 2. The byte returned is ASCII. + /// 3. Further calls to `Source::next_byte` or `Source::next_byte_unchecked` are made + /// to consume the rest of the multi-byte UTF-8 character, before calling any other methods + /// of `Source` (even safe methods) which rely on `Source` being positioned on a UTF-8 + /// character boundary, or before passing control back to other safe code which may call them. + /// + /// In particular, safe methods `Source::next_char`, `Source::peek_char`, and `Source::remaining` + /// are *not* safe to call until one of above conditions is satisfied. + /// + /// It will often be preferable to instead use `Source::peek_byte`, followed by `Source::next_char`, + /// which are safe methods, and compiler will often reduce to equally efficient code, if calling + /// code tests the byte returned. 
e.g.: + /// + /// ``` + /// // Consume a space + /// let byte = source.peek_byte(); + /// if byte == Some(b' ') { + /// source.next_char().unwrap(); + /// } + /// ``` + #[allow(dead_code)] + #[inline] + unsafe fn next_byte(&mut self) -> Option { + if self.is_eof() { + None + } else { + // SAFETY: Safe to read from `ptr` as we just checked it's not out of bounds + Some(self.next_byte_unchecked()) + } + } + + /// Get next byte of source, and advance position to after it, without EOF bounds-check. + /// + /// # SAFETY + /// Caller must ensure `Source` is not at end of file. + /// + /// This function may leave `Source` positioned in middle of a UTF-8 character sequence, + /// which would violate one of `Source`'s invariants. + /// + /// This is OK temporarily, but caller *must* ensure the invariant is restored again. + /// + /// Caller must ensure one of: + /// + /// 1. The byte returned is ASCII. + /// 2. Further calls to `Source::next_byte` or `Source::next_byte_unchecked` are made + /// to consume the rest of the multi-byte UTF-8 character, before calling any other methods + /// of `Source` (even safe methods) which rely on `Source` being positioned on a UTF-8 + /// character boundary, or before passing control back to other safe code which may call them. + /// + /// In particular, safe methods `Source::next_char`, `Source::peek_char`, and `Source::remaining` + /// are *not* safe to call until one of above conditions is satisfied. + #[allow(dead_code)] + #[inline] + unsafe fn next_byte_unchecked(&mut self) -> u8 { + // SAFETY: Caller guarantees not at end of file i.e. `ptr != end`. + // Methods of this type provide no way for `ptr` to be before `start` or after `end`. + // Therefore always valid to read a byte from `ptr`, and incrementing `ptr` cannot result + // in `ptr > end`. + let byte = self.peek_byte_unchecked(); + self.ptr = self.ptr.add(1); + byte + } + + /// Peek next char of source, without consuming it. 
+    #[inline]
+    pub(super) fn peek_char(&self) -> Option<char> {
+        // Check not at EOF and handle ASCII bytes
+        let byte = self.peek_byte()?;
+        if byte.is_ascii() {
+            return Some(byte as char);
+        }
+
+        // Multi-byte Unicode character.
+        // Check invariant that `ptr` is on a UTF-8 character boundary.
+        debug_assert!(!is_utf8_cont_byte(byte));
+
+        // Create a `Chars` iterator, and get next char from it
+        let mut chars = self.remaining().chars();
+        // SAFETY: We know that there's a byte to be consumed, so `chars.next()` must return `Some(_)`.
+        // Could just return `chars.next()` here, but making it clear to compiler that this branch
+        // always returns `Some(_)` may help it optimize the caller. Compiler seems to have difficulty
+        // "seeing into" `Chars` iterator and making deductions.
+        let c = unsafe { chars.next().unwrap_unchecked() };
+        Some(c)
+    }
+
+    /// Peek the char after the next one (2nd upcoming char), without consuming anything.
+    #[inline]
+    pub(super) fn peek_char2(&self) -> Option<char> {
+        // Handle EOF
+        if self.is_eof() {
+            return None;
+        }
+
+        // Check invariant that `ptr` is on a UTF-8 character boundary.
+        debug_assert!(!is_utf8_cont_byte(self.peek_byte().unwrap()));
+
+        let mut chars = self.remaining().chars();
+        // SAFETY: We already checked not at EOF, so `chars.next()` must return `Some(_)`
+        unsafe { chars.next().unwrap_unchecked() };
+        chars.next()
+    }
+
+    /// Peek next byte of source without consuming it.
+    #[inline]
+    pub(super) fn peek_byte(&self) -> Option<u8> {
+        if self.is_eof() {
+            None
+        } else {
+            // SAFETY: Safe to read from `ptr` as we just checked it's not out of bounds
+            Some(unsafe { self.peek_byte_unchecked() })
+        }
+    }
+
+    /// Peek next byte of source without consuming it, without EOF bounds-check.
+    ///
+    /// # SAFETY
+    /// Caller must ensure `Source` is not at end of file.
+    #[inline]
+    pub(super) unsafe fn peek_byte_unchecked(&self) -> u8 {
+        // SAFETY: Caller guarantees `ptr` is before `end` (i.e. not at end of file).
+        // Methods of this type provide no way to allow `ptr` to be before `start`.
+        // `Source`'s invariants guarantee that `self.start` - `self.end` contains allocated memory.
+        // `Source::new` takes an immutable ref `&str`, guaranteeing that the memory `self.ptr`
+        // addresses cannot be aliased by a `&mut` ref as long as `Source` exists.
+        debug_assert!(self.ptr >= self.start && self.ptr < self.end);
+        read_u8(self.ptr)
+    }
+}
+
+/// Wrapper around a pointer to a position in `Source`.
+#[derive(Debug, Clone, Copy)]
+pub struct SourcePosition<'a> {
+    ptr: *const u8,
+    _marker: PhantomData<&'a u8>,
+}
+
+/// Return `true` if byte is a UTF-8 continuation byte.
+#[inline]
+const fn is_utf8_cont_byte(byte: u8) -> bool {
+    // 0x80 - 0xBF are continuation bytes i.e. not 1st byte of a UTF-8 character sequence
+    byte >= 0x80 && byte < 0xC0
+}
+
+/// Read `u8` from `*const u8` pointer.
+///
+/// Using `as_ref()` for reading is copied from `core::slice::iter::next`.
+/// https://doc.rust-lang.org/src/core/slice/iter.rs.html#132
+/// https://doc.rust-lang.org/src/core/slice/iter/macros.rs.html#156-168
+///
+/// This is about 7% faster than `*ptr` or `ptr.read()`, presumably because it tells the compiler
+/// it can rely on the memory being immutable, because if a `&mut` reference existed, that would
+/// violate Rust's aliasing rules.
+///
+/// # SAFETY
+/// Caller must ensure pointer is non-null, and points to allocated, initialized memory.
+/// Pointer must point to within an object for which no `&mut` references are currently held.
+#[inline]
+unsafe fn read_u8(ptr: *const u8) -> u8 {
+    // SAFETY: Caller guarantees pointer is non-null, and points to allocated, initialized memory.
+    // Caller guarantees no mutable references to same memory exist, thus upholding Rust's aliasing rules.
+    // Pointer is "dereferenceable" by definition as a `u8` is 1 byte and cannot span multiple objects.
+    // Alignment is not relevant as `u8` is aligned on 1 (i.e. no alignment requirements).
+    debug_assert!(!ptr.is_null());
+    *ptr.as_ref().unwrap_unchecked()
+}
diff --git a/crates/oxc_parser/src/lexer/string.rs b/crates/oxc_parser/src/lexer/string.rs index 5fd5e2132..12785e0a6 100644 --- a/crates/oxc_parser/src/lexer/string.rs +++ b/crates/oxc_parser/src/lexer/string.rs @@ -42,8 +42,8 @@ impl<'a> Lexer<'a> { if !has_escape { return; } - self.escaped_strings.insert(self.current.token.start, s); - self.current.token.escaped = true; + self.escaped_strings.insert(self.token.start, s); + self.token.escaped = true; } pub(crate) fn get_string(&self, token: Token) -> &'a str { @@ -51,7 +51,7 @@ impl<'a> Lexer<'a> { return self.escaped_strings[&token.start]; } - let raw = &self.source[token.start as usize..token.end as usize]; + let raw = &self.source.whole()[token.start as usize..token.end as usize]; match token.kind { Kind::Str => { &raw[1..raw.len() - 1] // omit surrounding quotes diff --git a/crates/oxc_parser/src/lexer/string_builder.rs b/crates/oxc_parser/src/lexer/string_builder.rs index eee31a9d9..01e1c4ef0 100644 --- a/crates/oxc_parser/src/lexer/string_builder.rs +++ b/crates/oxc_parser/src/lexer/string_builder.rs @@ -15,14 +15,14 @@ impl<'a> AutoCow<'a> { AutoCow { start, value: None } } - // Push a char that matches lexer.current.chars().next() + // Push a char that matches `lexer.next_char()`. pub fn push_matching(&mut self, c: char) { if let Some(text) = &mut self.value { text.push(c); } } - // Push a different character than lexer.current.chars().next(). + // Push a different character than `lexer.next_char()`. // force_allocation_without_current_ascii_char must be called before this.
pub fn push_different(&mut self, c: char) { debug_assert!(self.value.is_some()); diff --git a/crates/oxc_parser/src/lexer/template.rs b/crates/oxc_parser/src/lexer/template.rs index 812cd8622..f488031db 100644 --- a/crates/oxc_parser/src/lexer/template.rs +++ b/crates/oxc_parser/src/lexer/template.rs @@ -47,7 +47,7 @@ impl<'a> Lexer<'a> { /// Re-tokenize the current `}` token for `TemplateSubstitutionTail` /// See Section 12, the parser needs to re-tokenize on `TemplateSubstitutionTail`, pub(crate) fn next_template_substitution_tail(&mut self) -> Token { - self.current.token.start = self.offset() - 1; + self.token.start = self.offset() - 1; let kind = self.read_template_literal(Kind::TemplateMiddle, Kind::TemplateTail); self.lookahead.clear(); self.finish_next(kind) @@ -63,16 +63,15 @@ impl<'a> Lexer<'a> { if !has_escape { return; } - self.escaped_templates - .insert(self.current.token.start, is_valid_escape_sequence.then(|| s)); - self.current.token.escaped = true; + self.escaped_templates.insert(self.token.start, is_valid_escape_sequence.then(|| s)); + self.token.escaped = true; } pub(crate) fn get_template_string(&self, token: Token) -> Option<&'a str> { if token.escaped { return self.escaped_templates[&token.start]; } - let raw = &self.source[token.start as usize..token.end as usize]; + let raw = &self.source.whole()[token.start as usize..token.end as usize]; Some(match token.kind { Kind::NoSubstitutionTemplate | Kind::TemplateTail => { &raw[1..raw.len() - 1] // omit surrounding quotes or leading "}" and trailing "`" diff --git a/crates/oxc_parser/src/lexer/typescript.rs b/crates/oxc_parser/src/lexer/typescript.rs index e2c781969..0603a7576 100644 --- a/crates/oxc_parser/src/lexer/typescript.rs +++ b/crates/oxc_parser/src/lexer/typescript.rs @@ -8,8 +8,8 @@ impl<'a> Lexer<'a> { Kind::ShiftLeftEq => 3, _ => unreachable!(), }; - self.current.token.start = self.offset() - offset; - self.current.chars = self.source[self.current.token.start as usize + 1..].chars(); 
+ self.token.start = self.offset() - offset; + self.source.back(offset as usize - 1); let kind = Kind::LAngle; self.lookahead.clear(); self.finish_next(kind) diff --git a/crates/oxc_parser/src/lexer/unicode.rs b/crates/oxc_parser/src/lexer/unicode.rs index 8c9096dc1..01aac7d1d 100644 --- a/crates/oxc_parser/src/lexer/unicode.rs +++ b/crates/oxc_parser/src/lexer/unicode.rs @@ -29,15 +29,13 @@ impl<'a> Lexer<'a> { } c if is_irregular_whitespace(c) => { self.consume_char(); - self.trivia_builder - .add_irregular_whitespace(self.current.token.start, self.offset()); + self.trivia_builder.add_irregular_whitespace(self.token.start, self.offset()); Kind::Skip } c if is_irregular_line_terminator(c) => { self.consume_char(); - self.current.token.is_on_new_line = true; - self.trivia_builder - .add_irregular_whitespace(self.current.token.start, self.offset()); + self.token.is_on_new_line = true; + self.trivia_builder.add_irregular_whitespace(self.token.start, self.offset()); Kind::Skip } _ => {