From 383f5b3081ad8de3a62830972fb30e784c19e1dc Mon Sep 17 00:00:00 2001 From: overlookmotel Date: Sun, 11 Feb 2024 04:43:14 +0000 Subject: [PATCH] perf(parser): consume multi-line comments faster (#2377) Consume multi-line comments faster. * Initially search for `*/`, `\r`, `\n` or `0xE2` (first byte of irregular line breaks). * Once a line break is found, switch to faster search which only looks for `*/`, as it's not relevant whether there are more line breaks or not. Using `memchr` for the 2nd simpler search, as it's efficient for a search with only one "needle". Initializing `memchr::memmem::Finder` is fairly expensive, and tried numerous ways to handle it. This is most performant way I could find. Any ideas how to avoid re-creating it for each Lexer pass? (it can't be a `static` as `Finder::new` is not a const function, and `lazy_static!` is too costly) --- Cargo.lock | 1 + crates/oxc_parser/Cargo.toml | 2 + crates/oxc_parser/src/lexer/comment.rs | 104 ++++++++++++++++++++++--- crates/oxc_parser/src/lexer/mod.rs | 4 + crates/oxc_parser/src/lexer/source.rs | 17 ++++ 5 files changed, 118 insertions(+), 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b6c46b3d9..3fdae0d46 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1593,6 +1593,7 @@ version = "0.7.0" dependencies = [ "assert-unchecked", "bitflags 2.4.2", + "memchr", "miette", "num-bigint", "ouroboros", diff --git a/crates/oxc_parser/Cargo.toml b/crates/oxc_parser/Cargo.toml index 396662da6..66d9e9a57 100644 --- a/crates/oxc_parser/Cargo.toml +++ b/crates/oxc_parser/Cargo.toml @@ -31,6 +31,8 @@ rustc-hash = { workspace = true } num-bigint = { workspace = true } seq-macro = { workspace = true } +memchr = "2.7.1" + [dev-dependencies] oxc_ast = { workspace = true, features = ["serde"] } miette = { workspace = true, features = ["fancy-no-backtrace"] } diff --git a/crates/oxc_parser/src/lexer/comment.rs b/crates/oxc_parser/src/lexer/comment.rs index 2293487a3..45088ef85 100644 --- a/crates/oxc_parser/src/lexer/comment.rs +++ b/crates/oxc_parser/src/lexer/comment.rs @@ -1,12 +1,16 @@ use super::{ cold_branch, search::{byte_search, safe_byte_match_table, SafeByteMatchTable}, + source::SourcePosition, Kind, Lexer, }; use crate::diagnostics; use oxc_syntax::identifier::is_line_terminator; +use memchr::memmem::Finder; + +// Irregular line breaks - '\u{2028}' (LS) and '\u{2029}' (PS) const LS_OR_PS_FIRST: u8 = 0xE2; const LS_BYTES_2_AND_3: [u8; 2] = [0x80, 0xA8]; const PS_BYTES_2_AND_3: [u8; 2] = [0x80, 0xA9]; @@ -14,6 +18,9 @@ const PS_BYTES_2_AND_3: [u8; 2] = [0x80, 0xA9]; static LINE_BREAK_TABLE: SafeByteMatchTable = safe_byte_match_table!(|b| matches!(b, b'\r' | b'\n' | LS_OR_PS_FIRST)); +static MULTILINE_COMMENT_START_TABLE: SafeByteMatchTable = + safe_byte_match_table!(|b| matches!(b, b'*' | b'\r' | b'\n' | LS_OR_PS_FIRST)); + impl<'a> Lexer<'a> { /// Section 12.4 Single Line Comment pub(super) fn skip_single_line_comment(&mut self) -> Kind { @@ -74,17 +81,94 @@ impl<'a> Lexer<'a> { /// Section 12.4 Multi Line Comment pub(super) fn skip_multi_line_comment(&mut self) -> Kind { - while let Some(c) = self.next_char() { - if c == '*' && self.next_eq('/') { - self.trivia_builder.add_multi_line_comment(self.token.start, self.offset()); - return Kind::Skip; - } - if is_line_terminator(c) { - self.token.is_on_new_line = true; - } + // If `is_on_new_line` is already set, go directly to faster search which only looks for `*/` + if self.token.is_on_new_line { + return self.skip_multi_line_comment_after_line_break(self.source.position()); + } + + // SAFETY: Requirement not to alter `pos` if return `true` from `if_continue` is satisfied + unsafe { + byte_search! { + lexer: self, + table: MULTILINE_COMMENT_START_TABLE, + continue_if: |next_byte, pos| { + // Match found. Decide whether to continue searching. + if next_byte == b'*' { + if pos.addr() < self.source.end_addr() - 1 { + // If next byte isn't `/`, continue + // SAFETY: Have checked there's at least 1 further byte to read + if pos.add(1).read() == b'/' { + // Consume `*/` + // SAFETY: Consuming `*/` leaves `pos` on a UTF-8 char boundary + pos = pos.add(2); + false + } else { + true + } + } else { + // This is last byte in file. Continue to `handle_eof`. + // This is illegal in valid JS, so mark this branch cold. + cold_branch(|| true) + } + } else if next_byte == LS_OR_PS_FIRST { + // `0xE2`. Could be first byte of LS/PS, or could be some other Unicode char. + // Either way, Unicode is uncommon, so make this a cold branch. + cold_branch(|| { + // SAFETY: Next byte is `0xE2` which is always 1st byte of a 3-byte UTF-8 char. + // So safe to advance `pos` by 1 and read 2 bytes. + let next2 = pos.add(1).read2(); + if next2 == LS_BYTES_2_AND_3 || next2 == PS_BYTES_2_AND_3 { + // Irregular line break + self.token.is_on_new_line = true; + // Ideally we'd go on to `skip_multi_line_comment_after_line_break` here + // but can't do that easily because can't use `return` in a closure. + // But irregular line breaks are rare anyway. + } + // Either way, continue searching + true + }) + } else { + // Regular line break. + // No need to look for more line breaks, so switch to faster search just for `*/`. + self.token.is_on_new_line = true; + return self.skip_multi_line_comment_after_line_break(pos.add(1)); + } + }, + handle_match: |_next_byte, _start| { + self.trivia_builder.add_multi_line_comment(self.token.start, self.offset()); + Kind::Skip + }, + handle_eof: |_start| { + self.error(diagnostics::UnterminatedMultiLineComment(self.unterminated_range())); + Kind::Eof + }, + }; + } + } + + fn skip_multi_line_comment_after_line_break(&mut self, pos: SourcePosition) -> Kind { + // Can use `memchr` here as only searching for 1 pattern. + // Cache `Finder` instance on `Lexer` as there's a significant cost to creating it. + // `Finder::new` isn't a const function, so can't make it a `static`, and `lazy_static!` + // has a cost each time it's deref-ed. Creating `Finder` unconditionally in `Lexer::new` + // would be efficient for files containing multi-line comments, but would impose pointless + // cost on files which don't. So this is the fastest solution. + if self.multi_line_comment_end_finder.is_none() { + self.multi_line_comment_end_finder = Some(Finder::new("*/")); + } + let finder = self.multi_line_comment_end_finder.as_ref().unwrap(); + + let remaining = self.source.str_from_pos_to_end(pos).as_bytes(); + if let Some(index) = finder.find(remaining) { + // SAFETY: `pos + index + 2` is end of `*/`, so a valid `SourcePosition` + self.source.set_position(unsafe { pos.add(index + 2) }); + self.trivia_builder.add_multi_line_comment(self.token.start, self.offset()); + Kind::Skip + } else { + self.source.advance_to_end(); + self.error(diagnostics::UnterminatedMultiLineComment(self.unterminated_range())); + Kind::Eof } - self.error(diagnostics::UnterminatedMultiLineComment(self.unterminated_range())); - Kind::Eof } /// Section 12.5 Hashbang Comments diff --git a/crates/oxc_parser/src/lexer/mod.rs b/crates/oxc_parser/src/lexer/mod.rs index 41ed2c6eb..8e8121920 100644 --- a/crates/oxc_parser/src/lexer/mod.rs +++ b/crates/oxc_parser/src/lexer/mod.rs @@ -95,6 +95,9 @@ pub struct Lexer<'a> { /// Data store for escaped templates, indexed by [Token::start] when [Token::escaped] is true /// `None` is saved when the string contains an invalid escape sequence. pub escaped_templates: FxHashMap>, + + /// `memchr` Finder for end of multi-line comments. Created lazily when first used. + multi_line_comment_end_finder: Option>, } #[allow(clippy::unused_self)] @@ -124,6 +127,7 @@ impl<'a> Lexer<'a> { trivia_builder: TriviaBuilder::default(), escaped_strings: FxHashMap::default(), escaped_templates: FxHashMap::default(), + multi_line_comment_end_finder: None, } } diff --git a/crates/oxc_parser/src/lexer/source.rs b/crates/oxc_parser/src/lexer/source.rs index a9efd6e84..689067232 100644 --- a/crates/oxc_parser/src/lexer/source.rs +++ b/crates/oxc_parser/src/lexer/source.rs @@ -192,6 +192,11 @@ impl<'a> Source<'a> { self.ptr = pos.ptr; } + #[inline] + pub(super) fn advance_to_end(&mut self) { + self.ptr = self.end; + } + /// Get string slice from a `SourcePosition` up to the current position of `Source`. pub(super) fn str_from_pos_to_current(&self, pos: SourcePosition) -> &'a str { assert!(pos.ptr <= self.ptr); @@ -226,6 +231,18 @@ impl<'a> Source<'a> { std::str::from_utf8_unchecked(slice) } + /// Get string slice from a `SourcePosition` up to the end of `Source`. + #[inline] + pub(super) fn str_from_pos_to_end(&self, pos: SourcePosition) -> &'a str { + // SAFETY: Invariants of `SourcePosition` is that it cannot be after end of `Source`, + // and always on a UTF-8 character boundary + unsafe { + let len = self.end as usize - pos.addr(); + let slice = slice::from_raw_parts(pos.ptr, len); + std::str::from_utf8_unchecked(slice) + } + } + /// Get current position in source, relative to start of source. #[allow(clippy::cast_possible_truncation)] #[inline]