From 383f5b3081ad8de3a62830972fb30e784c19e1dc Mon Sep 17 00:00:00 2001
From: overlookmotel <theoverlookmotel@gmail.com>
Date: Sun, 11 Feb 2024 04:43:14 +0000
Subject: [PATCH] perf(parser): consume multi-line comments faster (#2377)

Consume multi-line comments faster.

* Initially search for `*/`, `\r`, `\n` or `0xE2` (first byte of
irregular line breaks).
* Once a line break is found, switch to faster search which only looks
for `*/`, as it's not relevant whether there are more line breaks or
not.

Using `memchr` for the 2nd simpler search, as it's efficient for a
search with only one "needle".

Initializing `memchr::memmem::Finder` is fairly expensive, and tried
numerous ways to handle it. This is most performant way I could find.
Any ideas how to avoid re-creating it for each Lexer pass? (it can't be
a `static` as `Finder::new` is not a const function, and `lazy_static!`
is too costly)
---
 Cargo.lock                             |   1 +
 crates/oxc_parser/Cargo.toml           |   2 +
 crates/oxc_parser/src/lexer/comment.rs | 104 ++++++++++++++++++++++---
 crates/oxc_parser/src/lexer/mod.rs     |   4 +
 crates/oxc_parser/src/lexer/source.rs  |  17 ++++
 5 files changed, 118 insertions(+), 10 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index b6c46b3d9..3fdae0d46 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1593,6 +1593,7 @@ version = "0.7.0"
 dependencies = [
  "assert-unchecked",
  "bitflags 2.4.2",
+ "memchr",
  "miette",
  "num-bigint",
  "ouroboros",
diff --git a/crates/oxc_parser/Cargo.toml b/crates/oxc_parser/Cargo.toml
index 396662da6..66d9e9a57 100644
--- a/crates/oxc_parser/Cargo.toml
+++ b/crates/oxc_parser/Cargo.toml
@@ -31,6 +31,8 @@ rustc-hash       = { workspace = true }
 num-bigint       = { workspace = true }
 seq-macro        = { workspace = true }
 
+memchr = "2.7.1"
+
 [dev-dependencies]
 oxc_ast    = { workspace = true, features = ["serde"] }
 miette     = { workspace = true, features = ["fancy-no-backtrace"] }
diff --git a/crates/oxc_parser/src/lexer/comment.rs b/crates/oxc_parser/src/lexer/comment.rs
index 2293487a3..45088ef85 100644
--- a/crates/oxc_parser/src/lexer/comment.rs
+++ b/crates/oxc_parser/src/lexer/comment.rs
@@ -1,12 +1,16 @@
 use super::{
     cold_branch,
     search::{byte_search, safe_byte_match_table, SafeByteMatchTable},
+    source::SourcePosition,
     Kind, Lexer,
 };
 use crate::diagnostics;
 
 use oxc_syntax::identifier::is_line_terminator;
 
+use memchr::memmem::Finder;
+
+// Irregular line breaks - '\u{2028}' (LS) and '\u{2029}' (PS)
 const LS_OR_PS_FIRST: u8 = 0xE2;
 const LS_BYTES_2_AND_3: [u8; 2] = [0x80, 0xA8];
 const PS_BYTES_2_AND_3: [u8; 2] = [0x80, 0xA9];
@@ -14,6 +18,9 @@ const PS_BYTES_2_AND_3: [u8; 2] = [0x80, 0xA9];
 static LINE_BREAK_TABLE: SafeByteMatchTable =
     safe_byte_match_table!(|b| matches!(b, b'\r' | b'\n' | LS_OR_PS_FIRST));
 
+static MULTILINE_COMMENT_START_TABLE: SafeByteMatchTable =
+    safe_byte_match_table!(|b| matches!(b, b'*' | b'\r' | b'\n' | LS_OR_PS_FIRST));
+
 impl<'a> Lexer<'a> {
     /// Section 12.4 Single Line Comment
     pub(super) fn skip_single_line_comment(&mut self) -> Kind {
@@ -74,17 +81,94 @@ impl<'a> Lexer<'a> {
 
     /// Section 12.4 Multi Line Comment
     pub(super) fn skip_multi_line_comment(&mut self) -> Kind {
-        while let Some(c) = self.next_char() {
-            if c == '*' && self.next_eq('/') {
-                self.trivia_builder.add_multi_line_comment(self.token.start, self.offset());
-                return Kind::Skip;
-            }
-            if is_line_terminator(c) {
-                self.token.is_on_new_line = true;
-            }
+        // If `is_on_new_line` is already set, go directly to faster search which only looks for `*/`
+        if self.token.is_on_new_line {
+            return self.skip_multi_line_comment_after_line_break(self.source.position());
+        }
+
+        // SAFETY: Requirement not to alter `pos` if return `true` from `if_continue` is satisfied
+        unsafe {
+            byte_search! {
+                lexer: self,
+                table: MULTILINE_COMMENT_START_TABLE,
+                continue_if: |next_byte, pos| {
+                    // Match found. Decide whether to continue searching.
+                    if next_byte == b'*' {
+                        if pos.addr() < self.source.end_addr() - 1 {
+                            // If next byte isn't `/`, continue
+                            // SAFETY: Have checked there's at least 1 further byte to read
+                            if pos.add(1).read() == b'/' {
+                                // Consume `*/`
+                                // SAFETY: Consuming `*/` leaves `pos` on a UTF-8 char boundary
+                                pos = pos.add(2);
+                                false
+                            } else {
+                                true
+                            }
+                        } else {
+                            // This is last byte in file. Continue to `handle_eof`.
+                            // This is illegal in valid JS, so mark this branch cold.
+                            cold_branch(|| true)
+                        }
+                    } else if next_byte == LS_OR_PS_FIRST {
+                        // `0xE2`. Could be first byte of LS/PS, or could be some other Unicode char.
+                        // Either way, Unicode is uncommon, so make this a cold branch.
+                        cold_branch(|| {
+                            // SAFETY: Next byte is `0xE2` which is always 1st byte of a 3-byte UTF-8 char.
+                            // So safe to advance `pos` by 1 and read 2 bytes.
+                            let next2 = pos.add(1).read2();
+                            if next2 == LS_BYTES_2_AND_3 || next2 == PS_BYTES_2_AND_3 {
+                                // Irregular line break
+                                self.token.is_on_new_line = true;
+                                // Ideally we'd go on to `skip_multi_line_comment_after_line_break` here
+                                // but can't do that easily because can't use `return` in a closure.
+                                // But irregular line breaks are rare anyway.
+                            }
+                            // Either way, continue searching
+                            true
+                        })
+                    } else {
+                        // Regular line break.
+                        // No need to look for more line breaks, so switch to faster search just for `*/`.
+                        self.token.is_on_new_line = true;
+                        return self.skip_multi_line_comment_after_line_break(pos.add(1));
+                    }
+                },
+                handle_match: |_next_byte, _start| {
+                    self.trivia_builder.add_multi_line_comment(self.token.start, self.offset());
+                    Kind::Skip
+                },
+                handle_eof: |_start| {
+                    self.error(diagnostics::UnterminatedMultiLineComment(self.unterminated_range()));
+                    Kind::Eof
+                },
+            };
+        }
+    }
+
+    fn skip_multi_line_comment_after_line_break(&mut self, pos: SourcePosition) -> Kind {
+        // Can use `memchr` here as only searching for 1 pattern.
+        // Cache `Finder` instance on `Lexer` as there's a significant cost to creating it.
+        // `Finder::new` isn't a const function, so can't make it a `static`, and `lazy_static!`
+        // has a cost each time it's deref-ed. Creating `Finder` unconditionally in `Lexer::new`
+        // would be efficient for files containing multi-line comments, but would impose pointless
+        // cost on files which don't. So this is the fastest solution.
+        if self.multi_line_comment_end_finder.is_none() {
+            self.multi_line_comment_end_finder = Some(Finder::new("*/"));
+        }
+        let finder = self.multi_line_comment_end_finder.as_ref().unwrap();
+
+        let remaining = self.source.str_from_pos_to_end(pos).as_bytes();
+        if let Some(index) = finder.find(remaining) {
+            // SAFETY: `pos + index + 2` is end of `*/`, so a valid `SourcePosition`
+            self.source.set_position(unsafe { pos.add(index + 2) });
+            self.trivia_builder.add_multi_line_comment(self.token.start, self.offset());
+            Kind::Skip
+        } else {
+            self.source.advance_to_end();
+            self.error(diagnostics::UnterminatedMultiLineComment(self.unterminated_range()));
+            Kind::Eof
         }
-        self.error(diagnostics::UnterminatedMultiLineComment(self.unterminated_range()));
-        Kind::Eof
     }
 
     /// Section 12.5 Hashbang Comments
diff --git a/crates/oxc_parser/src/lexer/mod.rs b/crates/oxc_parser/src/lexer/mod.rs
index 41ed2c6eb..8e8121920 100644
--- a/crates/oxc_parser/src/lexer/mod.rs
+++ b/crates/oxc_parser/src/lexer/mod.rs
@@ -95,6 +95,9 @@ pub struct Lexer<'a> {
     /// Data store for escaped templates, indexed by [Token::start] when [Token::escaped] is true
     /// `None` is saved when the string contains an invalid escape sequence.
     pub escaped_templates: FxHashMap<u32, Option<&'a str>>,
+
+    /// `memchr` Finder for end of multi-line comments. Created lazily when first used.
+    multi_line_comment_end_finder: Option<memchr::memmem::Finder<'static>>,
 }
 
 #[allow(clippy::unused_self)]
@@ -124,6 +127,7 @@ impl<'a> Lexer<'a> {
             trivia_builder: TriviaBuilder::default(),
             escaped_strings: FxHashMap::default(),
             escaped_templates: FxHashMap::default(),
+            multi_line_comment_end_finder: None,
         }
     }
 
diff --git a/crates/oxc_parser/src/lexer/source.rs b/crates/oxc_parser/src/lexer/source.rs
index a9efd6e84..689067232 100644
--- a/crates/oxc_parser/src/lexer/source.rs
+++ b/crates/oxc_parser/src/lexer/source.rs
@@ -192,6 +192,11 @@ impl<'a> Source<'a> {
         self.ptr = pos.ptr;
     }
 
+    #[inline]
+    pub(super) fn advance_to_end(&mut self) {
+        self.ptr = self.end;
+    }
+
     /// Get string slice from a `SourcePosition` up to the current position of `Source`.
     pub(super) fn str_from_pos_to_current(&self, pos: SourcePosition) -> &'a str {
         assert!(pos.ptr <= self.ptr);
@@ -226,6 +231,18 @@ impl<'a> Source<'a> {
         std::str::from_utf8_unchecked(slice)
     }
 
+    /// Get string slice from a `SourcePosition` up to the end of `Source`.
+    #[inline]
+    pub(super) fn str_from_pos_to_end(&self, pos: SourcePosition) -> &'a str {
+        // SAFETY: Invariants of `SourcePosition` is that it cannot be after end of `Source`,
+        // and always on a UTF-8 character boundary
+        unsafe {
+            let len = self.end as usize - pos.addr();
+            let slice = slice::from_raw_parts(pos.ptr, len);
+            std::str::from_utf8_unchecked(slice)
+        }
+    }
+
     /// Get current position in source, relative to start of source.
     #[allow(clippy::cast_possible_truncation)]
     #[inline]