perf(parser): consume single-line comments faster (#2374)

Use the `byte_search!` macro to consume single-line comments. This would be a lot simpler if we didn't have to deal with irregular line breaks. Damn you Unicode!
2026-05-24 12:21:58 +00:00 · 2024-02-10 03:02:30 +00:00 · 2024-02-10 03:02:30 +00:00 · c4fa738312
commit c4fa738312
parent b29719d2df
2 changed files with 64 additions and 14 deletions
--- a/crates/oxc_parser/src/lexer/comment.rs
+++ b/crates/oxc_parser/src/lexer/comment.rs
@ -1,24 +1,75 @@
-use super::{Kind, Lexer};
+use super::{
+    cold_branch,
+    search::{byte_search, safe_byte_match_table, SafeByteMatchTable},
+    Kind, Lexer,
+};
 use crate::diagnostics;

 use oxc_syntax::identifier::is_line_terminator;

+const LS_OR_PS_FIRST: u8 = 0xE2;
+const LS_BYTES_2_AND_3: [u8; 2] = [0x80, 0xA8];
+const PS_BYTES_2_AND_3: [u8; 2] = [0x80, 0xA9];
+
+static LINE_BREAK_TABLE: SafeByteMatchTable =
+    safe_byte_match_table!(|b| matches!(b, b'\r' | b'\n' | LS_OR_PS_FIRST));
+
 impl<'a> Lexer<'a> {
    /// Section 12.4 Single Line Comment
-    #[allow(clippy::cast_possible_truncation)]
    pub(super) fn skip_single_line_comment(&mut self) -> Kind {
-        let start = self.token.start;
-        while let Some(c) = self.next_char() {
-            if is_line_terminator(c) {
-                self.token.is_on_new_line = true;
-                self.trivia_builder
-                    .add_single_line_comment(start, self.offset() - c.len_utf8() as u32);
-                return Kind::Skip;
-            }
+        // SAFETY: Requirement not to alter `pos` when returning `true` from `continue_if` is satisfied
+        unsafe {
+            byte_search! {
+                lexer: self,
+                table: LINE_BREAK_TABLE,
+                continue_if: |next_byte, pos| {
+                    // Match found. Decide whether to continue searching.
+                    // If this is end of comment, create trivia, and advance `pos` to after line break.
+                    // Do that here rather than in `handle_match`, to avoid branching twice on value of
+                    // the matched byte.
+                    #[allow(clippy::if_not_else)]
+                    if next_byte != LS_OR_PS_FIRST {
+                        // `\r` or `\n`
+                        self.trivia_builder
+                            .add_single_line_comment(self.token.start, self.source.offset_of(pos));
+                        // SAFETY: Safe to consume `\r` or `\n` as both are ASCII
+                        pos = pos.add(1);
+                        // We've found the end. Do not continue searching.
+                        false
+                    } else {
+                        // `0xE2`. Could be first byte of LS/PS, or could be some other Unicode char.
+                        // Either way, Unicode is uncommon, so make this a cold branch.
+                        cold_branch(|| {
+                            // SAFETY: Next byte is `0xE2` which is always 1st byte of a 3-byte UTF-8 char.
+                            // So safe to advance `pos` by 1 and read 2 bytes.
+                            let next2 = pos.add(1).read2();
+                            if next2 == LS_BYTES_2_AND_3 || next2 == PS_BYTES_2_AND_3 {
+                                // Irregular line break
+                                self.trivia_builder
+                                    .add_single_line_comment(self.token.start, self.source.offset_of(pos));
+                                // Advance `pos` to after this char.
+                                // SAFETY: `0xE2` is always 1st byte of a 3-byte UTF-8 char,
+                                // so consuming 3 bytes will place `pos` on next UTF-8 char boundary.
+                                pos = pos.add(3);
+                                // We've found the end. Do not continue searching.
+                                false
+                            } else {
+                                // Some other Unicode char beginning with `0xE2`. Continue searching.
+                                true
+                            }
+                        })
+                    }
+                },
+                handle_match: |_next_byte, _start| {
+                    self.token.is_on_new_line = true;
+                    Kind::Skip
+                },
+                handle_eof: |_start| {
+                    self.trivia_builder.add_single_line_comment(self.token.start, self.offset());
+                    Kind::Skip
+                },
+            };
        }
-        // EOF
-        self.trivia_builder.add_single_line_comment(start, self.offset());
-        Kind::Skip
    }

    /// Section 12.4 Multi Line Comment
--- a/crates/oxc_parser/src/lexer/source.rs
+++ b/crates/oxc_parser/src/lexer/source.rs
@ -534,7 +534,6 @@ impl<'a> SourcePosition<'a> {
    /// # SAFETY
    /// Caller must ensure `SourcePosition` is no later than 2 bytes before end of source text.
    /// i.e. if source length is 10, `self` must be on position 8 max.
-    #[allow(dead_code)]
    #[inline]
    pub(super) unsafe fn read2(self) -> [u8; 2] {
        // SAFETY: