refactor(parser): continue_if in byte_search macro not unsafe (#2440)

#2439 made using `continue_if` in `byte_search!` macro safe, as it no longer continues the main loop after a match, so no danger of reading out of bounds if `continue_if` code fast-forwards the current position. This follow-on PR removes the unsafe blocks, and uses that fast-forward ability in a couple of places.
2026-05-24 20:32:10 +00:00 · 2024-02-20 02:45:31 +00:00 · 2024-02-20 02:45:31 +00:00 · a78303d5a6
commit a78303d5a6
parent a5a3c695f7
2 changed files with 140 additions and 152 deletions
--- a/crates/oxc_parser/src/lexer/comment.rs
+++ b/crates/oxc_parser/src/lexer/comment.rs
@ -24,59 +24,60 @@ static MULTILINE_COMMENT_START_TABLE: SafeByteMatchTable =
 impl<'a> Lexer<'a> {
    /// Section 12.4 Single Line Comment
    pub(super) fn skip_single_line_comment(&mut self) -> Kind {
-        // SAFETY: Requirement not to alter `pos` if return `true` from `if_continue` is satisfied
-        unsafe {
-            byte_search! {
-                lexer: self,
-                table: LINE_BREAK_TABLE,
-                continue_if: |next_byte, pos| {
-                    // Match found. Decide whether to continue searching.
-                    // If this is end of comment, create trivia, and advance `pos` to after line break.
-                    // Do that here rather than in `handle_match`, to avoid branching twice on value of
-                    // the matched byte.
-                    #[allow(clippy::if_not_else)]
-                    if next_byte != LS_OR_PS_FIRST {
-                        // `\r` or `\n`
-                        self.trivia_builder
-                            .add_single_line_comment(self.token.start, self.source.offset_of(pos));
-                        // SAFETY: Safe to consume `\r` or `\n` as both are ASCII
-                        pos = pos.add(1);
-                        // We've found the end. Do not continue searching.
-                        false
-                    } else {
-                        // `0xE2`. Could be first byte of LS/PS, or could be some other Unicode char.
-                        // Either way, Unicode is uncommon, so make this a cold branch.
-                        cold_branch(|| {
-                            // SAFETY: Next byte is `0xE2` which is always 1st byte of a 3-byte UTF-8 char.
-                            // So safe to advance `pos` by 1 and read 2 bytes.
-                            let next2 = pos.add(1).read2();
-                            if next2 == LS_BYTES_2_AND_3 || next2 == PS_BYTES_2_AND_3 {
-                                // Irregular line break
-                                self.trivia_builder
-                                    .add_single_line_comment(self.token.start, self.source.offset_of(pos));
-                                // Advance `pos` to after this char.
-                                // SAFETY: `0xE2` is always 1st byte of a 3-byte UTF-8 char,
-                                // so consuming 3 bytes will place `pos` on next UTF-8 char boundary.
-                                pos = pos.add(3);
-                                // We've found the end. Do not continue searching.
-                                false
-                            } else {
-                                // Some other Unicode char beginning with `0xE2`. Continue searching.
-                                true
-                            }
-                        })
-                    }
-                },
-                handle_match: |_next_byte, _start| {
-                    self.token.is_on_new_line = true;
-                    Kind::Skip
-                },
-                handle_eof: |_start| {
-                    self.trivia_builder.add_single_line_comment(self.token.start, self.offset());
-                    Kind::Skip
-                },
-            };
-        }
+        byte_search! {
+            lexer: self,
+            table: LINE_BREAK_TABLE,
+            continue_if: |next_byte, pos| {
+                // Match found. Decide whether to continue searching.
+                // If this is end of comment, create trivia, and advance `pos` to after line break.
+                // Do that here rather than in `handle_match`, to avoid branching twice on value of
+                // the matched byte.
+                #[allow(clippy::if_not_else)]
+                if next_byte != LS_OR_PS_FIRST {
+                    // `\r` or `\n`
+                    self.trivia_builder
+                        .add_single_line_comment(self.token.start, self.source.offset_of(pos));
+                    // SAFETY: Safe to consume `\r` or `\n` as both are ASCII
+                    unsafe { pos = pos.add(1) };
+                    // We've found the end. Do not continue searching.
+                    false
+                } else {
+                    // `0xE2`. Could be first byte of LS/PS, or could be some other Unicode char.
+                    // Either way, Unicode is uncommon, so make this a cold branch.
+                    cold_branch(|| {
+                        // SAFETY: Next byte is `0xE2` which is always 1st byte of a 3-byte UTF-8 char.
+                        // So safe to advance `pos` by 1 and read 2 bytes.
+                        let next2 = unsafe { pos.add(1).read2() };
+                        if next2 == LS_BYTES_2_AND_3 || next2 == PS_BYTES_2_AND_3 {
+                            // Irregular line break
+                            self.trivia_builder
+                                .add_single_line_comment(self.token.start, self.source.offset_of(pos));
+                            // Advance `pos` to after this char.
+                            // SAFETY: `0xE2` is always 1st byte of a 3-byte UTF-8 char,
+                            // so consuming 3 bytes will place `pos` on next UTF-8 char boundary.
+                            unsafe { pos = pos.add(3) };
+                            // We've found the end. Do not continue searching.
+                            false
+                        } else {
+                            // Some other Unicode char beginning with `0xE2`.
+                            // Skip 3 bytes (macro skips 1 already, so skip 2 here), and continue searching.
+                            // SAFETY: `0xE2` is always 1st byte of a 3-byte UTF-8 char,
+                            // so consuming 3 bytes will place `pos` on next UTF-8 char boundary.
+                            unsafe { pos = pos.add(2) };
+                            true
+                        }
+                    })
+                }
+            },
+            handle_match: |_next_byte, _start| {
+                self.token.is_on_new_line = true;
+                Kind::Skip
+            },
+            handle_eof: |_start| {
+                self.trivia_builder.add_single_line_comment(self.token.start, self.offset());
+                Kind::Skip
+            },
+        };
    }

    /// Section 12.4 Multi Line Comment
@ -86,64 +87,67 @@ impl<'a> Lexer<'a> {
            return self.skip_multi_line_comment_after_line_break(self.source.position());
        }

-        // SAFETY: Requirement not to alter `pos` if return `true` from `if_continue` is satisfied
-        unsafe {
-            byte_search! {
-                lexer: self,
-                table: MULTILINE_COMMENT_START_TABLE,
-                continue_if: |next_byte, pos| {
-                    // Match found. Decide whether to continue searching.
-                    if next_byte == b'*' {
-                        if pos.addr() < self.source.end_addr() - 1 {
-                            // If next byte isn't `/`, continue
-                            // SAFETY: Have checked there's at least 1 further byte to read
-                            if pos.add(1).read() == b'/' {
-                                // Consume `*/`
-                                // SAFETY: Consuming `*/` leaves `pos` on a UTF-8 char boundary
-                                pos = pos.add(2);
-                                false
-                            } else {
-                                true
-                            }
+        byte_search! {
+            lexer: self,
+            table: MULTILINE_COMMENT_START_TABLE,
+            continue_if: |next_byte, pos| {
+                // Match found. Decide whether to continue searching.
+                if next_byte == b'*' {
+                    if pos.addr() < self.source.end_addr() - 1 {
+                        // If next byte isn't `/`, continue
+                        // SAFETY: Have checked there's at least 1 further byte to read
+                        if unsafe { pos.add(1).read() } == b'/' {
+                            // Consume `*/`
+                            // SAFETY: Consuming `*/` leaves `pos` on a UTF-8 char boundary
+                            unsafe { pos = pos.add(2) };
+                            false
                        } else {
-                            // This is last byte in file. Continue to `handle_eof`.
-                            // This is illegal in valid JS, so mark this branch cold.
-                            cold_branch(|| true)
-                        }
-                    } else if next_byte == LS_OR_PS_FIRST {
-                        // `0xE2`. Could be first byte of LS/PS, or could be some other Unicode char.
-                        // Either way, Unicode is uncommon, so make this a cold branch.
-                        cold_branch(|| {
-                            // SAFETY: Next byte is `0xE2` which is always 1st byte of a 3-byte UTF-8 char.
-                            // So safe to advance `pos` by 1 and read 2 bytes.
-                            let next2 = pos.add(1).read2();
-                            if next2 == LS_BYTES_2_AND_3 || next2 == PS_BYTES_2_AND_3 {
-                                // Irregular line break
-                                self.token.is_on_new_line = true;
-                                // Ideally we'd go on to `skip_multi_line_comment_after_line_break` here
-                                // but can't do that easily because can't use `return` in a closure.
-                                // But irregular line breaks are rare anyway.
-                            }
-                            // Either way, continue searching
                            true
-                        })
+                        }
                    } else {
-                        // Regular line break.
-                        // No need to look for more line breaks, so switch to faster search just for `*/`.
-                        self.token.is_on_new_line = true;
-                        return self.skip_multi_line_comment_after_line_break(pos.add(1));
+                        // This is last byte in file. Continue to `handle_eof`.
+                        // This is illegal in valid JS, so mark this branch cold.
+                        cold_branch(|| true)
                    }
-                },
-                handle_match: |_next_byte, _start| {
-                    self.trivia_builder.add_multi_line_comment(self.token.start, self.offset());
-                    Kind::Skip
-                },
-                handle_eof: |_start| {
-                    self.error(diagnostics::UnterminatedMultiLineComment(self.unterminated_range()));
-                    Kind::Eof
-                },
-            };
-        }
+                } else if next_byte == LS_OR_PS_FIRST {
+                    // `0xE2`. Could be first byte of LS/PS, or could be some other Unicode char.
+                    // Either way, Unicode is uncommon, so make this a cold branch.
+                    cold_branch(|| {
+                        // SAFETY: Next byte is `0xE2` which is always 1st byte of a 3-byte UTF-8 char.
+                        // So safe to advance `pos` by 1 and read 2 bytes.
+                        let next2 = unsafe { pos.add(1).read2() };
+                        if next2 == LS_BYTES_2_AND_3 || next2 == PS_BYTES_2_AND_3 {
+                            // Irregular line break
+                            self.token.is_on_new_line = true;
+                            // Ideally we'd go on to `skip_multi_line_comment_after_line_break` here
+                            // but can't do that easily because can't use `return` in a closure.
+                            // But irregular line breaks are rare anyway.
+                        }
+                        // Either way, continue searching.
+                        // Skip 3 bytes (macro skips 1 already, so skip 2 here), and continue searching.
+                        // SAFETY: `0xE2` is always 1st byte of a 3-byte UTF-8 char,
+                        // so consuming 3 bytes will place `pos` on next UTF-8 char boundary.
+                        unsafe { pos = pos.add(2) };
+                        true
+                    })
+                } else {
+                    // Regular line break.
+                    // No need to look for more line breaks, so switch to faster search just for `*/`.
+                    self.token.is_on_new_line = true;
+                    // SAFETY: Regular line breaks are ASCII, so skipping 1 byte is a UTF-8 char boundary.
+                    let after_line_break = unsafe { pos.add(1) };
+                    return self.skip_multi_line_comment_after_line_break(after_line_break);
+                }
+            },
+            handle_match: |_next_byte, _start| {
+                self.trivia_builder.add_multi_line_comment(self.token.start, self.offset());
+                Kind::Skip
+            },
+            handle_eof: |_start| {
+                self.error(diagnostics::UnterminatedMultiLineComment(self.unterminated_range()));
+                Kind::Eof
+            },
+        };
    }

    fn skip_multi_line_comment_after_line_break(&mut self, pos: SourcePosition) -> Kind {
--- a/crates/oxc_parser/src/lexer/search.rs
+++ b/crates/oxc_parser/src/lexer/search.rs
@ -328,40 +328,36 @@ pub(crate) use safe_byte_match_table;
 /// ```
 /// impl<'a> Lexer<'a> {
 ///   fn eat_stuff(&mut self) -> bool {
-///     // SAFETY: It is unsafe to use `continue_if`. See requirements below.
-///     unsafe {
-///       byte_search! {
-///         lexer: self,
-///         table: NOT_STUFF_TABLE,
-///         continue_if: |matched_byte, pos| {
-///           // Matching byte found. Decide whether it's really a match.
-///           // NB: `lexer.source` has NOT been updated at this point.
-///           // SAFETY: If return `true` to continue searching, must NOT alter `pos`.
-///           if matched_byte == 0xE2 {
-///             // Only match a specific Unicode char (in this case 0xE2, 0x80, 0xA8)
-///             unsafe { pos.add(1).read() != 0x80 || pos.add(2).read() != 0xA8) }
-///           } else {
-///             // All others do match. `handle_match` is executed.
-///             false
-///           }
-///         },
-///         handle_match: |matched_byte| {
-///           // Matching byte has been found and `continue_if` returned `false` for it.
-///           // `matched_byte` is `u8` value of first byte which matched the table.
-///           // `lexer.source` is now positioned on first matching byte.
-///           // Handle the next matching byte (deal with any special cases).
-///           // Value this block evaluates to will be returned from enclosing function.
-///           true
-///         },
-///         handle_eof: || {
-///           // No bytes from start position to end of source matched the table.
-///           // `lexer.source` is now positioned at EOF.
-///           // Handle EOF in some way.
-///           // Value this block evaluates to will be returned from enclosing function.
+///     byte_search! {
+///       lexer: self,
+///       table: NOT_STUFF_TABLE,
+///       continue_if: |matched_byte, pos| {
+///         // Matching byte found. Decide whether it's really a match.
+///         // NB: `lexer.source` has NOT been updated at this point.
+///         if matched_byte == 0xE2 {
+///           // Only match a specific Unicode char (in this case 0xE2, 0x80, 0xA8)
+///           unsafe { pos.add(1).read() != 0x80 || pos.add(2).read() != 0xA8) }
+///         } else {
+///           // All others do match. `handle_match` is executed.
 ///           false
-///         },
-///       };
-///     }
+///         }
+///       },
+///       handle_match: |matched_byte| {
+///         // Matching byte has been found and `continue_if` returned `false` for it.
+///         // `matched_byte` is `u8` value of first byte which matched the table.
+///         // `lexer.source` is now positioned on first matching byte.
+///         // Handle the next matching byte (deal with any special cases).
+///         // Value this block evaluates to will be returned from enclosing function.
+///         true
+///       },
+///       handle_eof: || {
+///         // No bytes from start position to end of source matched the table.
+///         // `lexer.source` is now positioned at EOF.
+///         // Handle EOF in some way.
+///         // Value this block evaluates to will be returned from enclosing function.
+///         false
+///       },
+///     };
 ///
 ///     // This is unreachable.
 ///     // Macro always exits current function with a `return` statement.
@ -414,12 +410,6 @@ macro_rules! byte_search {
        handle_match: |$match_byte:ident, $match_start:ident| $match_handler:expr,
        handle_eof: |$eof_start:ident| $eof_handler:expr,
    ) => {{
-        // User has free access to change `$pos` in `continue_if`.
-        // They must satisfy safety requirements explained above.
-        #[inline]
-        unsafe fn unsafe_noop() {}
-        unsafe_noop();
-
        let start = $lexer.source.position();
        byte_search! {
            lexer: $lexer,
@ -458,12 +448,6 @@ macro_rules! byte_search {
        handle_match: |$match_byte:ident| $match_handler:expr,
        handle_eof: || $eof_handler:expr,
    ) => {{
-        // User has free access to change `$pos` in `continue_if`.
-        // They must satisfy safety requirements explained above.
-        #[inline]
-        unsafe fn unsafe_noop() {}
-        unsafe_noop();
-
        byte_search! {
            lexer: $lexer,
            table: $table,