refactor(parser): continue_if in byte_search macro not unsafe (#2440)

#2439 made using `continue_if` in `byte_search!` macro safe, as it no longer continues the main loop after a match, so no danger of reading out of bounds if `continue_if` code fast-forwards the current position. This follow-on PR removes the unsafe blocks, and uses that fast-forward ability in a couple of places.
2026-05-24 20:32:10 +00:00 · 2024-02-20 02:45:31 +00:00 · 2024-02-20 02:45:31 +00:00 · a78303d5a6
commit a78303d5a6
parent a5a3c695f7
2 changed files with 140 additions and 152 deletions
--- a/crates/oxc_parser/src/lexer/comment.rs
+++ b/crates/oxc_parser/src/lexer/comment.rs
@ -24,59 +24,60 @@ static MULTILINE_COMMENT_START_TABLE: SafeByteMatchTable =
 impl<'a> Lexer<'a> {
    /// Section 12.4 Single Line Comment
    pub(super) fn skip_single_line_comment(&mut self) -> Kind {
-        // SAFETY: Requirement not to alter `pos` if return `true` from `if_continue` is satisfied
+        byte_search! {
-        unsafe {
+            lexer: self,
-            byte_search! {
+            table: LINE_BREAK_TABLE,
-                lexer: self,
+            continue_if: |next_byte, pos| {
-                table: LINE_BREAK_TABLE,
+                // Match found. Decide whether to continue searching.
-                continue_if: |next_byte, pos| {
+                // If this is end of comment, create trivia, and advance `pos` to after line break.
-                    // Match found. Decide whether to continue searching.
+                // Do that here rather than in `handle_match`, to avoid branching twice on value of
-                    // If this is end of comment, create trivia, and advance `pos` to after line break.
+                // the matched byte.
-                    // Do that here rather than in `handle_match`, to avoid branching twice on value of
+                #[allow(clippy::if_not_else)]
-                    // the matched byte.
+                if next_byte != LS_OR_PS_FIRST {
-                    #[allow(clippy::if_not_else)]
+                    // `\r` or `\n`
-                    if next_byte != LS_OR_PS_FIRST {
+                    self.trivia_builder
-                        // `\r` or `\n`
+                        .add_single_line_comment(self.token.start, self.source.offset_of(pos));
-                        self.trivia_builder
+                    // SAFETY: Safe to consume `\r` or `\n` as both are ASCII
-                            .add_single_line_comment(self.token.start, self.source.offset_of(pos));
+                    unsafe { pos = pos.add(1) };
-                        // SAFETY: Safe to consume `\r` or `\n` as both are ASCII
+                    // We've found the end. Do not continue searching.
-                        pos = pos.add(1);
+                    false
-                        // We've found the end. Do not continue searching.
+                } else {
-                        false
+                    // `0xE2`. Could be first byte of LS/PS, or could be some other Unicode char.
-                    } else {
+                    // Either way, Unicode is uncommon, so make this a cold branch.
-                        // `0xE2`. Could be first byte of LS/PS, or could be some other Unicode char.
+                    cold_branch(|| {
-                        // Either way, Unicode is uncommon, so make this a cold branch.
+                        // SAFETY: Next byte is `0xE2` which is always 1st byte of a 3-byte UTF-8 char.
-                        cold_branch(|| {
+                        // So safe to advance `pos` by 1 and read 2 bytes.
-                            // SAFETY: Next byte is `0xE2` which is always 1st byte of a 3-byte UTF-8 char.
+                        let next2 = unsafe { pos.add(1).read2() };
-                            // So safe to advance `pos` by 1 and read 2 bytes.
+                        if next2 == LS_BYTES_2_AND_3 || next2 == PS_BYTES_2_AND_3 {
-                            let next2 = pos.add(1).read2();
+                            // Irregular line break
-                            if next2 == LS_BYTES_2_AND_3 || next2 == PS_BYTES_2_AND_3 {
+                            self.trivia_builder
-                                // Irregular line break
+                                .add_single_line_comment(self.token.start, self.source.offset_of(pos));
-                                self.trivia_builder
+                            // Advance `pos` to after this char.
-                                    .add_single_line_comment(self.token.start, self.source.offset_of(pos));
+                            // SAFETY: `0xE2` is always 1st byte of a 3-byte UTF-8 char,
-                                // Advance `pos` to after this char.
+                            // so consuming 3 bytes will place `pos` on next UTF-8 char boundary.
-                                // SAFETY: `0xE2` is always 1st byte of a 3-byte UTF-8 char,
+                            unsafe { pos = pos.add(3) };
-                                // so consuming 3 bytes will place `pos` on next UTF-8 char boundary.
+                            // We've found the end. Do not continue searching.
-                                pos = pos.add(3);
+                            false
-                                // We've found the end. Do not continue searching.
+                        } else {
-                                false
+                            // Some other Unicode char beginning with `0xE2`.
-                            } else {
+                            // Skip 3 bytes (macro skips 1 already, so skip 2 here), and continue searching.
-                                // Some other Unicode char beginning with `0xE2`. Continue searching.
+                            // SAFETY: `0xE2` is always 1st byte of a 3-byte UTF-8 char,
-                                true
+                            // so consuming 3 bytes will place `pos` on next UTF-8 char boundary.
-                            }
+                            unsafe { pos = pos.add(2) };
-                        })
+                            true
-                    }
+                        }
-                },
+                    })
-                handle_match: |_next_byte, _start| {
+                }
-                    self.token.is_on_new_line = true;
+            },
-                    Kind::Skip
+            handle_match: |_next_byte, _start| {
-                },
+                self.token.is_on_new_line = true;
-                handle_eof: |_start| {
+                Kind::Skip
-                    self.trivia_builder.add_single_line_comment(self.token.start, self.offset());
+            },
-                    Kind::Skip
+            handle_eof: |_start| {
-                },
+                self.trivia_builder.add_single_line_comment(self.token.start, self.offset());
-            };
+                Kind::Skip
-        }
+            },
        };
    }
    /// Section 12.4 Multi Line Comment
@ -86,64 +87,67 @@ impl<'a> Lexer<'a> {
            return self.skip_multi_line_comment_after_line_break(self.source.position());
        }
-        // SAFETY: Requirement not to alter `pos` if return `true` from `if_continue` is satisfied
+        byte_search! {
-        unsafe {
+            lexer: self,
-            byte_search! {
+            table: MULTILINE_COMMENT_START_TABLE,
-                lexer: self,
+            continue_if: |next_byte, pos| {
-                table: MULTILINE_COMMENT_START_TABLE,
+                // Match found. Decide whether to continue searching.
-                continue_if: |next_byte, pos| {
+                if next_byte == b'*' {
-                    // Match found. Decide whether to continue searching.
+                    if pos.addr() < self.source.end_addr() - 1 {
-                    if next_byte == b'*' {
+                        // If next byte isn't `/`, continue
-                        if pos.addr() < self.source.end_addr() - 1 {
+                        // SAFETY: Have checked there's at least 1 further byte to read
-                            // If next byte isn't `/`, continue
+                        if unsafe { pos.add(1).read() } == b'/' {
-                            // SAFETY: Have checked there's at least 1 further byte to read
+                            // Consume `*/`
-                            if pos.add(1).read() == b'/' {
+                            // SAFETY: Consuming `*/` leaves `pos` on a UTF-8 char boundary
-                                // Consume `*/`
+                            unsafe { pos = pos.add(2) };
-                                // SAFETY: Consuming `*/` leaves `pos` on a UTF-8 char boundary
+                            false
                                pos = pos.add(2);
                                false
                            } else {
                                true
                            }
                        } else {
                            // This is last byte in file. Continue to `handle_eof`.
                            // This is illegal in valid JS, so mark this branch cold.
                            cold_branch(|| true)
                        }
                    } else if next_byte == LS_OR_PS_FIRST {
                        // `0xE2`. Could be first byte of LS/PS, or could be some other Unicode char.
                        // Either way, Unicode is uncommon, so make this a cold branch.
                        cold_branch(|| {
                            // SAFETY: Next byte is `0xE2` which is always 1st byte of a 3-byte UTF-8 char.
                            // So safe to advance `pos` by 1 and read 2 bytes.
                            let next2 = pos.add(1).read2();
                            if next2 == LS_BYTES_2_AND_3 || next2 == PS_BYTES_2_AND_3 {
                                // Irregular line break
                                self.token.is_on_new_line = true;
                                // Ideally we'd go on to `skip_multi_line_comment_after_line_break` here
                                // but can't do that easily because can't use `return` in a closure.
                                // But irregular line breaks are rare anyway.
                            }
                            // Either way, continue searching
                            true
-                        })
+                        }
                    } else {
-                        // Regular line break.
+                        // This is last byte in file. Continue to `handle_eof`.
-                        // No need to look for more line breaks, so switch to faster search just for `*/`.
+                        // This is illegal in valid JS, so mark this branch cold.
-                        self.token.is_on_new_line = true;
+                        cold_branch(|| true)
                        return self.skip_multi_line_comment_after_line_break(pos.add(1));
                    }
-                },
+                } else if next_byte == LS_OR_PS_FIRST {
-                handle_match: |_next_byte, _start| {
+                    // `0xE2`. Could be first byte of LS/PS, or could be some other Unicode char.
-                    self.trivia_builder.add_multi_line_comment(self.token.start, self.offset());
+                    // Either way, Unicode is uncommon, so make this a cold branch.
-                    Kind::Skip
+                    cold_branch(|| {
-                },
+                        // SAFETY: Next byte is `0xE2` which is always 1st byte of a 3-byte UTF-8 char.
-                handle_eof: |_start| {
+                        // So safe to advance `pos` by 1 and read 2 bytes.
-                    self.error(diagnostics::UnterminatedMultiLineComment(self.unterminated_range()));
+                        let next2 = unsafe { pos.add(1).read2() };
-                    Kind::Eof
+                        if next2 == LS_BYTES_2_AND_3 || next2 == PS_BYTES_2_AND_3 {
-                },
+                            // Irregular line break
-            };
+                            self.token.is_on_new_line = true;
-        }
+                            // Ideally we'd go on to `skip_multi_line_comment_after_line_break` here
                            // but can't do that easily because can't use `return` in a closure.
                            // But irregular line breaks are rare anyway.
                        }
                        // Either way, continue searching.
                        // Skip 3 bytes (macro skips 1 already, so skip 2 here), and continue searching.
                        // SAFETY: `0xE2` is always 1st byte of a 3-byte UTF-8 char,
                        // so consuming 3 bytes will place `pos` on next UTF-8 char boundary.
                        unsafe { pos = pos.add(2) };
                        true
                    })
                } else {
                    // Regular line break.
                    // No need to look for more line breaks, so switch to faster search just for `*/`.
                    self.token.is_on_new_line = true;
                    // SAFETY: Regular line breaks are ASCII, so skipping 1 byte is a UTF-8 char boundary.
                    let after_line_break = unsafe { pos.add(1) };
                    return self.skip_multi_line_comment_after_line_break(after_line_break);
                }
            },
            handle_match: |_next_byte, _start| {
                self.trivia_builder.add_multi_line_comment(self.token.start, self.offset());
                Kind::Skip
            },
            handle_eof: |_start| {
                self.error(diagnostics::UnterminatedMultiLineComment(self.unterminated_range()));
                Kind::Eof
            },
        };
    }
    fn skip_multi_line_comment_after_line_break(&mut self, pos: SourcePosition) -> Kind {
--- a/crates/oxc_parser/src/lexer/search.rs
+++ b/crates/oxc_parser/src/lexer/search.rs
@ -328,40 +328,36 @@ pub(crate) use safe_byte_match_table;
 /// ```
 /// impl<'a> Lexer<'a> {
 ///   fn eat_stuff(&mut self) -> bool {
-///     // SAFETY: It is unsafe to use `continue_if`. See requirements below.
+///     byte_search! {
-///     unsafe {
+///       lexer: self,
-///       byte_search! {
+///       table: NOT_STUFF_TABLE,
-///         lexer: self,
+///       continue_if: |matched_byte, pos| {
-///         table: NOT_STUFF_TABLE,
+///         // Matching byte found. Decide whether it's really a match.
-///         continue_if: |matched_byte, pos| {
+///         // NB: `lexer.source` has NOT been updated at this point.
-///           // Matching byte found. Decide whether it's really a match.
+///         if matched_byte == 0xE2 {
-///           // NB: `lexer.source` has NOT been updated at this point.
+///           // Only match a specific Unicode char (in this case 0xE2, 0x80, 0xA8)
-///           // SAFETY: If return `true` to continue searching, must NOT alter `pos`.
+///           unsafe { pos.add(1).read() != 0x80 || pos.add(2).read() != 0xA8) }
-///           if matched_byte == 0xE2 {
+///         } else {
-///             // Only match a specific Unicode char (in this case 0xE2, 0x80, 0xA8)
+///           // All others do match. `handle_match` is executed.
 ///             unsafe { pos.add(1).read() != 0x80 || pos.add(2).read() != 0xA8) }
 ///           } else {
 ///             // All others do match. `handle_match` is executed.
 ///             false
 ///           }
 ///         },
 ///         handle_match: |matched_byte| {
 ///           // Matching byte has been found and `continue_if` returned `false` for it.
 ///           // `matched_byte` is `u8` value of first byte which matched the table.
 ///           // `lexer.source` is now positioned on first matching byte.
 ///           // Handle the next matching byte (deal with any special cases).
 ///           // Value this block evaluates to will be returned from enclosing function.
 ///           true
 ///         },
 ///         handle_eof: || {
 ///           // No bytes from start position to end of source matched the table.
 ///           // `lexer.source` is now positioned at EOF.
 ///           // Handle EOF in some way.
 ///           // Value this block evaluates to will be returned from enclosing function.
 ///           false
-///         },
+///         }
-///       };
+///       },
-///     }
+///       handle_match: |matched_byte| {
 ///         // Matching byte has been found and `continue_if` returned `false` for it.
 ///         // `matched_byte` is `u8` value of first byte which matched the table.
 ///         // `lexer.source` is now positioned on first matching byte.
 ///         // Handle the next matching byte (deal with any special cases).
 ///         // Value this block evaluates to will be returned from enclosing function.
 ///         true
 ///       },
 ///       handle_eof: || {
 ///         // No bytes from start position to end of source matched the table.
 ///         // `lexer.source` is now positioned at EOF.
 ///         // Handle EOF in some way.
 ///         // Value this block evaluates to will be returned from enclosing function.
 ///         false
 ///       },
 ///     };
 ///
 ///     // This is unreachable.
 ///     // Macro always exits current function with a `return` statement.
@ -414,12 +410,6 @@ macro_rules! byte_search {
        handle_match: |$match_byte:ident, $match_start:ident| $match_handler:expr,
        handle_eof: |$eof_start:ident| $eof_handler:expr,
    ) => {{
        // User has free access to change `$pos` in `continue_if`.
        // They must satisfy safety requirements explained above.
        #[inline]
        unsafe fn unsafe_noop() {}
        unsafe_noop();
        let start = $lexer.source.position();
        byte_search! {
            lexer: $lexer,
@ -458,12 +448,6 @@ macro_rules! byte_search {
        handle_match: |$match_byte:ident| $match_handler:expr,
        handle_eof: || $eof_handler:expr,
    ) => {{
        // User has free access to change `$pos` in `continue_if`.
        // They must satisfy safety requirements explained above.
        #[inline]
        unsafe fn unsafe_noop() {}
        unsafe_noop();
        byte_search! {
            lexer: $lexer,
            table: $table,