From 996a9d27ebf16e6d48e40a543449a9be490e66a1 Mon Sep 17 00:00:00 2001 From: overlookmotel Date: Tue, 20 Feb 2024 02:39:52 +0000 Subject: [PATCH] perf(parser): `byte_search` macro always unroll main loop (#2439) Refactor `byte_search!` macro to move logic out of the main loop. This ensures the compiler unrolls the loop. This speeds up lexing single-line comments by 20%-25% on the benchmarks which contain enough comments for the change to register. Presumably the loop wasn't unrolled previously. The code required to do this is a little odd. It adds an extra `loop {}` which always exits on the first turn (so not really a useful loop), but is required to be able to use `break` to exit that "loop", making 2 different paths for (1) matching byte found and (2) `for` loop completed without finding any match. This is only way I could find to produce this behavior without using a macro. Is there a more "normal" way to get the same logic? --- crates/oxc_parser/src/lexer/search.rs | 71 ++++++++++++++++----------- 1 file changed, 41 insertions(+), 30 deletions(-) diff --git a/crates/oxc_parser/src/lexer/search.rs b/crates/oxc_parser/src/lexer/search.rs index 49e6cb335..e7c5e894d 100644 --- a/crates/oxc_parser/src/lexer/search.rs +++ b/crates/oxc_parser/src/lexer/search.rs @@ -495,45 +495,56 @@ macro_rules! byte_search { let mut $pos = $start; #[allow(unused_unsafe)] // Silence warnings if macro called in unsafe code - loop { + 'outer: loop { + #[allow(clippy::redundant_else)] if $pos.addr() <= $lexer.source.end_for_batch_search_addr() { // Search a batch of `SEARCH_BATCH_SIZE` bytes. - // The compiler unrolls this loop. + // + // `'inner: loop {}` is not a real loop - it always exits on first turn. + // Only using `loop {}` so that can use `break 'inner` to get out of it. + // This allows complex logic of `$should_continue` and `$match_handler` to be + // outside the `for` loop, keeping it as minimal as possible, to encourage + // compiler to unroll it. + // // SAFETY: - // `$pos.addr() > lexer.source.end_for_batch_search_addr()` check above ensures there are - // at least `SEARCH_BATCH_SIZE` bytes remaining in `lexer.source`. + // `$pos.addr() <= lexer.source.end_for_batch_search_addr()` check above ensures + // there are at least `SEARCH_BATCH_SIZE` bytes remaining in `lexer.source`. // So calls to `$pos.read()` and `$pos.add(1)` in this loop cannot go out of bounds. - for _i in 0..crate::lexer::search::SEARCH_BATCH_SIZE { - // SAFETY: `$pos` cannot go out of bounds in this loop (see above). - let $match_byte = unsafe { $pos.read() }; - if $table.matches($match_byte) { - // Found match. - // Check if should continue. - { - let $continue_byte = $match_byte; - if $should_continue { - // Not a match after all - continue searching. - // SAFETY: `pos` is not at end of source, so safe to advance 1 byte. - // See above about UTF-8 character boundaries invariant. - $pos = unsafe { $pos.add(1) }; - continue; - } + let $match_byte = 'inner: loop { + for _i in 0..crate::lexer::search::SEARCH_BATCH_SIZE { + // SAFETY: `$pos` cannot go out of bounds in this loop (see above) + let byte = unsafe { $pos.read() }; + if $table.matches(byte) { + break 'inner byte; } - // Advance `lexer.source`'s position up to `$pos`, consuming unmatched bytes. - // SAFETY: See above about UTF-8 character boundaries invariant. - $lexer.source.set_position($pos); - - let $match_start = $start; - return $match_handler; + // No match - continue searching batch. + // SAFETY: `$pos` cannot go out of bounds in this loop (see above). + // Also see above about UTF-8 character boundaries invariant. + $pos = unsafe { $pos.add(1) }; } + // No match in batch - search next batch + continue 'outer; + }; - // No match - continue searching - // SAFETY: `$pos` cannot go out of bounds in this loop (see above). - // Also see above about UTF-8 character boundaries invariant. - $pos = unsafe { $pos.add(1) }; + // Found match. Check if should continue. + { + let $continue_byte = $match_byte; + if $should_continue { + // Not a match after all - continue searching. + // SAFETY: `pos` is not at end of source, so safe to advance 1 byte. + // See above about UTF-8 character boundaries invariant. + $pos = unsafe { $pos.add(1) }; + continue; + } } - // No match in batch - loop round and searching next batch + + // Advance `lexer.source`'s position up to `$pos`, consuming unmatched bytes. + // SAFETY: See above about UTF-8 character boundaries invariant. + $lexer.source.set_position($pos); + + let $match_start = $start; + return $match_handler; } else { // Not enough bytes remaining to process as a batch. // This branch marked `#[cold]` as should be very uncommon in normal-length JS files.