mirror of
https://github.com/danbulant/oxc
synced 2026-05-24 12:21:58 +00:00
refactor(parser): extend byte_search macro (#2372)
Preparatory step for #2374.
This commit is contained in:
parent
0be8397c77
commit
79ae9a9b2c
1 changed files with 139 additions and 15 deletions
|
|
@ -323,6 +323,52 @@ pub(crate) use safe_byte_match_table;
|
||||||
/// }
|
/// }
|
||||||
/// ```
|
/// ```
|
||||||
///
|
///
|
||||||
|
/// Can also add a block to decide whether to continue searching for some matches:
|
||||||
|
///
|
||||||
|
/// ```
|
||||||
|
/// impl<'a> Lexer<'a> {
|
||||||
|
/// fn eat_stuff(&mut self) -> bool {
|
||||||
|
/// // SAFETY: It is unsafe to use `continue_if`. See requirements below.
|
||||||
|
/// unsafe {
|
||||||
|
/// byte_search! {
|
||||||
|
/// lexer: self,
|
||||||
|
/// table: NOT_STUFF_TABLE,
|
||||||
|
/// continue_if: |matched_byte, pos| {
|
||||||
|
/// // Matching byte found. Decide whether it's really a match.
|
||||||
|
/// // NB: `lexer.source` has NOT been updated at this point.
|
||||||
|
/// // SAFETY: If this block returns `true` to continue searching, it must NOT alter `pos`.
|
||||||
|
/// if matched_byte == 0xE2 {
|
||||||
|
/// // Only match a specific Unicode char (in this case 0xE2, 0x80, 0xA8)
|
||||||
|
/// unsafe { pos.add(1).read() != 0x80 || pos.add(2).read() != 0xA8 }
|
||||||
|
/// } else {
|
||||||
|
/// // All others do match. `handle_match` is executed.
|
||||||
|
/// false
|
||||||
|
/// }
|
||||||
|
/// },
|
||||||
|
/// handle_match: |matched_byte| {
|
||||||
|
/// // Matching byte has been found and `continue_if` returned `false` for it.
|
||||||
|
/// // `matched_byte` is `u8` value of first byte which matched the table.
|
||||||
|
/// // `lexer.source` is now positioned on first matching byte.
|
||||||
|
/// // Handle the next matching byte (deal with any special cases).
|
||||||
|
/// // Value this block evaluates to will be returned from enclosing function.
|
||||||
|
/// true
|
||||||
|
/// },
|
||||||
|
/// handle_eof: || {
|
||||||
|
/// // No bytes from start position to end of source matched the table.
|
||||||
|
/// // `lexer.source` is now positioned at EOF.
|
||||||
|
/// // Handle EOF in some way.
|
||||||
|
/// // Value this block evaluates to will be returned from enclosing function.
|
||||||
|
/// false
|
||||||
|
/// },
|
||||||
|
/// };
|
||||||
|
/// }
|
||||||
|
///
|
||||||
|
/// // This is unreachable.
|
||||||
|
/// // Macro always exits current function with a `return` statement.
|
||||||
|
/// }
|
||||||
|
/// }
|
||||||
|
/// ```
|
||||||
|
///
|
||||||
/// NB: The macro always causes enclosing function to return.
|
/// NB: The macro always causes enclosing function to return.
|
||||||
/// It creates `return` statements with the value that `handle_match` / `handle_eof` blocks evaluate to.
|
/// It creates `return` statements with the value that `handle_match` / `handle_eof` blocks evaluate to.
|
||||||
/// Code after the `byte_search!` macro invocation is unreachable.
|
/// Code after the `byte_search!` macro invocation is unreachable.
|
||||||
|
|
@ -353,6 +399,33 @@ macro_rules! byte_search {
|
||||||
lexer: $lexer,
|
lexer: $lexer,
|
||||||
table: $table,
|
table: $table,
|
||||||
start: start,
|
start: start,
|
||||||
|
continue_if: |__byte, pos| false,
|
||||||
|
handle_match: |$match_byte, $match_start| $match_handler,
|
||||||
|
handle_eof: |$eof_start| $eof_handler,
|
||||||
|
}
|
||||||
|
}};
|
||||||
|
|
||||||
|
// Standard version with `continue_if`.
|
||||||
|
// `start` is calculated from current position of `lexer.source`.
|
||||||
|
(
|
||||||
|
lexer: $lexer:ident,
|
||||||
|
table: $table:ident,
|
||||||
|
continue_if: |$continue_byte:ident, $pos:ident| $should_continue:expr,
|
||||||
|
handle_match: |$match_byte:ident, $match_start:ident| $match_handler:expr,
|
||||||
|
handle_eof: |$eof_start:ident| $eof_handler:expr,
|
||||||
|
) => {{
|
||||||
|
// User has free access to change `$pos` in `continue_if`.
|
||||||
|
// They must satisfy safety requirements explained above.
|
||||||
|
#[inline]
|
||||||
|
unsafe fn unsafe_noop() {}
|
||||||
|
unsafe_noop();
|
||||||
|
|
||||||
|
let start = $lexer.source.position();
|
||||||
|
byte_search! {
|
||||||
|
lexer: $lexer,
|
||||||
|
table: $table,
|
||||||
|
start: start,
|
||||||
|
continue_if: |$continue_byte, $pos| $should_continue,
|
||||||
handle_match: |$match_byte, $match_start| $match_handler,
|
handle_match: |$match_byte, $match_start| $match_handler,
|
||||||
handle_eof: |$eof_start| $eof_handler,
|
handle_eof: |$eof_start| $eof_handler,
|
||||||
}
|
}
|
||||||
|
|
@ -370,16 +443,43 @@ macro_rules! byte_search {
|
||||||
lexer: $lexer,
|
lexer: $lexer,
|
||||||
table: $table,
|
table: $table,
|
||||||
start: $start,
|
start: $start,
|
||||||
|
continue_if: |__byte, pos| false,
|
||||||
handle_match: |$match_byte, __start| $match_handler,
|
handle_match: |$match_byte, __start| $match_handler,
|
||||||
handle_eof: |__start| $eof_handler,
|
handle_eof: |__start| $eof_handler,
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Provide your own `start` position, and `continue_if`
|
||||||
|
(
|
||||||
|
lexer: $lexer:ident,
|
||||||
|
table: $table:ident,
|
||||||
|
start: $start:ident,
|
||||||
|
continue_if: |$continue_byte:ident, $pos:ident| $should_continue:expr,
|
||||||
|
handle_match: |$match_byte:ident| $match_handler:expr,
|
||||||
|
handle_eof: || $eof_handler:expr,
|
||||||
|
) => {{
|
||||||
|
// User has free access to change `$pos` in `continue_if`.
|
||||||
|
// They must satisfy safety requirements explained above.
|
||||||
|
#[inline]
|
||||||
|
unsafe fn unsafe_noop() {}
|
||||||
|
unsafe_noop();
|
||||||
|
|
||||||
|
byte_search! {
|
||||||
|
lexer: $lexer,
|
||||||
|
table: $table,
|
||||||
|
start: $start,
|
||||||
|
continue_if: |$continue_byte, $pos| $should_continue,
|
||||||
|
handle_match: |$match_byte, __start| $match_handler,
|
||||||
|
handle_eof: |__start| $eof_handler,
|
||||||
|
}
|
||||||
|
}};
|
||||||
|
|
||||||
// Actual implementation
|
// Actual implementation
|
||||||
(
|
(
|
||||||
lexer: $lexer:ident,
|
lexer: $lexer:ident,
|
||||||
table: $table:ident,
|
table: $table:ident,
|
||||||
start: $start:ident,
|
start: $start:ident,
|
||||||
|
continue_if: |$continue_byte:ident, $pos:ident| $should_continue:expr,
|
||||||
handle_match: |$match_byte:ident, $match_start:ident| $match_handler:expr,
|
handle_match: |$match_byte:ident, $match_start:ident| $match_handler:expr,
|
||||||
handle_eof: |$eof_start:ident| $eof_handler:expr,
|
handle_eof: |$eof_start:ident| $eof_handler:expr,
|
||||||
) => {{
|
) => {{
|
||||||
|
|
@ -393,33 +493,45 @@ macro_rules! byte_search {
|
||||||
// to unsafe functions in this function with `unsafe {}`.
|
// to unsafe functions in this function with `unsafe {}`.
|
||||||
$table.use_table();
|
$table.use_table();
|
||||||
|
|
||||||
let mut pos = $start;
|
let mut $pos = $start;
|
||||||
#[allow(unused_unsafe)] // Silence warnings if macro called in unsafe code
|
#[allow(unused_unsafe)] // Silence warnings if macro called in unsafe code
|
||||||
loop {
|
loop {
|
||||||
if pos.addr() <= $lexer.source.end_for_batch_search_addr() {
|
if $pos.addr() <= $lexer.source.end_for_batch_search_addr() {
|
||||||
// Search a batch of `SEARCH_BATCH_SIZE` bytes.
|
// Search a batch of `SEARCH_BATCH_SIZE` bytes.
|
||||||
// The compiler unrolls this loop.
|
// The compiler unrolls this loop.
|
||||||
// SAFETY:
|
// SAFETY:
|
||||||
// `pos.addr() <= lexer.source.end_for_batch_search_addr()` check above ensures there are
|
// `$pos.addr() <= lexer.source.end_for_batch_search_addr()` check above ensures there are
|
||||||
// at least `SEARCH_BATCH_SIZE` bytes remaining in `lexer.source`.
|
// at least `SEARCH_BATCH_SIZE` bytes remaining in `lexer.source`.
|
||||||
// So calls to `pos.read()` and `pos.add(1)` in this loop cannot go out of bounds.
|
// So calls to `$pos.read()` and `$pos.add(1)` in this loop cannot go out of bounds.
|
||||||
for _i in 0..crate::lexer::search::SEARCH_BATCH_SIZE {
|
for _i in 0..crate::lexer::search::SEARCH_BATCH_SIZE {
|
||||||
// SAFETY: `pos` cannot go out of bounds in this loop (see above).
|
// SAFETY: `$pos` cannot go out of bounds in this loop (see above).
|
||||||
let $match_byte = unsafe { pos.read() };
|
let $match_byte = unsafe { $pos.read() };
|
||||||
if $table.matches($match_byte) {
|
if $table.matches($match_byte) {
|
||||||
// Found match.
|
// Found match.
|
||||||
// Advance `lexer.source`'s position up to `pos`, consuming unmatched bytes.
|
// Check if should continue.
|
||||||
|
{
|
||||||
|
let $continue_byte = $match_byte;
|
||||||
|
if $should_continue {
|
||||||
|
// Not a match after all - continue searching.
|
||||||
|
// SAFETY: `pos` is not at end of source, so safe to advance 1 byte.
|
||||||
|
// See above about UTF-8 character boundaries invariant.
|
||||||
|
$pos = unsafe { $pos.add(1) };
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Advance `lexer.source`'s position up to `$pos`, consuming unmatched bytes.
|
||||||
// SAFETY: See above about UTF-8 character boundaries invariant.
|
// SAFETY: See above about UTF-8 character boundaries invariant.
|
||||||
$lexer.source.set_position(pos);
|
$lexer.source.set_position($pos);
|
||||||
|
|
||||||
let $match_start = $start;
|
let $match_start = $start;
|
||||||
return $match_handler;
|
return $match_handler;
|
||||||
}
|
}
|
||||||
|
|
||||||
// No match - continue searching
|
// No match - continue searching
|
||||||
// SAFETY: `pos` cannot go out of bounds in this loop (see above).
|
// SAFETY: `$pos` cannot go out of bounds in this loop (see above).
|
||||||
// Also see above about UTF-8 character boundaries invariant.
|
// Also see above about UTF-8 character boundaries invariant.
|
||||||
pos = unsafe { pos.add(1) };
|
$pos = unsafe { $pos.add(1) };
|
||||||
}
|
}
|
||||||
// No match in batch - loop round and search next batch
|
// No match in batch - loop round and search next batch
|
||||||
} else {
|
} else {
|
||||||
|
|
@ -430,14 +542,26 @@ macro_rules! byte_search {
|
||||||
// to remove that problem.
|
// to remove that problem.
|
||||||
return crate::lexer::cold_branch(|| {
|
return crate::lexer::cold_branch(|| {
|
||||||
let end_addr = $lexer.source.end_addr();
|
let end_addr = $lexer.source.end_addr();
|
||||||
while pos.addr() < end_addr {
|
while $pos.addr() < end_addr {
|
||||||
// SAFETY: `pos` is not at end of source, so safe to read a byte
|
// SAFETY: `pos` is not at end of source, so safe to read a byte
|
||||||
let $match_byte = unsafe { pos.read() };
|
let $match_byte = unsafe { $pos.read() };
|
||||||
if $table.matches($match_byte) {
|
if $table.matches($match_byte) {
|
||||||
// Found match.
|
// Found match.
|
||||||
|
// Check if should continue.
|
||||||
|
{
|
||||||
|
let $continue_byte = $match_byte;
|
||||||
|
if $should_continue {
|
||||||
|
// Not a match after all - continue searching.
|
||||||
|
// SAFETY: `pos` is not at end of source, so safe to advance 1 byte.
|
||||||
|
// See above about UTF-8 character boundaries invariant.
|
||||||
|
$pos = unsafe { $pos.add(1) };
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Advance `lexer.source`'s position up to `pos`, consuming unmatched bytes.
|
// Advance `lexer.source`'s position up to `pos`, consuming unmatched bytes.
|
||||||
// SAFETY: See above about UTF-8 character boundaries invariant.
|
// SAFETY: See above about UTF-8 character boundaries invariant.
|
||||||
$lexer.source.set_position(pos);
|
$lexer.source.set_position($pos);
|
||||||
|
|
||||||
let $match_start = $start;
|
let $match_start = $start;
|
||||||
return $match_handler;
|
return $match_handler;
|
||||||
|
|
@ -446,12 +570,12 @@ macro_rules! byte_search {
|
||||||
// No match - continue searching
|
// No match - continue searching
|
||||||
// SAFETY: `pos` is not at end of source, so safe to advance 1 byte.
|
// SAFETY: `pos` is not at end of source, so safe to advance 1 byte.
|
||||||
// See above about UTF-8 character boundaries invariant.
|
// See above about UTF-8 character boundaries invariant.
|
||||||
pos = unsafe { pos.add(1) };
|
$pos = unsafe { $pos.add(1) };
|
||||||
}
|
}
|
||||||
|
|
||||||
// EOF.
|
// EOF.
|
||||||
// Advance `lexer.source`'s position to end of file.
|
// Advance `lexer.source`'s position to end of file.
|
||||||
$lexer.source.set_position(pos);
|
$lexer.source.set_position($pos);
|
||||||
|
|
||||||
let $eof_start = $start;
|
let $eof_start = $start;
|
||||||
$eof_handler
|
$eof_handler
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue