mirror of
https://github.com/danbulant/oxc
synced 2026-05-24 20:32:10 +00:00
refactor(parser): continue_if in byte_search macro not unsafe (#2440)
#2439 made using `continue_if` in `byte_search!` macro safe, as it no longer continues the main loop after a match, so no danger of reading out of bounds if `continue_if` code fast-forwards the current position. This follow-on PR removes the unsafe blocks, and uses that fast-forward ability in a couple of places.
This commit is contained in:
parent
a5a3c695f7
commit
a78303d5a6
2 changed files with 140 additions and 152 deletions
|
|
@ -24,59 +24,60 @@ static MULTILINE_COMMENT_START_TABLE: SafeByteMatchTable =
|
|||
impl<'a> Lexer<'a> {
|
||||
/// Section 12.4 Single Line Comment
|
||||
pub(super) fn skip_single_line_comment(&mut self) -> Kind {
|
||||
// SAFETY: Requirement not to alter `pos` if return `true` from `if_continue` is satisfied
|
||||
unsafe {
|
||||
byte_search! {
|
||||
lexer: self,
|
||||
table: LINE_BREAK_TABLE,
|
||||
continue_if: |next_byte, pos| {
|
||||
// Match found. Decide whether to continue searching.
|
||||
// If this is end of comment, create trivia, and advance `pos` to after line break.
|
||||
// Do that here rather than in `handle_match`, to avoid branching twice on value of
|
||||
// the matched byte.
|
||||
#[allow(clippy::if_not_else)]
|
||||
if next_byte != LS_OR_PS_FIRST {
|
||||
// `\r` or `\n`
|
||||
self.trivia_builder
|
||||
.add_single_line_comment(self.token.start, self.source.offset_of(pos));
|
||||
// SAFETY: Safe to consume `\r` or `\n` as both are ASCII
|
||||
pos = pos.add(1);
|
||||
// We've found the end. Do not continue searching.
|
||||
false
|
||||
} else {
|
||||
// `0xE2`. Could be first byte of LS/PS, or could be some other Unicode char.
|
||||
// Either way, Unicode is uncommon, so make this a cold branch.
|
||||
cold_branch(|| {
|
||||
// SAFETY: Next byte is `0xE2` which is always 1st byte of a 3-byte UTF-8 char.
|
||||
// So safe to advance `pos` by 1 and read 2 bytes.
|
||||
let next2 = pos.add(1).read2();
|
||||
if next2 == LS_BYTES_2_AND_3 || next2 == PS_BYTES_2_AND_3 {
|
||||
// Irregular line break
|
||||
self.trivia_builder
|
||||
.add_single_line_comment(self.token.start, self.source.offset_of(pos));
|
||||
// Advance `pos` to after this char.
|
||||
// SAFETY: `0xE2` is always 1st byte of a 3-byte UTF-8 char,
|
||||
// so consuming 3 bytes will place `pos` on next UTF-8 char boundary.
|
||||
pos = pos.add(3);
|
||||
// We've found the end. Do not continue searching.
|
||||
false
|
||||
} else {
|
||||
// Some other Unicode char beginning with `0xE2`. Continue searching.
|
||||
true
|
||||
}
|
||||
})
|
||||
}
|
||||
},
|
||||
handle_match: |_next_byte, _start| {
|
||||
self.token.is_on_new_line = true;
|
||||
Kind::Skip
|
||||
},
|
||||
handle_eof: |_start| {
|
||||
self.trivia_builder.add_single_line_comment(self.token.start, self.offset());
|
||||
Kind::Skip
|
||||
},
|
||||
};
|
||||
}
|
||||
byte_search! {
|
||||
lexer: self,
|
||||
table: LINE_BREAK_TABLE,
|
||||
continue_if: |next_byte, pos| {
|
||||
// Match found. Decide whether to continue searching.
|
||||
// If this is end of comment, create trivia, and advance `pos` to after line break.
|
||||
// Do that here rather than in `handle_match`, to avoid branching twice on value of
|
||||
// the matched byte.
|
||||
#[allow(clippy::if_not_else)]
|
||||
if next_byte != LS_OR_PS_FIRST {
|
||||
// `\r` or `\n`
|
||||
self.trivia_builder
|
||||
.add_single_line_comment(self.token.start, self.source.offset_of(pos));
|
||||
// SAFETY: Safe to consume `\r` or `\n` as both are ASCII
|
||||
unsafe { pos = pos.add(1) };
|
||||
// We've found the end. Do not continue searching.
|
||||
false
|
||||
} else {
|
||||
// `0xE2`. Could be first byte of LS/PS, or could be some other Unicode char.
|
||||
// Either way, Unicode is uncommon, so make this a cold branch.
|
||||
cold_branch(|| {
|
||||
// SAFETY: Next byte is `0xE2` which is always 1st byte of a 3-byte UTF-8 char.
|
||||
// So safe to advance `pos` by 1 and read 2 bytes.
|
||||
let next2 = unsafe { pos.add(1).read2() };
|
||||
if next2 == LS_BYTES_2_AND_3 || next2 == PS_BYTES_2_AND_3 {
|
||||
// Irregular line break
|
||||
self.trivia_builder
|
||||
.add_single_line_comment(self.token.start, self.source.offset_of(pos));
|
||||
// Advance `pos` to after this char.
|
||||
// SAFETY: `0xE2` is always 1st byte of a 3-byte UTF-8 char,
|
||||
// so consuming 3 bytes will place `pos` on next UTF-8 char boundary.
|
||||
unsafe { pos = pos.add(3) };
|
||||
// We've found the end. Do not continue searching.
|
||||
false
|
||||
} else {
|
||||
// Some other Unicode char beginning with `0xE2`.
|
||||
// Skip 3 bytes (macro skips 1 already, so skip 2 here), and continue searching.
|
||||
// SAFETY: `0xE2` is always 1st byte of a 3-byte UTF-8 char,
|
||||
// so consuming 3 bytes will place `pos` on next UTF-8 char boundary.
|
||||
unsafe { pos = pos.add(2) };
|
||||
true
|
||||
}
|
||||
})
|
||||
}
|
||||
},
|
||||
handle_match: |_next_byte, _start| {
|
||||
self.token.is_on_new_line = true;
|
||||
Kind::Skip
|
||||
},
|
||||
handle_eof: |_start| {
|
||||
self.trivia_builder.add_single_line_comment(self.token.start, self.offset());
|
||||
Kind::Skip
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
/// Section 12.4 Multi Line Comment
|
||||
|
|
@ -86,64 +87,67 @@ impl<'a> Lexer<'a> {
|
|||
return self.skip_multi_line_comment_after_line_break(self.source.position());
|
||||
}
|
||||
|
||||
// SAFETY: Requirement not to alter `pos` if return `true` from `if_continue` is satisfied
|
||||
unsafe {
|
||||
byte_search! {
|
||||
lexer: self,
|
||||
table: MULTILINE_COMMENT_START_TABLE,
|
||||
continue_if: |next_byte, pos| {
|
||||
// Match found. Decide whether to continue searching.
|
||||
if next_byte == b'*' {
|
||||
if pos.addr() < self.source.end_addr() - 1 {
|
||||
// If next byte isn't `/`, continue
|
||||
// SAFETY: Have checked there's at least 1 further byte to read
|
||||
if pos.add(1).read() == b'/' {
|
||||
// Consume `*/`
|
||||
// SAFETY: Consuming `*/` leaves `pos` on a UTF-8 char boundary
|
||||
pos = pos.add(2);
|
||||
false
|
||||
} else {
|
||||
true
|
||||
}
|
||||
byte_search! {
|
||||
lexer: self,
|
||||
table: MULTILINE_COMMENT_START_TABLE,
|
||||
continue_if: |next_byte, pos| {
|
||||
// Match found. Decide whether to continue searching.
|
||||
if next_byte == b'*' {
|
||||
if pos.addr() < self.source.end_addr() - 1 {
|
||||
// If next byte isn't `/`, continue
|
||||
// SAFETY: Have checked there's at least 1 further byte to read
|
||||
if unsafe { pos.add(1).read() } == b'/' {
|
||||
// Consume `*/`
|
||||
// SAFETY: Consuming `*/` leaves `pos` on a UTF-8 char boundary
|
||||
unsafe { pos = pos.add(2) };
|
||||
false
|
||||
} else {
|
||||
// This is last byte in file. Continue to `handle_eof`.
|
||||
// This is illegal in valid JS, so mark this branch cold.
|
||||
cold_branch(|| true)
|
||||
}
|
||||
} else if next_byte == LS_OR_PS_FIRST {
|
||||
// `0xE2`. Could be first byte of LS/PS, or could be some other Unicode char.
|
||||
// Either way, Unicode is uncommon, so make this a cold branch.
|
||||
cold_branch(|| {
|
||||
// SAFETY: Next byte is `0xE2` which is always 1st byte of a 3-byte UTF-8 char.
|
||||
// So safe to advance `pos` by 1 and read 2 bytes.
|
||||
let next2 = pos.add(1).read2();
|
||||
if next2 == LS_BYTES_2_AND_3 || next2 == PS_BYTES_2_AND_3 {
|
||||
// Irregular line break
|
||||
self.token.is_on_new_line = true;
|
||||
// Ideally we'd go on to `skip_multi_line_comment_after_line_break` here
|
||||
// but can't do that easily because can't use `return` in a closure.
|
||||
// But irregular line breaks are rare anyway.
|
||||
}
|
||||
// Either way, continue searching
|
||||
true
|
||||
})
|
||||
}
|
||||
} else {
|
||||
// Regular line break.
|
||||
// No need to look for more line breaks, so switch to faster search just for `*/`.
|
||||
self.token.is_on_new_line = true;
|
||||
return self.skip_multi_line_comment_after_line_break(pos.add(1));
|
||||
// This is last byte in file. Continue to `handle_eof`.
|
||||
// This is illegal in valid JS, so mark this branch cold.
|
||||
cold_branch(|| true)
|
||||
}
|
||||
},
|
||||
handle_match: |_next_byte, _start| {
|
||||
self.trivia_builder.add_multi_line_comment(self.token.start, self.offset());
|
||||
Kind::Skip
|
||||
},
|
||||
handle_eof: |_start| {
|
||||
self.error(diagnostics::UnterminatedMultiLineComment(self.unterminated_range()));
|
||||
Kind::Eof
|
||||
},
|
||||
};
|
||||
}
|
||||
} else if next_byte == LS_OR_PS_FIRST {
|
||||
// `0xE2`. Could be first byte of LS/PS, or could be some other Unicode char.
|
||||
// Either way, Unicode is uncommon, so make this a cold branch.
|
||||
cold_branch(|| {
|
||||
// SAFETY: Next byte is `0xE2` which is always 1st byte of a 3-byte UTF-8 char.
|
||||
// So safe to advance `pos` by 1 and read 2 bytes.
|
||||
let next2 = unsafe { pos.add(1).read2() };
|
||||
if next2 == LS_BYTES_2_AND_3 || next2 == PS_BYTES_2_AND_3 {
|
||||
// Irregular line break
|
||||
self.token.is_on_new_line = true;
|
||||
// Ideally we'd go on to `skip_multi_line_comment_after_line_break` here
|
||||
// but can't do that easily because can't use `return` in a closure.
|
||||
// But irregular line breaks are rare anyway.
|
||||
}
|
||||
// Either way, continue searching.
|
||||
// Skip 3 bytes (macro skips 1 already, so skip 2 here), and continue searching.
|
||||
// SAFETY: `0xE2` is always 1st byte of a 3-byte UTF-8 char,
|
||||
// so consuming 3 bytes will place `pos` on next UTF-8 char boundary.
|
||||
unsafe { pos = pos.add(2) };
|
||||
true
|
||||
})
|
||||
} else {
|
||||
// Regular line break.
|
||||
// No need to look for more line breaks, so switch to faster search just for `*/`.
|
||||
self.token.is_on_new_line = true;
|
||||
// SAFETY: Regular line breaks are ASCII, so skipping 1 byte is a UTF-8 char boundary.
|
||||
let after_line_break = unsafe { pos.add(1) };
|
||||
return self.skip_multi_line_comment_after_line_break(after_line_break);
|
||||
}
|
||||
},
|
||||
handle_match: |_next_byte, _start| {
|
||||
self.trivia_builder.add_multi_line_comment(self.token.start, self.offset());
|
||||
Kind::Skip
|
||||
},
|
||||
handle_eof: |_start| {
|
||||
self.error(diagnostics::UnterminatedMultiLineComment(self.unterminated_range()));
|
||||
Kind::Eof
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
fn skip_multi_line_comment_after_line_break(&mut self, pos: SourcePosition) -> Kind {
|
||||
|
|
|
|||
|
|
@ -328,40 +328,36 @@ pub(crate) use safe_byte_match_table;
|
|||
/// ```
|
||||
/// impl<'a> Lexer<'a> {
|
||||
/// fn eat_stuff(&mut self) -> bool {
|
||||
/// // SAFETY: It is unsafe to use `continue_if`. See requirements below.
|
||||
/// unsafe {
|
||||
/// byte_search! {
|
||||
/// lexer: self,
|
||||
/// table: NOT_STUFF_TABLE,
|
||||
/// continue_if: |matched_byte, pos| {
|
||||
/// // Matching byte found. Decide whether it's really a match.
|
||||
/// // NB: `lexer.source` has NOT been updated at this point.
|
||||
/// // SAFETY: If return `true` to continue searching, must NOT alter `pos`.
|
||||
/// if matched_byte == 0xE2 {
|
||||
/// // Only match a specific Unicode char (in this case 0xE2, 0x80, 0xA8)
|
||||
/// unsafe { pos.add(1).read() != 0x80 || pos.add(2).read() != 0xA8) }
|
||||
/// } else {
|
||||
/// // All others do match. `handle_match` is executed.
|
||||
/// false
|
||||
/// }
|
||||
/// },
|
||||
/// handle_match: |matched_byte| {
|
||||
/// // Matching byte has been found and `continue_if` returned `false` for it.
|
||||
/// // `matched_byte` is `u8` value of first byte which matched the table.
|
||||
/// // `lexer.source` is now positioned on first matching byte.
|
||||
/// // Handle the next matching byte (deal with any special cases).
|
||||
/// // Value this block evaluates to will be returned from enclosing function.
|
||||
/// true
|
||||
/// },
|
||||
/// handle_eof: || {
|
||||
/// // No bytes from start position to end of source matched the table.
|
||||
/// // `lexer.source` is now positioned at EOF.
|
||||
/// // Handle EOF in some way.
|
||||
/// // Value this block evaluates to will be returned from enclosing function.
|
||||
/// byte_search! {
|
||||
/// lexer: self,
|
||||
/// table: NOT_STUFF_TABLE,
|
||||
/// continue_if: |matched_byte, pos| {
|
||||
/// // Matching byte found. Decide whether it's really a match.
|
||||
/// // NB: `lexer.source` has NOT been updated at this point.
|
||||
/// if matched_byte == 0xE2 {
|
||||
/// // Only match a specific Unicode char (in this case 0xE2, 0x80, 0xA8)
|
||||
/// unsafe { pos.add(1).read() != 0x80 || pos.add(2).read() != 0xA8) }
|
||||
/// } else {
|
||||
/// // All others do match. `handle_match` is executed.
|
||||
/// false
|
||||
/// },
|
||||
/// };
|
||||
/// }
|
||||
/// }
|
||||
/// },
|
||||
/// handle_match: |matched_byte| {
|
||||
/// // Matching byte has been found and `continue_if` returned `false` for it.
|
||||
/// // `matched_byte` is `u8` value of first byte which matched the table.
|
||||
/// // `lexer.source` is now positioned on first matching byte.
|
||||
/// // Handle the next matching byte (deal with any special cases).
|
||||
/// // Value this block evaluates to will be returned from enclosing function.
|
||||
/// true
|
||||
/// },
|
||||
/// handle_eof: || {
|
||||
/// // No bytes from start position to end of source matched the table.
|
||||
/// // `lexer.source` is now positioned at EOF.
|
||||
/// // Handle EOF in some way.
|
||||
/// // Value this block evaluates to will be returned from enclosing function.
|
||||
/// false
|
||||
/// },
|
||||
/// };
|
||||
///
|
||||
/// // This is unreachable.
|
||||
/// // Macro always exits current function with a `return` statement.
|
||||
|
|
@ -414,12 +410,6 @@ macro_rules! byte_search {
|
|||
handle_match: |$match_byte:ident, $match_start:ident| $match_handler:expr,
|
||||
handle_eof: |$eof_start:ident| $eof_handler:expr,
|
||||
) => {{
|
||||
// User has free access to change `$pos` in `continue_if`.
|
||||
// They must satisfy safety requirements explained above.
|
||||
#[inline]
|
||||
unsafe fn unsafe_noop() {}
|
||||
unsafe_noop();
|
||||
|
||||
let start = $lexer.source.position();
|
||||
byte_search! {
|
||||
lexer: $lexer,
|
||||
|
|
@ -458,12 +448,6 @@ macro_rules! byte_search {
|
|||
handle_match: |$match_byte:ident| $match_handler:expr,
|
||||
handle_eof: || $eof_handler:expr,
|
||||
) => {{
|
||||
// User has free access to change `$pos` in `continue_if`.
|
||||
// They must satisfy safety requirements explained above.
|
||||
#[inline]
|
||||
unsafe fn unsafe_noop() {}
|
||||
unsafe_noop();
|
||||
|
||||
byte_search! {
|
||||
lexer: $lexer,
|
||||
table: $table,
|
||||
|
|
|
|||
Loading…
Reference in a new issue