refactor(parser): continue_if in byte_search macro not unsafe (#2440)

#2439 made using `continue_if` in `byte_search!` macro safe, as it no longer continues the main loop after a match, so no danger of reading out of bounds if `continue_if` code fast-forwards the current position.

This follow-on PR removes the unsafe blocks, and uses that fast-forward ability in a couple of places.
This commit is contained in:
overlookmotel 2024-02-20 02:45:31 +00:00 committed by GitHub
parent a5a3c695f7
commit a78303d5a6
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 140 additions and 152 deletions

View file

@ -24,59 +24,60 @@ static MULTILINE_COMMENT_START_TABLE: SafeByteMatchTable =
impl<'a> Lexer<'a> { impl<'a> Lexer<'a> {
/// Section 12.4 Single Line Comment /// Section 12.4 Single Line Comment
pub(super) fn skip_single_line_comment(&mut self) -> Kind { pub(super) fn skip_single_line_comment(&mut self) -> Kind {
// SAFETY: Requirement not to alter `pos` if return `true` from `if_continue` is satisfied byte_search! {
unsafe { lexer: self,
byte_search! { table: LINE_BREAK_TABLE,
lexer: self, continue_if: |next_byte, pos| {
table: LINE_BREAK_TABLE, // Match found. Decide whether to continue searching.
continue_if: |next_byte, pos| { // If this is end of comment, create trivia, and advance `pos` to after line break.
// Match found. Decide whether to continue searching. // Do that here rather than in `handle_match`, to avoid branching twice on value of
// If this is end of comment, create trivia, and advance `pos` to after line break. // the matched byte.
// Do that here rather than in `handle_match`, to avoid branching twice on value of #[allow(clippy::if_not_else)]
// the matched byte. if next_byte != LS_OR_PS_FIRST {
#[allow(clippy::if_not_else)] // `\r` or `\n`
if next_byte != LS_OR_PS_FIRST { self.trivia_builder
// `\r` or `\n` .add_single_line_comment(self.token.start, self.source.offset_of(pos));
self.trivia_builder // SAFETY: Safe to consume `\r` or `\n` as both are ASCII
.add_single_line_comment(self.token.start, self.source.offset_of(pos)); unsafe { pos = pos.add(1) };
// SAFETY: Safe to consume `\r` or `\n` as both are ASCII // We've found the end. Do not continue searching.
pos = pos.add(1); false
// We've found the end. Do not continue searching. } else {
false // `0xE2`. Could be first byte of LS/PS, or could be some other Unicode char.
} else { // Either way, Unicode is uncommon, so make this a cold branch.
// `0xE2`. Could be first byte of LS/PS, or could be some other Unicode char. cold_branch(|| {
// Either way, Unicode is uncommon, so make this a cold branch. // SAFETY: Next byte is `0xE2` which is always 1st byte of a 3-byte UTF-8 char.
cold_branch(|| { // So safe to advance `pos` by 1 and read 2 bytes.
// SAFETY: Next byte is `0xE2` which is always 1st byte of a 3-byte UTF-8 char. let next2 = unsafe { pos.add(1).read2() };
// So safe to advance `pos` by 1 and read 2 bytes. if next2 == LS_BYTES_2_AND_3 || next2 == PS_BYTES_2_AND_3 {
let next2 = pos.add(1).read2(); // Irregular line break
if next2 == LS_BYTES_2_AND_3 || next2 == PS_BYTES_2_AND_3 { self.trivia_builder
// Irregular line break .add_single_line_comment(self.token.start, self.source.offset_of(pos));
self.trivia_builder // Advance `pos` to after this char.
.add_single_line_comment(self.token.start, self.source.offset_of(pos)); // SAFETY: `0xE2` is always 1st byte of a 3-byte UTF-8 char,
// Advance `pos` to after this char. // so consuming 3 bytes will place `pos` on next UTF-8 char boundary.
// SAFETY: `0xE2` is always 1st byte of a 3-byte UTF-8 char, unsafe { pos = pos.add(3) };
// so consuming 3 bytes will place `pos` on next UTF-8 char boundary. // We've found the end. Do not continue searching.
pos = pos.add(3); false
// We've found the end. Do not continue searching. } else {
false // Some other Unicode char beginning with `0xE2`.
} else { // Skip 3 bytes (macro skips 1 already, so skip 2 here), and continue searching.
// Some other Unicode char beginning with `0xE2`. Continue searching. // SAFETY: `0xE2` is always 1st byte of a 3-byte UTF-8 char,
true // so consuming 3 bytes will place `pos` on next UTF-8 char boundary.
} unsafe { pos = pos.add(2) };
}) true
} }
}, })
handle_match: |_next_byte, _start| { }
self.token.is_on_new_line = true; },
Kind::Skip handle_match: |_next_byte, _start| {
}, self.token.is_on_new_line = true;
handle_eof: |_start| { Kind::Skip
self.trivia_builder.add_single_line_comment(self.token.start, self.offset()); },
Kind::Skip handle_eof: |_start| {
}, self.trivia_builder.add_single_line_comment(self.token.start, self.offset());
}; Kind::Skip
} },
};
} }
/// Section 12.4 Multi Line Comment /// Section 12.4 Multi Line Comment
@ -86,64 +87,67 @@ impl<'a> Lexer<'a> {
return self.skip_multi_line_comment_after_line_break(self.source.position()); return self.skip_multi_line_comment_after_line_break(self.source.position());
} }
// SAFETY: Requirement not to alter `pos` if return `true` from `if_continue` is satisfied byte_search! {
unsafe { lexer: self,
byte_search! { table: MULTILINE_COMMENT_START_TABLE,
lexer: self, continue_if: |next_byte, pos| {
table: MULTILINE_COMMENT_START_TABLE, // Match found. Decide whether to continue searching.
continue_if: |next_byte, pos| { if next_byte == b'*' {
// Match found. Decide whether to continue searching. if pos.addr() < self.source.end_addr() - 1 {
if next_byte == b'*' { // If next byte isn't `/`, continue
if pos.addr() < self.source.end_addr() - 1 { // SAFETY: Have checked there's at least 1 further byte to read
// If next byte isn't `/`, continue if unsafe { pos.add(1).read() } == b'/' {
// SAFETY: Have checked there's at least 1 further byte to read // Consume `*/`
if pos.add(1).read() == b'/' { // SAFETY: Consuming `*/` leaves `pos` on a UTF-8 char boundary
// Consume `*/` unsafe { pos = pos.add(2) };
// SAFETY: Consuming `*/` leaves `pos` on a UTF-8 char boundary false
pos = pos.add(2);
false
} else {
true
}
} else { } else {
// This is last byte in file. Continue to `handle_eof`.
// This is illegal in valid JS, so mark this branch cold.
cold_branch(|| true)
}
} else if next_byte == LS_OR_PS_FIRST {
// `0xE2`. Could be first byte of LS/PS, or could be some other Unicode char.
// Either way, Unicode is uncommon, so make this a cold branch.
cold_branch(|| {
// SAFETY: Next byte is `0xE2` which is always 1st byte of a 3-byte UTF-8 char.
// So safe to advance `pos` by 1 and read 2 bytes.
let next2 = pos.add(1).read2();
if next2 == LS_BYTES_2_AND_3 || next2 == PS_BYTES_2_AND_3 {
// Irregular line break
self.token.is_on_new_line = true;
// Ideally we'd go on to `skip_multi_line_comment_after_line_break` here
// but can't do that easily because can't use `return` in a closure.
// But irregular line breaks are rare anyway.
}
// Either way, continue searching
true true
}) }
} else { } else {
// Regular line break. // This is last byte in file. Continue to `handle_eof`.
// No need to look for more line breaks, so switch to faster search just for `*/`. // This is illegal in valid JS, so mark this branch cold.
self.token.is_on_new_line = true; cold_branch(|| true)
return self.skip_multi_line_comment_after_line_break(pos.add(1));
} }
}, } else if next_byte == LS_OR_PS_FIRST {
handle_match: |_next_byte, _start| { // `0xE2`. Could be first byte of LS/PS, or could be some other Unicode char.
self.trivia_builder.add_multi_line_comment(self.token.start, self.offset()); // Either way, Unicode is uncommon, so make this a cold branch.
Kind::Skip cold_branch(|| {
}, // SAFETY: Next byte is `0xE2` which is always 1st byte of a 3-byte UTF-8 char.
handle_eof: |_start| { // So safe to advance `pos` by 1 and read 2 bytes.
self.error(diagnostics::UnterminatedMultiLineComment(self.unterminated_range())); let next2 = unsafe { pos.add(1).read2() };
Kind::Eof if next2 == LS_BYTES_2_AND_3 || next2 == PS_BYTES_2_AND_3 {
}, // Irregular line break
}; self.token.is_on_new_line = true;
} // Ideally we'd go on to `skip_multi_line_comment_after_line_break` here
// but can't do that easily because can't use `return` in a closure.
// But irregular line breaks are rare anyway.
}
// Either way, continue searching.
// Skip 3 bytes (macro skips 1 already, so skip 2 here), and continue searching.
// SAFETY: `0xE2` is always 1st byte of a 3-byte UTF-8 char,
// so consuming 3 bytes will place `pos` on next UTF-8 char boundary.
unsafe { pos = pos.add(2) };
true
})
} else {
// Regular line break.
// No need to look for more line breaks, so switch to faster search just for `*/`.
self.token.is_on_new_line = true;
// SAFETY: Regular line breaks are ASCII, so skipping 1 byte is a UTF-8 char boundary.
let after_line_break = unsafe { pos.add(1) };
return self.skip_multi_line_comment_after_line_break(after_line_break);
}
},
handle_match: |_next_byte, _start| {
self.trivia_builder.add_multi_line_comment(self.token.start, self.offset());
Kind::Skip
},
handle_eof: |_start| {
self.error(diagnostics::UnterminatedMultiLineComment(self.unterminated_range()));
Kind::Eof
},
};
} }
fn skip_multi_line_comment_after_line_break(&mut self, pos: SourcePosition) -> Kind { fn skip_multi_line_comment_after_line_break(&mut self, pos: SourcePosition) -> Kind {

View file

@ -328,40 +328,36 @@ pub(crate) use safe_byte_match_table;
/// ``` /// ```
/// impl<'a> Lexer<'a> { /// impl<'a> Lexer<'a> {
/// fn eat_stuff(&mut self) -> bool { /// fn eat_stuff(&mut self) -> bool {
/// // SAFETY: It is unsafe to use `continue_if`. See requirements below. /// byte_search! {
/// unsafe { /// lexer: self,
/// byte_search! { /// table: NOT_STUFF_TABLE,
/// lexer: self, /// continue_if: |matched_byte, pos| {
/// table: NOT_STUFF_TABLE, /// // Matching byte found. Decide whether it's really a match.
/// continue_if: |matched_byte, pos| { /// // NB: `lexer.source` has NOT been updated at this point.
/// // Matching byte found. Decide whether it's really a match. /// if matched_byte == 0xE2 {
/// // NB: `lexer.source` has NOT been updated at this point. /// // Only match a specific Unicode char (in this case 0xE2, 0x80, 0xA8)
/// // SAFETY: If return `true` to continue searching, must NOT alter `pos`. /// unsafe { pos.add(1).read() != 0x80 || pos.add(2).read() != 0xA8) }
/// if matched_byte == 0xE2 { /// } else {
/// // Only match a specific Unicode char (in this case 0xE2, 0x80, 0xA8) /// // All others do match. `handle_match` is executed.
/// unsafe { pos.add(1).read() != 0x80 || pos.add(2).read() != 0xA8) }
/// } else {
/// // All others do match. `handle_match` is executed.
/// false
/// }
/// },
/// handle_match: |matched_byte| {
/// // Matching byte has been found and `continue_if` returned `false` for it.
/// // `matched_byte` is `u8` value of first byte which matched the table.
/// // `lexer.source` is now positioned on first matching byte.
/// // Handle the next matching byte (deal with any special cases).
/// // Value this block evaluates to will be returned from enclosing function.
/// true
/// },
/// handle_eof: || {
/// // No bytes from start position to end of source matched the table.
/// // `lexer.source` is now positioned at EOF.
/// // Handle EOF in some way.
/// // Value this block evaluates to will be returned from enclosing function.
/// false /// false
/// }, /// }
/// }; /// },
/// } /// handle_match: |matched_byte| {
/// // Matching byte has been found and `continue_if` returned `false` for it.
/// // `matched_byte` is `u8` value of first byte which matched the table.
/// // `lexer.source` is now positioned on first matching byte.
/// // Handle the next matching byte (deal with any special cases).
/// // Value this block evaluates to will be returned from enclosing function.
/// true
/// },
/// handle_eof: || {
/// // No bytes from start position to end of source matched the table.
/// // `lexer.source` is now positioned at EOF.
/// // Handle EOF in some way.
/// // Value this block evaluates to will be returned from enclosing function.
/// false
/// },
/// };
/// ///
/// // This is unreachable. /// // This is unreachable.
/// // Macro always exits current function with a `return` statement. /// // Macro always exits current function with a `return` statement.
@ -414,12 +410,6 @@ macro_rules! byte_search {
handle_match: |$match_byte:ident, $match_start:ident| $match_handler:expr, handle_match: |$match_byte:ident, $match_start:ident| $match_handler:expr,
handle_eof: |$eof_start:ident| $eof_handler:expr, handle_eof: |$eof_start:ident| $eof_handler:expr,
) => {{ ) => {{
// User has free access to change `$pos` in `continue_if`.
// They must satisfy safety requirements explained above.
#[inline]
unsafe fn unsafe_noop() {}
unsafe_noop();
let start = $lexer.source.position(); let start = $lexer.source.position();
byte_search! { byte_search! {
lexer: $lexer, lexer: $lexer,
@ -458,12 +448,6 @@ macro_rules! byte_search {
handle_match: |$match_byte:ident| $match_handler:expr, handle_match: |$match_byte:ident| $match_handler:expr,
handle_eof: || $eof_handler:expr, handle_eof: || $eof_handler:expr,
) => {{ ) => {{
// User has free access to change `$pos` in `continue_if`.
// They must satisfy safety requirements explained above.
#[inline]
unsafe fn unsafe_noop() {}
unsafe_noop();
byte_search! { byte_search! {
lexer: $lexer, lexer: $lexer,
table: $table, table: $table,