refactor(parser): continue_if in byte_search macro not unsafe (#2440)

#2439 made using `continue_if` in `byte_search!` macro safe, as it no longer continues the main loop after a match, so no danger of reading out of bounds if `continue_if` code fast-forwards the current position.

This follow-on PR removes the unsafe blocks, and uses that fast-forward ability in a couple of places.
This commit is contained in:
overlookmotel 2024-02-20 02:45:31 +00:00 committed by GitHub
parent a5a3c695f7
commit a78303d5a6
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 140 additions and 152 deletions

View file

@ -24,59 +24,60 @@ static MULTILINE_COMMENT_START_TABLE: SafeByteMatchTable =
impl<'a> Lexer<'a> {
/// Section 12.4 Single Line Comment
pub(super) fn skip_single_line_comment(&mut self) -> Kind {
// SAFETY: Requirement not to alter `pos` if return `true` from `if_continue` is satisfied
unsafe {
byte_search! {
lexer: self,
table: LINE_BREAK_TABLE,
continue_if: |next_byte, pos| {
// Match found. Decide whether to continue searching.
// If this is end of comment, create trivia, and advance `pos` to after line break.
// Do that here rather than in `handle_match`, to avoid branching twice on value of
// the matched byte.
#[allow(clippy::if_not_else)]
if next_byte != LS_OR_PS_FIRST {
// `\r` or `\n`
self.trivia_builder
.add_single_line_comment(self.token.start, self.source.offset_of(pos));
// SAFETY: Safe to consume `\r` or `\n` as both are ASCII
pos = pos.add(1);
// We've found the end. Do not continue searching.
false
} else {
// `0xE2`. Could be first byte of LS/PS, or could be some other Unicode char.
// Either way, Unicode is uncommon, so make this a cold branch.
cold_branch(|| {
// SAFETY: Next byte is `0xE2` which is always 1st byte of a 3-byte UTF-8 char.
// So safe to advance `pos` by 1 and read 2 bytes.
let next2 = pos.add(1).read2();
if next2 == LS_BYTES_2_AND_3 || next2 == PS_BYTES_2_AND_3 {
// Irregular line break
self.trivia_builder
.add_single_line_comment(self.token.start, self.source.offset_of(pos));
// Advance `pos` to after this char.
// SAFETY: `0xE2` is always 1st byte of a 3-byte UTF-8 char,
// so consuming 3 bytes will place `pos` on next UTF-8 char boundary.
pos = pos.add(3);
// We've found the end. Do not continue searching.
false
} else {
// Some other Unicode char beginning with `0xE2`. Continue searching.
true
}
})
}
},
handle_match: |_next_byte, _start| {
self.token.is_on_new_line = true;
Kind::Skip
},
handle_eof: |_start| {
self.trivia_builder.add_single_line_comment(self.token.start, self.offset());
Kind::Skip
},
};
}
byte_search! {
lexer: self,
table: LINE_BREAK_TABLE,
continue_if: |next_byte, pos| {
// Match found. Decide whether to continue searching.
// If this is end of comment, create trivia, and advance `pos` to after line break.
// Do that here rather than in `handle_match`, to avoid branching twice on value of
// the matched byte.
#[allow(clippy::if_not_else)]
if next_byte != LS_OR_PS_FIRST {
// `\r` or `\n`
self.trivia_builder
.add_single_line_comment(self.token.start, self.source.offset_of(pos));
// SAFETY: Safe to consume `\r` or `\n` as both are ASCII
unsafe { pos = pos.add(1) };
// We've found the end. Do not continue searching.
false
} else {
// `0xE2`. Could be first byte of LS/PS, or could be some other Unicode char.
// Either way, Unicode is uncommon, so make this a cold branch.
cold_branch(|| {
// SAFETY: Next byte is `0xE2` which is always 1st byte of a 3-byte UTF-8 char.
// So safe to advance `pos` by 1 and read 2 bytes.
let next2 = unsafe { pos.add(1).read2() };
if next2 == LS_BYTES_2_AND_3 || next2 == PS_BYTES_2_AND_3 {
// Irregular line break
self.trivia_builder
.add_single_line_comment(self.token.start, self.source.offset_of(pos));
// Advance `pos` to after this char.
// SAFETY: `0xE2` is always 1st byte of a 3-byte UTF-8 char,
// so consuming 3 bytes will place `pos` on next UTF-8 char boundary.
unsafe { pos = pos.add(3) };
// We've found the end. Do not continue searching.
false
} else {
// Some other Unicode char beginning with `0xE2`.
// Skip 3 bytes (macro skips 1 already, so skip 2 here), and continue searching.
// SAFETY: `0xE2` is always 1st byte of a 3-byte UTF-8 char,
// so consuming 3 bytes will place `pos` on next UTF-8 char boundary.
unsafe { pos = pos.add(2) };
true
}
})
}
},
handle_match: |_next_byte, _start| {
self.token.is_on_new_line = true;
Kind::Skip
},
handle_eof: |_start| {
self.trivia_builder.add_single_line_comment(self.token.start, self.offset());
Kind::Skip
},
};
}
/// Section 12.4 Multi Line Comment
@ -86,64 +87,67 @@ impl<'a> Lexer<'a> {
return self.skip_multi_line_comment_after_line_break(self.source.position());
}
// SAFETY: Requirement not to alter `pos` if return `true` from `if_continue` is satisfied
unsafe {
byte_search! {
lexer: self,
table: MULTILINE_COMMENT_START_TABLE,
continue_if: |next_byte, pos| {
// Match found. Decide whether to continue searching.
if next_byte == b'*' {
if pos.addr() < self.source.end_addr() - 1 {
// If next byte isn't `/`, continue
// SAFETY: Have checked there's at least 1 further byte to read
if pos.add(1).read() == b'/' {
// Consume `*/`
// SAFETY: Consuming `*/` leaves `pos` on a UTF-8 char boundary
pos = pos.add(2);
false
} else {
true
}
byte_search! {
lexer: self,
table: MULTILINE_COMMENT_START_TABLE,
continue_if: |next_byte, pos| {
// Match found. Decide whether to continue searching.
if next_byte == b'*' {
if pos.addr() < self.source.end_addr() - 1 {
// If next byte isn't `/`, continue
// SAFETY: Have checked there's at least 1 further byte to read
if unsafe { pos.add(1).read() } == b'/' {
// Consume `*/`
// SAFETY: Consuming `*/` leaves `pos` on a UTF-8 char boundary
unsafe { pos = pos.add(2) };
false
} else {
// This is last byte in file. Continue to `handle_eof`.
// This is illegal in valid JS, so mark this branch cold.
cold_branch(|| true)
}
} else if next_byte == LS_OR_PS_FIRST {
// `0xE2`. Could be first byte of LS/PS, or could be some other Unicode char.
// Either way, Unicode is uncommon, so make this a cold branch.
cold_branch(|| {
// SAFETY: Next byte is `0xE2` which is always 1st byte of a 3-byte UTF-8 char.
// So safe to advance `pos` by 1 and read 2 bytes.
let next2 = pos.add(1).read2();
if next2 == LS_BYTES_2_AND_3 || next2 == PS_BYTES_2_AND_3 {
// Irregular line break
self.token.is_on_new_line = true;
// Ideally we'd go on to `skip_multi_line_comment_after_line_break` here
// but can't do that easily because can't use `return` in a closure.
// But irregular line breaks are rare anyway.
}
// Either way, continue searching
true
})
}
} else {
// Regular line break.
// No need to look for more line breaks, so switch to faster search just for `*/`.
self.token.is_on_new_line = true;
return self.skip_multi_line_comment_after_line_break(pos.add(1));
// This is last byte in file. Continue to `handle_eof`.
// This is illegal in valid JS, so mark this branch cold.
cold_branch(|| true)
}
},
handle_match: |_next_byte, _start| {
self.trivia_builder.add_multi_line_comment(self.token.start, self.offset());
Kind::Skip
},
handle_eof: |_start| {
self.error(diagnostics::UnterminatedMultiLineComment(self.unterminated_range()));
Kind::Eof
},
};
}
} else if next_byte == LS_OR_PS_FIRST {
// `0xE2`. Could be first byte of LS/PS, or could be some other Unicode char.
// Either way, Unicode is uncommon, so make this a cold branch.
cold_branch(|| {
// SAFETY: Next byte is `0xE2` which is always 1st byte of a 3-byte UTF-8 char.
// So safe to advance `pos` by 1 and read 2 bytes.
let next2 = unsafe { pos.add(1).read2() };
if next2 == LS_BYTES_2_AND_3 || next2 == PS_BYTES_2_AND_3 {
// Irregular line break
self.token.is_on_new_line = true;
// Ideally we'd go on to `skip_multi_line_comment_after_line_break` here
// but can't do that easily because can't use `return` in a closure.
// But irregular line breaks are rare anyway.
}
// Either way, continue searching.
// Skip 3 bytes (macro skips 1 already, so skip 2 here), and continue searching.
// SAFETY: `0xE2` is always 1st byte of a 3-byte UTF-8 char,
// so consuming 3 bytes will place `pos` on next UTF-8 char boundary.
unsafe { pos = pos.add(2) };
true
})
} else {
// Regular line break.
// No need to look for more line breaks, so switch to faster search just for `*/`.
self.token.is_on_new_line = true;
// SAFETY: Regular line breaks are ASCII, so skipping 1 byte is a UTF-8 char boundary.
let after_line_break = unsafe { pos.add(1) };
return self.skip_multi_line_comment_after_line_break(after_line_break);
}
},
handle_match: |_next_byte, _start| {
self.trivia_builder.add_multi_line_comment(self.token.start, self.offset());
Kind::Skip
},
handle_eof: |_start| {
self.error(diagnostics::UnterminatedMultiLineComment(self.unterminated_range()));
Kind::Eof
},
};
}
fn skip_multi_line_comment_after_line_break(&mut self, pos: SourcePosition) -> Kind {

View file

@ -328,40 +328,36 @@ pub(crate) use safe_byte_match_table;
/// ```
/// impl<'a> Lexer<'a> {
/// fn eat_stuff(&mut self) -> bool {
/// // SAFETY: It is unsafe to use `continue_if`. See requirements below.
/// unsafe {
/// byte_search! {
/// lexer: self,
/// table: NOT_STUFF_TABLE,
/// continue_if: |matched_byte, pos| {
/// // Matching byte found. Decide whether it's really a match.
/// // NB: `lexer.source` has NOT been updated at this point.
/// // SAFETY: If return `true` to continue searching, must NOT alter `pos`.
/// if matched_byte == 0xE2 {
/// // Only match a specific Unicode char (in this case 0xE2, 0x80, 0xA8)
/// unsafe { pos.add(1).read() != 0x80 || pos.add(2).read() != 0xA8) }
/// } else {
/// // All others do match. `handle_match` is executed.
/// false
/// }
/// },
/// handle_match: |matched_byte| {
/// // Matching byte has been found and `continue_if` returned `false` for it.
/// // `matched_byte` is `u8` value of first byte which matched the table.
/// // `lexer.source` is now positioned on first matching byte.
/// // Handle the next matching byte (deal with any special cases).
/// // Value this block evaluates to will be returned from enclosing function.
/// true
/// },
/// handle_eof: || {
/// // No bytes from start position to end of source matched the table.
/// // `lexer.source` is now positioned at EOF.
/// // Handle EOF in some way.
/// // Value this block evaluates to will be returned from enclosing function.
/// byte_search! {
/// lexer: self,
/// table: NOT_STUFF_TABLE,
/// continue_if: |matched_byte, pos| {
/// // Matching byte found. Decide whether it's really a match.
/// // NB: `lexer.source` has NOT been updated at this point.
/// if matched_byte == 0xE2 {
/// // Only match a specific Unicode char (in this case 0xE2, 0x80, 0xA8)
/// unsafe { pos.add(1).read() != 0x80 || pos.add(2).read() != 0xA8) }
/// } else {
/// // All others do match. `handle_match` is executed.
/// false
/// },
/// };
/// }
/// }
/// },
/// handle_match: |matched_byte| {
/// // Matching byte has been found and `continue_if` returned `false` for it.
/// // `matched_byte` is `u8` value of first byte which matched the table.
/// // `lexer.source` is now positioned on first matching byte.
/// // Handle the next matching byte (deal with any special cases).
/// // Value this block evaluates to will be returned from enclosing function.
/// true
/// },
/// handle_eof: || {
/// // No bytes from start position to end of source matched the table.
/// // `lexer.source` is now positioned at EOF.
/// // Handle EOF in some way.
/// // Value this block evaluates to will be returned from enclosing function.
/// false
/// },
/// };
///
/// // This is unreachable.
/// // Macro always exits current function with a `return` statement.
@ -414,12 +410,6 @@ macro_rules! byte_search {
handle_match: |$match_byte:ident, $match_start:ident| $match_handler:expr,
handle_eof: |$eof_start:ident| $eof_handler:expr,
) => {{
// User has free access to change `$pos` in `continue_if`.
// They must satisfy safety requirements explained above.
#[inline]
unsafe fn unsafe_noop() {}
unsafe_noop();
let start = $lexer.source.position();
byte_search! {
lexer: $lexer,
@ -458,12 +448,6 @@ macro_rules! byte_search {
handle_match: |$match_byte:ident| $match_handler:expr,
handle_eof: || $eof_handler:expr,
) => {{
// User has free access to change `$pos` in `continue_if`.
// They must satisfy safety requirements explained above.
#[inline]
unsafe fn unsafe_noop() {}
unsafe_noop();
byte_search! {
lexer: $lexer,
table: $table,