mirror of
https://github.com/danbulant/oxc
synced 2026-05-24 20:32:10 +00:00
refactor(parser): continue_if in byte_search macro not unsafe (#2440)
#2439 made using `continue_if` in `byte_search!` macro safe, as it no longer continues the main loop after a match, so no danger of reading out of bounds if `continue_if` code fast-forwards the current position. This follow-on PR removes the unsafe blocks, and uses that fast-forward ability in a couple of places.
This commit is contained in:
parent
a5a3c695f7
commit
a78303d5a6
2 changed files with 140 additions and 152 deletions
|
|
@ -24,59 +24,60 @@ static MULTILINE_COMMENT_START_TABLE: SafeByteMatchTable =
|
||||||
impl<'a> Lexer<'a> {
|
impl<'a> Lexer<'a> {
|
||||||
/// Section 12.4 Single Line Comment
|
/// Section 12.4 Single Line Comment
|
||||||
pub(super) fn skip_single_line_comment(&mut self) -> Kind {
|
pub(super) fn skip_single_line_comment(&mut self) -> Kind {
|
||||||
// SAFETY: Requirement not to alter `pos` if return `true` from `if_continue` is satisfied
|
byte_search! {
|
||||||
unsafe {
|
lexer: self,
|
||||||
byte_search! {
|
table: LINE_BREAK_TABLE,
|
||||||
lexer: self,
|
continue_if: |next_byte, pos| {
|
||||||
table: LINE_BREAK_TABLE,
|
// Match found. Decide whether to continue searching.
|
||||||
continue_if: |next_byte, pos| {
|
// If this is end of comment, create trivia, and advance `pos` to after line break.
|
||||||
// Match found. Decide whether to continue searching.
|
// Do that here rather than in `handle_match`, to avoid branching twice on value of
|
||||||
// If this is end of comment, create trivia, and advance `pos` to after line break.
|
// the matched byte.
|
||||||
// Do that here rather than in `handle_match`, to avoid branching twice on value of
|
#[allow(clippy::if_not_else)]
|
||||||
// the matched byte.
|
if next_byte != LS_OR_PS_FIRST {
|
||||||
#[allow(clippy::if_not_else)]
|
// `\r` or `\n`
|
||||||
if next_byte != LS_OR_PS_FIRST {
|
self.trivia_builder
|
||||||
// `\r` or `\n`
|
.add_single_line_comment(self.token.start, self.source.offset_of(pos));
|
||||||
self.trivia_builder
|
// SAFETY: Safe to consume `\r` or `\n` as both are ASCII
|
||||||
.add_single_line_comment(self.token.start, self.source.offset_of(pos));
|
unsafe { pos = pos.add(1) };
|
||||||
// SAFETY: Safe to consume `\r` or `\n` as both are ASCII
|
// We've found the end. Do not continue searching.
|
||||||
pos = pos.add(1);
|
false
|
||||||
// We've found the end. Do not continue searching.
|
} else {
|
||||||
false
|
// `0xE2`. Could be first byte of LS/PS, or could be some other Unicode char.
|
||||||
} else {
|
// Either way, Unicode is uncommon, so make this a cold branch.
|
||||||
// `0xE2`. Could be first byte of LS/PS, or could be some other Unicode char.
|
cold_branch(|| {
|
||||||
// Either way, Unicode is uncommon, so make this a cold branch.
|
// SAFETY: Next byte is `0xE2` which is always 1st byte of a 3-byte UTF-8 char.
|
||||||
cold_branch(|| {
|
// So safe to advance `pos` by 1 and read 2 bytes.
|
||||||
// SAFETY: Next byte is `0xE2` which is always 1st byte of a 3-byte UTF-8 char.
|
let next2 = unsafe { pos.add(1).read2() };
|
||||||
// So safe to advance `pos` by 1 and read 2 bytes.
|
if next2 == LS_BYTES_2_AND_3 || next2 == PS_BYTES_2_AND_3 {
|
||||||
let next2 = pos.add(1).read2();
|
// Irregular line break
|
||||||
if next2 == LS_BYTES_2_AND_3 || next2 == PS_BYTES_2_AND_3 {
|
self.trivia_builder
|
||||||
// Irregular line break
|
.add_single_line_comment(self.token.start, self.source.offset_of(pos));
|
||||||
self.trivia_builder
|
// Advance `pos` to after this char.
|
||||||
.add_single_line_comment(self.token.start, self.source.offset_of(pos));
|
// SAFETY: `0xE2` is always 1st byte of a 3-byte UTF-8 char,
|
||||||
// Advance `pos` to after this char.
|
// so consuming 3 bytes will place `pos` on next UTF-8 char boundary.
|
||||||
// SAFETY: `0xE2` is always 1st byte of a 3-byte UTF-8 char,
|
unsafe { pos = pos.add(3) };
|
||||||
// so consuming 3 bytes will place `pos` on next UTF-8 char boundary.
|
// We've found the end. Do not continue searching.
|
||||||
pos = pos.add(3);
|
false
|
||||||
// We've found the end. Do not continue searching.
|
} else {
|
||||||
false
|
// Some other Unicode char beginning with `0xE2`.
|
||||||
} else {
|
// Skip 3 bytes (macro skips 1 already, so skip 2 here), and continue searching.
|
||||||
// Some other Unicode char beginning with `0xE2`. Continue searching.
|
// SAFETY: `0xE2` is always 1st byte of a 3-byte UTF-8 char,
|
||||||
true
|
// so consuming 3 bytes will place `pos` on next UTF-8 char boundary.
|
||||||
}
|
unsafe { pos = pos.add(2) };
|
||||||
})
|
true
|
||||||
}
|
}
|
||||||
},
|
})
|
||||||
handle_match: |_next_byte, _start| {
|
}
|
||||||
self.token.is_on_new_line = true;
|
},
|
||||||
Kind::Skip
|
handle_match: |_next_byte, _start| {
|
||||||
},
|
self.token.is_on_new_line = true;
|
||||||
handle_eof: |_start| {
|
Kind::Skip
|
||||||
self.trivia_builder.add_single_line_comment(self.token.start, self.offset());
|
},
|
||||||
Kind::Skip
|
handle_eof: |_start| {
|
||||||
},
|
self.trivia_builder.add_single_line_comment(self.token.start, self.offset());
|
||||||
};
|
Kind::Skip
|
||||||
}
|
},
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Section 12.4 Multi Line Comment
|
/// Section 12.4 Multi Line Comment
|
||||||
|
|
@ -86,64 +87,67 @@ impl<'a> Lexer<'a> {
|
||||||
return self.skip_multi_line_comment_after_line_break(self.source.position());
|
return self.skip_multi_line_comment_after_line_break(self.source.position());
|
||||||
}
|
}
|
||||||
|
|
||||||
// SAFETY: Requirement not to alter `pos` if return `true` from `if_continue` is satisfied
|
byte_search! {
|
||||||
unsafe {
|
lexer: self,
|
||||||
byte_search! {
|
table: MULTILINE_COMMENT_START_TABLE,
|
||||||
lexer: self,
|
continue_if: |next_byte, pos| {
|
||||||
table: MULTILINE_COMMENT_START_TABLE,
|
// Match found. Decide whether to continue searching.
|
||||||
continue_if: |next_byte, pos| {
|
if next_byte == b'*' {
|
||||||
// Match found. Decide whether to continue searching.
|
if pos.addr() < self.source.end_addr() - 1 {
|
||||||
if next_byte == b'*' {
|
// If next byte isn't `/`, continue
|
||||||
if pos.addr() < self.source.end_addr() - 1 {
|
// SAFETY: Have checked there's at least 1 further byte to read
|
||||||
// If next byte isn't `/`, continue
|
if unsafe { pos.add(1).read() } == b'/' {
|
||||||
// SAFETY: Have checked there's at least 1 further byte to read
|
// Consume `*/`
|
||||||
if pos.add(1).read() == b'/' {
|
// SAFETY: Consuming `*/` leaves `pos` on a UTF-8 char boundary
|
||||||
// Consume `*/`
|
unsafe { pos = pos.add(2) };
|
||||||
// SAFETY: Consuming `*/` leaves `pos` on a UTF-8 char boundary
|
false
|
||||||
pos = pos.add(2);
|
|
||||||
false
|
|
||||||
} else {
|
|
||||||
true
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
// This is last byte in file. Continue to `handle_eof`.
|
|
||||||
// This is illegal in valid JS, so mark this branch cold.
|
|
||||||
cold_branch(|| true)
|
|
||||||
}
|
|
||||||
} else if next_byte == LS_OR_PS_FIRST {
|
|
||||||
// `0xE2`. Could be first byte of LS/PS, or could be some other Unicode char.
|
|
||||||
// Either way, Unicode is uncommon, so make this a cold branch.
|
|
||||||
cold_branch(|| {
|
|
||||||
// SAFETY: Next byte is `0xE2` which is always 1st byte of a 3-byte UTF-8 char.
|
|
||||||
// So safe to advance `pos` by 1 and read 2 bytes.
|
|
||||||
let next2 = pos.add(1).read2();
|
|
||||||
if next2 == LS_BYTES_2_AND_3 || next2 == PS_BYTES_2_AND_3 {
|
|
||||||
// Irregular line break
|
|
||||||
self.token.is_on_new_line = true;
|
|
||||||
// Ideally we'd go on to `skip_multi_line_comment_after_line_break` here
|
|
||||||
// but can't do that easily because can't use `return` in a closure.
|
|
||||||
// But irregular line breaks are rare anyway.
|
|
||||||
}
|
|
||||||
// Either way, continue searching
|
|
||||||
true
|
true
|
||||||
})
|
}
|
||||||
} else {
|
} else {
|
||||||
// Regular line break.
|
// This is last byte in file. Continue to `handle_eof`.
|
||||||
// No need to look for more line breaks, so switch to faster search just for `*/`.
|
// This is illegal in valid JS, so mark this branch cold.
|
||||||
self.token.is_on_new_line = true;
|
cold_branch(|| true)
|
||||||
return self.skip_multi_line_comment_after_line_break(pos.add(1));
|
|
||||||
}
|
}
|
||||||
},
|
} else if next_byte == LS_OR_PS_FIRST {
|
||||||
handle_match: |_next_byte, _start| {
|
// `0xE2`. Could be first byte of LS/PS, or could be some other Unicode char.
|
||||||
self.trivia_builder.add_multi_line_comment(self.token.start, self.offset());
|
// Either way, Unicode is uncommon, so make this a cold branch.
|
||||||
Kind::Skip
|
cold_branch(|| {
|
||||||
},
|
// SAFETY: Next byte is `0xE2` which is always 1st byte of a 3-byte UTF-8 char.
|
||||||
handle_eof: |_start| {
|
// So safe to advance `pos` by 1 and read 2 bytes.
|
||||||
self.error(diagnostics::UnterminatedMultiLineComment(self.unterminated_range()));
|
let next2 = unsafe { pos.add(1).read2() };
|
||||||
Kind::Eof
|
if next2 == LS_BYTES_2_AND_3 || next2 == PS_BYTES_2_AND_3 {
|
||||||
},
|
// Irregular line break
|
||||||
};
|
self.token.is_on_new_line = true;
|
||||||
}
|
// Ideally we'd go on to `skip_multi_line_comment_after_line_break` here
|
||||||
|
// but can't do that easily because can't use `return` in a closure.
|
||||||
|
// But irregular line breaks are rare anyway.
|
||||||
|
}
|
||||||
|
// Either way, continue searching.
|
||||||
|
// Skip 3 bytes (macro skips 1 already, so skip 2 here), and continue searching.
|
||||||
|
// SAFETY: `0xE2` is always 1st byte of a 3-byte UTF-8 char,
|
||||||
|
// so consuming 3 bytes will place `pos` on next UTF-8 char boundary.
|
||||||
|
unsafe { pos = pos.add(2) };
|
||||||
|
true
|
||||||
|
})
|
||||||
|
} else {
|
||||||
|
// Regular line break.
|
||||||
|
// No need to look for more line breaks, so switch to faster search just for `*/`.
|
||||||
|
self.token.is_on_new_line = true;
|
||||||
|
// SAFETY: Regular line breaks are ASCII, so skipping 1 byte is a UTF-8 char boundary.
|
||||||
|
let after_line_break = unsafe { pos.add(1) };
|
||||||
|
return self.skip_multi_line_comment_after_line_break(after_line_break);
|
||||||
|
}
|
||||||
|
},
|
||||||
|
handle_match: |_next_byte, _start| {
|
||||||
|
self.trivia_builder.add_multi_line_comment(self.token.start, self.offset());
|
||||||
|
Kind::Skip
|
||||||
|
},
|
||||||
|
handle_eof: |_start| {
|
||||||
|
self.error(diagnostics::UnterminatedMultiLineComment(self.unterminated_range()));
|
||||||
|
Kind::Eof
|
||||||
|
},
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
fn skip_multi_line_comment_after_line_break(&mut self, pos: SourcePosition) -> Kind {
|
fn skip_multi_line_comment_after_line_break(&mut self, pos: SourcePosition) -> Kind {
|
||||||
|
|
|
||||||
|
|
@ -328,40 +328,36 @@ pub(crate) use safe_byte_match_table;
|
||||||
/// ```
|
/// ```
|
||||||
/// impl<'a> Lexer<'a> {
|
/// impl<'a> Lexer<'a> {
|
||||||
/// fn eat_stuff(&mut self) -> bool {
|
/// fn eat_stuff(&mut self) -> bool {
|
||||||
/// // SAFETY: It is unsafe to use `continue_if`. See requirements below.
|
/// byte_search! {
|
||||||
/// unsafe {
|
/// lexer: self,
|
||||||
/// byte_search! {
|
/// table: NOT_STUFF_TABLE,
|
||||||
/// lexer: self,
|
/// continue_if: |matched_byte, pos| {
|
||||||
/// table: NOT_STUFF_TABLE,
|
/// // Matching byte found. Decide whether it's really a match.
|
||||||
/// continue_if: |matched_byte, pos| {
|
/// // NB: `lexer.source` has NOT been updated at this point.
|
||||||
/// // Matching byte found. Decide whether it's really a match.
|
/// if matched_byte == 0xE2 {
|
||||||
/// // NB: `lexer.source` has NOT been updated at this point.
|
/// // Only match a specific Unicode char (in this case 0xE2, 0x80, 0xA8)
|
||||||
/// // SAFETY: If return `true` to continue searching, must NOT alter `pos`.
|
/// unsafe { pos.add(1).read() != 0x80 || pos.add(2).read() != 0xA8) }
|
||||||
/// if matched_byte == 0xE2 {
|
/// } else {
|
||||||
/// // Only match a specific Unicode char (in this case 0xE2, 0x80, 0xA8)
|
/// // All others do match. `handle_match` is executed.
|
||||||
/// unsafe { pos.add(1).read() != 0x80 || pos.add(2).read() != 0xA8) }
|
|
||||||
/// } else {
|
|
||||||
/// // All others do match. `handle_match` is executed.
|
|
||||||
/// false
|
|
||||||
/// }
|
|
||||||
/// },
|
|
||||||
/// handle_match: |matched_byte| {
|
|
||||||
/// // Matching byte has been found and `continue_if` returned `false` for it.
|
|
||||||
/// // `matched_byte` is `u8` value of first byte which matched the table.
|
|
||||||
/// // `lexer.source` is now positioned on first matching byte.
|
|
||||||
/// // Handle the next matching byte (deal with any special cases).
|
|
||||||
/// // Value this block evaluates to will be returned from enclosing function.
|
|
||||||
/// true
|
|
||||||
/// },
|
|
||||||
/// handle_eof: || {
|
|
||||||
/// // No bytes from start position to end of source matched the table.
|
|
||||||
/// // `lexer.source` is now positioned at EOF.
|
|
||||||
/// // Handle EOF in some way.
|
|
||||||
/// // Value this block evaluates to will be returned from enclosing function.
|
|
||||||
/// false
|
/// false
|
||||||
/// },
|
/// }
|
||||||
/// };
|
/// },
|
||||||
/// }
|
/// handle_match: |matched_byte| {
|
||||||
|
/// // Matching byte has been found and `continue_if` returned `false` for it.
|
||||||
|
/// // `matched_byte` is `u8` value of first byte which matched the table.
|
||||||
|
/// // `lexer.source` is now positioned on first matching byte.
|
||||||
|
/// // Handle the next matching byte (deal with any special cases).
|
||||||
|
/// // Value this block evaluates to will be returned from enclosing function.
|
||||||
|
/// true
|
||||||
|
/// },
|
||||||
|
/// handle_eof: || {
|
||||||
|
/// // No bytes from start position to end of source matched the table.
|
||||||
|
/// // `lexer.source` is now positioned at EOF.
|
||||||
|
/// // Handle EOF in some way.
|
||||||
|
/// // Value this block evaluates to will be returned from enclosing function.
|
||||||
|
/// false
|
||||||
|
/// },
|
||||||
|
/// };
|
||||||
///
|
///
|
||||||
/// // This is unreachable.
|
/// // This is unreachable.
|
||||||
/// // Macro always exits current function with a `return` statement.
|
/// // Macro always exits current function with a `return` statement.
|
||||||
|
|
@ -414,12 +410,6 @@ macro_rules! byte_search {
|
||||||
handle_match: |$match_byte:ident, $match_start:ident| $match_handler:expr,
|
handle_match: |$match_byte:ident, $match_start:ident| $match_handler:expr,
|
||||||
handle_eof: |$eof_start:ident| $eof_handler:expr,
|
handle_eof: |$eof_start:ident| $eof_handler:expr,
|
||||||
) => {{
|
) => {{
|
||||||
// User has free access to change `$pos` in `continue_if`.
|
|
||||||
// They must satisfy safety requirements explained above.
|
|
||||||
#[inline]
|
|
||||||
unsafe fn unsafe_noop() {}
|
|
||||||
unsafe_noop();
|
|
||||||
|
|
||||||
let start = $lexer.source.position();
|
let start = $lexer.source.position();
|
||||||
byte_search! {
|
byte_search! {
|
||||||
lexer: $lexer,
|
lexer: $lexer,
|
||||||
|
|
@ -458,12 +448,6 @@ macro_rules! byte_search {
|
||||||
handle_match: |$match_byte:ident| $match_handler:expr,
|
handle_match: |$match_byte:ident| $match_handler:expr,
|
||||||
handle_eof: || $eof_handler:expr,
|
handle_eof: || $eof_handler:expr,
|
||||||
) => {{
|
) => {{
|
||||||
// User has free access to change `$pos` in `continue_if`.
|
|
||||||
// They must satisfy safety requirements explained above.
|
|
||||||
#[inline]
|
|
||||||
unsafe fn unsafe_noop() {}
|
|
||||||
unsafe_noop();
|
|
||||||
|
|
||||||
byte_search! {
|
byte_search! {
|
||||||
lexer: $lexer,
|
lexer: $lexer,
|
||||||
table: $table,
|
table: $table,
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue