From a78303d5a6ceeaa39ae24bd67e7555caa061fc2f Mon Sep 17 00:00:00 2001 From: overlookmotel Date: Tue, 20 Feb 2024 02:45:31 +0000 Subject: [PATCH] refactor(parser): `continue_if` in `byte_search` macro not unsafe (#2440) #2439 made using `continue_if` in `byte_search!` macro safe, as it no longer continues the main loop after a match, so no danger of reading out of bounds if `continue_if` code fast-forwards the current position. This follow-on PR removes the unsafe blocks, and uses that fast-forward ability in a couple of places. --- crates/oxc_parser/src/lexer/comment.rs | 218 +++++++++++++------------ crates/oxc_parser/src/lexer/search.rs | 74 ++++----- 2 files changed, 140 insertions(+), 152 deletions(-) diff --git a/crates/oxc_parser/src/lexer/comment.rs b/crates/oxc_parser/src/lexer/comment.rs index 45088ef85..5212bfdc6 100644 --- a/crates/oxc_parser/src/lexer/comment.rs +++ b/crates/oxc_parser/src/lexer/comment.rs @@ -24,59 +24,60 @@ static MULTILINE_COMMENT_START_TABLE: SafeByteMatchTable = impl<'a> Lexer<'a> { /// Section 12.4 Single Line Comment pub(super) fn skip_single_line_comment(&mut self) -> Kind { - // SAFETY: Requirement not to alter `pos` if return `true` from `if_continue` is satisfied - unsafe { - byte_search! { - lexer: self, - table: LINE_BREAK_TABLE, - continue_if: |next_byte, pos| { - // Match found. Decide whether to continue searching. - // If this is end of comment, create trivia, and advance `pos` to after line break. - // Do that here rather than in `handle_match`, to avoid branching twice on value of - // the matched byte. - #[allow(clippy::if_not_else)] - if next_byte != LS_OR_PS_FIRST { - // `\r` or `\n` - self.trivia_builder - .add_single_line_comment(self.token.start, self.source.offset_of(pos)); - // SAFETY: Safe to consume `\r` or `\n` as both are ASCII - pos = pos.add(1); - // We've found the end. Do not continue searching. - false - } else { - // `0xE2`. Could be first byte of LS/PS, or could be some other Unicode char. - // Either way, Unicode is uncommon, so make this a cold branch. - cold_branch(|| { - // SAFETY: Next byte is `0xE2` which is always 1st byte of a 3-byte UTF-8 char. - // So safe to advance `pos` by 1 and read 2 bytes. - let next2 = pos.add(1).read2(); - if next2 == LS_BYTES_2_AND_3 || next2 == PS_BYTES_2_AND_3 { - // Irregular line break - self.trivia_builder - .add_single_line_comment(self.token.start, self.source.offset_of(pos)); - // Advance `pos` to after this char. - // SAFETY: `0xE2` is always 1st byte of a 3-byte UTF-8 char, - // so consuming 3 bytes will place `pos` on next UTF-8 char boundary. - pos = pos.add(3); - // We've found the end. Do not continue searching. - false - } else { - // Some other Unicode char beginning with `0xE2`. Continue searching. - true - } - }) - } - }, - handle_match: |_next_byte, _start| { - self.token.is_on_new_line = true; - Kind::Skip - }, - handle_eof: |_start| { - self.trivia_builder.add_single_line_comment(self.token.start, self.offset()); - Kind::Skip - }, - }; - } + byte_search! { + lexer: self, + table: LINE_BREAK_TABLE, + continue_if: |next_byte, pos| { + // Match found. Decide whether to continue searching. + // If this is end of comment, create trivia, and advance `pos` to after line break. + // Do that here rather than in `handle_match`, to avoid branching twice on value of + // the matched byte. + #[allow(clippy::if_not_else)] + if next_byte != LS_OR_PS_FIRST { + // `\r` or `\n` + self.trivia_builder + .add_single_line_comment(self.token.start, self.source.offset_of(pos)); + // SAFETY: Safe to consume `\r` or `\n` as both are ASCII + unsafe { pos = pos.add(1) }; + // We've found the end. Do not continue searching. + false + } else { + // `0xE2`. Could be first byte of LS/PS, or could be some other Unicode char. + // Either way, Unicode is uncommon, so make this a cold branch. + cold_branch(|| { + // SAFETY: Next byte is `0xE2` which is always 1st byte of a 3-byte UTF-8 char. + // So safe to advance `pos` by 1 and read 2 bytes. + let next2 = unsafe { pos.add(1).read2() }; + if next2 == LS_BYTES_2_AND_3 || next2 == PS_BYTES_2_AND_3 { + // Irregular line break + self.trivia_builder + .add_single_line_comment(self.token.start, self.source.offset_of(pos)); + // Advance `pos` to after this char. + // SAFETY: `0xE2` is always 1st byte of a 3-byte UTF-8 char, + // so consuming 3 bytes will place `pos` on next UTF-8 char boundary. + unsafe { pos = pos.add(3) }; + // We've found the end. Do not continue searching. + false + } else { + // Some other Unicode char beginning with `0xE2`. + // Skip 3 bytes (macro skips 1 already, so skip 2 here), and continue searching. + // SAFETY: `0xE2` is always 1st byte of a 3-byte UTF-8 char, + // so consuming 3 bytes will place `pos` on next UTF-8 char boundary. + unsafe { pos = pos.add(2) }; + true + } + }) + } + }, + handle_match: |_next_byte, _start| { + self.token.is_on_new_line = true; + Kind::Skip + }, + handle_eof: |_start| { + self.trivia_builder.add_single_line_comment(self.token.start, self.offset()); + Kind::Skip + }, + }; } /// Section 12.4 Multi Line Comment @@ -86,64 +87,67 @@ impl<'a> Lexer<'a> { return self.skip_multi_line_comment_after_line_break(self.source.position()); } - // SAFETY: Requirement not to alter `pos` if return `true` from `if_continue` is satisfied - unsafe { - byte_search! { - lexer: self, - table: MULTILINE_COMMENT_START_TABLE, - continue_if: |next_byte, pos| { - // Match found. Decide whether to continue searching. - if next_byte == b'*' { - if pos.addr() < self.source.end_addr() - 1 { - // If next byte isn't `/`, continue - // SAFETY: Have checked there's at least 1 further byte to read - if pos.add(1).read() == b'/' { - // Consume `*/` - // SAFETY: Consuming `*/` leaves `pos` on a UTF-8 char boundary - pos = pos.add(2); - false - } else { - true - } + byte_search! { + lexer: self, + table: MULTILINE_COMMENT_START_TABLE, + continue_if: |next_byte, pos| { + // Match found. Decide whether to continue searching. + if next_byte == b'*' { + if pos.addr() < self.source.end_addr() - 1 { + // If next byte isn't `/`, continue + // SAFETY: Have checked there's at least 1 further byte to read + if unsafe { pos.add(1).read() } == b'/' { + // Consume `*/` + // SAFETY: Consuming `*/` leaves `pos` on a UTF-8 char boundary + unsafe { pos = pos.add(2) }; + false } else { - // This is last byte in file. Continue to `handle_eof`. - // This is illegal in valid JS, so mark this branch cold. - cold_branch(|| true) - } - } else if next_byte == LS_OR_PS_FIRST { - // `0xE2`. Could be first byte of LS/PS, or could be some other Unicode char. - // Either way, Unicode is uncommon, so make this a cold branch. - cold_branch(|| { - // SAFETY: Next byte is `0xE2` which is always 1st byte of a 3-byte UTF-8 char. - // So safe to advance `pos` by 1 and read 2 bytes. - let next2 = pos.add(1).read2(); - if next2 == LS_BYTES_2_AND_3 || next2 == PS_BYTES_2_AND_3 { - // Irregular line break - self.token.is_on_new_line = true; - // Ideally we'd go on to `skip_multi_line_comment_after_line_break` here - // but can't do that easily because can't use `return` in a closure. - // But irregular line breaks are rare anyway. - } - // Either way, continue searching true - }) + } } else { - // Regular line break. - // No need to look for more line breaks, so switch to faster search just for `*/`. - self.token.is_on_new_line = true; - return self.skip_multi_line_comment_after_line_break(pos.add(1)); + // This is last byte in file. Continue to `handle_eof`. + // This is illegal in valid JS, so mark this branch cold. + cold_branch(|| true) } - }, - handle_match: |_next_byte, _start| { - self.trivia_builder.add_multi_line_comment(self.token.start, self.offset()); - Kind::Skip - }, - handle_eof: |_start| { - self.error(diagnostics::UnterminatedMultiLineComment(self.unterminated_range())); - Kind::Eof - }, - }; - } + } else if next_byte == LS_OR_PS_FIRST { + // `0xE2`. Could be first byte of LS/PS, or could be some other Unicode char. + // Either way, Unicode is uncommon, so make this a cold branch. + cold_branch(|| { + // SAFETY: Next byte is `0xE2` which is always 1st byte of a 3-byte UTF-8 char. + // So safe to advance `pos` by 1 and read 2 bytes. + let next2 = unsafe { pos.add(1).read2() }; + if next2 == LS_BYTES_2_AND_3 || next2 == PS_BYTES_2_AND_3 { + // Irregular line break + self.token.is_on_new_line = true; + // Ideally we'd go on to `skip_multi_line_comment_after_line_break` here + // but can't do that easily because can't use `return` in a closure. + // But irregular line breaks are rare anyway. + } + // Either way, continue searching. + // Skip 3 bytes (macro skips 1 already, so skip 2 here), and continue searching. + // SAFETY: `0xE2` is always 1st byte of a 3-byte UTF-8 char, + // so consuming 3 bytes will place `pos` on next UTF-8 char boundary. + unsafe { pos = pos.add(2) }; + true + }) + } else { + // Regular line break. + // No need to look for more line breaks, so switch to faster search just for `*/`. + self.token.is_on_new_line = true; + // SAFETY: Regular line breaks are ASCII, so skipping 1 byte is a UTF-8 char boundary. + let after_line_break = unsafe { pos.add(1) }; + return self.skip_multi_line_comment_after_line_break(after_line_break); + } + }, + handle_match: |_next_byte, _start| { + self.trivia_builder.add_multi_line_comment(self.token.start, self.offset()); + Kind::Skip + }, + handle_eof: |_start| { + self.error(diagnostics::UnterminatedMultiLineComment(self.unterminated_range())); + Kind::Eof + }, + }; } fn skip_multi_line_comment_after_line_break(&mut self, pos: SourcePosition) -> Kind { diff --git a/crates/oxc_parser/src/lexer/search.rs b/crates/oxc_parser/src/lexer/search.rs index 25f40a22a..8d03b0919 100644 --- a/crates/oxc_parser/src/lexer/search.rs +++ b/crates/oxc_parser/src/lexer/search.rs @@ -328,40 +328,36 @@ pub(crate) use safe_byte_match_table; /// ``` /// impl<'a> Lexer<'a> { /// fn eat_stuff(&mut self) -> bool { -/// // SAFETY: It is unsafe to use `continue_if`. See requirements below. -/// unsafe { -/// byte_search! { -/// lexer: self, -/// table: NOT_STUFF_TABLE, -/// continue_if: |matched_byte, pos| { -/// // Matching byte found. Decide whether it's really a match. -/// // NB: `lexer.source` has NOT been updated at this point. -/// // SAFETY: If return `true` to continue searching, must NOT alter `pos`. -/// if matched_byte == 0xE2 { -/// // Only match a specific Unicode char (in this case 0xE2, 0x80, 0xA8) -/// unsafe { pos.add(1).read() != 0x80 || pos.add(2).read() != 0xA8) } -/// } else { -/// // All others do match. `handle_match` is executed. -/// false -/// } -/// }, -/// handle_match: |matched_byte| { -/// // Matching byte has been found and `continue_if` returned `false` for it. -/// // `matched_byte` is `u8` value of first byte which matched the table. -/// // `lexer.source` is now positioned on first matching byte. -/// // Handle the next matching byte (deal with any special cases). -/// // Value this block evaluates to will be returned from enclosing function. -/// true -/// }, -/// handle_eof: || { -/// // No bytes from start position to end of source matched the table. -/// // `lexer.source` is now positioned at EOF. -/// // Handle EOF in some way. -/// // Value this block evaluates to will be returned from enclosing function. +/// byte_search! { +/// lexer: self, +/// table: NOT_STUFF_TABLE, +/// continue_if: |matched_byte, pos| { +/// // Matching byte found. Decide whether it's really a match. +/// // NB: `lexer.source` has NOT been updated at this point. +/// if matched_byte == 0xE2 { +/// // Only match a specific Unicode char (in this case 0xE2, 0x80, 0xA8) +/// unsafe { pos.add(1).read() != 0x80 || pos.add(2).read() != 0xA8) } +/// } else { +/// // All others do match. `handle_match` is executed. /// false -/// }, -/// }; -/// } +/// } +/// }, +/// handle_match: |matched_byte| { +/// // Matching byte has been found and `continue_if` returned `false` for it. +/// // `matched_byte` is `u8` value of first byte which matched the table. +/// // `lexer.source` is now positioned on first matching byte. +/// // Handle the next matching byte (deal with any special cases). +/// // Value this block evaluates to will be returned from enclosing function. +/// true +/// }, +/// handle_eof: || { +/// // No bytes from start position to end of source matched the table. +/// // `lexer.source` is now positioned at EOF. +/// // Handle EOF in some way. +/// // Value this block evaluates to will be returned from enclosing function. +/// false +/// }, +/// }; /// /// // This is unreachable. /// // Macro always exits current function with a `return` statement. @@ -414,12 +410,6 @@ macro_rules! byte_search { handle_match: |$match_byte:ident, $match_start:ident| $match_handler:expr, handle_eof: |$eof_start:ident| $eof_handler:expr, ) => {{ - // User has free access to change `$pos` in `continue_if`. - // They must satisfy safety requirements explained above. - #[inline] - unsafe fn unsafe_noop() {} - unsafe_noop(); - let start = $lexer.source.position(); byte_search! { lexer: $lexer, @@ -458,12 +448,6 @@ macro_rules! byte_search { handle_match: |$match_byte:ident| $match_handler:expr, handle_eof: || $eof_handler:expr, ) => {{ - // User has free access to change `$pos` in `continue_if`. - // They must satisfy safety requirements explained above. - #[inline] - unsafe fn unsafe_noop() {} - unsafe_noop(); - byte_search! { lexer: $lexer, table: $table,