From dd31c6453ac15a516784d4ef38f33fdec222978c Mon Sep 17 00:00:00 2001 From: overlookmotel Date: Fri, 1 Mar 2024 13:28:39 +0000 Subject: [PATCH] refactor(parser): `byte_search` macro evaluate to matched byte (#2555) Change behavior of `byte_search!` macro, to make it easier to understand and use: 1. `handle_match` removed. Macro instead evaluates to the first matching byte. 2. `handle_eof` does not return from enclosing function. 3. Alter syntax to make clear that `continue_if` and `handle_eof` are not closures, so can use `return` statements in them. These changes enabled by #2552. --- crates/oxc_parser/src/lexer/comment.rs | 26 ++-- crates/oxc_parser/src/lexer/identifier.rs | 118 ++++++++-------- crates/oxc_parser/src/lexer/search.rs | 159 +++++++++------------- crates/oxc_parser/src/lexer/string.rs | 57 ++++---- crates/oxc_parser/src/lexer/template.rs | 28 ++-- crates/oxc_parser/src/lexer/whitespace.rs | 9 +- 6 files changed, 179 insertions(+), 218 deletions(-) diff --git a/crates/oxc_parser/src/lexer/comment.rs b/crates/oxc_parser/src/lexer/comment.rs index 119960ea5..8122dfd1e 100644 --- a/crates/oxc_parser/src/lexer/comment.rs +++ b/crates/oxc_parser/src/lexer/comment.rs @@ -27,7 +27,7 @@ impl<'a> Lexer<'a> { byte_search! { lexer: self, table: LINE_BREAK_TABLE, - continue_if: |next_byte, pos| { + continue_if: (next_byte, pos) { // Match found. Decide whether to continue searching. // If this is end of comment, create trivia, and advance `pos` to after line break. // Do that here rather than in `handle_match`, to avoid branching twice on value of @@ -69,15 +69,14 @@ impl<'a> Lexer<'a> { }) } }, - handle_match: |_next_byte| { - self.token.is_on_new_line = true; - Kind::Skip - }, - handle_eof: || { + handle_eof: { self.trivia_builder.add_single_line_comment(self.token.start, self.offset()); - Kind::Skip + return Kind::Skip; }, }; + + self.token.is_on_new_line = true; + Kind::Skip } /// Section 12.4 Multi Line Comment @@ -90,7 +89,7 @@ impl<'a> Lexer<'a> { byte_search! { lexer: self, table: MULTILINE_COMMENT_START_TABLE, - continue_if: |next_byte, pos| { + continue_if: (next_byte, pos) { // Match found. Decide whether to continue searching. if next_byte == b'*' { // SAFETY: Next byte is `*` (ASCII) so after it is UTF-8 char boundary @@ -145,15 +144,14 @@ impl<'a> Lexer<'a> { return self.skip_multi_line_comment_after_line_break(after_line_break); } }, - handle_match: |_next_byte| { - self.trivia_builder.add_multi_line_comment(self.token.start, self.offset()); - Kind::Skip - }, - handle_eof: || { + handle_eof: { self.error(diagnostics::UnterminatedMultiLineComment(self.unterminated_range())); - Kind::Eof + return Kind::Eof; }, }; + + self.trivia_builder.add_multi_line_comment(self.token.start, self.offset()); + Kind::Skip } fn skip_multi_line_comment_after_line_break(&mut self, pos: SourcePosition) -> Kind { diff --git a/crates/oxc_parser/src/lexer/identifier.rs b/crates/oxc_parser/src/lexer/identifier.rs index 30eadf41e..312fa92ca 100644 --- a/crates/oxc_parser/src/lexer/identifier.rs +++ b/crates/oxc_parser/src/lexer/identifier.rs @@ -54,45 +54,44 @@ impl<'a> Lexer<'a> { let after_first = self.source.position().add(1); // Consume bytes which are part of identifier - byte_search! { + let next_byte = byte_search! { lexer: self, table: NOT_ASCII_ID_CONTINUE_TABLE, start: after_first, - handle_match: |next_byte| { - // Found a matching byte. - // Either end of identifier found, or a Unicode char, or `\` escape. - // Handle uncommon cases in cold branches to keep the common ASCII path - // as fast as possible. - if !next_byte.is_ascii() { - return cold_branch(|| { - // SAFETY: `after_first` is position after consuming 1 byte, so subtracting 1 - // makes `start_pos` `source`'s position as it was at start of this function - let start_pos = unsafe { after_first.sub(1) }; - &self.identifier_tail_unicode(start_pos)[1..] - }); - } - if next_byte == b'\\' { - return cold_branch(|| { - // SAFETY: `after_first` is position after consuming 1 byte, so subtracting 1 - // makes `start_pos` `source`'s position as it was at start of this function - let start_pos = unsafe { after_first.sub(1) }; - &self.identifier_backslash(start_pos, false)[1..] - }); - } - - // Return identifier minus its first char. - // SAFETY: `after_first` was position of `lexer.source` at start of this search. - // Searching only proceeds in forwards direction, so `lexer.source.position()` - // cannot be before `after_first`. - unsafe { self.source.str_from_pos_to_current_unchecked(after_first) } - }, - handle_eof: || { + handle_eof: { // Return identifier minus its first char. // SAFETY: `lexer.source` is positioned at EOF, so there is no valid value // of `after_first` which could be after current position. - unsafe { self.source.str_from_pos_to_current_unchecked(after_first) } + return unsafe { self.source.str_from_pos_to_current_unchecked(after_first) }; }, }; + + // Found a matching byte. + // Either end of identifier found, or a Unicode char, or `\` escape. + // Handle uncommon cases in cold branches to keep the common ASCII path + // as fast as possible. + if !next_byte.is_ascii() { + return cold_branch(|| { + // SAFETY: `after_first` is position after consuming 1 byte, so subtracting 1 + // makes `start_pos` `source`'s position as it was at start of this function + let start_pos = unsafe { after_first.sub(1) }; + &self.identifier_tail_unicode(start_pos)[1..] + }); + } + if next_byte == b'\\' { + return cold_branch(|| { + // SAFETY: `after_first` is position after consuming 1 byte, so subtracting 1 + // makes `start_pos` `source`'s position as it was at start of this function + let start_pos = unsafe { after_first.sub(1) }; + &self.identifier_backslash(start_pos, false)[1..] + }); + } + + // Return identifier minus its first char. + // SAFETY: `after_first` was position of `lexer.source` at start of this search. + // Searching only proceeds in forwards direction, so `lexer.source.position()` + // cannot be before `after_first`. + unsafe { self.source.str_from_pos_to_current_unchecked(after_first) } } /// Handle rest of identifier after first byte of a multi-byte Unicode char found. @@ -233,40 +232,39 @@ impl<'a> Lexer<'a> { let after_first = unsafe { start_pos.add(1) }; // Consume bytes which are part of identifier - byte_search! { + let next_byte = byte_search! { lexer: self, table: NOT_ASCII_ID_CONTINUE_TABLE, start: after_first, - handle_match: |next_byte| { - // Found a matching byte. - // Either end of identifier found, or a Unicode char, or `\` escape. - // Handle uncommon cases in cold branches to keep the common ASCII path - // as fast as possible. - if !next_byte.is_ascii() { - return cold_branch(|| { - // SAFETY: `after_first` is position after consuming 1 byte, so subtracting 1 - // makes `start_pos` `source`'s position as it was at start of this function - let start_pos = unsafe { after_first.sub(1) }; - self.identifier_tail_unicode(start_pos); - Kind::PrivateIdentifier - }); - } - if next_byte == b'\\' { - return cold_branch(|| { - // SAFETY: `after_first` is position after consuming 1 byte, so subtracting 1 - // makes `start_pos` `source`'s position as it was at start of this function - let start_pos = unsafe { after_first.sub(1) }; - self.identifier_backslash(start_pos, false); - Kind::PrivateIdentifier - }); - } - - Kind::PrivateIdentifier - }, - handle_eof: || { - Kind::PrivateIdentifier + handle_eof: { + return Kind::PrivateIdentifier; }, }; + + // Found a matching byte. + // Either end of identifier found, or a Unicode char, or `\` escape. + // Handle uncommon cases in cold branches to keep the common ASCII path + // as fast as possible. + if !next_byte.is_ascii() { + return cold_branch(|| { + // SAFETY: `after_first` is position after consuming 1 byte, so subtracting 1 + // makes `start_pos` `source`'s position as it was at start of this function + let start_pos = unsafe { after_first.sub(1) }; + self.identifier_tail_unicode(start_pos); + Kind::PrivateIdentifier + }); + } + if next_byte == b'\\' { + return cold_branch(|| { + // SAFETY: `after_first` is position after consuming 1 byte, so subtracting 1 + // makes `start_pos` `source`'s position as it was at start of this function + let start_pos = unsafe { after_first.sub(1) }; + self.identifier_backslash(start_pos, false); + Kind::PrivateIdentifier + }); + } + + Kind::PrivateIdentifier } /// Handle private identifier whose first byte is not an ASCII identifier start byte. diff --git a/crates/oxc_parser/src/lexer/search.rs b/crates/oxc_parser/src/lexer/search.rs index 488d2edff..40f116b16 100644 --- a/crates/oxc_parser/src/lexer/search.rs +++ b/crates/oxc_parser/src/lexer/search.rs @@ -250,10 +250,9 @@ pub(crate) use safe_byte_match_table; /// Search processes source in batches of `SEARCH_BATCH_SIZE` bytes for speed. /// When not enough bytes remaining in source for a batch, search source byte by byte. /// -/// This is a macro rather than a function for 2 reasons: -/// 1. Searching is a bit faster when all the code is in a single function. -/// 2. The `handle_match` section has to be repeated twice. -/// This macro does that, so code using the macro can be DRY-er. +/// This is a macro rather than a function because searching is a bit faster when all the code +/// is in a single function, and some parts (e.g. `continue_if`) can be statically removed by +/// the compiler if they're not used. /// /// Used as follows: /// @@ -262,28 +261,24 @@ pub(crate) use safe_byte_match_table; /// /// impl<'a> Lexer<'a> { /// fn eat_stuff(&mut self) -> bool { -/// byte_search! { +/// let matched_byte = byte_search! { /// lexer: self, /// table: NOT_STUFF_TABLE, -/// handle_match: |matched_byte| { -/// // Matching byte has been found. -/// // `matched_byte` is `u8` value of first byte which matched the table. -/// // `lexer.source` is now positioned on first matching byte. -/// // Handle the next matching byte (deal with any special cases). -/// // Value this block evaluates to will be returned from enclosing function. -/// matched_byte == b'X' -/// }, -/// handle_eof: || { +/// handle_eof: { /// // No bytes from start position to end of source matched the table. /// // `lexer.source` is now positioned at EOF. -/// // Handle EOF in some way. -/// // Value this block evaluates to will be returned from enclosing function. -/// false +/// // Evaluate to a `u8` which macro call will evaluate to. +/// 0xFF +/// // Or can `return` from enclosing function e.g. `return false;` /// }, /// }; /// -/// // This is unreachable. -/// // Macro always exits current function with a `return` statement. +/// // Matching byte has been found. +/// // `matched_byte` is `u8` value of first byte which matched the table +/// // (or `0xFF` if EOF, because `handle_eof` evaluates to `0xFF`). +/// // `lexer.source` is now positioned on first matching byte. +/// // Handle the next matching byte (deal with any special cases). +/// matched_byte == b'X' /// } /// } /// ``` @@ -294,79 +289,65 @@ pub(crate) use safe_byte_match_table; /// impl<'a> Lexer<'a> { /// fn eat_stuff(&mut self) -> bool { /// let start = unsafe { self.source.position().add(1) }; -/// byte_search! { +/// let matched_byte = byte_search! { /// lexer: self, /// table: NOT_STUFF_TABLE, /// start: start, -/// handle_match: |matched_byte| { -/// // Matching byte has been found. -/// // `matched_byte` is `u8` value of first byte which matched the table. -/// // `lexer.source` is now positioned on first matching byte. -/// // Handle the next matching byte (deal with any special cases). -/// // Value this block evaluates to will be returned from enclosing function. -/// true -/// }, -/// handle_eof: || { +/// handle_eof: { /// // No bytes from start position to end of source matched the table. /// // `lexer.source` is now positioned at EOF. -/// // Handle EOF in some way. -/// // Value this block evaluates to will be returned from enclosing function. -/// false +/// return false; /// }, /// }; /// -/// // This is unreachable. -/// // Macro always exits current function with a `return` statement. +/// // Matching byte has been found. +/// // `matched_byte` is `u8` value of first byte which matched the table. +/// // `lexer.source` is now positioned on first matching byte. +/// // Handle the next matching byte (deal with any special cases). +/// matched_byte == b'X' /// } /// } /// ``` /// /// Can also add a block to decide whether to continue searching for some matches: /// -/// ```text +/// ``` /// impl<'a> Lexer<'a> { /// fn eat_stuff(&mut self) -> bool { -/// byte_search! { +/// let matched_byte = byte_search! { /// lexer: self, /// table: NOT_STUFF_TABLE, -/// continue_if: |matched_byte, pos| { +/// continue_if: (matched_byte, pos) { /// // Matching byte found. Decide whether it's really a match. +/// // Return `true` to continue searching, or `false` to end search. /// // NB: `lexer.source` has NOT been updated at this point. /// if matched_byte == 0xE2 { /// // Only match a specific Unicode char (in this case 0xE2, 0x80, 0xA8) -/// unsafe { pos.add(1).read() != 0x80 || pos.add(2).read() != 0xA8) } +/// // NB: We don't need to check if `pos` is at EOF here, as 0xE2 is always 1st byte +/// // of a 3-byte Unicode char, but if matching an ASCII char, would need to make sure +/// // don't read out of bounds. +/// unsafe { pos.add(1).read() != 0x80 || pos.add(2).read() != 0xA8 } /// } else { -/// // All others do match. `handle_match` is executed. +/// // End search for all other possibilities /// false /// } /// }, -/// handle_match: |matched_byte| { -/// // Matching byte has been found and `continue_if` returned `false` for it. -/// // `matched_byte` is `u8` value of first byte which matched the table. -/// // `lexer.source` is now positioned on first matching byte. -/// // Handle the next matching byte (deal with any special cases). -/// // Value this block evaluates to will be returned from enclosing function. -/// true -/// }, -/// handle_eof: || { +/// handle_eof: { /// // No bytes from start position to end of source matched the table. /// // `lexer.source` is now positioned at EOF. -/// // Handle EOF in some way. -/// // Value this block evaluates to will be returned from enclosing function. -/// false +/// return false; /// }, /// }; /// -/// // This is unreachable. -/// // Macro always exits current function with a `return` statement. +/// // Matching byte has been found. +/// // `matched_byte` is `u8` value of first byte which matched the table. +/// // `lexer.source` is now positioned on first matching byte. +/// // Handle the next matching byte (deal with any special cases). +/// matched_byte == b'X' /// } /// } /// ``` /// -/// NB: The macro always causes enclosing function to return. -/// It creates `return` statements with the value that `handle_match` / `handle_eof` blocks evaluate to. -/// After the `byte_search!` macro is unreachable. -/// /// # SAFETY /// /// This macro will consume bytes from `lexer.source` according to the `ByteMatchTable` @@ -380,71 +361,64 @@ pub(crate) use safe_byte_match_table; /// It is caller's responsibility to ensure that `lexer.source` is moved onto a UTF-8 character boundary. /// This is similar to the contract's of `Source`'s unsafe methods. macro_rules! byte_search { - // Standard version. + // Simple version. // `start` is calculated from current position of `lexer.source`. ( lexer: $lexer:ident, table: $table:ident, - handle_match: |$match_byte:ident| $match_handler:expr, - handle_eof: || $eof_handler:expr, + handle_eof: $eof_handler:expr, ) => {{ let start = $lexer.source.position(); byte_search! { lexer: $lexer, table: $table, start: start, - continue_if: |byte, pos| false, - handle_match: |$match_byte| $match_handler, - handle_eof: || $eof_handler, + continue_if: (byte, pos) false, + handle_eof: $eof_handler, } }}; - // Standard version with `continue_if`. + // With `continue_if`. // `start` is calculated from current position of `lexer.source`. ( lexer: $lexer:ident, table: $table:ident, - continue_if: |$continue_byte:ident, $pos:ident| $should_continue:expr, - handle_match: |$match_byte:ident| $match_handler:expr, - handle_eof: || $eof_handler:expr, + continue_if: ($byte:ident, $pos:ident) $should_continue:expr, + handle_eof: $eof_handler:expr, ) => {{ let start = $lexer.source.position(); byte_search! { lexer: $lexer, table: $table, start: start, - continue_if: |$continue_byte, $pos| $should_continue, - handle_match: |$match_byte| $match_handler, - handle_eof: || $eof_handler, + continue_if: ($byte, $pos) $should_continue, + handle_eof: $eof_handler, } }}; - // Provide your own `start` position + // With provided `start` position ( lexer: $lexer:ident, table: $table:ident, start: $start:ident, - handle_match: |$match_byte:ident| $match_handler:expr, - handle_eof: || $eof_handler:expr, + handle_eof: $eof_handler:expr, ) => { byte_search! { lexer: $lexer, table: $table, start: $start, - continue_if: |byte, pos| false, - handle_match: |$match_byte| $match_handler, - handle_eof: || $eof_handler, + continue_if: (byte, pos) false, + handle_eof: $eof_handler, } }; - // Actual implementation + // Actual implementation - with both `start` and `continue_if` ( lexer: $lexer:ident, table: $table:ident, start: $start:ident, - continue_if: |$continue_byte:ident, $pos:ident| $should_continue:expr, - handle_match: |$match_byte:ident| $match_handler:expr, - handle_eof: || $eof_handler:expr, + continue_if: ($byte:ident, $pos:ident) $should_continue:expr, + handle_eof: $eof_handler:expr, ) => {{ // SAFETY: // If `$table` is a `SafeByteMatchTable`, it's guaranteed that `lexer.source` @@ -458,8 +432,8 @@ macro_rules! byte_search { let mut $pos = $start; #[allow(unused_unsafe)] // Silence warnings if macro called in unsafe code - let $match_byte = 'outer: loop { - let $continue_byte = if $pos.addr() <= $lexer.source.end_for_batch_search_addr() { + 'outer: loop { + let $byte = if $pos.addr() <= $lexer.source.end_for_batch_search_addr() { // Search a batch of `SEARCH_BATCH_SIZE` bytes. // // `'inner: loop {}` is not a real loop - it always exits on first turn. @@ -510,7 +484,12 @@ macro_rules! byte_search { // Advance `lexer.source`'s position to end of file. $lexer.source.set_position($pos); - return $eof_handler; + // Avoid lint errors if `$eof_handler` contains `return` statement + #[allow(unused_variables, unreachable_code, clippy::diverging_sub_expression)] + { + let eof_ret = $eof_handler; + break 'outer eof_ret; + } } }; @@ -523,15 +502,13 @@ macro_rules! byte_search { continue; } - // Match confirmed - break $continue_byte; - }; + // Match confirmed. + // Advance `lexer.source`'s position up to `$pos`, consuming unmatched bytes. + // SAFETY: See above about UTF-8 character boundaries invariant. + $lexer.source.set_position($pos); - // Advance `lexer.source`'s position up to `$pos`, consuming unmatched bytes. - // SAFETY: See above about UTF-8 character boundaries invariant. - $lexer.source.set_position($pos); - - return $match_handler; + break $byte; + } }}; } pub(crate) use byte_search; diff --git a/crates/oxc_parser/src/lexer/string.rs b/crates/oxc_parser/src/lexer/string.rs index 1fc616e0d..0af68bf4a 100644 --- a/crates/oxc_parser/src/lexer/string.rs +++ b/crates/oxc_parser/src/lexer/string.rs @@ -37,43 +37,38 @@ macro_rules! handle_string_literal { let after_opening_quote = $lexer.source.position().add(1); // Consume bytes which are part of identifier - byte_search! { + let next_byte = byte_search! { lexer: $lexer, table: $table, start: after_opening_quote, - handle_match: |next_byte| { - // Found a matching byte. - // Either end of string found, or a line break, or `\` escape. - match next_byte { - $delimiter => { - // SAFETY: `handle_match` is only called if there's a byte to consume, - // and `next_byte` is the next byte in `lexer.source`. - // Macro user guarantees delimiter is ASCII, so consuming it cannot move - // `lexer.source` off a UTF-8 character boundary. - $lexer.source.next_byte_unchecked(); - Kind::Str - }, - b'\\' => { - cold_branch(|| { - handle_string_literal_escape!($lexer, $delimiter, $table, after_opening_quote) - }) - }, - _ => { - // Line break. This is impossible in valid JS, so cold path. - cold_branch(|| { - debug_assert!(matches!(next_byte, b'\r' | b'\n')); - $lexer.consume_char(); - $lexer.error(diagnostics::UnterminatedString($lexer.unterminated_range())); - Kind::Undetermined - }) - } - } - }, - handle_eof: || { + handle_eof: { $lexer.error(diagnostics::UnterminatedString($lexer.unterminated_range())); - Kind::Undetermined + return Kind::Undetermined; }, }; + + // Found a matching byte. + // Either end of string found, or a line break, or `\` escape. + match next_byte { + $delimiter => { + // SAFETY: Macro user guarantees delimiter is ASCII, so consuming it cannot move + // `lexer.source` off a UTF-8 character boundary. + $lexer.source.next_byte_unchecked(); + Kind::Str + } + b'\\' => cold_branch(|| { + handle_string_literal_escape!($lexer, $delimiter, $table, after_opening_quote) + }), + _ => { + // Line break. This is impossible in valid JS, so cold path. + cold_branch(|| { + debug_assert!(matches!(next_byte, b'\r' | b'\n')); + $lexer.consume_char(); + $lexer.error(diagnostics::UnterminatedString($lexer.unterminated_range())); + Kind::Undetermined + }) + } + } }}; } diff --git a/crates/oxc_parser/src/lexer/template.rs b/crates/oxc_parser/src/lexer/template.rs index 88f13920d..462d7b81b 100644 --- a/crates/oxc_parser/src/lexer/template.rs +++ b/crates/oxc_parser/src/lexer/template.rs @@ -29,7 +29,7 @@ impl<'a> Lexer<'a> { byte_search! { lexer: self, table: TEMPLATE_LITERAL_TABLE, - continue_if: |next_byte, pos| { + continue_if: (next_byte, pos) { match next_byte { b'$' => { // SAFETY: Next byte is `$` which is ASCII, so after it is a UTF-8 char boundary @@ -73,14 +73,13 @@ impl<'a> Lexer<'a> { } } }, - handle_match: |_next_byte| { - ret - }, - handle_eof: || { + handle_eof: { self.error(diagnostics::UnterminatedString(self.unterminated_range())); - Kind::Undetermined + return Kind::Undetermined; }, }; + + ret } /// Consume rest of template literal after a `\r` is found. @@ -195,7 +194,7 @@ impl<'a> Lexer<'a> { lexer: self, table: TEMPLATE_LITERAL_TABLE, start: pos, - continue_if: |next_byte, pos| { + continue_if: (next_byte, pos) { if next_byte == b'$' { // SAFETY: Next byte is `$` which is ASCII, so after it is a UTF-8 char boundary let after_dollar = pos.add(1); @@ -295,18 +294,15 @@ impl<'a> Lexer<'a> { } } }, - handle_match: |_next_byte| { - self.save_template_string( - is_valid_escape_sequence, - str.into_bump_str(), - ); - ret - }, - handle_eof: || { + handle_eof: { self.error(diagnostics::UnterminatedString(self.unterminated_range())); - Kind::Undetermined + return Kind::Undetermined; }, }; + + self.save_template_string(is_valid_escape_sequence, str.into_bump_str()); + + ret } /// Re-tokenize the current `}` token for `TemplateSubstitutionTail` diff --git a/crates/oxc_parser/src/lexer/whitespace.rs b/crates/oxc_parser/src/lexer/whitespace.rs index 7310e6ec6..577c12aec 100644 --- a/crates/oxc_parser/src/lexer/whitespace.rs +++ b/crates/oxc_parser/src/lexer/whitespace.rs @@ -17,12 +17,9 @@ impl<'a> Lexer<'a> { byte_search! { lexer: self, table: NOT_REGULAR_WHITESPACE_OR_LINE_BREAK_TABLE, - handle_match: |_next_byte| { - Kind::Skip - }, - handle_eof: || { - Kind::Skip - }, + handle_eof: 0, // Fall through to below }; + + Kind::Skip } }