perf(codegen): speed up building sourcemap line tables (#2591)

Speed up building the line tables for source maps, using same kind of techniques as have been using in the lexer:

* Iterate byte-by-byte not char-by-char (`chars` iterator is slow).
* Fast path for ASCII (common case).
This commit is contained in:
overlookmotel 2024-03-04 04:40:39 +00:00 committed by GitHub
parent ea30fd5b12
commit 42fa8ebbc2
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -118,68 +118,104 @@ impl SourcemapBuilder {
fn generate_line_offset_tables(content: &str) -> Vec<LineOffsetTable> {
let mut tables = vec![];
let mut columns = None;
let mut column = 0;
// Process content line-by-line.
// For each line, start by assuming line will be entirely ASCII, and read byte-by-byte.
// If line is all ASCII, UTF-8 columns and UTF-16 columns are the same,
// so no need to create a `columns` Vec. This is the fast path for common case.
// If a Unicode character found, read rest of line char-by-char, populating `columns` Vec.
// At end of line, go back to top of outer loop, and again assume ASCII for next line.
let mut line_byte_offset = 0;
let mut byte_offset_to_first = 0;
for (i, ch) in content.char_indices() {
// Mark the start of the next line
if column == 0 {
line_byte_offset = i;
}
'lines: loop {
tables.push(LineOffsetTable {
columns: None,
// `usize::MAX` so `original_column >= line.byte_offset_to_first` check in
// `search_original_line_and_column` fails if line is all ASCII
byte_offset_to_first: usize::MAX,
byte_offset_to_start_of_line: line_byte_offset,
});
// Start the mapping if this character is non-ASCII
if !ch.is_ascii() && columns.is_none() {
byte_offset_to_first = i - line_byte_offset;
columns = Some(vec![]);
}
// Update the per-byte column offsets
if let Some(columns) = &mut columns {
for _ in 0..ch.len_utf8() {
columns.push(column);
}
}
match ch {
'\r' | '\n' | LS | PS => {
// Handle Windows-specific "\r\n" newlines
if ch == '\r' && content.as_bytes().get(i + 1) == Some(&b'\n') {
column += 1;
let remaining = &content.as_bytes()[line_byte_offset..];
for (mut byte_offset_from_line_start, b) in remaining.iter().copied().enumerate() {
match b {
b'\n' => {
byte_offset_from_line_start += 1;
}
b'\r' => {
byte_offset_from_line_start += 1;
// Handle Windows-specific "\r\n" newlines
if remaining.get(byte_offset_from_line_start) == Some(&b'\n') {
byte_offset_from_line_start += 1;
}
}
_ if b.is_ascii() => {
continue;
}
_ => {
// Unicode char found.
// Create `columns` Vec, and set `byte_offset_to_first`.
let table = tables.iter_mut().last().unwrap();
table.byte_offset_to_first = byte_offset_from_line_start;
table.columns = Some(vec![]);
let columns = table.columns.as_mut().unwrap();
tables.push(LineOffsetTable {
columns,
byte_offset_to_first,
byte_offset_to_start_of_line: line_byte_offset,
});
column = 0;
columns = None;
byte_offset_to_first = 0;
}
_ => {
// Mozilla's "source-map" library counts columns using UTF-16 code units
column += ch.len_utf16();
}
// Loop through rest of line char-by-char.
// `chunk_byte_offset` in this loop is byte offset from start of this 1st
// Unicode char.
let mut column = byte_offset_from_line_start;
line_byte_offset += byte_offset_from_line_start;
let remaining = &content[line_byte_offset..];
for (mut chunk_byte_offset, ch) in remaining.char_indices() {
for _ in 0..ch.len_utf8() {
columns.push(column);
}
match ch {
'\r' => {
// Handle Windows-specific "\r\n" newlines
chunk_byte_offset += 1;
if remaining.as_bytes().get(chunk_byte_offset) == Some(&b'\n') {
chunk_byte_offset += 1;
columns.push(column + 1);
}
}
'\n' => {
chunk_byte_offset += 1;
}
LS | PS => {
chunk_byte_offset += 3;
}
_ => {
// Mozilla's "source-map" library counts columns using UTF-16 code units
column += ch.len_utf16();
continue;
}
}
// Line break found.
// `chunk_byte_offset` is now the offset of *end* of the line break.
line_byte_offset += chunk_byte_offset;
// Revert back to outer loop for next line
continue 'lines;
}
// EOF.
// One last column entry for EOF position.
columns.push(column);
break 'lines;
}
};
// Line break found.
// `byte_offset_from_line_start` is now the length of line *including* line break.
line_byte_offset += byte_offset_from_line_start;
continue 'lines;
}
}
// Mark the start of the next line
if column == 0 {
line_byte_offset = content.len();
}
// Do one last update for the column at the end of the file
if let Some(columns) = &mut columns {
columns.push(column);
// EOF
break;
}
tables.push(LineOffsetTable {
columns,
byte_offset_to_first,
byte_offset_to_start_of_line: line_byte_offset,
});
tables
}
}