// oxc/crates/oxc_codegen/src/sourcemap_builder.rs

use std::sync::Arc;

use oxc_span::Span;
use oxc_syntax::identifier::{LS, PS};

// Irregular line breaks - '\u{2028}' (LS) and '\u{2029}' (PS).
// In UTF-8 these are encoded as the 3-byte sequences `E2 80 A8` (LS) and `E2 80 A9` (PS),
// so they share their first two bytes and differ only in the third.
// First byte of both LS and PS.
const LS_OR_PS_FIRST: u8 = 0xE2;
// Second byte of both LS and PS.
const LS_OR_PS_SECOND: u8 = 0x80;
// Third byte of LS.
const LS_THIRD: u8 = 0xA8;
// Third byte of PS.
const PS_THIRD: u8 = 0xA9;

/// Line offset table
///
/// Describes one line of the original source, so that a byte offset into the source can be
/// converted to a line number and a UTF-16 column.
/// `SourcemapBuilder::generate_line_offset_tables` creates one table per line, and
/// `SourcemapBuilder::search_original_line_and_column` scans the tables to find the line
/// containing a given byte offset.
///
/// Code is adapted from [esbuild](https://github.com/evanw/esbuild/blob/cc74e6042a9f573bf58e1e3f165ebda70af4ad3b/internal/js_printer/js_printer.go#L4806-L4808)
#[derive(Debug)]
pub struct LineOffsetTable {
    /// UTF-16 column for each byte of the line, starting from the first non-ASCII character
    /// (index 0 corresponds to the byte at `byte_offset_to_first`).
    /// `None` if the line is entirely ASCII, in which case the column is simply the byte offset
    /// from the start of the line.
    columns: Option<Vec<usize>>,
    /// Byte offset, measured from the start of the line, of the first non-ASCII character.
    /// `usize::MAX` if the line is entirely ASCII.
    byte_offset_to_first: usize,
    /// Byte offset of the start of this line within the source text.
    byte_offset_to_start_of_line: usize,
}

/// Records mappings from positions in generated output back to positions in one original source.
///
/// Register the source with `with_name_and_source`, add tokens with `add_source_mapping` /
/// `add_source_mapping_for_name` while output is being produced, then call `into_sourcemap`.
// NOTE(review): the `allow` is presumably needed because the `sourcemap_builder` field repeats
// the struct's own name - confirm before removing.
#[allow(clippy::struct_field_names)]
pub struct SourcemapBuilder {
    /// Source id returned by the inner builder's `set_source_and_content`.
    source_id: u32,
    /// Copy of the original source text, used to look up original names by span.
    original_source: Arc<str>,
    /// Length of the output that has already been scanned for line breaks.
    last_generated_update: usize,
    /// Original-source position of the most recently added token.
    /// `add_source_mapping` ignores positions which are not greater than this.
    last_position: Option<u32>,
    /// Line index found by the previous original-position lookup; the next lookup starts here.
    last_search_line: usize,
    /// One table per line of the original source.
    line_offset_tables: Vec<LineOffsetTable>,
    /// Underlying builder which stores sources, names and tokens.
    sourcemap_builder: oxc_sourcemap::SourceMapBuilder,
    /// Current line in the generated output (0-based).
    generated_line: u32,
    /// Current column in the generated output (0-based, counted in UTF-16 code units).
    generated_column: u32,
}

impl Default for SourcemapBuilder {
    fn default() -> Self {
        Self {
            source_id: 0,
            original_source: "".into(),
            last_generated_update: 0,
            last_position: None,
            last_search_line: 0,
            line_offset_tables: vec![],
            sourcemap_builder: oxc_sourcemap::SourceMapBuilder::default(),
            generated_line: 0,
            generated_column: 0,
        }
    }
}

impl SourcemapBuilder {
    pub fn with_name_and_source(&mut self, name: &str, source: &str) {
        self.line_offset_tables = Self::generate_line_offset_tables(source);
        self.source_id = self.sourcemap_builder.set_source_and_content(name, source);
        self.original_source = source.into();
    }

    pub fn into_sourcemap(self) -> oxc_sourcemap::SourceMap {
        self.sourcemap_builder.into_sourcemap()
    }

    pub fn add_source_mapping_for_name(&mut self, output: &[u8], span: Span, name: &str) {
        // SAFETY: search original string by span.
        let original_name =
            unsafe { self.original_source.get_unchecked(span.start as usize..span.end as usize) };
        // The token name should be original name.
        // If it hasn't change, name should be `None` to reduce `SourceMap` size.
        let token_name = if original_name == name { None } else { Some(original_name.into()) };
        self.add_source_mapping(output, span.start, token_name);
    }

    /// Adds a token mapping the current end of `output` to `position` in the original source.
    ///
    /// * `output` - generated output so far; the token is placed at its end.
    /// * `position` - byte offset in the original source.
    /// * `name` - original name to attach to the token, if any.
    ///
    /// Does nothing if `position` is not greater than the position of the previously added token.
    pub fn add_source_mapping(&mut self, output: &[u8], position: u32, name: Option<Arc<str>>) {
        match self.last_position {
            Some(previous) if previous >= position => return,
            _ => {}
        }

        let (original_line, original_column) = self.search_original_line_and_column(position);
        self.update_generated_line_and_column(output);

        let source_id = Some(self.source_id);
        let name_id = name.as_deref().map(|name| self.sourcemap_builder.add_name(name));
        self.sourcemap_builder.add_token(
            self.generated_line,
            self.generated_column,
            original_line,
            original_column,
            source_id,
            name_id,
        );

        self.last_position = Some(position);
    }

    /// Converts a byte offset in the original source to a `(line, column)` pair.
    ///
    /// Both are 0-based. Column is counted in UTF-16 code units.
    ///
    /// Lookups are expected to arrive in ascending order, so the scan resumes from the line found
    /// by the previous call. If `position` lies before that line (or the cached line index is no
    /// longer valid), the scan restarts from the first line, so the result is correct for any
    /// order of calls.
    ///
    /// # Panics
    /// Panics if no source has been registered (there are no line tables), or if `position` is
    /// beyond the end of a line which contains non-ASCII characters.
    #[allow(clippy::cast_possible_truncation)]
    fn search_original_line_and_column(&mut self, position: u32) -> (u32, u32) {
        let position = position as usize;
        let tables = &self.line_offset_tables;

        // Only resume from the cached line if it starts at or before `position`.
        // Otherwise the forward scan below would skip the line we are looking for.
        let search_from = match tables.get(self.last_search_line) {
            Some(table) if table.byte_offset_to_start_of_line <= position => self.last_search_line,
            _ => 0,
        };

        // Find first line starting *after* `position`. The line before it contains `position`.
        // `tables[search_from]` starts at or before `position` (line 0 starts at offset 0),
        // so a match is always at relative index >= 1 and the subtraction cannot underflow.
        // If no line starts after `position`, it is on the last line.
        let original_line = tables[search_from..]
            .iter()
            .position(|table| table.byte_offset_to_start_of_line > position)
            .map_or(tables.len().saturating_sub(1), |offset| search_from + offset - 1);
        self.last_search_line = original_line;

        let line = &tables[original_line];
        let byte_column = position - line.byte_offset_to_start_of_line;
        let original_column = match &line.columns {
            // At or after the first non-ASCII char, byte offset and UTF-16 column diverge,
            // so look the column up. `byte_offset_to_first` is `usize::MAX` for ASCII-only lines.
            Some(columns) if byte_column >= line.byte_offset_to_first => {
                columns[byte_column - line.byte_offset_to_first]
            }
            _ => byte_column,
        };
        (original_line as u32, original_column as u32)
    }

    /// Advances `generated_line` / `generated_column` to the end of `output`.
    ///
    /// Only the part of `output` added since the previous call is scanned.
    /// `\n`, `\r`, `\r\n`, LS (U+2028) and PS (U+2029) are treated as line breaks.
    /// Column is counted in UTF-16 code units.
    ///
    /// # Panics
    /// Panics if `output` is shorter than it was on the previous call, or if the unscanned part
    /// is not valid UTF-8.
    #[allow(clippy::cast_possible_truncation)]
    fn update_generated_line_and_column(&mut self, output: &[u8]) {
        let pending = &output[self.last_generated_update..];

        // Offset within `pending` at which the last line starts
        let mut last_line_start = 0;
        let mut last_line_is_ascii = true;
        let mut cursor = 0;
        while let Some(&byte) = pending.get(cursor) {
            cursor += 1;
            let is_line_break = match byte {
                b'\n' => true,
                b'\r' => {
                    // Windows-specific "\r\n" counts as a single line break
                    if pending.get(cursor) == Some(&b'\n') {
                        cursor += 1;
                    }
                    true
                }
                LS_OR_PS_FIRST => {
                    // 0xE2 starts a 3-byte UTF-8 sequence, so in valid UTF-8 two more bytes
                    // always follow. Consume them, whether or not this is LS / PS.
                    let second = pending[cursor];
                    let third = pending[cursor + 1];
                    cursor += 2;
                    second == LS_OR_PS_SECOND && matches!(third, LS_THIRD | PS_THIRD)
                }
                _ => false,
            };

            if is_line_break {
                // `cursor` is now positioned after the line break
                last_line_start = cursor;
                self.generated_line += 1;
                self.generated_column = 0;
                last_line_is_ascii = true;
            } else if !byte.is_ascii() {
                last_line_is_ascii = false;
            }
        }

        // Add length of the last line to the column
        let last_line = &pending[last_line_start..];
        let columns = if last_line_is_ascii {
            // For ASCII, byte length and UTF-16 length are the same
            last_line.len()
        } else {
            // TODO: It'd be better if could use `from_utf8_unchecked` here, but we'd need to make
            // this function unsafe and caller guarantees `output` contains a valid UTF-8 string
            let last_line = std::str::from_utf8(last_line).unwrap();
            // Mozilla's "source-map" library counts columns using UTF-16 code units
            last_line.encode_utf16().count()
        };
        self.generated_column += columns as u32;
        self.last_generated_update = output.len();
    }

    /// Builds one [`LineOffsetTable`] per line of `content`.
    ///
    /// `\n`, `\r`, `\r\n`, LS (U+2028) and PS (U+2029) end a line. A table is also produced for
    /// the (possibly empty) line after the final line break, so the result is never empty.
    ///
    /// Each line is first assumed to be entirely ASCII and is scanned byte-by-byte. For such lines
    /// UTF-8 and UTF-16 columns are the same, so no `columns` Vec is created (fast path for the
    /// common case). Once a non-ASCII character is found, the rest of the line is read
    /// char-by-char, recording the UTF-16 column for every byte.
    fn generate_line_offset_tables(content: &str) -> Vec<LineOffsetTable> {
        let bytes = content.as_bytes();
        let mut tables = Vec::new();
        let mut line_start = 0;

        loop {
            let mut table = LineOffsetTable {
                columns: None,
                // `usize::MAX` so `original_column >= line.byte_offset_to_first` check in
                // `search_original_line_and_column` fails if line is all ASCII
                byte_offset_to_first: usize::MAX,
                byte_offset_to_start_of_line: line_start,
            };

            // Fast path: skip ASCII bytes until a line break or a non-ASCII byte
            let line_bytes = &bytes[line_start..];
            let stop =
                line_bytes.iter().position(|&b| matches!(b, b'\n' | b'\r') || !b.is_ascii());

            // Byte offset of start of the next line, or `None` if EOF was reached
            let next_line_start = match stop {
                None => None,
                Some(idx) => match line_bytes[idx] {
                    b'\n' => Some(line_start + idx + 1),
                    b'\r' => {
                        // Handle Windows-specific "\r\n" newlines
                        let break_len = if line_bytes.get(idx + 1) == Some(&b'\n') { 2 } else { 1 };
                        Some(line_start + idx + break_len)
                    }
                    _ => {
                        // Non-ASCII char found at `idx`. Read rest of line char-by-char.
                        // `offset` below is byte offset from start of this 1st non-ASCII char.
                        table.byte_offset_to_first = idx;
                        let unicode_start = line_start + idx;
                        let tail = &content[unicode_start..];

                        let mut columns = Vec::new();
                        let mut column = idx;
                        let mut after_break = None;
                        for (offset, ch) in tail.char_indices() {
                            // One entry per UTF-8 byte of the char
                            columns.resize(columns.len() + ch.len_utf8(), column);

                            let break_len = match ch {
                                '\r' if tail.as_bytes().get(offset + 1) == Some(&b'\n') => {
                                    // Windows-specific "\r\n": the `\n` gets its own column
                                    columns.push(column + 1);
                                    2
                                }
                                '\r' | '\n' => 1,
                                LS | PS => 3,
                                _ => {
                                    // Mozilla's "source-map" library counts columns using UTF-16
                                    // code units
                                    column += ch.len_utf16();
                                    continue;
                                }
                            };

                            // Line break found. Next line starts straight after it.
                            after_break = Some(unicode_start + offset + break_len);
                            break;
                        }

                        if after_break.is_none() {
                            // EOF. One last column entry for EOF position.
                            columns.push(column);
                        }
                        table.columns = Some(columns);
                        after_break
                    }
                },
            };

            tables.push(table);
            match next_line_start {
                Some(next) => line_start = next,
                None => break,
            }
        }

        tables
    }
}

#[cfg(test)]
mod test {
    use super::*;

    /// Original line/column lookup for ASCII-only sources, covering `\n`, `\r` and `\r\n`.
    /// Each mapping is `(byte position, expected line, expected column)`.
    #[test]
    fn builder_ascii() {
        type Mapping = (u32, u32, u32);

        let cases: &[(&str, &[Mapping])] = &[
            // `\n` line breaks
            ("", &[(0, 0, 0)]),
            ("a", &[(0, 0, 0), (1, 0, 1)]),
            ("\n", &[(0, 0, 0), (1, 1, 0)]),
            ("a\n", &[(0, 0, 0), (1, 0, 1), (2, 1, 0)]),
            ("\na", &[(0, 0, 0), (1, 1, 0), (2, 1, 1)]),
            (
                "ab\ncd\n\nef",
                &[
                    (0, 0, 0),
                    (1, 0, 1),
                    (2, 0, 2),
                    (3, 1, 0),
                    (4, 1, 1),
                    (5, 1, 2),
                    (6, 2, 0),
                    (7, 3, 0),
                    (8, 3, 1),
                    (9, 3, 2),
                ],
            ),
            // `\r` line breaks
            ("\r", &[(0, 0, 0), (1, 1, 0)]),
            ("\r\r", &[(0, 0, 0), (1, 1, 0), (2, 2, 0)]),
            ("a\ra", &[(0, 0, 0), (1, 0, 1), (2, 1, 0), (3, 1, 1)]),
            // `\r\n` line breaks
            ("\r\n", &[(0, 0, 0), (1, 0, 1), (2, 1, 0)]),
            ("\r\n\r\n", &[(0, 0, 0), (1, 0, 1), (2, 1, 0), (3, 1, 1), (4, 2, 0)]),
            ("a\r\na", &[(0, 0, 0), (1, 0, 1), (2, 0, 2), (3, 1, 0), (4, 1, 1)]),
        ];

        for &(source, mappings) in cases {
            assert_mapping(source, mappings);
        }
    }

    /// Original line/column lookup for sources containing a 2-byte UTF-8 character (`Ö`),
    /// which occupies a single UTF-16 column.
    /// Each mapping is `(byte position, expected line, expected column)`.
    #[test]
    fn builder_unicode() {
        type Mapping = (u32, u32, u32);

        let cases: &[(&str, &[Mapping])] = &[
            ("Ö", &[(0, 0, 0), (2, 0, 1)]),
            ("ÖÖ", &[(0, 0, 0), (2, 0, 1), (4, 0, 2)]),
            ("Ö\n", &[(0, 0, 0), (2, 0, 1), (3, 1, 0)]),
            ("ÖÖ\n", &[(0, 0, 0), (2, 0, 1), (4, 0, 2), (5, 1, 0)]),
            ("\nÖ", &[(0, 0, 0), (1, 1, 0), (3, 1, 1)]),
            ("\nÖÖ", &[(0, 0, 0), (1, 1, 0), (3, 1, 1), (5, 1, 2)]),
            ("Ö\nÖ", &[(0, 0, 0), (2, 0, 1), (3, 1, 0), (5, 1, 1)]),
            ("\nÖÖ\n", &[(0, 0, 0), (1, 1, 0), (3, 1, 1), (5, 1, 2), (6, 2, 0)]),
            ("Ö\ra", &[(0, 0, 0), (2, 0, 1), (3, 1, 0), (4, 1, 1)]),
            ("Ö\r\na", &[(0, 0, 0), (2, 0, 1), (3, 0, 2), (4, 1, 0), (5, 1, 1)]),
        ];

        for &(source, mappings) in cases {
            assert_mapping(source, mappings);
        }
    }

    /// Asserts that each byte `position` in `source` resolves to the expected original line and
    /// column. `mappings` entries are `(position, expected line, expected column)` and are looked
    /// up in the order given, on a single builder.
    fn assert_mapping(source: &str, mappings: &[(u32, u32, u32)]) {
        let mut builder = SourcemapBuilder::default();
        builder.with_name_and_source("x.js", source);
        for &(position, expected_line, expected_col) in mappings {
            // Look up once, and assert on the same value which the failure message reports
            let (line, col) = builder.search_original_line_and_column(position);
            assert_eq!(
                (line, col),
                (expected_line, expected_col),
                "Incorrect mapping for '{source}' - position {position} = line {line}, column {col}"
            );
        }
    }

    /// Generated line/column tracking: after mapping against the complete output, the generated
    /// position must be the end of the output, for all kinds of line break.
    #[test]
    fn add_source_mapping() {
        /// Uses `source` as both original source and generated output, adds a mapping at the
        /// start of every char, and checks the generated position is always `line`, `column`.
        fn create_mappings(source: &str, line: u32, column: u32) {
            let mut builder = SourcemapBuilder::default();
            builder.with_name_and_source("x.js", source);
            let output = source.as_bytes().to_vec();
            for i in source.char_indices().map(|(index, _)| index) {
                #[allow(clippy::cast_possible_truncation)]
                let position = i as u32;
                builder.add_source_mapping(&output, position, None);
                assert!(
                    builder.generated_line == line && builder.generated_column == column,
                    "Incorrect generated mapping for '{source}' ({:?}) starting at {i} - line {}, column {}",
                    source.as_bytes(),
                    builder.generated_line,
                    builder.generated_column
                );
                assert_eq!(builder.last_generated_update, source.len());
            }
        }

        // (source, expected generated line, expected generated column)
        const CASES: &[(&str, u32, u32)] = &[
            ("", 0, 0),
            ("abc", 0, 3),
            ("\n", 1, 0),
            ("\n\n\n", 3, 0),
            ("\r", 1, 0),
            ("\r\r\r", 3, 0),
            ("\r\n", 1, 0),
            ("\r\n\r\n\r\n", 3, 0),
            ("\nabc", 1, 3),
            ("abc\n", 1, 0),
            ("\rabc", 1, 3),
            ("abc\r", 1, 0),
            ("\r\nabc", 1, 3),
            ("abc\r\n", 1, 0),
            ("ÖÖ\nÖ\nÖÖÖ", 2, 3),
        ];

        for &(source, line, column) in CASES {
            create_mappings(source, line, column);
        }
    }

    /// A token only carries a name when the identifier was renamed, and that name is the
    /// original one.
    #[test]
    fn add_source_mapping_for_name() {
        // Original source is `ab`, generated output is `ac`
        let output: &[u8] = b"ac";
        let mut builder = SourcemapBuilder::default();
        builder.with_name_and_source("x.js", "ab");
        builder.add_source_mapping_for_name(output, Span::new(0, 1), "a");
        builder.add_source_mapping_for_name(output, Span::new(1, 2), "c");
        let sm = builder.into_sourcemap();

        // The name `a` has not changed, so no name is stored.
        let first_token = sm.get_source_view_token(0_u32);
        let first_name = first_token.as_ref().and_then(|token| token.get_name());
        assert_eq!(first_name, None);

        // The name `b` was renamed to `c`, so original name `b` is stored on the token.
        let second_token = sm.get_source_view_token(1_u32);
        let second_name = second_token.as_ref().and_then(|token| token.get_name());
        assert_eq!(second_name, Some("b"));
    }
}