fix(transformer): JSX source calculate correct column when Unicode chars (#3615)

Fix column number in JSX source transform, and add tests.

It was correct in all cases, except for when a Unicode character with code point above `0xFFFF` appears earlier on the line.

Such characters are:

* 4 bytes in UTF-8.
* 2 code units (a surrogate pair) in UTF-16.
* 1 `char` in Rust.

Babel (which we're trying to match) uses the count of UTF-16 code units for the column number, whereas we were using the count of Rust `char`s.
This commit is contained in:
overlookmotel 2024-06-11 06:41:18 +00:00
parent 9e8f4d60b5
commit 8d237c49a9
2 changed files with 88 additions and 8 deletions

View file

@ -85,6 +85,9 @@ impl<'a> ReactJsxSource<'a> {
let key = JSXAttributeName::Identifier(
self.ctx.ast.alloc(self.ctx.ast.jsx_identifier(SPAN, SOURCE.into())),
);
// TODO: We shouldn't calculate line + column from scratch each time as it's expensive.
// Build a table of byte indexes of each line's start on first usage, and save it.
// Then calculate line and column from that.
let (line, column) = get_line_column(elem.span.start, self.ctx.source_text);
let object = self.get_source_object(line, column, ctx);
let expr = self.ctx.ast.jsx_expression_container(SPAN, JSXExpression::from(object));

View file

@ -1,14 +1,91 @@
use ropey::Rope;
/// Get line and column from offset and source text
/// Get line and column from offset and source text.
///
/// Line number starts at 1.
/// Column number is in UTF-16 characters, and starts at 1.
///
/// This matches Babel's output.
pub fn get_line_column(offset: u32, source_text: &str) -> (usize, usize) {
let offset = offset as usize;
let rope = Rope::from_str(source_text);
let line = rope.byte_to_line(offset);
let first_char_of_line = rope.line_to_char(line);
// Original offset is byte, but Rope uses char offset
let offset = rope.byte_to_char(offset);
let column = offset - first_char_of_line;
// line and column is zero-indexed, but we want 1-indexed
(line + 1, column + 1)
// Get line number and byte offset of start of line
let line_index = rope.byte_to_line(offset);
let line_offset = rope.line_to_byte(line_index);
// Get column number
let column_index = source_text[line_offset..offset].encode_utf16().count();
// line and column are zero-indexed, but we want 1-indexed
(line_index + 1, column_index + 1)
}
/// Offset 0 of an empty source is the first column of the first line.
#[test]
fn empty_file() {
    let position = get_line_column(0, "");
    assert_eq!(position, (1, 1));
}
/// Offset 0 of a multi-line source is the first column of the first line.
#[test]
fn first_line_start() {
    let source = "foo\nbar\n";
    let (line, column) = get_line_column(0, source);
    assert_eq!((line, column), (1, 1));
}
/// An offset part-way through the first line: 5 ASCII bytes precede it, so column is 6.
#[test]
fn first_line_middle() {
    let source = "blahblahblah\noops\n";
    let position = get_line_column(5, source);
    assert_eq!(position, (1, 6));
}
/// An offset immediately after the second line break is column 1 of line 3.
#[test]
fn later_line_start() {
    let source = "foo\nbar\nblahblahblah";
    let (line, column) = get_line_column(8, source);
    assert_eq!((line, column), (3, 1));
}
/// An offset part-way through the third line: 4 ASCII bytes precede it on that line.
#[test]
fn later_line_middle() {
    let source = "foo\nbar\nblahblahblah";
    let position = get_line_column(12, source);
    assert_eq!(position, (3, 5));
}
/// A 2-byte UTF-8 character earlier on the line counts as 1 column.
#[test]
fn after_2_byte_unicode() {
    let pound = "£";
    // Sanity-check the fixture: 2 bytes in UTF-8, 1 code unit in UTF-16
    assert_eq!(pound.len(), 2);
    assert_eq!(utf16_len(pound), 1);

    let position = get_line_column(4, "£abc");
    assert_eq!(position, (1, 4));
}
/// A 3-byte UTF-8 character earlier on the line counts as 1 column.
#[test]
fn after_3_byte_unicode() {
    // Sanity-check the fixture: 3 bytes in UTF-8, 1 code unit in UTF-16
    assert_eq!("अ".len(), 3);
    assert_eq!(utf16_len("अ"), 1);
    // Byte offset 5 is the `c`: `अ` occupies bytes 0-2, then `a` and `b`
    assert_eq!(get_line_column(5, "अabc"), (1, 4));
}
/// A 4-byte UTF-8 character earlier on the line counts as 2 columns,
/// because it is 2 code units in UTF-16.
#[test]
fn after_4_byte_unicode() {
    let mushroom = "🍄";
    // Sanity-check the fixture: 4 bytes in UTF-8, 2 code units in UTF-16
    assert_eq!(mushroom.len(), 4);
    assert_eq!(utf16_len(mushroom), 2);

    let position = get_line_column(6, "🍄abc");
    assert_eq!(position, (1, 5));
}
/// A 2-byte UTF-8 character on the previous line must not affect the column.
#[test]
fn after_2_byte_unicode_on_previous_line() {
    let pound = "£";
    // Sanity-check the fixture: 2 bytes in UTF-8, 1 code unit in UTF-16
    assert_eq!(pound.len(), 2);
    assert_eq!(utf16_len(pound), 1);

    let position = get_line_column(4, "£\nabc");
    assert_eq!(position, (2, 2));
}
/// A 3-byte UTF-8 character on the previous line must not affect the column.
#[test]
fn after_3_byte_unicode_on_previous_line() {
    // Sanity-check the fixture: 3 bytes in UTF-8, 1 code unit in UTF-16
    assert_eq!("अ".len(), 3);
    assert_eq!(utf16_len("अ"), 1);
    // Byte offset 5 is the `b`: `अ` occupies bytes 0-2, `\n` is byte 3, `a` is byte 4
    assert_eq!(get_line_column(5, "अ\nabc"), (2, 2));
}
/// A 4-byte UTF-8 character on the previous line must not affect the column.
#[test]
fn after_4_byte_unicode_on_previous_line() {
    let mushroom = "🍄";
    // Sanity-check the fixture: 4 bytes in UTF-8, 2 code units in UTF-16
    assert_eq!(mushroom.len(), 4);
    assert_eq!(utf16_len(mushroom), 2);

    let position = get_line_column(6, "🍄\nabc");
    assert_eq!(position, (2, 2));
}
/// Test helper: length of `s` in UTF-16 code units.
#[cfg(test)]
fn utf16_len(s: &str) -> usize {
    // Each `char` contributes 1 code unit, or 2 if it needs a surrogate pair
    s.chars().map(char::len_utf16).sum()
}