oxc/crates/oxc_sourcemap/src/encode.rs
overlookmotel ff43dff99d perf(sourcemap): speed up VLQ encoding (#4633)
Speed up source map VLQ encoding by removing a couple of operations from `serialize_mappings`'s hot loop.

On a local benchmark of just VLQ encoding, this change produces 5% performance increase (benchmarked on MacBook Pro M1).
2024-08-05 01:34:59 +00:00

456 lines
16 KiB
Rust

use std::borrow::Cow;
#[cfg(feature = "concurrent")]
use rayon::prelude::*;
use crate::JSONSourceMap;
/// Ported from https://github.com/getsentry/rust-sourcemap/blob/master/src/encoder.rs
/// It is a helper for encoding a `SourceMap` to a VLQ sourcemap string, with some differences:
/// - `source_content` is quoted in parallel.
/// - If you are using `ConcatSourceMapBuilder`, `tokens` are serialized to VLQ `mappings` in parallel.
use crate::{token::TokenChunk, SourceMap, Token};
/// Convert a `SourceMap` into a `JSONSourceMap`.
///
/// Every string is copied into an owned `String`; `mappings` is produced by
/// `serialize_sourcemap_mappings`. `sources_content` is only present when the
/// source map carries source contents.
pub fn encode(sourcemap: &SourceMap) -> JSONSourceMap {
    let file = sourcemap.get_file().map(ToString::to_string);
    let mappings = serialize_sourcemap_mappings(sourcemap);
    let source_root = sourcemap.get_source_root().map(ToString::to_string);
    let sources = sourcemap.sources.iter().map(|source| Some(source.to_string())).collect();
    let sources_content = sourcemap
        .source_contents
        .as_ref()
        .map(|contents| contents.iter().map(|content| Some(content.to_string())).collect());
    let names = sourcemap.names.iter().map(|name| name.to_string()).collect();

    JSONSourceMap {
        file,
        mappings: Some(mappings),
        source_root,
        sources: Some(sources),
        sources_content,
        names: Some(names),
    }
}
// Here using `serde_json` to serialize `names` / `source_contents` / `sources`.
// It will escape the string to avoid invalid JSON string.
pub fn encode_to_string(sourcemap: &SourceMap) -> String {
let max_segments = 12
+ sourcemap.names.len() * 2
+ sourcemap.sources.len() * 2
+ sourcemap.source_contents.as_ref().map_or(0, |sources| sources.len() * 2 + 1)
+ sourcemap.x_google_ignore_list.as_ref().map_or(0, |x| x.len() * 2 + 1);
let mut contents = PreAllocatedString::new(max_segments);
contents.push("{\"version\":3,".into());
if let Some(file) = sourcemap.get_file() {
contents.push("\"file\":\"".into());
contents.push(file.into());
contents.push("\",".into());
}
if let Some(source_root) = sourcemap.get_source_root() {
contents.push("\"sourceRoot\":\"".into());
contents.push(source_root.into());
contents.push("\",".into());
}
contents.push("\"names\":[".into());
contents.push_list(sourcemap.names.iter().map(escape_json_string));
contents.push("],\"sources\":[".into());
contents.push_list(sourcemap.sources.iter().map(escape_json_string));
// Quote `source_content` in parallel
if let Some(source_contents) = &sourcemap.source_contents {
contents.push("],\"sourcesContent\":[".into());
cfg_if::cfg_if! {
if #[cfg(feature = "concurrent")] {
let quoted_source_contents: Vec<_> = source_contents
.par_iter()
.map(escape_json_string)
.collect();
contents.push_list(quoted_source_contents.into_iter());
} else {
contents.push_list(source_contents.iter().map(escape_json_string));
}
};
}
if let Some(x_google_ignore_list) = &sourcemap.x_google_ignore_list {
contents.push("],\"x_google_ignoreList\":[".into());
contents.push_list(x_google_ignore_list.iter().map(ToString::to_string));
}
contents.push("],\"mappings\":\"".into());
contents.push(serialize_sourcemap_mappings(sourcemap).into());
contents.push("\"}".into());
// Check we calculated number of segments required correctly
debug_assert!(contents.num_segments() <= max_segments);
contents.consume()
}
/// Produce the VLQ `mappings` string for the whole source map.
///
/// Without `token_chunks`, all tokens are serialized as a single chunk starting from
/// all-zero "previous" state. With `token_chunks`, each chunk is serialized independently
/// (in parallel when the `concurrent` feature is enabled) and the results are concatenated
/// in chunk order.
#[allow(clippy::cast_possible_truncation)]
fn serialize_sourcemap_mappings(sm: &SourceMap) -> String {
    let Some(token_chunks) = sm.token_chunks.as_ref() else {
        let whole = TokenChunk::new(0, sm.tokens.len() as u32, 0, 0, 0, 0, 0, 0);
        return serialize_mappings(&sm.tokens, &whole);
    };

    // Serialize `tokens` to VLQ `mappings` in parallel when possible.
    #[cfg(feature = "concurrent")]
    let chunks = token_chunks.par_iter();
    #[cfg(not(feature = "concurrent"))]
    let chunks = token_chunks.iter();

    chunks.map(|chunk| serialize_mappings(&sm.tokens, chunk)).collect::<String>()
}
// Max length in bytes of a single VLQ encoding of the difference between two `u32`s
// (see `encode_vlq_diff`). `serialize_mappings` relies on this bound when reserving capacity
// ahead of its unchecked writes.
const MAX_VLQ_BYTES: usize = 7;
/// Serialize the tokens in `tokens[start..end]` (as described by `token_chunk`) into a
/// fragment of a VLQ `mappings` string.
///
/// `token_chunk` also carries the "previous" values (dst line/col, src line/col, name id,
/// source id) that the first token of the chunk is encoded relative to.
///
/// A `;` is emitted for every destination line break, a `,` between segments on the same
/// line, and a token identical to the one before it is skipped.
///
/// Tokens are expected to be ordered by destination line: `get_dst_line() - prev_dst_line`
/// is an unsigned subtraction and would underflow otherwise.
fn serialize_mappings(tokens: &[Token], token_chunk: &TokenChunk) -> String {
    let TokenChunk {
        start,
        end,
        mut prev_dst_line,
        mut prev_dst_col,
        mut prev_src_line,
        mut prev_src_col,
        mut prev_name_id,
        mut prev_source_id,
    } = *token_chunk;

    // Initial estimate of 10 bytes per token. This is only a starting point: capacity for each
    // token is reserved explicitly inside the loop. Multiply in `usize` so a very large chunk
    // cannot overflow the `u32` arithmetic.
    let capacity = (end - start) as usize * 10;

    let mut rv = String::with_capacity(capacity);

    let mut prev_token = if start == 0 { None } else { Some(&tokens[start as usize - 1]) };

    for token in &tokens[start as usize..end as usize] {
        // Max length of a single VLQ encoding is 7 bytes. Max number of calls to `encode_vlq_diff` is 5.
        // Also need 1 byte for each line number difference, or 1 byte if no line num difference.
        // Reserve this amount of capacity in `rv` early, so can skip bounds checks in code below.
        // As well as skipping the bounds checks, this also removes a function call to
        // `alloc::raw_vec::RawVec::grow_one` for every byte that's pushed.
        // https://godbolt.org/z/44G8jjss3
        const MAX_TOTAL_VLQ_BYTES: usize = 5 * MAX_VLQ_BYTES;

        let num_line_breaks = token.get_dst_line() - prev_dst_line;
        if num_line_breaks != 0 {
            rv.reserve(MAX_TOTAL_VLQ_BYTES + num_line_breaks as usize);
            // SAFETY: We have reserved sufficient capacity for `num_line_breaks` bytes
            unsafe { push_bytes_unchecked(&mut rv, b';', num_line_breaks) };
            prev_dst_col = 0;
            prev_dst_line += num_line_breaks;
        } else if let Some(prev_token) = prev_token {
            if prev_token == token {
                continue;
            }
            rv.reserve(MAX_TOTAL_VLQ_BYTES + 1);
            // SAFETY: We have reserved sufficient capacity for 1 byte
            unsafe { push_byte_unchecked(&mut rv, b',') };
        } else {
            // Very first token, on the same line as the chunk's starting line: no separator
            // is written, but the VLQ writes below are still unchecked, so capacity must be
            // reserved here too. The initial 10-bytes-per-token estimate is not enough
            // to cover the worst case of `MAX_TOTAL_VLQ_BYTES`.
            rv.reserve(MAX_TOTAL_VLQ_BYTES);
        }

        // SAFETY: Every branch above which reaches here has reserved at least
        // `MAX_TOTAL_VLQ_BYTES` spare capacity (after any separator bytes), which satisfies
        // the safety contract of `encode_vlq_diff` for all (max 5) calls below
        unsafe {
            encode_vlq_diff(&mut rv, token.get_dst_col(), prev_dst_col);
            prev_dst_col = token.get_dst_col();

            if let Some(source_id) = token.get_source_id() {
                encode_vlq_diff(&mut rv, source_id, prev_source_id);
                prev_source_id = source_id;
                encode_vlq_diff(&mut rv, token.get_src_line(), prev_src_line);
                prev_src_line = token.get_src_line();
                encode_vlq_diff(&mut rv, token.get_src_col(), prev_src_col);
                prev_src_col = token.get_src_col();
                if let Some(name_id) = token.get_name_id() {
                    encode_vlq_diff(&mut rv, name_id, prev_name_id);
                    prev_name_id = name_id;
                }
            }
        }

        prev_token = Some(token);
    }

    rv
}
/// Encode the signed difference `a - b` as VLQ and append it to `out`.
/// Appends between 1 byte (`a == b`) and 7 bytes (`a = 0`, `b = u32::MAX`).
///
/// # SAFETY
/// `out` must have at least 7 bytes of spare capacity on entry,
/// because nothing in this function checks bounds before writing.
#[inline]
unsafe fn encode_vlq_diff(out: &mut String, a: u32, b: u32) {
    // Widen to `i64` first so the subtraction cannot overflow
    let diff = i64::from(a) - i64::from(b);
    encode_vlq(out, diff);
}
// Base64 alphabet lookup table.
// Wrapper type forces 64-byte alignment, so the 64-byte table sits within a single cache line.
#[repr(align(64))]
struct Aligned64([u8; 64]);

static B64_CHARS: Aligned64 =
    Aligned64(*b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/");
/// Encode number as VLQ and push encoding into `out`.
/// Will push between 1 byte (num = 0) and 7 bytes (num = -u32::MAX).
///
/// The sign is moved into the lowest bit (1 = negative) with the magnitude shifted up by one.
/// The result is then emitted 5 bits at a time, least significant group first, one base64
/// char per group. Every char except the last is taken from the upper half of `B64_CHARS`
/// (index + 32), which marks that more chars follow.
///
/// # SAFETY
/// Caller must ensure at least 7 bytes spare capacity in `out`,
/// as this function does not perform any bounds checks.
///
/// The 7-byte maximum holds when `num` is the difference of two `u32` values
/// (as produced by `encode_vlq_diff`). An `i64` of sufficiently larger magnitude
/// would need more bytes, so callers must not pass such values.
#[allow(
clippy::cast_possible_truncation,
clippy::cast_sign_loss,
clippy::unnecessary_safety_comment
)]
unsafe fn encode_vlq(out: &mut String, num: i64) {
// Sign goes in bit 0, magnitude in the bits above it
let mut num = if num < 0 { ((-num) << 1) + 1 } else { num << 1 };
// Breaking out of loop early when have reached last char (rather than conditionally adding
// 32 for last char within the loop) removes 3 instructions from the loop.
// https://godbolt.org/z/Es4Pavh9j
// This translates to a 16% speed-up for VLQ encoding.
let mut digit;
loop {
// Take the lowest 5 bits as the next digit
digit = num & 0b11111;
num >>= 5;
if num == 0 {
// `digit` is the final one - it is written after the loop, without the +32 offset
break;
}
// More digits follow, so use upper half of the table (digit + 32)
let b = B64_CHARS.0[digit as usize + 32];
// SAFETY:
// * This loop can execute a maximum of 7 times, and on last turn will exit before getting here.
// Caller promises there are at least 7 bytes spare capacity in `out` at start. We only
// push 1 byte on each turn, so guaranteed there is at least 1 byte capacity in `out` here.
// * All values in `B64_CHARS` lookup table are ASCII bytes.
push_byte_unchecked(out, b);
}
// Final digit comes from lower half of the table (no +32 offset)
let b = B64_CHARS.0[digit as usize];
// SAFETY:
// * The loop above pushes max 6 bytes. Caller promises there are at least 7 bytes spare capacity
// in `out` at start. So guaranteed there is at least 1 byte capacity in `out` here.
// * All values in `B64_CHARS` lookup table are ASCII bytes.
push_byte_unchecked(out, b);
}
/// Append a single byte to `out`, skipping the capacity check.
///
/// # SAFETY
/// * `out` must have spare capacity for at least 1 more byte.
/// * `b` must be ASCII (i.e. `< 128`), so `out` remains valid UTF-8.
//
// `#[inline(always)]` to ensure that `len` is stored in a register during `encode_vlq`'s loop.
#[allow(clippy::inline_always)]
#[inline(always)]
unsafe fn push_byte_unchecked(out: &mut String, b: u8) {
    debug_assert!(out.len() < out.capacity());
    debug_assert!(b.is_ascii());

    let bytes = out.as_mut_vec();
    let old_len = bytes.len();
    // Write into the first spare slot, then extend the length to cover it
    bytes.as_mut_ptr().add(old_len).write(b);
    bytes.set_len(old_len + 1);
}
/// Append `repeats` copies of byte `b` to `out`, skipping the capacity check.
///
/// # SAFETY
/// * `out` must have spare capacity for at least `repeats` more bytes.
/// * `b` must be ASCII (i.e. `< 128`), so `out` remains valid UTF-8.
#[inline]
unsafe fn push_bytes_unchecked(out: &mut String, b: u8, repeats: u32) {
    let count = repeats as usize;
    debug_assert!(out.capacity() - out.len() >= count);
    debug_assert!(b.is_ascii());

    let bytes = out.as_mut_vec();
    let old_len = bytes.len();
    // Fill the spare capacity directly after the current contents with `count` copies of `b`
    bytes.as_mut_ptr().add(old_len).write_bytes(b, count);
    bytes.set_len(old_len + count);
}
/// Collects string segments and joins them into a single `String` allocated exactly once.
///
/// Segments are held as `Cow<'a, str>` while their total byte length is tracked in `len`,
/// so `consume` can allocate the final `String` with the right capacity up front.
struct PreAllocatedString<'a> {
    buf: Vec<Cow<'a, str>>,
    len: usize,
}

impl<'a> PreAllocatedString<'a> {
    /// Create an empty buffer with room for `max_segments` segments.
    fn new(max_segments: usize) -> Self {
        let buf = Vec::with_capacity(max_segments);
        Self { buf, len: 0 }
    }

    /// Append one segment.
    #[inline]
    fn push(&mut self, s: Cow<'a, str>) {
        let added = s.len();
        self.len += added;
        self.buf.push(s);
    }

    /// Append every item of `iter`, with a `,` segment between consecutive items.
    /// An empty iterator appends nothing.
    #[inline]
    fn push_list<I>(&mut self, mut iter: I)
    where
        I: Iterator<Item = String>,
    {
        if let Some(head) = iter.next() {
            self.push(Cow::Owned(head));
            for item in iter {
                self.push(Cow::Borrowed(","));
                self.push(Cow::Owned(item));
            }
        }
    }

    /// Concatenate all segments, in push order, into one `String`.
    #[inline]
    fn consume(self) -> String {
        let Self { buf: segments, len: total_len } = self;
        let mut joined = String::with_capacity(total_len);
        for segment in &segments {
            joined.push_str(segment);
        }
        joined
    }

    /// Number of segments pushed so far (separators included).
    fn num_segments(&self) -> usize {
        self.buf.len()
    }
}
/// Serialize `s` as a JSON string literal (surrounding quotes included) using `serde_json`.
fn escape_json_string<S: AsRef<str>>(s: S) -> String {
    let raw = s.as_ref();
    let mut json_bytes = Vec::with_capacity(raw.len() * 2 + 2);
    {
        let mut serializer = serde_json::Serializer::new(&mut json_bytes);
        // Serialization can only fail if the underlying writer fails,
        // and writes to a `Vec<u8>` never fail, so this `unwrap` cannot panic.
        serde::Serialize::serialize(raw, &mut serializer).unwrap();
    }
    // Safety: `json_bytes` is valid utf8.
    unsafe { String::from_utf8_unchecked(json_bytes) }
}
/// Checks `escape_json_string` on single characters: quotes, backslash and control characters
/// are escaped, while `/` and non-ASCII characters are passed through unchanged.
#[test]
fn test_escape_json_string() {
    // (input char, expected JSON string literal)
    const FIXTURES: &[(char, &str)] = &[
        ('n', "\"n\""),
        ('"', "\"\\\"\""),
        ('\\', "\"\\\\\""),
        ('/', "\"/\""),
        ('\x08', "\"\\b\""),
        ('\x0C', "\"\\f\""),
        ('\n', "\"\\n\""),
        ('\r', "\"\\r\""),
        ('\t', "\"\\t\""),
        ('\x0B', "\"\\u000b\""),
        // Non-ASCII characters are not escaped
        ('虎', "\"虎\""),
        ('\u{3A3}', "\"\u{3A3}\""),
    ];

    for (c, expected) in FIXTURES {
        let mut input = String::new();
        input.push(*c);
        assert_eq!(escape_json_string(input), *expected);
    }
}
/// Round-trip test: parse a source map, serialize it, parse the result again,
/// and check that both parses yield the same tokens.
#[test]
fn test_encode() {
    let input = r#"{
"version": 3,
"sources": ["coolstuff.js"],
"sourceRoot": "x",
"names": ["x","alert"],
"mappings": "AAAA,GAAIA,GAAI,EACR,IAAIA,GAAK,EAAG,CACVC,MAAM"
}"#;
    let sm = SourceMap::from_json_string(input).unwrap();
    let sm2 = SourceMap::from_json_string(&sm.to_json_string()).unwrap();

    // `zip` stops at the shorter iterator, so compare lengths explicitly -
    // otherwise a round trip which drops or adds tokens would go unnoticed.
    assert_eq!(sm.get_tokens().count(), sm2.get_tokens().count());

    for (tok1, tok2) in sm.get_tokens().zip(sm2.get_tokens()) {
        assert_eq!(tok1, tok2);
    }
}
/// NUL characters in names, sources and source contents must be escaped as `\u0000`
/// in the serialized JSON.
#[test]
fn test_encode_escape_string() {
    let names = vec!["name_length_greater_than_16_\0".into()];
    let sources = vec!["\0".into()];
    let source_contents = Some(vec!["emoji-👀-\0".into()]);

    let mut sm = SourceMap::new(None, names, None, sources, source_contents, vec![], None);
    sm.set_x_google_ignore_list(vec![0]);

    let expected = r#"{"version":3,"names":["name_length_greater_than_16_\u0000"],"sources":["\u0000"],"sourcesContent":["emoji-👀-\u0000"],"x_google_ignoreList":[0],"mappings":""}"#;
    assert_eq!(sm.to_json_string(), expected);
}
// Checks the exact VLQ output of `encode_vlq_diff` for values either side of each
// output-length boundary, for both positive (`a > b`) and negative (`a < b`) differences.
#[test]
fn test_vlq_encode_diff() {
// Most important tests here are that with maximum values, `encode_vlq_diff` pushes maximum of 7 bytes.
// This invariant is essential to safety of `encode_vlq_diff`.
// Each fixture is `(a, b, expected encoding of a - b)`.
#[rustfmt::skip]
const FIXTURES: &[(u32, u32, &str)] = &[
(0, 0, "A"),
(1, 0, "C"),
(2, 0, "E"),
(15, 0, "e"),
(16, 0, "gB"),
(511, 0, "+f"),
(512, 0, "ggB"),
(16_383, 0, "+/f"),
(16_384, 0, "gggB"),
(524_287, 0, "+//f"),
(524_288, 0, "ggggB"),
(16_777_215, 0, "+///f"),
(16_777_216, 0, "gggggB"),
(536_870_911, 0, "+////f"),
(536_870_912, 0, "ggggggB"),
(u32::MAX, 0, "+/////H"), // 7 bytes
(0, 1, "D"),
(0, 2, "F"),
(0, 15, "f"),
(0, 16, "hB"),
(0, 511, "/f"),
(0, 512, "hgB"),
(0, 16_383, "//f"),
(0, 16_384, "hggB"),
(0, 524_287, "///f"),
(0, 524_288, "hgggB"),
(0, 16_777_215, "////f"),
(0, 16_777_216, "hggggB"),
(0, 536_870_911, "/////f"),
(0, 536_870_912, "hgggggB"),
(0, u32::MAX, "//////H"), // 7 bytes
];
for (a, b, res) in FIXTURES.iter().copied() {
// Allocate exactly the documented minimum, so that in builds with debug assertions
// enabled, an encoding longer than 7 bytes trips the `debug_assert!` in `push_byte_unchecked`
let mut out = String::with_capacity(MAX_VLQ_BYTES);
// SAFETY: `out` has 7 bytes spare capacity
unsafe { encode_vlq_diff(&mut out, a, b) };
assert_eq!(&out, res);
}
}