refactor(codegen): add CodeBuffer to fix soundness hole (#6148)

# What This PR Does

Adds `CodeBuffer`, a simple wrapper over a `Vec<u8>` with a protective and reduced API for upholding UTF-8 validity guarantees. Closes #6147.

Note that this struct is actually quite small. Most of the added lines are doc comments.
This commit is contained in:
DonIsaac 2024-10-13 09:14:48 +00:00
parent 8fe1b0a0f8
commit 204bf5533e
4 changed files with 449 additions and 20 deletions

View file

@ -0,0 +1,427 @@
/// A string builder for constructing source code.
///
/// `CodeBuffer` provides safe abstractions over a byte array, allowing for
/// a compact byte-array representation without soundness holes: every safe
/// method keeps the underlying bytes valid UTF-8, and the methods that could
/// break that guarantee are marked `unsafe`.
///
/// Use one of the various `print_*` methods to add text into a buffer. When you
/// are done, call [`take_source_text`] to extract the final [`String`].
///
/// # Examples
/// ```
/// use oxc_codegen::CodeBuffer;
/// let mut code = CodeBuffer::new();
///
/// // mock settings
/// let is_public = true;
///
/// if is_public {
///     code.print_str("export ");
/// }
/// code.print_str("function foo() {\n");
/// code.print_str("    console.log('Hello, world!');\n");
/// code.print_str("}\n");
///
/// let source = code.take_source_text();
/// ```
///
/// [`take_source_text`]: CodeBuffer::take_source_text
#[derive(Debug, Default, Clone)]
pub struct CodeBuffer {
    /// INVARIANT: `buf` is a valid UTF-8 string.
    buf: Vec<u8>,
}

impl CodeBuffer {
    /// Create a new empty `CodeBuffer`.
    ///
    /// ## Examples
    ///
    /// ```
    /// use oxc_codegen::CodeBuffer;
    /// let mut code = CodeBuffer::new();
    ///
    /// // use `code` to build new source text
    /// code.print_str("fn main() { println!(\"Hello, world!\"); }");
    /// let source_text = code.take_source_text();
    /// ```
    pub fn new() -> Self {
        Self::default()
    }

    /// Create a new, empty `CodeBuffer` with the specified capacity.
    ///
    /// The buffer will be able to hold at least `capacity` bytes without
    /// reallocating. This method is allowed to allocate for more bytes than
    /// `capacity`. If `capacity` is 0, the buffer will not allocate.
    ///
    /// It is important to note that although the returned buffer has the
    /// minimum *capacity* specified, the buffer will have a zero *length*.
    ///
    /// # Panics
    ///
    /// Panics if the new capacity exceeds `isize::MAX` _bytes_.
    pub fn with_capacity(capacity: usize) -> Self {
        Self { buf: Vec::with_capacity(capacity) }
    }

    /// Returns the number of bytes in this buffer.
    ///
    /// This is _not_ the same as the number of characters in the buffer, since
    /// non-ASCII characters require multiple bytes.
    pub fn len(&self) -> usize {
        self.buf.len()
    }

    /// Returns `true` if this buffer contains no bytes (and therefore no
    /// characters).
    ///
    /// # Examples
    ///
    /// ```
    /// # use oxc_codegen::CodeBuffer;
    /// let mut code = CodeBuffer::new();
    /// assert!(code.is_empty());
    ///
    /// code.print_char('c');
    /// assert!(!code.is_empty());
    /// ```
    pub fn is_empty(&self) -> bool {
        self.buf.is_empty()
    }

    /// Reserves capacity for at least `additional` more bytes in the given
    /// `CodeBuffer`. The buffer may reserve more space to speculatively avoid
    /// frequent reallocations. After calling `reserve`, capacity will be
    /// greater than or equal to `self.len() + additional`. Does nothing if
    /// capacity is already sufficient.
    ///
    /// # Panics
    ///
    /// Panics if the new capacity exceeds `isize::MAX` _bytes_.
    ///
    /// # Examples
    ///
    /// ```
    /// use oxc_codegen::CodeBuffer;
    /// let mut code = CodeBuffer::default();
    /// code.reserve(10);
    /// ```
    #[inline]
    pub fn reserve(&mut self, additional: usize) {
        self.buf.reserve(additional);
    }

    /// Peek the `n`th character from the end of the buffer.
    ///
    /// Counting is done in characters, not bytes. When `n` is zero, the last
    /// character is returned. Returns [`None`] if the buffer contains `n` or
    /// fewer characters.
    ///
    /// ## Examples
    /// ```
    /// # use oxc_codegen::CodeBuffer;
    /// let mut code = CodeBuffer::new();
    /// code.print_str("foo");
    ///
    /// assert_eq!(code.peek_nth_back(0), Some('o'));
    /// assert_eq!(code.peek_nth_back(2), Some('f'));
    /// assert_eq!(code.peek_nth_back(3), None);
    /// ```
    #[inline]
    #[must_use = "Peeking is pointless if the peeked char isn't used"]
    pub fn peek_nth_back(&self, n: usize) -> Option<char> {
        // SAFETY: `buf` is a valid UTF-8 string because of invariants upheld by CodeBuffer
        let text = unsafe { std::str::from_utf8_unchecked(&self.buf) };
        text.chars().nth_back(n)
    }

    /// Push a single ASCII byte into the buffer.
    ///
    /// # Panics
    /// If `b` is not in the ASCII range (`0 - 0x7F`).
    ///
    /// # Examples
    ///
    /// ```
    /// use oxc_codegen::CodeBuffer;
    /// let mut code = CodeBuffer::new();
    /// code.print_ascii_byte(b'f');
    /// code.print_ascii_byte(b'o');
    /// code.print_ascii_byte(b'o');
    ///
    /// let source = code.take_source_text();
    /// assert_eq!(source, "foo");
    /// ```
    #[inline]
    pub fn print_ascii_byte(&mut self, b: u8) {
        // NOTE: since this method is inlined, this assertion should get
        // optimized away by the compiler when the value of `b` is known,
        // e.g. when printing a constant.
        assert!(b.is_ascii(), "byte {b} is not ASCII");
        self.buf.push(b);
    }

    /// Print a byte without checking that this buffer still represents a valid
    /// UTF-8 string.
    ///
    /// If you are looking to print a byte you know is valid ASCII, prefer
    /// [`print_ascii_byte`]. If you are not certain, you may use [`print_char`]
    /// as a safe alternative.
    ///
    /// # Safety
    /// The caller must ensure that, after 1 or more sequential calls, this
    /// buffer represents a valid UTF-8 string.
    ///
    /// It is safe for a single call to temporarily result in invalid UTF-8, as
    /// long as UTF-8 integrity is restored before calls to any other `print`
    /// method or [`take_source_text`]. This lets you, for example, print a
    /// 4-byte code point using 4 separate calls to this method.
    ///
    /// If you find yourself in such a scenario, consider using
    /// [`print_unchecked`] instead.
    ///
    /// # Examples
    ///
    /// ```
    /// use oxc_codegen::CodeBuffer;
    /// let mut code = CodeBuffer::new();
    /// // Safe: 'a' is a valid ASCII character. Its UTF-8 representation only
    /// // requires a single byte.
    /// unsafe { code.print_byte_unsafe(b'a') };
    ///
    /// let not_ascii = '⚓';
    /// let as_bytes = not_ascii.to_string().into_bytes();
    /// // Safe: after this loop completes, `code` returns to a valid state.
    /// for byte in as_bytes {
    ///     unsafe { code.print_byte_unsafe(byte) };
    /// }
    ///
    /// // NOT SAFE: `0xFF` exceeds the ASCII range and never appears in valid
    /// // UTF-8. `code` is no longer valid UTF-8.
    /// unsafe { code.print_byte_unsafe(0xFF) };
    /// ```
    ///
    /// [`print_ascii_byte`]: CodeBuffer::print_ascii_byte
    /// [`print_char`]: CodeBuffer::print_char
    /// [`take_source_text`]: CodeBuffer::take_source_text
    /// [`print_unchecked`]: CodeBuffer::print_unchecked
    #[inline]
    pub unsafe fn print_byte_unsafe(&mut self, ch: u8) {
        self.buf.push(ch);
    }

    /// Print a single Unicode character into the buffer.
    ///
    /// The character is appended in its UTF-8 encoding, so this may add between
    /// one and four bytes.
    ///
    /// When pushing multiple characters, consider choosing [`print_str`] over
    /// this method since it's much more efficient. If you really want to insert
    /// only a single character and you're certain it's ASCII, consider using
    /// [`print_ascii_byte`].
    ///
    /// ## Examples
    ///
    /// ```
    /// use oxc_codegen::CodeBuffer;
    /// let mut code = CodeBuffer::new();
    ///
    /// code.print_char('f');
    /// code.print_char('o');
    /// code.print_char('o');
    ///
    /// assert_eq!(String::from(code), "foo");
    /// ```
    ///
    /// [`print_str`]: CodeBuffer::print_str
    /// [`print_ascii_byte`]: CodeBuffer::print_ascii_byte
    #[inline]
    pub fn print_char(&mut self, ch: char) {
        // Four bytes is the longest UTF-8 encoding of any `char`.
        let mut scratch = [0; 4];
        self.buf.extend_from_slice(ch.encode_utf8(&mut scratch).as_bytes());
    }

    /// Push a string into the buffer.
    ///
    /// # Examples
    ///
    /// ```
    /// use oxc_codegen::CodeBuffer;
    /// let mut code = CodeBuffer::new();
    /// code.print_str("function main() { console.log('Hello, world!') }");
    /// ```
    #[inline]
    pub fn print_str<S: AsRef<str>>(&mut self, s: S) {
        self.buf.extend_from_slice(s.as_ref().as_bytes());
    }

    /// Push a sequence of ASCII bytes into the buffer.
    ///
    /// # Panics
    /// If any byte in the iterator is not valid ASCII. Bytes yielded before the
    /// offending one have already been appended at that point.
    ///
    /// # Examples
    ///
    /// ```
    /// use oxc_codegen::CodeBuffer;
    /// let mut code = CodeBuffer::new();
    ///
    /// code.print_ascii([b'f', b'o', b'o'].into_iter());
    /// assert_eq!(String::from(code), "foo");
    /// ```
    pub fn print_ascii<I>(&mut self, chars: I)
    where
        I: IntoIterator<Item = u8>,
    {
        let iter = chars.into_iter();
        // Only the lower bound of `size_hint` is a guarantee. The upper bound
        // may wildly over-estimate (up to `usize::MAX`), and reserving it would
        // over-allocate or panic with a capacity overflow.
        let (lower_bound, _) = iter.size_hint();
        self.buf.reserve(lower_bound);
        for byte in iter {
            self.print_ascii_byte(byte);
        }
    }

    /// Print a sequence of bytes without checking that this buffer still
    /// represents a valid UTF-8 string.
    ///
    /// # Safety
    ///
    /// The caller must ensure that, after this call returns, this buffer
    /// represents a valid UTF-8 string. In practice, this means only two cases
    /// are valid:
    ///
    /// 1. Both the buffer and the byte sequence are valid UTF-8,
    /// 2. The buffer became invalid after a call to [`print_byte_unsafe`] and `bytes`
    ///    completes any incomplete code points, returning the buffer to a valid
    ///    state.
    ///
    /// # Examples
    ///
    /// This method is crate-private, so the example is not compiled as a
    /// doctest.
    ///
    /// ```ignore
    /// use oxc_codegen::CodeBuffer;
    /// let mut code = CodeBuffer::new();
    ///
    /// // Indent to a dynamic level. Sound because all elements in this
    /// // iterator are valid 1-byte UTF-8 code points (ASCII).
    /// unsafe {
    ///     code.print_unchecked(std::iter::repeat(b' ').take(4));
    /// }
    /// ```
    ///
    /// [`print_byte_unsafe`]: CodeBuffer::print_byte_unsafe
    #[inline]
    pub(crate) unsafe fn print_unchecked<I>(&mut self, bytes: I)
    where
        I: IntoIterator<Item = u8>,
    {
        self.buf.extend(bytes);
    }

    /// Convert a `CodeBuffer` into a string of source code, leaving its
    /// internal buffer empty and finalizing the codegen process.
    ///
    /// It is safe to re-use a buffer after calling this method: it is left in
    /// the same state as a freshly created, empty `CodeBuffer`. Note that the
    /// existing allocation is moved into the returned [`String`], so the buffer
    /// does not keep its previous capacity. You may use [`String::from`] if you
    /// don't intend on re-using the buffer.
    ///
    /// # Panics
    ///
    /// In builds with debug assertions enabled, panics if the buffer is not
    /// valid UTF-8. That can only happen through improper use of the `unsafe`
    /// printing methods.
    ///
    /// # Examples
    ///
    /// ```
    /// use oxc_codegen::CodeBuffer;
    /// let mut code = CodeBuffer::new();
    /// code.print_str("console.log('foo');");
    ///
    /// let source = code.take_source_text();
    /// assert_eq!(source, "console.log('foo');");
    /// assert!(code.is_empty());
    /// ```
    #[must_use]
    pub fn take_source_text(&mut self) -> String {
        Self::bytes_into_string(std::mem::take(&mut self.buf))
    }

    /// Turn the raw bytes of a `CodeBuffer` into a `String`.
    ///
    /// Shared by [`CodeBuffer::take_source_text`] and `String::from(CodeBuffer)`
    /// so both conversions validate (debug) or trust (release) the bytes in
    /// exactly the same way.
    #[inline]
    fn bytes_into_string(bytes: Vec<u8>) -> String {
        if cfg!(debug_assertions) {
            // Catch misuse of the `unsafe` printing methods early in debug builds.
            String::from_utf8(bytes)
                .expect("`CodeBuffer` invariant violated: buffer is not valid UTF-8")
        } else {
            // SAFETY: `buf` is valid UTF-8 because of invariants upheld by
            // `CodeBuffer`. If, for some reason, it is not, this is caused by
            // improper use of `unsafe` printing methods.
            unsafe { String::from_utf8_unchecked(bytes) }
        }
    }
}

/// Read-only access to the raw UTF-8 bytes written so far.
impl AsRef<[u8]> for CodeBuffer {
    fn as_ref(&self) -> &[u8] {
        &self.buf
    }
}

/// Consume a `CodeBuffer`, yielding everything printed into it as a `String`.
impl From<CodeBuffer> for String {
    #[inline]
    fn from(code: CodeBuffer) -> Self {
        CodeBuffer::bytes_into_string(code.buf)
    }
}
#[cfg(test)]
mod test {
    //! Unit tests for `CodeBuffer`: construction, each printing method,
    //! peeking, and conversion into a `String`.
    use super::CodeBuffer;

    /// A default buffer is empty and converts to an empty string.
    #[test]
    fn test_empty() {
        let code = CodeBuffer::default();
        assert!(code.is_empty());
        assert_eq!(code.len(), 0);
        assert_eq!(String::from(code), "");
    }

    /// Printing a string and converting back yields the same string.
    #[test]
    fn test_string_isomorphism() {
        let s = "Hello, world!";
        let mut code = CodeBuffer::with_capacity(s.len());
        code.print_str(s);
        assert_eq!(code.len(), s.len());
        assert_eq!(String::from(code), s.to_string());
    }

    /// `take_source_text` returns the contents and leaves the buffer empty.
    #[test]
    fn test_into_source_string() {
        let s = "Hello, world!";
        let mut code = CodeBuffer::with_capacity(s.len());
        code.print_str(s);
        let source = code.take_source_text();
        assert_eq!(source, s);
        // buffer has been emptied
        assert!(code.is_empty());
        assert_eq!(code.len(), 0);
        let empty_slice: &[u8] = &[];
        assert_eq!(code.as_ref(), empty_slice);
        assert_eq!(String::from(code), "");
    }

    /// ASCII bytes are appended one at a time, in order.
    #[test]
    // The expected value is deliberately spelled as individual byte literals to
    // mirror the individual `print_ascii_byte` calls.
    #[allow(clippy::byte_char_slices)]
    fn test_print_ascii_byte() {
        let mut code = CodeBuffer::new();
        code.print_ascii_byte(b'f');
        code.print_ascii_byte(b'o');
        code.print_ascii_byte(b'o');
        assert_eq!(code.len(), 3);
        assert_eq!(code.as_ref(), &[b'f', b'o', b'o']);
        assert_eq!(String::from(code), "foo");
    }

    /// Non-ASCII bytes are rejected by the checked single-byte method.
    #[test]
    #[should_panic(expected = "is not ASCII")]
    fn test_print_ascii_byte_rejects_non_ascii() {
        let mut code = CodeBuffer::new();
        code.print_ascii_byte(0xFF);
    }

    /// A multi-byte character can be written byte-by-byte through the unsafe
    /// method as long as the full sequence is completed.
    #[test]
    fn test_print_byte_unsafe() {
        let anchor = "⚓";
        let mut code = CodeBuffer::new();
        for byte in anchor.bytes() {
            // SAFETY: the loop writes every byte of a valid UTF-8 string, so
            // the buffer is valid UTF-8 again once the loop finishes.
            unsafe { code.print_byte_unsafe(byte) };
        }
        assert_eq!(code.len(), anchor.len());
        assert_eq!(String::from(code), anchor);
    }

    /// `print_char` appends the UTF-8 encoding of ASCII and non-ASCII chars.
    #[test]
    fn test_print_char() {
        let mut code = CodeBuffer::new();
        code.print_char('f');
        code.print_char('⚓');
        assert_eq!(code.len(), "f⚓".len());
        assert_eq!(String::from(code), "f⚓");
    }

    /// `print_ascii` appends every byte yielded by the iterator.
    #[test]
    fn test_print_ascii() {
        let mut code = CodeBuffer::new();
        code.print_ascii("foo".bytes());
        assert_eq!(code.len(), 3);
        assert_eq!(String::from(code), "foo");
    }

    /// Peeking counts characters from the end of the buffer.
    #[test]
    fn test_peek() {
        let mut code = CodeBuffer::new();
        code.print_str("foo");
        assert_eq!(code.peek_nth_back(0), Some('o'));
        assert_eq!(code.peek_nth_back(2), Some('f'));
        assert_eq!(code.peek_nth_back(3), None);
    }

    /// Peeking steps over whole characters, not bytes.
    #[test]
    fn test_peek_multibyte() {
        let mut code = CodeBuffer::new();
        code.print_str("a⚓b");
        assert_eq!(code.peek_nth_back(0), Some('b'));
        assert_eq!(code.peek_nth_back(1), Some('⚓'));
        assert_eq!(code.peek_nth_back(2), Some('a'));
        assert_eq!(code.peek_nth_back(3), None);
    }
}

View file

@ -87,7 +87,7 @@ impl<'a> Codegen<'a> {
if comments.first().is_some_and(|c| c.preceded_by_newline) {
// Skip printing newline if this comment is already on a newline.
if self.peek_nth(0).is_some_and(|c| c != '\n' && c != '\t') {
if self.peek_nth_back(0).is_some_and(|c| c != '\n' && c != '\t') {
self.print_hard_newline();
self.print_indent();
}

View file

@ -1194,7 +1194,7 @@ impl<'a> Gen for BigIntLiteral<'a> {
impl<'a> Gen for RegExpLiteral<'a> {
fn gen(&self, p: &mut Codegen, _ctx: Context) {
p.add_source_mapping(self.span.start);
let last = p.peek_nth(0);
let last = p.peek_nth_back(0);
let pattern_text = self.regex.pattern.source_text(p.source_text);
// Avoid forming a single-line comment or "</script" sequence
if Some('/') == last

View file

@ -4,6 +4,7 @@
//! * [esbuild](https://github.com/evanw/esbuild/blob/main/internal/js_printer/js_printer.go)
mod binary_expr_visitor;
mod code_buffer;
mod comment;
mod context;
mod gen;
@ -24,8 +25,8 @@ use oxc_syntax::{
};
use crate::{
binary_expr_visitor::BinaryExpressionVisitor, comment::CommentsMap, operator::Operator,
sourcemap_builder::SourcemapBuilder,
binary_expr_visitor::BinaryExpressionVisitor, code_buffer::CodeBuffer, comment::CommentsMap,
operator::Operator, sourcemap_builder::SourcemapBuilder,
};
pub use crate::{
context::Context,
@ -102,7 +103,7 @@ pub struct Codegen<'a> {
mangler: Option<Mangler>,
/// Output Code
code: Vec<u8>,
code: CodeBuffer,
// states
prev_op_end: usize,
@ -171,7 +172,7 @@ impl<'a> Codegen<'a> {
comments: CommentsMap::default(),
start_of_annotation_comment: None,
mangler: None,
code: vec![],
code: CodeBuffer::default(),
needs_semicolon: false,
need_space_before_dot: 0,
print_next_indent_as_space: false,
@ -221,20 +222,19 @@ impl<'a> Codegen<'a> {
#[must_use]
pub fn into_source_text(&mut self) -> String {
// SAFETY: criteria of `from_utf8_unchecked` are met.
unsafe { String::from_utf8_unchecked(std::mem::take(&mut self.code)) }
self.code.take_source_text()
}
/// Push a single character into the buffer
#[inline]
pub fn print_char(&mut self, ch: u8) {
self.code.push(ch);
self.code.print_ascii_byte(ch);
}
/// Push str into the buffer
#[inline]
pub fn print_str(&mut self, s: &str) {
self.code.extend(s.as_bytes());
self.code.print_str(s);
}
#[inline]
@ -245,7 +245,7 @@ impl<'a> Codegen<'a> {
// Private APIs
impl<'a> Codegen<'a> {
fn code(&self) -> &Vec<u8> {
fn code(&self) -> &CodeBuffer {
&self.code
}
@ -256,7 +256,7 @@ impl<'a> Codegen<'a> {
#[inline]
fn print_soft_space(&mut self) {
if !self.options.minify {
self.print_char(b' ');
self.code.print_ascii_byte(b' ');
}
}
@ -290,7 +290,7 @@ impl<'a> Codegen<'a> {
#[inline]
fn print_space_before_identifier(&mut self) {
if self
.peek_nth(0)
.peek_nth_back(0)
.is_some_and(|ch| is_identifier_part(ch) || self.prev_reg_exp_end == self.code.len())
{
self.print_hard_space();
@ -298,9 +298,8 @@ impl<'a> Codegen<'a> {
}
#[inline]
fn peek_nth(&self, n: usize) -> Option<char> {
// SAFETY: criteria of `from_utf8_unchecked` are met.
unsafe { std::str::from_utf8_unchecked(self.code()) }.chars().nth_back(n)
fn peek_nth_back(&self, n: usize) -> Option<char> {
self.code.peek_nth_back(n)
}
#[inline]
@ -327,7 +326,10 @@ impl<'a> Codegen<'a> {
self.print_next_indent_as_space = false;
return;
}
self.code.extend(std::iter::repeat(b'\t').take(self.indent as usize));
// SAFETY: this iterator only yields tabs, which are always valid ASCII characters.
unsafe {
self.code.print_unchecked(std::iter::repeat(b'\t').take(self.indent as usize));
}
}
#[inline]
@ -528,7 +530,7 @@ impl<'a> Codegen<'a> {
|| ((prev == bin_op_sub || prev == un_op_neg)
&& (next == bin_op_sub || next == un_op_neg || next == un_op_pre_dec))
|| (prev == un_op_post_dec && next == bin_op_gt)
|| (prev == un_op_not && next == un_op_pre_dec && self.peek_nth(1) == Some('<'))
|| (prev == un_op_not && next == un_op_pre_dec && self.peek_nth_back(1) == Some('<'))
{
self.print_hard_space();
}
@ -554,13 +556,13 @@ impl<'a> Codegen<'a> {
fn add_source_mapping(&mut self, position: u32) {
if let Some(sourcemap_builder) = self.sourcemap_builder.as_mut() {
sourcemap_builder.add_source_mapping(&self.code, position, None);
sourcemap_builder.add_source_mapping(self.code.as_ref(), position, None);
}
}
fn add_source_mapping_for_name(&mut self, span: Span, name: &str) {
if let Some(sourcemap_builder) = self.sourcemap_builder.as_mut() {
sourcemap_builder.add_source_mapping_for_name(&self.code, span, name);
sourcemap_builder.add_source_mapping_for_name(self.code.as_ref(), span, name);
}
}
}