refactor(codegen): add CodeBuffer to fix soundness hole (#6148)

# What This PR Does Adds `CodeBuffer`, a simple wrapper over a `Vec<u8>` with a protective and reduced API for upholding UTF-8 validity guarantees. Closes #6147. Note that this struct is actually quite small. Most of the added lines are doc comments.
2026-05-19 12:19:15 +00:00 · 2024-10-13 09:14:48 +00:00 · 2024-10-13 09:14:48 +00:00 · 204bf5533e
commit 204bf5533e
parent 8fe1b0a0f8
4 changed files with 449 additions and 20 deletions
--- a/crates/oxc_codegen/src/code_buffer.rs
+++ b/crates/oxc_codegen/src/code_buffer.rs
@ -0,0 +1,427 @@
+/// A string builder for constructing source code.
+///
+///
+/// `CodeBuffer` provides safe abstractions over a byte array, allowing for
+/// a compact byte-array representation without soundness holes.
+///
+/// Use one of the various `print_*` methods to add text into a buffer. When you
+/// are done, call [`take_source_text`] to extract the final [`String`].
+///
+/// # Examples
+/// ```
+/// use oxc_codegen::CodeBuffer;
+/// let mut code = CodeBuffer::new();
+///
+/// // mock settings
+/// let is_public = true;
+///
+/// if is_public {
+///     code.print_str("export ")
+/// }
+/// code.print_str("function foo() {\n");
+/// code.print_str("    console.log('Hello, world!');\n");
+/// code.print_str("}\n");
+///
+/// let source = code.take_source_text();
+/// ```
+///
+/// [`take_source_text`]: CodeBuffer::take_source_text
+#[derive(Debug, Default, Clone)]
+pub struct CodeBuffer {
+    /// INVARIANT: `buf` is a valid UTF-8 string.
+    buf: Vec<u8>,
+}
+
+impl CodeBuffer {
+    /// Create a new empty `CodeBuffer`.
+    ///
+    /// ## Examples
+    ///
+    /// ```
+    /// use oxc_codegen::CodeBuffer;
+    /// let mut code = CodeBuffer::new();
+    ///
+    /// // use `code` to build new source text
+    /// code.print_str("fn main() { println!(\"Hello, world!\"); }");
+    /// let source_text = code.take_source_text();
+    /// ```
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Create a new, empty `CodeBuffer` with the specified capacity.
+    ///
+    /// The buffer will be able to hold at least `capacity` bytes without
+    /// reallocating. This method is allowed to allocate for more bytes than
+    /// `capacity`. If `capacity` is 0, the buffer will not allocate.
+    ///
+    /// It is important to note that although the returned buffer has the
+    /// minimum *capacity* specified, the buffer will have a zero *length*.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the new capacity exceeds `isize::MAX` _bytes_.
+    pub fn with_capacity(capacity: usize) -> Self {
+        Self { buf: Vec::with_capacity(capacity) }
+    }
+
+    /// Returns the number of bytes in this buffer.
+    ///
+    /// This is _not_ the same as the number of characters in the buffer, since
+    /// non-ASCII characters require multiple bytes.
+    pub fn len(&self) -> usize {
+        self.buf.len()
+    }
+
+    /// Returns `true` if this buffer contains no characters.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # use oxc_codegen::CodeBuffer;
+    /// let mut code = CodeBuffer::new();
+    /// assert!(code.is_empty());
+    ///
+    /// code.print_char('c');
+    /// assert!(!code.is_empty());
+    /// ```
+    pub fn is_empty(&self) -> bool {
+        self.buf.is_empty()
+    }
+
+    /// Reserves capacity for at least `additional` more bytes in the given
+    /// `CodeBuffer`. The buffer may reserve more space to speculatively avoid
+    /// frequent reallocations. After calling `reserve`, capacity will be
+    /// greater than or equal to `self.len() + additional`. Does nothing if
+    /// capacity is already sufficient.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the new capacity exceeds `isize::MAX` _bytes_.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let mut code = oxc_codegen::CodeBuffer::default();
+    /// code.reserve(10);
+    /// ```
+    #[inline]
+    pub fn reserve(&mut self, additional: usize) {
+        self.buf.reserve(additional);
+    }
+
+    /// Peek the `n`th character from the end of the buffer.
+    /// When `n` is zero, the last character is returned. Returns [`None`] if
+    /// `n` is not less than the number of characters in the buffer.
+    ///
+    /// ## Examples
+    /// ```
+    /// # use oxc_codegen::CodeBuffer;
+    /// let mut code = CodeBuffer::new();
+    /// code.print_str("foo");
+    ///
+    /// assert_eq!(code.peek_nth_back(0), Some('o'));
+    /// assert_eq!(code.peek_nth_back(2), Some('f'));
+    /// assert_eq!(code.peek_nth_back(3), None);
+    /// ```
+    #[inline]
+    #[must_use = "Peeking is pointless if the peeked char isn't used"]
+    pub fn peek_nth_back(&self, n: usize) -> Option<char> {
+        // SAFETY: `buf` is a valid UTF-8 string because of invariants upheld by CodeBuffer
+        unsafe { std::str::from_utf8_unchecked(&self.buf) }.chars().nth_back(n)
+    }
+
+    /// Push a single ASCII character into the buffer
+    ///
+    /// # Panics
+    /// If `b` is not a byte in the ASCII range (`0 - 0x7F`).
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use oxc_codegen::CodeBuffer;
+    /// let mut code = CodeBuffer::new();
+    /// code.print_ascii_byte(b'f');
+    /// code.print_ascii_byte(b'o');
+    /// code.print_ascii_byte(b'o');
+    ///
+    /// let source = code.take_source_text();
+    /// assert_eq!(source, "foo");
+    /// ```
+    #[inline]
+    pub fn print_ascii_byte(&mut self, b: u8) {
+        // NOTE: since this method is inlined, this assertion should get
+        // optimized away by the compiler when the value of `b` is known,
+        // e.g. when printing a constant.
+        assert!(b.is_ascii(), "byte {b} is not ASCII");
+        self.buf.push(b);
+    }
+
+    /// Print a byte without checking that this buffer still represents a valid
+    /// UTF-8 string.
+    ///
+    /// If you are looking to print a byte you know is valid ASCII, prefer
+    /// [`print_ascii_byte`]. If you are not certain, you may use [`print_char`]
+    /// as a safe alternative.
+    ///
+    /// # Safety
+    /// The caller must ensure that, after 1 or more sequential calls, this
+    /// buffer represents a valid UTF-8 string.
+    ///
+    /// It is safe for a single call to temporarily result in invalid UTF-8, as
+    /// long as UTF-8 integrity is restored before calls to any other `print`
+    /// method or [`take_source_text`]. This lets you, for example, print an
+    /// 4-byte code point using 4 separate calls to this method.
+    ///
+    /// If you find yourself in such a scenario, consider using
+    /// [`print_unchecked`] instead.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use oxc_codegen::CodeBuffer;
+    /// let mut code = CodeBuffer::new();
+    /// // Safe: 'a' is a valid ASCII character. Its UTF-8 representation only
+    /// // requires a single byte.
+    /// unsafe { code.print_byte_unsafe(b'a') };
+    ///
+    /// let not_ascii = '⚓';
+    /// let as_bytes = not_ascii.to_string().into_bytes();
+    /// // Safe: after this loop completes, `code` returns to a valid state.
+    /// for byte in as_bytes {
+    ///     unsafe { code.print_byte_unsafe(byte) };
+    /// }
+    ///
+    /// // NOT SAFE: `0xFF` exceeds the ASCII range and never appears in valid
+    /// // UTF-8, so `code` is no longer valid UTF-8.
+    /// unsafe { code.print_byte_unsafe(0xFF) };
+    /// ```
+    ///
+    /// [`print_ascii_byte`]: CodeBuffer::print_ascii_byte
+    /// [`print_char`]: CodeBuffer::print_char
+    /// [`take_source_text`]: CodeBuffer::take_source_text
+    /// [`print_unchecked`]: CodeBuffer::print_unchecked
+    #[inline]
+    pub unsafe fn print_byte_unsafe(&mut self, ch: u8) {
+        self.buf.push(ch);
+    }
+
+    /// Print a single Unicode character into the buffer.
+    ///
+    /// When pushing multiple characters, consider choosing [`print_str`] over
+    /// this method since it's much more efficient. If you really want to insert
+    /// only a single character and you're certain it's ASCII, consider using
+    /// [`print_ascii_byte`].
+    ///
+    /// ## Examples
+    ///
+    /// ```
+    /// use oxc_codegen::CodeBuffer;
+    /// let mut code = CodeBuffer::new();
+    ///
+    /// code.print_char('f');
+    /// code.print_char('o');
+    /// code.print_char('o');
+    ///
+    /// assert_eq!(String::from(code), "foo");
+    /// ```
+    ///
+    /// [`print_str`]: CodeBuffer::print_str
+    /// [`print_ascii_byte`]: CodeBuffer::print_ascii_byte
+    #[inline]
+    pub fn print_char(&mut self, ch: char) {
+        let mut b = [0; 4];
+        self.buf.extend(ch.encode_utf8(&mut b).as_bytes());
+    }
+
+    /// Push a string into this buffer.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use oxc_codegen::CodeBuffer;
+    /// let mut code = CodeBuffer::new();
+    /// code.print_str("function main() { console.log('Hello, world!') }");
+    /// ```
+    #[inline]
+    pub fn print_str<S: AsRef<str>>(&mut self, s: S) {
+        self.buf.extend(s.as_ref().as_bytes());
+    }
+
+    /// Push a sequence of ASCII characters into the buffer.
+    ///
+    /// # Panics
+    /// If any byte in the iterator is not valid ASCII.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use oxc_codegen::CodeBuffer;
+    /// let mut code = CodeBuffer::new();
+    ///
+    /// code.print_ascii([b'f', b'o', b'o'].into_iter());
+    /// assert_eq!(String::from(code), "foo");
+    /// ```
+    pub fn print_ascii<I>(&mut self, chars: I)
+    where
+        I: IntoIterator<Item = u8>,
+    {
+        let iter = chars.into_iter();
+        let hint = iter.size_hint();
+        self.buf.reserve(hint.1.unwrap_or(hint.0));
+        for c in iter {
+            self.print_ascii_byte(c);
+        }
+    }
+
+    /// Print a sequence of bytes without checking that this buffer still
+    /// represents a valid UTF-8 string.
+    ///
+    /// # Safety
+    ///
+    /// The caller must ensure that, after being called, this buffer represents
+    /// a valid UTF-8 string. In practice, this means only two cases are valid:
+    ///
+    /// 1. Both the buffer and the byte sequence are valid UTF-8,
+    /// 2. The buffer became invalid after a call to [`print_byte_unsafe`] and `bytes`
+    ///    completes any incomplete code points, returning the buffer to a valid
+    ///    state.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use oxc_codegen::CodeBuffer;
+    /// let mut code = CodeBuffer::new();
+    ///
+    /// // Indent to a dynamic level. Sound because all elements in this
+    /// // iterator are valid 1-byte UTF-8 code points (ASCII).
+    /// unsafe {
+    ///     code.print_unchecked(std::iter::repeat(b' ').take(4));
+    /// }
+    /// ```
+    ///
+    /// [`print_byte_unsafe`]: CodeBuffer::print_byte_unsafe
+    #[inline]
+    pub(crate) unsafe fn print_unchecked<I>(&mut self, bytes: I)
+    where
+        I: IntoIterator<Item = u8>,
+    {
+        self.buf.extend(bytes);
+    }
+
+    /// Convert a `CodeBuffer` into a string of source code, leaving its
+    /// internal buffer empty and finalizing the codegen process.
+    ///
+    /// It is safe to re-use a buffer after calling this method. Its contents
+    /// are moved into the returned [`String`], leaving the buffer empty with no
+    /// allocated capacity. You may use [`String::from`] if you don't intend on
+    /// re-using the buffer. It consumes the `CodeBuffer` and converts its
+    /// contents directly, without leaving an empty buffer behind.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use oxc_codegen::CodeBuffer;
+    /// let mut code = CodeBuffer::new();
+    /// code.print_str("console.log('foo');");
+    ///
+    /// let source = code.take_source_text();
+    /// assert_eq!(source, "console.log('foo');");
+    /// assert!(code.is_empty());
+    /// ```
+    #[must_use]
+    pub fn take_source_text(&mut self) -> String {
+        use std::mem::take;
+
+        #[cfg(debug_assertions)]
+        {
+            String::from_utf8(take(&mut self.buf)).unwrap()
+        }
+        #[cfg(not(debug_assertions))]
+        {
+            // SAFETY: `buf` is valid UTF-8 because of invariants upheld by
+            // CodeBuffer. If, for some reason, it is not, this is caused by
+            // improper use of `unsafe` printing methods.
+            unsafe { String::from_utf8_unchecked(take(&mut self.buf)) }
+        }
+    }
+}
+
+impl AsRef<[u8]> for CodeBuffer {
+    fn as_ref(&self) -> &[u8] {
+        &self.buf
+    }
+}
+impl From<CodeBuffer> for String {
+    #[inline]
+    fn from(printer: CodeBuffer) -> Self {
+        if cfg!(debug_assertions) {
+            String::from_utf8(printer.buf).unwrap()
+        } else {
+            // SAFETY: `buf` is valid UTF-8 because of invariants upheld by `CodeBuffer`
+            unsafe { String::from_utf8_unchecked(printer.buf) }
+        }
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::CodeBuffer;
+
+    #[test]
+    fn test_empty() {
+        let code = CodeBuffer::default();
+        assert!(code.is_empty());
+        assert_eq!(code.len(), 0);
+        assert_eq!(String::from(code), "");
+    }
+
+    #[test]
+    fn test_string_isomorphism() {
+        let s = "Hello, world!";
+        let mut code = CodeBuffer::with_capacity(s.len());
+        code.print_str(s);
+        assert_eq!(code.len(), s.len());
+        assert_eq!(String::from(code), s.to_string());
+    }
+
+    #[test]
+    fn test_into_source_string() {
+        let s = "Hello, world!";
+        let mut code = CodeBuffer::with_capacity(s.len());
+        code.print_str(s);
+
+        let source = code.take_source_text();
+        assert_eq!(source, s);
+
+        // buffer has been emptied
+        assert!(code.is_empty());
+        assert_eq!(code.len(), 0);
+        let empty_slice: &[u8] = &[];
+        assert_eq!(code.as_ref(), empty_slice);
+        assert_eq!(String::from(code), "");
+    }
+
+    #[test]
+    #[allow(clippy::byte_char_slices)]
+    fn test_print_byte_unsafe() {
+        let mut code = CodeBuffer::new();
+        code.print_ascii_byte(b'f');
+        code.print_ascii_byte(b'o');
+        code.print_ascii_byte(b'o');
+
+        assert_eq!(code.len(), 3);
+        assert_eq!(code.as_ref(), &[b'f', b'o', b'o']);
+        assert_eq!(String::from(code), "foo");
+    }
+
+    #[test]
+    fn test_peek() {
+        let mut code = CodeBuffer::new();
+        code.print_str("foo");
+
+        assert_eq!(code.peek_nth_back(0), Some('o'));
+        assert_eq!(code.peek_nth_back(2), Some('f'));
+        assert_eq!(code.peek_nth_back(3), None);
+    }
+}
--- a/crates/oxc_codegen/src/comment.rs
+++ b/crates/oxc_codegen/src/comment.rs
@ -87,7 +87,7 @@ impl<'a> Codegen<'a> {

        if comments.first().is_some_and(|c| c.preceded_by_newline) {
            // Skip printing newline if this comment is already on a newline.
-            if self.peek_nth(0).is_some_and(|c| c != '\n' && c != '\t') {
+            if self.peek_nth_back(0).is_some_and(|c| c != '\n' && c != '\t') {
                self.print_hard_newline();
                self.print_indent();
            }
--- a/crates/oxc_codegen/src/gen.rs
+++ b/crates/oxc_codegen/src/gen.rs
@ -1194,7 +1194,7 @@ impl<'a> Gen for BigIntLiteral<'a> {
 impl<'a> Gen for RegExpLiteral<'a> {
    fn gen(&self, p: &mut Codegen, _ctx: Context) {
        p.add_source_mapping(self.span.start);
-        let last = p.peek_nth(0);
+        let last = p.peek_nth_back(0);
        let pattern_text = self.regex.pattern.source_text(p.source_text);
        // Avoid forming a single-line comment or "</script" sequence
        if Some('/') == last
--- a/crates/oxc_codegen/src/lib.rs
+++ b/crates/oxc_codegen/src/lib.rs
@ -4,6 +4,7 @@
 //! * [esbuild](https://github.com/evanw/esbuild/blob/main/internal/js_printer/js_printer.go)

 mod binary_expr_visitor;
+mod code_buffer;
 mod comment;
 mod context;
 mod gen;
@ -24,8 +25,8 @@ use oxc_syntax::{
 };

 use crate::{
-    binary_expr_visitor::BinaryExpressionVisitor, comment::CommentsMap, operator::Operator,
-    sourcemap_builder::SourcemapBuilder,
+    binary_expr_visitor::BinaryExpressionVisitor, code_buffer::CodeBuffer, comment::CommentsMap,
+    operator::Operator, sourcemap_builder::SourcemapBuilder,
 };
 pub use crate::{
    context::Context,
@ -102,7 +103,7 @@ pub struct Codegen<'a> {
    mangler: Option<Mangler>,

    /// Output Code
-    code: Vec<u8>,
+    code: CodeBuffer,

    // states
    prev_op_end: usize,
@ -171,7 +172,7 @@ impl<'a> Codegen<'a> {
            comments: CommentsMap::default(),
            start_of_annotation_comment: None,
            mangler: None,
-            code: vec![],
+            code: CodeBuffer::default(),
            needs_semicolon: false,
            need_space_before_dot: 0,
            print_next_indent_as_space: false,
@ -221,20 +222,19 @@ impl<'a> Codegen<'a> {

    #[must_use]
    pub fn into_source_text(&mut self) -> String {
-        // SAFETY: criteria of `from_utf8_unchecked` are met.
-        unsafe { String::from_utf8_unchecked(std::mem::take(&mut self.code)) }
+        self.code.take_source_text()
    }

    /// Push a single character into the buffer
    #[inline]
    pub fn print_char(&mut self, ch: u8) {
-        self.code.push(ch);
+        self.code.print_ascii_byte(ch);
    }

    /// Push str into the buffer
    #[inline]
    pub fn print_str(&mut self, s: &str) {
-        self.code.extend(s.as_bytes());
+        self.code.print_str(s);
    }

    #[inline]
@ -245,7 +245,7 @@ impl<'a> Codegen<'a> {

 // Private APIs
 impl<'a> Codegen<'a> {
-    fn code(&self) -> &Vec<u8> {
+    fn code(&self) -> &CodeBuffer {
        &self.code
    }

@ -256,7 +256,7 @@ impl<'a> Codegen<'a> {
    #[inline]
    fn print_soft_space(&mut self) {
        if !self.options.minify {
-            self.print_char(b' ');
+            self.code.print_ascii_byte(b' ');
        }
    }

@ -290,7 +290,7 @@ impl<'a> Codegen<'a> {
    #[inline]
    fn print_space_before_identifier(&mut self) {
        if self
-            .peek_nth(0)
+            .peek_nth_back(0)
            .is_some_and(|ch| is_identifier_part(ch) || self.prev_reg_exp_end == self.code.len())
        {
            self.print_hard_space();
@ -298,9 +298,8 @@ impl<'a> Codegen<'a> {
    }

    #[inline]
-    fn peek_nth(&self, n: usize) -> Option<char> {
-        // SAFETY: criteria of `from_utf8_unchecked` are met.
-        unsafe { std::str::from_utf8_unchecked(self.code()) }.chars().nth_back(n)
+    fn peek_nth_back(&self, n: usize) -> Option<char> {
+        self.code.peek_nth_back(n)
    }

    #[inline]
@ -327,7 +326,10 @@ impl<'a> Codegen<'a> {
            self.print_next_indent_as_space = false;
            return;
        }
-        self.code.extend(std::iter::repeat(b'\t').take(self.indent as usize));
+        // SAFETY: this iterator only yields tabs, which are always valid ASCII characters.
+        unsafe {
+            self.code.print_unchecked(std::iter::repeat(b'\t').take(self.indent as usize));
+        }
    }

    #[inline]
@ -528,7 +530,7 @@ impl<'a> Codegen<'a> {
            || ((prev == bin_op_sub || prev == un_op_neg)
                && (next == bin_op_sub || next == un_op_neg || next == un_op_pre_dec))
            || (prev == un_op_post_dec && next == bin_op_gt)
-            || (prev == un_op_not && next == un_op_pre_dec && self.peek_nth(1) == Some('<'))
+            || (prev == un_op_not && next == un_op_pre_dec && self.peek_nth_back(1) == Some('<'))
        {
            self.print_hard_space();
        }
@ -554,13 +556,13 @@ impl<'a> Codegen<'a> {

    fn add_source_mapping(&mut self, position: u32) {
        if let Some(sourcemap_builder) = self.sourcemap_builder.as_mut() {
-            sourcemap_builder.add_source_mapping(&self.code, position, None);
+            sourcemap_builder.add_source_mapping(self.code.as_ref(), position, None);
        }
    }

    fn add_source_mapping_for_name(&mut self, span: Span, name: &str) {
        if let Some(sourcemap_builder) = self.sourcemap_builder.as_mut() {
-            sourcemap_builder.add_source_mapping_for_name(&self.code, span, name);
+            sourcemap_builder.add_source_mapping_for_name(self.code.as_ref(), span, name);
        }
    }
 }