From 6f597b18bcca9f20b48169be9fe9fa644ef102d0 Mon Sep 17 00:00:00 2001 From: overlookmotel Date: Fri, 9 Feb 2024 02:26:51 +0000 Subject: [PATCH] refactor(parser): all pointer manipulation through `SourcePosition` (#2350) A safer and faster interface for reading source text using pointers than `*ptr`. --- crates/oxc_parser/src/lexer/source.rs | 128 +++++++++++++++++++------- 1 file changed, 97 insertions(+), 31 deletions(-) diff --git a/crates/oxc_parser/src/lexer/source.rs b/crates/oxc_parser/src/lexer/source.rs index 16df3cad4..4178048db 100644 --- a/crates/oxc_parser/src/lexer/source.rs +++ b/crates/oxc_parser/src/lexer/source.rs @@ -136,7 +136,24 @@ impl<'a> Source<'a> { /// `SourcePosition` lives as long as the source text `&str` that `Source` was created from. #[inline] pub(super) fn position(&self) -> SourcePosition<'a> { - SourcePosition { ptr: self.ptr, _marker: PhantomData } + // SAFETY: Creating a `SourcePosition` from current position of `Source` is always valid, + // if caller has upheld safety conditions of other unsafe methods of this type. + let pos = unsafe { SourcePosition::new(self.ptr) }; + + // SAFETY: `pos.read()`'s contract is upheld by: + // * The preceding checks that `pos.ptr` >= `self.start` and < `self.end`. + // * `Source`'s invariants guarantee that `self.start` - `self.end` contains allocated memory. + // * `Source::new` takes an immutable ref `&str`, guaranteeing that the memory `pos.ptr` + // addresses cannot be aliased by a `&mut` ref as long as `Source` exists. + // * `SourcePosition` can only live as long as the `&str` underlying `Source`. + debug_assert!( + pos.ptr >= self.start + && pos.ptr <= self.end + // SAFETY: See above + && (pos.ptr == self.end || !is_utf8_cont_byte(unsafe { pos.read() })) + ); + + pos } /// Move current position. @@ -151,7 +168,7 @@ impl<'a> Source<'a> { // from this `Source`. // This guarantee is what allows this function to be safe. - // SAFETY: `read_u8`'s contract is upheld by: + // SAFETY: `SourcePosition::read`'s contract is upheld by: // * The preceding checks that `pos.ptr` >= `self.start` and < `self.end`. // * `Source`'s invariants guarantee that `self.start` - `self.end` contains allocated memory. // * `Source::new` takes an immutable ref `&str`, guaranteeing that the memory `pos.ptr` @@ -161,7 +178,7 @@ impl<'a> Source<'a> { pos.ptr >= self.start && pos.ptr <= self.end // SAFETY: See above - && (pos.ptr == self.end || !is_utf8_cont_byte(unsafe { read_u8(pos.ptr) })) + && (pos.ptr == self.end || !is_utf8_cont_byte(unsafe { pos.read() })) ); self.ptr = pos.ptr; } @@ -183,7 +200,7 @@ impl<'a> Source<'a> { /// * Moving back `n` bytes would not place current position on a UTF-8 character boundary. #[inline] pub(super) fn back(&mut self, n: usize) { - // This assertion is essential to ensure safety of `read_u8()` call below. + // This assertion is essential to ensure safety of `pos.read()` call below. // Without this check, calling `back(0)` on an empty `Source` would cause reading // out of bounds. // Compiler should remove this assertion when inlining this function, @@ -196,7 +213,7 @@ impl<'a> Source<'a> { // SAFETY: We have checked that `n` is less than distance between `start` and `ptr`, // so `new_ptr` cannot be outside of allocation of original `&str` - let new_ptr = unsafe { self.ptr.sub(n) }; + let new_pos = unsafe { self.position().sub(n) }; // Enforce invariant that `ptr` must be positioned on a UTF-8 character boundary. // SAFETY: `new_ptr` is in bounds of original `&str`, and `n > 0` assertion ensures @@ -204,11 +221,11 @@ impl<'a> Source<'a> { // `Source`'s invariants guarantee that `self.start` - `self.end` contains allocated memory. // `Source::new` takes an immutable ref `&str`, guaranteeing that the memory `new_ptr` // addresses cannot be aliased by a `&mut` ref as long as `Source` exists. - let byte = unsafe { read_u8(new_ptr) }; + let byte = unsafe { new_pos.read() }; assert!(!is_utf8_cont_byte(byte), "Offset is not on a UTF-8 character boundary"); // Move current position. The checks above satisfy `Source`'s invariants. - self.ptr = new_ptr; + self.ptr = new_pos.ptr; } /// Get next char of source, and advance position to after it. @@ -376,43 +393,92 @@ impl<'a> Source<'a> { // `Source::new` takes an immutable ref `&str`, guaranteeing that the memory `self.ptr` // addresses cannot be aliased by a `&mut` ref as long as `Source` exists. debug_assert!(self.ptr >= self.start && self.ptr < self.end); - read_u8(self.ptr) + self.position().read() } } /// Wrapper around a pointer to a position in `Source`. +/// +/// # SAFETY +/// `SourcePosition` must always be on a UTF-8 character boundary, +/// and within bounds of the `Source` that created it. #[derive(Debug, Clone, Copy)] pub struct SourcePosition<'a> { ptr: *const u8, _marker: PhantomData<&'a u8>, } +impl<'a> SourcePosition<'a> { + /// Create a new `SourcePosition` from a pointer. + /// + /// # SAFETY + /// * Pointer must obey all the same invariants as `Source::ptr`. + /// * It must be created from a `Source`. + /// * It must be in bounds of the source text `&str` the `Source` is created from, + /// or 1 byte after the end of the source text (i.e. positioned at EOF). + /// * It must be positioned on a UTF-8 character boundary (or EOF). + #[inline] + pub(super) unsafe fn new(ptr: *const u8) -> Self { + Self { ptr, _marker: PhantomData } + } + + /// Create new `SourcePosition` which is `n` bytes after this one. + /// The provenance of the pointer `SourcePosition` contains is maintained. + /// + /// # SAFETY + /// Caller must ensure that advancing `SourcePosition` by `n` bytes does not make it past the end + /// of `Source` this `SourcePosition` was created from. + /// NB: It is legal to use `add` to create a `SourcePosition` which is *on* the end of `Source`, + /// just not past it. + #[allow(dead_code)] + #[inline] + pub(super) unsafe fn add(self, n: usize) -> Self { + Self::new(self.ptr.add(n)) + } + + /// Create new `SourcePosition` which is `n` bytes before this one. + /// The provenance of the pointer `SourcePosition` contains is maintained. + /// + /// # SAFETY + /// Caller must ensure that reversing `SourcePosition` by `n` bytes does not make it before the start + /// of `Source` this `SourcePosition` was created from. + #[inline] + pub(super) unsafe fn sub(self, n: usize) -> Self { + Self::new(self.ptr.sub(n)) + } + + /// Read byte from this `SourcePosition`. + /// + /// # SAFETY + /// Caller must ensure `SourcePosition` is not at end of source text. + /// + /// # Implementation details + /// + /// Using `as_ref()` for reading is copied from `core::slice::iter::next`. + /// https://doc.rust-lang.org/src/core/slice/iter.rs.html#132 + /// https://doc.rust-lang.org/src/core/slice/iter/macros.rs.html#156-168 + /// + /// Using `ptr.as_ref().unwrap_unchecked()` instead of `*ptr` or `ptr.read()` produces + /// a 7% speed-up on Lexer benchmarks. + /// Presumably this is because it tells the compiler it can rely on the memory being immutable, + /// because if a `&mut` reference existed, that would violate Rust's aliasing rules. + #[inline] + pub(super) unsafe fn read(self) -> u8 { + // SAFETY: + // Caller guarantees `self` is not at end of source text. + // `Source` is created from a valid `&str`, so points to allocated, initialized memory. + // `Source` conceptually holds the source text `&str`, which guarantees to mutable references + // to the same memory can exist, as that would violate Rust's aliasing rules. + // Pointer is "dereferenceable" by definition as a `u8` is 1 byte and cannot span multiple objects. + // Alignment is not relevant as `u8` is aligned on 1 (i.e. no alignment requirements). + debug_assert!(!self.ptr.is_null()); + *self.ptr.as_ref().unwrap_unchecked() + } +} + /// Return if byte is a UTF-8 continuation byte. #[inline] const fn is_utf8_cont_byte(byte: u8) -> bool { // 0x80 - 0xBF are continuation bytes i.e. not 1st byte of a UTF-8 character sequence byte >= 0x80 && byte < 0xC0 } - -/// Read `u8` from `*const u8` pointer. -/// -/// Using `as_ref()` for reading is copied from `core::slice::iter::next`. -/// https://doc.rust-lang.org/src/core/slice/iter.rs.html#132 -/// https://doc.rust-lang.org/src/core/slice/iter/macros.rs.html#156-168 -/// -/// This is about 7% faster than `*ptr` or `ptr.read()`, presumably because it tells the compiler -/// it can rely on the memory being immutable, because if a `&mut` reference existed, that would -/// violate Rust's aliasing rules. -/// -/// # SAFETY -/// Caller must ensure pointer is non-null, and points to allocated, initialized memory. -/// Pointer must point to within an object for which no `&mut` references are currently held. -#[inline] -unsafe fn read_u8(ptr: *const u8) -> u8 { - // SAFETY: Caller guarantees pointer is non-null, and points to allocated, initialized memory. - // Caller guarantees no mutable references to same memory exist, thus upholding Rust's aliasing rules. - // Pointer is "dereferenceable" by definition as a `u8` is 1 byte and cannot span multiple objects. - // Alignment is not relevant as `u8` is aligned on 1 (i.e. no alignment requirements). - debug_assert!(!ptr.is_null()); - *ptr.as_ref().unwrap_unchecked() -}