refactor(parser): all pointer manipulation through SourcePosition (#2350)

A safer and faster interface for reading source text using pointers than `*ptr`.
This commit is contained in:
overlookmotel 2024-02-09 02:26:51 +00:00 committed by GitHub
parent 651b0b15d1
commit 6f597b18bc
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -136,7 +136,24 @@ impl<'a> Source<'a> {
/// `SourcePosition` lives as long as the source text `&str` that `Source` was created from. /// `SourcePosition` lives as long as the source text `&str` that `Source` was created from.
#[inline] #[inline]
pub(super) fn position(&self) -> SourcePosition<'a> { pub(super) fn position(&self) -> SourcePosition<'a> {
SourcePosition { ptr: self.ptr, _marker: PhantomData } // SAFETY: Creating a `SourcePosition` from current position of `Source` is always valid,
// if caller has upheld safety conditions of other unsafe methods of this type.
let pos = unsafe { SourcePosition::new(self.ptr) };
// SAFETY: `pos.read()`'s contract is upheld by:
// * The preceding checks that `pos.ptr` >= `self.start` and < `self.end`.
// * `Source`'s invariants guarantee that `self.start` - `self.end` contains allocated memory.
// * `Source::new` takes an immutable ref `&str`, guaranteeing that the memory `pos.ptr`
// addresses cannot be aliased by a `&mut` ref as long as `Source` exists.
// * `SourcePosition` can only live as long as the `&str` underlying `Source`.
debug_assert!(
pos.ptr >= self.start
&& pos.ptr <= self.end
// SAFETY: See above
&& (pos.ptr == self.end || !is_utf8_cont_byte(unsafe { pos.read() }))
);
pos
} }
/// Move current position. /// Move current position.
@ -151,7 +168,7 @@ impl<'a> Source<'a> {
// from this `Source`. // from this `Source`.
// This guarantee is what allows this function to be safe. // This guarantee is what allows this function to be safe.
// SAFETY: `read_u8`'s contract is upheld by: // SAFETY: `SourcePosition::read`'s contract is upheld by:
// * The preceding checks that `pos.ptr` >= `self.start` and < `self.end`. // * The preceding checks that `pos.ptr` >= `self.start` and < `self.end`.
// * `Source`'s invariants guarantee that `self.start` - `self.end` contains allocated memory. // * `Source`'s invariants guarantee that `self.start` - `self.end` contains allocated memory.
// * `Source::new` takes an immutable ref `&str`, guaranteeing that the memory `pos.ptr` // * `Source::new` takes an immutable ref `&str`, guaranteeing that the memory `pos.ptr`
@ -161,7 +178,7 @@ impl<'a> Source<'a> {
pos.ptr >= self.start pos.ptr >= self.start
&& pos.ptr <= self.end && pos.ptr <= self.end
// SAFETY: See above // SAFETY: See above
&& (pos.ptr == self.end || !is_utf8_cont_byte(unsafe { read_u8(pos.ptr) })) && (pos.ptr == self.end || !is_utf8_cont_byte(unsafe { pos.read() }))
); );
self.ptr = pos.ptr; self.ptr = pos.ptr;
} }
@ -183,7 +200,7 @@ impl<'a> Source<'a> {
/// * Moving back `n` bytes would not place current position on a UTF-8 character boundary. /// * Moving back `n` bytes would not place current position on a UTF-8 character boundary.
#[inline] #[inline]
pub(super) fn back(&mut self, n: usize) { pub(super) fn back(&mut self, n: usize) {
// This assertion is essential to ensure safety of `read_u8()` call below. // This assertion is essential to ensure safety of `pos.read()` call below.
// Without this check, calling `back(0)` on an empty `Source` would cause reading // Without this check, calling `back(0)` on an empty `Source` would cause reading
// out of bounds. // out of bounds.
// Compiler should remove this assertion when inlining this function, // Compiler should remove this assertion when inlining this function,
@ -196,7 +213,7 @@ impl<'a> Source<'a> {
// SAFETY: We have checked that `n` is less than distance between `start` and `ptr`, // SAFETY: We have checked that `n` is less than distance between `start` and `ptr`,
// so `new_ptr` cannot be outside of allocation of original `&str` // so `new_ptr` cannot be outside of allocation of original `&str`
let new_ptr = unsafe { self.ptr.sub(n) }; let new_pos = unsafe { self.position().sub(n) };
// Enforce invariant that `ptr` must be positioned on a UTF-8 character boundary. // Enforce invariant that `ptr` must be positioned on a UTF-8 character boundary.
// SAFETY: `new_ptr` is in bounds of original `&str`, and `n > 0` assertion ensures // SAFETY: `new_ptr` is in bounds of original `&str`, and `n > 0` assertion ensures
@ -204,11 +221,11 @@ impl<'a> Source<'a> {
// `Source`'s invariants guarantee that `self.start` - `self.end` contains allocated memory. // `Source`'s invariants guarantee that `self.start` - `self.end` contains allocated memory.
// `Source::new` takes an immutable ref `&str`, guaranteeing that the memory `new_ptr` // `Source::new` takes an immutable ref `&str`, guaranteeing that the memory `new_ptr`
// addresses cannot be aliased by a `&mut` ref as long as `Source` exists. // addresses cannot be aliased by a `&mut` ref as long as `Source` exists.
let byte = unsafe { read_u8(new_ptr) }; let byte = unsafe { new_pos.read() };
assert!(!is_utf8_cont_byte(byte), "Offset is not on a UTF-8 character boundary"); assert!(!is_utf8_cont_byte(byte), "Offset is not on a UTF-8 character boundary");
// Move current position. The checks above satisfy `Source`'s invariants. // Move current position. The checks above satisfy `Source`'s invariants.
self.ptr = new_ptr; self.ptr = new_pos.ptr;
} }
/// Get next char of source, and advance position to after it. /// Get next char of source, and advance position to after it.
@ -376,43 +393,92 @@ impl<'a> Source<'a> {
// `Source::new` takes an immutable ref `&str`, guaranteeing that the memory `self.ptr` // `Source::new` takes an immutable ref `&str`, guaranteeing that the memory `self.ptr`
// addresses cannot be aliased by a `&mut` ref as long as `Source` exists. // addresses cannot be aliased by a `&mut` ref as long as `Source` exists.
debug_assert!(self.ptr >= self.start && self.ptr < self.end); debug_assert!(self.ptr >= self.start && self.ptr < self.end);
read_u8(self.ptr) self.position().read()
} }
} }
/// Wrapper around a pointer to a position in `Source`. /// Wrapper around a pointer to a position in `Source`.
///
/// # SAFETY
/// `SourcePosition` must always be on a UTF-8 character boundary,
/// and within bounds of the `Source` that created it.
#[derive(Debug, Clone, Copy)] #[derive(Debug, Clone, Copy)]
pub struct SourcePosition<'a> { pub struct SourcePosition<'a> {
ptr: *const u8, ptr: *const u8,
_marker: PhantomData<&'a u8>, _marker: PhantomData<&'a u8>,
} }
impl<'a> SourcePosition<'a> {
/// Create a new `SourcePosition` from a pointer.
///
/// # SAFETY
/// * Pointer must obey all the same invariants as `Source::ptr`.
/// * It must be created from a `Source`.
/// * It must be in bounds of the source text `&str` the `Source` is created from,
/// or 1 byte after the end of the source text (i.e. positioned at EOF).
/// * It must be positioned on a UTF-8 character boundary (or EOF).
#[inline]
pub(super) unsafe fn new(ptr: *const u8) -> Self {
Self { ptr, _marker: PhantomData }
}
/// Create new `SourcePosition` which is `n` bytes after this one.
/// The provenance of the pointer `SourcePosition` contains is maintained.
///
/// # SAFETY
/// Caller must ensure that advancing `SourcePosition` by `n` bytes does not make it past the end
/// of `Source` this `SourcePosition` was created from.
/// NB: It is legal to use `add` to create a `SourcePosition` which is *on* the end of `Source`,
/// just not past it.
#[allow(dead_code)]
#[inline]
pub(super) unsafe fn add(self, n: usize) -> Self {
Self::new(self.ptr.add(n))
}
/// Create new `SourcePosition` which is `n` bytes before this one.
/// The provenance of the pointer `SourcePosition` contains is maintained.
///
/// # SAFETY
/// Caller must ensure that reversing `SourcePosition` by `n` bytes does not make it before the start
/// of `Source` this `SourcePosition` was created from.
#[inline]
pub(super) unsafe fn sub(self, n: usize) -> Self {
Self::new(self.ptr.sub(n))
}
/// Read byte from this `SourcePosition`.
///
/// # SAFETY
/// Caller must ensure `SourcePosition` is not at end of source text.
///
/// # Implementation details
///
/// Using `as_ref()` for reading is copied from `core::slice::iter::next`.
/// https://doc.rust-lang.org/src/core/slice/iter.rs.html#132
/// https://doc.rust-lang.org/src/core/slice/iter/macros.rs.html#156-168
///
/// Using `ptr.as_ref().unwrap_unchecked()` instead of `*ptr` or `ptr.read()` produces
/// a 7% speed-up on Lexer benchmarks.
/// Presumably this is because it tells the compiler it can rely on the memory being immutable,
/// because if a `&mut` reference existed, that would violate Rust's aliasing rules.
#[inline]
pub(super) unsafe fn read(self) -> u8 {
// SAFETY:
// Caller guarantees `self` is not at end of source text.
// `Source` is created from a valid `&str`, so points to allocated, initialized memory.
// `Source` conceptually holds the source text `&str`, which guarantees to mutable references
// to the same memory can exist, as that would violate Rust's aliasing rules.
// Pointer is "dereferenceable" by definition as a `u8` is 1 byte and cannot span multiple objects.
// Alignment is not relevant as `u8` is aligned on 1 (i.e. no alignment requirements).
debug_assert!(!self.ptr.is_null());
*self.ptr.as_ref().unwrap_unchecked()
}
}
/// Return if byte is a UTF-8 continuation byte. /// Return if byte is a UTF-8 continuation byte.
#[inline] #[inline]
const fn is_utf8_cont_byte(byte: u8) -> bool { const fn is_utf8_cont_byte(byte: u8) -> bool {
// 0x80 - 0xBF are continuation bytes i.e. not 1st byte of a UTF-8 character sequence // 0x80 - 0xBF are continuation bytes i.e. not 1st byte of a UTF-8 character sequence
byte >= 0x80 && byte < 0xC0 byte >= 0x80 && byte < 0xC0
} }
/// Read `u8` from `*const u8` pointer.
///
/// Using `as_ref()` for reading is copied from `core::slice::iter::next`.
/// https://doc.rust-lang.org/src/core/slice/iter.rs.html#132
/// https://doc.rust-lang.org/src/core/slice/iter/macros.rs.html#156-168
///
/// This is about 7% faster than `*ptr` or `ptr.read()`, presumably because it tells the compiler
/// it can rely on the memory being immutable, because if a `&mut` reference existed, that would
/// violate Rust's aliasing rules.
///
/// # SAFETY
/// Caller must ensure pointer is non-null, and points to allocated, initialized memory.
/// Pointer must point to within an object for which no `&mut` references are currently held.
#[inline]
unsafe fn read_u8(ptr: *const u8) -> u8 {
// SAFETY: Caller guarantees pointer is non-null, and points to allocated, initialized memory.
// Caller guarantees no mutable references to same memory exist, thus upholding Rust's aliasing rules.
// Pointer is "dereferenceable" by definition as a `u8` is 1 byte and cannot span multiple objects.
// Alignment is not relevant as `u8` is aligned on 1 (i.e. no alignment requirements).
debug_assert!(!ptr.is_null());
*ptr.as_ref().unwrap_unchecked()
}