mirror of
https://github.com/danbulant/oxc
synced 2026-05-24 20:32:10 +00:00
refactor(parser): all pointer manipulation through SourcePosition (#2350)
A safer and faster interface for reading source text using pointers than `*ptr`.
This commit is contained in:
parent
651b0b15d1
commit
6f597b18bc
1 changed files with 97 additions and 31 deletions
|
|
@ -136,7 +136,24 @@ impl<'a> Source<'a> {
|
||||||
/// `SourcePosition` lives as long as the source text `&str` that `Source` was created from.
|
/// `SourcePosition` lives as long as the source text `&str` that `Source` was created from.
|
||||||
#[inline]
|
#[inline]
|
||||||
pub(super) fn position(&self) -> SourcePosition<'a> {
|
pub(super) fn position(&self) -> SourcePosition<'a> {
|
||||||
SourcePosition { ptr: self.ptr, _marker: PhantomData }
|
// SAFETY: Creating a `SourcePosition` from current position of `Source` is always valid,
|
||||||
|
// if caller has upheld safety conditions of other unsafe methods of this type.
|
||||||
|
let pos = unsafe { SourcePosition::new(self.ptr) };
|
||||||
|
|
||||||
|
// SAFETY: `pos.read()`'s contract is upheld by:
|
||||||
|
// * The preceding checks that `pos.ptr` >= `self.start` and < `self.end`.
|
||||||
|
// * `Source`'s invariants guarantee that `self.start` - `self.end` contains allocated memory.
|
||||||
|
// * `Source::new` takes an immutable ref `&str`, guaranteeing that the memory `pos.ptr`
|
||||||
|
// addresses cannot be aliased by a `&mut` ref as long as `Source` exists.
|
||||||
|
// * `SourcePosition` can only live as long as the `&str` underlying `Source`.
|
||||||
|
debug_assert!(
|
||||||
|
pos.ptr >= self.start
|
||||||
|
&& pos.ptr <= self.end
|
||||||
|
// SAFETY: See above
|
||||||
|
&& (pos.ptr == self.end || !is_utf8_cont_byte(unsafe { pos.read() }))
|
||||||
|
);
|
||||||
|
|
||||||
|
pos
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Move current position.
|
/// Move current position.
|
||||||
|
|
@ -151,7 +168,7 @@ impl<'a> Source<'a> {
|
||||||
// from this `Source`.
|
// from this `Source`.
|
||||||
// This guarantee is what allows this function to be safe.
|
// This guarantee is what allows this function to be safe.
|
||||||
|
|
||||||
// SAFETY: `read_u8`'s contract is upheld by:
|
// SAFETY: `SourcePosition::read`'s contract is upheld by:
|
||||||
// * The preceding checks that `pos.ptr` >= `self.start` and < `self.end`.
|
// * The preceding checks that `pos.ptr` >= `self.start` and < `self.end`.
|
||||||
// * `Source`'s invariants guarantee that `self.start` - `self.end` contains allocated memory.
|
// * `Source`'s invariants guarantee that `self.start` - `self.end` contains allocated memory.
|
||||||
// * `Source::new` takes an immutable ref `&str`, guaranteeing that the memory `pos.ptr`
|
// * `Source::new` takes an immutable ref `&str`, guaranteeing that the memory `pos.ptr`
|
||||||
|
|
@ -161,7 +178,7 @@ impl<'a> Source<'a> {
|
||||||
pos.ptr >= self.start
|
pos.ptr >= self.start
|
||||||
&& pos.ptr <= self.end
|
&& pos.ptr <= self.end
|
||||||
// SAFETY: See above
|
// SAFETY: See above
|
||||||
&& (pos.ptr == self.end || !is_utf8_cont_byte(unsafe { read_u8(pos.ptr) }))
|
&& (pos.ptr == self.end || !is_utf8_cont_byte(unsafe { pos.read() }))
|
||||||
);
|
);
|
||||||
self.ptr = pos.ptr;
|
self.ptr = pos.ptr;
|
||||||
}
|
}
|
||||||
|
|
@ -183,7 +200,7 @@ impl<'a> Source<'a> {
|
||||||
/// * Moving back `n` bytes would not place current position on a UTF-8 character boundary.
|
/// * Moving back `n` bytes would not place current position on a UTF-8 character boundary.
|
||||||
#[inline]
|
#[inline]
|
||||||
pub(super) fn back(&mut self, n: usize) {
|
pub(super) fn back(&mut self, n: usize) {
|
||||||
// This assertion is essential to ensure safety of `read_u8()` call below.
|
// This assertion is essential to ensure safety of `pos.read()` call below.
|
||||||
// Without this check, calling `back(0)` on an empty `Source` would cause reading
|
// Without this check, calling `back(0)` on an empty `Source` would cause reading
|
||||||
// out of bounds.
|
// out of bounds.
|
||||||
// Compiler should remove this assertion when inlining this function,
|
// Compiler should remove this assertion when inlining this function,
|
||||||
|
|
@ -196,7 +213,7 @@ impl<'a> Source<'a> {
|
||||||
|
|
||||||
// SAFETY: We have checked that `n` is less than distance between `start` and `ptr`,
|
// SAFETY: We have checked that `n` is less than distance between `start` and `ptr`,
|
||||||
// so `new_ptr` cannot be outside of allocation of original `&str`
|
// so `new_ptr` cannot be outside of allocation of original `&str`
|
||||||
let new_ptr = unsafe { self.ptr.sub(n) };
|
let new_pos = unsafe { self.position().sub(n) };
|
||||||
|
|
||||||
// Enforce invariant that `ptr` must be positioned on a UTF-8 character boundary.
|
// Enforce invariant that `ptr` must be positioned on a UTF-8 character boundary.
|
||||||
// SAFETY: `new_ptr` is in bounds of original `&str`, and `n > 0` assertion ensures
|
// SAFETY: `new_ptr` is in bounds of original `&str`, and `n > 0` assertion ensures
|
||||||
|
|
@ -204,11 +221,11 @@ impl<'a> Source<'a> {
|
||||||
// `Source`'s invariants guarantee that `self.start` - `self.end` contains allocated memory.
|
// `Source`'s invariants guarantee that `self.start` - `self.end` contains allocated memory.
|
||||||
// `Source::new` takes an immutable ref `&str`, guaranteeing that the memory `new_ptr`
|
// `Source::new` takes an immutable ref `&str`, guaranteeing that the memory `new_ptr`
|
||||||
// addresses cannot be aliased by a `&mut` ref as long as `Source` exists.
|
// addresses cannot be aliased by a `&mut` ref as long as `Source` exists.
|
||||||
let byte = unsafe { read_u8(new_ptr) };
|
let byte = unsafe { new_pos.read() };
|
||||||
assert!(!is_utf8_cont_byte(byte), "Offset is not on a UTF-8 character boundary");
|
assert!(!is_utf8_cont_byte(byte), "Offset is not on a UTF-8 character boundary");
|
||||||
|
|
||||||
// Move current position. The checks above satisfy `Source`'s invariants.
|
// Move current position. The checks above satisfy `Source`'s invariants.
|
||||||
self.ptr = new_ptr;
|
self.ptr = new_pos.ptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Get next char of source, and advance position to after it.
|
/// Get next char of source, and advance position to after it.
|
||||||
|
|
@ -376,43 +393,92 @@ impl<'a> Source<'a> {
|
||||||
// `Source::new` takes an immutable ref `&str`, guaranteeing that the memory `self.ptr`
|
// `Source::new` takes an immutable ref `&str`, guaranteeing that the memory `self.ptr`
|
||||||
// addresses cannot be aliased by a `&mut` ref as long as `Source` exists.
|
// addresses cannot be aliased by a `&mut` ref as long as `Source` exists.
|
||||||
debug_assert!(self.ptr >= self.start && self.ptr < self.end);
|
debug_assert!(self.ptr >= self.start && self.ptr < self.end);
|
||||||
read_u8(self.ptr)
|
self.position().read()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Wrapper around a pointer to a position in `Source`.
|
/// Wrapper around a pointer to a position in `Source`.
|
||||||
|
///
|
||||||
|
/// # SAFETY
|
||||||
|
/// `SourcePosition` must always be on a UTF-8 character boundary,
|
||||||
|
/// and within bounds of the `Source` that created it.
|
||||||
#[derive(Debug, Clone, Copy)]
|
#[derive(Debug, Clone, Copy)]
|
||||||
pub struct SourcePosition<'a> {
|
pub struct SourcePosition<'a> {
|
||||||
ptr: *const u8,
|
ptr: *const u8,
|
||||||
_marker: PhantomData<&'a u8>,
|
_marker: PhantomData<&'a u8>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl<'a> SourcePosition<'a> {
|
||||||
|
/// Create a new `SourcePosition` from a pointer.
|
||||||
|
///
|
||||||
|
/// # SAFETY
|
||||||
|
/// * Pointer must obey all the same invariants as `Source::ptr`.
|
||||||
|
/// * It must be created from a `Source`.
|
||||||
|
/// * It must be in bounds of the source text `&str` the `Source` is created from,
|
||||||
|
/// or 1 byte after the end of the source text (i.e. positioned at EOF).
|
||||||
|
/// * It must be positioned on a UTF-8 character boundary (or EOF).
|
||||||
|
#[inline]
|
||||||
|
pub(super) unsafe fn new(ptr: *const u8) -> Self {
|
||||||
|
Self { ptr, _marker: PhantomData }
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create new `SourcePosition` which is `n` bytes after this one.
|
||||||
|
/// The provenance of the pointer `SourcePosition` contains is maintained.
|
||||||
|
///
|
||||||
|
/// # SAFETY
|
||||||
|
/// Caller must ensure that advancing `SourcePosition` by `n` bytes does not make it past the end
|
||||||
|
/// of `Source` this `SourcePosition` was created from.
|
||||||
|
/// NB: It is legal to use `add` to create a `SourcePosition` which is *on* the end of `Source`,
|
||||||
|
/// just not past it.
|
||||||
|
#[allow(dead_code)]
|
||||||
|
#[inline]
|
||||||
|
pub(super) unsafe fn add(self, n: usize) -> Self {
|
||||||
|
Self::new(self.ptr.add(n))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create new `SourcePosition` which is `n` bytes before this one.
|
||||||
|
/// The provenance of the pointer `SourcePosition` contains is maintained.
|
||||||
|
///
|
||||||
|
/// # SAFETY
|
||||||
|
/// Caller must ensure that reversing `SourcePosition` by `n` bytes does not make it before the start
|
||||||
|
/// of `Source` this `SourcePosition` was created from.
|
||||||
|
#[inline]
|
||||||
|
pub(super) unsafe fn sub(self, n: usize) -> Self {
|
||||||
|
Self::new(self.ptr.sub(n))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Read byte from this `SourcePosition`.
|
||||||
|
///
|
||||||
|
/// # SAFETY
|
||||||
|
/// Caller must ensure `SourcePosition` is not at end of source text.
|
||||||
|
///
|
||||||
|
/// # Implementation details
|
||||||
|
///
|
||||||
|
/// Using `as_ref()` for reading is copied from `core::slice::iter::next`.
|
||||||
|
/// https://doc.rust-lang.org/src/core/slice/iter.rs.html#132
|
||||||
|
/// https://doc.rust-lang.org/src/core/slice/iter/macros.rs.html#156-168
|
||||||
|
///
|
||||||
|
/// Using `ptr.as_ref().unwrap_unchecked()` instead of `*ptr` or `ptr.read()` produces
|
||||||
|
/// a 7% speed-up on Lexer benchmarks.
|
||||||
|
/// Presumably this is because it tells the compiler it can rely on the memory being immutable,
|
||||||
|
/// because if a `&mut` reference existed, that would violate Rust's aliasing rules.
|
||||||
|
#[inline]
|
||||||
|
pub(super) unsafe fn read(self) -> u8 {
|
||||||
|
// SAFETY:
|
||||||
|
// Caller guarantees `self` is not at end of source text.
|
||||||
|
// `Source` is created from a valid `&str`, so points to allocated, initialized memory.
|
||||||
|
// `Source` conceptually holds the source text `&str`, which guarantees to mutable references
|
||||||
|
// to the same memory can exist, as that would violate Rust's aliasing rules.
|
||||||
|
// Pointer is "dereferenceable" by definition as a `u8` is 1 byte and cannot span multiple objects.
|
||||||
|
// Alignment is not relevant as `u8` is aligned on 1 (i.e. no alignment requirements).
|
||||||
|
debug_assert!(!self.ptr.is_null());
|
||||||
|
*self.ptr.as_ref().unwrap_unchecked()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Return if byte is a UTF-8 continuation byte.
|
/// Return if byte is a UTF-8 continuation byte.
|
||||||
#[inline]
|
#[inline]
|
||||||
const fn is_utf8_cont_byte(byte: u8) -> bool {
|
const fn is_utf8_cont_byte(byte: u8) -> bool {
|
||||||
// 0x80 - 0xBF are continuation bytes i.e. not 1st byte of a UTF-8 character sequence
|
// 0x80 - 0xBF are continuation bytes i.e. not 1st byte of a UTF-8 character sequence
|
||||||
byte >= 0x80 && byte < 0xC0
|
byte >= 0x80 && byte < 0xC0
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Read `u8` from `*const u8` pointer.
|
|
||||||
///
|
|
||||||
/// Using `as_ref()` for reading is copied from `core::slice::iter::next`.
|
|
||||||
/// https://doc.rust-lang.org/src/core/slice/iter.rs.html#132
|
|
||||||
/// https://doc.rust-lang.org/src/core/slice/iter/macros.rs.html#156-168
|
|
||||||
///
|
|
||||||
/// This is about 7% faster than `*ptr` or `ptr.read()`, presumably because it tells the compiler
|
|
||||||
/// it can rely on the memory being immutable, because if a `&mut` reference existed, that would
|
|
||||||
/// violate Rust's aliasing rules.
|
|
||||||
///
|
|
||||||
/// # SAFETY
|
|
||||||
/// Caller must ensure pointer is non-null, and points to allocated, initialized memory.
|
|
||||||
/// Pointer must point to within an object for which no `&mut` references are currently held.
|
|
||||||
#[inline]
|
|
||||||
unsafe fn read_u8(ptr: *const u8) -> u8 {
|
|
||||||
// SAFETY: Caller guarantees pointer is non-null, and points to allocated, initialized memory.
|
|
||||||
// Caller guarantees no mutable references to same memory exist, thus upholding Rust's aliasing rules.
|
|
||||||
// Pointer is "dereferenceable" by definition as a `u8` is 1 byte and cannot span multiple objects.
|
|
||||||
// Alignment is not relevant as `u8` is aligned on 1 (i.e. no alignment requirements).
|
|
||||||
debug_assert!(!ptr.is_null());
|
|
||||||
*ptr.as_ref().unwrap_unchecked()
|
|
||||||
}
|
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue