fix(parser): error on source larger than 4 GiB (#1860)

`Token` and `Span` both represent `start` and `end` as `u32`.

This limits the size of source that can be parsed to `u32::MAX` bytes.


19577709db/crates/oxc_span/src/span.rs (L14-L20)

However, this constraint is currently not enforced.

In a release build, code will not panic on arithmetic overflow, so
`start`/`end` could wrap around back to zero if source is 4 GiB or more.

That'd produce nonsense spans. But worse, the lexer relies in some
places on `self.current.token.start` being correct, so if the value
wrapped around, possibly it'd keep rewinding to the start of the source
and lexing it again, causing an infinite loop.

In the worst case, if an application's public API passed user-supplied
source code to OXC's parser (parser-as-a-service!), this could
be exploited for denial of service.

This PR adds an assertion to catch this at the start of parsing instead.

This does add an extra instruction, but I imagine the effect will be
negligible compared to the work required to parse the code.
This commit is contained in:
overlookmotel 2024-01-02 03:05:28 +00:00 committed by GitHub
parent 8374197a08
commit 62bc8c5cea
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 69 additions and 3 deletions

View file

@ -4,6 +4,11 @@ use oxc_diagnostics::{
}; };
use oxc_span::Span; use oxc_span::Span;
/// Diagnostic for source text that is too long to be parsed.
///
/// Its display message is "Source length exceeds 4 GiB limit".
#[derive(Debug, Error, Diagnostic)]
#[error("Source length exceeds 4 GiB limit")]
#[diagnostic()]
pub struct OverlongSource;
#[derive(Debug, Error, Diagnostic)] #[derive(Debug, Error, Diagnostic)]
#[error("Flow is not supported")] #[error("Flow is not supported")]
#[diagnostic()] #[diagnostic()]

View file

@ -32,7 +32,7 @@ use self::{
string_builder::AutoCow, string_builder::AutoCow,
trivia_builder::TriviaBuilder, trivia_builder::TriviaBuilder,
}; };
use crate::diagnostics; use crate::{diagnostics, MAX_LEN};
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
pub struct LexerCheckpoint<'a> { pub struct LexerCheckpoint<'a> {
@ -72,6 +72,11 @@ pub struct Lexer<'a> {
#[allow(clippy::unused_self)] #[allow(clippy::unused_self)]
impl<'a> Lexer<'a> { impl<'a> Lexer<'a> {
pub fn new(allocator: &'a Allocator, source: &'a str, source_type: SourceType) -> Self { pub fn new(allocator: &'a Allocator, source: &'a str, source_type: SourceType) -> Self {
// Token's start and end are u32s, so limit for length of source is u32::MAX bytes.
// Only a debug assertion is required, as parser checks length of source before calling
// this method.
debug_assert!(source.len() <= MAX_LEN, "Source length exceeds MAX_LEN");
let token = Token { let token = Token {
// the first token is at the start of file, so is allows on a new line // the first token is at the start of file, so is allows on a new line
is_on_new_line: true, is_on_new_line: true,

View file

@ -84,6 +84,10 @@ use crate::{
state::ParserState, state::ParserState,
}; };
/// Maximum length of source in bytes which can be parsed (`u32::MAX` bytes, i.e. 4 GiB - 1).
// Span's start and end are u32s, so size limit is u32::MAX bytes.
pub const MAX_LEN: usize = u32::MAX as usize;
/// Return value of parser consisting of AST, errors and comments /// Return value of parser consisting of AST, errors and comments
/// ///
/// The parser always return a valid AST. /// The parser always return a valid AST.
@ -135,8 +139,12 @@ pub struct Parser<'a> {
impl<'a> Parser<'a> { impl<'a> Parser<'a> {
/// Create a new parser /// Create a new parser
pub fn new(allocator: &'a Allocator, source_text: &'a str, source_type: SourceType) -> Self { pub fn new(allocator: &'a Allocator, source_text: &'a str, source_type: SourceType) -> Self {
// If source exceeds size limit, substitute a short source which will fail to parse.
// `parse()` will convert error to `diagnostics::OverlongSource`.
let source_text_for_lexer = if source_text.len() > MAX_LEN { "\0" } else { source_text };
Self { Self {
lexer: Lexer::new(allocator, source_text, source_type), lexer: Lexer::new(allocator, source_text_for_lexer, source_type),
source_type, source_type,
source_text, source_text,
errors: vec![], errors: vec![],
@ -177,7 +185,9 @@ impl<'a> Parser<'a> {
let (program, panicked) = match self.parse_program() { let (program, panicked) = match self.parse_program() {
Ok(program) => (program, false), Ok(program) => (program, false),
Err(error) => { Err(error) => {
self.error(self.flow_error().unwrap_or(error)); self.error(
self.flow_error().unwrap_or_else(|| self.overlong_error().unwrap_or(error)),
);
let program = self.ast.program( let program = self.ast.program(
Span::default(), Span::default(),
self.source_type, self.source_type,
@ -227,6 +237,15 @@ impl<'a> Parser<'a> {
None None
} }
/// Produce an `OverlongSource` diagnostic when the source text is longer than `MAX_LEN`.
///
/// Used once parsing has failed. For an over-long source the original parse error is an
/// artifact, because `Parser::new` handed "\0" to the lexer instead of the real source text.
/// Returns `None` when the source is within the limit.
fn overlong_error(&self) -> Option<Error> {
    let too_long = self.source_text.len() > MAX_LEN;
    too_long.then(|| diagnostics::OverlongSource.into())
}
/// Return error info at current token /// Return error info at current token
/// # Panics /// # Panics
/// * The lexer did not push a diagnostic when `Kind::Undetermined` is returned /// * The lexer did not push a diagnostic when `Kind::Undetermined` is returned
@ -280,4 +299,41 @@ mod test {
assert!(ret.program.is_empty()); assert!(ret.program.is_empty());
assert_eq!(ret.errors.first().unwrap().to_string(), "Flow is not supported"); assert_eq!(ret.errors.first().unwrap().to_string(), "Flow is not supported");
} }
// A source one byte longer than u32::MAX must be rejected with the overlong-source error.
#[test]
fn overlong_source() {
    // The line is 16 bytes long; repeated 2^28 times it gives 2^32 bytes, i.e. u32::MAX + 1.
    let text = "var x = 123456;\n".repeat(256 * 1024 * 1024);
    assert_eq!(text.len() - 1, u32::MAX as usize);

    let allocator = Allocator::default();
    let ret = Parser::new(&allocator, &text, SourceType::default()).parse();

    // Parsing is reported as panicked, with an empty program and exactly one error.
    assert!(ret.program.is_empty());
    assert!(ret.panicked);
    assert_eq!(ret.errors.len(), 1);
    assert_eq!(ret.errors.first().unwrap().to_string(), "Source length exceeds 4 GiB limit");
}
// A source of exactly u32::MAX bytes is within the limit and must parse successfully.
// Outside release mode this test takes over 1 minute on an M1 Macbook Pro, so it is only
// compiled when `debug_assertions` is off, which serves as a proxy for a release build.
#[cfg(not(debug_assertions))]
#[test]
fn legal_length_source() {
    let target_len = u32::MAX as usize;

    // Cheap-to-parse layout: one statement, one enormous block comment, one more statement.
    let prefix = "const x = 1;\n/*";
    let suffix = "*/\nconst y = 2;\n";

    // Start from filler bytes, then overwrite both ends in place so the length is unchanged.
    let mut text = "x".repeat(target_len);
    text.replace_range(..prefix.len(), prefix);
    let suffix_start = text.len() - suffix.len();
    text.replace_range(suffix_start.., suffix);
    assert_eq!(text.len(), target_len);

    let allocator = Allocator::default();
    let ret = Parser::new(&allocator, &text, SourceType::default()).parse();

    assert!(!ret.panicked);
    assert!(ret.errors.is_empty());
    assert_eq!(ret.program.body.len(), 2);
}
} }