From 62bc8c5cea528fb93eebed36f33c18850a7f4277 Mon Sep 17 00:00:00 2001 From: overlookmotel Date: Tue, 2 Jan 2024 03:05:28 +0000 Subject: [PATCH] fix(parser): error on source larger than 4 GiB (#1860) `Token` and `Span` both represent `start` and `end` as `u32`. This limits the size of source which can be parsed to `u32::MAX` bytes. https://github.com/oxc-project/oxc/blob/19577709dbcfc9761cfcb1b39ec7f3ebdc810285/crates/oxc_span/src/span.rs#L14-L20 However, this constraint is currently not enforced. In a release build, code will not panic on arithmetic overflow, so `start`/`end` could wrap around back to zero if source is 4 GiB or more. That'd produce nonsense spans. But worse, the lexer relies in some places on `self.current.token.start` being correct, so if the value wrapped around, possibly it'd keep rewinding to the start of the source and lexing it again, causing an infinite loop. In the worst case, if for some reason an application's public API used OXC's parser with user-supplied source code (parser-as-a-service!), this could be exploited for denial of service. This PR adds a check to catch this at the start of parsing instead: `Parser::new` substitutes a short dummy source and `parse()` reports an `OverlongSource` diagnostic, while `Lexer::new` gains a debug assertion. This does add an extra instruction, but I imagine the effect will be negligible compared to the work required to parse the code. 
--- crates/oxc_parser/src/diagnostics.rs | 5 +++ crates/oxc_parser/src/lexer/mod.rs | 7 +++- crates/oxc_parser/src/lib.rs | 60 +++++++++++++++++++++++++++- 3 files changed, 69 insertions(+), 3 deletions(-) diff --git a/crates/oxc_parser/src/diagnostics.rs b/crates/oxc_parser/src/diagnostics.rs index 8277557a0..997fc1bd8 100644 --- a/crates/oxc_parser/src/diagnostics.rs +++ b/crates/oxc_parser/src/diagnostics.rs @@ -4,6 +4,11 @@ use oxc_diagnostics::{ }; use oxc_span::Span; +#[derive(Debug, Error, Diagnostic)] +#[error("Source length exceeds 4 GiB limit")] +#[diagnostic()] +pub struct OverlongSource; + #[derive(Debug, Error, Diagnostic)] #[error("Flow is not supported")] #[diagnostic()] diff --git a/crates/oxc_parser/src/lexer/mod.rs b/crates/oxc_parser/src/lexer/mod.rs index ac74bfb38..7933e6379 100644 --- a/crates/oxc_parser/src/lexer/mod.rs +++ b/crates/oxc_parser/src/lexer/mod.rs @@ -32,7 +32,7 @@ use self::{ string_builder::AutoCow, trivia_builder::TriviaBuilder, }; -use crate::diagnostics; +use crate::{diagnostics, MAX_LEN}; #[derive(Debug, Clone)] pub struct LexerCheckpoint<'a> { @@ -72,6 +72,11 @@ pub struct Lexer<'a> { #[allow(clippy::unused_self)] impl<'a> Lexer<'a> { pub fn new(allocator: &'a Allocator, source: &'a str, source_type: SourceType) -> Self { + // Token's start and end are u32s, so limit for length of source is u32::MAX bytes. + // Only a debug assertion is required, as parser checks length of source before calling + // this method. + debug_assert!(source.len() <= MAX_LEN, "Source length exceeds MAX_LEN"); + let token = Token { // the first token is at the start of file, so is allows on a new line is_on_new_line: true, diff --git a/crates/oxc_parser/src/lib.rs b/crates/oxc_parser/src/lib.rs index 8765b43a3..73f902c03 100644 --- a/crates/oxc_parser/src/lib.rs +++ b/crates/oxc_parser/src/lib.rs @@ -84,6 +84,10 @@ use crate::{ state::ParserState, }; +/// Maximum length of source in bytes which can be parsed (~4 GiB). 
+// Span's start and end are u32s, so size limit is u32::MAX bytes. +pub const MAX_LEN: usize = u32::MAX as usize; + /// Return value of parser consisting of AST, errors and comments /// /// The parser always return a valid AST. @@ -135,8 +139,12 @@ pub struct Parser<'a> { impl<'a> Parser<'a> { /// Create a new parser pub fn new(allocator: &'a Allocator, source_text: &'a str, source_type: SourceType) -> Self { + // If source exceeds size limit, substitute a short source which will fail to parse. + // `parse()` will convert error to `diagnostics::OverlongSource`. + let source_text_for_lexer = if source_text.len() > MAX_LEN { "\0" } else { source_text }; + Self { - lexer: Lexer::new(allocator, source_text, source_type), + lexer: Lexer::new(allocator, source_text_for_lexer, source_type), source_type, source_text, errors: vec![], @@ -177,7 +185,9 @@ impl<'a> Parser<'a> { let (program, panicked) = match self.parse_program() { Ok(program) => (program, false), Err(error) => { - self.error(self.flow_error().unwrap_or(error)); + self.error( + self.flow_error().unwrap_or_else(|| self.overlong_error().unwrap_or(error)), + ); let program = self.ast.program( Span::default(), self.source_type, @@ -227,6 +237,15 @@ impl<'a> Parser<'a> { None } + /// Check if source length exceeds MAX_LEN, if the file cannot be parsed. + /// Original parsing error is not real - `Parser::new` substituted "\0" as the source text. 
+ fn overlong_error(&self) -> Option<Error> { + if self.source_text.len() > MAX_LEN { + return Some(diagnostics::OverlongSource.into()); + } + None + } + /// Return error info at current token /// # Panics /// * The lexer did not push a diagnostic when `Kind::Undetermined` is returned @@ -280,4 +299,41 @@ mod test { assert!(ret.program.is_empty()); assert_eq!(ret.errors.first().unwrap().to_string(), "Flow is not supported"); } + + // Source with length u32::MAX + 1 fails to parse + #[test] + fn overlong_source() { + let allocator = Allocator::default(); + let source_type = SourceType::default(); + let source = "var x = 123456;\n".repeat(256 * 1024 * 1024); + assert_eq!(source.len() - 1, u32::MAX as usize); + let ret = Parser::new(&allocator, &source, source_type).parse(); + assert!(ret.program.is_empty()); + assert!(ret.panicked); + assert_eq!(ret.errors.len(), 1); + assert_eq!(ret.errors.first().unwrap().to_string(), "Source length exceeds 4 GiB limit"); + } + + // Source with length u32::MAX parses OK. + // This test takes over 1 minute on an M1 Macbook Pro unless compiled in release mode. + // `not(debug_assertions)` is a proxy for detecting release mode. + #[cfg(not(debug_assertions))] + #[test] + fn legal_length_source() { + let allocator = Allocator::default(); + let source_type = SourceType::default(); + + // Build a string u32::MAX bytes long which doesn't take too long to parse + let head = "const x = 1;\n/*"; + let foot = "*/\nconst y = 2;\n"; + let mut source = "x".repeat(u32::MAX as usize); + source.replace_range(..head.len(), head); + source.replace_range(source.len() - foot.len().., foot); + assert_eq!(source.len(), u32::MAX as usize); + + let ret = Parser::new(&allocator, &source, source_type).parse(); + assert!(!ret.panicked); + assert!(ret.errors.is_empty()); + assert_eq!(ret.program.body.len(), 2); + } }