From 6a523938de2b00f6da63e77ec2fa62f00436b91a Mon Sep 17 00:00:00 2001
From: Nick Fitzgerald
Date: Tue, 28 Sep 2021 16:07:42 -0700
Subject: [PATCH] Fix overflows when tokenizing integer literals

---
 cranelift/isle/fuzz/fuzz_targets/parse.rs |  7 ++-
 cranelift/isle/isle/src/lexer.rs          | 68 ++++++++++++++---------
 cranelift/isle/isle/src/parser.rs         |  2 +-
 3 files changed, 46 insertions(+), 31 deletions(-)

diff --git a/cranelift/isle/fuzz/fuzz_targets/parse.rs b/cranelift/isle/fuzz/fuzz_targets/parse.rs
index f2a572dbd7..747bb21e20 100644
--- a/cranelift/isle/fuzz/fuzz_targets/parse.rs
+++ b/cranelift/isle/fuzz/fuzz_targets/parse.rs
@@ -3,7 +3,8 @@
 use libfuzzer_sys::fuzz_target;
 
 fuzz_target!(|s: &str| {
-    let lexer = isle::lexer::Lexer::from_str(s, "fuzz-input.isle");
-    let mut parser = isle::parser::Parser::new(lexer);
-    let _ = parser.parse_defs();
+    if let Ok(lexer) = isle::lexer::Lexer::from_str(s, "fuzz-input.isle") {
+        let mut parser = isle::parser::Parser::new(lexer);
+        let _ = parser.parse_defs();
+    }
 });
diff --git a/cranelift/isle/isle/src/lexer.rs b/cranelift/isle/isle/src/lexer.rs
index 561a962fb3..c247c0c3d7 100644
--- a/cranelift/isle/isle/src/lexer.rs
+++ b/cranelift/isle/isle/src/lexer.rs
@@ -1,6 +1,6 @@
 //! Lexer for the ISLE language.
 
-use crate::error::{Error, Result};
+use crate::error::{Error, Result, Source};
 use std::borrow::Cow;
 use std::path::Path;
 use std::sync::Arc;
@@ -72,7 +72,7 @@ pub enum Token {
 
 impl<'a> Lexer<'a> {
     /// Create a new lexer for the given source contents and filename.
-    pub fn from_str(s: &'a str, filename: &'a str) -> Lexer<'a> {
+    pub fn from_str(s: &'a str, filename: &'a str) -> Result<Lexer<'a>> {
        let mut l = Lexer {
            filenames: vec![filename.into()],
            file_texts: vec![s.into()],
@@ -86,8 +86,8 @@ impl<'a> Lexer<'a> {
            },
            lookahead: None,
        };
-        l.reload();
-        l
+        l.reload()?;
+        Ok(l)
     }
 
     /// Create a new lexer from the given files.
@@ -131,7 +131,7 @@ impl<'a> Lexer<'a> {
             },
             lookahead: None,
         };
-        l.reload();
+        l.reload()?;
         Ok(l)
     }
 
@@ -162,7 +162,18 @@ impl<'a> Lexer<'a> {
         }
     }
 
-    fn next_token(&mut self) -> Option<(Pos, Token)> {
+    fn error(&self, pos: Pos, msg: impl Into<String>) -> Error {
+        Error::ParseError {
+            msg: msg.into(),
+            src: Source::new(
+                self.filenames[pos.file].clone(),
+                self.file_texts[pos.file].clone(),
+            ),
+            span: miette::SourceSpan::from((pos.offset, 1)),
+        }
+    }
+
+    fn next_token(&mut self) -> Result<Option<(Pos, Token)>> {
         fn is_sym_first_char(c: u8) -> bool {
             match c {
                 b'-' | b'0'..=b'9' | b'(' | b')' | b';' => false,
@@ -194,26 +205,26 @@ impl<'a> Lexer<'a> {
         }
 
         if self.pos.offset == self.buf.len() {
-            return None;
+            return Ok(None);
         }
 
         let char_pos = self.pos;
         match self.buf[self.pos.offset] {
             b'(' => {
                 self.advance_pos();
-                Some((char_pos, Token::LParen))
+                Ok(Some((char_pos, Token::LParen)))
             }
             b')' => {
                 self.advance_pos();
-                Some((char_pos, Token::RParen))
+                Ok(Some((char_pos, Token::RParen)))
             }
             b'@' => {
                 self.advance_pos();
-                Some((char_pos, Token::At))
+                Ok(Some((char_pos, Token::At)))
             }
             b'<' => {
                 self.advance_pos();
-                Some((char_pos, Token::Lt))
+                Ok(Some((char_pos, Token::Lt)))
             }
             c if is_sym_first_char(c) => {
                 let start = self.pos.offset;
@@ -226,7 +237,7 @@ impl<'a> Lexer<'a> {
                 let end = self.pos.offset;
                 let s = std::str::from_utf8(&self.buf[start..end])
                     .expect("Only ASCII characters, should be UTF-8");
-                Some((start_pos, Token::Symbol(s.to_string())))
+                Ok(Some((start_pos, Token::Symbol(s.to_string()))))
             }
             c if (c >= b'0' && c <= b'9') || c == b'-' => {
                 let start_pos = self.pos;
@@ -236,11 +247,16 @@ impl<'a> Lexer<'a> {
                 } else {
                     false
                 };
-                let mut num = 0;
+                let mut num = 0_i64;
                 while self.pos.offset < self.buf.len()
                     && (self.buf[self.pos.offset] >= b'0' && self.buf[self.pos.offset] <= b'9')
                 {
-                    num = (num * 10) + (self.buf[self.pos.offset] - b'0') as i64;
+                    let base = num
+                        .checked_mul(10)
+                        .ok_or_else(|| self.error(start_pos, "integer literal too large"))?;
+                    num = base
+                        .checked_add((self.buf[self.pos.offset] - b'0') as i64)
+                        .ok_or_else(|| self.error(start_pos, "integer literal too large"))?;
                     self.advance_pos();
                 }
 
@@ -249,16 +265,24 @@ impl<'a> Lexer<'a> {
                 } else {
                     Token::Int(num)
                 };
-                Some((start_pos, tok))
+                Ok(Some((start_pos, tok)))
             }
             c => panic!("Unexpected character '{}' at offset {}", c, self.pos.offset),
         }
     }
 
-    fn reload(&mut self) {
+    /// Get the next token from this lexer's token stream, if any.
+    pub fn next(&mut self) -> Result<Option<(Pos, Token)>> {
+        let tok = self.lookahead.take();
+        self.reload()?;
+        Ok(tok)
+    }
+
+    fn reload(&mut self) -> Result<()> {
         if self.lookahead.is_none() && self.pos.offset < self.buf.len() {
-            self.lookahead = self.next_token();
+            self.lookahead = self.next_token()?;
         }
+        Ok(())
     }
 
     /// Peek ahead at the next token.
@@ -272,16 +296,6 @@ impl<'a> Lexer<'a> {
     }
 }
 
-impl<'a> std::iter::Iterator for Lexer<'a> {
-    type Item = (Pos, Token);
-
-    fn next(&mut self) -> Option<(Pos, Token)> {
-        let tok = self.lookahead.take();
-        self.reload();
-        tok
-    }
-}
-
 impl Token {
     /// Is this an `Int` token?
     pub fn is_int(&self) -> bool {
diff --git a/cranelift/isle/isle/src/parser.rs b/cranelift/isle/isle/src/parser.rs
index 4588e1c99b..7ea9b6e489 100644
--- a/cranelift/isle/isle/src/parser.rs
+++ b/cranelift/isle/isle/src/parser.rs
@@ -37,7 +37,7 @@ impl<'a> Parser<'a> {
             if !f(peek) {
                 return Err(self.error(pos, format!("Unexpected token {:?}", peek)));
             }
-            Ok(self.lexer.next().unwrap().1)
+            Ok(self.lexer.next()?.unwrap().1)
         } else {
             Err(self.error(self.lexer.pos(), "Unexpected EOF".to_string()))
         }
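
Note (not part of the patch above): the core of the change is replacing the unchecked accumulation `num = num * 10 + digit` with `checked_mul`/`checked_add`, so an over-long literal becomes a parse error instead of an arithmetic overflow, and the lexer's `next` therefore returns `Result`. A minimal standalone sketch of the same idea follows; the `parse_decimal` helper is illustrative only and does not exist in the ISLE crate.

// Standalone sketch of checked decimal accumulation, mirroring the patched lexer loop.
fn parse_decimal(digits: &str) -> Result<i64, String> {
    let mut num: i64 = 0;
    for b in digits.bytes() {
        if !b.is_ascii_digit() {
            return Err(format!("unexpected byte {:?} in integer literal", b as char));
        }
        // `num * 10 + digit` can overflow i64 for long literals; the checked
        // operations turn that into an error instead of a panic or silent wrap.
        num = num
            .checked_mul(10)
            .and_then(|n| n.checked_add(i64::from(b - b'0')))
            .ok_or_else(|| "integer literal too large".to_string())?;
    }
    Ok(num)
}

fn main() {
    assert_eq!(parse_decimal("42"), Ok(42));
    // One past i64::MAX: the old unchecked code would overflow here.
    assert!(parse_decimal("9223372036854775808").is_err());
}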