peepmatic: Introduce the peepmatic-fuzzing crate

This crate contains oracles, generators, and fuzz targets for use with fuzzing
engines (e.g. libFuzzer). This doesn't contain the actual
`libfuzzer_sys::fuzz_target!` definitions (those are in the `peepmatic-fuzz`
crate), but those definitions are one-liners calling out to functions
defined in this crate.
This commit is contained in:
Nick Fitzgerald
2020-05-01 15:47:47 -07:00
parent 2828da1f56
commit 1a7670f964
6 changed files with 802 additions and 0 deletions

View File

@@ -0,0 +1,22 @@
# Manifest for the `peepmatic-fuzzing` crate, which holds the oracles and
# generators that the fuzz targets call into.
[package]
name = "peepmatic-fuzzing"
version = "0.1.0"
authors = ["Nick Fitzgerald <fitzgen@gmail.com>"]
edition = "2018"
# `publish = false` keeps this crate from being published to a registry.
publish = false
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
# Builds structured inputs from raw bytes (and shrinks them) in the `check` runner.
arbitrary = { version = "0.4.1", features = ["derive"] }
# Serialization format used by the serialize/deserialize round-trip checks.
bincode = "1.2.1"
# Logging setup and macros used by the oracles.
env_logger = "0.7.1"
# Reference implementation for differential testing of `peepmatic-automata`.
fst = "0.4.1"
log = "0.4.8"
# The crates under test, referenced by relative path.
peepmatic = { path = "../.." }
peepmatic-automata = { path = "../automata", features = ["serde"] }
peepmatic-runtime = { path = "../runtime", features = ["construct"] }
peepmatic-test = { path = "../test" }
# `small_rng` enables `rand::rngs::SmallRng`, which the `check` runner seeds.
rand = { version = "0.7.3", features = ["small_rng"] }
serde = "1.0.106"
# Parser infrastructure used to (re)parse the DSL source text.
wast = "13.0.0"

View File

@@ -0,0 +1,187 @@
//! Helpers for fuzzing the `peepmatic-automata` crate.
use peepmatic_automata::{Automaton, Builder, Output};
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet};
use std::hash::Hash;
/// Serialize the given automaton with `bincode` and immediately deserialize
/// it again, returning the decoded copy.
///
/// The oracles in this module run their checks against the returned value, so
/// the serialization code is exercised along with the automaton itself.
///
/// # Panics
///
/// Panics if either the serialization or the deserialization step fails.
fn serde_roundtrip<TAlphabet, TState, TOutput>(
    automata: Automaton<TAlphabet, TState, TOutput>,
) -> Automaton<TAlphabet, TState, TOutput>
where
    TAlphabet: Serialize + for<'de> Deserialize<'de> + Clone + Eq + Hash + Ord,
    TState: Serialize + for<'de> Deserialize<'de> + Clone + Eq + Hash,
    TOutput: Serialize + for<'de> Deserialize<'de> + Output,
{
    let bytes = bincode::serialize(&automata).expect("should serialize OK");
    bincode::deserialize::<Automaton<TAlphabet, TState, TOutput>>(bytes.as_slice())
        .expect("should deserialize OK")
}
/// Construct an automaton from the given input-output pairs, and assert
/// that:
///
/// * Putting in each of the input strings should result in the expected output
///   string.
///
/// * Putting in an input string that is not one of the given inputs from our
///   input-output pairs should never yield an output value.
///
/// Each element of `input_output_pairs` is one sequence of `(input symbol,
/// output fragment)` steps. The concatenated input symbols form the input
/// string and the concatenated output fragments form its expected output.
/// Empty sequences, and sequences whose input string repeats an earlier one,
/// are discarded before the automaton is built. If nothing is left, the
/// function returns without checking anything.
///
/// # Panics
///
/// Panics when the automaton (after a serialization round trip) returns the
/// wrong output for a known input, or returns any output for a mutated input
/// that is not itself a known input.
pub fn simple_automata(input_output_pairs: Vec<Vec<(u8, Vec<u8>)>>) {
    let _ = env_logger::try_init();

    // Concatenate the input symbols of one sequence into its full input string.
    let full_input = |pair: &[(u8, Vec<u8>)]| -> Vec<u8> {
        pair.iter().map(|(input, _)| *input).collect()
    };

    let mut inputs = HashSet::new();
    let mut input_output_pairs: Vec<_> = input_output_pairs
        .into_iter()
        // `insert` returns `false` for an input string we have already seen,
        // which filters out duplicate inputs.
        .filter(|pair| !pair.is_empty() && inputs.insert(full_input(pair)))
        .collect();

    // Order the sequences by their full input strings. The key is computed
    // once per element rather than being rebuilt on every comparison.
    input_output_pairs.sort_by_cached_key(|pair| full_input(pair));

    if input_output_pairs.is_empty() {
        return;
    }

    // A map from one of our concatenated input strings to its concatenated
    // output.
    let mut expected = HashMap::with_capacity(input_output_pairs.len());

    let mut builder = Builder::<u8, (), Vec<u8>>::new();
    for pair in &input_output_pairs {
        let mut input_string = vec![];
        let mut output_string = vec![];

        let mut ins = builder.insert();
        for (input, output) in pair.iter().cloned() {
            input_string.push(input);
            output_string.extend(output.iter().copied());
            ins.next(input, output);
        }

        // Duplicates were filtered out above, so every input string is new.
        let old = expected.insert(input_string, output_string);
        assert!(old.is_none());

        ins.finish();
    }

    let automata = builder.finish();
    let automata = serde_roundtrip(automata);

    // Assert that each of our input strings yields the expected output.
    for (input, expected_output) in &expected {
        log::debug!("Testing input: {:?}", input);
        let actual_output = automata.get(input);
        assert_eq!(
            actual_output.as_ref(),
            Some(expected_output),
            "automaton should yield the expected output for input {:?}",
            input
        );
    }

    // Test that mutations of our input strings (that aren't themselves other
    // input strings!) do not yield any output.
    for input in expected.keys() {
        for i in 0..input.len() {
            let mut mutated = input.clone();
            mutated[i] = mutated[i].wrapping_add(1);
            log::debug!("Testing mutated input: {:?}", mutated);
            if !expected.contains_key(&mutated) {
                assert!(
                    automata.get(&mutated).is_none(),
                    "automaton should not yield an output for unknown input {:?}",
                    mutated
                );
            }
        }
    }
}
/// Differentially test `peepmatic-automata` against the `fst` crate, another
/// implementation of the finite-state transducer construction algorithm that
/// we use.
///
/// Every non-empty key of `map` is inserted, in sorted order, into both an
/// `fst` map and one of our automata. Both are then queried with every key and
/// with single-byte mutations of every key, and must always agree.
///
/// # Panics
///
/// Panics when the two implementations disagree, or when either is missing
/// an entry for one of the inserted keys.
pub fn fst_differential(map: HashMap<Vec<u8>, u64>) {
    let _ = env_logger::try_init();

    let mut inputs: Vec<Vec<u8>> = map
        .keys()
        .filter(|key| !key.is_empty())
        .cloned()
        .collect();
    inputs.sort();
    inputs.dedup();

    if inputs.is_empty() {
        return;
    }

    let mut fst_builder = fst::MapBuilder::memory();
    let mut builder = Builder::<u8, (), u64>::new();

    for key in &inputs {
        let value = map[key];
        fst_builder.insert(key, value).unwrap();

        // Emit the whole value on the first transition and zero on the rest.
        let mut ins = builder.insert();
        for (index, byte) in key.iter().enumerate() {
            let output = if index == 0 { value } else { 0 };
            ins.next(*byte, output);
        }
        ins.finish();
    }

    let reference = fst_builder.into_map();
    let automata = serde_roundtrip(builder.finish());

    for key in &inputs {
        // Inputs that we know are in the automata must produce the same
        // result as they do in `fst`.
        log::debug!("Testing input {:?}", key);
        let expected = reference
            .get(key)
            .expect("`fst` should have entry for `inp`");
        let actual = automata
            .get(key)
            .expect("automata should have entry for `inp`");
        assert_eq!(expected, actual);

        // Mutated inputs may or may not be in the automata; either way both
        // implementations must give the same answer.
        for position in 0..key.len() {
            let mut mutated = key.clone();
            mutated[position] = mutated[position].wrapping_add(1);
            log::debug!("Testing mutated input {:?}", mutated);

            let expected = reference.get(&mutated);
            let actual = automata.get(&mutated);
            assert_eq!(expected, actual);
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Run the `simple_automata` oracle on random inputs.
    #[test]
    fn check_simple_automata() {
        crate::check(simple_automata);
    }

    /// Run the `fst_differential` oracle on random inputs.
    #[test]
    fn check_fst_differential() {
        crate::check(fst_differential);
    }

    /// Fixed input for `simple_automata`, kept as a regression test.
    #[test]
    fn regression_test_0() {
        let pairs = vec![
            vec![(0, vec![0]), (0, vec![1])],
            vec![(0, vec![2])],
        ];
        simple_automata(pairs);
    }

    /// Fixed input for `fst_differential`, kept as a regression test.
    #[test]
    fn regression_test_1() {
        let entries = vec![(vec![1, 3], 5), (vec![1, 2], 4)];
        let map = entries.into_iter().collect();
        fst_differential(map);
    }

    /// Fixed input for `simple_automata`, kept as a regression test.
    #[test]
    fn regression_test_2() {
        let pairs = vec![
            vec![(0, vec![11]), (0, vec![])],
            vec![(0, vec![11])],
        ];
        simple_automata(pairs);
    }
}

View File

@@ -0,0 +1,71 @@
//! Fuzz testing utilities related to AST pattern matching.
use peepmatic_runtime::PeepholeOptimizations;
use std::path::Path;
use std::str;
/// Treat `data` as UTF-8 source text of our DSL and try to compile it.
///
/// Input that is not valid UTF-8, or that does not compile, is ignored. For
/// source that does compile, this checks that:
///
/// * the compiled peephole optimizations serialize and deserialize again, and
///
/// * compiling the same source a second time serializes to the same bytes.
///
/// # Panics
///
/// Panics if any of those checks fail.
pub fn compile(data: &[u8]) {
    let source = if let Ok(text) = str::from_utf8(data) {
        text
    } else {
        return;
    };

    let fuzz_path = Path::new("fuzz");
    let first = if let Ok(compiled) = peepmatic::compile_str(source, fuzz_path) {
        compiled
    } else {
        return;
    };

    // The peephole optimizer should survive a serialization round trip.
    let first_bytes =
        bincode::serialize(&first).expect("should serialize peephole optimizations OK");
    let _: PeepholeOptimizations = bincode::deserialize(&first_bytes)
        .expect("should deserialize peephole optimizations OK");

    // Compilation should be deterministic: the same source text must produce
    // the same serialized bytes the second time around.
    let second = peepmatic::compile_str(source, fuzz_path)
        .expect("should be able to compile source text again, if it compiled OK the first time");
    let second_bytes =
        bincode::serialize(&second).expect("should serialize second peephole optimizations OK");
    assert_eq!(first_bytes, second_bytes);
}
// Tests for `compile`: one randomized check plus fixed DSL snippets kept as
// regression inputs (presumably earlier failures; their origin is not recorded here).
#[cfg(test)]
mod tests {
use super::*;
/// Run `compile` on the bytes of randomly generated strings.
#[test]
fn check_compile() {
crate::check(|s: String| compile(s.as_bytes()));
}
/// Two rules whose left-hand sides differ only in one variable name.
#[test]
fn regression_0() {
compile(
b"
(=> (bor (bor $x $y) $y) $x)
(=> (bor (bor $x $z) $y) $x)
",
);
}
/// Three `bor` rules, two of which use the literal `0`.
#[test]
fn regression_1() {
compile(
b"
(=> (bor (bor $x $y) 0) $x)
(=> (bor $x 0) $x)
(=> (bor $y $x) $x)
",
);
}
/// A single rule with a large integer literal operand.
#[test]
fn regression_2() {
compile(
b"
(=> (sshr $x 11111111110) $x)
",
);
}
}

View File

@@ -0,0 +1,374 @@
//! Interpreting compiled peephole optimizations against test instruction sequences.
use peepmatic::{
Constraint, Dfs, DynAstRef, Optimizations, Pattern, Span, TraversalEvent, ValueLiteral,
Variable,
};
use peepmatic_runtime::{
cc::ConditionCode,
operator::TypingContext as TypingContextTrait,
part::Constant,
r#type::BitWidth,
r#type::{Kind, Type},
};
use peepmatic_test::{Program, TestIsa};
use std::collections::{BTreeMap, HashMap};
use std::path::Path;
use std::str;
/// Compile the given source text, and if it is a valid set of optimizations,
/// then interpret the optimizations against test instruction sequences created
/// to reflect the optimizations.
///
/// Input that is not valid UTF-8, or that fails to compile, is silently
/// ignored.
///
/// # Panics
///
/// Panics if no optimization matches an instruction sequence generated from
/// an optimization's left-hand side, unless that left-hand side involves an
/// explicit or implicit `bit-width` precondition. Also panics if source that
/// compiled cannot be re-parsed or re-verified.
pub fn interp(data: &[u8]) {
let _ = env_logger::try_init();
let source = match str::from_utf8(data) {
Err(_) => return,
Ok(s) => s,
};
let peep_opts = match peepmatic::compile_str(source, Path::new("fuzz")) {
Err(_) => return,
Ok(o) => o,
};
// Build an optimizer for the test ISA, configured with a 32-bit native word.
let mut optimizer = peep_opts.optimizer(TestIsa {
native_word_size_in_bits: 32,
});
// Okay, we know it compiles and verifies alright, so (re)parse the AST.
let buf = wast::parser::ParseBuffer::new(&source).unwrap();
let ast = wast::parser::parse::<Optimizations>(&buf).unwrap();
// And we need access to the assigned types, so re-verify it as well.
peepmatic::verify(&ast).unwrap();
// Walk over each optimization and create an instruction sequence that
// matches the optimization.
let mut program = Program::default();
for opt in &ast.optimizations {
// The instruction sequence we generate must match an optimization (not
// necessarily *this* optimization, if there is another that is more
// specific but also matches) unless there is an `bit-width`
// precondition or an implicit `bit-width` precondition via a type
// ascription. When those things exist, we might have constructed
// instructions with the wrong bit widths to match.
let mut allow_no_match = false;
// The last instruction we generated. After we've generated the full
// instruction sequence, this will be its root.
let mut last_inst = None;
// Remember the instructions associated with variables and constants, so
// that when they appear multiple times, we reuse the same instruction.
let mut id_to_inst = HashMap::new();
// Map from a pattern's span to the instruction we generated for
// it. This allows parent operations to get the instructions for their
// children.
let mut span_to_inst = BTreeMap::new();
for (te, lhs) in Dfs::new(&opt.lhs) {
// NB: We use a post-order traversal because we want arguments to be
// generated before they are used.
if te != TraversalEvent::Exit {
continue;
}
match lhs {
DynAstRef::Precondition(p) => {
allow_no_match |= p.constraint == Constraint::BitWidth;
}
DynAstRef::Pattern(Pattern::Operation(op)) => {
// An operation that carries a type counts as the implicit
// `bit-width` precondition described above.
allow_no_match |= op.r#type.get().is_some();
// The first `num_imms` operands are taken as immediates; the
// remaining operands are taken as arguments.
let num_imms = op.operator.immediates_arity() as usize;
// Generate this operation's immediates.
let mut imm_tys = vec![];
op.operator
.immediate_types(&mut TypingContext, op.span(), &mut imm_tys);
let imms: Vec<_> = op
.operands
.iter()
.take(num_imms)
.zip(imm_tys)
.map(|(pat, ty)| match pat {
// Literal immediates keep their written value.
Pattern::ValueLiteral(ValueLiteral::Integer(i)) => {
Constant::Int(i.value as _, BitWidth::ThirtyTwo).into()
}
Pattern::ValueLiteral(ValueLiteral::Boolean(b)) => {
Constant::Bool(b.value, BitWidth::One).into()
}
Pattern::ValueLiteral(ValueLiteral::ConditionCode(cc)) => cc.cc.into(),
// Non-literal immediates get a fixed placeholder value picked
// from the type reported by `TypingContext`.
Pattern::Constant(_) | Pattern::Variable(_) => match ty {
TypeOrConditionCode::ConditionCode => ConditionCode::Eq.into(),
TypeOrConditionCode::Type(ty) => match ty.kind {
Kind::Int => Constant::Int(1, ty.bit_width).into(),
Kind::Bool => Constant::Bool(false, ty.bit_width).into(),
Kind::Void | Kind::CpuFlags => {
unreachable!("void and cpu flags cannot be immediates")
}
},
},
Pattern::Operation(_) => {
unreachable!("operations not allowed as immediates")
}
})
.collect();
// Generate (or collect already-generated) instructions for
// this operation's arguments.
let mut arg_tys = vec![];
op.operator
.param_types(&mut TypingContext, op.span(), &mut arg_tys);
let args: Vec<_> = op
.operands
.iter()
.skip(num_imms)
.zip(arg_tys)
.map(|(pat, ty)| match pat {
// Nested operations were already generated earlier in this
// post-order walk, so look them up by span.
Pattern::Operation(op) => span_to_inst[&op.span()],
Pattern::ValueLiteral(ValueLiteral::Integer(i)) => program.r#const(
Constant::Int(i.value as _, BitWidth::ThirtyTwo),
BitWidth::ThirtyTwo,
),
Pattern::ValueLiteral(ValueLiteral::Boolean(b)) => program.r#const(
Constant::Bool(b.value, BitWidth::One),
BitWidth::ThirtyTwo,
),
Pattern::ValueLiteral(ValueLiteral::ConditionCode(_)) => {
unreachable!("condition codes cannot be arguments")
}
Pattern::Constant(peepmatic::Constant { id, .. })
| Pattern::Variable(Variable { id, .. }) => match ty {
TypeOrConditionCode::Type(ty) => {
// Create a placeholder constant the first time this id is
// seen; later uses get the same instruction back.
*id_to_inst.entry(id).or_insert_with(|| match ty.kind {
Kind::Int => program.r#const(
Constant::Int(1, ty.bit_width),
BitWidth::ThirtyTwo,
),
Kind::Bool => program.r#const(
Constant::Bool(false, ty.bit_width),
BitWidth::ThirtyTwo,
),
Kind::CpuFlags => {
unreachable!("cpu flags cannot be an argument")
}
Kind::Void => unreachable!("void cannot be an argument"),
})
}
TypeOrConditionCode::ConditionCode => {
unreachable!("condition codes cannot be arguments")
}
},
})
.collect();
let ty = match op.operator.result_type(&mut TypingContext, op.span()) {
TypeOrConditionCode::Type(ty) => ty,
TypeOrConditionCode::ConditionCode => {
unreachable!("condition codes cannot be operation results")
}
};
let inst = program.new_instruction(op.operator, ty, imms, args);
last_inst = Some(inst);
// No two operations may share a span, otherwise the lookup of
// nested operations above would be ambiguous.
let old_inst = span_to_inst.insert(op.span(), inst);
assert!(old_inst.is_none());
}
_ => continue,
}
}
// Run the optimizer on our newly generated instruction sequence.
if let Some(inst) = last_inst {
let replacement = optimizer.apply_one(&mut program, inst);
assert!(
replacement.is_some() || allow_no_match,
"an optimization should match the generated instruction sequence"
);
}
}
// Finally, just try and run the optimizer on every instruction we
// generated, just to potentially shake out some more bugs.
// The keys are collected up front because `apply_one` needs `&mut program`.
let instructions: Vec<_> = program.instructions().map(|(k, _)| k).collect();
for inst in instructions {
let _ = optimizer.apply_one(&mut program, inst);
}
}
/// The "type variable" produced by this module's `TypingContext`: either a
/// concrete type or a marker saying that a condition code is expected.
enum TypeOrConditionCode {
/// A concrete type.
Type(Type),
/// A condition code rather than a typed value.
ConditionCode,
}
struct TypingContext;
impl<'a> TypingContextTrait<'a> for TypingContext {
type TypeVariable = TypeOrConditionCode;
fn cc(&mut self, _: wast::Span) -> Self::TypeVariable {
TypeOrConditionCode::ConditionCode
}
fn bNN(&mut self, _: wast::Span) -> Self::TypeVariable {
TypeOrConditionCode::Type(Type::b1())
}
fn iNN(&mut self, _: wast::Span) -> Self::TypeVariable {
TypeOrConditionCode::Type(Type::i32())
}
fn iMM(&mut self, _: wast::Span) -> Self::TypeVariable {
TypeOrConditionCode::Type(Type::i32())
}
fn cpu_flags(&mut self, _: wast::Span) -> Self::TypeVariable {
TypeOrConditionCode::Type(Type::cpu_flags())
}
fn b1(&mut self, _: wast::Span) -> Self::TypeVariable {
TypeOrConditionCode::Type(Type::b1())
}
fn void(&mut self, _: wast::Span) -> Self::TypeVariable {
TypeOrConditionCode::Type(Type::void())
}
fn bool_or_int(&mut self, _: wast::Span) -> Self::TypeVariable {
TypeOrConditionCode::Type(Type::b1())
}
fn any_t(&mut self, _: wast::Span) -> Self::TypeVariable {
TypeOrConditionCode::Type(Type::i32())
}
}
// Tests for `interp`: one randomized check plus fixed DSL snippets kept as
// regression inputs (presumably earlier failures; their origin is not recorded here).
#[cfg(test)]
mod tests {
use super::*;
/// Run `interp` on random bytes. `from_utf8_lossy` replaces invalid
/// sequences, so `interp` always receives valid UTF-8 here.
#[test]
fn check_interp() {
crate::check(|s: Vec<u8>| interp(String::from_utf8_lossy(&s).as_bytes()));
}
/// The same variable used for both operands.
#[test]
fn regression_0() {
interp(b"(=> (imul $x $x) $x)");
}
/// A `when` precondition on a constant operand.
#[test]
fn regression_1() {
interp(b"(=> (when (imul $x $C) (is-power-of-two $C)) $x)");
}
/// Two rules that share a nested `bor` prefix.
#[test]
fn regression_2() {
interp(
b"
(=> (bor (bor $x $y) $x) (bor $x $y))
(=> (bor (bor $x $C) 5) $x)
",
);
}
/// Nested `bor` on the right operand in one rule and on the left in the other.
#[test]
fn regression_3() {
interp(
b"
(=> (bor $y (bor $x 9)) $x)
(=> (bor (bor $x $y) $x) $x)
",
);
}
/// Three `bor` rules that differ in their integer literal operand.
#[test]
fn regression_4() {
interp(
b"
(=> (bor $C 33) 0)
(=> (bor $x 22) 1)
(=> (bor $x 11) 2)
",
);
}
/// Three overlapping nested `bor` rules.
#[test]
fn regression_5() {
interp(
b"
(=> (bor $y (bor $x $y)) (bor $x $y))
(=> (bor (bor $x $y) $z) $x)
(=> (bor (bor $x $y) $y) $x)
",
);
}
/// A right-hand side that is a bare `of` token.
#[test]
fn regression_6() {
interp(b"(=> (imul $x $f) of)");
}
/// A precondition on `$y`, which does not appear in the pattern.
#[test]
fn regression_7() {
interp(
b"
(=> (when (sdiv $x $C)
(fits-in-native-word $y))
(sdiv $C $x))
",
);
}
/// Uses the `adjust_sp_down` and `adjust_sp_down_imm` operators.
#[test]
fn regression_8() {
interp(
b"
(=> (adjust_sp_down $C) (adjust_sp_down_imm $C))
",
);
}
/// A `when` wrapping a bare variable, followed by a `trapnz` rule.
#[test]
fn regression_9() {
interp(
b"
(=> (when $x) $x)
(=> (trapnz $x) (trapnz $x))
",
);
}
/// An operation with an `{i1}` type ascription.
#[test]
fn regression_10() {
interp(b"(=> (sshr{i1} $x 0) $x)");
}
/// An explicit `bit-width` precondition, with an operation in the second
/// operand position of `ushr_imm`.
#[test]
fn regression_11() {
interp(
b"
(=> (when (ushr_imm $x (ishl 4 3))
(bit-width $x 64))
(sextend{i64} (ireduce{i32} $x)))
",
);
}
/// The same constant used at two nesting levels.
#[test]
fn regression_12() {
interp(b"(=> (band $C1 (band_imm $C1 1)) 1)");
}
/// An `icmp` with a literal condition code.
#[test]
fn regression_13() {
interp(b"(=> (brz (icmp eq 0 $x)) (brz (ireduce{i32} $x)))");
}
/// An `icmp` whose condition code is given as `$E` rather than a literal.
#[test]
fn regression_14() {
interp(b"(=> (brz (icmp $E 0 $x)) (brz $x))");
}
}

View File

@@ -0,0 +1,119 @@
//! Utilities for fuzzing.
//!
//! The actual fuzz targets are defined in `peepmatic/fuzz/*`. This crate just
//! has oracles and generators for fuzzing.
#![deny(missing_debug_implementations)]
#![deny(missing_docs)]
use arbitrary::{Arbitrary, Unstructured};
use rand::prelude::*;
use std::fmt::Debug;
use std::panic;
use std::time;
pub mod automata;
pub mod compile;
pub mod interp;
pub mod parser;
/// A quickcheck-style runner for fuzz targets.
///
/// This is *not* intended to replace a long-running, coverage-guided fuzzing
/// engine like libFuzzer! This is only for defining quick, purely random tests
/// for use with `cargo test` and CI.
///
/// Random bytes are turned into values of `A` via `Arbitrary` and passed to
/// `f` until a two second time budget is used up (at least one input is
/// always checked). If `f` panics on some input, that input is shrunk for as
/// long as a smaller input still makes `f` panic, and then the panic from the
/// smallest failing input is resumed.
pub fn check<A>(mut f: impl FnMut(A))
where
    A: Clone + Debug + Arbitrary,
{
    const INITIAL_LENGTH: usize = 16;
    const MAX_LENGTH: usize = 4096;

    let seed = rand::thread_rng().gen();
    let mut rng = rand::rngs::SmallRng::seed_from_u64(seed);

    let mut buf: Vec<u8> = (0..INITIAL_LENGTH).map(|_| rng.gen()).collect();
    let mut num_checked = 0;

    let time_budget = time::Duration::from_secs(2);
    let started = time::Instant::now();

    let (failing_input, panic_info) = loop {
        if num_checked > 0 && started.elapsed() > time_budget {
            eprintln!("Checked {} random inputs.", num_checked);
            return;
        }

        match A::arbitrary_take_rest(Unstructured::new(&buf)) {
            Ok(input) => {
                num_checked += 1;
                eprintln!("Checking input: {:#?}", input);
                let outcome = panic::catch_unwind(panic::AssertUnwindSafe(|| f(input.clone())));
                if let Err(payload) = outcome {
                    break (input, payload);
                }
            }
            Err(e @ arbitrary::Error::NotEnoughData) => {
                eprintln!("warning: {}", e);

                if *buf.last().unwrap() != 0 {
                    // Shrink values in the end of `buf`, which is where
                    // `Arbitrary` pulls container lengths from. Then try again.
                    eprintln!("Shrinking buffer's tail values.");
                    let start = (buf.len() as f64).sqrt() as usize;
                    for byte in &mut buf[start..] {
                        *byte /= 2;
                    }
                    continue;
                }

                if buf.len() < MAX_LENGTH {
                    let new_size = std::cmp::min(buf.len() * 2, MAX_LENGTH);
                    eprintln!("Growing buffer size to {}", new_size);
                    let delta = new_size - buf.len();
                    buf.extend((0..delta).map(|_| rng.gen::<u8>()));
                    continue;
                }

                // Regenerate `buf` below and see if that fixes things...
                eprintln!("Regenerating buffer data.");
            }
            Err(e) => {
                // Usually this happens because `A` requires a sequence of
                // UTF-8 bytes but the given sequence wasn't valid UTF-8. Just
                // skip this iteration and try again after we've updated `buf`
                // below.
                eprintln!("warning: {}", e);
            }
        }

        // Double the size of the buffer every so often, so we don't only
        // explore small inputs.
        if num_checked == buf.len() {
            let doubled = std::cmp::min(buf.len() * 2, MAX_LENGTH);
            buf.resize(doubled, 0);
        }

        for byte in buf.iter_mut() {
            *byte = rng.gen();
        }
    };

    // Shrink the failing input: keep taking the first shrunken candidate that
    // still fails, until no candidate fails.
    let mut smallest_failing_input = failing_input;
    let mut panic_info = panic_info;
    loop {
        eprintln!("Smallest failing input: {:#?}", smallest_failing_input);

        let smaller_failure = smallest_failing_input.shrink().find_map(|candidate| {
            panic::catch_unwind(panic::AssertUnwindSafe(|| f(candidate.clone())))
                .err()
                .map(|payload| (candidate, payload))
        });

        match smaller_failure {
            Some((candidate, payload)) => {
                smallest_failing_input = candidate;
                panic_info = payload;
            }
            None => break,
        }
    }

    // Resume the panic for the smallest input.
    panic::resume_unwind(panic_info);
}

View File

@@ -0,0 +1,29 @@
//! Utilities for fuzzing our DSL's parser.
use peepmatic::Optimizations;
use std::str;
/// Try to parse `data` as a snippet of our DSL.
///
/// Bytes that are not valid UTF-8, and text that cannot be turned into a
/// parse buffer, are ignored. The parse result itself is discarded.
pub fn parse(data: &[u8]) {
    if let Ok(source) = str::from_utf8(data) {
        if let Ok(buf) = wast::parser::ParseBuffer::new(source) {
            let _ = wast::parser::parse::<Optimizations>(&buf);
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Run `parse` on the bytes of randomly generated strings.
    #[test]
    fn check_parse() {
        crate::check(|source: String| {
            parse(source.as_bytes());
        });
    }
}