diff --git a/cranelift/peepmatic/crates/automata/Cargo.toml b/cranelift/peepmatic/crates/automata/Cargo.toml new file mode 100644 index 0000000000..27c89230af --- /dev/null +++ b/cranelift/peepmatic/crates/automata/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "peepmatic-automata" +version = "0.1.0" +authors = ["Nick Fitzgerald "] +edition = "2018" + +[package.metadata.docs.rs] +all-features = true + +[dependencies] +serde = { version = "1.0.106", optional = true } + +[features] +# Enable support for generating GraphViz Dot files that can be used to visually +# render an automaton. +# +# https://en.wikipedia.org/wiki/DOT_%28graph_description_language%29 +dot = [] diff --git a/cranelift/peepmatic/crates/automata/src/dot.rs b/cranelift/peepmatic/crates/automata/src/dot.rs new file mode 100644 index 0000000000..bc102d1f81 --- /dev/null +++ b/cranelift/peepmatic/crates/automata/src/dot.rs @@ -0,0 +1,273 @@ +//! Helpers for generating [GraphViz +//! Dot](https://graphviz.gitlab.io/_pages/pdf/dotguide.pdf) files to visually +//! render automata. +//! +//! **This module only exists when the `"dot"` cargo feature is enabled.** + +use crate::{Automaton, Output, State}; +use std::fmt::{Debug, Display}; +use std::fs; +use std::hash::Hash; +use std::io::{self, Write}; +use std::path::Path; + +/// Format the user-provided bits of an `Automaton` for Graphviz Dot output. +/// +/// There are two provided implementations of `DotFmt`: +/// +/// * [`DebugDotFmt`][crate::dot::DebugDotFmt] -- format each type parameter +/// with its `std::fmt::Debug` implementation. +/// +/// * [`DisplayDotFmt`][crate::dot::DisplayDotFmt] -- format each type parameter +/// with its `std::fmt::Display` implementation. +/// +/// You can also implement this trait yourself if your type parameters don't +/// implement `Debug` or `Display`, or if you want to format them in some other +/// way. +pub trait DotFmt { + /// Format a transition edge: `from ---input---> to`. + /// + /// This will be inside an [HTML + /// label](https://www.graphviz.org/doc/info/shapes.html#html), so you may + /// use balanced HTML tags. + fn fmt_transition( + &self, + w: &mut impl Write, + from: Option<&TState>, + input: &TAlphabet, + to: Option<&TState>, + ) -> io::Result<()>; + + /// Format the custom data associated with a state. + /// + /// This will be inside an [HTML + /// label](https://www.graphviz.org/doc/info/shapes.html#html), so you may + /// use balanced HTML tags. + fn fmt_state(&self, w: &mut impl Write, state: &TState) -> io::Result<()>; + + /// Format a transition's output or the final output of a final state. + /// + /// This will be inside an [HTML + /// label](https://www.graphviz.org/doc/info/shapes.html#html), so you may + /// use balanced HTML tags. + fn fmt_output(&self, w: &mut impl Write, output: &TOutput) -> io::Result<()>; +} + +impl Automaton +where + TAlphabet: Clone + Eq + Hash + Ord, + TState: Clone + Eq + Hash, + TOutput: Output, +{ + /// Write this `Automaton` out as a [GraphViz + /// Dot](https://graphviz.gitlab.io/_pages/pdf/dotguide.pdf) file at the + /// given path. + /// + /// The `formatter` parameter controls how `TAlphabet`, `TState`, and + /// `TOutput` are rendered. See the [`DotFmt`][crate::dot::DotFmt] trait for + /// details. + /// + /// **This method only exists when the `"dot"` cargo feature is enabled.** + pub fn write_dot_file( + &self, + formatter: &impl DotFmt, + path: impl AsRef, + ) -> io::Result<()> { + let mut file = fs::File::create(path)?; + self.write_dot(formatter, &mut file)?; + Ok(()) + } + + /// Write this `Automaton` out to the given write-able as a [GraphViz + /// Dot](https://graphviz.gitlab.io/_pages/pdf/dotguide.pdf) file. + /// + /// The `formatter` parameter controls how `TAlphabet`, `TState`, and + /// `TOutput` are rendered. See the [`DotFmt`][crate::dot::DotFmt] trait for + /// details. + /// + /// **This method only exists when the `"dot"` cargo feature is enabled.** + pub fn write_dot( + &self, + formatter: &impl DotFmt, + w: &mut impl Write, + ) -> io::Result<()> { + writeln!(w, "digraph {{")?; + writeln!(w, " rankdir = \"LR\";")?; + writeln!(w, " nodesep = 2;")?; + + // Fake state for the incoming arrow to the start state. + writeln!(w, " \"\" [shape = none];")?; + + // Each state, its associated custom data, and its final output. + for (i, state_data) in self.state_data.iter().enumerate() { + write!( + w, + r#" state_{i} [shape = {shape}, label = <")?; + if let Some(final_output) = self.final_states.get(&State(i as u32)) { + write!(w, r#"")?; + } + writeln!(w, "
{i}
"#, + i = i, + shape = if self.final_states.contains_key(&State(i as u32)) { + "doublecircle" + } else { + "circle" + } + )?; + if let Some(state_data) = state_data { + formatter.fmt_state(w, state_data)?; + } else { + write!(w, "(no state data)")?; + } + write!(w, "
"#)?; + formatter.fmt_output(w, final_output)?; + write!(w, "
>];")?; + } + + // Fake transition to the start state. + writeln!(w, r#" "" -> state_{};"#, self.start_state.0)?; + + // Transitions between states and their outputs. + for (from, transitions) in self.transitions.iter().enumerate() { + for (input, (to, output)) in transitions { + write!( + w, + r#" state_{from} -> state_{to} [label = <
Input:"#, + from = from, + to = to.0, + )?; + formatter.fmt_transition( + w, + self.state_data[from].as_ref(), + input, + self.state_data[to.0 as usize].as_ref(), + )?; + write!( + w, + r#"
Output:"#, + )?; + formatter.fmt_output(w, output)?; + writeln!(w, "
>];")?; + } + } + + writeln!(w, "}}")?; + Ok(()) + } +} + +/// Format an `Automaton`'s `TAlphabet`, `TState`, and `TOutput` with their +/// `std::fmt::Debug` implementations. +#[derive(Debug)] +pub struct DebugDotFmt; + +impl DotFmt for DebugDotFmt +where + TAlphabet: Debug, + TState: Debug, + TOutput: Debug, +{ + fn fmt_transition( + &self, + w: &mut impl Write, + _from: Option<&TState>, + input: &TAlphabet, + _to: Option<&TState>, + ) -> io::Result<()> { + write!(w, r#"{:?}"#, input) + } + + fn fmt_state(&self, w: &mut impl Write, state: &TState) -> io::Result<()> { + write!(w, r#"{:?}"#, state) + } + + fn fmt_output(&self, w: &mut impl Write, output: &TOutput) -> io::Result<()> { + write!(w, r#"{:?}"#, output) + } +} + +/// Format an `Automaton`'s `TAlphabet`, `TState`, and `TOutput` with their +/// `std::fmt::Display` implementations. +#[derive(Debug)] +pub struct DisplayDotFmt; + +impl DotFmt for DisplayDotFmt +where + TAlphabet: Display, + TState: Display, + TOutput: Display, +{ + fn fmt_transition( + &self, + w: &mut impl Write, + _from: Option<&TState>, + input: &TAlphabet, + _to: Option<&TState>, + ) -> io::Result<()> { + write!(w, "{}", input) + } + + fn fmt_state(&self, w: &mut impl Write, state: &TState) -> io::Result<()> { + write!(w, "{}", state) + } + + fn fmt_output(&self, w: &mut impl Write, output: &TOutput) -> io::Result<()> { + write!(w, "{}", output) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::Builder; + + #[test] + fn test_write_dot() { + let mut builder = Builder::::new(); + + // Insert "mon" -> 1 + let mut insertion = builder.insert(); + insertion.next('m', 1).next('o', 0).next('n', 0); + insertion.finish(); + + // Insert "sat" -> 6 + let mut insertion = builder.insert(); + insertion.next('s', 6).next('a', 0).next('t', 0); + insertion.finish(); + + // Insert "sun" -> 0 + let mut insertion = builder.insert(); + insertion.next('s', 0).next('u', 0).next('n', 0); + insertion.finish(); + + let automata = builder.finish(); + + let expected = r#" +digraph { + rankdir = "LR"; + nodesep = 2; + "" [shape = none]; + state_0 [shape = doublecircle, label = <
0
(no state data)
0
>]; + state_1 [shape = circle, label = <
1
(no state data)
>]; + state_2 [shape = circle, label = <
2
(no state data)
>]; + state_3 [shape = circle, label = <
3
(no state data)
>]; + state_4 [shape = circle, label = <
4
(no state data)
>]; + state_5 [shape = circle, label = <
5
(no state data)
>]; + "" -> state_5; + state_1 -> state_0 [label = <
Input:'n'
Output:0
>]; + state_2 -> state_1 [label = <
Input:'o'
Output:0
>]; + state_3 -> state_0 [label = <
Input:'t'
Output:0
>]; + state_4 -> state_3 [label = <
Input:'a'
Output:6
>]; + state_4 -> state_1 [label = <
Input:'u'
Output:0
>]; + state_5 -> state_2 [label = <
Input:'m'
Output:1
>]; + state_5 -> state_4 [label = <
Input:'s'
Output:0
>]; +} +"#; + + let mut buf = vec![]; + automata.write_dot(&DebugDotFmt, &mut buf).unwrap(); + let actual = String::from_utf8(buf).unwrap(); + eprintln!("{}", actual); + assert_eq!(expected.trim(), actual.trim()); + } +} diff --git a/cranelift/peepmatic/crates/automata/src/lib.rs b/cranelift/peepmatic/crates/automata/src/lib.rs new file mode 100644 index 0000000000..6ed09a66e3 --- /dev/null +++ b/cranelift/peepmatic/crates/automata/src/lib.rs @@ -0,0 +1,1024 @@ +//! Finite-state transducer automata. +//! +//! A transducer is a type of automata that has not only an input that it +//! accepts or rejects, but also an output. While regular automata check whether +//! an input string is in the set that the automata accepts, a transducer maps +//! the input strings to values. A regular automata is sort of a compressed, +//! immutable set, and a transducer is sort of a compressed, immutable key-value +//! dictionary. A [trie] compresses a set of strings or map from a string to a +//! value by sharing prefixes of the input string. Automata and transducers can +//! compress even better: they can share both prefixes and suffixes. [*Index +//! 1,600,000,000 Keys with Automata and Rust* by Andrew Gallant (aka +//! burntsushi)][burntsushi-blog-post] is a top-notch introduction. +//! +//! If you're looking for a general-purpose transducers crate in Rust you're +//! probably looking for [the `fst` crate][fst-crate]. While this implementation +//! is fully generic and has no dependencies, its feature set is specific to +//! `peepmatic`'s needs: +//! +//! * We need to associate extra data with each state: the match operation to +//! evaluate next. +//! +//! * We can't provide the full input string up front, so this crate must +//! support incremental lookups. This is because the peephole optimizer is +//! computing the input string incrementally and dynamically: it looks at the +//! current state's match operation, evaluates it, and then uses the result as +//! the next character of the input string. +//! +//! * We also support incremental insertion and output when building the +//! transducer. This is necessary because we don't want to emit output values +//! that bind a match on an optimization's left-hand side's pattern (for +//! example) until after we've succeeded in matching it, which might not +//! happen until we've reached the n^th state. +//! +//! * We need to support generic output values. The `fst` crate only supports +//! `u64` outputs, while we need to build up an optimization's right-hand side +//! instructions. +//! +//! This implementation is based on [*Direct Construction of Minimal Acyclic +//! Subsequential Transducers* by Mihov and Maurel][paper]. That means that keys +//! must be inserted in lexicographic order during construction. +//! +//! [trie]: https://en.wikipedia.org/wiki/Trie +//! [burntsushi-blog-post]: https://blog.burntsushi.net/transducers/#ordered-maps +//! [fst-crate]: https://crates.io/crates/fst +//! [paper]: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.24.3698&rep=rep1&type=pdf + +#![deny(missing_debug_implementations)] +#![deny(missing_docs)] + +mod output_impls; + +#[cfg(feature = "serde")] +mod serde_impls; + +#[cfg(feature = "dot")] +pub mod dot; + +use std::collections::{BTreeMap, HashMap, HashSet}; +use std::convert::TryInto; +use std::hash::Hash; +use std::iter; +use std::mem; + +/// An output type for a transducer automata. +/// +/// Not every type can be the output of a transducer. For correctness (not +/// memory safety) each type that implements this trait must satisfy the +/// following laws: +/// +/// 1. `concat(empty(), x) == x` -- concatenating something with the empty +/// instance produces that same something. +/// +/// 2. `prefix(a, b) == prefix(b, a)` -- taking the prefix of two instances is +/// commutative. +/// +/// 3. `prefix(empty(), x) == empty()` -- the prefix of any value and the empty +/// instance is the empty instance. +/// +/// 4. `difference(concat(a, b), a) == b` -- concatenating a prefix value and +/// then removing it is the identity function. +/// +/// ## Example +/// +/// Here is an example implementation for unsigned integers: +/// +/// ``` +/// use peepmatic_automata::Output; +/// +/// #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] +/// struct MyInt(u64); +/// +/// impl Output for MyInt { +/// // The empty value is zero. +/// fn empty() -> Self { +/// MyInt(0) +/// } +/// +/// // The prefix of two values is their min. +/// fn prefix(a: &MyInt, b: &MyInt) -> Self { +/// std::cmp::min(*a, *b) +/// } +/// +/// // The difference is subtraction. +/// fn difference(a: &MyInt, b: &MyInt) -> Self { +/// MyInt(a.0 - b.0) +/// } +/// +/// // Concatenation is addition. +/// fn concat(a: &MyInt, b: &MyInt) -> Self { +/// MyInt(a.0 + b.0) +/// } +/// } +/// +/// // Law 1 +/// assert_eq!( +/// MyInt::concat(&MyInt::empty(), &MyInt(5)), +/// MyInt(5), +/// ); +/// +/// // Law 2 +/// assert_eq!( +/// MyInt::prefix(&MyInt(3), &MyInt(5)), +/// MyInt::prefix(&MyInt(5), &MyInt(3)) +/// ); +/// +/// // Law 3 +/// assert_eq!( +/// MyInt::prefix(&MyInt::empty(), &MyInt(5)), +/// MyInt::empty() +/// ); +/// +/// // Law 4 +/// assert_eq!( +/// MyInt::difference(&MyInt::concat(&MyInt(2), &MyInt(3)), &MyInt(2)), +/// MyInt(3), +/// ); +/// ``` +pub trait Output: Sized + Eq + Hash + Clone { + /// Construct the empty instance. + fn empty() -> Self; + + /// Is this the empty instance? + /// + /// The default implementation constructs the empty instance and then checks + /// if `self` is equal to it. Override this default if you can provide a + /// better implementation. + fn is_empty(&self) -> bool { + *self == Self::empty() + } + + /// Get the shared prefix of two instances. + /// + /// This must be commutative. + fn prefix(a: &Self, b: &Self) -> Self; + + /// When `b` is a prefix of `a`, get the remaining suffix of `a` that is not + /// shared with `b`. + fn difference(a: &Self, b: &Self) -> Self; + + /// Concatenate `a` and `b`. + fn concat(a: &Self, b: &Self) -> Self; +} + +/// A builder for a transducer automata. +/// +/// ## Type Parameters +/// +/// Generic over the following parameters: +/// +/// * `TAlphabet` -- the alphabet of the input strings. If your input keys are +/// `String`s, this would be `char`. If your input keys are arbitrary byte +/// strings, this would be `u8`. +/// +/// * `TState` -- extra, custom data associated with each state. This isn't used +/// by the automata itself, but you can use it to annotate states with extra +/// information for your own purposes. +/// +/// * `TOutput` -- the output type. See [the `Output` trait][crate::Output] for +/// the requirements that any output type must fulfill. +/// +/// ## Insertions +/// +/// Insertions *must* happen in lexicographic order. Failure to do this, or +/// inserting duplicates, will trigger panics. +/// +/// ## Example +/// +/// ``` +/// use peepmatic_automata::Builder; +/// +/// let mut builder = Builder::::new(); +/// +/// // Insert "mon" -> 1 +/// let mut insertion = builder.insert(); +/// insertion +/// .next(b'm', 1) +/// .next(b'o', 0) +/// .next(b'n', 0); +/// insertion.finish(); +/// +/// // Insert "sat" -> 6 +/// let mut insertion = builder.insert(); +/// insertion +/// .next(b's', 6) +/// .next(b'a', 0) +/// .next(b't', 0); +/// insertion.finish(); +/// +/// // Insert "sun" -> 0 +/// let mut insertion = builder.insert(); +/// insertion +/// .next(b's', 0) +/// .next(b'u', 0) +/// .next(b'n', 0); +/// insertion.finish(); +/// +/// let automata = builder.finish(); +/// +/// assert_eq!(automata.get(b"sun"), Some(0)); +/// assert_eq!(automata.get(b"mon"), Some(1)); +/// assert_eq!(automata.get(b"sat"), Some(6)); +/// +/// assert!(automata.get(b"tues").is_none()); +/// ``` +#[derive(Debug, Clone)] +pub struct Builder +where + TAlphabet: Clone + Eq + Hash + Ord, + TState: Clone + Eq + Hash, + TOutput: Output, +{ + inner: Option>, +} + +impl Builder +where + TAlphabet: Clone + Eq + Hash + Ord, + TState: Clone + Eq + Hash, + TOutput: Output, +{ + /// Make a new builder to start constructing a new transducer automata. + pub fn new() -> Self { + let mut inner = BuilderInner { + frozen: vec![], + wip: BTreeMap::new(), + wip_state_id_counter: 0, + unfinished: vec![], + already_frozen: HashMap::new(), + last_insertion_finished: true, + }; + + // Create the start state. + let id = inner.new_wip_state(); + inner.unfinished.push(id); + + Builder { inner: Some(inner) } + } + + fn inner(&mut self) -> &mut BuilderInner { + self.inner + .as_mut() + .expect("cannot use `Builder` anymore after calling `finish` on it") + } + + /// Start building a new key/value insertion. + /// + /// Insertions are built up incrementally, and a full entry is created from + /// a series of `TAlphabet` and `TOutput` pairs passed to + /// [`InsertionBuilder::next`][crate::InsertionBuilder::next]. + /// + /// ## Panics + /// + /// Panics if [`finish`][crate::InsertionBuilder::finish] was not called on + /// the last `InsertionBuilder` returned from this method. + pub fn insert(&mut self) -> InsertionBuilder { + let inner = self.inner(); + assert!( + inner.last_insertion_finished, + "did not call `finish` on the last `InsertionBuilder`" + ); + inner.last_insertion_finished = false; + InsertionBuilder { + inner: inner, + index: 0, + output: TOutput::empty(), + } + } + + /// Finish building this transducer and return the constructed `Automaton`. + /// + /// ## Panics + /// + /// Panics if this builder is empty, and has never had anything inserted + /// into it. + /// + /// Panics if the last insertion's + /// [`InsertionBuilder`][crate::InsertionBuilder] did not call its + /// [finish][crate::InsertionBuilder::finish] method. + pub fn finish(&mut self) -> Automaton { + let mut inner = self + .inner + .take() + .expect("cannot use `Builder` anymore after calling `finish` on it"); + assert!(inner.last_insertion_finished); + + let wip_start = inner.unfinished[0]; + + // Freeze everything! We're done! + let wip_to_frozen = inner.freeze_from(0); + assert!(inner.wip.is_empty()); + assert!(inner.unfinished.is_empty()); + + // Now transpose our states and transitions into our packed, + // struct-of-arrays representation that we use inside `Automaton`. + let FrozenStateId(s) = wip_to_frozen[&wip_start]; + let start_state = State(s); + let mut state_data = vec![None; inner.frozen.len()]; + let mut transitions = (0..inner.frozen.len()) + .map(|_| BTreeMap::new()) + .collect::>(); + let mut final_states = BTreeMap::new(); + + assert!((inner.frozen.len() as u64) < (std::u32::MAX as u64)); + for (i, state) in inner.frozen.into_iter().enumerate() { + assert!(state_data[i].is_none()); + assert!(transitions[i].is_empty()); + + state_data[i] = state.state_data; + + for (input, (FrozenStateId(to_state), output)) in state.transitions { + assert!((to_state as usize) < transitions.len()); + transitions[i].insert(input, (State(to_state), output)); + } + + if state.is_final { + final_states.insert(State(i as u32), state.final_output); + } else { + assert!(state.final_output.is_empty()); + } + } + + let automata = Automaton { + state_data, + transitions, + final_states, + start_state, + }; + + #[cfg(debug_assertions)] + { + if let Err(msg) = automata.check_representation() { + panic!("Automaton::check_representation failed: {}", msg); + } + } + + automata + } +} + +/// A state in an automaton. +/// +/// Only use a `State` with the automaton that it came from! Mixing and matching +/// states between automata will result in bogus results and/or panics! +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Ord, PartialOrd)] +pub struct State(u32); + +#[derive(Clone, Debug)] +struct BuilderInner +where + TAlphabet: Clone + Eq + Hash + Ord, + TState: Clone + Eq + Hash, + TOutput: Output, +{ + // The `i`th entry maps `FrozenStateId(i)` to its state. + frozen: Vec>, + + // Our mutable, work-in-progress states. + wip: BTreeMap>, + + // A counter for WIP state ids. + wip_state_id_counter: u32, + + // A stack of our work-in-progress states. + unfinished: Vec, + + // A map from `WipState`s that we've already frozen to their canonical, + // de-duplicated frozen state. This is used for hash-consing frozen states + // so that we share suffixes in the automata. + already_frozen: HashMap, FrozenStateId>, + + // The the last `InsertionBuilder` have its `finish` method invoked? + last_insertion_finished: bool, +} + +impl BuilderInner +where + TAlphabet: Clone + Eq + Hash + Ord, + TState: Clone + Eq + Hash, + TOutput: Output, +{ + fn new_wip_state(&mut self) -> WipStateId { + let id = WipStateId(self.wip_state_id_counter); + self.wip_state_id_counter += 1; + let old = self.wip.insert( + id, + WipState { + state_data: None, + transitions: BTreeMap::new(), + is_final: false, + final_output: TOutput::empty(), + }, + ); + debug_assert!(old.is_none()); + id + } + + fn freeze_from(&mut self, index: usize) -> BTreeMap { + assert!(index <= self.unfinished.len()); + + let mut wip_to_frozen = BTreeMap::new(); + + if index == self.unfinished.len() { + // Nothing to freeze. + return wip_to_frozen; + } + + // Freeze `self.inner.unfinished[self.index + 1..]` from the end + // back. We're essentially hash-consing each state. + for _ in (index..self.unfinished.len()).rev() { + let wip_id = self.unfinished.pop().unwrap(); + let mut wip = self.wip.remove(&wip_id).unwrap(); + + // Update transitions to any state we just froze in an earlier + // iteration of this loop. + wip.update_transitions(&wip_to_frozen); + + // Get or create the canonical frozen state for this WIP state. + // + // Note: we're not using the entry API here because this way we can + // avoid cloning `wip`, which would be more costly than the double + // lookup we're doing instead. + let frozen_id = if let Some(id) = self.already_frozen.get(&wip) { + *id + } else { + let id = FrozenStateId(self.frozen.len().try_into().unwrap()); + self.frozen.push(FrozenState { + state_data: wip.state_data.clone(), + transitions: wip + .transitions + .clone() + .into_iter() + .map(|(input, (id, output))| { + let id = match id { + WipOrFrozenStateId::Frozen(id) => id, + WipOrFrozenStateId::Wip(_) => panic!( + "when we are freezing a WIP state, it should never have \ + any transitions to another WIP state" + ), + }; + (input, (id, output)) + }) + .collect(), + is_final: wip.is_final, + final_output: wip.final_output.clone(), + }); + self.already_frozen.insert(wip, id); + id + }; + + // Record the id for this newly frozen state, so that other states + // which referenced it when it wasn't frozen can reference it as a + // frozen state. + wip_to_frozen.insert(wip_id, frozen_id); + } + + // Update references to newly frozen states from the rest of the + // unfinished stack that we didn't freeze. + for wip_id in &self.unfinished { + self.wip + .get_mut(wip_id) + .unwrap() + .update_transitions(&wip_to_frozen); + } + + wip_to_frozen + } +} + +/// A builder for a new entry in a transducer automata. +#[derive(Debug)] +pub struct InsertionBuilder<'a, TAlphabet, TState, TOutput> +where + TAlphabet: Clone + Eq + Hash + Ord, + TState: Clone + Eq + Hash, + TOutput: Output, +{ + inner: &'a mut BuilderInner, + + // The index within `inner.unfinished` where we will transition out of next. + index: usize, + + // Any leftover output from the last transition that we need to roll over + // into the next transition. + output: TOutput, +} + +impl<'a, TAlphabet, TState, TOutput> InsertionBuilder<'a, TAlphabet, TState, TOutput> +where + TAlphabet: Clone + Eq + Hash + Ord, + TState: Clone + Eq + Hash, + TOutput: Output, +{ + /// Insert the next character of input for this entry, and the associated + /// output that should be emitted along with it. + /// + /// In general, you want to add all of your output on the very first `next` + /// call, and use [`Output::empty()`][crate::Output::empty] for all the + /// rest. This enables the most tail-sharing of suffixes, which leads to the + /// most compact automatas. + /// + /// However, there are times when you *cannot* emit output yet, as it + /// depends on having moved throught he automata further. For example, with + /// `peepmatic` we cannot bind something from an optimization's left-hand + /// side's pattern until after we know it exists, which only happens after + /// we've moved some distance through the automata. + pub fn next(&mut self, input: TAlphabet, output: TOutput) -> &mut Self { + assert!(self.index < self.inner.unfinished.len()); + + if output.is_empty() { + // Leave `self.output` as it is. + } else if self.output.is_empty() { + self.output = output; + } else { + self.output = TOutput::concat(&self.output, &output); + } + + let wip_id = self.inner.unfinished[self.index]; + let wip = self.inner.wip.get_mut(&wip_id).unwrap(); + + match wip.transitions.get_mut(&input) { + Some((WipOrFrozenStateId::Frozen(_), _)) => { + panic!("out of order insertion: wip->frozen edge in shared prefix") + } + + // We're still in a shared prefix with the last insertion. That + // means that the state we are transitioning to must be the next + // state in `unfinished`. All we have to do is make sure the + // transition's output is the common prefix of the this insertion + // and the last, and push any excess suffix output out to other + // transition edges. + Some((WipOrFrozenStateId::Wip(next_id), out)) => { + let next_id = *next_id; + assert_eq!(next_id, self.inner.unfinished[self.index + 1]); + + // Find the common prefix of `out` and `self.output`. + let prefix = TOutput::prefix(&self.output, out); + + // Carry over this key's suffix for the next input's transition. + self.output = TOutput::difference(&self.output, &prefix); + + let rest = TOutput::difference(out, &prefix); + *out = prefix; + + let next_wip = self.inner.wip.get_mut(&next_id).unwrap(); + + // Push the leftover suffix of `out` along its other + // transitions. As a small optimization, only iterate over the + // edges if there is a non-empty value to push out along them. + if !rest.is_empty() { + if next_wip.is_final { + next_wip.final_output = TOutput::concat(&rest, &next_wip.final_output); + } + for (_input, (_state, output)) in &mut next_wip.transitions { + *output = TOutput::concat(&rest, output); + } + } + } + + // We've diverged from the shared prefix with the last + // insertion. Freeze the last insertion's unshared suffix and create + // a new WIP state for us to transition into. + None => { + self.inner.freeze_from(self.index + 1); + + let output = mem::replace(&mut self.output, TOutput::empty()); + + let new_id = self.inner.new_wip_state(); + self.inner.unfinished.push(new_id); + self.inner + .wip + .get_mut(&wip_id) + .unwrap() + .transitions + .insert(input, (WipOrFrozenStateId::Wip(new_id), output)); + } + } + + self.index += 1; + assert!(self.index < self.inner.unfinished.len()); + + self + } + + /// Finish this insertion. + /// + /// Failure to call this method before this `InsertionBuilder` is dropped + /// means that the insertion is *not* committed in the builder, and future + /// calls to [`InsertionBuilder::next`][crate::InsertionBuilder::next] will + /// panic! + pub fn finish(self) { + assert!(!self.inner.unfinished.is_empty()); + assert_eq!( + self.index, + self.inner.unfinished.len() - 1, + "out of order insertion" + ); + + let wip_id = *self.inner.unfinished.last().unwrap(); + let wip = self.inner.wip.get_mut(&wip_id).unwrap(); + wip.is_final = true; + wip.final_output = self.output; + + self.inner.last_insertion_finished = true; + } + + /// Set the optional, custom data for the current state. + /// + /// If you assign different state data to two otherwise-identical states + /// within the same shared *prefix* during insertion, it is implementation + /// defined which state and custom state data is kept. + /// + /// For *suffixes*, assigning different state data to two + /// otehrwise-identical states will result in the duplication of those + /// states: they won't get de-duplicated. + pub fn set_state_data(&mut self, data: TState) -> &mut Self { + assert!(self.index < self.inner.unfinished.len()); + let id = self.inner.unfinished[self.index]; + self.inner.wip.get_mut(&id).unwrap().state_data = Some(data); + self + } + + /// Get the current state's optional, custom data, if any. + /// + /// For shared prefixes, this may return state data that was assigned to an + /// equivalent state that was added earlier in the build process. + pub fn get_state_data(&self) -> Option<&TState> { + let id = self.inner.unfinished[self.index]; + self.inner.wip.get(&id).unwrap().state_data.as_ref() + } +} + +/// The id of an immutable, frozen state. +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] +struct FrozenStateId(u32); + +/// The id of a mutable, work-in-progress state. +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] +struct WipStateId(u32); + +/// The id of either a frozen or a WIP state. +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] +enum WipOrFrozenStateId { + Wip(WipStateId), + Frozen(FrozenStateId), +} + +/// A frozen, immutable state inside a `Builder`. +/// +/// These states are from earlier in the lexicographic sorting on input keys, +/// and have already been processed. +#[derive(Clone, Debug, Hash)] +struct FrozenState +where + TAlphabet: Clone + Eq + Hash + Ord, + TState: Clone + Eq + Hash, + TOutput: Output, +{ + state_data: Option, + transitions: BTreeMap, + is_final: bool, + final_output: TOutput, +} + +/// A mutable, work-in-progress state inside a `Builder`. +/// +/// These states only exist for the last-inserted and currently-being-inserted +/// input keys. As soon as we find the end of their shared prefix, the last +/// key's unshared suffix is frozen, and then only the currently-being-inserted +/// input key has associated WIP states. +#[derive(Clone, Debug, PartialEq, Eq, Hash)] +struct WipState +where + TAlphabet: Clone + Eq + Hash + Ord, + TState: Clone + Eq + Hash, + TOutput: Output, +{ + state_data: Option, + transitions: BTreeMap, + is_final: bool, + final_output: TOutput, +} + +impl WipState +where + TAlphabet: Clone + Eq + Hash + Ord, + TState: Clone + Eq + Hash, + TOutput: Output, +{ + /// Given that we froze some old, WIP state, update any transitions out of + /// this WIP state so they point to the new, frozen state. + fn update_transitions(&mut self, wip_to_frozen: &BTreeMap) { + for (to, _) in self.transitions.values_mut() { + if let WipOrFrozenStateId::Wip(w) = *to { + if let Some(f) = wip_to_frozen.get(&w) { + *to = WipOrFrozenStateId::Frozen(*f); + } + } + } + } +} + +/// A finite-state transducer automata. +/// +/// These are constructed via [`Builder`][crate::Builder]. +/// +/// An `Automaton` is immutable: new entries cannot be inserted and existing +/// entries cannot be removed. +/// +/// To query an `Automaton`, there are two APIs: +/// +/// 1. [`get`][crate::Automaton::get] -- a high-level method to get the associated +/// output value of a full input sequence. +/// +/// 2. [`query`][crate::Automaton::query] -- a low-level method to +/// incrementally query the automata. It does not require that you have the +/// full input sequence on hand all at once, only the next character. It also +/// allows you to process the output as it it built up, rather than only at +/// giving you the final, complete output value. +#[derive(Debug, Clone)] +pub struct Automaton +where + TAlphabet: Clone + Eq + Hash + Ord, + TState: Clone + Eq + Hash, + TOutput: Output, +{ + // The `i`th entry is `State(i)`'s associated custom data. + state_data: Vec>, + + // The `i`th entry contains `State(i)`'s transitions. + transitions: Vec>, + + // Keeps track of which states are final, and if so, what their final output + // is. + final_states: BTreeMap, + + // The starting state. + start_state: State, +} + +impl Automaton +where + TAlphabet: Clone + Eq + Hash + Ord, + TState: Clone + Eq + Hash, + TOutput: Output, +{ + /// Get the output value associated with the given input sequence. + /// + /// Returns `None` if the input sequence is not a member of this + /// `Automaton`'s keys. Otherwise, returns `Some(output)`. + pub fn get<'a>(&self, input: impl IntoIterator) -> Option + where + TAlphabet: 'a, + { + let mut query = self.query(); + let mut output = TOutput::empty(); + + for inp in input { + let this_out = query.next(inp)?; + output = TOutput::concat(&output, &this_out); + } + + let final_output = query.finish()?; + Some(TOutput::concat(&output, final_output)) + } + + /// Create a low-level query. + /// + /// This allows you to incrementally query this `Automaton`, without + /// providing the full input sequence ahead of time, and also incrementally + /// build up the output. + /// + /// See [`Query`][crate::Query] for details. + pub fn query(&self) -> Query { + Query { + automata: self, + current_state: self.start_state, + } + } + + /// Check that the internal representaton is OK. + /// + /// Checks that we don't have any transitions to unknown states, that there + /// aren't any cycles, that ever path through the automata eventually ends + /// in a final state, etc. + /// + /// This property is `debug_assert!`ed in `Builder::finish`, and checked + /// when deserializing an `Automaton`. + /// + /// Returns `true` if the representation is okay, `false` otherwise. + fn check_representation(&self) -> Result<(), &'static str> { + macro_rules! bail_if { + ($condition:expr, $msg:expr) => { + if $condition { + return Err($msg); + } + }; + } + + bail_if!( + self.state_data.len() != self.transitions.len(), + "different number of states and transition sets" + ); + bail_if!( + self.final_states.is_empty(), + "the set of final states is empty" + ); + + bail_if!( + (self.start_state.0 as usize) >= self.transitions.len(), + "the start state is not a valid state" + ); + + for (f, _out) in &self.final_states { + bail_if!( + (f.0 as usize) >= self.transitions.len(), + "one of the final states is not a valid state" + ); + } + + // Walk the state transition graph and ensure that + // + // 1. there are no cycles, and + // + // 2. every path ends in a final state. + let mut on_stack = HashSet::new(); + let mut stack = vec![ + (Traversal::Stop, self.start_state), + (Traversal::Start, self.start_state), + ]; + loop { + match stack.pop() { + None => break, + Some((Traversal::Start, state)) => { + let is_new = on_stack.insert(state); + debug_assert!(is_new); + + let mut has_any_transitions = false; + for (_input, (to_state, _output)) in &self.transitions[state.0 as usize] { + has_any_transitions = true; + + // A transition to a state that we walked through to get + // here means that there is a cycle. + bail_if!( + on_stack.contains(to_state), + "there is a cycle in the state transition graph" + ); + + stack.extend( + iter::once((Traversal::Stop, *to_state)) + .chain(iter::once((Traversal::Start, *to_state))), + ); + } + + if !has_any_transitions { + // All paths must end in a final state. + bail_if!( + !self.final_states.contains_key(&state), + "a path through the state transition graph does not end in a final state" + ); + } + } + Some((Traversal::Stop, state)) => { + debug_assert!(on_stack.contains(&state)); + on_stack.remove(&state); + } + } + } + + return Ok(()); + + enum Traversal { + Start, + Stop, + } + } +} + +/// A low-level query of an `Automaton`. +/// +/// This allows you to incrementally query an `Automaton`, without providing the +/// full input sequence ahead of time, and also to incrementally build up the +/// output. +/// +/// The typical usage pattern is: +/// +/// * First, a series of [`next`][crate::Query::next] calls that each provide +/// one character of the input sequence. +/// +/// If this query is still on a path towards a known entry of the +/// automata, then `Some` is returned with the partial output of the +/// transition that was just taken. Otherwise, `None` is returned, signifying +/// that the input string has been rejected by the automata. +/// +/// You may also inspect the current state's associated custom data, if any, +/// in between `next` calls via the +/// [`current_state_data`][crate::Query::current_state_data] method. +/// +/// * When the input sequence is exhausted, call +/// [`is_in_final_state`][crate::Query::is_in_final_state] to determine if this +/// query is in a final state of the automata. If it is not, then the +/// input string has been rejected by the automata. +/// +/// * Given that the input sequence is exhausted, you may call +/// [`finish`][crate::Query::finish] to get the final bit of partial output. +#[derive(Debug, Clone)] +pub struct Query<'a, TAlphabet, TState, TOutput> +where + TAlphabet: Clone + Eq + Hash + Ord, + TState: Clone + Eq + Hash, + TOutput: Output, +{ + automata: &'a Automaton, + current_state: State, +} + +impl<'a, TAlphabet, TState, TOutput> Query<'a, TAlphabet, TState, TOutput> +where + TAlphabet: Clone + Eq + Hash + Ord, + TState: Clone + Eq + Hash, + TOutput: Output, +{ + /// Get the current state in the automaton that this query is at. + pub fn current_state(&self) -> State { + self.current_state + } + + /// Move this query to the given state in the automaton. + /// + /// This can be used to implement backtracking, if you can also reset your + /// output to the way it was when you previously visited the given `State`. + /// + /// Only use a `State` that came from this query's automaton! Mixing and + /// matching states between automata will result in bogus results and/or + /// panics! + pub fn go_to_state(&mut self, state: State) { + assert!((state.0 as usize) < self.automata.transitions.len()); + debug_assert_eq!( + self.automata.state_data.len(), + self.automata.transitions.len() + ); + self.current_state = state; + } + + /// Does the query's current state have a transition on the given input? + /// + /// Regardless whether a transition on the given input exists for the + /// current state or not, the query remains in the current state. + pub fn has_transition_on(&self, input: &TAlphabet) -> bool { + let State(i) = self.current_state; + self.automata.transitions[i as usize].contains_key(input) + } + + /// Transition to the next state given the next input character, and return + /// the partial output for that transition. + /// + /// If `None` is returned, then the input sequence has been rejected by the + /// automata, and this query remains in its current state. + #[inline] + pub fn next(&mut self, input: &TAlphabet) -> Option<&'a TOutput> { + let State(i) = self.current_state; + match self.automata.transitions[i as usize].get(input) { + None => None, + Some((next_state, output)) => { + self.current_state = *next_state; + Some(output) + } + } + } + + /// Get the current state's associated custom data, if any. + /// + /// See also + /// [`InsertionBuilder::set_state_data`][crate::InsertionBuilder::set_state_data]. + #[inline] + pub fn current_state_data(&self) -> Option<&'a TState> { + let State(i) = self.current_state; + self.automata.state_data[i as usize].as_ref() + } + + /// Is this query currently in a final state? + #[inline] + pub fn is_in_final_state(&self) -> bool { + self.automata.final_states.contains_key(&self.current_state) + } + + /// Given that the input sequence is exhausted, get the final bit of partial + /// output. + /// + /// Returns `None` if this query is not currently in a final state, meaning + /// that the automata has rejected this input sequence. You can check + /// whether that is the case or not with the + /// [`is_in_final_state`][crate::Query::is_in_final_state] method. + pub fn finish(self) -> Option<&'a TOutput> { + self.automata.final_states.get(&self.current_state) + } +} + +#[cfg(test)] +mod tests { + #[test] + fn it_works() { + assert_eq!(2 + 2, 4); + } +} diff --git a/cranelift/peepmatic/crates/automata/src/output_impls.rs b/cranelift/peepmatic/crates/automata/src/output_impls.rs new file mode 100644 index 0000000000..6701f7d867 --- /dev/null +++ b/cranelift/peepmatic/crates/automata/src/output_impls.rs @@ -0,0 +1,93 @@ +use crate::Output; +use std::cmp; +use std::hash::Hash; + +impl Output for u64 { + fn empty() -> Self { + 0 + } + + fn prefix(a: &Self, b: &Self) -> Self { + cmp::min(*a, *b) + } + + fn difference(a: &Self, b: &Self) -> Self { + a - b + } + + fn concat(a: &Self, b: &Self) -> Self { + a + b + } +} + +impl Output for Vec +where + T: Clone + Eq + Hash, +{ + fn empty() -> Self { + vec![] + } + + fn is_empty(&self) -> bool { + self.is_empty() + } + + fn prefix(a: &Self, b: &Self) -> Self { + a.iter() + .cloned() + .zip(b.iter().cloned()) + .take_while(|(a, b)| a == b) + .map(|(a, _)| a) + .collect() + } + + fn difference(a: &Self, b: &Self) -> Self { + let i = a + .iter() + .zip(b.iter()) + .position(|(a, b)| a != b) + .unwrap_or(cmp::min(a.len(), b.len())); + a[i..].to_vec() + } + + fn concat(a: &Self, b: &Self) -> Self { + let mut c = a.clone(); + c.extend(b.iter().cloned()); + c + } +} + +#[cfg(test)] +mod tests { + use crate::Output; + use std::fmt::Debug; + + // Assert the laws that `Output` requires for correctness. `a` and `b` + // should be two different instances of an `Output` type. + fn assert_laws(a: O, b: O) + where + O: Clone + Debug + Output, + { + // Law 1 + assert_eq!(O::concat(&O::empty(), &a), a.clone()); + + // Law 2 + assert_eq!(O::prefix(&b, &a), O::prefix(&a, &b)); + + // Law 3 + assert_eq!(O::prefix(&O::empty(), &a), O::empty()); + + // Law 4 + assert_eq!(O::difference(&O::concat(&a, &b), &a), b); + } + + #[test] + fn impl_for_u64() { + assert_laws(3, 5); + } + + #[test] + fn impl_for_vec() { + assert_laws(vec![0, 1, 2, 3], vec![0, 2, 4, 6]); + } +} diff --git a/cranelift/peepmatic/crates/automata/src/serde_impls.rs b/cranelift/peepmatic/crates/automata/src/serde_impls.rs new file mode 100644 index 0000000000..f7543626e5 --- /dev/null +++ b/cranelift/peepmatic/crates/automata/src/serde_impls.rs @@ -0,0 +1,195 @@ +//! `serde::Serialize` and `serde::Deserialize` implementations for `Automaton`. +//! +//! Rather than prefix each serialized field with which field it is, we always +//! serialize fields in alphabetical order. Make sure to maintain this if you +//! add or remove fields! +//! +//! Each time you add/remove a field, or change serialization in any other way, +//! make sure to bump `SERIALIZATION_VERSION`. + +use crate::{Automaton, Output, State}; +use serde::{ + de::{self, Deserializer, SeqAccess, Visitor}, + ser::SerializeTupleStruct, + Deserialize, Serialize, Serializer, +}; +use std::collections::BTreeMap; +use std::fmt; +use std::hash::Hash; +use std::marker::PhantomData; + +const SERIALIZATION_VERSION: u32 = 1; + +impl Serialize for State { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + serializer.serialize_u32(self.0) + } +} + +impl<'de> Deserialize<'de> for State { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + Ok(State(deserializer.deserialize_u32(U32Visitor)?)) + } +} + +struct U32Visitor; + +impl<'de> Visitor<'de> for U32Visitor { + type Value = u32; + + fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.write_str("an integer between `0` and `2^32 - 1`") + } + + fn visit_u8(self, value: u8) -> Result + where + E: de::Error, + { + Ok(u32::from(value)) + } + + fn visit_u32(self, value: u32) -> Result + where + E: de::Error, + { + Ok(value) + } + + fn visit_u64(self, value: u64) -> Result + where + E: de::Error, + { + use std::u32; + if value <= u64::from(u32::MAX) { + Ok(value as u32) + } else { + Err(E::custom(format!("u32 out of range: {}", value))) + } + } +} + +impl Serialize for Automaton +where + TAlphabet: Serialize + Clone + Eq + Hash + Ord, + TState: Serialize + Clone + Eq + Hash, + TOutput: Serialize + Output, +{ + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + let Automaton { + final_states, + start_state, + state_data, + transitions, + } = self; + + let mut s = serializer.serialize_tuple_struct("Automaton", 5)?; + s.serialize_field(&SERIALIZATION_VERSION)?; + s.serialize_field(final_states)?; + s.serialize_field(start_state)?; + s.serialize_field(state_data)?; + s.serialize_field(transitions)?; + s.end() + } +} + +impl<'de, TAlphabet, TState, TOutput> Deserialize<'de> for Automaton +where + TAlphabet: 'de + Deserialize<'de> + Clone + Eq + Hash + Ord, + TState: 'de + Deserialize<'de> + Clone + Eq + Hash, + TOutput: 'de + Deserialize<'de> + Output, +{ + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + deserializer.deserialize_tuple_struct( + "Automaton", + 5, + AutomatonVisitor { + phantom: PhantomData, + }, + ) + } +} + +struct AutomatonVisitor<'de, TAlphabet, TState, TOutput> +where + TAlphabet: 'de + Deserialize<'de> + Clone + Eq + Hash + Ord, + TState: 'de + Deserialize<'de> + Clone + Eq + Hash, + TOutput: 'de + Deserialize<'de> + Output, +{ + phantom: PhantomData<&'de (TAlphabet, TState, TOutput)>, +} + +impl<'de, TAlphabet, TState, TOutput> Visitor<'de> + for AutomatonVisitor<'de, TAlphabet, TState, TOutput> +where + TAlphabet: 'de + Deserialize<'de> + Clone + Eq + Hash + Ord, + TState: 'de + Deserialize<'de> + Clone + Eq + Hash, + TOutput: 'de + Deserialize<'de> + Output, +{ + type Value = Automaton; + + fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.write_str("Automaton") + } + + fn visit_seq(self, mut seq: A) -> Result + where + A: SeqAccess<'de>, + { + match seq.next_element::()? { + Some(v) if v == SERIALIZATION_VERSION => {} + Some(v) => { + return Err(de::Error::invalid_value( + de::Unexpected::Unsigned(v as u64), + &self, + )); + } + None => return Err(de::Error::invalid_length(0, &"Automaton expects 5 elements")), + } + + let final_states = match seq.next_element::>()? { + Some(x) => x, + None => return Err(de::Error::invalid_length(1, &"Automaton expects 5 elements")), + }; + + let start_state = match seq.next_element::()? { + Some(x) => x, + None => return Err(de::Error::invalid_length(2, &"Automaton expects 5 elements")), + }; + + let state_data = match seq.next_element::>>()? { + Some(x) => x, + None => return Err(de::Error::invalid_length(3, &"Automaton expects 5 elements")), + }; + + let transitions = match seq.next_element::>>()? { + Some(x) => x, + None => return Err(de::Error::invalid_length(4, &"Automaton expects 5 elements")), + }; + + let automata = Automaton { + final_states, + start_state, + state_data, + transitions, + }; + + // Ensure that the deserialized automata is well-formed. + automata + .check_representation() + .map_err(|msg| de::Error::custom(msg))?; + + Ok(automata) + } +}