wasmtime/cranelift/codegen/src/egraph.rs

//! Support for egraphs represented in the DataFlowGraph.

use crate::alias_analysis::{AliasAnalysis, LastStores};
use crate::ctxhash::{CtxEq, CtxHash, CtxHashMap};
use crate::cursor::{Cursor, CursorPosition, FuncCursor};
use crate::dominator_tree::DominatorTree;
use crate::egraph::domtree::DomTreeWithChildren;
use crate::egraph::elaborate::Elaborator;
use crate::fx::FxHashSet;
use crate::inst_predicates::is_pure_for_egraph;
use crate::ir::{
    DataFlowGraph, Function, Inst, InstructionData, Type, Value, ValueDef, ValueListPool,
};
use crate::loop_analysis::LoopAnalysis;
use crate::opts::generated_code::ContextIter;
use crate::opts::IsleContext;
use crate::trace;
use crate::unionfind::UnionFind;
use cranelift_entity::packed_option::ReservedValue;
use cranelift_entity::SecondaryMap;
use std::hash::Hasher;

mod cost;
mod domtree;
mod elaborate;

/// Pass over a Function that does the whole aegraph thing.
///
/// - Removes non-skeleton nodes from the Layout.
/// - Performs a GVN-and-rule-application pass over all Values
///   reachable from the skeleton, potentially creating new Union
///   nodes (i.e., an aegraph) so that some values have multiple
///   representations.
/// - Does "extraction" on the aegraph: selects the best value out of
///   the tree-of-Union nodes for each used value.
/// - Does "scoped elaboration" on the aegraph: chooses one or more
///   locations for pure nodes to become instructions again in the
///   layout, as forced by the skeleton.
///
/// At the beginning and end of this pass, the CLIF should be in a
/// state that passes the verifier and, additionally, has no Union
/// nodes. During the pass, Union nodes may exist, and instructions in
/// the layout may refer to results of instructions that are not
/// placed in the layout.
pub struct EgraphPass<'a> {
    /// The function we're operating on.
    func: &'a mut Function,
    /// Dominator tree, used for elaboration pass.
    domtree: &'a DominatorTree,
    /// Alias analysis, used during optimization.
    alias_analysis: &'a mut AliasAnalysis<'a>,
    /// "Domtree with children": like `domtree`, but with an explicit
    /// list of children, rather than just parent pointers.
    domtree_children: DomTreeWithChildren,
    /// Loop analysis results, used for built-in LICM during
    /// elaboration.
    loop_analysis: &'a LoopAnalysis,
    /// Which canonical Values do we want to rematerialize in each
    /// block where they're used?
    ///
    /// (A canonical Value is the *oldest* Value in an eclass,
    /// i.e. tree of union value-nodes).
    remat_values: FxHashSet<Value>,
    /// Stats collected while we run this pass.
    pub(crate) stats: Stats,
    /// Union-find that maps all members of a Union tree (eclass) back
    /// to the *oldest* (lowest-numbered) `Value`.
    eclasses: UnionFind<Value>,
}

/// Context passed through node insertion and optimization.
pub(crate) struct OptimizeCtx<'opt, 'analysis>
where
    'analysis: 'opt,
{
    // Borrowed from EgraphPass:
    pub(crate) func: &'opt mut Function,
    pub(crate) value_to_opt_value: &'opt mut SecondaryMap<Value, Value>,
    pub(crate) gvn_map: &'opt mut CtxHashMap<(Type, InstructionData), Value>,
    pub(crate) eclasses: &'opt mut UnionFind<Value>,
    pub(crate) remat_values: &'opt mut FxHashSet<Value>,
    pub(crate) stats: &'opt mut Stats,
    pub(crate) alias_analysis: &'opt mut AliasAnalysis<'analysis>,
    pub(crate) alias_analysis_state: &'opt mut LastStores,
    // Held locally during optimization of one node (recursively):
    pub(crate) rewrite_depth: usize,
    pub(crate) subsume_values: FxHashSet<Value>,
}

/// For passing to `insert_pure_enode`. Sometimes the enode already
/// exists as an Inst (from the original CLIF), and sometimes we're in
/// the middle of creating it and want to avoid inserting it if
/// possible until we know we need it.
pub(crate) enum NewOrExistingInst {
    New(InstructionData, Type),
    Existing(Inst),
}

impl NewOrExistingInst {
    fn get_inst_key<'a>(&'a self, dfg: &'a DataFlowGraph) -> (Type, InstructionData) {
        match self {
            NewOrExistingInst::New(data, ty) => (*ty, *data),
            NewOrExistingInst::Existing(inst) => {
                let ty = dfg.ctrl_typevar(*inst);
                (ty, dfg[*inst].clone())
            }
        }
    }
}

impl<'opt, 'analysis> OptimizeCtx<'opt, 'analysis>
where
    'analysis: 'opt,
{
    /// Optimization of a single instruction.
    ///
    /// This does a few things:
    /// - Looks up the instruction in the GVN deduplication map. If we
    ///   already have the same instruction somewhere else, with the
    ///   same args, then we can alias the original instruction's
    ///   results and omit this instruction entirely.
    ///   - Note that we do this canonicalization based on the
    ///     instruction with its arguments as *canonical* eclass IDs,
    ///     that is, the oldest (smallest index) `Value` reachable in
    ///     the tree-of-unions (whole eclass). This ensures that we
    ///     properly canonicalize newer nodes that use newer "versions"
    ///     of a value that are still equal to the older versions.
    /// - If the instruction is "new" (not deduplicated), then apply
    ///   optimization rules:
    ///   - All of the mid-end rules written in ISLE.
    ///   - Store-to-load forwarding.
    /// - Update the value-to-opt-value map, and update the eclass
    ///   union-find, if we rewrote the value to different form(s).
    pub(crate) fn insert_pure_enode(&mut self, inst: NewOrExistingInst) -> Value {
        // Create the external context for looking up and updating the
        // GVN map. This is necessary so that instructions themselves
        // do not have to carry all the references or data for a full
        // `Eq` or `Hash` impl.
        let gvn_context = GVNContext {
            union_find: self.eclasses,
            value_lists: &self.func.dfg.value_lists,
        };

        self.stats.pure_inst += 1;
        if let NewOrExistingInst::New(..) = inst {
            self.stats.new_inst += 1;
        }

        // Does this instruction already exist? If so, add entries to
        // the value-map to rewrite uses of its results to the results
        // of the original (existing) instruction. If not, optimize
        // the new instruction.
        if let Some(&orig_result) = self
            .gvn_map
            .get(&inst.get_inst_key(&self.func.dfg), &gvn_context)
        {
            self.stats.pure_inst_deduped += 1;
            if let NewOrExistingInst::Existing(inst) = inst {
                debug_assert_eq!(self.func.dfg.inst_results(inst).len(), 1);
                let result = self.func.dfg.first_result(inst);
                self.value_to_opt_value[result] = orig_result;
                self.eclasses.union(result, orig_result);
                self.stats.union += 1;
                result
            } else {
                orig_result
            }
        } else {
            // Now actually insert the InstructionData and attach
            // result value (exactly one).
            let (inst, result, ty) = match inst {
                NewOrExistingInst::New(data, typevar) => {
                    let inst = self.func.dfg.make_inst(data);
                    // TODO: reuse return value?
                    self.func.dfg.make_inst_results(inst, typevar);
                    let result = self.func.dfg.first_result(inst);
                    // Add to eclass unionfind.
                    self.eclasses.add(result);
                    // New inst. We need to do the analysis of its result.
                    (inst, result, typevar)
                }
                NewOrExistingInst::Existing(inst) => {
                    let result = self.func.dfg.first_result(inst);
                    let ty = self.func.dfg.ctrl_typevar(inst);
                    (inst, result, ty)
                }
            };

            let opt_value = self.optimize_pure_enode(inst);
            let gvn_context = GVNContext {
                union_find: self.eclasses,
                value_lists: &self.func.dfg.value_lists,
            };
            self.gvn_map
                .insert((ty, self.func.dfg[inst].clone()), opt_value, &gvn_context);
            self.value_to_opt_value[result] = opt_value;
            opt_value
        }
    }

    /// Optimizes an enode by applying any matching mid-end rewrite
    /// rules (or store-to-load forwarding, which is a special case),
    /// unioning together all possible optimized (or rewritten) forms
    /// of this expression into an eclass and returning the `Value`
    /// that represents that eclass.
    fn optimize_pure_enode(&mut self, inst: Inst) -> Value {
        // A pure node always has exactly one result.
        let orig_value = self.func.dfg.first_result(inst);

        let mut isle_ctx = IsleContext { ctx: self };

        // Limit rewrite depth. When we apply optimization rules, they
        // may create new nodes (values) and those are, recursively,
        // optimized eagerly as soon as they are created. So we may
        // have more than one ISLE invocation on the stack. (This is
        // necessary so that as the toplevel builds the
        // right-hand-side expression bottom-up, it uses the "latest"
        // optimized values for all the constituent parts.) To avoid
        // infinite or problematic recursion, we bound the rewrite
        // depth to a small constant here.
        const REWRITE_LIMIT: usize = 5;
        if isle_ctx.ctx.rewrite_depth > REWRITE_LIMIT {
            isle_ctx.ctx.stats.rewrite_depth_limit += 1;
            return orig_value;
        }
        isle_ctx.ctx.rewrite_depth += 1;

        // Invoke the ISLE toplevel constructor, getting all new
        // values produced as equivalents to this value.
        trace!("Calling into ISLE with original value {}", orig_value);
        isle_ctx.ctx.stats.rewrite_rule_invoked += 1;
        let mut optimized_values =
            crate::opts::generated_code::constructor_simplify(&mut isle_ctx, orig_value);

        // Create a union of all new values with the original (or
        // maybe just one new value marked as "subsuming" the
        // original, if present.)
        let mut union_value = orig_value;
        while let Some(optimized_value) = optimized_values.next(&mut isle_ctx) {
            trace!(
                "Returned from ISLE for {}, got {:?}",
                orig_value,
                optimized_value
            );
            if optimized_value == orig_value {
                trace!(" -> same as orig value; skipping");
                continue;
            }
            if isle_ctx.ctx.subsume_values.contains(&optimized_value) {
                // Merge in the unionfind so canonicalization
                // still works, but take *only* the subsuming
                // value, and break now.
                isle_ctx.ctx.eclasses.union(optimized_value, union_value);
                union_value = optimized_value;
                break;
            }

            let old_union_value = union_value;
            union_value = isle_ctx
                .ctx
                .func
                .dfg
                .union(old_union_value, optimized_value);
            isle_ctx.ctx.stats.union += 1;
            trace!(" -> union: now {}", union_value);
            isle_ctx.ctx.eclasses.add(union_value);
            isle_ctx
                .ctx
                .eclasses
                .union(old_union_value, optimized_value);
            isle_ctx.ctx.eclasses.union(old_union_value, union_value);
        }

        isle_ctx.ctx.rewrite_depth -= 1;

        union_value
    }

    /// Optimize a "skeleton" instruction, possibly removing
    /// it. Returns `true` if the instruction should be removed from
    /// the layout.
    fn optimize_skeleton_inst(&mut self, inst: Inst) -> bool {
        self.stats.skeleton_inst += 1;
        // Not pure, but may still be a load or store:
        // process it to see if we can optimize it.
        if let Some(new_result) =
            self.alias_analysis
                .process_inst(self.func, self.alias_analysis_state, inst)
        {
            self.stats.alias_analysis_removed += 1;
            let result = self.func.dfg.first_result(inst);
            self.value_to_opt_value[result] = new_result;
            true
        } else {
            // Set all results to identity-map to themselves
            // in the value-to-opt-value map.
            for &result in self.func.dfg.inst_results(inst) {
                self.value_to_opt_value[result] = result;
                self.eclasses.add(result);
            }
            false
        }
    }
}

impl<'a> EgraphPass<'a> {
    /// Create a new EgraphPass.
    pub fn new(
        func: &'a mut Function,
        domtree: &'a DominatorTree,
        loop_analysis: &'a LoopAnalysis,
        alias_analysis: &'a mut AliasAnalysis<'a>,
    ) -> Self {
        let num_values = func.dfg.num_values();
        let domtree_children = DomTreeWithChildren::new(func, domtree);
        Self {
            func,
            domtree,
            domtree_children,
            loop_analysis,
            alias_analysis,
            stats: Stats::default(),
            eclasses: UnionFind::with_capacity(num_values),
            remat_values: FxHashSet::default(),
        }
    }

    /// Run the process.
    pub fn run(&mut self) {
        self.remove_pure_and_optimize();

        trace!("egraph built:\n{}\n", self.func.display());
        if cfg!(feature = "trace-log") {
            for (value, def) in self.func.dfg.values_and_defs() {
                trace!(" -> {} = {:?}", value, def);
                match def {
                    ValueDef::Result(i, 0) => {
                        trace!("  -> {} = {:?}", i, self.func.dfg[i]);
                    }
                    _ => {}
                }
            }
        }
        trace!("stats: {:?}", self.stats);
        self.elaborate();
    }

    /// Remove pure nodes from the `Layout` of the function, ensuring
    /// that only the "side-effect skeleton" remains, and also
    /// optimize the pure nodes. This is the first step of
    /// egraph-based processing and turns the pure CFG-based CLIF into
    /// a CFG skeleton with a sea of (optimized) nodes tying it
    /// together.
    ///
    /// As we walk through the code, we eagerly apply optimization
    /// rules; at any given point we have a "latest version" of an
    /// eclass of possible representations for a `Value` in the
    /// original program, which is itself a `Value` at the root of a
    /// union-tree. We keep a map from the original values to these
    /// optimized values. When we encounter any instruction (pure or
    /// side-effecting skeleton) we rewrite its arguments to capture
    /// the "latest" optimized forms of these values. (We need to do
    /// this as part of this pass, and not later using a finished map,
    /// because the eclass can continue to be updated and we need to
    /// only refer to its subset that exists at this stage, to
    /// maintain acyclicity.)
    fn remove_pure_and_optimize(&mut self) {
        let mut cursor = FuncCursor::new(self.func);
        let mut value_to_opt_value: SecondaryMap<Value, Value> =
            SecondaryMap::with_default(Value::reserved_value());
        let mut gvn_map: CtxHashMap<(Type, InstructionData), Value> =
            CtxHashMap::with_capacity(cursor.func.dfg.num_values());

        // In domtree preorder, visit blocks. (TODO: factor out an
        // iterator from this and elaborator.)
        let root = self.domtree_children.root();
        let mut block_stack = vec![root];
        while let Some(block) = block_stack.pop() {
            // We popped this block; push children
            // immediately, then process this block.
            block_stack.extend(self.domtree_children.children(block));

            trace!("Processing block {}", block);
            cursor.set_position(CursorPosition::Before(block));

            let mut alias_analysis_state = self.alias_analysis.block_starting_state(block);

            for &param in cursor.func.dfg.block_params(block) {
                trace!("creating initial singleton eclass for blockparam {}", param);
                self.eclasses.add(param);
                value_to_opt_value[param] = param;
            }
            while let Some(inst) = cursor.next_inst() {
                trace!("Processing inst {}", inst);

                // While we're passing over all insts, create initial
                // singleton eclasses for all result and blockparam
                // values.  Also do initial analysis of all inst
                // results.
                for &result in cursor.func.dfg.inst_results(inst) {
                    trace!("creating initial singleton eclass for {}", result);
                    self.eclasses.add(result);
                }

                // Rewrite args of *all* instructions using the
                // value-to-opt-value map.
                cursor.func.dfg.resolve_aliases_in_arguments(inst);
                for arg in cursor.func.dfg.inst_args_mut(inst) {
                    let new_value = value_to_opt_value[*arg];
                    trace!("rewriting arg {} of inst {} to {}", arg, inst, new_value);
                    debug_assert_ne!(new_value, Value::reserved_value());
                    *arg = new_value;
                }

                // Build a context for optimization, with borrows of
                // state. We can't invoke a method on `self` because
                // we've borrowed `self.func` mutably (as
                // `cursor.func`) so we pull apart the pieces instead
                // here.
                let mut ctx = OptimizeCtx {
                    func: cursor.func,
                    value_to_opt_value: &mut value_to_opt_value,
                    gvn_map: &mut gvn_map,
                    eclasses: &mut self.eclasses,
                    rewrite_depth: 0,
                    subsume_values: FxHashSet::default(),
                    remat_values: &mut self.remat_values,
                    stats: &mut self.stats,
                    alias_analysis: self.alias_analysis,
                    alias_analysis_state: &mut alias_analysis_state,
                };

                if is_pure_for_egraph(ctx.func, inst) {
                    // Insert into GVN map and optimize any new nodes
                    // inserted (recursively performing this work for
                    // any nodes the optimization rules produce).
                    let inst = NewOrExistingInst::Existing(inst);
                    ctx.insert_pure_enode(inst);
                    // We've now rewritten all uses, or will when we
                    // see them, and the instruction exists as a pure
                    // enode in the eclass, so we can remove it.
                    cursor.remove_inst_and_step_back();
                } else {
                    if ctx.optimize_skeleton_inst(inst) {
                        cursor.remove_inst_and_step_back();
                    }
                }
            }
        }
    }

    /// Scoped elaboration: compute a final ordering of op computation
    /// for each block and update the given Func body. After this
    /// runs, the function body is back into the state where every
    /// Inst with an used result is placed in the layout (possibly
    /// duplicated, if our code-motion logic decides this is the best
    /// option).
    ///
    /// This works in concert with the domtree. We do a preorder
    /// traversal of the domtree, tracking a scoped map from Id to
    /// (new) Value. The map's scopes correspond to levels in the
    /// domtree.
    ///
    /// At each block, we iterate forward over the side-effecting
    /// eclasses, and recursively generate their arg eclasses, then
    /// emit the ops themselves.
    ///
    /// To use an eclass in a given block, we first look it up in the
    /// scoped map, and get the Value if already present. If not, we
    /// need to generate it. We emit the extracted enode for this
    /// eclass after recursively generating its args. Eclasses are
    /// thus computed "as late as possible", but then memoized into
    /// the Id-to-Value map and available to all dominated blocks and
    /// for the rest of this block. (This subsumes GVN.)
    fn elaborate(&mut self) {
        let mut elaborator = Elaborator::new(
            self.func,
            self.domtree,
            &self.domtree_children,
            self.loop_analysis,
            &mut self.remat_values,
            &mut self.eclasses,
            &mut self.stats,
        );
        elaborator.elaborate();

        self.check_post_egraph();
    }

    #[cfg(debug_assertions)]
    fn check_post_egraph(&self) {
        // Verify that no union nodes are reachable from inst args,
        // and that all inst args' defining instructions are in the
        // layout.
        for block in self.func.layout.blocks() {
            for inst in self.func.layout.block_insts(block) {
                for &arg in self.func.dfg.inst_args(inst) {
                    match self.func.dfg.value_def(arg) {
                        ValueDef::Result(i, _) => {
                            debug_assert!(self.func.layout.inst_block(i).is_some());
                        }
                        ValueDef::Union(..) => {
                            panic!("egraph union node {} still reachable at {}!", arg, inst);
                        }
                        _ => {}
                    }
                }
            }
        }
    }

    #[cfg(not(debug_assertions))]
    fn check_post_egraph(&self) {}
}

/// Implementation of external-context equality and hashing on
/// InstructionData. This allows us to deduplicate instructions given
/// some context that lets us see its value lists and the mapping from
/// any value to "canonical value" (in an eclass).
struct GVNContext<'a> {
    value_lists: &'a ValueListPool,
    union_find: &'a UnionFind<Value>,
}

impl<'a> CtxEq<(Type, InstructionData), (Type, InstructionData)> for GVNContext<'a> {
    fn ctx_eq(
        &self,
        (a_ty, a_inst): &(Type, InstructionData),
        (b_ty, b_inst): &(Type, InstructionData),
    ) -> bool {
        a_ty == b_ty
            && a_inst.eq(b_inst, self.value_lists, |value| {
                self.union_find.find(value)
            })
    }
}

impl<'a> CtxHash<(Type, InstructionData)> for GVNContext<'a> {
    fn ctx_hash<H: Hasher>(&self, state: &mut H, (ty, inst): &(Type, InstructionData)) {
        std::hash::Hash::hash(&ty, state);
        inst.hash(state, self.value_lists, |value| self.union_find.find(value));
    }
}

/// Statistics collected during egraph-based processing.
#[derive(Clone, Debug, Default)]
pub(crate) struct Stats {
    pub(crate) pure_inst: u64,
    pub(crate) pure_inst_deduped: u64,
    pub(crate) skeleton_inst: u64,
    pub(crate) alias_analysis_removed: u64,
    pub(crate) new_inst: u64,
    pub(crate) union: u64,
    pub(crate) subsume: u64,
    pub(crate) remat: u64,
    pub(crate) rewrite_rule_invoked: u64,
    pub(crate) rewrite_depth_limit: u64,
    pub(crate) elaborate_visit_node: u64,
    pub(crate) elaborate_memoize_hit: u64,
    pub(crate) elaborate_memoize_miss: u64,
    pub(crate) elaborate_memoize_miss_remat: u64,
    pub(crate) elaborate_licm_hoist: u64,
    pub(crate) elaborate_func: u64,
    pub(crate) elaborate_func_pre_insts: u64,
    pub(crate) elaborate_func_post_insts: u64,
}