egraphs: fix handling of effectful-but-idempotent ops and GVN. (#5800)
* Revert "egraphs: disable GVN of effectful idempotent ops (temporarily). (#5808)"
This reverts commit c7e2571866.
* egraphs: fix handling of effectful-but-idempotent ops and GVN.
This PR addresses #5796: currently, ops that are effectful, i.e., remain
in the side-effecting skeleton (which we keep in the `Layout` while the
egraph exists), but are idempotent and thus mergeable by a GVN pass, are
not handled properly.
GVN is still possible on effectful but idempotent ops precisely because
our GVN does not create partial redundancies: it removes an instruction
only when it is dominated by an identical instruction. An instruction
will not be "hoisted" to a point where it could execute in the optimized
code but not in the original.
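
For example (this is `%f1` in the `issue-5796.clif` test added by this PR), the `udiv` in `block3` is dominated by an identical `udiv` in `block2`, so GVN may merge it; the trapping division still executes on every path on which it originally executed:

```
block2:
    v4 = udiv v1, v2
    brif v1, block3, block4

block3:
    v5 = udiv v1, v2   ;; dominated by block2's copy: merges into v4
    return v5
```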
However, there are really two parts to the egraph implementation that
produce this effect: the deduplication on insertion into the egraph, and
the elaboration with a scoped hashmap. The deduplication lets us give a
single name (value ID) to all copies of an identical instruction, and
then elaboration will re-create duplicates if GVN should not hoist or
merge some of them.
Because deduplication need not worry about dominance or scopes, we use a
simple (non-scoped) hashmap to dedup/intern ops as "egraph nodes".
When we added support for GVN'ing effectful but idempotent ops (#5594),
we kept the use of this simple dedup'ing hashmap, but these ops do not
get elaborated; instead they stay in the side-effecting skeleton. Thus,
we inadvertently created potential for weird code-motion effects.
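
Concretely (this is `%f0` in the test added by this PR), `block1` and `block2` below are siblings in the dominator tree; a non-scoped dedup map would give their two `udiv`s a single value ID, and the surviving skeleton copy could then be reused on a path that never executes it:

```
block0(v0: i32, v1: i32, v2: i32):
    brif v0, block1, block2

block1:
    v3 = udiv v1, v2   ;; neither copy dominates the other:
    return v3          ;; both must remain

block2:
    v4 = udiv v1, v2
    brif v1, block3, block4
```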
The proposal in #5796 would solve this in a clean way by treating these
ops as pure again, and keeping them out of the skeleton, instead putting
"force" pseudo-ops in the skeleton. However, this is a little more
complex than I would like, and I've realized that @jameysharp's earlier
suggestion is much simpler: we can keep an actual scoped hashmap
separately just for the effectful-but-idempotent ops, and use it to GVN
while we build the egraph. In effect, we're fusing a separate GVN pass
with the egraph pass (but letting it interact corecursively with
egraph rewrites). This is in principle similar to how we keep a separate
map for loads and fuse this pass with the egraph rewrite pass as well.
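
The following toy sketch shows the shape of this fused traversal; the types here (`ScopedMap`, plain integer blocks and values, a string key standing in for `(Type, InstructionData)`) are hypothetical simplifications, not Cranelift's actual API:

```rust
// A minimal sketch of GVN with a scoped hash map fused into a
// dominator-tree preorder traversal (toy types, not Cranelift's).
use std::collections::HashMap;

/// Toy scoped map: a stack of scopes, searched innermost-to-outermost.
struct ScopedMap<K, V> {
    scopes: Vec<HashMap<K, V>>,
}

impl<K: std::hash::Hash + Eq, V> ScopedMap<K, V> {
    fn new() -> Self {
        Self { scopes: vec![] }
    }
    fn increment_depth(&mut self) {
        self.scopes.push(HashMap::new());
    }
    fn decrement_depth(&mut self) {
        self.scopes.pop();
    }
    /// Look up a key in this scope or any enclosing (dominating) scope.
    fn get(&self, k: &K) -> Option<&V> {
        self.scopes.iter().rev().find_map(|s| s.get(k))
    }
    fn insert(&mut self, k: K, v: V) {
        self.scopes.last_mut().unwrap().insert(k, v);
    }
}

type Block = usize;
type Value = u32;
type Key = &'static str; // stands in for (Type, InstructionData)

fn main() {
    // Dominator tree: block 0 dominates 1 and 2; block 2 dominates 3.
    let children: Vec<Vec<Block>> = vec![vec![1, 2], vec![], vec![3], vec![]];
    // Effectful-but-idempotent "instructions" per block, as (key, result).
    let insts: Vec<Vec<(Key, Value)>> = vec![
        vec![],
        vec![("udiv v1, v2", 3)], // block1: sibling of block2 -- no merge
        vec![("udiv v1, v2", 4)],
        vec![("udiv v1, v2", 5)], // block3: dominated by block2 -- merges
    ];

    enum StackEntry {
        Visit(Block),
        Pop,
    }
    let mut map: ScopedMap<Key, Value> = ScopedMap::new();
    let mut stack = vec![StackEntry::Visit(0)];
    while let Some(entry) = stack.pop() {
        match entry {
            StackEntry::Visit(block) => {
                stack.push(StackEntry::Pop);
                stack.extend(children[block].iter().map(|&b| StackEntry::Visit(b)));
                map.increment_depth();
                for &(key, result) in &insts[block] {
                    match map.get(&key) {
                        // A dominating copy exists: merge (alias the result).
                        Some(&orig) => println!("block{block}: merge v{result} -> v{orig}"),
                        // First copy on this domtree path: keep it.
                        None => {
                            println!("block{block}: keep v{result}");
                            map.insert(key, result);
                        }
                    }
                }
            }
            // Leaving a subtree: its entries go out of scope, so
            // siblings cannot see them.
            StackEntry::Pop => map.decrement_depth(),
        }
    }
}
```

The key point is that a scope is pushed on entering a block and popped on leaving its dominator subtree, so a lookup can only hit an entry inserted by a dominating block.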
Note that we can use a `ScopedHashMap` here without the "context" (as
needed by `CtxHashMap`) because, as noted by @jameysharp, in practice
the ops we want to GVN have all their args inline. Equality on the
`InstructionData` itself is conservative: two insts whose struct
contents compare shallowly equal are definitely identical, but identical
insts in a deep-equality sense may not compare shallowly equal, due to
list indirection. This is fine for GVN, because it is still sound to
skip any given GVN opportunity (and keep the original instructions).
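
A toy illustration of the conservatism (hypothetical types, not the real `InstructionData`): inline args make shallow equality exact, while out-of-line arg lists compare by pool index, which can only cause missed merges, never wrong ones:

```rust
#[derive(PartialEq, Eq, Hash, Clone, Copy, Debug)]
enum InstData {
    /// Args stored inline: shallow equality == deep equality.
    Binary { opcode: u8, args: [u32; 2] },
    /// Args stored out of line in a pool; only the index is compared.
    Call { func: u32, args_list: u32 },
}

fn main() {
    let a = InstData::Binary { opcode: 1, args: [10, 11] };
    let b = InstData::Binary { opcode: 1, args: [10, 11] };
    assert_eq!(a, b); // identical contents: safe to merge

    // Two calls with identical argument *values* but separate list
    // allocations: shallow comparison sees different indices and
    // declines to merge -- a missed opportunity, but still sound.
    let c = InstData::Call { func: 7, args_list: 0 };
    let d = InstData::Call { func: 7, args_list: 1 };
    assert_ne!(c, d);
}
```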
Fixes #5796.
* Add comments from review.
@@ -66,7 +66,7 @@ fn gen_formats(formats: &[&InstructionFormat], fmt: &mut Formatter) {
 /// 16 bytes on 64-bit architectures. If more space is needed to represent an instruction, use a
 /// `ValueList` to store the additional information out of line.
 fn gen_instruction_data(formats: &[&InstructionFormat], fmt: &mut Formatter) {
-    fmt.line("#[derive(Copy, Clone, Debug, PartialEq, Hash)]");
+    fmt.line("#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]");
     fmt.line(r#"#[cfg_attr(feature = "enable-serde", derive(Serialize, Deserialize))]"#);
     fmt.line("#[allow(missing_docs)]");
     fmtln!(fmt, "pub enum InstructionData {");
@@ -7,13 +7,14 @@ use crate::dominator_tree::DominatorTree;
 use crate::egraph::domtree::DomTreeWithChildren;
 use crate::egraph::elaborate::Elaborator;
 use crate::fx::FxHashSet;
-use crate::inst_predicates::is_pure_for_egraph;
+use crate::inst_predicates::{is_mergeable_for_egraph, is_pure_for_egraph};
 use crate::ir::{
-    DataFlowGraph, Function, Inst, InstructionData, Type, Value, ValueDef, ValueListPool,
+    Block, DataFlowGraph, Function, Inst, InstructionData, Type, Value, ValueDef, ValueListPool,
 };
 use crate::loop_analysis::LoopAnalysis;
 use crate::opts::generated_code::ContextIter;
 use crate::opts::IsleContext;
+use crate::scoped_hash_map::{Entry as ScopedEntry, ScopedHashMap};
 use crate::trace;
 use crate::unionfind::UnionFind;
 use cranelift_entity::packed_option::ReservedValue;
@@ -77,6 +78,7 @@ where
     pub(crate) func: &'opt mut Function,
     pub(crate) value_to_opt_value: &'opt mut SecondaryMap<Value, Value>,
     pub(crate) gvn_map: &'opt mut CtxHashMap<(Type, InstructionData), Value>,
+    pub(crate) effectful_gvn_map: &'opt mut ScopedHashMap<(Type, InstructionData), Value>,
     pub(crate) eclasses: &'opt mut UnionFind<Value>,
     pub(crate) remat_values: &'opt mut FxHashSet<Value>,
     pub(crate) stats: &'opt mut Stats,
@@ -285,10 +287,49 @@ where
     fn optimize_skeleton_inst(&mut self, inst: Inst) -> bool {
         self.stats.skeleton_inst += 1;

-        // If a load or store, process it with the alias analysis to see
-        // if we can optimize it (rewrite in terms of an earlier load or
-        // stored value).
-        if let Some(new_result) =
+        // First, can we try to deduplicate? We need to keep some copy
+        // of the instruction around because it's side-effecting, but
+        // we may be able to reuse an earlier instance of it.
+        if is_mergeable_for_egraph(self.func, inst) {
+            let result = self.func.dfg.inst_results(inst)[0];
+            trace!(" -> mergeable side-effecting op {}", inst);
+
+            // Does this instruction already exist? If so, add entries to
+            // the value-map to rewrite uses of its results to the results
+            // of the original (existing) instruction. If not, optimize
+            // the new instruction.
+            //
+            // Note that we use the "effectful GVN map", which is
+            // scoped: because effectful ops are not removed from the
+            // skeleton (`Layout`), we need to be mindful of whether
+            // our current position is dominated by an instance of the
+            // instruction. (See #5796 for details.)
+            let ty = self.func.dfg.ctrl_typevar(inst);
+            match self
+                .effectful_gvn_map
+                .entry((ty, self.func.dfg.insts[inst].clone()))
+            {
+                ScopedEntry::Occupied(o) => {
+                    let orig_result = *o.get();
+                    // Hit in GVN map -- reuse value.
+                    self.value_to_opt_value[result] = orig_result;
+                    self.eclasses.union(orig_result, result);
+                    trace!(" -> merges result {} to {}", result, orig_result);
+                    true
+                }
+                ScopedEntry::Vacant(v) => {
+                    // Otherwise, insert it into the value-map.
+                    self.value_to_opt_value[result] = result;
+                    v.insert(result);
+                    trace!(" -> inserts as new (no GVN)");
+                    false
+                }
+            }
+        }
+        // Otherwise, if a load or store, process it with the alias
+        // analysis to see if we can optimize it (rewrite in terms of
+        // an earlier load or stored value).
+        else if let Some(new_result) =
             self.alias_analysis
                 .process_inst(self.func, self.alias_analysis_state, inst)
         {
@@ -382,82 +423,126 @@ impl<'a> EgraphPass<'a> {
         let mut cursor = FuncCursor::new(self.func);
         let mut value_to_opt_value: SecondaryMap<Value, Value> =
             SecondaryMap::with_default(Value::reserved_value());
         // Map from instruction to value for hash-consing of pure ops
         // into the egraph. This can be a standard (non-scoped)
         // hashmap because pure ops have no location: they are
         // "outside of" control flow.
         //
         // Note also that we keep the controlling typevar (the `Type`
         // in the tuple below) because it may disambiguate
         // instructions that are identical except for type.
         let mut gvn_map: CtxHashMap<(Type, InstructionData), Value> =
             CtxHashMap::with_capacity(cursor.func.dfg.num_values());
+        // Map from instruction to value for GVN'ing of effectful but
+        // idempotent ops, which remain in the side-effecting
+        // skeleton. This needs to be scoped because we cannot
+        // deduplicate one instruction to another that is in a
+        // non-dominating block.
+        //
+        // Note that we can use a ScopedHashMap here without the
+        // "context" (as needed by CtxHashMap) because in practice the
+        // ops we want to GVN have all their args inline. Equality on
+        // the InstructionData itself is conservative: two insts whose
+        // struct contents compare shallowly equal are definitely
+        // identical, but identical insts in a deep-equality sense may
+        // not compare shallowly equal, due to list indirection. This
+        // is fine for GVN, because it is still sound to skip any
+        // given GVN opportunity (and keep the original instructions).
+        //
+        // As above, we keep the controlling typevar here as part of
+        // the key: effectful instructions may (as for pure
+        // instructions) be differentiated only on the type.
+        let mut effectful_gvn_map: ScopedHashMap<(Type, InstructionData), Value> =
+            ScopedHashMap::new();

         // In domtree preorder, visit blocks. (TODO: factor out an
         // iterator from this and elaborator.)
         let root = self.domtree_children.root();
-        let mut block_stack = vec![root];
-        while let Some(block) = block_stack.pop() {
-            // We popped this block; push children
-            // immediately, then process this block.
-            block_stack.extend(self.domtree_children.children(block));
-
-            trace!("Processing block {}", block);
-            cursor.set_position(CursorPosition::Before(block));
-
-            let mut alias_analysis_state = self.alias_analysis.block_starting_state(block);
-
-            for &param in cursor.func.dfg.block_params(block) {
-                trace!("creating initial singleton eclass for blockparam {}", param);
-                self.eclasses.add(param);
-                value_to_opt_value[param] = param;
-            }
-            while let Some(inst) = cursor.next_inst() {
-                trace!("Processing inst {}", inst);
-
-                // While we're passing over all insts, create initial
-                // singleton eclasses for all result and blockparam
-                // values. Also do initial analysis of all inst
-                // results.
-                for &result in cursor.func.dfg.inst_results(inst) {
-                    trace!("creating initial singleton eclass for {}", result);
-                    self.eclasses.add(result);
-                }
-
-                // Rewrite args of *all* instructions using the
-                // value-to-opt-value map.
-                cursor.func.dfg.resolve_aliases_in_arguments(inst);
-                cursor.func.dfg.map_inst_values(inst, |_, arg| {
-                    let new_value = value_to_opt_value[arg];
-                    trace!("rewriting arg {} of inst {} to {}", arg, inst, new_value);
-                    debug_assert_ne!(new_value, Value::reserved_value());
-                    new_value
-                });
-
-                // Build a context for optimization, with borrows of
-                // state. We can't invoke a method on `self` because
-                // we've borrowed `self.func` mutably (as
-                // `cursor.func`) so we pull apart the pieces instead
-                // here.
-                let mut ctx = OptimizeCtx {
-                    func: cursor.func,
-                    value_to_opt_value: &mut value_to_opt_value,
-                    gvn_map: &mut gvn_map,
-                    eclasses: &mut self.eclasses,
-                    rewrite_depth: 0,
-                    subsume_values: FxHashSet::default(),
-                    remat_values: &mut self.remat_values,
-                    stats: &mut self.stats,
-                    alias_analysis: self.alias_analysis,
-                    alias_analysis_state: &mut alias_analysis_state,
-                };
-
-                if is_pure_for_egraph(ctx.func, inst) {
-                    // Insert into GVN map and optimize any new nodes
-                    // inserted (recursively performing this work for
-                    // any nodes the optimization rules produce).
-                    let inst = NewOrExistingInst::Existing(inst);
-                    ctx.insert_pure_enode(inst);
-                    // We've now rewritten all uses, or will when we
-                    // see them, and the instruction exists as a pure
-                    // enode in the eclass, so we can remove it.
-                    cursor.remove_inst_and_step_back();
-                } else {
-                    if ctx.optimize_skeleton_inst(inst) {
-                        cursor.remove_inst_and_step_back();
-                    }
-                }
-            }
+        enum StackEntry {
+            Visit(Block),
+            Pop,
+        }
+        let mut block_stack = vec![StackEntry::Visit(root)];
+        while let Some(entry) = block_stack.pop() {
+            match entry {
+                StackEntry::Visit(block) => {
+                    // We popped this block; push children
+                    // immediately, then process this block.
+                    block_stack.push(StackEntry::Pop);
+                    block_stack
+                        .extend(self.domtree_children.children(block).map(StackEntry::Visit));
+                    effectful_gvn_map.increment_depth();
+
+                    trace!("Processing block {}", block);
+                    cursor.set_position(CursorPosition::Before(block));
+
+                    let mut alias_analysis_state = self.alias_analysis.block_starting_state(block);
+
+                    for &param in cursor.func.dfg.block_params(block) {
+                        trace!("creating initial singleton eclass for blockparam {}", param);
+                        self.eclasses.add(param);
+                        value_to_opt_value[param] = param;
+                    }
+                    while let Some(inst) = cursor.next_inst() {
+                        trace!("Processing inst {}", inst);
+
+                        // While we're passing over all insts, create initial
+                        // singleton eclasses for all result and blockparam
+                        // values. Also do initial analysis of all inst
+                        // results.
+                        for &result in cursor.func.dfg.inst_results(inst) {
+                            trace!("creating initial singleton eclass for {}", result);
+                            self.eclasses.add(result);
+                        }
+
+                        // Rewrite args of *all* instructions using the
+                        // value-to-opt-value map.
+                        cursor.func.dfg.resolve_aliases_in_arguments(inst);
+                        cursor.func.dfg.map_inst_values(inst, |_, arg| {
+                            let new_value = value_to_opt_value[arg];
+                            trace!("rewriting arg {} of inst {} to {}", arg, inst, new_value);
+                            debug_assert_ne!(new_value, Value::reserved_value());
+                            new_value
+                        });
+
+                        // Build a context for optimization, with borrows of
+                        // state. We can't invoke a method on `self` because
+                        // we've borrowed `self.func` mutably (as
+                        // `cursor.func`) so we pull apart the pieces instead
+                        // here.
+                        let mut ctx = OptimizeCtx {
+                            func: cursor.func,
+                            value_to_opt_value: &mut value_to_opt_value,
+                            gvn_map: &mut gvn_map,
+                            effectful_gvn_map: &mut effectful_gvn_map,
+                            eclasses: &mut self.eclasses,
+                            rewrite_depth: 0,
+                            subsume_values: FxHashSet::default(),
+                            remat_values: &mut self.remat_values,
+                            stats: &mut self.stats,
+                            alias_analysis: self.alias_analysis,
+                            alias_analysis_state: &mut alias_analysis_state,
+                        };
+
+                        if is_pure_for_egraph(ctx.func, inst) {
+                            // Insert into GVN map and optimize any new nodes
+                            // inserted (recursively performing this work for
+                            // any nodes the optimization rules produce).
+                            let inst = NewOrExistingInst::Existing(inst);
+                            ctx.insert_pure_enode(inst);
+                            // We've now rewritten all uses, or will when we
+                            // see them, and the instruction exists as a pure
+                            // enode in the eclass, so we can remove it.
+                            cursor.remove_inst_and_step_back();
+                        } else {
+                            if ctx.optimize_skeleton_inst(inst) {
+                                cursor.remove_inst_and_step_back();
+                            }
+                        }
+                    }
+                }
+                StackEntry::Pop => {
+                    effectful_gvn_map.decrement_depth();
+                }
+            }
         }
@@ -73,6 +73,25 @@ pub fn is_pure_for_egraph(func: &Function, inst: Inst) -> bool {
     has_one_result && (is_readonly_load || (!op.can_load() && !trivially_has_side_effects(op)))
 }

+/// Can the given instruction be merged into another copy of itself?
+/// These instructions may have side-effects, but as long as we retain
+/// the first instance of the instruction, the second and further
+/// instances are redundant if they would produce the same trap or
+/// result.
+pub fn is_mergeable_for_egraph(func: &Function, inst: Inst) -> bool {
+    let op = func.dfg.insts[inst].opcode();
+    // We can only merge one-result operators due to the way that GVN
+    // is structured in the egraph implementation.
+    let has_one_result = func.dfg.inst_results(inst).len() == 1;
+    has_one_result
+        // Loads/stores are handled by alias analysis and not
+        // otherwise mergeable.
+        && !op.can_load()
+        && !op.can_store()
+        // Can only have idempotent side-effects.
+        && (!has_side_effect(func, inst) || op.side_effects_idempotent())
+}
+
 /// Does the given instruction have any side-effect as per [has_side_effect], or else is a load,
 /// but not the get_pinned_reg opcode?
 pub fn has_lowering_side_effect(func: &Function, inst: Inst) -> bool {
@@ -45,7 +45,7 @@ pub type ValueListPool = entity::ListPool<Value>;
 /// The BlockCall::new function guarantees this layout by requiring a block argument that's written
 /// in as the first element of the EntityList. Any subsequent entries are always assumed to be real
 /// Values.
-#[derive(Debug, Clone, Copy, PartialEq, Hash)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 #[cfg_attr(feature = "enable-serde", derive(Serialize, Deserialize))]
 pub struct BlockCall {
     /// The underlying storage for the BlockCall. The first element of the values EntityList is
@@ -62,7 +62,7 @@ use serde::{Deserialize, Serialize};
 ///
 /// The index stored in an `EntityList` points to part 2, the list elements. The value 0 is
 /// reserved for the empty list which isn't allocated in the vector.
-#[derive(Clone, Copy, Debug, PartialEq, Hash)]
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
 #[cfg_attr(feature = "enable-serde", derive(Serialize, Deserialize))]
 pub struct EntityList<T: EntityRef + ReservedValue> {
     index: u32,
cranelift/filetests/filetests/egraph/issue-5796.clif (new file, 76 lines)
@@ -0,0 +1,76 @@
+test optimize
+set opt_level=speed
+target x86_64
+
+function %f0(i32, i32, i32) -> i32 {
+block0(v0: i32, v1: i32, v2: i32):
+    brif v0, block1, block2
+
+block1:
+    v3 = udiv v1, v2
+    return v3
+
+block2:
+    v4 = udiv v1, v2
+    brif v1, block3, block4
+
+block3:
+    return v4
+
+block4:
+    return v4
+}
+
+; check: block0(v0: i32, v1: i32, v2: i32):
+; check: brif v0, block1, block2
+
+; check: block1:
+; check: v3 = udiv.i32 v1, v2
+; check: return v3
+
+; check: block2:
+; check: v4 = udiv.i32 v1, v2
+; check: brif.i32 v1, block3, block4
+
+; check: block3:
+; check: return v4
+
+; check: block4:
+; check: return v4
+
+function %f1(i32, i32, i32) -> i32 {
+block0(v0: i32, v1: i32, v2: i32):
+    brif v0, block1, block2
+
+block1:
+    v3 = udiv v1, v2
+    return v3
+
+block2:
+    v4 = udiv v1, v2
+    brif v1, block3, block4
+
+block3:
+    v5 = udiv v1, v2
+    return v5
+
+block4:
+    return v4
+}
+
+; check: block0(v0: i32, v1: i32, v2: i32):
+; check: brif v0, block1, block2
+
+; check: block1:
+; check: v3 = udiv.i32 v1, v2
+; check: return v3
+
+; check: block2:
+; check: v4 = udiv.i32 v1, v2
+; check: brif.i32 v1, block3, block4
+
+; check: block3:
+; check: return v4
+
+; check: block4:
+; check: return v4
@@ -0,0 +1,92 @@
+;;! target = "x86_64"
+;;!
+;;! optimize = true
+;;!
+;;! settings = [
+;;!   "enable_heap_access_spectre_mitigation=true",
+;;!   "opt_level=speed_and_size",
+;;!   "use_egraphs=true"
+;;! ]
+;;!
+;;! [globals.vmctx]
+;;! type = "i64"
+;;! vmctx = true
+;;!
+;;! [globals.heap_base]
+;;! type = "i64"
+;;! load = { base = "vmctx", offset = 0 }
+;;!
+;;! [globals.heap_bound]
+;;! type = "i64"
+;;! load = { base = "vmctx", offset = 8 }
+;;!
+;;! [[heaps]]
+;;! base = "heap_base"
+;;! min_size = 0
+;;! offset_guard_size = 0xffffffff
+;;! index_type = "i32"
+;;! style = { kind = "dynamic", bound = "heap_bound" }
+
+(module
+  (memory (export "memory") 0)
+  (func (export "load-without-offset") (param i32) (result i32 i32)
+    local.get 0
+    i32.load
+    local.get 0
+    i32.load
+  )
+  (func (export "load-with-offset") (param i32) (result i32 i32)
+    local.get 0
+    i32.load offset=1234
+    local.get 0
+    i32.load offset=1234
+  )
+)
+
+;; function u0:0(i32, i64 vmctx) -> i32, i32 fast {
+;; gv0 = vmctx
+;; gv1 = load.i64 notrap aligned gv0+8
+;; gv2 = load.i64 notrap aligned gv0
+;;
+;; block0(v0: i32, v1: i64):
+;; @0057 v4 = uextend.i64 v0
+;; @0057 v5 = iconst.i64 4
+;; @0057 v6 = uadd_overflow_trap v4, v5, heap_oob ; v5 = 4
+;; @0057 v7 = load.i64 notrap aligned v1+8
+;; @0057 v8 = load.i64 notrap aligned v1
+;; @0057 v11 = icmp ugt v6, v7
+;; @0057 v10 = iconst.i64 0
+;; @0057 v9 = iadd v8, v4
+;; @0057 v12 = select_spectre_guard v11, v10, v9 ; v10 = 0
+;; @0057 v13 = load.i32 little heap v12
+;; v2 -> v13
+;; @005f jump block1
+;;
+;; block1:
+;; @005f return v13, v13
+;; }
+;;
+;; function u0:1(i32, i64 vmctx) -> i32, i32 fast {
+;; gv0 = vmctx
+;; gv1 = load.i64 notrap aligned gv0+8
+;; gv2 = load.i64 notrap aligned gv0
+;;
+;; block0(v0: i32, v1: i64):
+;; @0064 v4 = uextend.i64 v0
+;; @0064 v5 = iconst.i64 1238
+;; @0064 v6 = uadd_overflow_trap v4, v5, heap_oob ; v5 = 1238
+;; @0064 v7 = load.i64 notrap aligned v1+8
+;; @0064 v8 = load.i64 notrap aligned v1
+;; @0064 v12 = icmp ugt v6, v7
+;; @0064 v11 = iconst.i64 0
+;; @0064 v9 = iadd v8, v4
+;; v26 = iconst.i64 1234
+;; @0064 v10 = iadd v9, v26 ; v26 = 1234
+;; @0064 v13 = select_spectre_guard v12, v11, v10 ; v11 = 0
+;; @0064 v14 = load.i32 little heap v13
+;; v2 -> v14
+;; @006e jump block1
+;;
+;; block1:
+;; @006e return v14, v14
+;; }