From 704f5a577233eb37514d22925770bccdf2aac16c Mon Sep 17 00:00:00 2001
From: Chris Fallin <chris@cfallin.org>
Date: Thu, 19 Jan 2023 11:51:19 -0800
Subject: [PATCH] Cranelift/egraph mid-end: support merging
 effectful-but-idempotent ops (#5594)

* Support mergeable-but-side-effectful (idempotent) operations in general in the egraph's GVN.

This mirrors the similar change made in #5534.

* Add tests for egraph case.
---
 cranelift/codegen/src/egraph.rs               | 52 ++++++++++-
 cranelift/codegen/src/inst_predicates.rs      | 19 ++++
 .../duplicate-loads-dynamic-memory-egraph.wat | 92 +++++++++++++++++++
 .../duplicate-loads-static-memory-egraph.wat  | 74 +++++++++++++++
 4 files changed, 232 insertions(+), 5 deletions(-)
 create mode 100644 cranelift/filetests/filetests/wasm/duplicate-loads-dynamic-memory-egraph.wat
 create mode 100644 cranelift/filetests/filetests/wasm/duplicate-loads-static-memory-egraph.wat

diff --git a/cranelift/codegen/src/egraph.rs b/cranelift/codegen/src/egraph.rs
index 6f8bdbb73d..02bfeb013c 100644
--- a/cranelift/codegen/src/egraph.rs
+++ b/cranelift/codegen/src/egraph.rs
@@ -7,7 +7,7 @@ use crate::dominator_tree::DominatorTree;
 use crate::egraph::domtree::DomTreeWithChildren;
 use crate::egraph::elaborate::Elaborator;
 use crate::fx::FxHashSet;
-use crate::inst_predicates::is_pure_for_egraph;
+use crate::inst_predicates::{is_mergeable_for_egraph, is_pure_for_egraph};
 use crate::ir::{
     DataFlowGraph, Function, Inst, InstructionData, Type, Value, ValueDef, ValueListPool,
 };
@@ -284,17 +284,59 @@ where
     /// the layout.
     fn optimize_skeleton_inst(&mut self, inst: Inst) -> bool {
         self.stats.skeleton_inst += 1;
-        // Not pure, but may still be a load or store:
-        // process it to see if we can optimize it.
-        if let Some(new_result) =
+
+        // First, can we try to deduplicate? We need to keep some copy
+        // of the instruction around because it's side-effecting, but
+        // we may be able to reuse an earlier instance of it.
+        if is_mergeable_for_egraph(self.func, inst) {
+            let result = self.func.dfg.inst_results(inst)[0];
+            trace!(" -> mergeable side-effecting op {}", inst);
+            let inst = NewOrExistingInst::Existing(inst);
+            let gvn_context = GVNContext {
+                union_find: self.eclasses,
+                value_lists: &self.func.dfg.value_lists,
+            };
+
+            // Does this instruction already exist? If so, add entries to
+            // the value-map to rewrite uses of its results to the results
+            // of the original (existing) instruction. If not, optimize
+            // the new instruction.
+            let key = inst.get_inst_key(&self.func.dfg);
+            if let Some(&orig_result) = self.gvn_map.get(&key, &gvn_context) {
+                // Hit in GVN map -- reuse value.
+                self.value_to_opt_value[result] = orig_result;
+                self.eclasses.union(orig_result, result);
+                trace!(" -> merges result {} to {}", result, orig_result);
+                true
+            } else {
+                // Otherwise, insert it into the value-map.
+                self.value_to_opt_value[result] = result;
+                self.gvn_map.insert(key, result, &gvn_context);
+                trace!(" -> inserts as new (no GVN)");
+                false
+            }
+        }
+        // Otherwise, if a load or store, process it with the alias
+        // analysis to see if we can optimize it (rewrite in terms of
+        // an earlier load or stored value).
+        else if let Some(new_result) =
             self.alias_analysis
                 .process_inst(self.func, self.alias_analysis_state, inst)
         {
             self.stats.alias_analysis_removed += 1;
             let result = self.func.dfg.first_result(inst);
+            trace!(
+                " -> inst {} has result {} replaced with {}",
+                inst,
+                result,
+                new_result
+            );
             self.value_to_opt_value[result] = new_result;
             true
-        } else {
+        }
+        // Otherwise, generic side-effecting op -- always keep it, and
+        // set its results to identity-map to original values.
+        else {
             // Set all results to identity-map to themselves
             // in the value-to-opt-value map.
             for &result in self.func.dfg.inst_results(inst) {
diff --git a/cranelift/codegen/src/inst_predicates.rs b/cranelift/codegen/src/inst_predicates.rs
index 8eea3e383a..3c28899bd3 100644
--- a/cranelift/codegen/src/inst_predicates.rs
+++ b/cranelift/codegen/src/inst_predicates.rs
@@ -74,6 +74,25 @@ pub fn is_pure_for_egraph(func: &Function, inst: Inst) -> bool {
     has_one_result && (is_readonly_load || (!op.can_load() && !trivially_has_side_effects(op)))
 }
 
+/// Can the given instruction be merged into another copy of itself?
+/// These instructions may have side-effects, but as long as we retain
+/// the first instance of the instruction, the second and further
+/// instances are redundant if they would produce the same trap or
+/// result.
+pub fn is_mergeable_for_egraph(func: &Function, inst: Inst) -> bool {
+    let op = func.dfg.insts[inst].opcode();
+    // We can only merge one-result operators due to the way that GVN
+    // is structured in the egraph implementation.
+    let has_one_result = func.dfg.inst_results(inst).len() == 1;
+    has_one_result
+        // Loads/stores are handled by alias analysis and not
+        // otherwise mergeable.
+        && !op.can_load()
+        && !op.can_store()
+        // Can only have idempotent side-effects.
+        && (!has_side_effect(func, inst) || op.side_effects_idempotent())
+}
+
 /// Does the given instruction have any side-effect as per [has_side_effect], or else is a load,
 /// but not the get_pinned_reg opcode?
 pub fn has_lowering_side_effect(func: &Function, inst: Inst) -> bool {
diff --git a/cranelift/filetests/filetests/wasm/duplicate-loads-dynamic-memory-egraph.wat b/cranelift/filetests/filetests/wasm/duplicate-loads-dynamic-memory-egraph.wat
new file mode 100644
index 0000000000..b6d4e88dcf
--- /dev/null
+++ b/cranelift/filetests/filetests/wasm/duplicate-loads-dynamic-memory-egraph.wat
@@ -0,0 +1,92 @@
+;;! target = "x86_64"
+;;!
+;;! optimize = true
+;;!
+;;! settings = [
+;;!   "enable_heap_access_spectre_mitigation=true",
+;;!   "opt_level=speed_and_size",
+;;!   "use_egraphs=true"
+;;! ]
+;;!
+;;! [globals.vmctx]
+;;! type = "i64"
+;;! vmctx = true
+;;!
+;;! [globals.heap_base]
+;;! type = "i64"
+;;! load = { base = "vmctx", offset = 0 }
+;;!
+;;! [globals.heap_bound]
+;;! type = "i64"
+;;! load = { base = "vmctx", offset = 8 }
+;;!
+;;! [[heaps]]
+;;! base = "heap_base"
+;;! min_size = 0
+;;! offset_guard_size = 0xffffffff
+;;! index_type = "i32"
+;;! style = { kind = "dynamic", bound = "heap_bound" }
+
+(module
+  (memory (export "memory") 0)
+  (func (export "load-without-offset") (param i32) (result i32 i32)
+    local.get 0
+    i32.load
+    local.get 0
+    i32.load
+  )
+  (func (export "load-with-offset") (param i32) (result i32 i32)
+    local.get 0
+    i32.load offset=1234
+    local.get 0
+    i32.load offset=1234
+  )
+)
+
+;; function u0:0(i32, i64 vmctx) -> i32, i32 fast {
+;;     gv0 = vmctx
+;;     gv1 = load.i64 notrap aligned gv0+8
+;;     gv2 = load.i64 notrap aligned gv0
+;;
+;;                                 block0(v0: i32, v1: i64):
+;; @0057                               v4 = uextend.i64 v0
+;; @0057                               v5 = iconst.i64 4
+;; @0057                               v6 = uadd_overflow_trap v4, v5, heap_oob  ; v5 = 4
+;; @0057                               v7 = load.i64 notrap aligned v1+8
+;; @0057                               v8 = load.i64 notrap aligned v1
+;; @0057                               v11 = icmp ugt v6, v7
+;; @0057                               v10 = iconst.i64 0
+;; @0057                               v9 = iadd v8, v4
+;; @0057                               v12 = select_spectre_guard v11, v10, v9  ; v10 = 0
+;; @0057                               v13 = load.i32 little heap v12
+;;                                     v2 -> v13
+;; @005f                               jump block1
+;;
+;;                                 block1:
+;; @005f                               return v13, v13
+;; }
+;;
+;; function u0:1(i32, i64 vmctx) -> i32, i32 fast {
+;;     gv0 = vmctx
+;;     gv1 = load.i64 notrap aligned gv0+8
+;;     gv2 = load.i64 notrap aligned gv0
+;;
+;;                                 block0(v0: i32, v1: i64):
+;; @0064                               v4 = uextend.i64 v0
+;; @0064                               v5 = iconst.i64 1238
+;; @0064                               v6 = uadd_overflow_trap v4, v5, heap_oob  ; v5 = 1238
+;; @0064                               v7 = load.i64 notrap aligned v1+8
+;; @0064                               v8 = load.i64 notrap aligned v1
+;; @0064                               v12 = icmp ugt v6, v7
+;; @0064                               v11 = iconst.i64 0
+;; @0064                               v9 = iadd v8, v4
+;;                                     v26 = iconst.i64 1234
+;; @0064                               v10 = iadd v9, v26  ; v26 = 1234
+;; @0064                               v13 = select_spectre_guard v12, v11, v10  ; v11 = 0
+;; @0064                               v14 = load.i32 little heap v13
+;;                                     v2 -> v14
+;; @006e                               jump block1
+;;
+;;                                 block1:
+;; @006e                               return v14, v14
+;; }
diff --git a/cranelift/filetests/filetests/wasm/duplicate-loads-static-memory-egraph.wat b/cranelift/filetests/filetests/wasm/duplicate-loads-static-memory-egraph.wat
new file mode 100644
index 0000000000..d434d5a33a
--- /dev/null
+++ b/cranelift/filetests/filetests/wasm/duplicate-loads-static-memory-egraph.wat
@@ -0,0 +1,74 @@
+;;! target = "x86_64"
+;;!
+;;! optimize = true
+;;!
+;;! settings = [
+;;!   "enable_heap_access_spectre_mitigation=true",
+;;!   "opt_level=speed_and_size",
+;;!   "use_egraphs=true"
+;;! ]
+;;!
+;;! [globals.vmctx]
+;;! type = "i64"
+;;! vmctx = true
+;;!
+;;! [globals.heap_base]
+;;! type = "i64"
+;;! load = { base = "vmctx", offset = 0, readonly = true }
+;;!
+;;! [[heaps]]
+;;! base = "heap_base"
+;;! min_size = 0x10000
+;;! offset_guard_size = 0xffffffff
+;;! index_type = "i32"
+;;! style = { kind = "static", bound = 0x10000000 }
+
+(module
+  (memory (export "memory") 1)
+  (func (export "load-without-offset") (param i32) (result i32 i32)
+    local.get 0
+    i32.load
+    local.get 0
+    i32.load
+  )
+  (func (export "load-with-offset") (param i32) (result i32 i32)
+    local.get 0
+    i32.load offset=1234
+    local.get 0
+    i32.load offset=1234
+  )
+)
+
+;; function u0:0(i32, i64 vmctx) -> i32, i32 fast {
+;;     gv0 = vmctx
+;;     gv1 = load.i64 notrap aligned readonly gv0
+;;
+;;                                 block0(v0: i32, v1: i64):
+;; @0057                               v5 = load.i64 notrap aligned readonly v1
+;; @0057                               v4 = uextend.i64 v0
+;; @0057                               v6 = iadd v5, v4
+;; @0057                               v7 = load.i32 little heap v6
+;;                                     v2 -> v7
+;; @005f                               jump block1
+;;
+;;                                 block1:
+;; @005f                               return v7, v7
+;; }
+;;
+;; function u0:1(i32, i64 vmctx) -> i32, i32 fast {
+;;     gv0 = vmctx
+;;     gv1 = load.i64 notrap aligned readonly gv0
+;;
+;;                                 block0(v0: i32, v1: i64):
+;; @0064                               v5 = load.i64 notrap aligned readonly v1
+;; @0064                               v4 = uextend.i64 v0
+;; @0064                               v6 = iadd v5, v4
+;;                                     v14 = iconst.i64 1234
+;; @0064                               v7 = iadd v6, v14  ; v14 = 1234
+;; @0064                               v8 = load.i32 little heap v7
+;;                                     v2 -> v8
+;; @006e                               jump block1
+;;
+;;                                 block1:
+;; @006e                               return v8, v8
+;; }