From 704f5a577233eb37514d22925770bccdf2aac16c Mon Sep 17 00:00:00 2001 From: Chris Fallin Date: Thu, 19 Jan 2023 11:51:19 -0800 Subject: [PATCH] Cranelift/egraph mid-end: support merging effectful-but-idempotent ops (#5594) * Support mergeable-but-side-effectful (idempotent) operations in general in the egraph's GVN. This mirrors the similar change made in #5534. * Add tests for egraph case. --- cranelift/codegen/src/egraph.rs | 52 ++++++++++- cranelift/codegen/src/inst_predicates.rs | 19 ++++ .../duplicate-loads-dynamic-memory-egraph.wat | 92 +++++++++++++++++++ .../duplicate-loads-static-memory-egraph.wat | 74 +++++++++++++++ 4 files changed, 232 insertions(+), 5 deletions(-) create mode 100644 cranelift/filetests/filetests/wasm/duplicate-loads-dynamic-memory-egraph.wat create mode 100644 cranelift/filetests/filetests/wasm/duplicate-loads-static-memory-egraph.wat diff --git a/cranelift/codegen/src/egraph.rs b/cranelift/codegen/src/egraph.rs index 6f8bdbb73d..02bfeb013c 100644 --- a/cranelift/codegen/src/egraph.rs +++ b/cranelift/codegen/src/egraph.rs @@ -7,7 +7,7 @@ use crate::dominator_tree::DominatorTree; use crate::egraph::domtree::DomTreeWithChildren; use crate::egraph::elaborate::Elaborator; use crate::fx::FxHashSet; -use crate::inst_predicates::is_pure_for_egraph; +use crate::inst_predicates::{is_mergeable_for_egraph, is_pure_for_egraph}; use crate::ir::{ DataFlowGraph, Function, Inst, InstructionData, Type, Value, ValueDef, ValueListPool, }; @@ -284,17 +284,59 @@ where /// the layout. fn optimize_skeleton_inst(&mut self, inst: Inst) -> bool { self.stats.skeleton_inst += 1; - // Not pure, but may still be a load or store: - // process it to see if we can optimize it. - if let Some(new_result) = + + // First, can we try to deduplicate? We need to keep some copy + // of the instruction around because it's side-effecting, but + // we may be able to reuse an earlier instance of it. + if is_mergeable_for_egraph(self.func, inst) { + let result = self.func.dfg.inst_results(inst)[0]; + trace!(" -> mergeable side-effecting op {}", inst); + let inst = NewOrExistingInst::Existing(inst); + let gvn_context = GVNContext { + union_find: self.eclasses, + value_lists: &self.func.dfg.value_lists, + }; + + // Does this instruction already exist? If so, add entries to + // the value-map to rewrite uses of its results to the results + // of the original (existing) instruction. If not, optimize + // the new instruction. + let key = inst.get_inst_key(&self.func.dfg); + if let Some(&orig_result) = self.gvn_map.get(&key, &gvn_context) { + // Hit in GVN map -- reuse value. + self.value_to_opt_value[result] = orig_result; + self.eclasses.union(orig_result, result); + trace!(" -> merges result {} to {}", result, orig_result); + true + } else { + // Otherwise, insert it into the value-map. + self.value_to_opt_value[result] = result; + self.gvn_map.insert(key, result, &gvn_context); + trace!(" -> inserts as new (no GVN)"); + false + } + } + // Otherwise, if a load or store, process it with the alias + // analysis to see if we can optimize it (rewrite in terms of + // an earlier load or stored value). + else if let Some(new_result) = self.alias_analysis .process_inst(self.func, self.alias_analysis_state, inst) { self.stats.alias_analysis_removed += 1; let result = self.func.dfg.first_result(inst); + trace!( + " -> inst {} has result {} replaced with {}", + inst, + result, + new_result + ); self.value_to_opt_value[result] = new_result; true - } else { + } + // Otherwise, generic side-effecting op -- always keep it, and + // set its results to identity-map to original values. + else { // Set all results to identity-map to themselves // in the value-to-opt-value map. for &result in self.func.dfg.inst_results(inst) { diff --git a/cranelift/codegen/src/inst_predicates.rs b/cranelift/codegen/src/inst_predicates.rs index 8eea3e383a..3c28899bd3 100644 --- a/cranelift/codegen/src/inst_predicates.rs +++ b/cranelift/codegen/src/inst_predicates.rs @@ -74,6 +74,25 @@ pub fn is_pure_for_egraph(func: &Function, inst: Inst) -> bool { has_one_result && (is_readonly_load || (!op.can_load() && !trivially_has_side_effects(op))) } +/// Can the given instruction be merged into another copy of itself? +/// These instructions may have side-effects, but as long as we retain +/// the first instance of the instruction, the second and further +/// instances are redundant if they would produce the same trap or +/// result. +pub fn is_mergeable_for_egraph(func: &Function, inst: Inst) -> bool { + let op = func.dfg.insts[inst].opcode(); + // We can only merge one-result operators due to the way that GVN + // is structured in the egraph implementation. + let has_one_result = func.dfg.inst_results(inst).len() == 1; + has_one_result + // Loads/stores are handled by alias analysis and not + // otherwise mergeable. + && !op.can_load() + && !op.can_store() + // Can only have idempotent side-effects. + && (!has_side_effect(func, inst) || op.side_effects_idempotent()) +} + /// Does the given instruction have any side-effect as per [has_side_effect], or else is a load, /// but not the get_pinned_reg opcode? pub fn has_lowering_side_effect(func: &Function, inst: Inst) -> bool { diff --git a/cranelift/filetests/filetests/wasm/duplicate-loads-dynamic-memory-egraph.wat b/cranelift/filetests/filetests/wasm/duplicate-loads-dynamic-memory-egraph.wat new file mode 100644 index 0000000000..b6d4e88dcf --- /dev/null +++ b/cranelift/filetests/filetests/wasm/duplicate-loads-dynamic-memory-egraph.wat @@ -0,0 +1,92 @@ +;;! target = "x86_64" +;;! +;;! optimize = true +;;! +;;! settings = [ +;;! "enable_heap_access_spectre_mitigation=true", +;;! "opt_level=speed_and_size", +;;! "use_egraphs=true" +;;! ] +;;! +;;! [globals.vmctx] +;;! type = "i64" +;;! vmctx = true +;;! +;;! [globals.heap_base] +;;! type = "i64" +;;! load = { base = "vmctx", offset = 0 } +;;! +;;! [globals.heap_bound] +;;! type = "i64" +;;! load = { base = "vmctx", offset = 8 } +;;! +;;! [[heaps]] +;;! base = "heap_base" +;;! min_size = 0 +;;! offset_guard_size = 0xffffffff +;;! index_type = "i32" +;;! style = { kind = "dynamic", bound = "heap_bound" } + +(module + (memory (export "memory") 0) + (func (export "load-without-offset") (param i32) (result i32 i32) + local.get 0 + i32.load + local.get 0 + i32.load + ) + (func (export "load-with-offset") (param i32) (result i32 i32) + local.get 0 + i32.load offset=1234 + local.get 0 + i32.load offset=1234 + ) +) + +;; function u0:0(i32, i64 vmctx) -> i32, i32 fast { +;; gv0 = vmctx +;; gv1 = load.i64 notrap aligned gv0+8 +;; gv2 = load.i64 notrap aligned gv0 +;; +;; block0(v0: i32, v1: i64): +;; @0057 v4 = uextend.i64 v0 +;; @0057 v5 = iconst.i64 4 +;; @0057 v6 = uadd_overflow_trap v4, v5, heap_oob ; v5 = 4 +;; @0057 v7 = load.i64 notrap aligned v1+8 +;; @0057 v8 = load.i64 notrap aligned v1 +;; @0057 v11 = icmp ugt v6, v7 +;; @0057 v10 = iconst.i64 0 +;; @0057 v9 = iadd v8, v4 +;; @0057 v12 = select_spectre_guard v11, v10, v9 ; v10 = 0 +;; @0057 v13 = load.i32 little heap v12 +;; v2 -> v13 +;; @005f jump block1 +;; +;; block1: +;; @005f return v13, v13 +;; } +;; +;; function u0:1(i32, i64 vmctx) -> i32, i32 fast { +;; gv0 = vmctx +;; gv1 = load.i64 notrap aligned gv0+8 +;; gv2 = load.i64 notrap aligned gv0 +;; +;; block0(v0: i32, v1: i64): +;; @0064 v4 = uextend.i64 v0 +;; @0064 v5 = iconst.i64 1238 +;; @0064 v6 = uadd_overflow_trap v4, v5, heap_oob ; v5 = 1238 +;; @0064 v7 = load.i64 notrap aligned v1+8 +;; @0064 v8 = load.i64 notrap aligned v1 +;; @0064 v12 = icmp ugt v6, v7 +;; @0064 v11 = iconst.i64 0 +;; @0064 v9 = iadd v8, v4 +;; v26 = iconst.i64 1234 +;; @0064 v10 = iadd v9, v26 ; v26 = 1234 +;; @0064 v13 = select_spectre_guard v12, v11, v10 ; v11 = 0 +;; @0064 v14 = load.i32 little heap v13 +;; v2 -> v14 +;; @006e jump block1 +;; +;; block1: +;; @006e return v14, v14 +;; } diff --git a/cranelift/filetests/filetests/wasm/duplicate-loads-static-memory-egraph.wat b/cranelift/filetests/filetests/wasm/duplicate-loads-static-memory-egraph.wat new file mode 100644 index 0000000000..d434d5a33a --- /dev/null +++ b/cranelift/filetests/filetests/wasm/duplicate-loads-static-memory-egraph.wat @@ -0,0 +1,74 @@ +;;! target = "x86_64" +;;! +;;! optimize = true +;;! +;;! settings = [ +;;! "enable_heap_access_spectre_mitigation=true", +;;! "opt_level=speed_and_size", +;;! "use_egraphs=true" +;;! ] +;;! +;;! [globals.vmctx] +;;! type = "i64" +;;! vmctx = true +;;! +;;! [globals.heap_base] +;;! type = "i64" +;;! load = { base = "vmctx", offset = 0, readonly = true } +;;! +;;! [[heaps]] +;;! base = "heap_base" +;;! min_size = 0x10000 +;;! offset_guard_size = 0xffffffff +;;! index_type = "i32" +;;! style = { kind = "static", bound = 0x10000000 } + +(module + (memory (export "memory") 1) + (func (export "load-without-offset") (param i32) (result i32 i32) + local.get 0 + i32.load + local.get 0 + i32.load + ) + (func (export "load-with-offset") (param i32) (result i32 i32) + local.get 0 + i32.load offset=1234 + local.get 0 + i32.load offset=1234 + ) +) + +;; function u0:0(i32, i64 vmctx) -> i32, i32 fast { +;; gv0 = vmctx +;; gv1 = load.i64 notrap aligned readonly gv0 +;; +;; block0(v0: i32, v1: i64): +;; @0057 v5 = load.i64 notrap aligned readonly v1 +;; @0057 v4 = uextend.i64 v0 +;; @0057 v6 = iadd v5, v4 +;; @0057 v7 = load.i32 little heap v6 +;; v2 -> v7 +;; @005f jump block1 +;; +;; block1: +;; @005f return v7, v7 +;; } +;; +;; function u0:1(i32, i64 vmctx) -> i32, i32 fast { +;; gv0 = vmctx +;; gv1 = load.i64 notrap aligned readonly gv0 +;; +;; block0(v0: i32, v1: i64): +;; @0064 v5 = load.i64 notrap aligned readonly v1 +;; @0064 v4 = uextend.i64 v0 +;; @0064 v6 = iadd v5, v4 +;; v14 = iconst.i64 1234 +;; @0064 v7 = iadd v6, v14 ; v14 = 1234 +;; @0064 v8 = load.i32 little heap v7 +;; v2 -> v8 +;; @006e jump block1 +;; +;; block1: +;; @006e return v8, v8 +;; }