diff --git a/cranelift/codegen/meta/src/gen_inst.rs b/cranelift/codegen/meta/src/gen_inst.rs
index 01ccefb7e0..4882969990 100644
--- a/cranelift/codegen/meta/src/gen_inst.rs
+++ b/cranelift/codegen/meta/src/gen_inst.rs
@@ -66,7 +66,7 @@ fn gen_formats(formats: &[&InstructionFormat], fmt: &mut Formatter) {
 /// 16 bytes on 64-bit architectures. If more space is needed to represent an instruction, use a
 /// `ValueList` to store the additional information out of line.
 fn gen_instruction_data(formats: &[&InstructionFormat], fmt: &mut Formatter) {
-    fmt.line("#[derive(Copy, Clone, Debug, PartialEq, Hash)]");
+    fmt.line("#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]");
     fmt.line(r#"#[cfg_attr(feature = "enable-serde", derive(Serialize, Deserialize))]"#);
     fmt.line("#[allow(missing_docs)]");
     fmtln!(fmt, "pub enum InstructionData {");
diff --git a/cranelift/codegen/src/egraph.rs b/cranelift/codegen/src/egraph.rs
index 2cbe412e34..e12b6fed39 100644
--- a/cranelift/codegen/src/egraph.rs
+++ b/cranelift/codegen/src/egraph.rs
@@ -7,13 +7,14 @@ use crate::dominator_tree::DominatorTree;
 use crate::egraph::domtree::DomTreeWithChildren;
 use crate::egraph::elaborate::Elaborator;
 use crate::fx::FxHashSet;
-use crate::inst_predicates::is_pure_for_egraph;
+use crate::inst_predicates::{is_mergeable_for_egraph, is_pure_for_egraph};
 use crate::ir::{
-    DataFlowGraph, Function, Inst, InstructionData, Type, Value, ValueDef, ValueListPool,
+    Block, DataFlowGraph, Function, Inst, InstructionData, Type, Value, ValueDef, ValueListPool,
 };
 use crate::loop_analysis::LoopAnalysis;
 use crate::opts::generated_code::ContextIter;
 use crate::opts::IsleContext;
+use crate::scoped_hash_map::{Entry as ScopedEntry, ScopedHashMap};
 use crate::trace;
 use crate::unionfind::UnionFind;
 use cranelift_entity::packed_option::ReservedValue;
@@ -77,6 +78,7 @@ where
     pub(crate) func: &'opt mut Function,
     pub(crate) value_to_opt_value: &'opt mut SecondaryMap<Value, Value>,
     pub(crate) gvn_map: &'opt mut CtxHashMap<(Type, InstructionData), Value>,
+    pub(crate) effectful_gvn_map: &'opt mut ScopedHashMap<(Type, InstructionData), Value>,
     pub(crate) eclasses: &'opt mut UnionFind<Value>,
     pub(crate) remat_values: &'opt mut FxHashSet<Value>,
     pub(crate) stats: &'opt mut Stats,
@@ -285,10 +287,49 @@
     fn optimize_skeleton_inst(&mut self, inst: Inst) -> bool {
         self.stats.skeleton_inst += 1;

-        // If a load or store, process it with the alias analysis to see
-        // if we can optimize it (rewrite in terms of an earlier load or
-        // stored value).
-        if let Some(new_result) =
+        // First, can we try to deduplicate? We need to keep some copy
+        // of the instruction around because it's side-effecting, but
+        // we may be able to reuse an earlier instance of it.
+        if is_mergeable_for_egraph(self.func, inst) {
+            let result = self.func.dfg.inst_results(inst)[0];
+            trace!(" -> mergeable side-effecting op {}", inst);
+
+            // Does this instruction already exist? If so, add entries to
+            // the value-map to rewrite uses of its results to the results
+            // of the original (existing) instruction. If not, optimize
+            // the new instruction.
+            //
+            // Note that we use the "effectful GVN map", which is
+            // scoped: because effectful ops are not removed from the
+            // skeleton (`Layout`), we need to be mindful of whether
+            // our current position is dominated by an instance of the
+            // instruction. (See #5796 for details.)
+            let ty = self.func.dfg.ctrl_typevar(inst);
+            match self
+                .effectful_gvn_map
+                .entry((ty, self.func.dfg.insts[inst].clone()))
+            {
+                ScopedEntry::Occupied(o) => {
+                    let orig_result = *o.get();
+                    // Hit in GVN map -- reuse value.
+                    self.value_to_opt_value[result] = orig_result;
+                    self.eclasses.union(orig_result, result);
+                    trace!(" -> merges result {} to {}", result, orig_result);
+                    true
+                }
+                ScopedEntry::Vacant(v) => {
+                    // Otherwise, insert it into the value-map.
+                    self.value_to_opt_value[result] = result;
+                    v.insert(result);
+                    trace!(" -> inserts as new (no GVN)");
+                    false
+                }
+            }
+        }
+        // Otherwise, if a load or store, process it with the alias
+        // analysis to see if we can optimize it (rewrite in terms of
+        // an earlier load or stored value).
+        else if let Some(new_result) =
             self.alias_analysis
                 .process_inst(self.func, self.alias_analysis_state, inst)
         {
@@ -382,82 +423,126 @@ impl<'a> EgraphPass<'a> {
         let mut cursor = FuncCursor::new(self.func);
         let mut value_to_opt_value: SecondaryMap<Value, Value> =
             SecondaryMap::with_default(Value::reserved_value());
+        // Map from instruction to value for hash-consing of pure ops
+        // into the egraph. This can be a standard (non-scoped)
+        // hashmap because pure ops have no location: they are
+        // "outside of" control flow.
+        //
+        // Note also that we keep the controlling typevar (the `Type`
+        // in the tuple below) because it may disambiguate
+        // instructions that are identical except for type.
         let mut gvn_map: CtxHashMap<(Type, InstructionData), Value> =
             CtxHashMap::with_capacity(cursor.func.dfg.num_values());
+        // Map from instruction to value for GVN'ing of effectful but
+        // idempotent ops, which remain in the side-effecting
+        // skeleton. This needs to be scoped because we cannot
+        // deduplicate one instruction to another that is in a
+        // non-dominating block.
+        //
+        // Note that we can use a ScopedHashMap here without the
+        // "context" (as needed by CtxHashMap) because in practice the
+        // ops we want to GVN have all their args inline. Equality on
+        // the InstructionData itself is conservative: two insts whose
+        // struct contents compare shallowly equal are definitely
+        // identical, but identical insts in a deep-equality sense may
+        // not compare shallowly equal, due to list indirection. This
+        // is fine for GVN, because it is still sound to skip any
+        // given GVN opportunity (and keep the original instructions).
+        //
+        // As above, we keep the controlling typevar here as part of
+        // the key: effectful instructions may (as for pure
+        // instructions) be differentiated only on the type.
+        let mut effectful_gvn_map: ScopedHashMap<(Type, InstructionData), Value> =
+            ScopedHashMap::new();

         // In domtree preorder, visit blocks. (TODO: factor out an
         // iterator from this and elaborator.)
         let root = self.domtree_children.root();
-        let mut block_stack = vec![root];
-        while let Some(block) = block_stack.pop() {
-            // We popped this block; push children
-            // immediately, then process this block.
-            block_stack.extend(self.domtree_children.children(block));
+        enum StackEntry {
+            Visit(Block),
+            Pop,
+        }
+        let mut block_stack = vec![StackEntry::Visit(root)];
+        while let Some(entry) = block_stack.pop() {
+            match entry {
+                StackEntry::Visit(block) => {
+                    // We popped this block; push children
+                    // immediately, then process this block.
+                    block_stack.push(StackEntry::Pop);
+                    block_stack
+                        .extend(self.domtree_children.children(block).map(StackEntry::Visit));
+                    effectful_gvn_map.increment_depth();

-            trace!("Processing block {}", block);
-            cursor.set_position(CursorPosition::Before(block));
+                    trace!("Processing block {}", block);
+                    cursor.set_position(CursorPosition::Before(block));

-            let mut alias_analysis_state = self.alias_analysis.block_starting_state(block);
+                    let mut alias_analysis_state = self.alias_analysis.block_starting_state(block);

-            for &param in cursor.func.dfg.block_params(block) {
-                trace!("creating initial singleton eclass for blockparam {}", param);
-                self.eclasses.add(param);
-                value_to_opt_value[param] = param;
-            }
-            while let Some(inst) = cursor.next_inst() {
-                trace!("Processing inst {}", inst);
-
-                // While we're passing over all insts, create initial
-                // singleton eclasses for all result and blockparam
-                // values. Also do initial analysis of all inst
-                // results.
-                for &result in cursor.func.dfg.inst_results(inst) {
-                    trace!("creating initial singleton eclass for {}", result);
-                    self.eclasses.add(result);
-                }
-
-                // Rewrite args of *all* instructions using the
-                // value-to-opt-value map.
-                cursor.func.dfg.resolve_aliases_in_arguments(inst);
-                cursor.func.dfg.map_inst_values(inst, |_, arg| {
-                    let new_value = value_to_opt_value[arg];
-                    trace!("rewriting arg {} of inst {} to {}", arg, inst, new_value);
-                    debug_assert_ne!(new_value, Value::reserved_value());
-                    new_value
-                });
-
-                // Build a context for optimization, with borrows of
-                // state. We can't invoke a method on `self` because
-                // we've borrowed `self.func` mutably (as
-                // `cursor.func`) so we pull apart the pieces instead
-                // here.
-                let mut ctx = OptimizeCtx {
-                    func: cursor.func,
-                    value_to_opt_value: &mut value_to_opt_value,
-                    gvn_map: &mut gvn_map,
-                    eclasses: &mut self.eclasses,
-                    rewrite_depth: 0,
-                    subsume_values: FxHashSet::default(),
-                    remat_values: &mut self.remat_values,
-                    stats: &mut self.stats,
-                    alias_analysis: self.alias_analysis,
-                    alias_analysis_state: &mut alias_analysis_state,
-                };
-
-                if is_pure_for_egraph(ctx.func, inst) {
-                    // Insert into GVN map and optimize any new nodes
-                    // inserted (recursively performing this work for
-                    // any nodes the optimization rules produce).
-                    let inst = NewOrExistingInst::Existing(inst);
-                    ctx.insert_pure_enode(inst);
-                    // We've now rewritten all uses, or will when we
-                    // see them, and the instruction exists as a pure
-                    // enode in the eclass, so we can remove it.
-                    cursor.remove_inst_and_step_back();
-                } else {
-                    if ctx.optimize_skeleton_inst(inst) {
-                        cursor.remove_inst_and_step_back();
+                    for &param in cursor.func.dfg.block_params(block) {
+                        trace!("creating initial singleton eclass for blockparam {}", param);
+                        self.eclasses.add(param);
+                        value_to_opt_value[param] = param;
                     }
+                    while let Some(inst) = cursor.next_inst() {
+                        trace!("Processing inst {}", inst);
+
+                        // While we're passing over all insts, create initial
+                        // singleton eclasses for all result and blockparam
+                        // values. Also do initial analysis of all inst
+                        // results.
+                        for &result in cursor.func.dfg.inst_results(inst) {
+                            trace!("creating initial singleton eclass for {}", result);
+                            self.eclasses.add(result);
+                        }
+
+                        // Rewrite args of *all* instructions using the
+                        // value-to-opt-value map.
+                        cursor.func.dfg.resolve_aliases_in_arguments(inst);
+                        cursor.func.dfg.map_inst_values(inst, |_, arg| {
+                            let new_value = value_to_opt_value[arg];
+                            trace!("rewriting arg {} of inst {} to {}", arg, inst, new_value);
+                            debug_assert_ne!(new_value, Value::reserved_value());
+                            new_value
+                        });
+
+                        // Build a context for optimization, with borrows of
+                        // state. We can't invoke a method on `self` because
+                        // we've borrowed `self.func` mutably (as
+                        // `cursor.func`) so we pull apart the pieces instead
+                        // here.
+                        let mut ctx = OptimizeCtx {
+                            func: cursor.func,
+                            value_to_opt_value: &mut value_to_opt_value,
+                            gvn_map: &mut gvn_map,
+                            effectful_gvn_map: &mut effectful_gvn_map,
+                            eclasses: &mut self.eclasses,
+                            rewrite_depth: 0,
+                            subsume_values: FxHashSet::default(),
+                            remat_values: &mut self.remat_values,
+                            stats: &mut self.stats,
+                            alias_analysis: self.alias_analysis,
+                            alias_analysis_state: &mut alias_analysis_state,
+                        };
+
+                        if is_pure_for_egraph(ctx.func, inst) {
+                            // Insert into GVN map and optimize any new nodes
+                            // inserted (recursively performing this work for
+                            // any nodes the optimization rules produce).
+                            let inst = NewOrExistingInst::Existing(inst);
+                            ctx.insert_pure_enode(inst);
+                            // We've now rewritten all uses, or will when we
+                            // see them, and the instruction exists as a pure
+                            // enode in the eclass, so we can remove it.
+                            cursor.remove_inst_and_step_back();
+                        } else {
+                            if ctx.optimize_skeleton_inst(inst) {
+                                cursor.remove_inst_and_step_back();
+                            }
+                        }
+                    }
+                }
+                StackEntry::Pop => {
+                    effectful_gvn_map.decrement_depth();
                 }
             }
         }
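
The scoped map plus explicit `Visit`/`Pop` stack is the heart of the change: an entry inserted while visiting a block stays visible exactly for the blocks that block dominates, and disappears when the subtree's `Pop` entry is reached. Below is a minimal standalone sketch of the pattern with a simplified scoped map; Cranelift's real `ScopedHashMap` additionally restores keys shadowed across scopes, which this toy version omits (that simplification matches how the GVN map is used, since a key already present is reused rather than re-inserted):

use std::collections::HashMap;
use std::hash::Hash;

// Toy scoped map: `pop_scope` undoes every insertion made since the
// matching `push_scope`.
struct ScopedMap<K, V> {
    map: HashMap<K, V>,
    scopes: Vec<Vec<K>>, // keys inserted in each open scope
}

impl<K: Hash + Eq + Clone, V> ScopedMap<K, V> {
    fn new() -> Self {
        Self { map: HashMap::new(), scopes: Vec::new() }
    }
    fn push_scope(&mut self) {
        self.scopes.push(Vec::new());
    }
    fn pop_scope(&mut self) {
        for key in self.scopes.pop().expect("unbalanced pop_scope") {
            self.map.remove(&key);
        }
    }
    fn get(&self, key: &K) -> Option<&V> {
        self.map.get(key)
    }
    fn insert(&mut self, key: K, value: V) {
        self.scopes.last_mut().expect("no open scope").push(key.clone());
        self.map.insert(key, value);
    }
}

// Domtree preorder with an explicit stack. The `Pop` entry is pushed
// *under* a block's children, so the block's scope closes only after
// its whole dominated subtree is visited: any hit in the map is
// therefore an instruction at a dominating position.
enum StackEntry {
    Visit(usize), // toy block id
    Pop,
}

fn walk(root: usize, children: &[Vec<usize>]) {
    let mut gvn: ScopedMap<String, usize> = ScopedMap::new();
    let mut stack = vec![StackEntry::Visit(root)];
    while let Some(entry) = stack.pop() {
        match entry {
            StackEntry::Visit(block) => {
                stack.push(StackEntry::Pop);
                stack.extend(children[block].iter().copied().map(StackEntry::Visit));
                gvn.push_scope();
                // Stand-in for processing the block's instructions: a
                // hit in `gvn` would be reused; a miss is inserted and
                // stays visible while this subtree is open.
                let key = format!("inst-in-block-{block}");
                if gvn.get(&key).is_none() {
                    gvn.insert(key, block);
                }
            }
            StackEntry::Pop => gvn.pop_scope(),
        }
    }
}

fn main() {
    // Block 0 dominates blocks 1 and 2; 1 and 2 are siblings, so
    // nothing inserted in 1's scope is visible while visiting 2.
    walk(0, &[vec![1, 2], vec![], vec![]]);
}
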
diff --git a/cranelift/codegen/src/inst_predicates.rs b/cranelift/codegen/src/inst_predicates.rs
index 9b284e8b8c..f82f41daac 100644
--- a/cranelift/codegen/src/inst_predicates.rs
+++ b/cranelift/codegen/src/inst_predicates.rs
@@ -73,6 +73,25 @@ pub fn is_pure_for_egraph(func: &Function, inst: Inst) -> bool {
     has_one_result && (is_readonly_load || (!op.can_load() && !trivially_has_side_effects(op)))
 }

+/// Can the given instruction be merged into another copy of itself?
+/// These instructions may have side-effects, but as long as we retain
+/// the first instance of the instruction, the second and further
+/// instances are redundant if they would produce the same trap or
+/// result.
+pub fn is_mergeable_for_egraph(func: &Function, inst: Inst) -> bool {
+    let op = func.dfg.insts[inst].opcode();
+    // We can only merge one-result operators due to the way that GVN
+    // is structured in the egraph implementation.
+    let has_one_result = func.dfg.inst_results(inst).len() == 1;
+    has_one_result
+        // Loads/stores are handled by alias analysis and not
+        // otherwise mergeable.
+        && !op.can_load()
+        && !op.can_store()
+        // Can only have idempotent side-effects.
+        && (!has_side_effect(func, inst) || op.side_effects_idempotent())
+}
+
 /// Does the given instruction have any side-effect as per [has_side_effect], or else is a load,
 /// but not the get_pinned_reg opcode?
 pub fn has_lowering_side_effect(func: &Function, inst: Inst) -> bool {
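
The predicate is deliberately narrow. Restated as a toy rule over hypothetical opcode properties (plain fields standing in for Cranelift's real `Opcode` methods such as `can_load()` and `side_effects_idempotent()`):

// Toy restatement of the mergeability rule above.
#[derive(Clone, Copy)]
struct OpProps {
    results: usize,           // number of SSA results
    loads: bool,              // may read memory
    stores: bool,             // may write memory
    side_effecting: bool,     // has any side effect (e.g. can trap)
    idempotent_effects: bool, // re-executing repeats the same effect
}

fn mergeable(op: OpProps) -> bool {
    // One result, no memory access, and effects (if any) that are a
    // pure function of the operands -- so a dominated duplicate must
    // trap or produce a value exactly as the first copy did.
    op.results == 1
        && !op.loads
        && !op.stores
        && (!op.side_effecting || op.idempotent_effects)
}

fn main() {
    // `udiv`: traps on a zero divisor, but deterministically.
    let udiv = OpProps {
        results: 1,
        loads: false,
        stores: false,
        side_effecting: true,
        idempotent_effects: true,
    };
    // A call may do anything; running it twice is observable.
    let call = OpProps {
        results: 1,
        loads: true,
        stores: true,
        side_effecting: true,
        idempotent_effects: false,
    };
    assert!(mergeable(udiv));
    assert!(!mergeable(call));
}

Division is the motivating case: `udiv` can trap, but the trap is a deterministic function of its operands, so a second copy at a dominated position can only behave identically to the first.
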
diff --git a/cranelift/codegen/src/ir/instructions.rs b/cranelift/codegen/src/ir/instructions.rs
index 97e0ac0b5a..f28cc45d2e 100644
--- a/cranelift/codegen/src/ir/instructions.rs
+++ b/cranelift/codegen/src/ir/instructions.rs
@@ -45,7 +45,7 @@ pub type ValueListPool = entity::ListPool<Value>;
 /// The BlockCall::new function guarantees this layout by requiring a block argument that's written
 /// in as the first element of the EntityList. Any subsequent entries are always assumed to be real
 /// Values.
-#[derive(Debug, Clone, Copy, PartialEq, Hash)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 #[cfg_attr(feature = "enable-serde", derive(Serialize, Deserialize))]
 pub struct BlockCall {
     /// The underlying storage for the BlockCall. The first element of the values EntityList is
diff --git a/cranelift/entity/src/list.rs b/cranelift/entity/src/list.rs
index 659b94a9bc..682eb256fa 100644
--- a/cranelift/entity/src/list.rs
+++ b/cranelift/entity/src/list.rs
@@ -62,7 +62,7 @@ use serde::{Deserialize, Serialize};
 ///
 /// The index stored in an `EntityList` points to part 2, the list elements. The value 0 is
 /// reserved for the empty list which isn't allocated in the vector.
-#[derive(Clone, Copy, Debug, PartialEq, Hash)]
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
 #[cfg_attr(feature = "enable-serde", derive(Serialize, Deserialize))]
 pub struct EntityList<T: EntityRef + ReservedValue> {
     index: u32,
diff --git a/cranelift/filetests/filetests/egraph/issue-5796.clif b/cranelift/filetests/filetests/egraph/issue-5796.clif
new file mode 100644
index 0000000000..66963f5c4e
--- /dev/null
+++ b/cranelift/filetests/filetests/egraph/issue-5796.clif
@@ -0,0 +1,76 @@
+test optimize
+set opt_level=speed
+target x86_64
+
+function %f0(i32, i32, i32) -> i32 {
+block0(v0: i32, v1: i32, v2: i32):
+    brif v0, block1, block2
+
+block1:
+    v3 = udiv v1, v2
+    return v3
+
+block2:
+    v4 = udiv v1, v2
+    brif v1, block3, block4
+
+block3:
+    return v4
+
+block4:
+    return v4
+}
+
+; check: block0(v0: i32, v1: i32, v2: i32):
+; check: brif v0, block1, block2
+
+; check: block1:
+; check: v3 = udiv.i32 v1, v2
+; check: return v3
+
+; check: block2:
+; check: v4 = udiv.i32 v1, v2
+; check: brif.i32 v1, block3, block4
+
+; check: block3:
+; check: return v4
+
+; check: block4:
+; check: return v4
+
+function %f1(i32, i32, i32) -> i32 {
+block0(v0: i32, v1: i32, v2: i32):
+    brif v0, block1, block2
+
+block1:
+    v3 = udiv v1, v2
+    return v3
+
+block2:
+    v4 = udiv v1, v2
+    brif v1, block3, block4
+
+block3:
+    v5 = udiv v1, v2
+    return v5
+
+block4:
+    return v4
+}
+
+; check: block0(v0: i32, v1: i32, v2: i32):
+; check: brif v0, block1, block2
+
+; check: block1:
+; check: v3 = udiv.i32 v1, v2
+; check: return v3
+
+; check: block2:
+; check: v4 = udiv.i32 v1, v2
+; check: brif.i32 v1, block3, block4
+
+; check: block3:
+; check: return v4
+
+; check: block4:
+; check: return v4
diff --git a/cranelift/filetests/filetests/wasm/duplicate-loads-dynamic-memory-egraph.wat b/cranelift/filetests/filetests/wasm/duplicate-loads-dynamic-memory-egraph.wat
new file mode 100644
index 0000000000..b6d4e88dcf
--- /dev/null
+++ b/cranelift/filetests/filetests/wasm/duplicate-loads-dynamic-memory-egraph.wat
@@ -0,0 +1,92 @@
+;;! target = "x86_64"
+;;!
+;;! optimize = true
+;;!
+;;! settings = [
+;;!   "enable_heap_access_spectre_mitigation=true",
+;;!   "opt_level=speed_and_size",
+;;!   "use_egraphs=true"
+;;! ]
+;;!
+;;! [globals.vmctx]
+;;! type = "i64"
+;;! vmctx = true
+;;!
+;;! [globals.heap_base]
+;;! type = "i64"
+;;! load = { base = "vmctx", offset = 0 }
+;;!
+;;! [globals.heap_bound]
+;;! type = "i64"
+;;! load = { base = "vmctx", offset = 8 }
+;;!
+;;! [[heaps]]
+;;! base = "heap_base"
+;;! min_size = 0
+;;! offset_guard_size = 0xffffffff
+;;! index_type = "i32"
+;;! style = { kind = "dynamic", bound = "heap_bound" }
+
+(module
+  (memory (export "memory") 0)
+  (func (export "load-without-offset") (param i32) (result i32 i32)
+    local.get 0
+    i32.load
+    local.get 0
+    i32.load
+  )
+  (func (export "load-with-offset") (param i32) (result i32 i32)
+    local.get 0
+    i32.load offset=1234
+    local.get 0
+    i32.load offset=1234
+  )
+)
+
+;; function u0:0(i32, i64 vmctx) -> i32, i32 fast {
+;;     gv0 = vmctx
+;;     gv1 = load.i64 notrap aligned gv0+8
+;;     gv2 = load.i64 notrap aligned gv0
+;;
+;; block0(v0: i32, v1: i64):
+;; @0057    v4 = uextend.i64 v0
+;; @0057    v5 = iconst.i64 4
+;; @0057    v6 = uadd_overflow_trap v4, v5, heap_oob  ; v5 = 4
+;; @0057    v7 = load.i64 notrap aligned v1+8
+;; @0057    v8 = load.i64 notrap aligned v1
+;; @0057    v11 = icmp ugt v6, v7
+;; @0057    v10 = iconst.i64 0
+;; @0057    v9 = iadd v8, v4
+;; @0057    v12 = select_spectre_guard v11, v10, v9  ; v10 = 0
+;; @0057    v13 = load.i32 little heap v12
+;;          v2 -> v13
+;; @005f    jump block1
+;;
+;; block1:
+;; @005f    return v13, v13
+;; }
+;;
+;; function u0:1(i32, i64 vmctx) -> i32, i32 fast {
+;;     gv0 = vmctx
+;;     gv1 = load.i64 notrap aligned gv0+8
+;;     gv2 = load.i64 notrap aligned gv0
+;;
+;; block0(v0: i32, v1: i64):
+;; @0064    v4 = uextend.i64 v0
+;; @0064    v5 = iconst.i64 1238
+;; @0064    v6 = uadd_overflow_trap v4, v5, heap_oob  ; v5 = 1238
+;; @0064    v7 = load.i64 notrap aligned v1+8
+;; @0064    v8 = load.i64 notrap aligned v1
+;; @0064    v12 = icmp ugt v6, v7
+;; @0064    v11 = iconst.i64 0
+;; @0064    v9 = iadd v8, v4
+;;          v26 = iconst.i64 1234
+;; @0064    v10 = iadd v9, v26  ; v26 = 1234
+;; @0064    v13 = select_spectre_guard v12, v11, v10  ; v11 = 0
+;; @0064    v14 = load.i32 little heap v13
+;;          v2 -> v14
+;; @006e    jump block1
+;;
+;; block1:
+;; @006e    return v14, v14
+;; }
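
A final note on the `Eq` derives threaded through the diff: `InstructionData`, `BlockCall`, and `EntityList` become hash-map keys in the GVN maps, and that requires full `Eq`, not just `PartialEq`. The comment in `egraph.rs` about shallow equality can be illustrated with a toy model (hypothetical types, not Cranelift's): instruction data whose arguments live behind a list handle can be deep-equal without comparing shallowly equal, and the scoped GVN simply misses such merges, which is sound.

// Toy model of why shallow equality on instruction data is
// conservative but sound for GVN.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
struct ListHandle(u32); // index into an external pool, like `EntityList`

#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
enum ToyInstData {
    // Arguments stored inline: shallow equality is deep equality.
    Binary { opcode: u8, args: [u32; 2] },
    // Arguments behind a handle: `==` compares the handle, not the
    // pooled contents it points at.
    Call { func: u32, args: ListHandle },
}

fn main() {
    // Two identical argument lists living at different pool slots.
    let pool: Vec<Vec<u32>> = vec![vec![1, 2], vec![1, 2]];
    assert_eq!(pool[0], pool[1]);

    // Deep-equal calls that compare shallowly unequal: a GVN map
    // keyed on `ToyInstData` keeps both copies. That only forgoes an
    // optimization; it never merges two genuinely different insts.
    let a = ToyInstData::Call { func: 7, args: ListHandle(0) };
    let b = ToyInstData::Call { func: 7, args: ListHandle(1) };
    assert_ne!(a, b);

    // Ops with inline args, like the `udiv`s in the tests above,
    // compare equal exactly when they are the same operation.
    let c = ToyInstData::Binary { opcode: 1, args: [1, 2] };
    let d = ToyInstData::Binary { opcode: 1, args: [1, 2] };
    assert_eq!(c, d);
    println!("{a:?} != {b:?}; {c:?} == {d:?}");
}
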