From 0bc0503f3f6c6cfe8dfc7c46e185db26f8ac494f Mon Sep 17 00:00:00 2001 From: Julian Seward Date: Wed, 6 May 2020 09:08:14 +0200 Subject: [PATCH] Add a transformation pass which removes phi nodes to which it can demonstrate that only one value ever flows. Has been observed to improve generated code run times by up to 8%. Compilation cost increases by about 0.6%, but up to 7% total cost has been observed to be saved; iow it can be a significant win in terms of compilation time, overall. --- cranelift/codegen/src/context.rs | 13 + cranelift/codegen/src/lib.rs | 1 + cranelift/codegen/src/remove_constant_phis.rs | 393 ++++++++++++++++++ cranelift/codegen/src/timing.rs | 1 + 4 files changed, 408 insertions(+) create mode 100644 cranelift/codegen/src/remove_constant_phis.rs diff --git a/cranelift/codegen/src/context.rs b/cranelift/codegen/src/context.rs index d233946a65..2c4ce6b492 100644 --- a/cranelift/codegen/src/context.rs +++ b/cranelift/codegen/src/context.rs @@ -27,6 +27,7 @@ use crate::nan_canonicalization::do_nan_canonicalization; use crate::postopt::do_postopt; use crate::redundant_reload_remover::RedundantReloadRemover; use crate::regalloc; +use crate::remove_constant_phis::do_remove_constant_phis; use crate::result::CodegenResult; use crate::settings::{FlagsOrIsa, OptLevel}; use crate::simple_gvn::do_simple_gvn; @@ -179,6 +180,8 @@ impl Context { self.dce(isa)?; } + self.remove_constant_phis(isa)?; + if let Some(backend) = isa.get_mach_backend() { let result = backend.compile_function(&self.func, self.want_disasm)?; let info = result.code_info(); @@ -292,6 +295,16 @@ impl Context { Ok(()) } + /// Perform constant-phi removal on the function. + pub fn remove_constant_phis<'a, FOI: Into>>( + &mut self, + fisa: FOI, + ) -> CodegenResult<()> { + do_remove_constant_phis(&mut self.func, &mut self.domtree); + self.verify_if(fisa)?; + Ok(()) + } + /// Perform pre-legalization rewrites on the function. pub fn preopt(&mut self, isa: &dyn TargetIsa) -> CodegenResult<()> { do_preopt(&mut self.func, &mut self.cfg, isa); diff --git a/cranelift/codegen/src/lib.rs b/cranelift/codegen/src/lib.rs index d178187914..efe0ea00c6 100644 --- a/cranelift/codegen/src/lib.rs +++ b/cranelift/codegen/src/lib.rs @@ -105,6 +105,7 @@ mod postopt; mod predicates; mod redundant_reload_remover; mod regalloc; +mod remove_constant_phis; mod result; mod scoped_hash_map; mod simple_gvn; diff --git a/cranelift/codegen/src/remove_constant_phis.rs b/cranelift/codegen/src/remove_constant_phis.rs new file mode 100644 index 0000000000..26a9dc8de3 --- /dev/null +++ b/cranelift/codegen/src/remove_constant_phis.rs @@ -0,0 +1,393 @@ +//! A Constant-Phi-Node removal pass. + +use log::info; + +use crate::dominator_tree::DominatorTree; +use crate::entity::EntityList; +use crate::fx::FxHashMap; +use crate::fx::FxHashSet; +use crate::ir::instructions::BranchInfo; +use crate::ir::Function; +use crate::ir::{Block, Inst, Value}; +use crate::timing; + +use smallvec::{smallvec, SmallVec}; +use std::vec::Vec; + +// A note on notation. For the sake of clarity, this file uses the phrase +// "formal parameters" to mean the `Value`s listed in the block head, and +// "actual parameters" to mean the `Value`s passed in a branch or a jump: +// +// block4(v16: i32, v18: i32): <-- formal parameters +// ... +// brnz v27, block7(v22, v24) <-- actual parameters +// jump block6 + +// This transformation pass (conceptually) partitions all values in the +// function into two groups: +// +// * Group A: values defined by block formal parameters, except for the entry block. +// +// * Group B: All other values: that is, values defined by instructions, +// and the formals of the entry block. +// +// For each value in Group A, it attempts to establish whether it will have +// the value of exactly one member of Group B. If so, the formal parameter is +// deleted, all corresponding actual parameters (in jumps/branches to the +// defining block) are deleted, and a rename is inserted. +// +// The entry block is special-cased because (1) we don't know what values flow +// to its formals and (2) in any case we can't change its formals. +// +// Work proceeds in three phases. +// +// * Phase 1: examine all instructions. For each block, make up a useful +// grab-bag of information, `BlockSummary`, that summarises the block's +// formals and jump/branch instruction. This is used by Phases 2 and 3. +// +// * Phase 2: for each value in Group A, try to find a single Group B value +// that flows to it. This is done using a classical iterative forward +// dataflow analysis over a simple constant-propagation style lattice. It +// converges quickly in practice -- I have seen at most 4 iterations. This +// is relatively cheap because the iteration is done over the +// `BlockSummary`s, and does not visit each instruction. The resulting +// fixed point is stored in a `SolverState`. +// +// * Phase 3: using the `SolverState` and `BlockSummary`, edit the function to +// remove redundant formals and actuals, and to insert suitable renames. +// +// Note that the effectiveness of the analysis depends on on the fact that +// there are no copy instructions in Cranelift's IR. If there were, the +// computation of `actual_absval` in Phase 2 would have to be extended to +// chase through such copies. +// +// For large functions, the analysis cost using the new AArch64 backend is about +// 0.6% of the non-optimising compile time, as measured by instruction counts. +// This transformation usually pays for itself several times over, though, by +// reducing the isel/regalloc cost downstream. Gains of up to 7% have been +// seen for large functions. + +// The `Value`s (Group B) that can flow to a formal parameter (Group A). +#[derive(Clone, Copy, Debug, PartialEq)] +enum AbstractValue { + // Two or more values flow to this formal. + Many, + // Exactly one value, as stated, flows to this formal. The `Value`s that + // can appear here are exactly: `Value`s defined by `Inst`s, plus the + // `Value`s defined by the formals of the entry block. Note that this is + // exactly the set of `Value`s that are *not* tracked in the solver below + // (see `SolverState`). + One(Value /*Group B*/), + // No value flows to this formal. + None, +} + +impl AbstractValue { + fn join(self, other: AbstractValue) -> AbstractValue { + match (self, other) { + // Joining with `None` has no effect + (AbstractValue::None, p2) => p2, + (p1, AbstractValue::None) => p1, + // Joining with `Many` produces `Many` + (AbstractValue::Many, _p2) => AbstractValue::Many, + (_p1, AbstractValue::Many) => AbstractValue::Many, + // The only interesting case + (AbstractValue::One(v1), AbstractValue::One(v2)) => { + if v1 == v2 { + AbstractValue::One(v1) + } else { + AbstractValue::Many + } + } + } + } + fn is_one(self) -> bool { + if let AbstractValue::One(_) = self { + true + } else { + false + } + } +} + +// For some block, a useful bundle of info. The `Block` itself is not stored +// here since it will be the key in the associated `FxHashMap` -- see +// `summaries` below. For the `SmallVec` tuning params: most blocks have +// few parameters, hence `4`. And almost all blocks have either one or two +// successors, hence `2`. +#[derive(Debug)] +struct BlockSummary { + // Formal parameters for this `Block` + formals: SmallVec<[Value; 4] /*Group A*/>, + // For each `Inst` in this block that transfers to another block: the + // `Inst` itself, the destination `Block`, and the actual parameters + // passed. We don't bother to include transfers that pass zero parameters + // since that makes more work for the solver for no purpose. + dests: SmallVec<[(Inst, Block, SmallVec<[Value; 4] /*both Groups A and B*/>); 2]>, +} +impl BlockSummary { + fn new(formals: SmallVec<[Value; 4]>) -> Self { + Self { + formals, + dests: smallvec![], + } + } +} + +// Solver state. This holds a AbstractValue for each formal parameter, except +// for those from the entry block. +struct SolverState { + absvals: FxHashMap, +} +impl SolverState { + fn new() -> Self { + Self { + absvals: FxHashMap::default(), + } + } + fn get(&self, actual: Value) -> AbstractValue { + match self.absvals.get(&actual) { + Some(lp) => *lp, + None => panic!("SolverState::get: formal param {:?} is untracked?!", actual), + } + } + fn maybe_get(&self, actual: Value) -> Option<&AbstractValue> { + self.absvals.get(&actual) + } + fn set(&mut self, actual: Value, lp: AbstractValue) { + match self.absvals.insert(actual, lp) { + Some(_old_lp) => {} + None => panic!("SolverState::set: formal param {:?} is untracked?!", actual), + } + } +} + +/// Detect phis in `func` that will only ever produce one value, using a +/// classic forward dataflow analysis. Then remove them. +#[inline(never)] +pub fn do_remove_constant_phis(func: &mut Function, domtree: &mut DominatorTree) { + let _tt = timing::remove_constant_phis(); + debug_assert!(domtree.is_valid()); + + // Get the blocks, in reverse postorder + let mut blocks_reverse_postorder = Vec::::new(); + for block in domtree.cfg_postorder() { + blocks_reverse_postorder.push(*block); + } + blocks_reverse_postorder.reverse(); + + // Phase 1 of 3: for each block, make a summary containing all relevant + // info. The solver will iterate over the summaries, rather than having + // to inspect each instruction in each block. + let mut summaries = FxHashMap::::default(); + + for b in &blocks_reverse_postorder { + let formals = func.dfg.block_params(*b); + let mut summary = BlockSummary::new(SmallVec::from(formals)); + + for inst in func.layout.block_insts(*b) { + let idetails = &func.dfg[inst]; + // Note that multi-dest transfers (i.e., branch tables) don't + // carry parameters in our IR, so we only have to care about + // `SingleDest` here. + if let BranchInfo::SingleDest(dest, _) = idetails.analyze_branch(&func.dfg.value_lists) + { + let inst_var_args = func.dfg.inst_variable_args(inst); + // Skip branches/jumps that carry no params. + if inst_var_args.len() > 0 { + let mut actuals = SmallVec::<[Value; 4]>::new(); + for arg in inst_var_args { + let arg = func.dfg.resolve_aliases(*arg); + actuals.push(arg); + } + summary.dests.push((inst, dest, actuals)); + } + } + } + + // Ensure the invariant that all blocks (except for the entry) appear + // in the summary, *unless* they have neither formals nor any + // param-carrying branches/jumps. + if formals.len() > 0 || summary.dests.len() > 0 { + summaries.insert(*b, summary); + } + } + + // Phase 2 of 3: iterate over the summaries in reverse postorder, + // computing new `AbstractValue`s for each tracked `Value`. The set of + // tracked `Value`s is exactly Group A as described above. + + let entry_block = func + .layout + .entry_block() + .expect("remove_constant_phis: entry block unknown"); + + // Set up initial solver state + let mut state = SolverState::new(); + + for b in &blocks_reverse_postorder { + // For each block, get the formals + if *b == entry_block { + continue; + } + let formals: &[Value] = func.dfg.block_params(*b); + for formal in formals { + let mb_old_absval = state.absvals.insert(*formal, AbstractValue::None); + assert!(mb_old_absval.is_none()); + } + } + + // Solve: repeatedly traverse the blocks in reverse postorder, until there + // are no changes. + let mut iter_no = 0; + loop { + iter_no += 1; + let mut changed = false; + + for src in &blocks_reverse_postorder { + let mb_src_summary = summaries.get(src); + // The src block might have no summary. This means it has no + // branches/jumps that carry parameters *and* it doesn't take any + // parameters itself. Phase 1 ensures this. So we can ignore it. + if mb_src_summary.is_none() { + continue; + } + let src_summary = mb_src_summary.unwrap(); + for (_inst, dst, src_actuals) in &src_summary.dests { + assert!(*dst != entry_block); + // By contrast, the dst block must have a summary. Phase 1 + // will have only included an entry in `src_summary.dests` if + // that branch/jump carried at least one parameter. So the + // dst block does take parameters, so it must have a summary. + let dst_summary = summaries + .get(dst) + .expect("remove_constant_phis: dst block has no summary"); + let dst_formals = &dst_summary.formals; + assert!(src_actuals.len() == dst_formals.len()); + for (formal, actual) in dst_formals.iter().zip(src_actuals.iter()) { + // Find the abstract value for `actual`. If it is a block + // formal parameter then the most recent abstract value is + // to be found in the solver state. If not, then it's a + // real value defining point (not a phi), in which case + // return it itself. + let actual_absval = match state.maybe_get(*actual) { + Some(pt) => *pt, + None => AbstractValue::One(*actual), + }; + + // And `join` the new value with the old. + let formal_absval_old = state.get(*formal); + let formal_absval_new = formal_absval_old.join(actual_absval); + if formal_absval_new != formal_absval_old { + changed = true; + state.set(*formal, formal_absval_new); + } + } + } + } + + if !changed { + break; + } + } + let mut n_consts = 0; + for absval in state.absvals.values() { + if absval.is_one() { + n_consts += 1; + } + } + + // Phase 3 of 3: edit the function to remove constant formals, using the + // summaries and the final solver state as a guide. + + // Make up a set of blocks that need editing. + let mut need_editing = FxHashSet::::default(); + for (block, summary) in &summaries { + if *block == entry_block { + continue; + } + for formal in &summary.formals { + let formal_absval = state.get(*formal); + if formal_absval.is_one() { + need_editing.insert(*block); + break; + } + } + } + + // Firstly, deal with the formals. For each formal which is redundant, + // remove it, and also add a reroute from it to the constant value which + // it we know it to be. + for b in &need_editing { + let mut del_these = SmallVec::<[(Value, Value); 32]>::new(); + let formals: &[Value] = func.dfg.block_params(*b); + for formal in formals { + // The state must give an absval for `formal`. + if let AbstractValue::One(replacement_val) = state.get(*formal) { + del_these.push((*formal, replacement_val)); + } + } + // We can delete the formals in any order. However, + // `remove_block_param` works by sliding backwards all arguments to + // the right of the it is asked to delete. Hence when removing more + // than one formal, it is significantly more efficient to ask it to + // remove the rightmost formal first, and hence this `reverse`. + del_these.reverse(); + for (redundant_formal, replacement_val) in del_these { + func.dfg.remove_block_param(redundant_formal); + func.dfg.change_to_alias(redundant_formal, replacement_val); + } + } + + // Secondly, visit all branch insns. If the destination has had its + // formals changed, change the actuals accordingly. Don't scan all insns, + // rather just visit those as listed in the summaries we prepared earlier. + for (_src_block, summary) in &summaries { + for (inst, dst_block, _src_actuals) in &summary.dests { + if !need_editing.contains(dst_block) { + continue; + } + + let old_actuals = func.dfg[*inst].take_value_list().unwrap(); + let num_old_actuals = old_actuals.len(&func.dfg.value_lists); + let num_fixed_actuals = func.dfg[*inst] + .opcode() + .constraints() + .num_fixed_value_arguments(); + let dst_summary = summaries.get(&dst_block).unwrap(); + + // Check that the numbers of arguments make sense. + assert!(num_fixed_actuals <= num_old_actuals); + assert!(num_fixed_actuals + dst_summary.formals.len() == num_old_actuals); + + // Create a new value list. + let mut new_actuals = EntityList::::new(); + // Copy the fixed args to the new list + for i in 0..num_fixed_actuals { + let val = old_actuals.get(i, &func.dfg.value_lists).unwrap(); + new_actuals.push(val, &mut func.dfg.value_lists); + } + + // Copy the variable args (the actual block params) to the new + // list, filtering out redundant ones. + for i in 0..dst_summary.formals.len() { + let actual_i = old_actuals + .get(num_fixed_actuals + i, &func.dfg.value_lists) + .unwrap(); + let formal_i = dst_summary.formals[i]; + let is_redundant = state.get(formal_i).is_one(); + if !is_redundant { + new_actuals.push(actual_i, &mut func.dfg.value_lists); + } + } + func.dfg[*inst].put_value_list(new_actuals); + } + } + + info!( + "do_remove_constant_phis: done, {} iters. {} formals, of which {} const.", + iter_no, + state.absvals.len(), + n_consts + ); +} diff --git a/cranelift/codegen/src/timing.rs b/cranelift/codegen/src/timing.rs index 3cfc67f79d..7957541933 100644 --- a/cranelift/codegen/src/timing.rs +++ b/cranelift/codegen/src/timing.rs @@ -62,6 +62,7 @@ define_passes! { gvn: "Global value numbering", licm: "Loop invariant code motion", unreachable_code: "Remove unreachable blocks", + remove_constant_phis: "Remove constant phi-nodes", regalloc: "Register allocation", ra_liveness: "RA liveness analysis",