Rewrite lower_edge to produce better phi-translations:
* ensure that all const assignments are placed at the end of the sequence.
This minimises live ranges.
* for the non-const assignments, ignore self-assignments. This can
dramatically reduce the total number of moves generated, because any
self-assignments trigger the overlap-case handling, hence invoking the
double-copy behaviour in cases where it's not necessary.
It's worth pointing out that self-assignments are common, and are not due to
deficiencies in CLIR optimisation. Rather, they occur whenever a loop back
edge doesn't modify *all* loop-carried values. This can easily happen if
the loop has multiple "early" back-edges -- "continues" in C parlance. Eg:
loop_header(a, b, c, d, e, f):
...
a_new = ...
b_new = ...
if (..) goto loop_header(a_new, b_new, c, d, e, f)
...
c_new = ...
d_new = ...
if (..) goto loop_header(a_new, b_new, c_new, d_new, e, f)
etc
For functions with many live values, this can dramatically reduce the number
of spill moves we throw into the register allocator.
In terms of compilation costs, this ranges from neutral for functions which
do not spill at all, or only minimally (joey_small, joey_med), to a 7.1% reduction in
insn count.
In terms of run costs, for one spill-heavy test (bz2 w/ custom timing harness),
instruction counts are reduced by 4.3%, data reads by 12.3% and data writes
by 18.5%. Note those last two figures include all reads and writes made by the
generated code, not just spills/reloads, so the proportional reduction in
spill/reload traffic must be greater.
This commit is contained in:
@@ -408,112 +408,100 @@ impl<'func, I: VCodeInst> Lower<'func, I> {
|
|||||||
fn lower_edge(&mut self, pred: Block, inst: Inst, succ: Block) -> CodegenResult<()> {
|
fn lower_edge(&mut self, pred: Block, inst: Inst, succ: Block) -> CodegenResult<()> {
|
||||||
debug!("lower_edge: pred {} succ {}", pred, succ);
|
debug!("lower_edge: pred {} succ {}", pred, succ);
|
||||||
|
|
||||||
let mut src_regs: SmallVec<[Option<Reg>; 16]> = SmallVec::new();
|
let num_args = self.f.dfg.block_params(succ).len();
|
||||||
let mut src_consts: SmallVec<[Option<u64>; 16]> = SmallVec::new();
|
debug_assert!(num_args == self.f.dfg.inst_variable_args(inst).len());
|
||||||
let mut dst_regs: SmallVec<[Writable<Reg>; 16]> = SmallVec::new();
|
|
||||||
|
|
||||||
fn overlap(a: &[Option<Reg>], b: &[Writable<Reg>]) -> bool {
|
// Most blocks have no params, so skip all the hoop-jumping below and make an early exit.
|
||||||
let mut set = FxHashSet::default();
|
if num_args == 0 {
|
||||||
for &maybe_reg in a {
|
return Ok(());
|
||||||
if let Some(r) = maybe_reg {
|
|
||||||
set.insert(r);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for &reg in b {
|
|
||||||
if set.contains(&reg.to_reg()) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
false
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Create a temporary for each block parameter.
|
// Make up two vectors of info:
|
||||||
let phi_classes: SmallVec<[Type; 16]> = self
|
//
|
||||||
|
// * one for dsts which are to be assigned constants. We'll deal with those second, so
|
||||||
|
// as to minimise live ranges.
|
||||||
|
//
|
||||||
|
// * one for dsts whose sources are non-constants.
|
||||||
|
|
||||||
|
let mut const_bundles = SmallVec::<[(Type, Writable<Reg>, u64); 16]>::new();
|
||||||
|
let mut var_bundles = SmallVec::<[(Type, Writable<Reg>, Reg); 16]>::new();
|
||||||
|
|
||||||
|
let mut i = 0;
|
||||||
|
for (dst_val, src_val) in self
|
||||||
.f
|
.f
|
||||||
.dfg
|
.dfg
|
||||||
.block_params(succ)
|
.block_params(succ)
|
||||||
.iter()
|
.iter()
|
||||||
.map(|p| self.f.dfg.value_type(*p))
|
.zip(self.f.dfg.inst_variable_args(inst).iter())
|
||||||
.collect();
|
{
|
||||||
|
let src_val = self.f.dfg.resolve_aliases(*src_val);
|
||||||
|
let ty = self.f.dfg.value_type(src_val);
|
||||||
|
|
||||||
|
debug_assert!(ty == self.f.dfg.value_type(*dst_val));
|
||||||
|
let dst_reg = self.value_regs[*dst_val];
|
||||||
|
|
||||||
|
let input = self.get_input_for_val(inst, src_val);
|
||||||
|
debug!("jump arg {} is {}, reg {:?}", i, src_val, input.reg);
|
||||||
|
i += 1;
|
||||||
|
|
||||||
// Create all of the phi uses (reads) from jump args to temps.
|
|
||||||
// Round up all the source and destination regs
|
|
||||||
for (i, arg) in self.f.dfg.inst_variable_args(inst).iter().enumerate() {
|
|
||||||
let arg = self.f.dfg.resolve_aliases(*arg);
|
|
||||||
let input = self.get_input_for_val(inst, arg);
|
|
||||||
debug!("jump arg {} is {}, reg {:?}", i, arg, input.reg);
|
|
||||||
if let Some(c) = input.constant {
|
if let Some(c) = input.constant {
|
||||||
src_consts.push(Some(c));
|
const_bundles.push((ty, Writable::from_reg(dst_reg), c));
|
||||||
src_regs.push(None);
|
|
||||||
} else {
|
} else {
|
||||||
self.use_input_reg(input);
|
self.use_input_reg(input);
|
||||||
src_regs.push(Some(input.reg));
|
let src_reg = input.reg;
|
||||||
src_consts.push(None);
|
// Skip self-assignments. Not only are they pointless, they falsely trigger the
|
||||||
|
// overlap-check below and hence can cause a lot of unnecessary copying through
|
||||||
|
// temporaries.
|
||||||
|
if dst_reg != src_reg {
|
||||||
|
var_bundles.push((ty, Writable::from_reg(dst_reg), src_reg));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (i, param) in self.f.dfg.block_params(succ).iter().enumerate() {
|
|
||||||
debug!("bb arg {} is {}", i, param);
|
|
||||||
dst_regs.push(Writable::from_reg(self.value_regs[*param]));
|
|
||||||
}
|
}
|
||||||
debug_assert!(src_regs.len() == dst_regs.len());
|
|
||||||
debug_assert!(src_consts.len() == dst_regs.len());
|
|
||||||
debug_assert!(phi_classes.len() == dst_regs.len());
|
|
||||||
debug!(
|
|
||||||
"src_regs = {:?} src_consts = {:?} dst_regs = {:?}",
|
|
||||||
src_regs, src_consts, dst_regs
|
|
||||||
);
|
|
||||||
|
|
||||||
// If, as is mostly the case, the source and destination register
|
// Deal first with the moves whose sources are variables.
|
||||||
// sets are non overlapping, then we can copy directly, so as to
|
|
||||||
// save the register allocator work.
|
// FIXME: use regalloc.rs' SparseSetU here. This would avoid all heap allocation
|
||||||
if !overlap(&src_regs[..], &dst_regs[..]) {
|
// for cases of up to circa 16 args. Currently not possible because regalloc.rs
|
||||||
for i in 0..dst_regs.len() {
|
// does not export it.
|
||||||
let src_reg = src_regs[i];
|
let mut src_reg_set = FxHashSet::<Reg>::default();
|
||||||
let src_const = src_consts[i];
|
for (_, _, src_reg) in &var_bundles {
|
||||||
let dst_reg = dst_regs[i];
|
src_reg_set.insert(*src_reg);
|
||||||
let ty = phi_classes[i];
|
}
|
||||||
if let Some(r) = src_reg {
|
let mut overlaps = false;
|
||||||
self.emit(I::gen_move(dst_reg, r, ty));
|
for (_, dst_reg, _) in &var_bundles {
|
||||||
} else {
|
if src_reg_set.contains(&dst_reg.to_reg()) {
|
||||||
// Generate constants fresh in phi-edge to avoid long
|
overlaps = true;
|
||||||
// live-ranges. Note that these are also excluded from the
|
break;
|
||||||
// overlap check, which increases the chance that we don't
|
|
||||||
// have to do a two-stage copy.
|
|
||||||
for inst in I::gen_constant(dst_reg, src_const.unwrap(), ty).into_iter() {
|
|
||||||
self.emit(inst);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// If, as is mostly the case, the source and destination register sets are non
|
||||||
|
// overlapping, then we can copy directly, so as to save the register allocator work.
|
||||||
|
if !overlaps {
|
||||||
|
for (ty, dst_reg, src_reg) in &var_bundles {
|
||||||
|
self.emit(I::gen_move(*dst_reg, *src_reg, *ty));
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// There's some overlap, so play safe and copy via temps.
|
// There's some overlap, so play safe and copy via temps.
|
||||||
let mut tmp_regs: SmallVec<[Writable<Reg>; 16]> = SmallVec::new();
|
let mut tmp_regs = SmallVec::<[Writable<Reg>; 16]>::new();
|
||||||
for &ty in &phi_classes {
|
for (ty, _, _) in &var_bundles {
|
||||||
tmp_regs.push(self.alloc_tmp(I::rc_for_type(ty)?, ty));
|
tmp_regs.push(self.alloc_tmp(I::rc_for_type(*ty)?, *ty));
|
||||||
|
}
|
||||||
|
for ((ty, _, src_reg), tmp_reg) in var_bundles.iter().zip(tmp_regs.iter()) {
|
||||||
|
self.emit(I::gen_move(*tmp_reg, *src_reg, *ty));
|
||||||
|
}
|
||||||
|
for ((ty, dst_reg, _), tmp_reg) in var_bundles.iter().zip(tmp_regs.iter()) {
|
||||||
|
self.emit(I::gen_move(*dst_reg, (*tmp_reg).to_reg(), *ty));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
debug!("phi_temps = {:?}", tmp_regs);
|
// Now, finally, deal with the moves whose sources are constants.
|
||||||
debug_assert!(tmp_regs.len() == src_regs.len());
|
for (ty, dst_reg, const_u64) in &const_bundles {
|
||||||
|
for inst in I::gen_constant(*dst_reg, *const_u64, *ty).into_iter() {
|
||||||
for i in 0..dst_regs.len() {
|
|
||||||
let src_reg = src_regs[i];
|
|
||||||
let tmp_reg = tmp_regs[i];
|
|
||||||
let ty = phi_classes[i];
|
|
||||||
let src_const = src_consts[i];
|
|
||||||
if let Some(src_reg) = src_reg {
|
|
||||||
self.emit(I::gen_move(tmp_reg, src_reg, ty));
|
|
||||||
} else {
|
|
||||||
for inst in I::gen_constant(tmp_reg, src_const.unwrap(), ty).into_iter() {
|
|
||||||
self.emit(inst);
|
self.emit(inst);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
for i in 0..dst_regs.len() {
|
|
||||||
let tmp_reg = tmp_regs[i].to_reg();
|
|
||||||
let dst_reg = dst_regs[i];
|
|
||||||
let ty = phi_classes[i];
|
|
||||||
self.emit(I::gen_move(dst_reg, tmp_reg, ty));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user