Remove an explicitly-set-aside scratch register per class. (#51)

Currently, regalloc2 sets aside one register per class, unconditionally,
to make move resolution possible. To solve the "parallel moves problem",
we sometimes need to perform a cyclic permutation of data among
registers or stack slots (this can result, for example, from blockparam
flow that swaps two values on a loop backedge). This set-aside scratch
register is used when a cycle exists.
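
To make the cycle case concrete, here is a minimal toy sketch (the
`Loc` type and `lower_swap` helper are illustrative stand-ins, not
regalloc2's API): a parallel swap of two locations cannot be
serialized directly, because whichever move is emitted first clobbers
the other move's source, so one value has to be routed through a
scratch location.

```rust
// Toy illustration: lowering the parallel move set {a -> b, b -> a}
// (a cycle of length two) into sequential moves via a scratch location.
#[derive(Clone, Copy, Debug)]
enum Loc {
    Reg(u8),
    Slot(u32),
}

/// Break the swap cycle by routing `a`'s value through `scratch`.
fn lower_swap(a: Loc, b: Loc, scratch: Loc) -> [(Loc, Loc); 3] {
    [
        (a, scratch), // save a's value
        (b, a),       // a's old value is now safe; overwrite a
        (scratch, b), // finish the swap
    ]
}
```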

regalloc2 also uses the scratch register when needed to break down a
stack-to-stack move (which could happen due to blockparam moves on edges
when source and destination are both spilled) into a stack-to-reg move
followed by a reg-to-stack move, because most machines have loads and
stores but not memory-to-memory moves.
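
A similarly toy sketch of that decomposition (again with an
illustrative `Loc` type, not regalloc2's API):

```rust
// Toy illustration: a stack-to-stack move is not directly encodable on
// a load/store machine, so it bounces through a register of the
// appropriate class (today, the set-aside scratch register).
#[derive(Clone, Copy, Debug)]
enum Loc {
    Reg(u8),
    Slot(u32),
}

fn lower_stack_to_stack(src_slot: Loc, dst_slot: Loc, temp_reg: Loc) -> [(Loc, Loc); 2] {
    [
        (src_slot, temp_reg), // load: stack-to-reg
        (temp_reg, dst_slot), // store: reg-to-stack
    ]
}
```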

A set-aside register is certainly the simplest solution, but it is not
optimal: it means that we have one fewer register available for use by
the program, and this can be costly especially on machines with fewer
registers (e.g., 16 GPRs/XMMs on x86-64) and especially when some
registers may be set aside by our embedder for other purposes too. Every
register we can reclaim is worth some nontrivial performance in large
function bodies!

This PR removes this restriction and allows regalloc2 to use all
available physical registers. It then solves the two problems above,
cyclic moves and stack-to-stack moves, with a two-stage approach (a
rough code sketch follows the list):

- First, it finds a location to use to resolve cycles, if any exist. If
  a register is unallocated at the location of the move, we can use it.
  Often we get lucky and this is the case. Otherwise, we allocate a
  stackslot to use as the temp. This is perfectly fine at this stage,
  even if it means that we have more stack-to-stack moves.

- Then, it resolves stack-to-stack moves into stack-to-reg /
  reg-to-stack. There are two subcases here. If there is *another*
  available free physical register, we opportunistically use it for this
  decomposition. If not, we fall back to our last-ditch option: we pick
  a victim register of the appropriate class, we allocate another
  temporary stackslot, we spill the victim to that slot just for this
  move, we do the move in the above way (stack-to-reg / reg-to-stack)
  with the victim, then we reload the victim. So one move (original
  stack-to-stack) becomes four moves, but no state is clobbered.
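
Here is the promised rough sketch of the second stage, under stated
assumptions: the `Alloc` type and `expand_stack_to_stack` helper are
hypothetical stand-ins, while the free-register probe, temporary
stackslot allocation, and per-class preferred victim correspond to the
`get_reg`, `get_stackslot`, and `preferred_victim` hooks handed to
`MoveAndScratchResolver` in the diff below. This is illustrative, not
the exact implementation.

```rust
// Sketch: rewrite one stack-to-stack move using either a free register
// (two moves) or a spilled-and-restored victim register (four moves).
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum Alloc {
    Reg(u8),
    Stack(u32),
}

fn is_stack(a: Alloc) -> bool {
    matches!(a, Alloc::Stack(_))
}

fn expand_stack_to_stack(
    src: Alloc,
    dst: Alloc,
    free_reg: Option<Alloc>,                    // an unused preg here, if any
    mut alloc_temp_slot: impl FnMut() -> Alloc, // allocates a temp stackslot
    victim: Alloc,                              // preferred victim register
) -> Vec<(Alloc, Alloc)> {
    debug_assert!(is_stack(src) && is_stack(dst));
    if let Some(reg) = free_reg {
        // Opportunistic case: bounce through a free register.
        vec![(src, reg), (reg, dst)]
    } else {
        // Last-ditch case: spill a victim register around the move.
        let save = alloc_temp_slot();
        vec![
            (victim, save), // save the victim's current value
            (src, victim),  // stack-to-reg
            (victim, dst),  // reg-to-stack
            (save, victim), // restore the victim
        ]
    }
}
```

The key property is that the expansion never needs state it cannot
reconstruct: the victim's value is preserved across the move, at the
cost of three extra edits in the worst case.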

This PR extends the `moves` fuzz target to exercise this functionality
as well, randomly choosing whether spare registers exist and randomly
generating {stack,reg}-to-{stack,reg} moves in the initial
parallel-move input set. The target does a simple symbolic simulation of
the sequential move sequence and ensures that the final state is
equivalent to the parallel-move semantics.
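
A minimal sketch of the kind of equivalence check this describes
(assumed names and types, not the actual fuzz-target code): give every
location a distinct symbol, run the sequential moves over a symbolic
state, and require that each parallel-move destination ends up holding
the symbol its source started with.

```rust
use std::collections::HashMap;

type Loc = u32; // stand-in for a register or stack slot
type Symbol = u32; // abstract "value" initially held by each location

// `parallel` is the original parallel-move set; `sequential` is the
// resolved sequence (which may mention extra scratch locations).
fn check_equivalent(parallel: &[(Loc, Loc)], sequential: &[(Loc, Loc)]) -> bool {
    // Seed every mentioned location with a distinct symbol.
    let mut state: HashMap<Loc, Symbol> = HashMap::new();
    for &(src, dst) in parallel.iter().chain(sequential) {
        for loc in [src, dst] {
            state.entry(loc).or_insert(loc);
        }
    }
    let initial = state.clone();
    // Apply the sequential moves one at a time.
    for &(src, dst) in sequential {
        let v = state[&src];
        state.insert(dst, v);
    }
    // Parallel semantics: each destination holds its source's initial value.
    parallel.iter().all(|&(src, dst)| state[&dst] == initial[&src])
}
```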

I fuzzed both the `moves` target, which focuses on the new logic, and
the `ion_checker` target, which exercises the whole register
allocator; both seem clean (~150M cases on the former, ~1M cases on
the latter).
Chris Fallin
2022-05-23 10:48:37 -07:00
committed by GitHub
parent 33611a68b9
commit 869c21e79c
8 changed files with 402 additions and 117 deletions

View File

@@ -351,7 +351,8 @@ pub struct Env<'a, F: Function> {
pub spillslots: Vec<SpillSlotData>,
pub slots_by_size: Vec<SpillSlotList>,
pub extra_spillslot: Vec<Option<Allocation>>,
pub extra_spillslots_by_class: [SmallVec<[Allocation; 2]>; 2],
pub preferred_victim_by_class: [PReg; 2],
// Program moves: these are moves in the provided program that we
// handle with our internal machinery, in order to avoid the

View File

@@ -109,6 +109,13 @@ impl<'a, F: Function> Env<'a, F> {
for &preg in &self.env.fixed_stack_slots {
self.pregs[preg.index()].is_stack = true;
}
for class in 0..self.preferred_victim_by_class.len() {
self.preferred_victim_by_class[class] = self.env.non_preferred_regs_by_class[class]
.last()
.or(self.env.preferred_regs_by_class[class].last())
.cloned()
.unwrap_or(PReg::invalid());
}
// Create VRegs from the vreg count.
for idx in 0..self.func.num_vregs() {
// We'll fill in the real details when we see the def.

View File

@@ -31,6 +31,7 @@ use liveranges::*;
pub(crate) mod merge;
pub(crate) mod process;
use process::*;
use smallvec::smallvec;
pub(crate) mod dump;
pub(crate) mod moves;
pub(crate) mod spill;
@@ -66,7 +67,8 @@ impl<'a, F: Function> Env<'a, F> {
slots_by_size: vec![],
allocated_bundle_count: 0,
extra_spillslot: vec![None, None],
extra_spillslots_by_class: [smallvec![], smallvec![]],
preferred_victim_by_class: [PReg::invalid(), PReg::invalid()],
prog_move_srcs: Vec::with_capacity(n / 2),
prog_move_dsts: Vec::with_capacity(n / 2),

View File

@@ -16,12 +16,16 @@ use super::{
Env, InsertMovePrio, InsertedMove, LiveRangeFlag, LiveRangeIndex, RedundantMoveEliminator,
VRegIndex, SLOT_NONE,
};
use crate::ion::data_structures::{BlockparamIn, BlockparamOut, CodeRange, PosWithPrio};
use crate::moves::ParallelMoves;
use crate::ion::data_structures::{
BlockparamIn, BlockparamOut, CodeRange, LiveRangeKey, PosWithPrio,
};
use crate::ion::reg_traversal::RegTraversalIter;
use crate::moves::{MoveAndScratchResolver, ParallelMoves};
use crate::{
Allocation, Block, Edit, Function, Inst, InstPosition, OperandConstraint, OperandKind,
OperandPos, PReg, ProgPoint, RegClass, VReg,
OperandPos, PReg, ProgPoint, RegClass, SpillSlot, VReg,
};
use fxhash::FxHashMap;
use smallvec::{smallvec, SmallVec};
use std::fmt::Debug;
@@ -965,8 +969,7 @@ impl<'a, F: Function> Env<'a, F> {
// have two separate ParallelMove instances. They need to
// be separate because moves between the two classes are
// impossible. (We could enhance ParallelMoves to
// understand register classes and take multiple scratch
// regs, but this seems simpler.)
// understand register classes, but this seems simpler.)
let mut int_moves: SmallVec<[InsertedMove; 8]> = smallvec![];
let mut float_moves: SmallVec<[InsertedMove; 8]> = smallvec![];
@@ -993,8 +996,7 @@ impl<'a, F: Function> Env<'a, F> {
// All moves in `moves` semantically happen in
// parallel. Let's resolve these to a sequence of moves
// that can be done one at a time.
let scratch = self.env.scratch_by_class[regclass as u8 as usize];
let mut parallel_moves = ParallelMoves::new(Allocation::reg(scratch));
let mut parallel_moves = ParallelMoves::new();
trace!(
"parallel moves at pos {:?} prio {:?}",
pos_prio.pos,
@@ -1008,59 +1010,79 @@ impl<'a, F: Function> Env<'a, F> {
}
let resolved = parallel_moves.resolve();
// If (i) the scratch register is used, and (ii) a
// stack-to-stack move exists, then we need to
// allocate an additional scratch spillslot to which
// we can temporarily spill the scratch reg when we
// lower the stack-to-stack move to a
// stack-to-scratch-to-stack sequence.
let scratch_used = resolved.iter().any(|&(src, dst, _)| {
src == Allocation::reg(scratch) || dst == Allocation::reg(scratch)
let mut scratch_iter = RegTraversalIter::new(
self.env,
regclass,
PReg::invalid(),
PReg::invalid(),
0,
None,
);
let key = LiveRangeKey::from_range(&CodeRange {
from: pos_prio.pos,
to: pos_prio.pos.next(),
});
let stack_stack_move = resolved.iter().any(|&(src, dst, _)| {
self.allocation_is_stack(src) && self.allocation_is_stack(dst)
});
let extra_slot = if scratch_used && stack_stack_move {
if self.extra_spillslot[regclass as u8 as usize].is_none() {
let slot = self.allocate_spillslot(regclass);
self.extra_spillslot[regclass as u8 as usize] = Some(slot);
let get_reg = || {
while let Some(preg) = scratch_iter.next() {
if !self.pregs[preg.index()]
.allocations
.btree
.contains_key(&key)
{
let alloc = Allocation::reg(preg);
if moves
.iter()
.any(|m| m.from_alloc == alloc || m.to_alloc == alloc)
{
// Skip pregs used by moves in this
// parallel move set, even if not
// marked used at progpoint: edge move
// liveranges meet but don't overlap
// so otherwise we may incorrectly
// overwrite a source reg.
continue;
}
return Some(alloc);
}
}
self.extra_spillslot[regclass as u8 as usize]
} else {
None
};
let mut stackslot_idx = 0;
let get_stackslot = || {
let idx = stackslot_idx;
stackslot_idx += 1;
// We can't borrow `self` as mutable, so we create
// these placeholders then allocate the actual
// slots if needed with `self.allocate_spillslot`
// below.
Allocation::stack(SpillSlot::new(SpillSlot::MAX - idx, regclass))
};
let preferred_victim = self.preferred_victim_by_class[regclass as usize];
let scratch_resolver =
MoveAndScratchResolver::new(get_reg, get_stackslot, preferred_victim);
let resolved = scratch_resolver.compute(resolved);
let mut rewrites = FxHashMap::default();
for i in 0..stackslot_idx {
if i >= self.extra_spillslots_by_class[regclass as usize].len() {
let slot = self.allocate_spillslot(regclass);
self.extra_spillslots_by_class[regclass as usize].push(slot);
}
rewrites.insert(
Allocation::stack(SpillSlot::new(SpillSlot::MAX - i, regclass)),
self.extra_spillslots_by_class[regclass as usize][i],
);
}
let mut scratch_used_yet = false;
for (src, dst, to_vreg) in resolved {
let src = rewrites.get(&src).cloned().unwrap_or(src);
let dst = rewrites.get(&dst).cloned().unwrap_or(dst);
trace!(" resolved: {} -> {} ({:?})", src, dst, to_vreg);
let action = redundant_moves.process_move(src, dst, to_vreg);
if !action.elide {
if dst == Allocation::reg(scratch) {
scratch_used_yet = true;
}
if self.allocation_is_stack(src) && self.allocation_is_stack(dst) {
if !scratch_used_yet {
self.add_move_edit(pos_prio, src, Allocation::reg(scratch));
self.add_move_edit(pos_prio, Allocation::reg(scratch), dst);
} else {
debug_assert!(extra_slot.is_some());
self.add_move_edit(
pos_prio,
Allocation::reg(scratch),
extra_slot.unwrap(),
);
self.add_move_edit(pos_prio, src, Allocation::reg(scratch));
self.add_move_edit(pos_prio, Allocation::reg(scratch), dst);
self.add_move_edit(
pos_prio,
extra_slot.unwrap(),
Allocation::reg(scratch),
);
}
} else {
self.add_move_edit(pos_prio, src, dst);
}
self.add_move_edit(pos_prio, src, dst);
} else {
trace!(" -> redundant move elided");
}
@@ -1081,7 +1103,7 @@ impl<'a, F: Function> Env<'a, F> {
let &(pos_prio, ref edit) = &self.edits[i];
match edit {
&Edit::Move { from, to } => {
self.annotate(pos_prio.pos, format!("move {} -> {})", from, to));
self.annotate(pos_prio.pos, format!("move {} -> {}", from, to));
}
}
}