regalloc2/src/moves.rs
Commit d95a9d9399 by Amanieu d'Antras (2022-01-11 13:03:21 +00:00): Combine sort keys into u64/u128

This allows the compiler to perform branch-less comparisons, which are
more efficient. This results in ~5% fewer instructions executed.
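The packed-key trick referenced by the commit, as a minimal standalone sketch (the real helper is `crate::ion::data_structures::u64_key`, fed the raw bits of each `Allocation`; the exact body and signature below are an assumption for illustration): packing the major and minor sort keys into one `u64` lets a single integer comparison reproduce the lexicographic order, where a derived tuple comparison would typically need two compares and a branch.

    // Assumed sketch of the packed-key helper: place the major key in the
    // high 32 bits so that one branch-free u64 comparison orders pairs
    // lexicographically by (major, minor).
    #[inline(always)]
    fn u64_key(major: u32, minor: u32) -> u64 {
        ((major as u64) << 32) | (minor as u64)
    }

    fn main() {
        // Sorting by the packed key matches sorting by the (major, minor) tuple.
        let mut pairs = vec![(3u32, 1u32), (1, 2), (1, 1), (2, 5)];
        pairs.sort_by_key(|&(a, b)| u64_key(a, b));
        assert_eq!(pairs, vec![(1, 1), (1, 2), (2, 5), (3, 1)]);
    }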


/*
 * Released under the terms of the Apache 2.0 license with LLVM
 * exception. See `LICENSE` for details.
 */

use crate::{ion::data_structures::u64_key, Allocation};
use smallvec::{smallvec, SmallVec};

pub type MoveVec<T> = SmallVec<[(Allocation, Allocation, T); 16]>;
/// A `ParallelMoves` represents a list of alloc-to-alloc moves that
/// must happen in parallel -- i.e., all reads of sources semantically
/// happen before all writes of destinations, and destinations are
/// allowed to overwrite sources. It can compute a list of sequential
/// moves that will produce the equivalent data movement, possibly
/// using a scratch register if one is necessary.
pub struct ParallelMoves<T: Clone + Copy + Default> {
    parallel_moves: MoveVec<T>,
    scratch: Allocation,
}
impl<T: Clone + Copy + Default> ParallelMoves<T> {
    pub fn new(scratch: Allocation) -> Self {
        Self {
            parallel_moves: smallvec![],
            scratch,
        }
    }

    pub fn add(&mut self, from: Allocation, to: Allocation, t: T) {
        self.parallel_moves.push((from, to, t));
    }

    fn sources_overlap_dests(&self) -> bool {
        // Assumes `parallel_moves` has already been sorted in `resolve()` below.
        for &(_, dst, _) in &self.parallel_moves {
            if self
                .parallel_moves
                .binary_search_by_key(&dst, |&(src, _, _)| src)
                .is_ok()
            {
                return true;
            }
        }
        false
    }
    pub fn resolve(mut self) -> MoveVec<T> {
        // Easy case: zero or one move. Just return our vec.
        if self.parallel_moves.len() <= 1 {
            return self.parallel_moves;
        }

        // Sort moves by source so that we can efficiently test for
        // presence.
        self.parallel_moves
            .sort_by_key(|&(src, dst, _)| u64_key(src.bits(), dst.bits()));

        // Do any dests overlap sources? If not, we can also just
        // return the list.
        if !self.sources_overlap_dests() {
            return self.parallel_moves;
        }

        // General case: some moves overwrite dests that other moves
        // read as sources. We'll use a general algorithm.
        //
        // *Important property*: because we expect that each register
        // has only one writer (otherwise the effect of the parallel
        // move is undefined), each move can only block one other move
        // (with its one source corresponding to the one writer of
        // that source). Thus, we *can only have simple cycles* (those
        // that are a ring of nodes, i.e., with only one path from a
        // node back to itself); there are no SCCs that are more
        // complex than that. We leverage this fact below to avoid
        // having to do a full Tarjan SCC DFS (with lowest-index
        // computation, etc.): instead, as soon as we find a cycle, we
        // know we have the full cycle and we can do a cyclic move
        // sequence and continue.
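        //
        // For example, with a single writer per register the parallel set
        //
        //     { r1 := r0, r2 := r1, r0 := r2, r4 := r3 }
        //
        // contains exactly one ring (r0 -> r1 -> r2 -> r0) plus an
        // independent move, while a shape like { r1 := r0, r1 := r2 } is
        // impossible because r1 would have two writers.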

        // Sort moves by destination and check that each destination
        // has only one writer.
        self.parallel_moves.sort_by_key(|&(_, dst, _)| dst);
        if cfg!(debug_assertions) {
            let mut last_dst = None;
            for &(_, dst, _) in &self.parallel_moves {
                if last_dst.is_some() {
                    debug_assert!(last_dst.unwrap() != dst);
                }
                last_dst = Some(dst);
            }
        }

        // Construct a mapping from move indices to moves they must
        // come before. Any given move must come before the move that
        // overwrites its source; we have moves sorted by dest above,
        // so we can efficiently find such a move, if any.
        let mut must_come_before: SmallVec<[Option<usize>; 16]> =
            smallvec![None; self.parallel_moves.len()];
        for (i, &(src, _, _)) in self.parallel_moves.iter().enumerate() {
            if let Ok(move_to_dst_idx) = self
                .parallel_moves
                .binary_search_by_key(&src, |&(_, dst, _)| dst)
            {
                must_come_before[i] = Some(move_to_dst_idx);
            }
        }

        // Do a simple stack-based DFS and emit moves in postorder,
        // then reverse at the end for RPO. Unlike Tarjan's SCC
        // algorithm, we can emit a cycle as soon as we find one, as
        // noted above.
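        // (Each edge in this mapping points from a move to the move that
        // clobbers its source, so emitting in reverse postorder places
        // every move before the one that overwrites the value it reads.)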
        let mut ret: MoveVec<T> = smallvec![];
        let mut stack: SmallVec<[usize; 16]> = smallvec![];
        let mut visited: SmallVec<[bool; 16]> = smallvec![false; self.parallel_moves.len()];
        let mut onstack: SmallVec<[bool; 16]> = smallvec![false; self.parallel_moves.len()];

        stack.push(0);
        onstack[0] = true;
        loop {
            if stack.is_empty() {
                if let Some(next) = visited.iter().position(|&flag| !flag) {
                    stack.push(next);
                    onstack[next] = true;
                } else {
                    break;
                }
            }

            let top = *stack.last().unwrap();
            visited[top] = true;
            match must_come_before[top] {
                None => {
                    ret.push(self.parallel_moves[top]);
                    onstack[top] = false;
                    stack.pop();
                    while let Some(top) = stack.pop() {
                        ret.push(self.parallel_moves[top]);
                        onstack[top] = false;
                    }
                }
                Some(next) if visited[next] && !onstack[next] => {
                    ret.push(self.parallel_moves[top]);
                    onstack[top] = false;
                    stack.pop();
                    while let Some(top) = stack.pop() {
                        ret.push(self.parallel_moves[top]);
                        onstack[top] = false;
                    }
                }
                Some(next) if !visited[next] && !onstack[next] => {
                    stack.push(next);
                    onstack[next] = true;
                    continue;
                }
                Some(next) => {
                    // Found a cycle -- emit a cyclic-move sequence
                    // for the cycle on the top of stack, then normal
                    // moves below it. Recall that these moves will be
                    // reversed in sequence, so from the original
                    // parallel move set
                    //
                    //     { B := A, C := B, A := C }
                    //
                    // we will generate something like:
                    //
                    //     A := scratch
                    //     B := A
                    //     C := B
                    //     scratch := C
                    //
                    // which will become:
                    //
                    //     scratch := C
                    //     C := B
                    //     B := A
                    //     A := scratch
                    let mut last_dst = None;
                    let mut scratch_src = None;
                    while let Some(move_idx) = stack.pop() {
                        onstack[move_idx] = false;
                        let (mut src, dst, dst_t) = self.parallel_moves[move_idx];
                        if last_dst.is_none() {
                            scratch_src = Some(src);
                            src = self.scratch;
                        } else {
                            debug_assert_eq!(last_dst.unwrap(), src);
                        }
                        ret.push((src, dst, dst_t));
                        last_dst = Some(dst);
                        if move_idx == next {
                            break;
                        }
                    }
                    if let Some(src) = scratch_src {
                        ret.push((src, self.scratch, T::default()));
                    }
                }
            }
        }

        ret.reverse();
        ret
    }
}
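
For orientation, a rough usage sketch of this helper (assuming it is reached from inside the crate as `crate::moves::ParallelMoves`, and using regalloc2's `Allocation::reg`, `PReg::new`, and `RegClass::Int` constructors): a register swap forms a cycle, so `resolve()` has to route one value through the scratch allocation.

    use crate::{moves::ParallelMoves, Allocation, PReg, RegClass};

    fn sequentialize_swap_example() {
        // Physical integer register `i` as an Allocation.
        let r = |i: usize| Allocation::reg(PReg::new(i, RegClass::Int));

        // One register is reserved purely as scratch for cycle-breaking.
        let mut par = ParallelMoves::<()>::new(r(15));

        // Parallel semantics: all sources are read before any destination is
        // written, so the first two moves are a swap of r0 and r1 (a 2-cycle).
        par.add(r(0), r(1), ());
        par.add(r(1), r(0), ());
        par.add(r(2), r(3), ());

        // `resolve()` returns an equivalent sequential list of (from, to, T)
        // moves; the swap is broken by going through the scratch register.
        for (from, to, _) in par.resolve() {
            println!("{:?} := {:?}", to, from);
        }
    }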