Remove an explicitly-set-aside scratch register per class. (#51)

Currently, regalloc2 sets aside one register per class, unconditionally, to make move resolution possible. To solve the "parallel moves problem", we sometimes need to conjure a cyclic permutation of data among registers or stack slots (this can result, for example, from blockparam flow that swaps two values on a loop backedge). This set-aside scratch register is used when a cycle exists. regalloc2 also uses the scratch register when needed to break down a stack-to-stack move (which could happen due to blockparam moves on edges when source and destination are both spilled) into a stack-to-reg move followed by reg-to-stack, because most machines have loads and stores but not memory-to-memory moves. A set-aside register is certainly the simplest solution, but it is not optimal: it means that we have one fewer register available for use by the program, and this can be costly especially on machines with fewer registers (e.g., 16 GPRs/XMMs on x86-64) and especially when some registers may be set aside by our embedder for other purposes too. Every register we can reclaim is some nontrivial performance in large function bodies! This PR removes this restriction and allows regalloc2 to use all available physical registers. It then solves the two problems above, cyclic moves and stack-to-stack moves, with a two-stage approach: - First, it finds a location to use to resolve cycles, if any exist. If a register is unallocated at the location of the move, we can use it. Often we get lucky and this is the case. Otherwise, we allocate a stackslot to use as the temp. This is perfectly fine at this stage, even if it means that we have more stack-to-stack moves. - Then, it resolves stack-to-stack moves into stack-to-reg / reg-to-stack. There are two subcases here. If there is *another* available free physical register, we opportunistically use it for this decomposition. If not, we fall back to our last-ditch option: we pick a victim register of the appropriate class, we allocate another temporary stackslot, we spill the victim to that slot just for this move, we do the move in the above way (stack-to-reg / reg-to-stack) with the victim, then we reload the victim. So one move (original stack-to-stack) becomes four moves, but no state is clobbered. This PR extends the `moves` fuzz-target to exercise this functionality as well, randomly choosing for some spare registers to exist or not, and randomly generating {stack,reg}-to-{stack,reg} moves in the initial parallel-move input set. The target does a simple symbolic simulation of the sequential move sequence and ensures that the final state is equivalent to the parallel-move semantics. I fuzzed both the `moves` target, focusing on the new logic; as well as the `ion_checker` target, checking the whole register allocator, and both seem clean (~150M cases on the former, ~1M cases on the latter).
2022-05-23 10:48:37 -07:00
parent 33611a68b9
commit 869c21e79c
8 changed files with 402 additions and 117 deletions
--- a/fuzz/fuzz_targets/moves.rs
+++ b/fuzz/fuzz_targets/moves.rs
@@ -7,30 +7,56 @@
 use libfuzzer_sys::arbitrary::{Arbitrary, Result, Unstructured};
 use libfuzzer_sys::fuzz_target;

-use regalloc2::fuzzing::moves::ParallelMoves;
-use regalloc2::{Allocation, PReg, RegClass};
-use std::collections::HashSet;
+use regalloc2::fuzzing::moves::{MoveAndScratchResolver, ParallelMoves};
+use regalloc2::{Allocation, PReg, RegClass, SpillSlot};
+use std::collections::{HashMap, HashSet};

 #[derive(Clone, Debug)]
 struct TestCase {
    moves: Vec<(Allocation, Allocation)>,
+    available_pregs: Vec<Allocation>,
 }

 impl Arbitrary for TestCase {
    fn arbitrary(u: &mut Unstructured) -> Result<Self> {
-        let mut ret = TestCase { moves: vec![] };
+        let mut ret = TestCase {
+            moves: vec![],
+            available_pregs: vec![],
+        };
        let mut written = HashSet::new();
+        // An arbitrary sequence of moves between registers 0 to 29
+        // inclusive.
        while bool::arbitrary(u)? {
-            let reg1 = u.int_in_range(0..=30)?;
-            let reg2 = u.int_in_range(0..=30)?;
-            if written.contains(&reg2) {
+            let src = if bool::arbitrary(u)? {
+                let reg = u.int_in_range(0..=29)?;
+                Allocation::reg(PReg::new(reg, RegClass::Int))
+            } else {
+                let slot = u.int_in_range(0..=31)?;
+                Allocation::stack(SpillSlot::new(slot, RegClass::Int))
+            };
+            let dst = if bool::arbitrary(u)? {
+                let reg = u.int_in_range(0..=29)?;
+                Allocation::reg(PReg::new(reg, RegClass::Int))
+            } else {
+                let slot = u.int_in_range(0..=31)?;
+                Allocation::stack(SpillSlot::new(slot, RegClass::Int))
+            };
+
+            // Stop if we are going to write a reg more than once:
+            // that creates an invalid parallel move set.
+            if written.contains(&dst) {
                break;
            }
-            written.insert(reg2);
-            ret.moves.push((
-                Allocation::reg(PReg::new(reg1, RegClass::Int)),
-                Allocation::reg(PReg::new(reg2, RegClass::Int)),
-            ));
+            written.insert(dst);
+
+            ret.moves.push((src, dst));
+        }
+
+        // We might have some unallocated registers free for scratch
+        // space...
+        for i in u.int_in_range(0..=2) {
+            let reg = PReg::new(30 + i, RegClass::Int);
+            ret.available_pregs.push(Allocation::reg(reg));
        }
        Ok(ret)
    }
@@ -38,44 +64,64 @@ impl Arbitrary for TestCase {

 fuzz_target!(|testcase: TestCase| {
    let _ = env_logger::try_init();
-    let scratch = Allocation::reg(PReg::new(31, RegClass::Int));
-    let mut par = ParallelMoves::new(scratch);
+    let mut par = ParallelMoves::new();
    for &(src, dst) in &testcase.moves {
        par.add(src, dst, ());
    }
+
    let moves = par.resolve();
+    log::trace!("raw resolved moves: {:?}", moves);
+
+    // Resolve uses of scratch reg and stack-to-stack moves with the
+    // scratch resolver.
+    let mut avail = testcase.available_pregs.clone();
+    let get_reg = || avail.pop();
+    let mut next_slot = 32;
+    let get_stackslot = || {
+        let slot = next_slot;
+        next_slot += 1;
+        Allocation::stack(SpillSlot::new(slot, RegClass::Int))
+    };
+    let preferred_victim = PReg::new(0, RegClass::Int);
+    let scratch_resolver = MoveAndScratchResolver::new(get_reg, get_stackslot, preferred_victim);
+    let moves = scratch_resolver.compute(moves);
+    log::trace!("resolved moves: {:?}", moves);

    // Compute the final source reg for each dest reg in the original
    // parallel-move set.
-    let mut final_src_per_dest: Vec<Option<usize>> = vec![None; 32];
+    let mut final_src_per_dest: HashMap<Allocation, Allocation> = HashMap::new();
    for &(src, dst) in &testcase.moves {
-        if let (Some(preg_src), Some(preg_dst)) = (src.as_reg(), dst.as_reg()) {
-            final_src_per_dest[preg_dst.hw_enc()] = Some(preg_src.hw_enc());
-        }
+        final_src_per_dest.insert(dst, src);
    }
+    log::trace!("expected final state: {:?}", final_src_per_dest);

    // Simulate the sequence of moves.
-    let mut regfile: Vec<Option<usize>> = vec![None; 32];
-    for i in 0..32 {
-        regfile[i] = Some(i);
-    }
+    let mut locations: HashMap<Allocation, Allocation> = HashMap::new();
    for (src, dst, _) in moves {
-        if let (Some(preg_src), Some(preg_dst)) = (src.as_reg(), dst.as_reg()) {
-            let data = regfile[preg_src.hw_enc()];
-            regfile[preg_dst.hw_enc()] = data;
-        } else {
-            panic!("Bad allocation in move list");
+        if src.is_stack() && dst.is_stack() {
+            panic!("Stack-to-stack move!");
        }
+
+        let data = locations.get(&src).cloned().unwrap_or(src);
+        locations.insert(dst, data);
    }
+    log::trace!("simulated final state: {:?}", locations);

    // Assert that the expected register-moves occurred.
-    // N.B.: range up to 31 (not 32) to skip scratch register.
-    for i in 0..31 {
-        if let Some(orig_src) = final_src_per_dest[i] {
-            assert_eq!(regfile[i], Some(orig_src));
+    for (reg, data) in locations {
+        if let Some(&expected_data) = final_src_per_dest.get(&reg) {
+            assert_eq!(expected_data, data);
        } else {
-            // Should be untouched.
-            assert_eq!(regfile[i], Some(i));
+            if data != reg {
+                // If not just the original value, then this location
+                // has been modified, but it was not part of the
+                // original parallel move. It must have been an
+                // available preg or a scratch stackslot.
+                assert!(
+                    testcase.available_pregs.contains(&reg)
+                        || (reg.is_stack() && reg.as_stack().unwrap().index() >= 32)
+                );
+            }
        }
    }
 });