From d54a27d0ea83090bf1bd46032a2129677f70df0f Mon Sep 17 00:00:00 2001
From: Trevor Elliott
Date: Thu, 1 Dec 2022 14:29:36 -0800
Subject: [PATCH] Allocate temporary intermediates when loading constants on
 aarch64 (#5366)

As loading constants on aarch64 can take up to 4 instructions, we need
to plumb through some additional registers. Rather than pass a fixed
list of registers in, pass an allocation function.
---
 cranelift/codegen/src/isa/aarch64/abi.rs      |  23 ++--
 .../codegen/src/isa/aarch64/inst/emit.rs      |   4 +-
 cranelift/codegen/src/isa/aarch64/inst/mod.rs | 101 +++++++++++-------
 cranelift/codegen/src/isa/aarch64/lower.rs    |   2 +-
 .../codegen/src/isa/aarch64/lower/isle.rs     |  77 +++++++------
 cranelift/codegen/src/isa/riscv64/abi.rs      |   6 +-
 cranelift/codegen/src/isa/s390x/abi.rs        |   5 +-
 cranelift/codegen/src/isa/x64/abi.rs          |   7 +-
 cranelift/codegen/src/machinst/abi.rs         |  15 +--
 .../filetests/isa/aarch64/icmp-const.clif     |  24 ++--
 .../filetests/isa/aarch64/vhigh_bits.clif     |  20 ++--
 11 files changed, 158 insertions(+), 126 deletions(-)
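Before the diff itself, a compact, self-contained sketch of the pattern this
patch adopts. Everything below (`TmpReg`, `load_wide_const`) is an
illustrative stand-in rather than cranelift's real API: the constant loader
asks an `alloc_tmp` callback for intermediates, so lowering-time callers can
hand out fresh virtual registers while post-regalloc callers satisfy the same
interface with a closure that keeps returning one fixed scratch register.

    // Stand-in for cranelift's `Writable<Reg>`.
    #[derive(Clone, Copy, Debug)]
    struct TmpReg(u32);

    /// Emits up to four move instructions. Intermediate results go to
    /// caller-provided temporaries and only the final instruction defines
    /// `dst`, which keeps the sequence in SSA form when run before register
    /// allocation. Assumes `value` has at least one non-zero halfword.
    fn load_wide_const<F: FnMut() -> TmpReg>(
        dst: TmpReg,
        value: u64,
        alloc_tmp: &mut F,
    ) -> Vec<String> {
        let halfwords: Vec<(u32, u64)> = (0..4)
            .map(|i| (i, (value >> (16 * i)) & 0xffff))
            .filter(|&(_, hw)| hw != 0)
            .collect();
        let last = halfwords.last().map(|&(i, _)| i);
        let mut seq = Vec::new();
        let mut prev: Option<TmpReg> = None;
        for &(i, hw) in &halfwords {
            // Only the last emitted instruction writes the real destination.
            let rd = if Some(i) == last { dst } else { alloc_tmp() };
            match prev {
                None => seq.push(format!("movz {rd:?}, #{hw:#x}, lsl #{}", 16 * i)),
                Some(rn) => {
                    seq.push(format!("movk {rd:?} (from {rn:?}), #{hw:#x}, lsl #{}", 16 * i))
                }
            }
            prev = Some(rd);
        }
        seq
    }

    fn main() {
        // Before regalloc: hand out a fresh virtual register per request.
        let mut next = 0;
        let mut fresh = || {
            next += 1;
            TmpReg(next)
        };
        for inst in load_wide_const(TmpReg(100), 0x0001_0000_abcd_0000, &mut fresh) {
            println!("{inst}");
        }

        // After regalloc (e.g. prologue code): reusing one scratch register
        // is fine, which is what the `&mut |_| scratch` closures below do.
        let scratch = TmpReg(16);
        let _ = load_wide_const(scratch, 0x1234_5678, &mut || scratch);
    }

The same two call shapes appear throughout the diff: `&mut |_| scratch2` in
prologue/probestack code, and `&mut |ty| ctx.alloc_tmp(ty).only_reg().unwrap()`
in lowering.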
diff --git a/cranelift/codegen/src/isa/aarch64/abi.rs b/cranelift/codegen/src/isa/aarch64/abi.rs
index d57a804ef7..dedff9cf25 100644
--- a/cranelift/codegen/src/isa/aarch64/abi.rs
+++ b/cranelift/codegen/src/isa/aarch64/abi.rs
@@ -430,7 +430,10 @@ impl ABIMachineSpec for AArch64MachineDeps {
         } else {
             let scratch2 = writable_tmp2_reg();
             assert_ne!(scratch2.to_reg(), from_reg);
-            insts.extend(Inst::load_constant(scratch2, imm.into()));
+            // `gen_add_imm` is only ever called after register allocation has taken place, and
+            // as a result it's ok to reuse the scratch2 register here. If that changes, we'll
+            // need to plumb through a way to allocate temporary virtual registers.
+            insts.extend(Inst::load_constant(scratch2, imm.into(), &mut |_| scratch2));
             insts.push(Inst::AluRRRExtend {
                 alu_op: ALUOp::Add,
                 size: OperandSize::Size64,
@@ -515,7 +518,9 @@ impl ABIMachineSpec for AArch64MachineDeps {
             ret.push(adj_inst);
         } else {
             let tmp = writable_spilltmp_reg();
-            let const_inst = Inst::load_constant(tmp, amount);
+            // `gen_sp_reg_adjust` is called after regalloc2, so it's acceptable to reuse `tmp`
+            // for intermediates in `load_constant`.
+            let const_inst = Inst::load_constant(tmp, amount, &mut |_| tmp);
             let adj_inst = Inst::AluRRRExtend {
                 alu_op,
                 size: OperandSize::Size64,
@@ -673,8 +678,10 @@ impl ABIMachineSpec for AArch64MachineDeps {
         // itself is not allowed to use the registers.
         let start = writable_spilltmp_reg();
         let end = writable_tmp2_reg();
-        insts.extend(Inst::load_constant(start, 0));
-        insts.extend(Inst::load_constant(end, frame_size.into()));
+        // `gen_inline_probestack` is called after regalloc2, so it's acceptable to reuse
+        // `start` and `end` as temporaries in `load_constant`.
+        insts.extend(Inst::load_constant(start, 0, &mut |_| start));
+        insts.extend(Inst::load_constant(end, frame_size.into(), &mut |_| end));
         insts.push(Inst::StackProbeLoop {
             start,
             end: end.to_reg(),
@@ -1019,19 +1026,19 @@ impl ABIMachineSpec for AArch64MachineDeps {
         insts
     }
 
-    fn gen_memcpy(
+    fn gen_memcpy<F: FnMut(Type) -> Writable<Reg>>(
         call_conv: isa::CallConv,
         dst: Reg,
         src: Reg,
-        tmp: Writable<Reg>,
-        _tmp2: Writable<Reg>,
         size: usize,
+        mut alloc_tmp: F,
     ) -> SmallVec<[Self::I; 8]> {
         let mut insts = SmallVec::new();
         let arg0 = writable_xreg(0);
         let arg1 = writable_xreg(1);
         let arg2 = writable_xreg(2);
-        insts.extend(Inst::load_constant(tmp, size as u64).into_iter());
+        let tmp = alloc_tmp(Self::word_type());
+        insts.extend(Inst::load_constant(tmp, size as u64, &mut alloc_tmp));
         insts.push(Inst::Call {
             info: Box::new(CallInfo {
                 dest: ExternalName::LibCall(LibCall::Memcpy),
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
index 0280a4c00f..15ed484dfe 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -64,7 +64,7 @@ pub fn mem_finalize(
         } else {
             let tmp = writable_spilltmp_reg();
             (
-                Inst::load_constant(tmp, off as u64),
+                Inst::load_constant(tmp, off as u64, &mut |_| tmp),
                 AMode::RegExtended {
                     rn: basereg,
                     rm: tmp.to_reg(),
@@ -3333,7 +3333,7 @@ impl MachInstEmit for Inst {
                     debug_assert!(rd.to_reg() != tmp2_reg());
                     debug_assert!(reg != tmp2_reg());
                     let tmp = writable_tmp2_reg();
-                    for insn in Inst::load_constant(tmp, abs_offset).into_iter() {
+                    for insn in Inst::load_constant(tmp, abs_offset, &mut |_| tmp).into_iter() {
                         insn.emit(&[], sink, emit_info, state);
                     }
                     let add = Inst::AluRRR {
diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
index 3ed883a494..dca2a479d0 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -130,7 +130,11 @@ fn inst_size_test() {
 impl Inst {
     /// Create an instruction that loads a constant, using one of several options (MOVZ, MOVN,
     /// logical immediate, or constant pool).
-    pub fn load_constant(rd: Writable<Reg>, value: u64) -> SmallVec<[Inst; 4]> {
+    pub fn load_constant<F: FnMut(Type) -> Writable<Reg>>(
+        rd: Writable<Reg>,
+        value: u64,
+        alloc_tmp: &mut F,
+    ) -> SmallVec<[Inst; 4]> {
         // NB: this is duplicated in `lower/isle.rs` and `inst.isle` right now,
         // if modifications are made here before this is deleted after moving to
         // ISLE then those locations should be updated as well.
@@ -169,60 +173,73 @@ impl Inst {
         } else {
             (4, OperandSize::Size64, !value)
         };
+
         // If the number of 0xffff half words is greater than the number of 0x0000 half words
         // it is more efficient to use `movn` for the first instruction.
         let first_is_inverted = count_zero_half_words(negated, num_half_words)
             > count_zero_half_words(value, num_half_words);
+
         // Either 0xffff or 0x0000 half words can be skipped, depending on the first
         // instruction used.
         let ignored_halfword = if first_is_inverted { 0xffff } else { 0 };
-        let mut first_mov_emitted = false;
-        for i in 0..num_half_words {
-            let imm16 = (value >> (16 * i)) & 0xffff;
-            if imm16 != ignored_halfword {
-                if !first_mov_emitted {
-                    first_mov_emitted = true;
-                    if first_is_inverted {
-                        let imm =
-                            MoveWideConst::maybe_with_shift(((!imm16) & 0xffff) as u16, i * 16)
-                                .unwrap();
-                        insts.push(Inst::MovWide {
-                            op: MoveWideOp::MovN,
-                            rd,
-                            imm,
-                            size,
-                        });
-                    } else {
-                        let imm =
-                            MoveWideConst::maybe_with_shift(imm16 as u16, i * 16).unwrap();
-                        insts.push(Inst::MovWide {
-                            op: MoveWideOp::MovZ,
-                            rd,
-                            imm,
-                            size,
-                        });
-                    }
+        let halfwords: SmallVec<[_; 4]> = (0..num_half_words)
+            .filter_map(|i| {
+                let imm16 = (value >> (16 * i)) & 0xffff;
+                if imm16 == ignored_halfword {
+                    None
                 } else {
-                    let imm = MoveWideConst::maybe_with_shift(imm16 as u16, i * 16).unwrap();
-                    insts.push(Inst::MovK {
+                    Some((i, imm16))
+                }
+            })
+            .collect();
+
+        let mut prev_result = None;
+        let last_index = halfwords.last().unwrap().0;
+        for (i, imm16) in halfwords {
+            let shift = i * 16;
+            let rd = if i == last_index { rd } else { alloc_tmp(I16) };
+
+            if let Some(rn) = prev_result {
+                let imm = MoveWideConst::maybe_with_shift(imm16 as u16, shift).unwrap();
+                insts.push(Inst::MovK { rd, rn, imm, size });
+            } else {
+                if first_is_inverted {
+                    let imm =
+                        MoveWideConst::maybe_with_shift(((!imm16) & 0xffff) as u16, shift)
+                            .unwrap();
+                    insts.push(Inst::MovWide {
+                        op: MoveWideOp::MovN,
+                        rd,
+                        imm,
+                        size,
+                    });
+                } else {
+                    let imm = MoveWideConst::maybe_with_shift(imm16 as u16, shift).unwrap();
+                    insts.push(Inst::MovWide {
+                        op: MoveWideOp::MovZ,
                         rd,
-                        rn: rd.to_reg(), // Redef the same virtual register.
                         imm,
                         size,
                     });
                 }
             }
+
+            prev_result = Some(rd.to_reg());
         }
-        assert!(first_mov_emitted);
+        assert!(prev_result.is_some());
 
         insts
     }
 
     /// Create instructions that load a 128-bit constant.
-    pub fn load_constant128(to_regs: ValueRegs<Writable<Reg>>, value: u128) -> SmallVec<[Inst; 4]> {
+    pub fn load_constant128<F: FnMut(Type) -> Writable<Reg>>(
+        to_regs: ValueRegs<Writable<Reg>>,
+        value: u128,
+        mut alloc_tmp: F,
+    ) -> SmallVec<[Inst; 4]> {
         assert_eq!(to_regs.len(), 2, "Expected to load i128 into two registers");
 
         let lower = value as u64;
@@ -231,8 +248,8 @@ impl Inst {
         let lower_reg = to_regs.regs()[0];
         let upper_reg = to_regs.regs()[1];
 
-        let mut load_ins = Inst::load_constant(lower_reg, lower);
-        let load_upper = Inst::load_constant(upper_reg, upper);
+        let mut load_ins = Inst::load_constant(lower_reg, lower, &mut alloc_tmp);
+        let load_upper = Inst::load_constant(upper_reg, upper, &mut alloc_tmp);
 
         load_ins.extend(load_upper.into_iter());
         load_ins
@@ -264,7 +281,7 @@ impl Inst {
             }]
         } else {
             let tmp = alloc_tmp(I32);
-            let mut insts = Inst::load_constant(tmp, const_data as u64);
+            let mut insts = Inst::load_constant(tmp, const_data as u64, &mut alloc_tmp);
 
             insts.push(Inst::MovToFpu {
                 rd,
@@ -304,7 +321,7 @@ impl Inst {
             Inst::load_fp_constant32(rd, const_data, alloc_tmp)
         } else if const_data & (u32::MAX as u64) == 0 {
             let tmp = alloc_tmp(I64);
-            let mut insts = Inst::load_constant(tmp, const_data);
+            let mut insts = Inst::load_constant(tmp, const_data, &mut alloc_tmp);
 
             insts.push(Inst::MovToFpu {
                 rd,
@@ -426,7 +443,7 @@ impl Inst {
             smallvec![Inst::VecDupFPImm { rd, imm, size }]
         } else {
             let tmp = alloc_tmp(I64);
-            let mut insts = SmallVec::from(&Inst::load_constant(tmp, pattern)[..]);
+            let mut insts = SmallVec::from(&Inst::load_constant(tmp, pattern, &mut alloc_tmp)[..]);
 
             insts.push(Inst::VecDup {
                 rd,
@@ -1212,14 +1229,16 @@ impl MachInst for Inst {
     fn gen_constant<F: FnMut(Type) -> Writable<Reg>>(
         to_regs: ValueRegs<Writable<Reg>>,
         value: u128,
         ty: Type,
-        alloc_tmp: F,
+        mut alloc_tmp: F,
     ) -> SmallVec<[Inst; 4]> {
         let to_reg = to_regs.only_reg();
         match ty {
             F64 => Inst::load_fp_constant64(to_reg.unwrap(), value as u64, alloc_tmp),
             F32 => Inst::load_fp_constant32(to_reg.unwrap(), value as u32, alloc_tmp),
-            I8 | I16 | I32 | I64 | R32 | R64 => Inst::load_constant(to_reg.unwrap(), value as u64),
-            I128 => Inst::load_constant128(to_regs, value),
+            I8 | I16 | I32 | I64 | R32 | R64 => {
+                Inst::load_constant(to_reg.unwrap(), value as u64, &mut alloc_tmp)
+            }
+            I128 => Inst::load_constant128(to_regs, value, alloc_tmp),
             _ => panic!("Cannot generate constant for type: {}", ty),
         }
     }
@@ -2837,7 +2856,7 @@ impl Inst {
                     );
                 } else {
                     let tmp = writable_spilltmp_reg();
-                    for inst in Inst::load_constant(tmp, abs_offset).into_iter() {
+                    for inst in Inst::load_constant(tmp, abs_offset, &mut |_| tmp).into_iter() {
                         ret.push_str(
                             &inst.print_with_state(&mut EmitState::default(), &mut empty_allocs),
                         );
diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs
index 7ba0249e77..2396bf1889 100644
--- a/cranelift/codegen/src/isa/aarch64/lower.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower.rs
@@ -575,7 +575,7 @@ fn lower_add_immediate(ctx: &mut Lower<Inst>, dst: Writable<Reg>, src: Reg, imm:
 }
 
 pub(crate) fn lower_constant_u64(ctx: &mut Lower<Inst>, rd: Writable<Reg>, value: u64) {
-    for inst in Inst::load_constant(rd, value) {
+    for inst in Inst::load_constant(rd, value, &mut |ty| ctx.alloc_tmp(ty).only_reg().unwrap()) {
         ctx.emit(inst);
     }
 }
diff --git a/cranelift/codegen/src/isa/aarch64/lower/isle.rs b/cranelift/codegen/src/isa/aarch64/lower/isle.rs
index e47e3fc6b1..a9a0f674c3 100644
--- a/cranelift/codegen/src/isa/aarch64/lower/isle.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower/isle.rs
@@ -3,6 +3,7 @@
 
 // Pull in the ISLE generated code.
 pub mod generated_code;
 use generated_code::Context;
+use smallvec::SmallVec;
 
 // Types that the generated ISLE code uses via `use super::*`.
 use super::{
@@ -217,7 +218,6 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> {
         } else {
             value
         };
-        let rd = self.temp_writable_reg(I64);
         let size = OperandSize::Size64;
 
         // If the top 32 bits are zero, use 32-bit `mov` operations.
@@ -226,6 +226,7 @@
             let lower_halfword = value as u16;
             let upper_halfword = (value >> 16) as u16;
 
+            let rd = self.temp_writable_reg(I64);
             if upper_halfword == u16::MAX {
                 self.emit(&MInst::MovWide {
                     op: MoveWideOp::MovN,
@@ -242,17 +243,20 @@
                 });
 
                 if upper_halfword != 0 {
+                    let tmp = self.temp_writable_reg(I64);
                     self.emit(&MInst::MovK {
-                        rd,
+                        rd: tmp,
                         rn: rd.to_reg(),
                         imm: MoveWideConst::maybe_with_shift(upper_halfword, 16).unwrap(),
                         size,
                     });
+                    return tmp.to_reg();
                 }
-            }
+            };
 
             return rd.to_reg();
         } else if value == u64::MAX {
+            let rd = self.temp_writable_reg(I64);
             self.emit(&MInst::MovWide {
                 op: MoveWideOp::MovN,
                 rd,
@@ -265,50 +269,57 @@
         // If the number of 0xffff half words is greater than the number of 0x0000 half words
         // it is more efficient to use `movn` for the first instruction.
         let first_is_inverted = count_zero_half_words(!value) > count_zero_half_words(value);
+
         // Either 0xffff or 0x0000 half words can be skipped, depending on the first
        // instruction used.
         let ignored_halfword = if first_is_inverted { 0xffff } else { 0 };
-        let mut first_mov_emitted = false;
-        for i in 0..4 {
-            let imm16 = (value >> (16 * i)) & 0xffff;
-            if imm16 != ignored_halfword {
-                if !first_mov_emitted {
-                    first_mov_emitted = true;
-                    if first_is_inverted {
-                        let imm =
-                            MoveWideConst::maybe_with_shift(((!imm16) & 0xffff) as u16, i * 16)
-                                .unwrap();
-                        self.emit(&MInst::MovWide {
-                            op: MoveWideOp::MovN,
-                            rd,
-                            imm,
-                            size,
-                        });
-                    } else {
-                        let imm = MoveWideConst::maybe_with_shift(imm16 as u16, i * 16).unwrap();
-                        self.emit(&MInst::MovWide {
-                            op: MoveWideOp::MovZ,
-                            rd,
-                            imm,
-                            size,
-                        });
-                    }
+        let halfwords: SmallVec<[_; 4]> = (0..4)
+            .filter_map(|i| {
+                let imm16 = (value >> (16 * i)) & 0xffff;
+                if imm16 == ignored_halfword {
+                    None
                 } else {
-                    let imm = MoveWideConst::maybe_with_shift(imm16 as u16, i * 16).unwrap();
-                    self.emit(&MInst::MovK {
+                    Some((i, imm16))
+                }
+            })
+            .collect();
+
+        let mut prev_result = None;
+        for (i, imm16) in halfwords {
+            let shift = i * 16;
+            let rd = self.temp_writable_reg(I64);
+
+            if let Some(rn) = prev_result {
+                let imm = MoveWideConst::maybe_with_shift(imm16 as u16, shift).unwrap();
+                self.emit(&MInst::MovK { rd, rn, imm, size });
+            } else {
+                if first_is_inverted {
+                    let imm =
+                        MoveWideConst::maybe_with_shift(((!imm16) & 0xffff) as u16, shift).unwrap();
+                    self.emit(&MInst::MovWide {
+                        op: MoveWideOp::MovN,
+                        rd,
+                        imm,
+                        size,
+                    });
+                } else {
+                    let imm = MoveWideConst::maybe_with_shift(imm16 as u16, shift).unwrap();
+                    self.emit(&MInst::MovWide {
+                        op: MoveWideOp::MovZ,
                         rd,
-                        rn: rd.to_reg(),
                         imm,
                         size,
                     });
                 }
             }
+
+            prev_result = Some(rd.to_reg());
         }
-        assert!(first_mov_emitted);
+        assert!(prev_result.is_some());
 
-        return self.writable_reg_to_reg(rd);
+        return prev_result.unwrap();
 
         fn count_zero_half_words(mut value: u64) -> usize {
             let mut count = 0;
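Both copies of the rewritten loop (in `inst/mod.rs` above and in
`lower/isle.rs` here) lean on the same MOVN-vs-MOVZ heuristic, which is
easiest to see with a worked constant. In this self-contained sketch only
`count_zero_half_words` mirrors the helper in the patch; the driver in `main`
is purely illustrative:

    // Mirrors the `count_zero_half_words` helper in `lower/isle.rs` above.
    fn count_zero_half_words(mut value: u64) -> usize {
        let mut count = 0;
        for _ in 0..4 {
            if value & 0xffff == 0 {
                count += 1;
            }
            value >>= 16;
        }
        count
    }

    fn main() {
        // 0xffff_ffff_ffff_1234 contains no 0x0000 halfwords, but its
        // complement contains three, so the sequence starts with MOVN and
        // every 0xffff halfword becomes skippable.
        let value: u64 = 0xffff_ffff_ffff_1234;
        let first_is_inverted = count_zero_half_words(!value) > count_zero_half_words(value);
        assert!(first_is_inverted);

        // A single `movn x0, #0xedcb` then materializes the whole constant,
        // since MOVN writes the bitwise NOT of its (shifted) immediate;
        // starting from MOVZ would instead cost one MOVZ plus three MOVKs.
        let movn_imm = !value & 0xffff;
        assert_eq!(movn_imm, 0xedcb);
        assert_eq!(!movn_imm, value);
    }

Skipping ignored halfwords is also why `last_index` in the `inst/mod.rs` copy
need not be 3: the last emitted instruction, whichever halfword it covers, is
the one that must define the real destination.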
diff --git a/cranelift/codegen/src/isa/riscv64/abi.rs b/cranelift/codegen/src/isa/riscv64/abi.rs
index 995a52fbc0..d3d0cf1032 100644
--- a/cranelift/codegen/src/isa/riscv64/abi.rs
+++ b/cranelift/codegen/src/isa/riscv64/abi.rs
@@ -525,18 +525,18 @@ impl ABIMachineSpec for Riscv64MachineDeps {
         insts
     }
 
-    fn gen_memcpy(
+    fn gen_memcpy<F: FnMut(Type) -> Writable<Reg>>(
         call_conv: isa::CallConv,
         dst: Reg,
         src: Reg,
-        tmp: Writable<Reg>,
-        _tmp2: Writable<Reg>,
         size: usize,
+        mut alloc_tmp: F,
     ) -> SmallVec<[Self::I; 8]> {
         let mut insts = SmallVec::new();
         let arg0 = Writable::from_reg(x_reg(10));
         let arg1 = Writable::from_reg(x_reg(11));
         let arg2 = Writable::from_reg(x_reg(12));
+        let tmp = alloc_tmp(Self::word_type());
         insts.extend(Inst::load_constant_u64(tmp, size as u64).into_iter());
         insts.push(Inst::Call {
             info: Box::new(CallInfo {
diff --git a/cranelift/codegen/src/isa/s390x/abi.rs b/cranelift/codegen/src/isa/s390x/abi.rs
index f123a48966..27c3a94ed6 100644
--- a/cranelift/codegen/src/isa/s390x/abi.rs
+++ b/cranelift/codegen/src/isa/s390x/abi.rs
@@ -752,13 +752,12 @@ impl ABIMachineSpec for S390xMachineDeps {
         unreachable!();
     }
 
-    fn gen_memcpy(
+    fn gen_memcpy<F: FnMut(Type) -> Writable<Reg>>(
         _call_conv: isa::CallConv,
         _dst: Reg,
         _src: Reg,
-        _tmp1: Writable<Reg>,
-        _tmp2: Writable<Reg>,
         _size: usize,
+        _alloc: F,
     ) -> SmallVec<[Self::I; 8]> {
         unimplemented!("StructArgs not implemented for S390X yet");
     }
diff --git a/cranelift/codegen/src/isa/x64/abi.rs b/cranelift/codegen/src/isa/x64/abi.rs
index d2b9d6e18e..bab5bc8ac1 100644
--- a/cranelift/codegen/src/isa/x64/abi.rs
+++ b/cranelift/codegen/src/isa/x64/abi.rs
@@ -624,18 +624,19 @@ impl ABIMachineSpec for X64ABIMachineSpec {
         insts
     }
 
-    fn gen_memcpy(
+    fn gen_memcpy<F: FnMut(Type) -> Writable<Reg>>(
         call_conv: isa::CallConv,
         dst: Reg,
         src: Reg,
-        temp: Writable<Reg>,
-        temp2: Writable<Reg>,
         size: usize,
+        mut alloc_tmp: F,
     ) -> SmallVec<[Self::I; 8]> {
         let mut insts = SmallVec::new();
         let arg0 = get_intreg_for_arg(&call_conv, 0, 0).unwrap();
         let arg1 = get_intreg_for_arg(&call_conv, 1, 1).unwrap();
         let arg2 = get_intreg_for_arg(&call_conv, 2, 2).unwrap();
+        let temp = alloc_tmp(Self::word_type());
+        let temp2 = alloc_tmp(Self::word_type());
         insts.extend(
             Inst::gen_constant(ValueRegs::one(temp), size as u128, I64, |_| {
                 panic!("tmp should not be needed")
diff --git a/cranelift/codegen/src/machinst/abi.rs b/cranelift/codegen/src/machinst/abi.rs
index b82ce632e1..1d94d4ab9a 100644
--- a/cranelift/codegen/src/machinst/abi.rs
+++ b/cranelift/codegen/src/machinst/abi.rs
@@ -567,16 +567,14 @@ pub trait ABIMachineSpec {
     ) -> SmallVec<[Self::I; 2]>;
 
     /// Generate a memcpy invocation. Used to set up struct
-    /// args. Takes `src`, `dst` as read-only inputs and requires two
-    /// temporaries to generate the call (for the size immediate and
-    /// possibly for the address of `memcpy` itself).
-    fn gen_memcpy(
+    /// args. Takes `src`, `dst` as read-only inputs and passes a temporary
+    /// allocator.
+    fn gen_memcpy<F: FnMut(Type) -> Writable<Reg>>(
         call_conv: isa::CallConv,
         dst: Reg,
         src: Reg,
-        tmp1: Writable<Reg>,
-        tmp2: Writable<Reg>,
         size: usize,
+        alloc_tmp: F,
     ) -> SmallVec<[Self::I; 8]>;
 
     /// Get the number of spillslots required for the given register-class.
@@ -2152,15 +2150,12 @@ impl<M: ABIMachineSpec> Caller<M> {
         // arg regs.
let memcpy_call_conv = isa::CallConv::for_libcall(&self.flags, ctx.sigs()[self.sig].call_conv); - let tmp1 = ctx.alloc_tmp(M::word_type()).only_reg().unwrap(); - let tmp2 = ctx.alloc_tmp(M::word_type()).only_reg().unwrap(); for insn in M::gen_memcpy( memcpy_call_conv, dst_ptr.to_reg(), src_ptr, - tmp1, - tmp2, size as usize, + |ty| ctx.alloc_tmp(ty).only_reg().unwrap(), ) .into_iter() { diff --git a/cranelift/filetests/filetests/isa/aarch64/icmp-const.clif b/cranelift/filetests/filetests/isa/aarch64/icmp-const.clif index d48e8c5019..9911422450 100644 --- a/cranelift/filetests/filetests/isa/aarch64/icmp-const.clif +++ b/cranelift/filetests/filetests/isa/aarch64/icmp-const.clif @@ -37,9 +37,9 @@ block0(v0: i32): } ; block0: -; movz w2, #4369 -; movk w2, w2, #17, LSL #16 -; subs wzr, w0, w2 +; movz w3, #4369 +; movk w3, w3, #17, LSL #16 +; subs wzr, w0, w3 ; cset x0, hs ; ret @@ -51,9 +51,9 @@ block0(v0: i32): } ; block0: -; movz w2, #4368 -; movk w2, w2, #17, LSL #16 -; subs wzr, w0, w2 +; movz w3, #4368 +; movk w3, w3, #17, LSL #16 +; subs wzr, w0, w3 ; cset x0, hs ; ret @@ -89,9 +89,9 @@ block0(v0: i32): } ; block0: -; movz w2, #4369 -; movk w2, w2, #17, LSL #16 -; subs wzr, w0, w2 +; movz w3, #4369 +; movk w3, w3, #17, LSL #16 +; subs wzr, w0, w3 ; cset x0, ge ; ret @@ -103,9 +103,9 @@ block0(v0: i32): } ; block0: -; movz w2, #4368 -; movk w2, w2, #17, LSL #16 -; subs wzr, w0, w2 +; movz w3, #4368 +; movk w3, w3, #17, LSL #16 +; subs wzr, w0, w3 ; cset x0, ge ; ret diff --git a/cranelift/filetests/filetests/isa/aarch64/vhigh_bits.clif b/cranelift/filetests/filetests/isa/aarch64/vhigh_bits.clif index 8f603a919e..970a31004c 100644 --- a/cranelift/filetests/filetests/isa/aarch64/vhigh_bits.clif +++ b/cranelift/filetests/filetests/isa/aarch64/vhigh_bits.clif @@ -14,11 +14,11 @@ block0(v0: i8x16): ; movk x5, x5, #8208, LSL #32 ; movk x5, x5, #32832, LSL #48 ; dup v16.2d, x5 -; and v19.16b, v2.16b, v16.16b -; ext v21.16b, v19.16b, v19.16b, #8 -; zip1 v23.16b, v19.16b, v21.16b -; addv h25, v23.8h -; umov w0, v25.h[0] +; and v22.16b, v2.16b, v16.16b +; ext v24.16b, v22.16b, v22.16b, #8 +; zip1 v26.16b, v22.16b, v24.16b +; addv h28, v26.8h +; umov w0, v28.h[0] ; ret function %f2(i8x16) -> i16 { @@ -34,11 +34,11 @@ block0(v0: i8x16): ; movk x5, x5, #8208, LSL #32 ; movk x5, x5, #32832, LSL #48 ; dup v16.2d, x5 -; and v19.16b, v2.16b, v16.16b -; ext v21.16b, v19.16b, v19.16b, #8 -; zip1 v23.16b, v19.16b, v21.16b -; addv h25, v23.8h -; umov w0, v25.h[0] +; and v22.16b, v2.16b, v16.16b +; ext v24.16b, v22.16b, v22.16b, #8 +; zip1 v26.16b, v22.16b, v24.16b +; addv h28, v26.8h +; umov w0, v28.h[0] ; ret function %f3(i16x8) -> i8 {
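Taken together, the `gen_memcpy` changes across the backends replace two fixed
temporaries with the same callback style. Below is a minimal, stand-alone
model of that interface change; `AbiSpec`, `DemoBackend`, `Reg`, and `Ty` are
hypothetical stand-ins, not the real `ABIMachineSpec` machinery:

    #[derive(Clone, Copy, Debug)]
    struct Reg(u32);

    #[derive(Clone, Copy, Debug)]
    enum Ty {
        I64,
    }

    trait AbiSpec {
        // The old interface baked exactly two temporaries into every
        // backend's signature; the new one lets each backend request exactly
        // the temporaries it needs, of the types it needs.
        fn gen_memcpy<F: FnMut(Ty) -> Reg>(
            dst: Reg,
            src: Reg,
            size: usize,
            alloc_tmp: F,
        ) -> Vec<String>;
    }

    struct DemoBackend;

    impl AbiSpec for DemoBackend {
        fn gen_memcpy<F: FnMut(Ty) -> Reg>(
            dst: Reg,
            src: Reg,
            size: usize,
            mut alloc_tmp: F,
        ) -> Vec<String> {
            // One temporary for the size immediate; an aarch64-style backend
            // may ask again for each intermediate of a multi-instruction
            // constant load.
            let tmp = alloc_tmp(Ty::I64);
            vec![
                format!("load_const {tmp:?}, #{size}"),
                format!("call memcpy({dst:?}, {src:?}, {tmp:?})"),
            ]
        }
    }

    fn main() {
        // The shared ABI code passes a closure over its vreg allocator, the
        // analogue of `|ty| ctx.alloc_tmp(ty).only_reg().unwrap()` above.
        let mut next = 0;
        let insts = DemoBackend::gen_memcpy(Reg(0), Reg(1), 64, |_ty| {
            next += 1;
            Reg(next)
        });
        for inst in insts {
            println!("{inst}");
        }
    }

The payoff is that a backend whose constant loads may need several
intermediates, as aarch64 does after this patch, is no longer constrained by
however many scratch registers the shared ABI code happened to pass.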