Cranelift AArch64: Migrate Splat to ISLE (#4521)

Copyright (c) 2022, Arm Limited.
Author: Anton Kirilov
Date: 2022-07-26 18:57:15 +01:00 (committed by GitHub)
Parent: 1321c234e5
Commit: ead6edb0c5
21 changed files with 593 additions and 338 deletions


@@ -171,8 +171,8 @@ impl Type {
         self.replace_lanes(match self.lane_type() {
             I8 | B1 | B8 => I8,
             I16 | B16 => I16,
-            I32 | B32 => I32,
-            I64 | B64 => I64,
+            I32 | B32 | F32 => I32,
+            I64 | B64 | F64 => I64,
             I128 | B128 => I128,
             _ => unimplemented!(),
         })
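
This hunk widens a lane-conversion helper on `Type`: float lanes now fold into the integer lane of the same width, which is what lets the new splat rules below treat `f32`/`f64` splat constants as raw same-width integer bit patterns. A self-contained toy model of the mapping (simplified stand-in enum, not cranelift's actual `Type`):

    // Toy model of the lane mapping above; `Lane` stands in for the lane
    // component of cranelift's `Type`.
    #[derive(Debug, PartialEq, Clone, Copy)]
    enum Lane {
        I8, I16, I32, I64, I128,
        B1, B8, B16, B32, B64, B128,
        F32, F64,
    }

    fn int_lane(lane: Lane) -> Lane {
        use Lane::*;
        match lane {
            I8 | B1 | B8 => I8,
            I16 | B16 => I16,
            I32 | B32 | F32 => I32, // new: F32 folds to I32
            I64 | B64 | F64 => I64, // new: F64 folds to I64
            I128 | B128 => I128,
        }
    }

    fn main() {
        assert_eq!(int_lane(Lane::F32), Lane::I32);
        assert_eq!(int_lane(Lane::F64), Lane::I64);
    }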


@@ -627,7 +627,8 @@
 (VecLoadReplicate
   (rd WritableReg)
   (rn Reg)
-  (size VectorSize))
+  (size VectorSize)
+  (flags MemFlags))

 ;; Vector conditional select, 128 bit. A synthetic instruction, which generates a 4-insn
 ;; control-flow diamond.
@@ -1376,6 +1377,16 @@
 (decl cond_br_cond (Cond) CondBrKind)
 (extern constructor cond_br_cond cond_br_cond)

+;; Lower the address of a load or a store.
+(decl amode (Type Inst u32) AMode)
+;; TODO: Port lower_address() to ISLE.
+(extern constructor amode amode)
+
+;; Matches an `AMode` that is just a register.
+(decl pure amode_is_reg (AMode) Reg)
+;; TODO: Implement in ISLE.
+(extern constructor amode_is_reg amode_is_reg)
+
 ;; Instruction creation helpers ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

 ;; Helper for creating the zero register.
@@ -1481,6 +1492,13 @@
         (_ Unit (emit (MInst.VecDup dst src size))))
       dst))

+;; Helper for emitting `MInst.VecDupFromFpu` instructions.
+(decl vec_dup_from_fpu (Reg VectorSize) Reg)
+(rule (vec_dup_from_fpu src size)
+      (let ((dst WritableReg (temp_writable_reg $I8X16))
+            (_ Unit (emit (MInst.VecDupFromFpu dst src size))))
+        dst))
+
 ;; Helper for emitting `MInst.AluRRImm12` instructions.
 (decl alu_rr_imm12 (ALUOp Type Reg Imm12) Reg)
 (rule (alu_rr_imm12 op ty src imm)
@@ -2167,7 +2185,7 @@
 (decl sinkable_atomic_load (SinkableAtomicLoad) Value)
 (extern extractor sinkable_atomic_load sinkable_atomic_load)

-;; Sink a `SinkableLoad` into a `Reg`.
+;; Sink a `SinkableAtomicLoad` into a `Reg`.
 ;;
 ;; This is a side-effectful operation that notifies the context that the
 ;; instruction that produced the `SinkableAtomicLoad` has been sunk into another
@@ -2230,6 +2248,29 @@
       (alu_rrr op ty x_lo y_lo)
       (alu_rrr op ty x_hi y_hi))))

+;; Helper for emitting `MInst.VecLoadReplicate` instructions.
+(decl ld1r (Reg VectorSize MemFlags) Reg)
+(rule (ld1r src size flags)
+      (let ((dst WritableReg (temp_writable_reg $I8X16))
+            (_ Unit (emit (MInst.VecLoadReplicate dst src size flags))))
+        dst))
+
+;; Helper for emitting `MInst.LoadAddr` instructions.
+(decl load_addr (AMode) Reg)
+(rule (load_addr addr)
+      (let ((dst WritableReg (temp_writable_reg $I64))
+            (_ Unit (emit (MInst.LoadAddr dst addr))))
+        dst))
+(rule (load_addr addr)
+      (if-let addr_reg (amode_is_reg addr))
+      addr_reg)
+
+;; Lower a vector splat with a constant parameter.
+(decl splat_const (u64 VectorSize) Reg)
+;; TODO: Port lower_splat_const() to ISLE.
+(extern constructor splat_const splat_const)
+
 ;; Generate comparison to zero operator from input condition code
 (decl float_cc_cmp_zero_to_vec_misc_op (FloatCC) VecMisc2)
 (extern constructor float_cc_cmp_zero_to_vec_misc_op float_cc_cmp_zero_to_vec_misc_op)


@@ -2258,10 +2258,10 @@ impl MachInstEmit for Inst {
                     ScalarSize::Size16 => 0b00010,
                     ScalarSize::Size32 => 0b00100,
                     ScalarSize::Size64 => 0b01000,
-                    _ => unimplemented!("Unexpected VectorSize: {:?}", size),
+                    _ => unreachable!(),
                 };
                 sink.put4(
-                    0b000_01110000_00000_000011_00000_00000
+                    0b0_0_0_01110000_00000_000011_00000_00000
                         | (q << 30)
                         | (imm5 << 16)
                         | (machreg_to_gpr(rn) << 5)
@@ -2625,13 +2625,18 @@ impl MachInstEmit for Inst {
                 };
                 sink.put4(enc_vec_rrr(top11 | q << 9, rm, bit15_10, rn, rd));
             }
-            &Inst::VecLoadReplicate { rd, rn, size } => {
+            &Inst::VecLoadReplicate {
+                rd,
+                rn,
+                size,
+                flags,
+            } => {
                 let rd = allocs.next_writable(rd);
                 let rn = allocs.next(rn);
                 let (q, size) = size.enc_size();

                 let srcloc = state.cur_srcloc();
-                if srcloc != SourceLoc::default() {
+                if srcloc != SourceLoc::default() && !flags.notrap() {
                     // Register the offset at which the actual load instruction starts.
                     sink.add_trap(TrapCode::HeapOutOfBounds);
                 }
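
With `MemFlags` threaded through `VecLoadReplicate`, emission only records a trap site for loads that may actually trap. A minimal sketch of that gating, with a toy `MemFlags` (the real type lives in cranelift's IR and carries more flags than shown here):

    #[derive(Clone, Copy)]
    struct MemFlags {
        bits: u8,
    }

    impl MemFlags {
        const NOTRAP: u8 = 1;

        // Toy equivalent of `MemFlags::trusted()`: the access is known not to trap.
        fn trusted() -> Self {
            MemFlags { bits: Self::NOTRAP }
        }

        fn notrap(self) -> bool {
            self.bits & Self::NOTRAP != 0
        }
    }

    // Mirrors the emission check above: a `HeapOutOfBounds` trap record is
    // needed only when there is a source location and the load may trap.
    fn needs_trap_record(has_srcloc: bool, flags: MemFlags) -> bool {
        has_srcloc && !flags.notrap()
    }

    fn main() {
        assert!(needs_trap_record(true, MemFlags { bits: 0 }));
        assert!(!needs_trap_record(true, MemFlags::trusted()));
    }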


@@ -2351,10 +2351,10 @@ fn test_aarch64_binemit() {
                 Inst::VecDup {
                     rd: writable_vreg(25),
                     rn: xreg(7),
-                    size: VectorSize::Size8x16,
+                    size: VectorSize::Size8x8,
                 },
-                "F90C014E",
-                "dup v25.16b, w7",
+                "F90C010E",
+                "dup v25.8b, w7",
             ));
             insns.push((
                 Inst::VecDup {
@@ -2387,10 +2387,10 @@ fn test_aarch64_binemit() {
                 Inst::VecDup {
                     rd: writable_vreg(0),
                     rn: xreg(28),
-                    size: VectorSize::Size32x4,
+                    size: VectorSize::Size32x2,
                 },
-                "800F044E",
-                "dup v0.4s, w28",
+                "800F040E",
+                "dup v0.2s, w28",
             ));
             insns.push((
                 Inst::VecDup {
@@ -5199,8 +5199,8 @@ fn test_aarch64_binemit() {
                 Inst::VecLoadReplicate {
                     rd: writable_vreg(31),
                     rn: xreg(0),
                     size: VectorSize::Size64x2,
+                    flags: MemFlags::trusted(),
                 },
                 "1FCC404D",
                 "ld1r { v31.2d }, [x0]",
@@ -5210,8 +5210,8 @@ fn test_aarch64_binemit() {
                 Inst::VecLoadReplicate {
                     rd: writable_vreg(0),
                     rn: xreg(25),
                     size: VectorSize::Size8x8,
+                    flags: MemFlags::trusted(),
                 },
                 "20C3400D",
                 "ld1r { v0.8b }, [x25]",


@@ -530,17 +530,6 @@ impl Inst {
             }
         }
     }
-
-    /// Generate a LoadAddr instruction (load address of an amode into
-    /// register). Elides when possible (when amode is just a register). Returns
-    /// destination register: either `rd` or a register directly from the amode.
-    pub fn gen_load_addr(rd: Writable<Reg>, mem: AMode) -> (Reg, Option<Inst>) {
-        if let Some(r) = mem.is_reg() {
-            (r, None)
-        } else {
-            (rd.to_reg(), Some(Inst::LoadAddr { rd, mem }))
-        }
-    }
 }

 //=============================================================================
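
The deleted `gen_load_addr` survives as the two `load_addr` ISLE rules added earlier: when the addressing mode is already a bare register, that register is reused and no `LoadAddr` instruction is emitted. A rough Rust model of the elision (toy `AMode`; the temp register number is made up):

    enum AMode {
        // Base register only: the address is already in a register.
        Reg(u8),
        // Base register plus immediate offset: needs materializing.
        RegOffset(u8, i64),
    }

    fn load_addr(addr: &AMode) -> u8 {
        match addr {
            // Corresponds to the `amode_is_reg` rule: reuse the register.
            AMode::Reg(r) => *r,
            // Otherwise the real lowering allocates a temp and emits
            // `MInst.LoadAddr`; here we just return a made-up temp number.
            AMode::RegOffset(..) => 16,
        }
    }

    fn main() {
        assert_eq!(load_addr(&AMode::Reg(3)), 3);
        assert_eq!(load_addr(&AMode::RegOffset(3, 8)), 16);
    }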


@@ -165,6 +165,8 @@ pub fn create_reg_env(flags: &settings::Flags) -> MachineEnv {
             preg(xreg(14)),
             preg(xreg(15)),
             // x16 and x17 are spilltmp and tmp2 (see above).
+            // x18 could be used by the platform to carry inter-procedural state;
+            // conservatively assume so and make it not allocatable.
             // x19-28 are callee-saved and so not preferred.
             // x21 is the pinned register (if enabled) and not allocatable if so.
             // x29 is FP, x30 is LR, x31 is SP/ZR.
@@ -178,30 +180,7 @@ pub fn create_reg_env(flags: &settings::Flags) -> MachineEnv {
             preg(vreg(5)),
             preg(vreg(6)),
             preg(vreg(7)),
-            preg(vreg(8)),
-            preg(vreg(9)),
-            preg(vreg(10)),
-            preg(vreg(11)),
-            preg(vreg(12)),
-            preg(vreg(13)),
-            preg(vreg(14)),
-            preg(vreg(15)),
-        ],
-    ],
-    non_preferred_regs_by_class: [
-        vec![
-            preg(xreg(19)),
-            preg(xreg(20)),
-            // x21 is pinned reg if enabled; we add to this list below if not.
-            preg(xreg(22)),
-            preg(xreg(23)),
-            preg(xreg(24)),
-            preg(xreg(25)),
-            preg(xreg(26)),
-            preg(xreg(27)),
-            preg(xreg(28)),
-        ],
-        vec![
+            // v8-15 are callee-saved and so not preferred.
             preg(vreg(16)),
             preg(vreg(17)),
             preg(vreg(18)),
@@ -220,6 +199,30 @@ pub fn create_reg_env(flags: &settings::Flags) -> MachineEnv {
             preg(vreg(31)),
         ],
     ],
+    non_preferred_regs_by_class: [
+        vec![
+            preg(xreg(19)),
+            preg(xreg(20)),
+            // x21 is pinned reg if enabled; we add to this list below if not.
+            preg(xreg(22)),
+            preg(xreg(23)),
+            preg(xreg(24)),
+            preg(xreg(25)),
+            preg(xreg(26)),
+            preg(xreg(27)),
+            preg(xreg(28)),
+        ],
+        vec![
+            preg(vreg(8)),
+            preg(vreg(9)),
+            preg(vreg(10)),
+            preg(vreg(11)),
+            preg(vreg(12)),
+            preg(vreg(13)),
+            preg(vreg(14)),
+            preg(vreg(15)),
+        ],
+    ],
     fixed_stack_slots: vec![],
 };


@@ -1423,7 +1423,8 @@
 ;;;; Rules for `bitselect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

-(rule (lower (has_type (ty_int_bool_ref_scalar_64 ty) (bitselect c x y)))
+(rule (lower (has_type ty (bitselect c x y)))
+      (if (ty_int_bool_ref_scalar_64 ty))
       (let ((tmp1 Reg (and_reg ty x c))
             (tmp2 Reg (bic ty y c)))
         (orr ty tmp1 tmp2)))
@@ -1441,12 +1442,14 @@
 ;; T -> I{64,32,16,8}: We can simply pass through the value: values
 ;; are always stored with high bits undefined, so we can just leave
 ;; them be.
-(rule (lower (has_type (ty_int_bool_ref_scalar_64 ty) (ireduce src)))
+(rule (lower (has_type ty (ireduce src)))
+      (if (ty_int_bool_ref_scalar_64 ty))
       (value_regs_get src 0))

 ;; Likewise for breduce.
-(rule (lower (has_type (ty_int_bool_ref_scalar_64 ty) (breduce src)))
+(rule (lower (has_type ty (breduce src)))
+      (if (ty_int_bool_ref_scalar_64 ty))
       (value_regs_get src 0))
@@ -1515,6 +1518,39 @@
       (let ((use_allocated_encoding bool (is_not_baldrdash_call_conv)))
         (side_effect (udf use_allocated_encoding trap_code))))

+;;;; Rules for `splat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+(rule (lower (has_type ty (splat x @ (value_type in_ty))))
+      (if (ty_int_bool_ref_scalar_64 in_ty))
+      (vec_dup x (vector_size ty)))
+
+(rule (lower (has_type ty (splat x @ (value_type (ty_scalar_float _)))))
+      (vec_dup_from_fpu x (vector_size ty)))
+
+(rule (lower (has_type ty (splat (bconst (u64_from_bool n)))))
+      (splat_const n (vector_size ty)))
+
+(rule (lower (has_type ty (splat (breduce (bconst (u64_from_bool n))))))
+      (splat_const n (vector_size ty)))
+
+(rule (lower (has_type ty (splat (f32const (u64_from_ieee32 n)))))
+      (splat_const n (vector_size ty)))
+
+(rule (lower (has_type ty (splat (f64const (u64_from_ieee64 n)))))
+      (splat_const n (vector_size ty)))
+
+(rule (lower (has_type ty (splat (iconst (u64_from_imm64 n)))))
+      (splat_const n (vector_size ty)))
+
+(rule (lower (has_type ty (splat (ireduce (iconst (u64_from_imm64 n))))))
+      (splat_const n (vector_size ty)))
+
+(rule (lower (has_type ty (splat x @ (load flags _addr offset))))
+      (if-let mem_op (is_sinkable_inst x))
+      (let ((_ Unit (sink_inst mem_op))
+            (addr AMode (amode (lane_type ty) mem_op offset))
+            (address Reg (load_addr addr)))
+        (ld1r address (vector_size ty) flags)))
+
 ;;;; Rules for `AtomicLoad` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

 (rule (lower (has_type (valid_atomic_transaction ty) (atomic_load flags addr)))
@@ -1527,7 +1563,6 @@
         addr))
   (side_effect (store_release ty src addr)))
-
 ;;;; Rules for `AtomicRMW` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

 (rule 1 (lower (and (use_lse)
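
All of the constant-splat rules above funnel into `splat_const` with the constant's raw bits as a `u64`. Plain-Rust stand-ins for the bit extraction that the `u64_from_bool`, `u64_from_ieee32`, and `u64_from_ieee64` extractors perform (a sketch, not cranelift's code):

    // CLIF booleans materialize as all-ones masks, so `bconst true` splats
    // to an all-ones vector; floats pass through as IEEE bit patterns.
    fn bits_of_bconst(b: bool) -> u64 {
        if b { u64::MAX } else { 0 }
    }

    fn bits_of_f32(x: f32) -> u64 {
        x.to_bits() as u64
    }

    fn bits_of_f64(x: f64) -> u64 {
        x.to_bits()
    }

    fn main() {
        assert_eq!(bits_of_bconst(true), u64::MAX);
        assert_eq!(bits_of_f32(1.0), 0x3f80_0000);
        assert_eq!(bits_of_f64(2.0), 0x4000_0000_0000_0000);
    }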


@@ -5,12 +5,13 @@ pub mod generated_code;
 // Types that the generated ISLE code uses via `use super::*`.
 use super::{
-    writable_zero_reg, zero_reg, AMode, ASIMDFPModImm, ASIMDMovModImm, BranchTarget, CallIndInfo,
-    CallInfo, Cond, CondBrKind, ExtendOp, FPUOpRI, FloatCC, Imm12, ImmLogic, ImmShift,
+    insn_inputs, writable_zero_reg, zero_reg, AMode, ASIMDFPModImm, ASIMDMovModImm, BranchTarget,
+    CallIndInfo, CallInfo, Cond, CondBrKind, ExtendOp, FPUOpRI, FloatCC, Imm12, ImmLogic, ImmShift,
     Inst as MInst, IntCC, JTSequenceInfo, MachLabel, MoveWideConst, MoveWideOp, NarrowValueMode,
     Opcode, OperandSize, PairAMode, Reg, ScalarSize, ShiftOpAndAmt, UImm5, VecMisc2, VectorSize,
     NZCV,
 };
+use crate::isa::aarch64::lower::{lower_address, lower_splat_const};
 use crate::isa::aarch64::settings::Flags as IsaFlags;
 use crate::machinst::{isle::*, InputSourceInst};
 use crate::settings::Flags;
@@ -442,4 +443,25 @@ where
             _ => panic!(),
         }
     }
+
+    fn amode(&mut self, ty: Type, mem_op: Inst, offset: u32) -> AMode {
+        lower_address(
+            self.lower_ctx,
+            ty,
+            &insn_inputs(self.lower_ctx, mem_op)[..],
+            offset as i32,
+        )
+    }
+
+    fn amode_is_reg(&mut self, address: &AMode) -> Option<Reg> {
+        address.is_reg()
+    }
+
+    fn splat_const(&mut self, value: u64, size: &VectorSize) -> Reg {
+        let rd = self.temp_writable_reg(I8X16);
+        lower_splat_const(self.lower_ctx, rd, value, *size);
+        rd.to_reg()
+    }
 }


@@ -741,80 +741,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             }
         }
-        Opcode::Splat => {
-            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            let ty = ty.unwrap();
-            // TODO: Handle SVE Dup.
-            let ty = if ty.is_dynamic_vector() {
-                dynamic_to_fixed(ty)
-            } else {
-                ty
-            };
-            let size = VectorSize::from_ty(ty);
-
-            if let Some((_, insn)) = maybe_input_insn_multi(
-                ctx,
-                inputs[0],
-                &[
-                    Opcode::Bconst,
-                    Opcode::F32const,
-                    Opcode::F64const,
-                    Opcode::Iconst,
-                ],
-            ) {
-                lower_splat_const(ctx, rd, ctx.get_constant(insn).unwrap(), size);
-            } else if let Some(insn) =
-                maybe_input_insn_via_conv(ctx, inputs[0], Opcode::Iconst, Opcode::Ireduce)
-            {
-                lower_splat_const(ctx, rd, ctx.get_constant(insn).unwrap(), size);
-            } else if let Some(insn) =
-                maybe_input_insn_via_conv(ctx, inputs[0], Opcode::Bconst, Opcode::Breduce)
-            {
-                lower_splat_const(ctx, rd, ctx.get_constant(insn).unwrap(), size);
-            } else if let Some((_, insn)) = maybe_input_insn_multi(
-                ctx,
-                inputs[0],
-                &[
-                    Opcode::Uload8,
-                    Opcode::Sload8,
-                    Opcode::Uload16,
-                    Opcode::Sload16,
-                    Opcode::Uload32,
-                    Opcode::Sload32,
-                    Opcode::Load,
-                ],
-            ) {
-                ctx.sink_inst(insn);
-                let load_inputs = insn_inputs(ctx, insn);
-                let load_outputs = insn_outputs(ctx, insn);
-                lower_load(
-                    ctx,
-                    insn,
-                    &load_inputs[..],
-                    load_outputs[0],
-                    |ctx, _rd, _elem_ty, mem| {
-                        let tmp = ctx.alloc_tmp(I64).only_reg().unwrap();
-                        let (addr, addr_inst) = Inst::gen_load_addr(tmp, mem);
-                        if let Some(addr_inst) = addr_inst {
-                            ctx.emit(addr_inst);
-                        }
-                        ctx.emit(Inst::VecLoadReplicate { rd, rn: addr, size });
-                        Ok(())
-                    },
-                )?;
-            } else {
-                let input_ty = ctx.input_ty(insn, 0);
-                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-                let inst = if ty_has_int_representation(input_ty) {
-                    Inst::VecDup { rd, rn, size }
-                } else {
-                    Inst::VecDupFromFpu { rd, rn, size }
-                };
-
-                ctx.emit(inst);
-            }
-        }
+        Opcode::Splat => implemented_in_isle(ctx),

         Opcode::ScalarToVector => implemented_in_isle(ctx),


@@ -868,7 +868,7 @@
   ;; Pseudoinstruction to keep a value alive.
   (DummyUse
     (reg Reg))

   ;; An unwind pseudoinstruction describing the state of the
   ;; machine at this program point.
   (Unwind
@@ -1641,15 +1641,6 @@
 (decl sinkable_inst (Inst) Value)
 (extern extractor sinkable_inst sinkable_inst)

-;; Sink a sinkable instruction.
-;;
-;; This is a side-effectful operation that notifies the context that the
-;; sinkable instruction been sunk into another instruction, and no longer
-;; needs to be lowered.
-(decl sink_inst (Inst) Unit)
-(extern constructor sink_inst sink_inst)
-
 ;; Sinkable big-endian load instruction.
 (decl sinkable_load (Inst) Value)
 (extractor (sinkable_load inst)


@@ -1656,8 +1656,9 @@
 ;; Insert vector lane from general-purpose register.
 (rule (lower (insertlane x @ (value_type ty)
-                         y @ (value_type (ty_int_bool_ref_scalar_64 _))
+                         y @ (value_type in_ty)
                          (u8_from_uimm8 idx)))
+      (if (ty_int_bool_ref_scalar_64 in_ty))
       (vec_insert_lane ty x y (be_lane_idx ty idx) (zero_reg)))

 ;; Insert vector lane from floating-point register.
@@ -1771,8 +1772,9 @@
 ;;;; Rules for `extractlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

 ;; Extract vector lane to general-purpose register.
-(rule (lower (has_type (ty_int_bool_ref_scalar_64 _)
+(rule (lower (has_type out_ty
              (extractlane x @ (value_type ty) (u8_from_uimm8 idx))))
+      (if (ty_int_bool_ref_scalar_64 out_ty))
       (vec_extract_lane ty x (be_lane_idx ty idx) (zero_reg)))

 ;; Extract vector lane to floating-point register.
@@ -1828,8 +1830,8 @@
 ;;;; Rules for `splat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

 ;; Load replicated value from general-purpose register.
-(rule (lower (has_type ty (splat
-                           x @ (value_type (ty_int_bool_ref_scalar_64 _)))))
+(rule (lower (has_type ty (splat x @ (value_type in_ty))))
+      (if (ty_int_bool_ref_scalar_64 in_ty))
       (vec_replicate_lane ty (vec_insert_lane_undef ty x 0 (zero_reg)) 0))

 ;; Load replicated value from floating-point register.
@@ -1888,7 +1890,8 @@
 ;; Load scalar value from general-purpose register.
 (rule (lower (has_type ty (scalar_to_vector
-                           x @ (value_type (ty_int_bool_ref_scalar_64 _)))))
+                           x @ (value_type in_ty))))
+      (if (ty_int_bool_ref_scalar_64 in_ty))
       (vec_insert_lane ty (vec_imm ty 0) x (be_lane_idx ty 0) (zero_reg)))

 ;; Load scalar value from floating-point register.


@@ -666,11 +666,6 @@ where
         None
     }

-    #[inline]
-    fn sink_inst(&mut self, inst: Inst) -> Unit {
-        self.lower_ctx.sink_inst(inst);
-    }
-
     #[inline]
     fn emit(&mut self, inst: &MInst) -> Unit {
         self.lower_ctx.emit(inst.clone());


@@ -11,7 +11,9 @@ pub use crate::ir::{
     SigRef, StackSlot,
 };
 pub use crate::isa::unwind::UnwindInst;
-pub use crate::machinst::{ABIArg, ABIArgSlot, ABISig, RealReg, Reg, RelocDistance, Writable};
+pub use crate::machinst::{
+    ABIArg, ABIArgSlot, ABISig, InputSourceInst, RealReg, Reg, RelocDistance, Writable,
+};

 pub type Unit = ();
 pub type ValueSlice = (ValueList, usize);
@@ -425,6 +427,15 @@ macro_rules! isle_prelude_methods {
             imm.bits() as u64
         }

+        #[inline]
+        fn u64_from_bool(&mut self, b: bool) -> u64 {
+            if b {
+                u64::MAX
+            } else {
+                0
+            }
+        }
+
         #[inline]
         fn inst_results(&mut self, inst: Inst) -> ValueSlice {
             (self.lower_ctx.dfg().inst_results_list(inst), 0)
@@ -854,6 +865,21 @@ macro_rules! isle_prelude_methods {
         fn real_reg_to_writable_reg(&mut self, reg: RealReg) -> WritableReg {
             Writable::from_reg(Reg::from(reg))
         }
+
+        fn is_sinkable_inst(&mut self, val: Value) -> Option<Inst> {
+            let input = self.lower_ctx.get_value_as_source_or_const(val);
+
+            if let InputSourceInst::UniqueUse(inst, _) = input.inst {
+                Some(inst)
+            } else {
+                None
+            }
+        }
+
+        #[inline]
+        fn sink_inst(&mut self, inst: Inst) {
+            self.lower_ctx.sink_inst(inst);
+        }
     };
 }
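
`is_sinkable_inst` deliberately matches only `InputSourceInst::UniqueUse`: if the loaded value had another consumer, sinking the load into the splat would either duplicate the memory access or starve the other use. A toy illustration of that guard (illustrative names, not cranelift's types):

    // `u32` stands in for an instruction handle.
    enum InputSource {
        UniqueUse(u32), // single consumer: safe to sink
        Use(u32),       // multiple consumers: must stay materialized
    }

    fn sinkable(src: &InputSource) -> Option<u32> {
        match src {
            InputSource::UniqueUse(inst) => Some(*inst),
            InputSource::Use(_) => None,
        }
    }

    fn main() {
        assert_eq!(sinkable(&InputSource::UniqueUse(7)), Some(7));
        assert_eq!(sinkable(&InputSource::Use(7)), None);
    }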


@@ -308,10 +308,10 @@
 (decl fits_in_64 (Type) Type)
 (extern extractor fits_in_64 fits_in_64)

-;; An extractor that only matches scalar booleans, integers, and references that
-;; can fit in 64 bits.
-(decl ty_int_bool_ref_scalar_64 (Type) Type)
-(extern extractor ty_int_bool_ref_scalar_64 ty_int_bool_ref_scalar_64)
+;; A pure constructor that only matches scalar booleans, integers, and
+;; references that can fit in 64 bits.
+(decl pure ty_int_bool_ref_scalar_64 (Type) Type)
+(extern constructor ty_int_bool_ref_scalar_64 ty_int_bool_ref_scalar_64)

 ;; An extractor that matches 32- and 64-bit types only.
 (decl ty_32_or_64 (Type) Type)
@@ -407,6 +407,10 @@
 (decl u8_from_uimm8 (u8) Uimm8)
 (extern extractor infallible u8_from_uimm8 u8_from_uimm8)

+;; Extract a `u64` from a `bool`.
+(decl u64_from_bool (u64) bool)
+(extern extractor infallible u64_from_bool u64_from_bool)
+
 ;; Extract a `u64` from an `Imm64`.
 (decl u64_from_imm64 (u64) Imm64)
 (extern extractor infallible u64_from_imm64 u64_from_imm64)
@@ -498,6 +502,10 @@
 (decl pure zero_value (Value) Value)
 (extern constructor zero_value zero_value)

+;; Match a sinkable instruction from a value operand.
+(decl pure is_sinkable_inst (Value) Inst)
+(extern constructor is_sinkable_inst is_sinkable_inst)
+
 ;; Instruction creation helpers ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

 ;; Emit an instruction.
@@ -508,6 +516,14 @@
 (decl emit (MInst) Unit)
 (extern constructor emit emit)

+;; Sink an instruction.
+;;
+;; This is a side-effectful operation that notifies the context that the
+;; instruction has been sunk into another instruction, and no longer needs to
+;; be lowered.
+(decl sink_inst (Inst) Unit)
+(extern constructor sink_inst sink_inst)
+
 ;; Constant pool emission.
 (type VCodeConstant (primitive VCodeConstant))
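
`ty_int_bool_ref_scalar_64` changes from an extractor (matched in pattern position) to a pure constructor (called under an `(if ...)` guard). Behaviorally it is a fallible pure function over types; a simplified sketch with the type reduced to a width/vector pair:

    // Returns the type back (here, its bit width) when it is a scalar that
    // fits in 64 bits, and None otherwise -- the guard form used by the
    // rewritten rules.
    fn ty_int_bool_ref_scalar_64(bits: u32, is_vector: bool) -> Option<u32> {
        if !is_vector && bits <= 64 {
            Some(bits)
        } else {
            None
        }
    }

    fn main() {
        assert_eq!(ty_int_bool_ref_scalar_64(32, false), Some(32));
        assert_eq!(ty_int_bool_ref_scalar_64(128, false), None);
        assert_eq!(ty_int_bool_ref_scalar_64(32, true), None);
    }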


@@ -244,18 +244,13 @@ block0(v0: i128):
     return v1
 }

-; stp fp, lr, [sp, #-16]!
-; mov fp, sp
-; stp d11, d13, [sp, #-16]!
 ; block0:
 ; fmov d6, x0
 ; mov v6.d[1], x1
-; cnt v11.16b, v6.16b
-; addv b13, v11.16b
-; umov w0, v13.b[0]
+; cnt v19.16b, v6.16b
+; addv b21, v19.16b
+; umov w0, v21.b[0]
 ; movz w1, #0
-; ldp d11, d13, [sp], #16
-; ldp fp, lr, [sp], #16
 ; ret

 function %d(i64) -> i64 {


@@ -15,9 +15,9 @@ block0(v0: i16):
 }

 ; block0:
-; dup v2.4h, w0
-; mov v7.16b, v2.16b
-; mov v7.d[1], v2.d[0]
+; dup v6.4h, w0
+; mov v7.16b, v6.16b
+; mov v7.d[1], v6.d[0]
 ; sqxtn v0.8b, v7.8h
 ; ret

@@ -35,9 +35,9 @@ block0(v0: i16):
 }

 ; block0:
-; dup v2.8h, w0
-; sqxtn v0.8b, v2.8h
-; sqxtn2 v0.16b, v2.8h
+; dup v6.8h, w0
+; sqxtn v0.8b, v6.8h
+; sqxtn2 v0.16b, v6.8h
 ; ret

 function %snarrow_i32x2(i32) -> i16x4 {
@@ -54,9 +54,9 @@ block0(v0: i32):
 }

 ; block0:
-; dup v2.2s, w0
-; mov v7.16b, v2.16b
-; mov v7.d[1], v2.d[0]
+; dup v6.2s, w0
+; mov v7.16b, v6.16b
+; mov v7.d[1], v6.d[0]
 ; sqxtn v0.4h, v7.4s
 ; ret

@@ -74,9 +74,9 @@ block0(v0: i32):
 }

 ; block0:
-; dup v2.4s, w0
-; sqxtn v0.4h, v2.4s
-; sqxtn2 v0.8h, v2.4s
+; dup v6.4s, w0
+; sqxtn v0.4h, v6.4s
+; sqxtn2 v0.8h, v6.4s
 ; ret

 function %snarrow_i64x2(i64) -> i32x4 {
@@ -93,9 +93,9 @@ block0(v0: i64):
 }

 ; block0:
-; dup v2.2d, x0
-; sqxtn v0.2s, v2.2d
-; sqxtn2 v0.4s, v2.2d
+; dup v6.2d, x0
+; sqxtn v0.2s, v6.2d
+; sqxtn2 v0.4s, v6.2d
 ; ret

 function %unarrow_i16x4(i16) -> i8x8 {
@@ -112,9 +112,9 @@ block0(v0: i16):
 }

 ; block0:
-; dup v2.4h, w0
-; mov v7.16b, v2.16b
-; mov v7.d[1], v2.d[0]
+; dup v6.4h, w0
+; mov v7.16b, v6.16b
+; mov v7.d[1], v6.d[0]
 ; sqxtun v0.8b, v7.8h
 ; ret

@@ -132,9 +132,9 @@ block0(v0: i16):
 }

 ; block0:
-; dup v2.8h, w0
-; sqxtun v0.8b, v2.8h
-; sqxtun2 v0.16b, v2.8h
+; dup v6.8h, w0
+; sqxtun v0.8b, v6.8h
+; sqxtun2 v0.16b, v6.8h
 ; ret

 function %unarrow_i32x2(i32) -> i16x4 {
@@ -151,9 +151,9 @@ block0(v0: i32):
 }

 ; block0:
-; dup v2.2s, w0
-; mov v7.16b, v2.16b
-; mov v7.d[1], v2.d[0]
+; dup v6.2s, w0
+; mov v7.16b, v6.16b
+; mov v7.d[1], v6.d[0]
 ; sqxtun v0.4h, v7.4s
 ; ret

@@ -171,9 +171,9 @@ block0(v0: i32):
 }

 ; block0:
-; dup v2.4s, w0
-; sqxtun v0.4h, v2.4s
-; sqxtun2 v0.8h, v2.4s
+; dup v6.4s, w0
+; sqxtun v0.4h, v6.4s
+; sqxtun2 v0.8h, v6.4s
 ; ret

 function %unarrow_i64x2(i64) -> i32x4 {
@@ -190,9 +190,9 @@ block0(v0: i64):
 }

 ; block0:
-; dup v2.2d, x0
-; sqxtun v0.2s, v2.2d
-; sqxtun2 v0.4s, v2.2d
+; dup v6.2d, x0
+; sqxtun v0.2s, v6.2d
+; sqxtun2 v0.4s, v6.2d
 ; ret

 function %uunarrow_i16x4(i16) -> i8x8 {
@@ -209,9 +209,9 @@ block0(v0: i16):
 }

 ; block0:
-; dup v2.4h, w0
-; mov v7.16b, v2.16b
-; mov v7.d[1], v2.d[0]
+; dup v6.4h, w0
+; mov v7.16b, v6.16b
+; mov v7.d[1], v6.d[0]
 ; uqxtn v0.8b, v7.8h
 ; ret

@@ -229,9 +229,9 @@ block0(v0: i16):
 }

 ; block0:
-; dup v2.8h, w0
-; uqxtn v0.8b, v2.8h
-; uqxtn2 v0.16b, v2.8h
+; dup v6.8h, w0
+; uqxtn v0.8b, v6.8h
+; uqxtn2 v0.16b, v6.8h
 ; ret

 function %uunarrow_i32x2(i32) -> i16x4 {
@@ -248,9 +248,9 @@ block0(v0: i32):
 }

 ; block0:
-; dup v2.2s, w0
-; mov v7.16b, v2.16b
-; mov v7.d[1], v2.d[0]
+; dup v6.2s, w0
+; mov v7.16b, v6.16b
+; mov v7.d[1], v6.d[0]
 ; uqxtn v0.4h, v7.4s
 ; ret

@@ -268,9 +268,9 @@ block0(v0: i32):
 }

 ; block0:
-; dup v2.4s, w0
-; uqxtn v0.4h, v2.4s
-; uqxtn2 v0.8h, v2.4s
+; dup v6.4s, w0
+; uqxtn v0.4h, v6.4s
+; uqxtn2 v0.8h, v6.4s
 ; ret

 function %uunarrow_i64x2(i64) -> i32x4 {
@@ -287,8 +287,7 @@ block0(v0: i64):
 }

 ; block0:
-; dup v2.2d, x0
-; uqxtn v0.2s, v2.2d
-; uqxtn2 v0.4s, v2.2d
+; dup v6.2d, x0
+; uqxtn v0.2s, v6.2d
+; uqxtn2 v0.4s, v6.2d
 ; ret


@@ -1,4 +1,4 @@
-test compile
+test compile precise-output
 target aarch64

 function %i8x16_splat_add(i8, i8) -> i8x16 {
@@ -13,10 +13,11 @@ block0(v0: i8, v1: i8):
     return v5
 }

-; check: dup v4.16b, w0
-; nextln: dup v6.16b, w1
-; nextln: add v0.16b, v4.16b, v6.16b
-; nextln: ret
+; block0:
+; dup v16.16b, w0
+; dup v17.16b, w1
+; add v0.16b, v16.16b, v17.16b
+; ret

 function %i16x8_splat_add(i16, i16) -> i16x8 {
 gv0 = dyn_scale_target_const.i16x8
@@ -30,10 +31,11 @@ block0(v0: i16, v1: i16):
     return v5
 }

-; check: dup v4.8h, w0
-; nextln: dup v6.8h, w1
-; nextln: add v0.8h, v4.8h, v6.8h
-; nextln: ret
+; block0:
+; dup v16.8h, w0
+; dup v17.8h, w1
+; add v0.8h, v16.8h, v17.8h
+; ret

 function %i32x4_splat_mul(i32, i32) -> i32x4 {
 gv0 = dyn_scale_target_const.i32x4
@@ -47,10 +49,11 @@ block0(v0: i32, v1: i32):
     return v5
 }

-; check: dup v4.4s, w0
-; nextln: dup v6.4s, w1
-; nextln: mul v0.4s, v4.4s, v6.4s
-; nextln: ret
+; block0:
+; dup v16.4s, w0
+; dup v17.4s, w1
+; mul v0.4s, v16.4s, v17.4s
+; ret

 function %i64x2_splat_sub(i64, i64) -> i64x2 {
 gv0 = dyn_scale_target_const.i64x2
@@ -64,10 +67,11 @@ block0(v0: i64, v1: i64):
     return v5
 }

-; check: dup v4.2d, x0
-; nextln: dup v6.2d, x1
-; nextln: sub v0.2d, v4.2d, v6.2d
-; nextln: ret
+; block0:
+; dup v16.2d, x0
+; dup v17.2d, x1
+; sub v0.2d, v16.2d, v17.2d
+; ret

 function %f32x4_splat_add(f32, f32) -> f32x4 {
 gv0 = dyn_scale_target_const.f32x4
@@ -81,10 +85,11 @@ block0(v0: f32, v1: f32):
     return v5
 }

-; check: dup v4.4s, v0.s[0]
-; nextln: dup v6.4s, v1.s[0]
-; nextln: fadd v0.4s, v4.4s, v6.4s
-; nextln: ret
+; block0:
+; dup v16.4s, v0.s[0]
+; dup v17.4s, v1.s[0]
+; fadd v0.4s, v16.4s, v17.4s
+; ret

 function %f64x2_splat_sub(f64, f64) -> f64x2 {
 gv0 = dyn_scale_target_const.f64x2
@@ -98,10 +103,11 @@ block0(v0: f64, v1: f64):
     return v5
 }

-; check: dup v4.2d, v0.d[0]
-; nextln: dup v6.2d, v1.d[0]
-; nextln: fsub v0.2d, v4.2d, v6.2d
-; nextln: ret
+; block0:
+; dup v16.2d, v0.d[0]
+; dup v17.2d, v1.d[0]
+; fsub v0.2d, v16.2d, v17.2d
+; ret

 function %f64x2_splat_mul(f64, f64) -> f64x2 {
 gv0 = dyn_scale_target_const.f64x2
@@ -115,10 +121,11 @@ block0(v0: f64, v1: f64):
     return v5
 }

-; check: dup v4.2d, v0.d[0]
-; nextln: dup v6.2d, v1.d[0]
-; nextln: fmul v0.2d, v4.2d, v6.2d
-; nextln: ret
+; block0:
+; dup v16.2d, v0.d[0]
+; dup v17.2d, v1.d[0]
+; fmul v0.2d, v16.2d, v17.2d
+; ret

 function %f64x2_splat_div(f64, f64) -> f64x2 {
 gv0 = dyn_scale_target_const.f64x2
@@ -132,10 +139,11 @@ block0(v0: f64, v1: f64):
     return v5
 }

-; check: dup v4.2d, v0.d[0]
-; nextln: dup v6.2d, v1.d[0]
-; nextln: fdiv v0.2d, v4.2d, v6.2d
-; nextln: ret
+; block0:
+; dup v16.2d, v0.d[0]
+; dup v17.2d, v1.d[0]
+; fdiv v0.2d, v16.2d, v17.2d
+; ret

 function %f64x2_splat_min(f64, f64) -> f64x2 {
 gv0 = dyn_scale_target_const.f64x2
@@ -149,10 +157,11 @@ block0(v0: f64, v1: f64):
     return v5
 }

-; check: dup v4.2d, v0.d[0]
-; nextln: dup v6.2d, v1.d[0]
-; nextln: fmin v0.2d, v4.2d, v6.2d
-; nextln: ret
+; block0:
+; dup v16.2d, v0.d[0]
+; dup v17.2d, v1.d[0]
+; fmin v0.2d, v16.2d, v17.2d
+; ret

 function %f64x2_splat_max(f64, f64) -> f64x2 {
 gv0 = dyn_scale_target_const.f64x2
@@ -166,10 +175,11 @@ block0(v0: f64, v1: f64):
     return v5
 }

-; check: dup v4.2d, v0.d[0]
-; nextln: dup v6.2d, v1.d[0]
-; nextln: fmax v0.2d, v4.2d, v6.2d
-; nextln: ret
+; block0:
+; dup v16.2d, v0.d[0]
+; dup v17.2d, v1.d[0]
+; fmax v0.2d, v16.2d, v17.2d
+; ret

 function %f64x2_splat_min_pseudo(f64, f64) -> f64x2 {
 gv0 = dyn_scale_target_const.f64x2
@@ -183,11 +193,12 @@ block0(v0: f64, v1: f64):
     return v5
 }

-; check: dup v4.2d, v0.d[0]
-; nextln: dup v6.2d, v1.d[0]
-; nextln: fcmgt v0.2d, v4.2d, v6.2d
-; nextln: bsl v0.16b, v6.16b, v4.16b
-; nextln: ret
+; block0:
+; dup v17.2d, v0.d[0]
+; dup v18.2d, v1.d[0]
+; fcmgt v0.2d, v17.2d, v18.2d
+; bsl v0.16b, v18.16b, v17.16b
+; ret

 function %f64x2_splat_max_pseudo(f64, f64) -> f64x2 {
 gv0 = dyn_scale_target_const.f64x2
@@ -201,8 +212,9 @@ block0(v0: f64, v1: f64):
     return v5
 }

-; check: dup v4.2d, v0.d[0]
-; nextln: dup v6.2d, v1.d[0]
-; nextln: fcmgt v0.2d, v6.2d, v4.2d
-; nextln: bsl v0.16b, v6.16b, v4.16b
-; nextln: ret
+; block0:
+; dup v17.2d, v0.d[0]
+; dup v18.2d, v1.d[0]
+; fcmgt v0.2d, v18.2d, v17.2d
+; bsl v0.16b, v18.16b, v17.16b
+; ret


@@ -1,4 +1,4 @@
-test compile
+test compile precise-output
 target aarch64

 function %swidenhigh_i8x16(i8) -> i16x8 {
@@ -14,9 +14,10 @@ block0(v0: i8):
     return v3
 }

-; check: dup v2.16b, w0
-; nextln: sxtl2 v0.8h, v2.16b
-; nextln: ret
+; block0:
+; dup v5.16b, w0
+; sxtl2 v0.8h, v5.16b
+; ret

 function %swidenhigh_i16x8(i16) -> i32x4 {
 gv0 = dyn_scale_target_const.i32x4
@@ -31,9 +32,10 @@ block0(v0: i16):
     return v3
 }

-; check: dup v2.8h, w0
-; nextln: sxtl2 v0.4s, v2.8h
-; nextln: ret
+; block0:
+; dup v5.8h, w0
+; sxtl2 v0.4s, v5.8h
+; ret

 function %swidenhigh_i32x4(i32) -> i64x2 {
 gv0 = dyn_scale_target_const.i32x4
@@ -48,9 +50,10 @@ block0(v0: i32):
     return v3
 }

-; check: dup v2.4s, w0
-; nextln: sxtl2 v0.2d, v2.4s
-; nextln: ret
+; block0:
+; dup v5.4s, w0
+; sxtl2 v0.2d, v5.4s
+; ret

 function %swidenlow_i8x16(i8) -> i16x8 {
 gv0 = dyn_scale_target_const.i16x8
@@ -65,9 +68,10 @@ block0(v0: i8):
     return v3
 }

-; check: dup v2.16b, w0
-; nextln: sxtl v0.8h, v2.8b
-; nextln: ret
+; block0:
+; dup v5.16b, w0
+; sxtl v0.8h, v5.8b
+; ret

 function %swidenlow_i16x8(i16) -> i32x4 {
 gv0 = dyn_scale_target_const.i32x4
@@ -82,9 +86,10 @@ block0(v0: i16):
     return v3
 }

-; check: dup v2.8h, w0
-; nextln: sxtl v0.4s, v2.4h
-; nextln: ret
+; block0:
+; dup v5.8h, w0
+; sxtl v0.4s, v5.4h
+; ret

 function %swidenlow_i32x4(i32) -> i64x2 {
 gv0 = dyn_scale_target_const.i32x4
@@ -99,6 +104,7 @@ block0(v0: i32):
     return v3
 }

-; check: dup v2.4s, w0
-; nextln: sxtl v0.2d, v2.2s
-; nextln: ret
+; block0:
+; dup v5.4s, w0
+; sxtl v0.2d, v5.2s
+; ret


@@ -58,9 +58,9 @@ block0(v0: i32):
 ; mov fp, sp
 ; sub sp, sp, #16
 ; block0:
-; dup v2.4s, w0
-; mov x4, sp
-; str q2, [x4]
+; dup v3.4s, w0
+; mov x3, sp
+; str q3, [x3]
 ; add sp, sp, #16
 ; ldp fp, lr, [sp], #16
 ; ret

@@ -101,9 +101,9 @@ block0(v0: i32):
 ; mov fp, sp
 ; sub sp, sp, #16
 ; block0:
-; dup v2.4s, w0
-; mov x4, sp
-; str q2, [x4]
+; dup v3.4s, w0
+; mov x3, sp
+; str q3, [x3]
 ; add sp, sp, #16
 ; ldp fp, lr, [sp], #16
 ; ret


@@ -82,29 +82,6 @@ block0(v0: f64):
 ; stp d10, d11, [sp, #-16]!
 ; stp d8, d9, [sp, #-16]!
 ; block0:
-; fadd d1, d0, d0
-; fadd d2, d0, d0
-; fadd d3, d0, d0
-; fadd d4, d0, d0
-; fadd d5, d0, d0
-; fadd d6, d0, d0
-; fadd d7, d0, d0
-; fadd d8, d0, d0
-; fadd d9, d0, d0
-; fadd d10, d0, d0
-; fadd d11, d0, d0
-; fadd d12, d0, d0
-; fadd d13, d0, d0
-; fadd d14, d0, d0
-; fadd d15, d0, d0
-; fadd d16, d0, d0
-; fadd d17, d0, d0
-; fadd d18, d0, d0
-; fadd d19, d0, d0
-; fadd d20, d0, d0
-; fadd d21, d0, d0
-; fadd d22, d0, d0
-; fadd d23, d0, d0
 ; fadd d24, d0, d0
 ; fadd d25, d0, d0
 ; fadd d26, d0, d0
@@ -113,37 +90,60 @@ block0(v0: f64):
 ; fadd d29, d0, d0
 ; fadd d30, d0, d0
 ; fadd d31, d0, d0
-; fadd d0, d0, d1
-; fadd d1, d2, d3
-; fadd d2, d4, d5
-; fadd d3, d6, d7
+; fadd d1, d0, d0
+; fadd d2, d0, d0
+; fadd d3, d0, d0
+; fadd d4, d0, d0
+; fadd d5, d0, d0
+; fadd d6, d0, d0
+; fadd d7, d0, d0
+; fadd d16, d0, d0
+; fadd d17, d0, d0
+; fadd d18, d0, d0
+; fadd d19, d0, d0
+; fadd d20, d0, d0
+; fadd d21, d0, d0
+; fadd d22, d0, d0
+; fadd d23, d0, d0
+; fadd d8, d0, d0
+; fadd d9, d0, d0
+; fadd d10, d0, d0
+; fadd d11, d0, d0
+; fadd d12, d0, d0
+; fadd d13, d0, d0
+; fadd d14, d0, d0
+; fadd d15, d0, d0
+; fadd d24, d0, d24
+; fadd d25, d25, d26
+; fadd d26, d27, d28
+; fadd d27, d29, d30
+; fadd d28, d31, d1
+; fadd d29, d2, d3
+; fadd d30, d4, d5
+; fadd d31, d6, d7
+; fadd d0, d16, d17
+; fadd d1, d18, d19
+; fadd d2, d20, d21
+; fadd d3, d22, d23
 ; fadd d4, d8, d9
 ; fadd d5, d10, d11
 ; fadd d6, d12, d13
 ; fadd d7, d14, d15
-; fadd d8, d16, d17
-; fadd d9, d18, d19
-; fadd d10, d20, d21
-; fadd d11, d22, d23
-; fadd d12, d24, d25
-; fadd d13, d26, d27
-; fadd d14, d28, d29
-; fadd d15, d30, d31
-; fadd d0, d0, d1
-; fadd d1, d2, d3
-; fadd d2, d4, d5
-; fadd d3, d6, d7
-; fadd d4, d8, d9
-; fadd d5, d10, d11
-; fadd d6, d12, d13
-; fadd d7, d14, d15
-; fadd d0, d0, d1
-; fadd d1, d2, d3
-; fadd d2, d4, d5
-; fadd d3, d6, d7
-; fadd d0, d0, d1
-; fadd d1, d2, d3
-; fadd d0, d0, d1
+; fadd d24, d24, d25
+; fadd d25, d26, d27
+; fadd d26, d28, d29
+; fadd d27, d30, d31
+; fadd d28, d0, d1
+; fadd d29, d2, d3
+; fadd d30, d4, d5
+; fadd d31, d6, d7
+; fadd d24, d24, d25
+; fadd d25, d26, d27
+; fadd d26, d28, d29
+; fadd d27, d30, d31
+; fadd d24, d24, d25
+; fadd d25, d26, d27
+; fadd d0, d24, d25
 ; ldp d8, d9, [sp], #16
 ; ldp d10, d11, [sp], #16
 ; ldp d12, d13, [sp], #16
@@ -242,4 +242,3 @@ block0(v0: i64):
 ; ldr x28, [sp], #16
 ; ldp fp, lr, [sp], #16
 ; ret
-


@@ -1,4 +1,4 @@
-test interpret
+; test interpret TODO: Not yet implemented
 test run
 target aarch64
 target s390x
@@ -10,6 +10,8 @@ block0(v0: i8):
     v1 = splat.i8x16 v0
     return v1
 }
+; run: %splat_i8x16(-1) == [-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
+; run: %splat_i8x16(0) == [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 ; run: %splat_i8x16(1) == [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]

 function %splat_i16x8(i16) -> i16x8 {
@@ -17,6 +19,8 @@ block0(v0: i16):
     v1 = splat.i16x8 v0
     return v1
 }
+; run: %splat_i16x8(-1) == [-1 -1 -1 -1 -1 -1 -1 -1]
+; run: %splat_i16x8(0) == [0 0 0 0 0 0 0 0]
 ; run: %splat_i16x8(512) == [512 512 512 512 512 512 512 512]

 function %splat_i32x4(i32) -> i32x4 {
@@ -24,6 +28,8 @@ block0(v0: i32):
     v1 = splat.i32x4 v0
     return v1
 }
+; run: %splat_i32x4(-1) == [-1 -1 -1 -1]
+; run: %splat_i32x4(0) == [0 0 0 0]
 ; run: %splat_i32x4(2000000) == [2000000 2000000 2000000 2000000]

 function %splat_i64x2(i64) -> i64x2 {
@@ -31,4 +37,189 @@ block0(v0: i64):
     v1 = splat.i64x2 v0
     return v1
 }
+; run: %splat_i64x2(-1) == [-1 -1]
+; run: %splat_i64x2(0) == [0 0]
 ; run: %splat_i64x2(5000000000) == [5000000000 5000000000]
+
+function %splat_f32x4(f32) -> f32x4 {
+block0(v0: f32):
+    v1 = splat.f32x4 v0
+    return v1
+}
+; run: %splat_f32x4(-0x0.0) == [-0x0.0 -0x0.0 -0x0.0 -0x0.0]
+; run: %splat_f32x4(0x1.0) == [0x1.0 0x1.0 0x1.0 0x1.0]
+; run: %splat_f32x4(NaN) == [NaN NaN NaN NaN]
+
+function %splat_f64x2(f64) -> f64x2 {
+block0(v0: f64):
+    v1 = splat.f64x2 v0
+    return v1
+}
+; run: %splat_f64x2(0x0.0) == [0x0.0 0x0.0]
+; run: %splat_f64x2(0x2.0) == [0x2.0 0x2.0]
+; run: %splat_f64x2(NaN) == [NaN NaN]
+
+; TODO: Test combinations of `bconst` and `splat`, potentially with `breduce` in
+; the middle
+
+function %splat_i8x16_2(i8x16) -> i8x16 {
+block0(v0: i8x16):
+    v1 = iconst.i8 116
+    v2 = splat.i8x16 v1
+    v3 = iadd v0, v2
+    return v3
+}
+; run: %splat_i8x16_2([-128 -101 -75 -59 -22 -12 -7 -1 0 3 17 34 68 92 111 127]) == [-12 15 41 57 94 104 109 115 116 119 -123 -106 -72 -48 -29 -13]
+
+function %splat_i8x16_3(i8x16) -> i8x16 {
+block0(v0: i8x16):
+    v1 = iconst.i16 116
+    v2 = ireduce.i8 v1
+    v3 = splat.i8x16 v2
+    v4 = iadd v0, v3
+    return v4
+}
+; run: %splat_i8x16_3([-128 -101 -75 -59 -22 -12 -7 -1 0 3 17 34 68 92 111 127]) == [-12 15 41 57 94 104 109 115 116 119 -123 -106 -72 -48 -29 -13]
+
+function %splat_i16x8_2(i16x8) -> i16x8 {
+block0(v0: i16x8):
+    v1 = iconst.i16 42
+    v2 = splat.i16x8 v1
+    v3 = iadd v0, v2
+    return v3
+}
+; run: %splat_i16x8_2([-32768 -1500 -1 0 42 200 8576 32767]) == [-32726 -1458 41 42 84 242 8618 -32727]
+
+function %splat_i16x8_3(i16x8) -> i16x8 {
+block0(v0: i16x8):
+    v1 = iconst.i64 42
+    v2 = ireduce.i16 v1
+    v3 = splat.i16x8 v2
+    v4 = iadd v0, v3
+    return v4
+}
+; run: %splat_i16x8_3([-32768 -1500 -1 0 42 200 8576 32767]) == [-32726 -1458 41 42 84 242 8618 -32727]
+
+function %splat_i32x4_2(i32x4) -> i32x4 {
+block0(v0: i32x4):
+    v1 = iconst.i32 1024
+    v2 = splat.i32x4 v1
+    v3 = iadd v0, v2
+    return v3
+}
+; run: %splat_i32x4_2([-2147483648 -1 0 2147483647]) == [-2147482624 1023 1024 -2147482625]
+
+function %splat_i32x4_3(i32x4) -> i32x4 {
+block0(v0: i32x4):
+    v1 = iconst.i64 1024
+    v2 = ireduce.i32 v1
+    v3 = splat.i32x4 v2
+    v4 = iadd v0, v3
+    return v4
+}
+; run: %splat_i32x4_3([-2147483648 -1 0 2147483647]) == [-2147482624 1023 1024 -2147482625]
+
+function %splat_i64x2_2(i64x2) -> i64x2 {
+block0(v0: i64x2):
+    v1 = iconst.i64 -1
+    v2 = splat.i64x2 v1
+    v3 = iadd v0, v2
+    return v3
+}
+; run: %splat_i64x2_2([-1 0]) == [-2 -1]
+
+function %splat_f32x4_2(f32x4) -> f32x4 {
+block0(v0: f32x4):
+    v1 = f32const 0x1.5
+    v2 = splat.f32x4 v1
+    v3 = fadd v0, v2
+    return v3
+}
+; run: %splat_f32x4_2([0x0.0 NaN 0x1.0 0x2.0]) == [0x1.5 NaN 0x2.5 0x3.5]
+
+function %splat_f64x2_2(f64x2) -> f64x2 {
+block0(v0: f64x2):
+    v1 = f64const 0x7.5
+    v2 = splat.f64x2 v1
+    v3 = fadd v0, v2
+    return v3
+}
+; run: %splat_f64x2_2([0x0.0 0x1.0]) == [0x7.5 0x8.5]
+
+function %load_splat_i8x16(i8) -> i8x16 {
+ss0 = explicit_slot 8
+block0(v0: i8):
+    stack_store.i8 v0, ss0
+    v1 = stack_load.i8 ss0
+    v2 = splat.i8x16 v1
+    return v2
+}
+; run: %load_splat_i8x16(-1) == [-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
+; run: %load_splat_i8x16(0) == [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
+; run: %load_splat_i8x16(1) == [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
+
+function %load_splat_i16x8(i16) -> i16x8 {
+ss0 = explicit_slot 8
+block0(v0: i16):
+    stack_store.i16 v0, ss0
+    v1 = stack_load.i16 ss0
+    v2 = splat.i16x8 v1
+    return v2
+}
+; run: %load_splat_i16x8(-1) == [-1 -1 -1 -1 -1 -1 -1 -1]
+; run: %load_splat_i16x8(0) == [0 0 0 0 0 0 0 0]
+; run: %load_splat_i16x8(512) == [512 512 512 512 512 512 512 512]
+
+function %load_splat_i32x4(i32) -> i32x4 {
+ss0 = explicit_slot 8
+block0(v0: i32):
+    stack_store.i32 v0, ss0
+    v1 = stack_load.i32 ss0
+    v2 = splat.i32x4 v1
+    return v2
+}
+; run: %load_splat_i32x4(-1) == [-1 -1 -1 -1]
+; run: %load_splat_i32x4(0) == [0 0 0 0]
+; run: %load_splat_i32x4(2000000) == [2000000 2000000 2000000 2000000]
+
+function %load_splat_i64x2(i64) -> i64x2 {
+ss0 = explicit_slot 8
+block0(v0: i64):
+    stack_store.i64 v0, ss0
+    v1 = stack_load.i64 ss0
+    v2 = splat.i64x2 v1
+    return v2
+}
+; run: %load_splat_i64x2(-1) == [-1 -1]
+; run: %load_splat_i64x2(0) == [0 0]
+; run: %load_splat_i64x2(5000000000) == [5000000000 5000000000]
+
+function %load_splat_f32x4(f32) -> f32x4 {
+ss0 = explicit_slot 8
+block0(v0: f32):
+    stack_store.f32 v0, ss0
+    v1 = stack_load.f32 ss0
+    v2 = splat.f32x4 v1
+    return v2
+}
+; run: %load_splat_f32x4(-0x0.0) == [-0x0.0 -0x0.0 -0x0.0 -0x0.0]
+; run: %load_splat_f32x4(0x1.0) == [0x1.0 0x1.0 0x1.0 0x1.0]
+; run: %load_splat_f32x4(NaN) == [NaN NaN NaN NaN]
+
+function %load_splat_f64x2(f64) -> f64x2 {
+ss0 = explicit_slot 8
+block0(v0: f64):
+    stack_store.f64 v0, ss0
+    v1 = stack_load.f64 ss0
+    v2 = splat.f64x2 v1
+    return v2
+}
+; run: %load_splat_f64x2(0x0.0) == [0x0.0 0x0.0]
+; run: %load_splat_f64x2(0x2.0) == [0x2.0 0x2.0]
+; run: %load_splat_f64x2(NaN) == [NaN NaN]