AArch64: port misc ops to ISLE. (#4796)

* Add some precise-output compile tests for aarch64.

* AArch64: port misc ops to ISLE.

- get_pinned_reg / set_pinned_reg
- bitcast
- stack_addr
- extractlane
- insertlane
- vhigh_bits
- iadd_ifcout
- fcvt_low_from_sint
Chris Fallin
2022-08-29 12:56:39 -07:00
committed by GitHub
parent 6368c6b188
commit a6eb24bd4f
18 changed files with 1362 additions and 662 deletions


@@ -952,11 +952,19 @@
;; Helper for calculating the `ScalarSize` corresponding to a type
(decl scalar_size (Type) ScalarSize)
(rule (scalar_size $I8) (ScalarSize.Size8))
(rule (scalar_size $I16) (ScalarSize.Size16))
(rule (scalar_size $I32) (ScalarSize.Size32))
(rule (scalar_size $I64) (ScalarSize.Size64))
(rule (scalar_size $I128) (ScalarSize.Size128))
(rule (scalar_size $B8) (ScalarSize.Size8))
(rule (scalar_size $B16) (ScalarSize.Size16))
(rule (scalar_size $B32) (ScalarSize.Size32))
(rule (scalar_size $B64) (ScalarSize.Size64))
(rule (scalar_size $B128) (ScalarSize.Size128))
(rule (scalar_size $F32) (ScalarSize.Size32))
(rule (scalar_size $F64) (ScalarSize.Size64))
@@ -1452,6 +1460,9 @@
(decl pure lshl_from_imm64 (Type Imm64) ShiftOpAndAmt)
(extern constructor lshl_from_imm64 lshl_from_imm64)
(decl pure lshl_from_u64 (Type u64) ShiftOpAndAmt)
(extern constructor lshl_from_u64 lshl_from_u64)
(decl integral_ty (Type) Type)
(extern extractor integral_ty integral_ty)
@@ -1704,6 +1715,14 @@
(MInst.AluRRR (ALUOp.AddS) (operand_size ty) dst src1 src2)
dst)))
;; Helper for emitting `adds` instructions, setting flags in ambient
;; state. Used only for `iadd_ifcout`.
(decl add_with_flags (Type Reg Reg) Reg)
(rule (add_with_flags ty src1 src2)
(let ((dst WritableReg (temp_writable_reg $I64))
(_ Unit (emit (MInst.AluRRR (ALUOp.AddS) (operand_size ty) dst src1 src2))))
dst))
;; Helper for emitting `adc` instructions.
(decl adc_paired (Type Reg Reg) ConsumesFlags)
(rule (adc_paired ty src1 src2)
@@ -1927,6 +1946,13 @@
(_ Unit (emit (MInst.VecExtend op dst src high_half size))))
dst))
;; Helper for emitting `MInst.VecExtract` instructions.
(decl vec_extract (Reg Reg u8) Reg)
(rule (vec_extract src1 src2 idx)
(let ((dst WritableReg (temp_writable_reg $I8X16))
(_ Unit (emit (MInst.VecExtract dst src1 src2 idx))))
dst))
;; Helper for emitting `MInst.LoadAcquire` instructions.
(decl load_acquire (Type Reg) Reg)
(rule (load_acquire ty addr)
@@ -2118,6 +2144,10 @@
(decl addp (Reg Reg VectorSize) Reg)
(rule (addp x y size) (vec_rrr (VecALUOp.Addp) x y size))
;; Helper for generating `zip1` instructions.
(decl zip1 (Reg Reg VectorSize) Reg)
(rule (zip1 x y size) (vec_rrr (VecALUOp.Zip1) x y size))
;; Helper for generating vector `abs` instructions.
(decl vec_abs (Reg VectorSize) Reg)
(rule (vec_abs x size) (vec_misc (VecMisc2.Abs) x size))
@@ -2826,3 +2856,24 @@
(decl gen_call_indirect (SigRef Value ValueSlice) InstOutput)
(extern constructor gen_call_indirect gen_call_indirect)
;; Helpers for pinned register manipulation.
(decl writable_pinned_reg () WritableReg)
(extern constructor writable_pinned_reg writable_pinned_reg)
(decl pinned_reg () Reg)
(rule (pinned_reg) (writable_pinned_reg))
(decl write_pinned_reg (Reg) SideEffectNoResult)
(rule (write_pinned_reg val)
(let ((dst WritableReg (writable_pinned_reg)))
(SideEffectNoResult.Inst (gen_move $I64 dst val))))
;; Helpers for stackslot effective address generation.
(decl compute_stack_addr (StackSlot Offset32) Reg)
(rule (compute_stack_addr stack_slot offset)
(let ((dst WritableReg (temp_writable_reg $I64))
(_ Unit (emit (abi_stackslot_addr dst stack_slot offset))))
dst))


@@ -2030,3 +2030,212 @@
;; N.B.: the Ret itself is generated by the ABI.
(rule (lower (return args))
(lower_return (range 0 (value_slice_len args)) args))
;;; Rules for `{get,set}_pinned_reg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (get_pinned_reg))
(pinned_reg))
(rule (lower (set_pinned_reg val))
(side_effect (write_pinned_reg val)))
;;; Rules for `bitcast` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type $I32 (bitcast src @ (value_type $F32))))
(mov_from_vec src 0 (ScalarSize.Size32)))
(rule (lower (has_type $F32 (bitcast src @ (value_type $I32))))
(mov_to_fpu src (ScalarSize.Size32)))
(rule (lower (has_type $I64 (bitcast src @ (value_type $F64))))
(mov_from_vec src 0 (ScalarSize.Size64)))
(rule (lower (has_type $F64 (bitcast src @ (value_type $I64))))
(mov_to_fpu src (ScalarSize.Size64)))
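;; As a sanity model (ours, not part of the patch): these moves must
;; reinterpret bits between the integer and FP register files with no
;; numeric conversion, i.e. exactly Rust's `to_bits`/`from_bits`:
;;
;;   let f = 1.5f32;
;;   let bits: u32 = f.to_bits();          // `mov w0, v0.s[0]`
;;   assert_eq!(f32::from_bits(bits), f);  // `fmov s0, w0` restores it exactly
;;   let d = -0.0f64;
;;   assert_eq!(d.to_bits(), 0x8000_0000_0000_0000u64); // `mov x0, v0.d[0]`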
;;; Rules for `raw_bitcast` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (raw_bitcast val))
val)
;;; Rules for `extractlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; extractlane with lane 0 can pass through the value unchanged; upper
;; bits are undefined when a narrower type is in a wider register.
(rule (lower (has_type (ty_scalar_float _) (extractlane val (u8_from_uimm8 0))))
val)
(rule (lower (has_type (ty_int_bool ty)
(extractlane val
(u8_from_uimm8 lane))))
(mov_from_vec val lane (scalar_size ty)))
(rule (lower (has_type (ty_scalar_float ty)
(extractlane val @ (value_type vty)
(u8_from_uimm8 lane))))
(fpu_move_from_vec val lane (vector_size vty)))
;;; Rules for `insertlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (insertlane vec @ (value_type vty)
val @ (value_type (ty_int_bool _))
(u8_from_uimm8 lane)))
(mov_to_vec vec val lane (vector_size vty)))
(rule (lower (insertlane vec @ (value_type vty)
val @ (value_type (ty_scalar_float _))
(u8_from_uimm8 lane)))
(mov_vec_elem vec val lane 0 (vector_size vty)))
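;; A round-trip model of the two lane ops (an illustrative Rust sketch;
;; the function names are ours):
;;
;;   // insertlane: copy the vector to the destination, then overwrite one
;;   // lane (`mov vd.s[lane], wn` for ints).
;;   fn insertlane_i32x4(mut v: [i32; 4], x: i32, lane: usize) -> [i32; 4] {
;;       v[lane] = x;
;;       v
;;   }
;;   // extractlane: read one lane back (`mov wd, vn.s[lane]`); the upper
;;   // bits of the scalar register are undefined, which is why lane 0 of a
;;   // float vector can pass through unchanged.
;;   fn extractlane_i32x4(v: [i32; 4], lane: usize) -> i32 {
;;       v[lane]
;;   }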
;;; Rules for `copy` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (copy x))
x)
;;; Rules for `stack_addr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (stack_addr stack_slot offset))
(compute_stack_addr stack_slot offset))
;;; Rules for `vhigh_bits` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; The first three sequences each use one integer temporary and two
;; vector temporaries. The shift is done early so as to give the
;; register allocator the possibility of using the same register for
;; the shifted value and the source vector when this is its last use. See
;; https://github.com/WebAssembly/simd/pull/201 for the background and
;; derivation of these sequences. Alternative sequences are discussed
;; in https://github.com/bytecodealliance/wasmtime/issues/2296,
;; although they are not used here.
(rule (lower (vhigh_bits vec @ (value_type $I8X16)))
(let (
;; Replicate the MSB of each of the 16 byte lanes across
;; the whole lane (sshr is an arithmetic right shift).
(shifted Reg (vec_shift_imm (VecShiftImmOp.Sshr) 7 vec (VectorSize.Size8x16)))
;; Bitwise-and with a mask
;; `0x80402010_08040201_80402010_08040201` to get the bit
;; in the proper location for each group of 8 lanes.
(anded Reg (and_vec shifted (constant_f128 0x80402010_08040201_80402010_08040201) (VectorSize.Size8x16)))
;; Produce a version of `anded` with upper 8 lanes and
;; lower 8 lanes swapped.
(anded_swapped Reg (vec_extract anded anded 8))
;; Zip together the two; with the above this produces the lane permutation:
;; 15 7 14 6 13 5 12 4 11 3 10 2 9 1 8 0
(zipped Reg (zip1 anded anded_swapped (VectorSize.Size8x16)))
;; Add 16-bit lanes together ("add across vector"), so we
;; get, in the low 16 bits, 15+14+...+8 in the high byte
;; and 7+6+...+0 in the low byte. This effectively puts
;; the 16 MSBs together, giving our results.
;;
;; N.B.: `Size16x8` is not a typo!
(result Reg (addv zipped (VectorSize.Size16x8))))
(mov_from_vec result 0 (ScalarSize.Size16))))
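;; To make the derivation concrete, here is a scalar emulation of the
;; sequence above (an illustrative Rust sketch, not part of this file;
;; the function name is ours):
;;
;;   fn vhigh_bits_i8x16(lanes: [i8; 16]) -> u16 {
;;       // sshr #7: arithmetic shift replicates each lane's MSB (0x00 or 0xff).
;;       let shifted: [u8; 16] = lanes.map(|b| (b >> 7) as u8);
;;       // The 0x80402010_08040201 mask keeps one distinct bit per lane
;;       // within each 8-lane half.
;;       const MASK: [u8; 16] =
;;           [1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128];
;;       let mut anded = [0u8; 16];
;;       for i in 0..16 {
;;           anded[i] = shifted[i] & MASK[i];
;;       }
;;       // ext #8 swaps the 8-lane halves; zip1 interleaves the original
;;       // with the swapped copy, pairing lane i with lane i + 8.
;;       let mut zipped = [0u8; 16];
;;       for i in 0..8 {
;;           zipped[2 * i] = anded[i];
;;           zipped[2 * i + 1] = anded[i + 8];
;;       }
;;       // addv.8h: sum the eight little-endian 16-bit lanes. All set bits
;;       // are disjoint, so the sum ORs lane MSBs 0..7 into the low byte
;;       // and 8..15 into the high byte.
;;       (0..8)
;;           .map(|i| u16::from_le_bytes([zipped[2 * i], zipped[2 * i + 1]]))
;;           .sum()
;;   }
;;
;; For example, a vector whose lanes 0 and 15 are negative yields 0x8001.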
(rule (lower (vhigh_bits vec @ (value_type $I16X8)))
(let (
;; Replicate the MSB of each of the 8 16-bit lanes across
;; the whole lane (sshr is an arithmetic right shift).
(shifted Reg (vec_shift_imm (VecShiftImmOp.Sshr) 15 vec (VectorSize.Size16x8)))
;; Bitwise-and with a mask
;; `0x0080_0040_0020_0010_0008_0004_0002_0001` to get the
;; bit in the proper location for each of the 8 lanes.
(anded Reg (and_vec shifted (constant_f128 0x0080_0040_0020_0010_0008_0004_0002_0001) (VectorSize.Size16x8)))
;; Add lanes together to get the 8 MSBs in the low byte.
(result Reg (addv anded (VectorSize.Size16x8))))
(mov_from_vec result 0 (ScalarSize.Size16))))
(rule (lower (vhigh_bits vec @ (value_type $I32X4)))
(let (
;; Replicate the MSB of each of the 4 32-bit lanes across
;; the whole lane (sshr is an arithmetic right shift).
(shifted Reg (vec_shift_imm (VecShiftImmOp.Sshr) 31 vec (VectorSize.Size32x4)))
;; Bitwise-and with a mask
;; `0x00000008_00000004_00000002_00000001` to get the bit
;; in the proper location for each of the 4 lanes.
(anded Reg (and_vec shifted (constant_f128 0x00000008_00000004_00000002_00000001) (VectorSize.Size32x4)))
;; Add lanes together to get the 4 MSBs in the low byte.
(result Reg (addv anded (VectorSize.Size32x4))))
(mov_from_vec result 0 (ScalarSize.Size32))))
(rule (lower (vhigh_bits vec @ (value_type $I64X2)))
(let (
;; Grab the MSB out of each of the two lanes, right-shift
;; each down to the LSB, and add them, with the upper
;; lane's MSB shifted back up to bit 1.
(upper_msb Reg (mov_from_vec vec 1 (ScalarSize.Size64)))
(lower_msb Reg (mov_from_vec vec 0 (ScalarSize.Size64)))
(upper_msb Reg (lsr_imm $I64 upper_msb (imm_shift_from_u8 63)))
(lower_msb Reg (lsr_imm $I64 lower_msb (imm_shift_from_u8 63))))
(add_shift $I64 lower_msb upper_msb (lshl_from_u64 $I64 1))))
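;; The same computation as a scalar sketch (illustrative only; the
;; function name is ours):
;;
;;   fn vhigh_bits_i64x2(lanes: [i64; 2]) -> u64 {
;;       let lower_msb = (lanes[0] as u64) >> 63; // lsr #63
;;       let upper_msb = (lanes[1] as u64) >> 63; // lsr #63
;;       lower_msb + (upper_msb << 1)             // add x, x, x, LSL #1
;;   }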
;;; Rules for `iadd_ifcout` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; This is a two-output instruction that is needed for the
;; legalizer's explicit heap-check sequence, among possible other
;; uses. Its second output is a flags output only ever meant to
;; check for overflow using the
;; `backend.unsigned_add_overflow_condition()` condition.
;;
;; Note that the CLIF validation will ensure that no flag-setting
;; operation comes between this IaddIfcout and its use (e.g., a
;; Trapif). Thus, we can rely on implicit communication through the
;; processor flags rather than explicitly generating flags into a
;; register. We simply use the variant of the add instruction that
;; sets flags (`adds`) here.
;;
;; Note that the second output (the flags) need not be generated,
;; because flags are never materialized into a register; the only
;; instructions that can use a value of type `iflags` or `fflags`
;; will look directly for the flags-producing instruction (which can
;; always be found, by construction) and merge it.
;;
;; So we handle the iadd as a normal add, except that we use an AddS
;; opcode (`adds`) that sets flags.
(rule (lower (has_type (ty_int ty)
(iadd_ifcout a b)))
(output_pair
(add_with_flags ty a b)
(invalid_reg)))
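;; A scalar model of that contract (a sketch, not backend code): the value
;; result is the wrapping sum, and the flags "result" is the carry-out that
;; `adds` leaves in PSTATE.C for the merged consumer to test:
;;
;;   fn iadd_ifcout_model(a: u64, b: u64) -> (u64, bool) {
;;       let (sum, carry_out) = a.overflowing_add(b);
;;       (sum, carry_out) // e.g. (u64::MAX, 1) gives (0, true)
;;   }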
;;; Rules for `tls_value` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; TODO.
;;; Rules for `fcvt_low_from_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type $F64X2 (fcvt_low_from_sint val)))
(let ((extended Reg (vec_extend (VecExtendOp.Sxtl) val $false (ScalarSize.Size64)))
(converted Reg (vec_misc (VecMisc2.Scvtf) extended (VectorSize.Size64x2))))
converted))
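;; A scalar model of this rule (illustrative; the function name is ours):
;; only the low two lanes contribute.
;;
;;   fn fcvt_low_from_sint_model(v: [i32; 4]) -> [f64; 2] {
;;       [v[0] as f64, v[1] as f64] // sxtl then scvtf, per lane
;;   }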
;;; Rules for `fvpromote_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (fvpromote_low val))
(vec_rr_long (VecRRLongOp.Fcvtl32) val $false))
;;; Rules for `select` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; TODO: requires icmp/fcmp first.
;;; Rules for `selectif` / `selectif_spectre_guard` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; TODO: requires icmp/fcmp first.
;;; Rules for `trueif` / `trueff` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; TODO: requires icmp/fcmp first.
;;; Rules for `brz`/`brnz`/`brif`/`brff`/`bricmp` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; TODO: requires icmp/fcmp first.
;;; Rules for `jump` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; TODO.
;;; Rules for `br_table` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; TODO.


@@ -128,7 +128,11 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> {
}
fn lshl_from_imm64(&mut self, ty: Type, n: Imm64) -> Option<ShiftOpAndAmt> {
-let shiftimm = ShiftOpShiftImm::maybe_from_shift(n.bits() as u64)?;
+self.lshl_from_u64(ty, n.bits() as u64)
}
fn lshl_from_u64(&mut self, ty: Type, n: u64) -> Option<ShiftOpAndAmt> {
let shiftimm = ShiftOpShiftImm::maybe_from_shift(n)?;
let shiftee_bits = ty_bits(ty);
if shiftee_bits <= std::u8::MAX as usize {
let shiftimm = shiftimm.mask(shiftee_bits as u8);
@@ -722,4 +726,8 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> {
);
}
}
fn writable_pinned_reg(&mut self) -> WritableReg {
super::regs::writable_xreg(super::regs::PINNED_REG)
}
}


@@ -4,7 +4,7 @@ use super::lower::*;
use crate::binemit::CodeOffset;
use crate::ir::types::*;
use crate::ir::Inst as IRInst;
-use crate::ir::{InstructionData, Opcode};
+use crate::ir::Opcode;
use crate::isa::aarch64::inst::*;
use crate::isa::aarch64::settings as aarch64_settings;
use crate::machinst::lower::*;
@@ -13,7 +13,6 @@ use crate::settings::{Flags, TlsModel};
use crate::{CodegenError, CodegenResult};
use alloc::boxed::Box;
use alloc::vec::Vec;
-use core::convert::TryFrom;
use target_lexicon::Triple;
/// Actually codegen an instruction's results into registers.
@@ -231,23 +230,7 @@ pub(crate) fn lower_insn_to_regs(
}
}
-Opcode::StackAddr => {
+Opcode::StackAddr => implemented_in_isle(ctx),
let (stack_slot, offset) = match *ctx.data(insn) {
InstructionData::StackLoad {
opcode: Opcode::StackAddr,
stack_slot,
offset,
} => (stack_slot, offset),
_ => unreachable!(),
};
let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let offset: i32 = offset.into();
assert!(ctx.abi().sized_stackslot_offsets().is_valid(stack_slot));
let inst =
ctx.abi()
.sized_stackslot_addr(stack_slot, u32::try_from(offset).unwrap(), rd);
ctx.emit(inst);
}
Opcode::DynamicStackAddr => implemented_in_isle(ctx),
@@ -421,52 +404,7 @@ pub(crate) fn lower_insn_to_regs(
Opcode::Bint => implemented_in_isle(ctx),
-Opcode::Bitcast => {
+Opcode::Bitcast => implemented_in_isle(ctx),
let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let ity = ctx.input_ty(insn, 0);
let oty = ctx.output_ty(insn, 0);
let ity_bits = ty_bits(ity);
let ity_vec_reg = ty_has_float_or_vec_representation(ity);
let oty_bits = ty_bits(oty);
let oty_vec_reg = ty_has_float_or_vec_representation(oty);
debug_assert_eq!(ity_bits, oty_bits);
match (ity_vec_reg, oty_vec_reg) {
(true, true) => {
let narrow_mode = if ity_bits <= 32 {
NarrowValueMode::ZeroExtend32
} else {
NarrowValueMode::ZeroExtend64
};
let rm = put_input_in_reg(ctx, inputs[0], narrow_mode);
ctx.emit(Inst::gen_move(rd, rm, oty));
}
(false, false) => {
let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
ctx.emit(Inst::gen_move(rd, rm, oty));
}
(false, true) => {
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend64);
ctx.emit(Inst::MovToFpu {
rd,
rn,
size: ScalarSize::Size64,
});
}
(true, false) => {
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
let size = ScalarSize::from_bits(oty_bits);
ctx.emit(Inst::MovFromVec {
rd,
rn,
idx: 0,
size,
});
}
}
}
Opcode::Return => implemented_in_isle(ctx),
@@ -556,15 +494,7 @@ pub(crate) fn lower_insn_to_regs(
Opcode::Call | Opcode::CallIndirect => implemented_in_isle(ctx),
-Opcode::GetPinnedReg => {
+Opcode::GetPinnedReg | Opcode::SetPinnedReg => implemented_in_isle(ctx),
let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
ctx.emit(Inst::gen_move(rd, xreg(PINNED_REG), I64));
}
Opcode::SetPinnedReg => {
let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
ctx.emit(Inst::gen_move(writable_xreg(PINNED_REG), rm, I64));
}
Opcode::Jump
| Opcode::Brz
@@ -578,67 +508,11 @@ pub(crate) fn lower_insn_to_regs(
Opcode::Vconst => implemented_in_isle(ctx),
-Opcode::RawBitcast => {
+Opcode::RawBitcast => implemented_in_isle(ctx),
let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let ty = ctx.input_ty(insn, 0);
ctx.emit(Inst::gen_move(rd, rm, ty));
}
-Opcode::Extractlane => {
+Opcode::Extractlane => implemented_in_isle(ctx),
if let InstructionData::BinaryImm8 { imm, .. } = ctx.data(insn) {
let idx = *imm;
let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
let input_ty = ctx.input_ty(insn, 0);
let size = VectorSize::from_ty(input_ty);
let ty = ty.unwrap();
if ty_has_int_representation(ty) {
ctx.emit(Inst::MovFromVec {
rd,
rn,
idx,
size: size.lane_size(),
});
// Plain moves are faster on some processors.
} else if idx == 0 {
ctx.emit(Inst::gen_move(rd, rn, ty));
} else {
ctx.emit(Inst::FpuMoveFromVec { rd, rn, idx, size });
}
} else {
unreachable!();
}
}
-Opcode::Insertlane => {
+Opcode::Insertlane => implemented_in_isle(ctx),
let idx = if let InstructionData::TernaryImm8 { imm, .. } = ctx.data(insn) {
*imm
} else {
unreachable!();
};
let input_ty = ctx.input_ty(insn, 1);
let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
let ty = ty.unwrap();
let size = VectorSize::from_ty(ty);
ctx.emit(Inst::gen_move(rd, rm, ty));
if ty_has_int_representation(input_ty) {
ctx.emit(Inst::MovToVec { rd, rn, idx, size });
} else {
ctx.emit(Inst::VecMovElement {
rd,
rn,
dest_idx: idx,
src_idx: 0,
size,
});
}
}
Opcode::Splat => implemented_in_isle(ctx),
@@ -646,240 +520,7 @@ pub(crate) fn lower_insn_to_regs(
Opcode::VallTrue | Opcode::VanyTrue => implemented_in_isle(ctx),
-Opcode::VhighBits => {
+Opcode::VhighBits => implemented_in_isle(ctx),
let dst_r = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let src_v = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
let ty = ctx.input_ty(insn, 0);
// All three sequences use one integer temporary and two vector temporaries. The
// shift is done early so as to give the register allocator the possibility of using
// the same reg for `tmp_v1` and `src_v` in the case that this is the last use of
// `src_v`. See https://github.com/WebAssembly/simd/pull/201 for the background and
// derivation of these sequences. Alternative sequences are discussed in
// https://github.com/bytecodealliance/wasmtime/issues/2296, although they are not
// used here.
let tmp_r0 = ctx.alloc_tmp(I64).only_reg().unwrap();
let tmp_v0 = ctx.alloc_tmp(I8X16).only_reg().unwrap();
let tmp_v1 = ctx.alloc_tmp(I8X16).only_reg().unwrap();
match ty {
I8X16 => {
// sshr tmp_v1.16b, src_v.16b, #7
// mov tmp_r0, #0x0201
// movk tmp_r0, #0x0804, lsl 16
// movk tmp_r0, #0x2010, lsl 32
// movk tmp_r0, #0x8040, lsl 48
// dup tmp_v0.2d, tmp_r0
// and tmp_v1.16b, tmp_v1.16b, tmp_v0.16b
// ext tmp_v0.16b, tmp_v1.16b, tmp_v1.16b, #8
// zip1 tmp_v0.16b, tmp_v1.16b, tmp_v0.16b
// addv tmp_v0h, tmp_v0.8h
// mov dst_r, tmp_v0.h[0]
ctx.emit(Inst::VecShiftImm {
op: VecShiftImmOp::Sshr,
rd: tmp_v1,
rn: src_v,
size: VectorSize::Size8x16,
imm: 7,
});
lower_splat_const(ctx, tmp_v0, 0x8040201008040201u64, VectorSize::Size64x2);
ctx.emit(Inst::VecRRR {
alu_op: VecALUOp::And,
rd: tmp_v1,
rn: tmp_v1.to_reg(),
rm: tmp_v0.to_reg(),
size: VectorSize::Size8x16,
});
ctx.emit(Inst::VecExtract {
rd: tmp_v0,
rn: tmp_v1.to_reg(),
rm: tmp_v1.to_reg(),
imm4: 8,
});
ctx.emit(Inst::VecRRR {
alu_op: VecALUOp::Zip1,
rd: tmp_v0,
rn: tmp_v1.to_reg(),
rm: tmp_v0.to_reg(),
size: VectorSize::Size8x16,
});
ctx.emit(Inst::VecLanes {
op: VecLanesOp::Addv,
rd: tmp_v0,
rn: tmp_v0.to_reg(),
size: VectorSize::Size16x8,
});
ctx.emit(Inst::MovFromVec {
rd: dst_r,
rn: tmp_v0.to_reg(),
idx: 0,
size: ScalarSize::Size16,
});
}
I16X8 => {
// sshr tmp_v1.8h, src_v.8h, #15
// mov tmp_r0, #0x1
// movk tmp_r0, #0x2, lsl 16
// movk tmp_r0, #0x4, lsl 32
// movk tmp_r0, #0x8, lsl 48
// dup tmp_v0.2d, tmp_r0
// shl tmp_r0, tmp_r0, #4
// mov tmp_v0.d[1], tmp_r0
// and tmp_v0.16b, tmp_v1.16b, tmp_v0.16b
// addv tmp_v0h, tmp_v0.8h
// mov dst_r, tmp_v0.h[0]
ctx.emit(Inst::VecShiftImm {
op: VecShiftImmOp::Sshr,
rd: tmp_v1,
rn: src_v,
size: VectorSize::Size16x8,
imm: 15,
});
lower_constant_u64(ctx, tmp_r0, 0x0008000400020001u64);
ctx.emit(Inst::VecDup {
rd: tmp_v0,
rn: tmp_r0.to_reg(),
size: VectorSize::Size64x2,
});
ctx.emit(Inst::AluRRImmShift {
alu_op: ALUOp::Lsl,
size: OperandSize::Size64,
rd: tmp_r0,
rn: tmp_r0.to_reg(),
immshift: ImmShift { imm: 4 },
});
ctx.emit(Inst::MovToVec {
rd: tmp_v0,
rn: tmp_r0.to_reg(),
idx: 1,
size: VectorSize::Size64x2,
});
ctx.emit(Inst::VecRRR {
alu_op: VecALUOp::And,
rd: tmp_v0,
rn: tmp_v1.to_reg(),
rm: tmp_v0.to_reg(),
size: VectorSize::Size8x16,
});
ctx.emit(Inst::VecLanes {
op: VecLanesOp::Addv,
rd: tmp_v0,
rn: tmp_v0.to_reg(),
size: VectorSize::Size16x8,
});
ctx.emit(Inst::MovFromVec {
rd: dst_r,
rn: tmp_v0.to_reg(),
idx: 0,
size: ScalarSize::Size16,
});
}
I32X4 => {
// sshr tmp_v1.4s, src_v.4s, #31
// mov tmp_r0, #0x1
// movk tmp_r0, #0x2, lsl 32
// dup tmp_v0.2d, tmp_r0
// shl tmp_r0, tmp_r0, #2
// mov tmp_v0.d[1], tmp_r0
// and tmp_v0.16b, tmp_v1.16b, tmp_v0.16b
// addv tmp_v0s, tmp_v0.4s
// mov dst_r, tmp_v0.s[0]
ctx.emit(Inst::VecShiftImm {
op: VecShiftImmOp::Sshr,
rd: tmp_v1,
rn: src_v,
size: VectorSize::Size32x4,
imm: 31,
});
lower_constant_u64(ctx, tmp_r0, 0x0000000200000001u64);
ctx.emit(Inst::VecDup {
rd: tmp_v0,
rn: tmp_r0.to_reg(),
size: VectorSize::Size64x2,
});
ctx.emit(Inst::AluRRImmShift {
alu_op: ALUOp::Lsl,
size: OperandSize::Size64,
rd: tmp_r0,
rn: tmp_r0.to_reg(),
immshift: ImmShift { imm: 2 },
});
ctx.emit(Inst::MovToVec {
rd: tmp_v0,
rn: tmp_r0.to_reg(),
idx: 1,
size: VectorSize::Size64x2,
});
ctx.emit(Inst::VecRRR {
alu_op: VecALUOp::And,
rd: tmp_v0,
rn: tmp_v1.to_reg(),
rm: tmp_v0.to_reg(),
size: VectorSize::Size8x16,
});
ctx.emit(Inst::VecLanes {
op: VecLanesOp::Addv,
rd: tmp_v0,
rn: tmp_v0.to_reg(),
size: VectorSize::Size32x4,
});
ctx.emit(Inst::MovFromVec {
rd: dst_r,
rn: tmp_v0.to_reg(),
idx: 0,
size: ScalarSize::Size32,
});
}
I64X2 => {
// mov dst_r, src_v.d[0]
// mov tmp_r0, src_v.d[1]
// lsr dst_r, dst_r, #63
// lsr tmp_r0, tmp_r0, #63
// add dst_r, dst_r, tmp_r0, lsl #1
ctx.emit(Inst::MovFromVec {
rd: dst_r,
rn: src_v,
idx: 0,
size: ScalarSize::Size64,
});
ctx.emit(Inst::MovFromVec {
rd: tmp_r0,
rn: src_v,
idx: 1,
size: ScalarSize::Size64,
});
ctx.emit(Inst::AluRRImmShift {
alu_op: ALUOp::Lsr,
size: OperandSize::Size64,
rd: dst_r,
rn: dst_r.to_reg(),
immshift: ImmShift::maybe_from_u64(63).unwrap(),
});
ctx.emit(Inst::AluRRImmShift {
alu_op: ALUOp::Lsr,
size: OperandSize::Size64,
rd: tmp_r0,
rn: tmp_r0.to_reg(),
immshift: ImmShift::maybe_from_u64(63).unwrap(),
});
ctx.emit(Inst::AluRRRShift {
alu_op: ALUOp::Add,
size: OperandSize::Size32,
rd: dst_r,
rn: dst_r.to_reg(),
rm: tmp_r0.to_reg(),
shiftop: ShiftOpAndAmt::new(
ShiftOp::LSL,
ShiftOpShiftImm::maybe_from_shift(1).unwrap(),
),
});
}
_ => {
return Err(CodegenError::Unsupported(format!(
"VhighBits: Unsupported type: {:?}",
ty
)))
}
}
}
Opcode::Shuffle => implemented_in_isle(ctx),
@@ -917,34 +558,7 @@ pub(crate) fn lower_insn_to_regs(
Opcode::FcvtToUintSat | Opcode::FcvtToSintSat => implemented_in_isle(ctx),
-Opcode::IaddIfcout => {
+Opcode::IaddIfcout => implemented_in_isle(ctx),
// This is a two-output instruction that is needed for the
// legalizer's explicit heap-check sequence, among possible other
// uses. Its second output is a flags output only ever meant to
// check for overflow using the
// `backend.unsigned_add_overflow_condition()` condition.
//
// Note that the CLIF validation will ensure that no flag-setting
// operation comes between this IaddIfcout and its use (e.g., a
// Trapif). Thus, we can rely on implicit communication through the
// processor flags rather than explicitly generating flags into a
// register. We simply use the variant of the add instruction that
// sets flags (`adds`) here.
// Note that the second output (the flags) need not be generated,
// because flags are never materialized into a register; the only
// instructions that can use a value of type `iflags` or `fflags`
// will look directly for the flags-producing instruction (which can
// always be found, by construction) and merge it.
// Now handle the iadd as above, except use an AddS opcode that sets
// flags.
let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
let rm = put_input_in_rse_imm12(ctx, inputs[1], NarrowValueMode::None);
let ty = ty.unwrap();
ctx.emit(alu_inst_imm12(ALUOp::AddS, ty, rd, rn, rm));
}
Opcode::IaddImm
| Opcode::ImulImm
@@ -1006,47 +620,9 @@ pub(crate) fn lower_insn_to_regs(
Opcode::SqmulRoundSat => implemented_in_isle(ctx),
-Opcode::FcvtLowFromSint => {
+Opcode::FcvtLowFromSint => implemented_in_isle(ctx),
let ty = ty.unwrap();
if ty != F64X2 {
return Err(CodegenError::Unsupported(format!(
"FcvtLowFromSint: Unsupported type: {:?}",
ty
)));
}
let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
ctx.emit(Inst::VecExtend {
t: VecExtendOp::Sxtl,
rd,
rn,
high_half: false,
lane_size: ScalarSize::Size64,
});
ctx.emit(Inst::VecMisc {
op: VecMisc2::Scvtf,
rd,
rn: rd.to_reg(),
size: VectorSize::Size64x2,
});
}
-Opcode::FvpromoteLow => {
+Opcode::FvpromoteLow => implemented_in_isle(ctx),
debug_assert_eq!(ty.unwrap(), F64X2);
let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
ctx.emit(Inst::VecRRLong {
op: VecRRLongOp::Fcvtl32,
rd,
rn,
high_half: false,
});
}
Opcode::Fvdemote => implemented_in_isle(ctx),


@@ -371,6 +371,15 @@ macro_rules! isle_prelude_methods {
ty.is_int().then(|| ty)
}
#[inline]
fn ty_int_bool(&mut self, ty: Type) -> Option<Type> {
if ty.is_int() || ty.is_bool() {
Some(ty)
} else {
None
}
}
#[inline]
fn ty_scalar_float(&mut self, ty: Type) -> Option<Type> {
match ty {
@@ -379,6 +388,15 @@ macro_rules! isle_prelude_methods {
}
}
#[inline]
fn ty_float_or_vec(&mut self, ty: Type) -> Option<Type> {
match ty {
F32 | F64 => Some(ty),
ty if ty.is_vector() => Some(ty),
_ => None,
}
}
#[inline]
fn ty_vec64(&mut self, ty: Type) -> Option<Type> {
if ty.is_vector() && ty.bits() == 64 {


@@ -365,6 +365,10 @@
(decl ty_int_bool_128 (Type) Type)
(extern extractor ty_int_bool_128 ty_int_bool_128)
;; An extractor that matches any int or bool.
(decl ty_int_bool (Type) Type)
(extern extractor ty_int_bool ty_int_bool)
;; An extractor that only matches integers.
(decl ty_int (Type) Type)
(extern extractor ty_int ty_int)
@@ -373,6 +377,10 @@
(decl ty_scalar_float (Type) Type)
(extern extractor ty_scalar_float ty_scalar_float)
;; An extractor that matches scalar floating-point types or vector types.
(decl ty_float_or_vec (Type) Type)
(extern extractor ty_float_or_vec ty_float_or_vec)
;; A pure constructor that only matches 64-bit vector types.
(decl pure ty_vec64 (Type) Type)
(extern constructor ty_vec64 ty_vec64)


@@ -0,0 +1,43 @@
test compile precise-output
target aarch64
function %f1(f32) -> i32 {
block0(v0: f32):
v1 = bitcast.i32 v0
return v1
}
; block0:
; mov w0, v0.s[0]
; ret
function %f2(i32) -> f32 {
block0(v0: i32):
v1 = bitcast.f32 v0
return v1
}
; block0:
; fmov s0, w0
; ret
function %f3(f64) -> i64 {
block0(v0: f64):
v1 = bitcast.i64 v0
return v1
}
; block0:
; mov x0, v0.d[0]
; ret
function %f4(i64) -> f64 {
block0(v0: i64):
v1 = bitcast.f64 v0
return v1
}
; block0:
; fmov d0, x0
; ret


@@ -15,9 +15,9 @@ block0:
; mov fp, sp
; sub sp, sp, #16
; block0:
-; mov x0, sp
+; mov x1, sp
; movz x2, #1
-; str x2, [x0]
+; str x2, [x1]
; add sp, sp, #16
; ldp fp, lr, [sp], #16
; ret
@@ -36,9 +36,9 @@ block0:
; mov fp, sp
; sub sp, sp, #16
; block0:
-; mov x0, sp
+; mov x1, sp
; movz x2, #1
-; str x2, [x0]
+; str x2, [x1]
; add sp, sp, #16
; ldp fp, lr, [sp], #16
; ret


@@ -0,0 +1,461 @@
test compile precise-output
target aarch64
function %f1(i8) -> f32 {
block0(v0: i8):
v1 = fcvt_from_sint.f32 v0
return v1
}
; block0:
; sxtb w3, w0
; scvtf s0, w3
; ret
function %f2(i16) -> f32 {
block0(v0: i16):
v1 = fcvt_from_sint.f32 v0
return v1
}
; block0:
; sxth w3, w0
; scvtf s0, w3
; ret
function %f3(i32) -> f32 {
block0(v0: i32):
v1 = fcvt_from_sint.f32 v0
return v1
}
; block0:
; scvtf s0, w0
; ret
function %f4(i64) -> f32 {
block0(v0: i64):
v1 = fcvt_from_sint.f32 v0
return v1
}
; block0:
; scvtf s0, x0
; ret
function %f5(i8) -> f64 {
block0(v0: i8):
v1 = fcvt_from_sint.f64 v0
return v1
}
; block0:
; sxtb w3, w0
; scvtf d0, w3
; ret
function %f6(i16) -> f64 {
block0(v0: i16):
v1 = fcvt_from_sint.f64 v0
return v1
}
; block0:
; sxth w3, w0
; scvtf d0, w3
; ret
function %f7(i32) -> f64 {
block0(v0: i32):
v1 = fcvt_from_sint.f64 v0
return v1
}
; block0:
; scvtf d0, w0
; ret
function %f8(i64) -> f64 {
block0(v0: i64):
v1 = fcvt_from_sint.f64 v0
return v1
}
; block0:
; scvtf d0, x0
; ret
function %f9(i32x4) -> f64x2 {
block0(v0: i32x4):
v1 = fcvt_low_from_sint.f64x2 v0
return v1
}
; block0:
; sxtl v3.2d, v0.2s
; scvtf v0.2d, v3.2d
; ret
function %f10(i8, i16, i32, i64) -> f32 {
block0(v0: i8, v1: i16, v2: i32, v3: i64):
v4 = fcvt_from_uint.f32 v0
v5 = fcvt_from_uint.f32 v1
v6 = fcvt_from_uint.f32 v2
v7 = fcvt_from_uint.f32 v3
v8 = fadd.f32 v4, v5
v9 = fadd.f32 v8, v6
v10 = fadd.f32 v9, v7
return v10
}
; block0:
; uxtb w0, w0
; ucvtf s26, w0
; uxth w0, w1
; ucvtf s27, w0
; ucvtf s25, w2
; ucvtf s28, x3
; fadd s26, s26, s27
; fadd s25, s26, s25
; fadd s0, s25, s28
; ret
function %f11(i32x4) -> f64x2 {
block0(v0: i32x4):
v1 = uwiden_low v0
v2 = fcvt_from_uint.f64x2 v1
return v2
}
; block0:
; uxtl v4.2d, v0.2s
; ucvtf v0.2d, v4.2d
; ret
function %f12(i32x4) -> f32x4 {
block0(v0: i32x4):
v1 = fcvt_from_uint.f32x4 v0
return v1
}
; block0:
; ucvtf v0.4s, v0.4s
; ret
function %f13(f32) -> i32 {
block0(v0: f32):
v1 = fcvt_to_uint.i32 v0
return v1
}
; block0:
; fcmp s0, s0
; b.vc 8 ; udf
; fmov s5, #-1
; fcmp s0, s5
; b.gt 8 ; udf
; movz x10, #20352, LSL #16
; fmov s18, w10
; fcmp s0, s18
; b.lt 8 ; udf
; fcvtzu w0, s0
; ret
function %f14(f32) -> i64 {
block0(v0: f32):
v1 = fcvt_to_uint.i64 v0
return v1
}
; block0:
; fcmp s0, s0
; b.vc 8 ; udf
; fmov s5, #-1
; fcmp s0, s5
; b.gt 8 ; udf
; movz x10, #24448, LSL #16
; fmov s18, w10
; fcmp s0, s18
; b.lt 8 ; udf
; fcvtzu x0, s0
; ret
function %f15(f64) -> i32 {
block0(v0: f64):
v1 = fcvt_to_uint.i32 v0
return v1
}
; block0:
; fcmp d0, d0
; b.vc 8 ; udf
; fmov d5, #-1
; fcmp d0, d5
; b.gt 8 ; udf
; movz x10, #16880, LSL #48
; fmov d18, x10
; fcmp d0, d18
; b.lt 8 ; udf
; fcvtzu w0, d0
; ret
function %f16(f64) -> i64 {
block0(v0: f64):
v1 = fcvt_to_uint.i64 v0
return v1
}
; block0:
; fcmp d0, d0
; b.vc 8 ; udf
; fmov d5, #-1
; fcmp d0, d5
; b.gt 8 ; udf
; movz x10, #17392, LSL #48
; fmov d18, x10
; fcmp d0, d18
; b.lt 8 ; udf
; fcvtzu x0, d0
; ret
function %f17(f32) -> i32 {
block0(v0: f32):
v1 = fcvt_to_uint_sat.i32 v0
return v1
}
; block0:
; movz x4, #20352, LSL #16
; fmov s4, w4
; fmin s7, s0, s4
; movi v17.2s, #0
; fmax s19, s7, s17
; fcmp s0, s0
; fcsel s22, s17, s19, ne
; fcvtzu w0, s22
; ret
function %f18(f32) -> i64 {
block0(v0: f32):
v1 = fcvt_to_uint_sat.i64 v0
return v1
}
; block0:
; movz x4, #24448, LSL #16
; fmov s4, w4
; fmin s7, s0, s4
; movi v17.2s, #0
; fmax s19, s7, s17
; fcmp s0, s0
; fcsel s22, s17, s19, ne
; fcvtzu x0, s22
; ret
function %f19(f64) -> i32 {
block0(v0: f64):
v1 = fcvt_to_uint_sat.i32 v0
return v1
}
; block0:
; ldr d3, pc+8 ; b 12 ; data.f64 4294967295
; fmin d5, d0, d3
; movi v7.2s, #0
; fmax d17, d5, d7
; fcmp d0, d0
; fcsel d20, d7, d17, ne
; fcvtzu w0, d20
; ret
function %f20(f64) -> i64 {
block0(v0: f64):
v1 = fcvt_to_uint_sat.i64 v0
return v1
}
; block0:
; movz x4, #17392, LSL #48
; fmov d4, x4
; fmin d7, d0, d4
; movi v17.2s, #0
; fmax d19, d7, d17
; fcmp d0, d0
; fcsel d22, d17, d19, ne
; fcvtzu x0, d22
; ret
function %f21(f32) -> i32 {
block0(v0: f32):
v1 = fcvt_to_sint.i32 v0
return v1
}
; block0:
; fcmp s0, s0
; b.vc 8 ; udf
; movz x6, #52992, LSL #16
; fmov s6, w6
; fcmp s0, s6
; b.ge 8 ; udf
; movz x12, #20224, LSL #16
; fmov s20, w12
; fcmp s0, s20
; b.lt 8 ; udf
; fcvtzs w0, s0
; ret
function %f22(f32) -> i64 {
block0(v0: f32):
v1 = fcvt_to_sint.i64 v0
return v1
}
; block0:
; fcmp s0, s0
; b.vc 8 ; udf
; movz x6, #57088, LSL #16
; fmov s6, w6
; fcmp s0, s6
; b.ge 8 ; udf
; movz x12, #24320, LSL #16
; fmov s20, w12
; fcmp s0, s20
; b.lt 8 ; udf
; fcvtzs x0, s0
; ret
function %f23(f64) -> i32 {
block0(v0: f64):
v1 = fcvt_to_sint.i32 v0
return v1
}
; block0:
; fcmp d0, d0
; b.vc 8 ; udf
; ldr d5, pc+8 ; b 12 ; data.f64 -2147483649
; fcmp d0, d5
; b.gt 8 ; udf
; movz x10, #16864, LSL #48
; fmov d18, x10
; fcmp d0, d18
; b.lt 8 ; udf
; fcvtzs w0, d0
; ret
function %f24(f64) -> i64 {
block0(v0: f64):
v1 = fcvt_to_sint.i64 v0
return v1
}
; block0:
; fcmp d0, d0
; b.vc 8 ; udf
; movz x6, #50144, LSL #48
; fmov d6, x6
; fcmp d0, d6
; b.ge 8 ; udf
; movz x12, #17376, LSL #48
; fmov d20, x12
; fcmp d0, d20
; b.lt 8 ; udf
; fcvtzs x0, d0
; ret
function %f25(f32) -> i32 {
block0(v0: f32):
v1 = fcvt_to_sint_sat.i32 v0
return v1
}
; block0:
; movz x4, #20224, LSL #16
; fmov s4, w4
; fmin s7, s0, s4
; movz x10, #52992, LSL #16
; fmov s18, w10
; fmax s21, s7, s18
; movi v23.16b, #0
; fcmp s0, s0
; fcsel s26, s23, s21, ne
; fcvtzs w0, s26
; ret
function %f26(f32) -> i64 {
block0(v0: f32):
v1 = fcvt_to_sint_sat.i64 v0
return v1
}
; block0:
; movz x4, #24320, LSL #16
; fmov s4, w4
; fmin s7, s0, s4
; movz x10, #57088, LSL #16
; fmov s18, w10
; fmax s21, s7, s18
; movi v23.16b, #0
; fcmp s0, s0
; fcsel s26, s23, s21, ne
; fcvtzs x0, s26
; ret
function %f27(f64) -> i32 {
block0(v0: f64):
v1 = fcvt_to_sint_sat.i32 v0
return v1
}
; block0:
; ldr d3, pc+8 ; b 12 ; data.f64 2147483647
; fmin d5, d0, d3
; movz x8, #49632, LSL #48
; fmov d16, x8
; fmax d19, d5, d16
; movi v21.16b, #0
; fcmp d0, d0
; fcsel d24, d21, d19, ne
; fcvtzs w0, d24
; ret
function %f28(f64) -> i64 {
block0(v0: f64):
v1 = fcvt_to_sint_sat.i64 v0
return v1
}
; block0:
; movz x4, #17376, LSL #48
; fmov d4, x4
; fmin d7, d0, d4
; movz x10, #50144, LSL #48
; fmov d18, x10
; fmax d21, d7, d18
; movi v23.16b, #0
; fcmp d0, d0
; fcsel d26, d23, d21, ne
; fcvtzs x0, d26
; ret
function %f29(f32x4) -> i32x4 {
block0(v0: f32x4):
v1 = fcvt_to_uint_sat.i32x4 v0
return v1
}
; block0:
; fcvtzu v0.4s, v0.4s
; ret
function %f30(f32x4) -> i32x4 {
block0(v0: f32x4):
v1 = fcvt_to_sint_sat.i32x4 v0
return v1
}
; block0:
; fcvtzs v0.4s, v0.4s
; ret


@@ -71,11 +71,11 @@ block3(v7: r64, v8: r64):
; str x0, [sp, #8]
; ldr x2, 8 ; b 12 ; data TestCase(%f) + 0
; blr x2
-; mov x8, sp
+; mov x4, sp
; ldr x11, [sp, #8]
-; str x11, [x8]
+; str x11, [x4]
-; and w6, w0, #1
+; and w5, w0, #1
-; cbz x6, label1 ; b label3
+; cbz x5, label1 ; b label3
; block1:
; b label2
; block2:
@@ -89,8 +89,8 @@ block3(v7: r64, v8: r64):
; ldr x1, [sp, #16]
; b label5
; block5:
-; mov x3, sp
+; mov x6, sp
-; ldr x2, [x3]
+; ldr x2, [x6]
; add sp, sp, #32
; ldp fp, lr, [sp], #16
; ret


@@ -0,0 +1,213 @@
test compile precise-output
set enable_simd
target aarch64
function %band_f32x4(f32x4, f32x4) -> f32x4 {
block0(v0: f32x4, v1: f32x4):
v2 = band v0, v1
return v2
}
; block0:
; and v0.16b, v0.16b, v1.16b
; ret
function %band_f64x2(f64x2, f64x2) -> f64x2 {
block0(v0: f64x2, v1: f64x2):
v2 = band v0, v1
return v2
}
; block0:
; and v0.16b, v0.16b, v1.16b
; ret
function %band_i32x4(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = band v0, v1
return v2
}
; block0:
; and v0.16b, v0.16b, v1.16b
; ret
function %bor_f32x4(f32x4, f32x4) -> f32x4 {
block0(v0: f32x4, v1: f32x4):
v2 = bor v0, v1
return v2
}
; block0:
; orr v0.16b, v0.16b, v1.16b
; ret
function %bor_f64x2(f64x2, f64x2) -> f64x2 {
block0(v0: f64x2, v1: f64x2):
v2 = bor v0, v1
return v2
}
; block0:
; orr v0.16b, v0.16b, v1.16b
; ret
function %bor_i32x4(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = bor v0, v1
return v2
}
; block0:
; orr v0.16b, v0.16b, v1.16b
; ret
function %bxor_f32x4(f32x4, f32x4) -> f32x4 {
block0(v0: f32x4, v1: f32x4):
v2 = bxor v0, v1
return v2
}
; block0:
; eor v0.16b, v0.16b, v1.16b
; ret
function %bxor_f64x2(f64x2, f64x2) -> f64x2 {
block0(v0: f64x2, v1: f64x2):
v2 = bxor v0, v1
return v2
}
; block0:
; eor v0.16b, v0.16b, v1.16b
; ret
function %bxor_i32x4(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = bxor v0, v1
return v2
}
; block0:
; eor v0.16b, v0.16b, v1.16b
; ret
function %bitselect_i16x8() -> i16x8 {
block0:
v0 = vconst.i16x8 [0 0 0 0 0 0 0 0]
v1 = vconst.i16x8 [0 0 0 0 0 0 0 0]
v2 = vconst.i16x8 [0 0 0 0 0 0 0 0]
v3 = bitselect v0, v1, v2
return v3
}
; block0:
; movi v0.16b, #0
; movi v4.16b, #0
; movi v5.16b, #0
; bsl v0.16b, v4.16b, v5.16b
; ret
function %vselect_i16x8(b16x8, i16x8, i16x8) -> i16x8 {
block0(v0: b16x8, v1: i16x8, v2: i16x8):
v3 = vselect v0, v1, v2
return v3
}
; block0:
; bsl v0.16b, v1.16b, v2.16b
; ret
function %vselect_f32x4(b32x4, f32x4, f32x4) -> f32x4 {
block0(v0: b32x4, v1: f32x4, v2: f32x4):
v3 = vselect v0, v1, v2
return v3
}
; block0:
; bsl v0.16b, v1.16b, v2.16b
; ret
function %vselect_f64x2(b64x2, f64x2, f64x2) -> f64x2 {
block0(v0: b64x2, v1: f64x2, v2: f64x2):
v3 = vselect v0, v1, v2
return v3
}
; block0:
; bsl v0.16b, v1.16b, v2.16b
; ret
function %ishl_i8x16(i32) -> i8x16 {
block0(v0: i32):
v1 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
v2 = ishl v1, v0
return v2
}
; block0:
; ldr q6, pc+8 ; b 20 ; data.f128 0x0f0e0d0c0b0a09080706050403020100
; and w4, w0, #7
; dup v7.16b, w4
; sshl v0.16b, v6.16b, v7.16b
; ret
function %ushr_i8x16_imm() -> i8x16 {
block0:
v0 = iconst.i32 1
v1 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
v2 = ushr v1, v0
return v2
}
; block0:
; ldr q6, pc+8 ; b 20 ; data.f128 0x0f0e0d0c0b0a09080706050403020100
; movz x2, #1
; and w4, w2, #7
; sub x6, xzr, x4
; dup v16.16b, w6
; ushl v0.16b, v6.16b, v16.16b
; ret
function %sshr_i8x16(i32) -> i8x16 {
block0(v0: i32):
v1 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
v2 = sshr v1, v0
return v2
}
; block0:
; ldr q7, pc+8 ; b 20 ; data.f128 0x0f0e0d0c0b0a09080706050403020100
; and w4, w0, #7
; sub x6, xzr, x4
; dup v16.16b, w6
; sshl v0.16b, v7.16b, v16.16b
; ret
function %sshr_i8x16_imm(i8x16, i32) -> i8x16 {
block0(v0: i8x16, v1: i32):
v2 = sshr_imm v0, 3
return v2
}
; block0:
; movz x5, #3
; and w7, w5, #7
; sub x9, xzr, x7
; dup v19.16b, w9
; sshl v0.16b, v0.16b, v19.16b
; ret
function %sshr_i64x2(i64x2, i32) -> i64x2 {
block0(v0: i64x2, v1: i32):
v2 = sshr v0, v1
return v2
}
; block0:
; and w5, w0, #63
; sub x7, xzr, x5
; dup v17.2d, x7
; sshl v0.2d, v0.2d, v17.2d
; ret


@@ -0,0 +1,45 @@
test compile precise-output
set enable_simd
target aarch64
function %icmp_ne_32x4(i32x4, i32x4) -> b32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = icmp ne v0, v1
return v2
}
; block0:
; cmeq v0.4s, v0.4s, v1.4s
; mvn v0.16b, v0.16b
; ret
function %icmp_ugt_i32x4(i32x4, i32x4) -> b32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = icmp ugt v0, v1
return v2
}
; block0:
; cmhi v0.4s, v0.4s, v1.4s
; ret
function %icmp_sge_i16x8(i16x8, i16x8) -> b16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = icmp sge v0, v1
return v2
}
; block0:
; cmge v0.8h, v0.8h, v1.8h
; ret
function %icmp_uge_i8x16(i8x16, i8x16) -> b8x16 {
block0(v0: i8x16, v1: i8x16):
v2 = icmp uge v0, v1
return v2
}
; block0:
; cmhs v0.16b, v0.16b, v1.16b
; ret


@@ -0,0 +1,124 @@
test compile precise-output
set enable_simd
target aarch64
;; shuffle
function %shuffle_different_ssa_values() -> i8x16 {
block0:
v0 = vconst.i8x16 0x00
v1 = vconst.i8x16 0x01
v2 = shuffle v0, v1, 0x11000000000000000000000000000000 ;; pick the second lane of v1, the rest use the first lane of v0
return v2
}
; block0:
; movi v30.16b, #0
; movz x5, #1
; fmov s31, w5
; ldr q4, pc+8 ; b 20 ; data.f128 0x11000000000000000000000000000000
; tbl v0.16b, { v30.16b, v31.16b }, v4.16b
; ret
function %shuffle_same_ssa_value() -> i8x16 {
block0:
v1 = vconst.i8x16 0x01
v2 = shuffle v1, v1, 0x13000000000000000000000000000000 ;; pick the fourth lane of v1 and the rest from the first lane of v1
return v2
}
; block0:
; movz x4, #1
; fmov s30, w4
; ldr q3, pc+8 ; b 20 ; data.f128 0x13000000000000000000000000000000
; mov v31.16b, v30.16b
; tbl v0.16b, { v30.16b, v31.16b }, v3.16b
; ret
function %swizzle() -> i8x16 {
block0:
v0 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
v1 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
v2 = swizzle.i8x16 v0, v1
return v2
}
; block0:
; ldr q3, pc+8 ; b 20 ; data.f128 0x0f0e0d0c0b0a09080706050403020100
; ldr q4, pc+8 ; b 20 ; data.f128 0x0f0e0d0c0b0a09080706050403020100
; tbl v0.16b, { v3.16b }, v4.16b
; ret
function %splat_i8(i8) -> i8x16 {
block0(v0: i8):
v1 = splat.i8x16 v0
return v1
}
; block0:
; dup v0.16b, w0
; ret
function %splat_b16() -> b16x8 {
block0:
v0 = bconst.b16 true
v1 = splat.b16x8 v0
return v1
}
; block0:
; movi v0.16b, #255
; ret
function %splat_i32(i32) -> i32x4 {
block0(v0: i32):
v1 = splat.i32x4 v0
return v1
}
; block0:
; dup v0.4s, w0
; ret
function %splat_f64(f64) -> f64x2 {
block0(v0: f64):
v1 = splat.f64x2 v0
return v1
}
; block0:
; dup v0.2d, v0.d[0]
; ret
function %load32_zero_coalesced(i64) -> i32x4 {
block0(v0: i64):
v1 = load.i32 v0
v2 = scalar_to_vector.i32x4 v1
return v2
}
; block0:
; ldr w2, [x0]
; fmov s0, w2
; ret
function %load32_zero_int(i32) -> i32x4 {
block0(v0: i32):
v1 = scalar_to_vector.i32x4 v0
return v1
}
; block0:
; fmov s0, w0
; ret
function %load32_zero_float(f32) -> f32x4 {
block0(v0: f32):
v1 = scalar_to_vector.f32x4 v0
return v1
}
; block0:
; fmov s0, s0
; ret


@@ -0,0 +1,40 @@
test compile precise-output
set enable_simd
target aarch64
function %bnot_b32x4(b32x4) -> b32x4 {
block0(v0: b32x4):
v1 = bnot v0
return v1
}
; block0:
; mvn v0.16b, v0.16b
; ret
function %vany_true_b32x4(b32x4) -> b1 {
block0(v0: b32x4):
v1 = vany_true v0
return v1
}
; block0:
; umaxp v3.4s, v0.4s, v0.4s
; mov x5, v3.d[0]
; subs xzr, x5, #0
; csetm x0, ne
; ret
function %vall_true_i64x2(i64x2) -> b1 {
block0(v0: i64x2):
v1 = vall_true v0
return v1
}
; block0:
; cmeq v3.2d, v0.2d, #0
; addp v5.2d, v3.2d, v3.2d
; fcmp d5, d5
; cset x0, eq
; ret


@@ -1,8 +1,6 @@
test compile precise-output
-set unwind_info=false
target aarch64

function %fn1(i8x16) -> i16x8 {
block0(v0: i8x16):
v1 = swiden_low v0
@@ -15,19 +13,7 @@ block0(v0: i8x16):
; saddlp v0.8h, v0.16b
; ret
-function %fn2(i8x16) -> i16x8 {
-block0(v0: i8x16):
-v1 = uwiden_low v0
-v2 = uwiden_high v0
-v3 = iadd_pairwise v1, v2
-return v3
-}
-; block0:
-; uaddlp v0.8h, v0.16b
-; ret
-function %fn3(i16x8) -> i32x4 {
+function %fn2(i16x8) -> i32x4 {
block0(v0: i16x8):
v1 = swiden_low v0
v2 = swiden_high v0
@@ -39,6 +25,18 @@ block0(v0: i16x8):
; saddlp v0.4s, v0.8h
; ret
function %fn3(i8x16) -> i16x8 {
block0(v0: i8x16):
v1 = uwiden_low v0
v2 = uwiden_high v0
v3 = iadd_pairwise v1, v2
return v3
}
; block0:
; uaddlp v0.8h, v0.16b
; ret
function %fn4(i16x8) -> i32x4 {
block0(v0: i16x8):
v1 = uwiden_low v0
@@ -51,169 +49,3 @@ block0(v0: i16x8):
; uaddlp v0.4s, v0.8h
; ret
function %fn5(i8x16, i8x16) -> i16x8 {
block0(v0: i8x16, v1: i8x16):
v2 = swiden_low v0
v3 = swiden_high v1
v4 = iadd_pairwise v2, v3
return v4
}
; block0:
; sxtl v7.8h, v0.8b
; sxtl2 v16.8h, v1.16b
; addp v0.8h, v7.8h, v16.8h
; ret
function %fn6(i8x16, i8x16) -> i16x8 {
block0(v0: i8x16, v1: i8x16):
v2 = uwiden_low v0
v3 = uwiden_high v1
v4 = iadd_pairwise v2, v3
return v4
}
; block0:
; uxtl v7.8h, v0.8b
; uxtl2 v16.8h, v1.16b
; addp v0.8h, v7.8h, v16.8h
; ret
function %fn7(i8x16) -> i16x8 {
block0(v0: i8x16):
v1 = uwiden_low v0
v2 = swiden_high v0
v3 = iadd_pairwise v1, v2
return v3
}
; block0:
; uxtl v5.8h, v0.8b
; sxtl2 v6.8h, v0.16b
; addp v0.8h, v5.8h, v6.8h
; ret
function %fn8(i8x16) -> i16x8 {
block0(v0: i8x16):
v1 = swiden_low v0
v2 = uwiden_high v0
v3 = iadd_pairwise v1, v2
return v3
}
; block0:
; sxtl v5.8h, v0.8b
; uxtl2 v6.8h, v0.16b
; addp v0.8h, v5.8h, v6.8h
; ret
function %fn9(i8x8, i8x8) -> i8x8 {
block0(v0: i8x8, v1: i8x8):
v2 = iadd_pairwise v0, v1
return v2
}
; block0:
; addp v0.8b, v0.8b, v1.8b
; ret
function %fn10(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
v2 = iadd_pairwise v0, v1
return v2
}
; block0:
; addp v0.16b, v0.16b, v1.16b
; ret
function %fn11(i16x4, i16x4) -> i16x4 {
block0(v0: i16x4, v1: i16x4):
v2 = iadd_pairwise v0, v1
return v2
}
; block0:
; addp v0.4h, v0.4h, v1.4h
; ret
function %fn12(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = iadd_pairwise v0, v1
return v2
}
; block0:
; addp v0.8h, v0.8h, v1.8h
; ret
function %fn14(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = iadd_pairwise v0, v1
return v2
}
; block0:
; addp v0.4s, v0.4s, v1.4s
; ret
function %fn15(i8x8, i8x8) -> i16x4 {
block0(v0: i8x8, v1: i8x8):
v2 = swiden_low v0
v3 = swiden_high v1
v4 = iadd_pairwise v2, v3
return v4
}
; block0:
; sxtl v16.8h, v0.8b
; mov s7, v1.s[1]
; sxtl v17.8h, v7.8b
; addp v0.4h, v16.4h, v17.4h
; ret
function %fn16(i8x8, i8x8) -> i16x4 {
block0(v0: i8x8, v1: i8x8):
v2 = uwiden_low v0
v3 = uwiden_high v1
v4 = iadd_pairwise v2, v3
return v4
}
; block0:
; uxtl v16.8h, v0.8b
; mov s7, v1.s[1]
; uxtl v17.8h, v7.8b
; addp v0.4h, v16.4h, v17.4h
; ret
function %fn17(i8x8) -> i16x4 {
block0(v0: i8x8):
v1 = uwiden_low v0
v2 = swiden_high v0
v3 = iadd_pairwise v1, v2
return v3
}
; block0:
; uxtl v6.8h, v0.8b
; mov s5, v0.s[1]
; sxtl v7.8h, v5.8b
; addp v0.4h, v6.4h, v7.4h
; ret
function %fn18(i8x8) -> i16x4 {
block0(v0: i8x8):
v1 = swiden_low v0
v2 = uwiden_high v0
v3 = iadd_pairwise v1, v2
return v3
}
; block0:
; sxtl v6.8h, v0.8b
; mov s5, v0.s[1]
; uxtl v7.8h, v5.8b
; addp v0.4h, v6.4h, v7.4h
; ret


@@ -53,8 +53,8 @@ block0:
; mov fp, sp
; sub sp, sp, #16
; block0:
-; mov x0, sp
+; mov x2, sp
-; ldr x0, [x0]
+; ldr x0, [x2]
; add sp, sp, #16
; ldp fp, lr, [sp], #16
; ret
@@ -74,8 +74,8 @@ block0:
; movk w16, #1, LSL #16
; sub sp, sp, x16, UXTX
; block0:
-; mov x0, sp
+; mov x2, sp
-; ldr x0, [x0]
+; ldr x0, [x2]
; movz w16, #34480
; movk w16, #1, LSL #16
; add sp, sp, x16, UXTX
@@ -442,8 +442,8 @@ block0(v0: i128):
; mov fp, sp
; sub sp, sp, #16
; block0:
-; mov x4, sp
+; mov x5, sp
-; stp x0, x1, [x4]
+; stp x0, x1, [x5]
; add sp, sp, #16
; ldp fp, lr, [sp], #16
; ret
@@ -461,8 +461,8 @@ block0(v0: i128):
; mov fp, sp
; sub sp, sp, #32
; block0:
-; add x4, sp, #32
+; add x5, sp, #32
-; stp x0, x1, [x4]
+; stp x0, x1, [x5]
; add sp, sp, #32
; ldp fp, lr, [sp], #16
; ret
@@ -482,8 +482,8 @@ block0(v0: i128):
; movk w16, #1, LSL #16
; sub sp, sp, x16, UXTX
; block0:
-; mov x4, sp
+; mov x5, sp
-; stp x0, x1, [x4]
+; stp x0, x1, [x5]
; movz w16, #34480
; movk w16, #1, LSL #16
; add sp, sp, x16, UXTX
@@ -502,8 +502,8 @@ block0:
; mov fp, sp
; sub sp, sp, #16
; block0:
-; mov x0, sp
+; mov x5, sp
-; ldp x0, x1, [x0]
+; ldp x0, x1, [x5]
; add sp, sp, #16
; ldp fp, lr, [sp], #16
; ret
@@ -521,8 +521,8 @@ block0:
; mov fp, sp
; sub sp, sp, #32
; block0:
-; add x0, sp, #32
+; add x5, sp, #32
-; ldp x0, x1, [x0]
+; ldp x0, x1, [x5]
; add sp, sp, #32
; ldp fp, lr, [sp], #16
; ret
@@ -542,8 +542,8 @@ block0:
; movk w16, #1, LSL #16
; sub sp, sp, x16, UXTX
; block0:
-; mov x0, sp
+; mov x5, sp
-; ldp x0, x1, [x0]
+; ldp x0, x1, [x5]
; movz w16, #34480
; movk w16, #1, LSL #16
; add sp, sp, x16, UXTX


@@ -1,8 +1,7 @@
test compile precise-output
-set unwind_info=false
target aarch64

-function %f() {
+function %trap() {
block0:
trap user0
}
@@ -10,26 +9,14 @@ block0:
; block0:
; udf #0xc11f

-function %g(i64) {
-block0(v0: i64):
-v1 = iconst.i64 42
-v2 = ifcmp v0, v1
-trapif eq v2, user0
+function %trap_iadd_ifcout(i64, i64) {
+block0(v0: i64, v1: i64):
+v2, v3 = iadd_ifcout v0, v1
+trapif of v3, user0
return
}

; block0:
-; subs xzr, x0, #42
-; b.ne 8 ; udf
-; ret
-
-function %h() {
-block0:
-debugtrap
-return
-}
-
-; block0:
-; brk #0
+; b.vc 8 ; udf
; ret


@@ -0,0 +1,85 @@
test compile precise-output
target aarch64
function %f1(i8x16) -> i8 {
block0(v0: i8x16):
v1 = vhigh_bits.i8 v0
return v1
}
; block0:
; sshr v3.16b, v0.16b, #7
; movz x6, #513
; movk x6, #2052, LSL #16
; movk x6, #8208, LSL #32
; movk x6, #32832, LSL #48
; dup v17.2d, x6
; and v20.16b, v3.16b, v17.16b
; ext v22.16b, v20.16b, v20.16b, #8
; zip1 v24.16b, v20.16b, v22.16b
; addv h26, v24.8h
; umov w0, v26.h[0]
; ret
function %f2(i8x16) -> i16 {
block0(v0: i8x16):
v1 = vhigh_bits.i16 v0
return v1
}
; block0:
; sshr v3.16b, v0.16b, #7
; movz x6, #513
; movk x6, #2052, LSL #16
; movk x6, #8208, LSL #32
; movk x6, #32832, LSL #48
; dup v17.2d, x6
; and v20.16b, v3.16b, v17.16b
; ext v22.16b, v20.16b, v20.16b, #8
; zip1 v24.16b, v20.16b, v22.16b
; addv h26, v24.8h
; umov w0, v26.h[0]
; ret
function %f3(i16x8) -> i8 {
block0(v0: i16x8):
v1 = vhigh_bits.i8 v0
return v1
}
; block0:
; sshr v3.8h, v0.8h, #15
; ldr q5, pc+8 ; b 20 ; data.f128 0x00800040002000100008000400020001
; and v7.16b, v3.16b, v5.16b
; addv h17, v7.8h
; umov w0, v17.h[0]
; ret
function %f4(i32x4) -> i8 {
block0(v0: i32x4):
v1 = vhigh_bits.i8 v0
return v1
}
; block0:
; sshr v3.4s, v0.4s, #31
; ldr q5, pc+8 ; b 20 ; data.f128 0x00000008000000040000000200000001
; and v7.16b, v3.16b, v5.16b
; addv s17, v7.4s
; mov w0, v17.s[0]
; ret
function %f5(i64x2) -> i8 {
block0(v0: i64x2):
v1 = vhigh_bits.i8 v0
return v1
}
; block0:
; mov x3, v0.d[1]
; mov x5, v0.d[0]
; lsr x7, x3, #63
; lsr x9, x5, #63
; add x0, x9, x7, LSL 1
; ret