Port Fcopysign..FcvtToSintSat to ISLE (AArch64) (#4753)

* Port `Fcopysign`..`FcvtToSintSat` to ISLE (AArch64)

Ported the existing implementations of the following opcodes to ISLE on
AArch64:
- `Fcopysign`
  - Also introduced missing support for `fcopysign` on vector values, as
    per the docs.
  - This introduces the vector encoding for the `SLI` machine
    instruction (see the lowering sketch after this list).
- `FcvtToUint`
- `FcvtToSint`
- `FcvtFromUint`
- `FcvtFromSint`
- `FcvtToUintSat`
- `FcvtToSintSat`
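
As an illustrative sketch (registers taken from the precompile tests added
below), the vector lowering of `fcopysign` reuses the scalar `ushr`/`sli`
pattern lane-wise, e.g. for `f32x4`:

; ushr v7.4s, v1.4s, #31  ; move each lane's sign bit down into bit 0
; sli v0.4s, v7.4s, #31   ; shift it back up and insert it, leaving all other bits of v0 intact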

Copyright (c) 2022 Arm Limited

* Document helpers and abstract conversion checks
Damian Heaton, 2022-08-24 18:37:14 +01:00 (committed by GitHub)
parent 7e3c481f4e
commit 94bcbe8446
12 changed files with 863 additions and 548 deletions


@@ -619,6 +619,14 @@
(size VectorSize)
(imm u8))
;; Destructive vector shift by immediate.
(VecShiftImmMod
(op VecShiftImmModOp)
(rd WritableReg)
(rn Reg)
(size VectorSize)
(imm u8))
;; Vector extract - create a new vector, being the concatenation of the lowest `imm4` bytes
;; of `rm` followed by the uppermost `16 - imm4` bytes of `rn`.
(VecExtract
@@ -1315,6 +1323,13 @@
(Sshr)
))
;; Destructive shift-by-immediate operation on each lane of a vector.
(type VecShiftImmModOp
(enum
;; Shift left and insert
(Sli)
))
;; Atomic read-modify-write operations with acquire-release semantics
(type AtomicRMWOp
(enum
@@ -1386,6 +1401,48 @@
(decl u64_into_imm_logic (Type u64) ImmLogic)
(extern constructor u64_into_imm_logic u64_into_imm_logic)
;; Calculate the minimum floating-point bound for a conversion from a
;; floating-point type to an integer type.
;; Accepts whether the output is signed, the size of the input
;; floating-point type in bits, and the size of the output integer type
;; in bits.
(decl min_fp_value (bool u8 u8) Reg)
(extern constructor min_fp_value min_fp_value)
;; Calculate the maximum floating-point bound for a conversion from a
;; floating-point type to an integer type.
;; Accepts whether the output is signed, the size of the input
;; floating-point type in bits, and the size of the output integer type
;; in bits.
(decl max_fp_value (bool u8 u8) Reg)
(extern constructor max_fp_value max_fp_value)
;; Calculate the minimum acceptable floating-point value for a saturating
;; conversion from a floating-point type to an integer type.
;; Accepts whether the output is signed, the size of the input
;; floating-point type in bits, and the size of the output integer type
;; in bits.
(decl min_fp_value_sat (bool u8 u8) Reg)
(extern constructor min_fp_value_sat min_fp_value_sat)
;; Calculate the maximum acceptable floating-point value for a saturating
;; conversion from a floating-point type to an integer type.
;; Accepts whether the output is signed, the size of the input
;; floating-point type in bits, and the size of the output integer type
;; in bits.
(decl max_fp_value_sat (bool u8 u8) Reg)
(extern constructor max_fp_value_sat max_fp_value_sat)
;; Constructs an FPUOpRI.Ushr* given the size in bits of the value (or lane)
;; and the amount to shift by.
(decl fpu_op_ri_ushr (u8 u8) FPUOpRI)
(extern constructor fpu_op_ri_ushr fpu_op_ri_ushr)
;; Constructs an FPUOpRI.Sli* given the size in bits of the value (or lane)
;; and the amount to shift by.
(decl fpu_op_ri_sli (u8 u8) FPUOpRI)
(extern constructor fpu_op_ri_sli fpu_op_ri_sli)
(decl imm12_from_negated_u64 (Imm12) u64)
(extern extractor imm12_from_negated_u64 imm12_from_negated_u64)
@@ -1533,6 +1590,12 @@
(_2 Unit (emit (MInst.VecRRRMod op dst src2 src3 size))))
dst))
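;; Helper for emitting `MInst.FpuRRI` instructions.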
(decl fpu_rri (FPUOpRI Reg) Reg)
(rule (fpu_rri op src)
(let ((dst WritableReg (temp_writable_reg $F64))
(_ Unit (emit (MInst.FpuRRI op dst src))))
dst))
;; Helper for emitting `MInst.FpuRRR` instructions.
(decl fpu_rrr (FPUOp2 Reg Reg ScalarSize) Reg)
(rule (fpu_rrr op src1 src2 size)
@@ -2611,3 +2674,147 @@
;; to clobber LR.
(let ((_ Unit (emit (MInst.Xpaclri))))
(mov_preg (preg_link))))
;; Helper for getting the maximum shift amount for a type.
(decl max_shift (Type) u8)
(rule (max_shift $F64) 63)
(rule (max_shift $F32) 31)
;; Helper for generating `fcopysign` instruction sequences.
(decl fcopy_sign (Reg Reg Type) Reg)
(rule (fcopy_sign x y (ty_scalar_float ty))
(let ((dst WritableReg (temp_writable_reg $F64))
(_ Unit (emit (MInst.FpuMove64 dst x)))
(tmp Reg (fpu_rri (fpu_op_ri_ushr (ty_bits ty) (max_shift ty)) y))
(_ Unit (emit (MInst.FpuRRI (fpu_op_ri_sli (ty_bits ty) (max_shift ty)) dst tmp))))
dst))
(rule (fcopy_sign x y ty @ (multi_lane _ _))
(let ((dst WritableReg (temp_writable_reg $I8X16))
(_ Unit (emit (MInst.FpuMove128 dst x)))
(tmp Reg (vec_shift_imm (VecShiftImmOp.Ushr) (max_shift (lane_type ty)) y (vector_size ty)))
(_ Unit (emit (MInst.VecShiftImmMod (VecShiftImmModOp.Sli) dst tmp (vector_size ty) (max_shift (lane_type ty))))))
dst))
;; Helpers for generating `MInst.FpuToInt` instructions.
(decl fpu_to_int_nan_check (ScalarSize Reg) Reg)
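;; Note: an `fcmp` of a value against itself sets the V (unordered) flag
;; exactly when the value is a NaN, so trapping on `Cond.Vs` rejects NaN
;; inputs before any bounds checks run.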
(rule (fpu_to_int_nan_check size src)
(let ((r ValueRegs
(with_flags (fpu_cmp size src src)
(ConsumesFlags.ConsumesFlagsReturnsReg
(MInst.TrapIf (cond_br_cond (Cond.Vs))
(trap_code_bad_conversion_to_integer))
src))))
(value_regs_get r 0)))
;; Checks that the value is not less than the minimum bound,
;; accepting a boolean (whether the type is signed), input type,
;; output type, and registers containing the source and minimum bound.
(decl fpu_to_int_underflow_check (bool Type Type Reg Reg) Reg)
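;; Note the asymmetric conditions below: when `INT_MIN - 1` is exactly
;; representable in the input float type, the bound itself is already out of
;; range and we trap on `<=` (`Cond.Le`); when it is not (the wider signed
;; cases, where `min_fp_value` returns `INT_MIN` itself), the bound is in
;; range and we trap only on `<` (`Cond.Lt`).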
(rule (fpu_to_int_underflow_check $true $F32 (fits_in_16 out_ty) src min)
(let ((r ValueRegs
(with_flags (fpu_cmp (ScalarSize.Size32) src min)
(ConsumesFlags.ConsumesFlagsReturnsReg
(MInst.TrapIf (cond_br_cond (Cond.Le))
(trap_code_integer_overflow))
src))))
(value_regs_get r 0)))
(rule (fpu_to_int_underflow_check $true $F64 (fits_in_32 out_ty) src min)
(let ((r ValueRegs
(with_flags (fpu_cmp (ScalarSize.Size64) src min)
(ConsumesFlags.ConsumesFlagsReturnsReg
(MInst.TrapIf (cond_br_cond (Cond.Le))
(trap_code_integer_overflow))
src))))
(value_regs_get r 0)))
(rule -1 (fpu_to_int_underflow_check $true in_ty _out_ty src min)
(let ((r ValueRegs
(with_flags (fpu_cmp (scalar_size in_ty) src min)
(ConsumesFlags.ConsumesFlagsReturnsReg
(MInst.TrapIf (cond_br_cond (Cond.Lt))
(trap_code_integer_overflow))
src))))
(value_regs_get r 0)))
(rule (fpu_to_int_underflow_check $false in_ty _out_ty src min)
(let ((r ValueRegs
(with_flags (fpu_cmp (scalar_size in_ty) src min)
(ConsumesFlags.ConsumesFlagsReturnsReg
(MInst.TrapIf (cond_br_cond (Cond.Le))
(trap_code_integer_overflow))
src))))
(value_regs_get r 0)))
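;; Checks that the value is strictly less than the maximum bound, accepting
;; the input size and registers containing the source and maximum bound.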
(decl fpu_to_int_overflow_check (ScalarSize Reg Reg) Reg)
(rule (fpu_to_int_overflow_check size src max)
(let ((r ValueRegs
(with_flags (fpu_cmp size src max)
(ConsumesFlags.ConsumesFlagsReturnsReg
(MInst.TrapIf (cond_br_cond (Cond.Ge))
(trap_code_integer_overflow))
src))))
(value_regs_get r 0)))
;; Emits the appropriate instruction sequence to convert a
;; floating-point value to an integer, trapping if the value
;; is a NaN or does not fit in the target type.
;; Accepts the specific conversion op, the source register,
;; whether the input is signed, and finally the input and output
;; types.
(decl fpu_to_int_cvt (FpuToIntOp Reg bool Type Type) Reg)
(rule (fpu_to_int_cvt op src signed in_ty out_ty)
(let ((size ScalarSize (scalar_size in_ty))
(in_bits u8 (ty_bits in_ty))
(out_bits u8 (ty_bits out_ty))
(src Reg (fpu_to_int_nan_check size src))
(min Reg (min_fp_value signed in_bits out_bits))
(src Reg (fpu_to_int_underflow_check signed in_ty out_ty src min))
(max Reg (max_fp_value signed in_bits out_bits))
(src Reg (fpu_to_int_overflow_check size src max)))
(fpu_to_int op src)))
;; Emits the appropriate instruction sequence to convert a
;; floating-point value to an integer, saturating if the value
;; does not fit in the target type.
;; Accepts the specific conversion op, the source register,
;; whether the input is signed, and finally the input and output
;; types.
(decl fpu_to_int_cvt_sat (FpuToIntOp Reg bool Type Type) Reg)
(rule (fpu_to_int_cvt_sat op src $true in_ty out_ty)
(let ((size ScalarSize (scalar_size in_ty))
(in_bits u8 (ty_bits in_ty))
(out_bits u8 (ty_bits out_ty))
(max Reg (max_fp_value_sat $true in_bits out_bits))
(tmp Reg (fpu_rrr (FPUOp2.Min) src max size))
(min Reg (min_fp_value_sat $true in_bits out_bits))
(tmp Reg (fpu_rrr (FPUOp2.Max) tmp min size))
(zero Reg (constant_f128 0))
(tmp ValueRegs (with_flags (fpu_cmp size src src)
(fpu_csel in_ty (Cond.Ne) zero tmp))))
(fpu_to_int op (value_regs_get tmp 0))))
(rule (fpu_to_int_cvt_sat op src $false in_ty out_ty)
(let ((size ScalarSize (scalar_size in_ty))
(in_bits u8 (ty_bits in_ty))
(out_bits u8 (ty_bits out_ty))
(max Reg (max_fp_value_sat $false in_bits out_bits))
(tmp Reg (fpu_rrr (FPUOp2.Min) src max size))
(min Reg (min_fp_value_sat $false in_bits out_bits))
(tmp Reg (fpu_rrr (FPUOp2.Max) tmp min size))
(tmp ValueRegs (with_flags (fpu_cmp size src src)
(fpu_csel in_ty (Cond.Ne) min tmp))))
(fpu_to_int op (value_regs_get tmp 0))))
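;; Helper for emitting `MInst.FpuToInt` instructions.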
(decl fpu_to_int (FpuToIntOp Reg) Reg)
(rule (fpu_to_int op src)
(let ((dst WritableReg (temp_writable_reg $I64))
(_ Unit (emit (MInst.FpuToInt op dst src))))
dst))
;; Helper for generating `MInst.IntToFpu` instructions.
(decl int_to_fpu (IntToFpuOp Reg) Reg)
(rule (int_to_fpu op src)
(let ((dst WritableReg (temp_writable_reg $I8X16))
(_ Unit (emit (MInst.IntToFpu op dst src))))
dst))


@@ -2033,6 +2033,50 @@ impl MachInstEmit for Inst {
let rd_enc = machreg_to_vec(rd.to_reg());
sink.put4(template | (immh_immb << 16) | (rn_enc << 5) | rd_enc);
}
&Inst::VecShiftImmMod {
op,
rd,
rn,
size,
imm,
} => {
let rd = allocs.next_writable(rd);
let rn = allocs.next(rn);
let (is_shr, mut template) = match op {
VecShiftImmModOp::Sli => (false, 0b_001_011110_0000_000_010101_00000_00000_u32),
};
if size.is_128bits() {
template |= 0b1 << 30;
}
let imm = imm as u32;
// Deal with the somewhat strange encoding scheme for, and limits on,
// the shift amount.
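// For example, a 32-bit left shift (`sli`) by 31 encodes `immh:immb` as
// 0b0100000 | 31 == 0b0111111.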
let immh_immb = match (size.lane_size(), is_shr) {
(ScalarSize::Size64, true) if imm >= 1 && imm <= 64 => {
0b_1000_000_u32 | (64 - imm)
}
(ScalarSize::Size32, true) if imm >= 1 && imm <= 32 => {
0b_0100_000_u32 | (32 - imm)
}
(ScalarSize::Size16, true) if imm >= 1 && imm <= 16 => {
0b_0010_000_u32 | (16 - imm)
}
(ScalarSize::Size8, true) if imm >= 1 && imm <= 8 => {
0b_0001_000_u32 | (8 - imm)
}
(ScalarSize::Size64, false) if imm <= 63 => 0b_1000_000_u32 | imm,
(ScalarSize::Size32, false) if imm <= 31 => 0b_0100_000_u32 | imm,
(ScalarSize::Size16, false) if imm <= 15 => 0b_0010_000_u32 | imm,
(ScalarSize::Size8, false) if imm <= 7 => 0b_0001_000_u32 | imm,
_ => panic!(
"aarch64: Inst::VecShiftImmMod: emit: invalid op/size/imm {:?}, {:?}, {:?}",
op, size, imm
),
};
let rn_enc = machreg_to_vec(rn);
let rd_enc = machreg_to_vec(rd.to_reg());
sink.put4(template | (immh_immb << 16) | (rn_enc << 5) | rd_enc);
}
&Inst::VecExtract { rd, rn, rm, imm4 } => {
let rd = allocs.next_writable(rd);
let rn = allocs.next(rn);


@@ -39,7 +39,7 @@ pub use crate::isa::aarch64::lower::isle::generated_code::{
ALUOp, ALUOp3, APIKey, AtomicRMWLoopOp, AtomicRMWOp, BitOp, FPUOp1, FPUOp2, FPUOp3,
FpuRoundMode, FpuToIntOp, IntToFpuOp, MInst as Inst, MoveWideOp, VecALUModOp, VecALUOp,
VecExtendOp, VecLanesOp, VecMisc2, VecPairOp, VecRRLongOp, VecRRNarrowOp, VecRRPairLongOp,
-VecRRRLongOp, VecShiftImmOp,
+VecRRRLongOp, VecShiftImmModOp, VecShiftImmOp,
};
/// A floating-point unit (FPU) operation with two args, a register and an immediate.
@@ -767,6 +767,10 @@ fn aarch64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut Operan
collector.reg_def(rd);
collector.reg_use(rn);
}
&Inst::VecShiftImmMod { rd, rn, .. } => {
collector.reg_mod(rd);
collector.reg_use(rn);
}
&Inst::VecExtract { rd, rn, rm, .. } => {
collector.reg_def(rd);
collector.reg_use(rn);
@@ -2371,6 +2375,20 @@ impl Inst {
let rn = pretty_print_vreg_vector(rn, size, allocs);
format!("{} {}, {}, #{}", op, rd, rn, imm)
}
&Inst::VecShiftImmMod {
op,
rd,
rn,
size,
imm,
} => {
let op = match op {
VecShiftImmModOp::Sli => "sli",
};
let rd = pretty_print_vreg_vector(rd.to_reg(), size, allocs);
let rn = pretty_print_vreg_vector(rn, size, allocs);
format!("{} {}, {}, #{}", op, rd, rn, imm)
}
&Inst::VecExtract { rd, rn, rm, imm4 } => {
let rd = pretty_print_vreg_vector(rd.to_reg(), VectorSize::Size8x16, allocs);
let rn = pretty_print_vreg_vector(rn, VectorSize::Size8x16, allocs);


@@ -406,6 +406,119 @@
(rule (lower (has_type (ty_scalar_float ty) (fma x y z)))
(fpu_rrrr (FPUOp3.MAdd) (scalar_size ty) x y z))
;;;; Rules for `fcopysign` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type ty (fcopysign x y)))
(fcopy_sign x y ty))
;;;; Rules for `fcvt_to_uint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type (fits_in_32 out_ty) (fcvt_to_uint x @ (value_type $F32))))
(fpu_to_int_cvt (FpuToIntOp.F32ToU32) x $false $F32 out_ty))
(rule (lower (has_type $I64 (fcvt_to_uint x @ (value_type $F32))))
(fpu_to_int_cvt (FpuToIntOp.F32ToU64) x $false $F32 $I64))
(rule (lower (has_type (fits_in_32 out_ty) (fcvt_to_uint x @ (value_type $F64))))
(fpu_to_int_cvt (FpuToIntOp.F64ToU32) x $false $F64 out_ty))
(rule (lower (has_type $I64 (fcvt_to_uint x @ (value_type $F64))))
(fpu_to_int_cvt (FpuToIntOp.F64ToU64) x $false $F64 $I64))
;;;; Rules for `fcvt_to_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type (fits_in_32 out_ty) (fcvt_to_sint x @ (value_type $F32))))
(fpu_to_int_cvt (FpuToIntOp.F32ToI32) x $true $F32 out_ty))
(rule (lower (has_type $I64 (fcvt_to_sint x @ (value_type $F32))))
(fpu_to_int_cvt (FpuToIntOp.F32ToI64) x $true $F32 $I64))
(rule (lower (has_type (fits_in_32 out_ty) (fcvt_to_sint x @ (value_type $F64))))
(fpu_to_int_cvt (FpuToIntOp.F64ToI32) x $true $F64 out_ty))
(rule (lower (has_type $I64 (fcvt_to_sint x @ (value_type $F64))))
(fpu_to_int_cvt (FpuToIntOp.F64ToI64) x $true $F64 $I64))
;;;; Rules for `fcvt_from_uint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type ty @ (multi_lane 32 _) (fcvt_from_uint x @ (value_type (multi_lane 32 _)))))
(vec_misc (VecMisc2.Ucvtf) x (vector_size ty)))
(rule (lower (has_type ty @ (multi_lane 64 _) (fcvt_from_uint x @ (value_type (multi_lane 64 _)))))
(vec_misc (VecMisc2.Ucvtf) x (vector_size ty)))
(rule (lower (has_type $F32 (fcvt_from_uint x @ (value_type (fits_in_32 _)))))
(int_to_fpu (IntToFpuOp.U32ToF32) (put_in_reg_zext32 x)))
(rule (lower (has_type $F64 (fcvt_from_uint x @ (value_type (fits_in_32 _)))))
(int_to_fpu (IntToFpuOp.U32ToF64) (put_in_reg_zext32 x)))
(rule (lower (has_type $F32 (fcvt_from_uint x @ (value_type $I64))))
(int_to_fpu (IntToFpuOp.U64ToF32) x))
(rule (lower (has_type $F64 (fcvt_from_uint x @ (value_type $I64))))
(int_to_fpu (IntToFpuOp.U64ToF64) x))
;;;; Rules for `fcvt_from_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type ty @ (multi_lane 32 _) (fcvt_from_sint x @ (value_type (multi_lane 32 _)))))
(vec_misc (VecMisc2.Scvtf) x (vector_size ty)))
(rule (lower (has_type ty @ (multi_lane 64 _) (fcvt_from_sint x @ (value_type (multi_lane 64 _)))))
(vec_misc (VecMisc2.Scvtf) x (vector_size ty)))
(rule (lower (has_type $F32 (fcvt_from_sint x @ (value_type (fits_in_32 _)))))
(int_to_fpu (IntToFpuOp.I32ToF32) (put_in_reg_sext32 x)))
(rule (lower (has_type $F64 (fcvt_from_sint x @ (value_type (fits_in_32 _)))))
(int_to_fpu (IntToFpuOp.I32ToF64) (put_in_reg_sext32 x)))
(rule (lower (has_type $F32 (fcvt_from_sint x @ (value_type $I64))))
(int_to_fpu (IntToFpuOp.I64ToF32) x))
(rule (lower (has_type $F64 (fcvt_from_sint x @ (value_type $I64))))
(int_to_fpu (IntToFpuOp.I64ToF64) x))
;;;; Rules for `fcvt_to_uint_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type ty @ (multi_lane 32 _) (fcvt_to_uint_sat x @ (value_type (multi_lane 32 _)))))
(vec_misc (VecMisc2.Fcvtzu) x (vector_size ty)))
(rule (lower (has_type ty @ (multi_lane 64 _) (fcvt_to_uint_sat x @ (value_type (multi_lane 64 _)))))
(vec_misc (VecMisc2.Fcvtzu) x (vector_size ty)))
(rule (lower (has_type $I32 (fcvt_to_uint_sat x @ (value_type $F32))))
(fpu_to_int_cvt_sat (FpuToIntOp.F32ToU32) x $false $F32 $I32))
(rule (lower (has_type $I64 (fcvt_to_uint_sat x @ (value_type $F32))))
(fpu_to_int_cvt_sat (FpuToIntOp.F32ToU64) x $false $F32 $I64))
(rule (lower (has_type $I32 (fcvt_to_uint_sat x @ (value_type $F64))))
(fpu_to_int_cvt_sat (FpuToIntOp.F64ToU32) x $false $F64 $I32))
(rule (lower (has_type $I64 (fcvt_to_uint_sat x @ (value_type $F64))))
(fpu_to_int_cvt_sat (FpuToIntOp.F64ToU64) x $false $F64 $I64))
;;;; Rules for `fcvt_to_sint_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type ty @ (multi_lane 32 _) (fcvt_to_sint_sat x @ (value_type (multi_lane 32 _)))))
(vec_misc (VecMisc2.Fcvtzs) x (vector_size ty)))
(rule (lower (has_type ty @ (multi_lane 64 _) (fcvt_to_sint_sat x @ (value_type (multi_lane 64 _)))))
(vec_misc (VecMisc2.Fcvtzs) x (vector_size ty)))
(rule (lower (has_type $I32 (fcvt_to_sint_sat x @ (value_type $F32))))
(fpu_to_int_cvt_sat (FpuToIntOp.F32ToI32) x $true $F32 $I32))
(rule (lower (has_type $I64 (fcvt_to_sint_sat x @ (value_type $F32))))
(fpu_to_int_cvt_sat (FpuToIntOp.F32ToI64) x $true $F32 $I64))
(rule (lower (has_type $I32 (fcvt_to_sint_sat x @ (value_type $F64))))
(fpu_to_int_cvt_sat (FpuToIntOp.F64ToI32) x $true $F64 $I32))
(rule (lower (has_type $I64 (fcvt_to_sint_sat x @ (value_type $F64))))
(fpu_to_int_cvt_sat (FpuToIntOp.F64ToI64) x $true $F64 $I64))
;;;; Rules for `isub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; `i64` and smaller


@@ -1065,17 +1065,6 @@ pub(crate) fn condcode_is_signed(cc: IntCC) -> bool {
//=============================================================================
// Helpers for instruction lowering.
pub(crate) fn choose_32_64<T: Copy>(ty: Type, op32: T, op64: T) -> T {
let bits = ty_bits(ty);
if bits <= 32 {
op32
} else if bits == 64 {
op64
} else {
panic!("choose_32_64 on > 64 bits!")
}
}
/// Checks for an instance of `op` feeding the given input.
pub(crate) fn maybe_input_insn(
c: &mut Lower<Inst>,


@@ -5,12 +5,13 @@ pub mod generated_code;
// Types that the generated ISLE code uses via `use super::*`.
use super::{
-insn_inputs, lower_constant_f128, lower_constant_f64, writable_zero_reg, zero_reg, AMode,
-ASIMDFPModImm, ASIMDMovModImm, BranchTarget, CallIndInfo, CallInfo, Cond, CondBrKind, ExtendOp,
-FPUOpRI, FloatCC, Imm12, ImmLogic, ImmShift, Inst as MInst, IntCC, JTSequenceInfo, MachLabel,
-MoveWideConst, MoveWideOp, NarrowValueMode, Opcode, OperandSize, PairAMode, Reg, ScalarSize,
-ShiftOpAndAmt, UImm5, VecMisc2, VectorSize, NZCV,
+insn_inputs, lower_constant_f128, lower_constant_f32, lower_constant_f64, writable_zero_reg,
+zero_reg, AMode, ASIMDFPModImm, ASIMDMovModImm, BranchTarget, CallIndInfo, CallInfo, Cond,
+CondBrKind, ExtendOp, FPUOpRI, FloatCC, Imm12, ImmLogic, ImmShift, Inst as MInst, IntCC,
+JTSequenceInfo, MachLabel, MoveWideConst, MoveWideOp, NarrowValueMode, Opcode, OperandSize,
+PairAMode, Reg, ScalarSize, ShiftOpAndAmt, UImm5, VecMisc2, VectorSize, NZCV,
};
+use crate::isa::aarch64::inst::{FPULeftShiftImm, FPURightShiftImm};
use crate::isa::aarch64::lower::{lower_address, lower_splat_const};
use crate::isa::aarch64::settings::Flags as IsaFlags;
use crate::machinst::{isle::*, InputSourceInst};
@@ -519,4 +520,198 @@ impl generated_code::Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6>
fn preg_link(&mut self) -> PReg {
super::regs::link_reg().to_real_reg().unwrap().into()
}
fn min_fp_value(&mut self, signed: bool, in_bits: u8, out_bits: u8) -> Reg {
let tmp = self.lower_ctx.alloc_tmp(I8X16).only_reg().unwrap();
if in_bits == 32 {
// From float32.
let min = match (signed, out_bits) {
(true, 8) => i8::MIN as f32 - 1.,
(true, 16) => i16::MIN as f32 - 1.,
(true, 32) => i32::MIN as f32, // I32_MIN - 1 isn't precisely representable as a f32.
(true, 64) => i64::MIN as f32, // I64_MIN - 1 isn't precisely representable as a f32.
(false, _) => -1.,
_ => unimplemented!(
"unexpected {} output size of {} bits for 32-bit input",
if signed { "signed" } else { "unsigned" },
out_bits
),
};
lower_constant_f32(self.lower_ctx, tmp, min);
} else if in_bits == 64 {
// From float64.
let min = match (signed, out_bits) {
(true, 8) => i8::MIN as f64 - 1.,
(true, 16) => i16::MIN as f64 - 1.,
(true, 32) => i32::MIN as f64 - 1.,
(true, 64) => i64::MIN as f64,
(false, _) => -1.,
_ => unimplemented!(
"unexpected {} output size of {} bits for 64-bit input",
if signed { "signed" } else { "unsigned" },
out_bits
),
};
lower_constant_f64(self.lower_ctx, tmp, min);
} else {
unimplemented!(
"unexpected input size for min_fp_value: {} (signed: {}, output size: {})",
in_bits,
signed,
out_bits
);
}
tmp.to_reg()
}
fn max_fp_value(&mut self, signed: bool, in_bits: u8, out_bits: u8) -> Reg {
let tmp = self.lower_ctx.alloc_tmp(I8X16).only_reg().unwrap();
if in_bits == 32 {
// From float32.
let max = match (signed, out_bits) {
(true, 8) => i8::MAX as f32 + 1.,
(true, 16) => i16::MAX as f32 + 1.,
(true, 32) => (i32::MAX as u64 + 1) as f32,
(true, 64) => (i64::MAX as u64 + 1) as f32,
(false, 8) => u8::MAX as f32 + 1.,
(false, 16) => u16::MAX as f32 + 1.,
(false, 32) => (u32::MAX as u64 + 1) as f32,
(false, 64) => (u64::MAX as u128 + 1) as f32,
_ => unimplemented!(
"unexpected {} output size of {} bits for 32-bit input",
if signed { "signed" } else { "unsigned" },
out_bits
),
};
lower_constant_f32(self.lower_ctx, tmp, max);
} else if in_bits == 64 {
// From float64.
let max = match (signed, out_bits) {
(true, 8) => i8::MAX as f64 + 1.,
(true, 16) => i16::MAX as f64 + 1.,
(true, 32) => i32::MAX as f64 + 1.,
(true, 64) => (i64::MAX as u64 + 1) as f64,
(false, 8) => u8::MAX as f64 + 1.,
(false, 16) => u16::MAX as f64 + 1.,
(false, 32) => u32::MAX as f64 + 1.,
(false, 64) => (u64::MAX as u128 + 1) as f64,
_ => unimplemented!(
"unexpected {} output size of {} bits for 64-bit input",
if signed { "signed" } else { "unsigned" },
out_bits
),
};
lower_constant_f64(self.lower_ctx, tmp, max);
} else {
unimplemented!(
"unexpected input size for max_fp_value: {} (signed: {}, output size: {})",
in_bits,
signed,
out_bits
);
}
tmp.to_reg()
}
fn min_fp_value_sat(&mut self, signed: bool, in_bits: u8, out_bits: u8) -> Reg {
let tmp = self.lower_ctx.alloc_tmp(I8X16).only_reg().unwrap();
let min: f64 = match (out_bits, signed) {
(32, true) => i32::MIN as f64,
(32, false) => 0.0,
(64, true) => i64::MIN as f64,
(64, false) => 0.0,
_ => unimplemented!(
"unexpected {} output size of {} bits",
if signed { "signed" } else { "unsigned" },
out_bits
),
};
if in_bits == 32 {
lower_constant_f32(self.lower_ctx, tmp, min as f32)
} else if in_bits == 64 {
lower_constant_f64(self.lower_ctx, tmp, min)
} else {
unimplemented!(
"unexpected input size for min_fp_value_sat: {} (signed: {}, output size: {})",
in_bits,
signed,
out_bits
);
}
tmp.to_reg()
}
fn max_fp_value_sat(&mut self, signed: bool, in_bits: u8, out_bits: u8) -> Reg {
let tmp = self.lower_ctx.alloc_tmp(I8X16).only_reg().unwrap();
let max = match (out_bits, signed) {
(32, true) => i32::MAX as f64,
(32, false) => u32::MAX as f64,
(64, true) => i64::MAX as f64,
(64, false) => u64::MAX as f64,
_ => unimplemented!(
"unexpected {} output size of {} bits",
if signed { "signed" } else { "unsigned" },
out_bits
),
};
if in_bits == 32 {
lower_constant_f32(self.lower_ctx, tmp, max as f32)
} else if in_bits == 64 {
lower_constant_f64(self.lower_ctx, tmp, max)
} else {
unimplemented!(
"unexpected input size for max_fp_value_sat: {} (signed: {}, output size: {})",
in_bits,
signed,
out_bits
);
}
tmp.to_reg()
}
fn fpu_op_ri_ushr(&mut self, ty_bits: u8, shift: u8) -> FPUOpRI {
if ty_bits == 32 {
FPUOpRI::UShr32(FPURightShiftImm::maybe_from_u8(shift, ty_bits).unwrap())
} else if ty_bits == 64 {
FPUOpRI::UShr64(FPURightShiftImm::maybe_from_u8(shift, ty_bits).unwrap())
} else {
unimplemented!(
"unexpected input size for fpu_op_ri_ushr: {} (shift: {})",
ty_bits,
shift
);
}
}
fn fpu_op_ri_sli(&mut self, ty_bits: u8, shift: u8) -> FPUOpRI {
if ty_bits == 32 {
FPUOpRI::Sli32(FPULeftShiftImm::maybe_from_u8(shift, ty_bits).unwrap())
} else if ty_bits == 64 {
FPUOpRI::Sli64(FPULeftShiftImm::maybe_from_u8(shift, ty_bits).unwrap())
} else {
unimplemented!(
"unexpected input size for fpu_op_ri_sli: {} (shift: {})",
ty_bits,
shift
);
}
}
}


@@ -2,10 +2,9 @@
use super::lower::*;
use crate::binemit::CodeOffset;
-use crate::ir::condcodes::FloatCC;
use crate::ir::types::*;
use crate::ir::Inst as IRInst;
-use crate::ir::{InstructionData, Opcode, TrapCode};
+use crate::ir::{InstructionData, Opcode};
use crate::isa::aarch64::abi::*;
use crate::isa::aarch64::inst::*;
use crate::isa::aarch64::settings as aarch64_settings;
@@ -978,408 +977,13 @@ pub(crate) fn lower_insn_to_regs(
Opcode::Fma => implemented_in_isle(ctx),
+Opcode::Fcopysign => implemented_in_isle(ctx),
+Opcode::FcvtToUint | Opcode::FcvtToSint => implemented_in_isle(ctx),
+Opcode::FcvtFromUint | Opcode::FcvtFromSint => implemented_in_isle(ctx),
+Opcode::FcvtToUintSat | Opcode::FcvtToSintSat => implemented_in_isle(ctx),
Opcode::Fcopysign => {
// Copy the sign bit from inputs[1] to inputs[0]. We use the following sequence:
//
// This is a scalar Fcopysign.
// This uses scalar NEON operations for 64-bit and vector operations (2S) for 32-bit.
// In the latter case it still sets all bits except the lowest 32 to 0.
//
// mov vd, vn
// ushr vtmp, vm, #63 / #31
// sli vd, vtmp, #63 / #31
let ty = ctx.output_ty(insn, 0);
if ty != F32 && ty != F64 {
return Err(CodegenError::Unsupported(format!(
"Fcopysign: Unsupported type: {:?}",
ty
)));
}
let bits = ty_bits(ty) as u8;
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let tmp = ctx.alloc_tmp(F64).only_reg().unwrap();
// Copy LHS to rd.
ctx.emit(Inst::gen_move(rd, rn, ty));
// Copy the sign bit to the lowest bit in tmp.
let imm = FPURightShiftImm::maybe_from_u8(bits - 1, bits).unwrap();
ctx.emit(Inst::FpuRRI {
fpu_op: choose_32_64(ty, FPUOpRI::UShr32(imm), FPUOpRI::UShr64(imm)),
rd: tmp,
rn: rm,
});
// Insert the bit from tmp into the sign bit of rd.
let imm = FPULeftShiftImm::maybe_from_u8(bits - 1, bits).unwrap();
ctx.emit(Inst::FpuRRI {
fpu_op: choose_32_64(ty, FPUOpRI::Sli32(imm), FPUOpRI::Sli64(imm)),
rd,
rn: tmp.to_reg(),
});
}
Opcode::FcvtToUint | Opcode::FcvtToSint => {
let input_ty = ctx.input_ty(insn, 0);
let in_bits = ty_bits(input_ty);
let output_ty = ty.unwrap();
let out_bits = ty_bits(output_ty);
let signed = op == Opcode::FcvtToSint;
let op = match (signed, in_bits, out_bits) {
(false, 32, 8) | (false, 32, 16) | (false, 32, 32) => FpuToIntOp::F32ToU32,
(true, 32, 8) | (true, 32, 16) | (true, 32, 32) => FpuToIntOp::F32ToI32,
(false, 32, 64) => FpuToIntOp::F32ToU64,
(true, 32, 64) => FpuToIntOp::F32ToI64,
(false, 64, 8) | (false, 64, 16) | (false, 64, 32) => FpuToIntOp::F64ToU32,
(true, 64, 8) | (true, 64, 16) | (true, 64, 32) => FpuToIntOp::F64ToI32,
(false, 64, 64) => FpuToIntOp::F64ToU64,
(true, 64, 64) => FpuToIntOp::F64ToI64,
_ => {
return Err(CodegenError::Unsupported(format!(
"{}: Unsupported types: {:?} -> {:?}",
op, input_ty, output_ty
)))
}
};
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
// First, check the input: it's important to perform the NaN check before the
// in-bounds checks, per wasm semantics.
// Check that the input is not a NaN.
ctx.emit(Inst::FpuCmp {
size: ScalarSize::from_ty(input_ty),
rn,
rm: rn,
});
let trap_code = TrapCode::BadConversionToInteger;
ctx.emit(Inst::TrapIf {
trap_code,
kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::Unordered)),
});
let tmp = ctx.alloc_tmp(I8X16).only_reg().unwrap();
// Check that the input is in range, with "truncate towards zero" semantics. This means
// we allow values that are slightly out of range:
// - for signed conversions, we allow values strictly greater than INT_MIN-1 (when this
// can be represented), and strictly less than INT_MAX+1 (when this can be
// represented).
// - for unsigned conversions, we allow values strictly greater than -1, and strictly
// less than UINT_MAX+1 (when this can be represented).
if in_bits == 32 {
// From float32.
let (low_bound, low_cond, high_bound) = match (signed, out_bits) {
(true, 8) => (
i8::min_value() as f32 - 1.,
FloatCC::GreaterThan,
i8::max_value() as f32 + 1.,
),
(true, 16) => (
i16::min_value() as f32 - 1.,
FloatCC::GreaterThan,
i16::max_value() as f32 + 1.,
),
(true, 32) => (
i32::min_value() as f32, // I32_MIN - 1 isn't precisely representable as a f32.
FloatCC::GreaterThanOrEqual,
i32::max_value() as f32 + 1.,
),
(true, 64) => (
i64::min_value() as f32, // I64_MIN - 1 isn't precisely representable as a f32.
FloatCC::GreaterThanOrEqual,
i64::max_value() as f32 + 1.,
),
(false, 8) => (-1., FloatCC::GreaterThan, u8::max_value() as f32 + 1.),
(false, 16) => (-1., FloatCC::GreaterThan, u16::max_value() as f32 + 1.),
(false, 32) => (-1., FloatCC::GreaterThan, u32::max_value() as f32 + 1.),
(false, 64) => (-1., FloatCC::GreaterThan, u64::max_value() as f32 + 1.),
_ => unreachable!(),
};
// >= low_bound
lower_constant_f32(ctx, tmp, low_bound);
ctx.emit(Inst::FpuCmp {
size: ScalarSize::Size32,
rn,
rm: tmp.to_reg(),
});
let trap_code = TrapCode::IntegerOverflow;
ctx.emit(Inst::TrapIf {
trap_code,
kind: CondBrKind::Cond(lower_fp_condcode(low_cond).invert()),
});
// <= high_bound
lower_constant_f32(ctx, tmp, high_bound);
ctx.emit(Inst::FpuCmp {
size: ScalarSize::Size32,
rn,
rm: tmp.to_reg(),
});
let trap_code = TrapCode::IntegerOverflow;
ctx.emit(Inst::TrapIf {
trap_code,
kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::LessThan).invert()),
});
} else {
// From float64.
let (low_bound, low_cond, high_bound) = match (signed, out_bits) {
(true, 8) => (
i8::min_value() as f64 - 1.,
FloatCC::GreaterThan,
i8::max_value() as f64 + 1.,
),
(true, 16) => (
i16::min_value() as f64 - 1.,
FloatCC::GreaterThan,
i16::max_value() as f64 + 1.,
),
(true, 32) => (
i32::min_value() as f64 - 1.,
FloatCC::GreaterThan,
i32::max_value() as f64 + 1.,
),
(true, 64) => (
i64::min_value() as f64, // I64_MIN - 1 is not precisely representable as an f64.
FloatCC::GreaterThanOrEqual,
i64::max_value() as f64 + 1.,
),
(false, 8) => (-1., FloatCC::GreaterThan, u8::max_value() as f64 + 1.),
(false, 16) => (-1., FloatCC::GreaterThan, u16::max_value() as f64 + 1.),
(false, 32) => (-1., FloatCC::GreaterThan, u32::max_value() as f64 + 1.),
(false, 64) => (-1., FloatCC::GreaterThan, u64::max_value() as f64 + 1.),
_ => unreachable!(),
};
// >= low_bound
lower_constant_f64(ctx, tmp, low_bound);
ctx.emit(Inst::FpuCmp {
size: ScalarSize::Size64,
rn,
rm: tmp.to_reg(),
});
let trap_code = TrapCode::IntegerOverflow;
ctx.emit(Inst::TrapIf {
trap_code,
kind: CondBrKind::Cond(lower_fp_condcode(low_cond).invert()),
});
// <= high_bound
lower_constant_f64(ctx, tmp, high_bound);
ctx.emit(Inst::FpuCmp {
size: ScalarSize::Size64,
rn,
rm: tmp.to_reg(),
});
let trap_code = TrapCode::IntegerOverflow;
ctx.emit(Inst::TrapIf {
trap_code,
kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::LessThan).invert()),
});
};
// Do the conversion.
ctx.emit(Inst::FpuToInt { op, rd, rn });
}
Opcode::FcvtFromUint | Opcode::FcvtFromSint => {
let input_ty = ctx.input_ty(insn, 0);
let ty = ty.unwrap();
let signed = op == Opcode::FcvtFromSint;
let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
if ty.is_vector() {
if input_ty.lane_bits() != ty.lane_bits() {
return Err(CodegenError::Unsupported(format!(
"{}: Unsupported types: {:?} -> {:?}",
op, input_ty, ty
)));
}
let op = if signed {
VecMisc2::Scvtf
} else {
VecMisc2::Ucvtf
};
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
ctx.emit(Inst::VecMisc {
op,
rd,
rn,
size: VectorSize::from_ty(ty),
});
} else {
let in_bits = ty_bits(input_ty);
let out_bits = ty_bits(ty);
let op = match (signed, in_bits, out_bits) {
(false, 8, 32) | (false, 16, 32) | (false, 32, 32) => IntToFpuOp::U32ToF32,
(true, 8, 32) | (true, 16, 32) | (true, 32, 32) => IntToFpuOp::I32ToF32,
(false, 8, 64) | (false, 16, 64) | (false, 32, 64) => IntToFpuOp::U32ToF64,
(true, 8, 64) | (true, 16, 64) | (true, 32, 64) => IntToFpuOp::I32ToF64,
(false, 64, 32) => IntToFpuOp::U64ToF32,
(true, 64, 32) => IntToFpuOp::I64ToF32,
(false, 64, 64) => IntToFpuOp::U64ToF64,
(true, 64, 64) => IntToFpuOp::I64ToF64,
_ => {
return Err(CodegenError::Unsupported(format!(
"{}: Unsupported types: {:?} -> {:?}",
op, input_ty, ty
)))
}
};
let narrow_mode = match (signed, in_bits) {
(false, 8) | (false, 16) | (false, 32) => NarrowValueMode::ZeroExtend32,
(true, 8) | (true, 16) | (true, 32) => NarrowValueMode::SignExtend32,
(false, 64) => NarrowValueMode::ZeroExtend64,
(true, 64) => NarrowValueMode::SignExtend64,
_ => unreachable!(),
};
let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
ctx.emit(Inst::IntToFpu { op, rd, rn });
}
}
Opcode::FcvtToUintSat | Opcode::FcvtToSintSat => {
let in_ty = ctx.input_ty(insn, 0);
let ty = ty.unwrap();
let out_signed = op == Opcode::FcvtToSintSat;
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
if ty.is_vector() {
if in_ty.lane_bits() != ty.lane_bits() {
return Err(CodegenError::Unsupported(format!(
"{}: Unsupported types: {:?} -> {:?}",
op, in_ty, ty
)));
}
let op = if out_signed {
VecMisc2::Fcvtzs
} else {
VecMisc2::Fcvtzu
};
ctx.emit(Inst::VecMisc {
op,
rd,
rn,
size: VectorSize::from_ty(ty),
});
} else {
let in_bits = ty_bits(in_ty);
let out_bits = ty_bits(ty);
// FIMM Vtmp1, u32::MAX or u64::MAX or i32::MAX or i64::MAX
// FMIN Vtmp2, Vin, Vtmp1
// FIMM Vtmp1, 0 or 0 or i32::MIN or i64::MIN
// FMAX Vtmp2, Vtmp2, Vtmp1
// (if signed) FIMM Vtmp1, 0
// FCMP Vin, Vin
// FCSEL Vtmp2, Vtmp1, Vtmp2, NE // on NaN, select 0
// convert Rout, Vtmp2
assert!(in_ty.is_float() && (in_bits == 32 || in_bits == 64));
assert!(out_bits == 32 || out_bits == 64);
let min: f64 = match (out_bits, out_signed) {
(32, true) => std::i32::MIN as f64,
(32, false) => 0.0,
(64, true) => std::i64::MIN as f64,
(64, false) => 0.0,
_ => unreachable!(),
};
let max = match (out_bits, out_signed) {
(32, true) => std::i32::MAX as f64,
(32, false) => std::u32::MAX as f64,
(64, true) => std::i64::MAX as f64,
(64, false) => std::u64::MAX as f64,
_ => unreachable!(),
};
let rtmp1 = ctx.alloc_tmp(in_ty).only_reg().unwrap();
let rtmp2 = ctx.alloc_tmp(in_ty).only_reg().unwrap();
if in_bits == 32 {
lower_constant_f32(ctx, rtmp1, max as f32);
} else {
lower_constant_f64(ctx, rtmp1, max);
}
ctx.emit(Inst::FpuRRR {
fpu_op: FPUOp2::Min,
size: ScalarSize::from_ty(in_ty),
rd: rtmp2,
rn,
rm: rtmp1.to_reg(),
});
if in_bits == 32 {
lower_constant_f32(ctx, rtmp1, min as f32);
} else {
lower_constant_f64(ctx, rtmp1, min);
}
ctx.emit(Inst::FpuRRR {
fpu_op: FPUOp2::Max,
size: ScalarSize::from_ty(in_ty),
rd: rtmp2,
rn: rtmp2.to_reg(),
rm: rtmp1.to_reg(),
});
if out_signed {
if in_bits == 32 {
lower_constant_f32(ctx, rtmp1, 0.0);
} else {
lower_constant_f64(ctx, rtmp1, 0.0);
}
}
ctx.emit(Inst::FpuCmp {
size: ScalarSize::from_ty(in_ty),
rn,
rm: rn,
});
if in_bits == 32 {
ctx.emit(Inst::FpuCSel32 {
rd: rtmp2,
rn: rtmp1.to_reg(),
rm: rtmp2.to_reg(),
cond: Cond::Ne,
});
} else {
ctx.emit(Inst::FpuCSel64 {
rd: rtmp2,
rn: rtmp1.to_reg(),
rm: rtmp2.to_reg(),
cond: Cond::Ne,
});
}
let cvt = match (in_bits, out_bits, out_signed) {
(32, 32, false) => FpuToIntOp::F32ToU32,
(32, 32, true) => FpuToIntOp::F32ToI32,
(32, 64, false) => FpuToIntOp::F32ToU64,
(32, 64, true) => FpuToIntOp::F32ToI64,
(64, 32, false) => FpuToIntOp::F64ToU32,
(64, 32, true) => FpuToIntOp::F64ToI32,
(64, 64, false) => FpuToIntOp::F64ToU64,
(64, 64, true) => FpuToIntOp::F64ToI64,
_ => unreachable!(),
};
ctx.emit(Inst::FpuToInt {
op: cvt,
rd,
rn: rtmp2.to_reg(),
});
}
}
Opcode::IaddIfcout => {
// This is a two-output instruction that is needed for the


@@ -9,8 +9,8 @@ block0(v0: i8):
}
; block0:
-; uxtb w4, w0
-; ucvtf s0, w4
+; uxtb w3, w0
+; ucvtf s0, w3
; ret
function u0:0(i8) -> f64 {
@@ -20,8 +20,8 @@ block0(v0: i8):
}
; block0:
-; uxtb w4, w0
-; ucvtf d0, w4
+; uxtb w3, w0
+; ucvtf d0, w3
; ret
function u0:0(i16) -> f32 {
@@ -31,8 +31,8 @@ block0(v0: i16):
}
; block0:
-; uxth w4, w0
-; ucvtf s0, w4
+; uxth w3, w0
+; ucvtf s0, w3
; ret
function u0:0(i16) -> f64 {
@@ -42,8 +42,8 @@ block0(v0: i16):
}
; block0:
-; uxth w4, w0
-; ucvtf d0, w4
+; uxth w3, w0
+; ucvtf d0, w3
; ret
function u0:0(f32) -> i8 {
@@ -55,13 +55,13 @@ block0(v0: f32):
; block0:
; fcmp s0, s0
; b.vc 8 ; udf
-; fmov s6, #-1
-; fcmp s0, s6
+; fmov s5, #-1
+; fcmp s0, s5
; b.gt 8 ; udf
; movz x10, #17280, LSL #16
-; fmov s6, w10
-; fcmp s0, s6
-; b.mi 8 ; udf
+; fmov s18, w10
+; fcmp s0, s18
+; b.lt 8 ; udf
; fcvtzu w0, s0
; ret
@@ -74,13 +74,13 @@ block0(v0: f64):
; block0:
; fcmp d0, d0
; b.vc 8 ; udf
-; fmov d6, #-1
-; fcmp d0, d6
+; fmov d5, #-1
+; fcmp d0, d5
; b.gt 8 ; udf
; movz x10, #16496, LSL #48
-; fmov d6, x10
-; fcmp d0, d6
-; b.mi 8 ; udf
+; fmov d18, x10
+; fcmp d0, d18
+; b.lt 8 ; udf
; fcvtzu w0, d0
; ret
@@ -93,13 +93,13 @@ block0(v0: f32):
; block0:
; fcmp s0, s0
; b.vc 8 ; udf
-; fmov s6, #-1
-; fcmp s0, s6
+; fmov s5, #-1
+; fcmp s0, s5
; b.gt 8 ; udf
; movz x10, #18304, LSL #16
-; fmov s6, w10
-; fcmp s0, s6
-; b.mi 8 ; udf
+; fmov s18, w10
+; fcmp s0, s18
+; b.lt 8 ; udf
; fcvtzu w0, s0
; ret
@@ -112,13 +112,13 @@ block0(v0: f64):
; block0:
; fcmp d0, d0
; b.vc 8 ; udf
-; fmov d6, #-1
-; fcmp d0, d6
+; fmov d5, #-1
+; fcmp d0, d5
; b.gt 8 ; udf
; movz x10, #16624, LSL #48
-; fmov d6, x10
-; fcmp d0, d6
-; b.mi 8 ; udf
+; fmov d18, x10
+; fcmp d0, d18
+; b.lt 8 ; udf
; fcvtzu w0, d0
; ret


@@ -333,13 +333,13 @@ block0(v0: f32):
; block0:
; fcmp s0, s0
; b.vc 8 ; udf
-; fmov s6, #-1
-; fcmp s0, s6
+; fmov s5, #-1
+; fcmp s0, s5
; b.gt 8 ; udf
; movz x10, #20352, LSL #16
-; fmov s6, w10
-; fcmp s0, s6
-; b.mi 8 ; udf
+; fmov s18, w10
+; fcmp s0, s18
+; b.lt 8 ; udf
; fcvtzu w0, s0
; ret
@@ -352,14 +352,14 @@ block0(v0: f32):
; block0:
; fcmp s0, s0
; b.vc 8 ; udf
-; movz x7, #52992, LSL #16
-; fmov s7, w7
-; fcmp s0, s7
+; movz x6, #52992, LSL #16
+; fmov s6, w6
+; fcmp s0, s6
; b.ge 8 ; udf
; movz x12, #20224, LSL #16
-; fmov s7, w12
-; fcmp s0, s7
-; b.mi 8 ; udf
+; fmov s20, w12
+; fcmp s0, s20
+; b.lt 8 ; udf
; fcvtzs w0, s0
; ret
@@ -372,13 +372,13 @@ block0(v0: f32):
; block0:
; fcmp s0, s0
; b.vc 8 ; udf
-; fmov s6, #-1
-; fcmp s0, s6
+; fmov s5, #-1
+; fcmp s0, s5
; b.gt 8 ; udf
; movz x10, #24448, LSL #16
-; fmov s6, w10
-; fcmp s0, s6
-; b.mi 8 ; udf
+; fmov s18, w10
+; fcmp s0, s18
+; b.lt 8 ; udf
; fcvtzu x0, s0
; ret
@@ -391,14 +391,14 @@ block0(v0: f32):
; block0:
; fcmp s0, s0
; b.vc 8 ; udf
-; movz x7, #57088, LSL #16
-; fmov s7, w7
-; fcmp s0, s7
+; movz x6, #57088, LSL #16
+; fmov s6, w6
+; fcmp s0, s6
; b.ge 8 ; udf
; movz x12, #24320, LSL #16
-; fmov s7, w12
-; fcmp s0, s7
-; b.mi 8 ; udf
+; fmov s20, w12
+; fcmp s0, s20
+; b.lt 8 ; udf
; fcvtzs x0, s0
; ret
@@ -411,13 +411,13 @@ block0(v0: f64):
; block0:
; fcmp d0, d0
; b.vc 8 ; udf
-; fmov d6, #-1
-; fcmp d0, d6
+; fmov d5, #-1
+; fcmp d0, d5
; b.gt 8 ; udf
; movz x10, #16880, LSL #48
-; fmov d6, x10
-; fcmp d0, d6
-; b.mi 8 ; udf
+; fmov d18, x10
+; fcmp d0, d18
+; b.lt 8 ; udf
; fcvtzu w0, d0
; ret
@@ -430,13 +430,13 @@ block0(v0: f64):
; block0:
; fcmp d0, d0
; b.vc 8 ; udf
-; ldr d6, pc+8 ; b 12 ; data.f64 -2147483649
-; fcmp d0, d6
+; ldr d5, pc+8 ; b 12 ; data.f64 -2147483649
+; fcmp d0, d5
; b.gt 8 ; udf
; movz x10, #16864, LSL #48
-; fmov d6, x10
-; fcmp d0, d6
-; b.mi 8 ; udf
+; fmov d18, x10
+; fcmp d0, d18
+; b.lt 8 ; udf
; fcvtzs w0, d0
; ret
@@ -449,13 +449,13 @@ block0(v0: f64):
; block0:
; fcmp d0, d0
; b.vc 8 ; udf
-; fmov d6, #-1
-; fcmp d0, d6
+; fmov d5, #-1
+; fcmp d0, d5
; b.gt 8 ; udf
; movz x10, #17392, LSL #48
-; fmov d6, x10
-; fcmp d0, d6
-; b.mi 8 ; udf
+; fmov d18, x10
+; fcmp d0, d18
+; b.lt 8 ; udf
; fcvtzu x0, d0
; ret
@@ -468,14 +468,14 @@ block0(v0: f64):
; block0:
; fcmp d0, d0
; b.vc 8 ; udf
-; movz x7, #50144, LSL #48
-; fmov d7, x7
-; fcmp d0, d7
+; movz x6, #50144, LSL #48
+; fmov d6, x6
+; fcmp d0, d6
; b.ge 8 ; udf
; movz x12, #17376, LSL #48
-; fmov d7, x12
-; fcmp d0, d7
-; b.mi 8 ; udf
+; fmov d20, x12
+; fcmp d0, d20
+; b.lt 8 ; udf
; fcvtzs x0, d0
; ret
@@ -566,14 +566,14 @@ block0(v0: f32):
}
; block0:
-; movz x6, #20352, LSL #16
-; fmov s5, w6
-; fmin s7, s0, s5
-; movi v5.2s, #0
-; fmax s7, s7, s5
+; movz x4, #20352, LSL #16
+; fmov s4, w4
+; fmin s7, s0, s4
+; movi v17.2s, #0
+; fmax s19, s7, s17
; fcmp s0, s0
-; fcsel s7, s5, s7, ne
-; fcvtzu w0, s7
+; fcsel s22, s17, s19, ne
+; fcvtzu w0, s22
; ret
function %f50(f32) -> i32 {
@@ -583,16 +583,16 @@ block0(v0: f32):
}
; block0:
-; movz x6, #20224, LSL #16
-; fmov s5, w6
-; fmin s7, s0, s5
+; movz x4, #20224, LSL #16
+; fmov s4, w4
+; fmin s7, s0, s4
; movz x10, #52992, LSL #16
-; fmov s5, w10
-; fmax s7, s7, s5
-; movi v5.2s, #0
+; fmov s18, w10
+; fmax s21, s7, s18
+; movi v23.16b, #0
; fcmp s0, s0
-; fcsel s7, s5, s7, ne
-; fcvtzs w0, s7
+; fcsel s26, s23, s21, ne
+; fcvtzs w0, s26
; ret
function %f51(f32) -> i64 {
@@ -602,14 +602,14 @@ block0(v0: f32):
}
; block0:
-; movz x6, #24448, LSL #16
-; fmov s5, w6
-; fmin s7, s0, s5
-; movi v5.2s, #0
-; fmax s7, s7, s5
+; movz x4, #24448, LSL #16
+; fmov s4, w4
+; fmin s7, s0, s4
+; movi v17.2s, #0
+; fmax s19, s7, s17
; fcmp s0, s0
-; fcsel s7, s5, s7, ne
-; fcvtzu x0, s7
+; fcsel s22, s17, s19, ne
+; fcvtzu x0, s22
; ret
function %f52(f32) -> i64 {
@@ -619,16 +619,16 @@ block0(v0: f32):
}
; block0:
-; movz x6, #24320, LSL #16
-; fmov s5, w6
-; fmin s7, s0, s5
+; movz x4, #24320, LSL #16
+; fmov s4, w4
+; fmin s7, s0, s4
; movz x10, #57088, LSL #16
-; fmov s5, w10
-; fmax s7, s7, s5
-; movi v5.2s, #0
+; fmov s18, w10
+; fmax s21, s7, s18
+; movi v23.16b, #0
; fcmp s0, s0
-; fcsel s7, s5, s7, ne
-; fcvtzs x0, s7
+; fcsel s26, s23, s21, ne
+; fcvtzs x0, s26
; ret
function %f53(f64) -> i32 {
@@ -638,13 +638,13 @@ block0(v0: f64):
}
; block0:
-; ldr d4, pc+8 ; b 12 ; data.f64 4294967295
-; fmin d6, d0, d4
-; movi v4.2s, #0
-; fmax d6, d6, d4
+; ldr d3, pc+8 ; b 12 ; data.f64 4294967295
+; fmin d5, d0, d3
+; movi v7.2s, #0
+; fmax d17, d5, d7
; fcmp d0, d0
-; fcsel d6, d4, d6, ne
-; fcvtzu w0, d6
+; fcsel d20, d7, d17, ne
+; fcvtzu w0, d20
; ret
function %f54(f64) -> i32 {
@@ -654,15 +654,15 @@ block0(v0: f64):
}
; block0:
-; ldr d4, pc+8 ; b 12 ; data.f64 2147483647
-; fmin d6, d0, d4
+; ldr d3, pc+8 ; b 12 ; data.f64 2147483647
+; fmin d5, d0, d3
; movz x8, #49632, LSL #48
-; fmov d4, x8
-; fmax d6, d6, d4
-; movi v4.2s, #0
+; fmov d16, x8
+; fmax d19, d5, d16
+; movi v21.16b, #0
; fcmp d0, d0
-; fcsel d6, d4, d6, ne
-; fcvtzs w0, d6
+; fcsel d24, d21, d19, ne
+; fcvtzs w0, d24
; ret
function %f55(f64) -> i64 {
@@ -672,14 +672,14 @@ block0(v0: f64):
}
; block0:
-; movz x6, #17392, LSL #48
-; fmov d5, x6
-; fmin d7, d0, d5
-; movi v5.2s, #0
-; fmax d7, d7, d5
+; movz x4, #17392, LSL #48
+; fmov d4, x4
+; fmin d7, d0, d4
+; movi v17.2s, #0
+; fmax d19, d7, d17
; fcmp d0, d0
-; fcsel d7, d5, d7, ne
-; fcvtzu x0, d7
+; fcsel d22, d17, d19, ne
+; fcvtzu x0, d22
; ret
function %f56(f64) -> i64 {
@@ -689,16 +689,16 @@ block0(v0: f64):
}
; block0:
-; movz x6, #17376, LSL #48
-; fmov d5, x6
-; fmin d7, d0, d5
+; movz x4, #17376, LSL #48
+; fmov d4, x4
+; fmin d7, d0, d4
; movz x10, #50144, LSL #48
-; fmov d5, x10
-; fmax d7, d7, d5
-; movi v5.2s, #0
+; fmov d18, x10
+; fmax d21, d7, d18
+; movi v23.16b, #0
; fcmp d0, d0
-; fcsel d7, d5, d7, ne
-; fcvtzs x0, d7
+; fcsel d26, d23, d21, ne
+; fcvtzs x0, d26
; ret
function %f57(f32x2) -> f32x2 {
@@ -946,3 +946,36 @@ block0(v0: f64x2, v1: f64x2, v2: f64x2):
; mov v0.16b, v2.16b
; fmla v0.2d, v17.2d, v1.2d
; ret
function %f81(f32x2, f32x2) -> f32x2 {
block0(v0: f32x2, v1: f32x2):
v2 = fcopysign v0, v1
return v2
}
; block0:
; ushr v7.2s, v1.2s, #31
; sli v0.2s, v7.2s, #31
; ret
function %f82(f32x4, f32x4) -> f32x4 {
block0(v0: f32x4, v1: f32x4):
v2 = fcopysign v0, v1
return v2
}
; block0:
; ushr v7.4s, v1.4s, #31
; sli v0.4s, v7.4s, #31
; ret
function %f83(f64x2, f64x2) -> f64x2 {
block0(v0: f64x2, v1: f64x2):
v2 = fcopysign v0, v1
return v2
}
; block0:
; ushr v7.2d, v1.2d, #63
; sli v0.2d, v7.2d, #63
; ret


@@ -0,0 +1,37 @@
test interpret
test run
target aarch64
; x86_64 and s390x do not support 64-bit vectors in `fcopysign`.
function %fcopysign_f32x2(f32x2, f32x2) -> f32x2 {
block0(v0: f32x2, v1: f32x2):
v2 = fcopysign v0, v1
return v2
}
; run: %fcopysign_f32x2([0x9.0 -0x9.0], [0x9.0 0x9.0]) == [0x9.0 0x9.0]
; run: %fcopysign_f32x2([0x9.0 -0x9.0], [-0x9.0 -0x9.0]) == [-0x9.0 -0x9.0]
; run: %fcopysign_f32x2([0x0.0 -0x0.0], [-0x0.0 0x0.0]) == [-0x0.0 0x0.0]
; F32 Inf
; run: %fcopysign_f32x2([Inf -Inf], [Inf Inf]) == [Inf Inf]
; run: %fcopysign_f32x2([Inf -Inf], [-Inf -Inf]) == [-Inf -Inf]
; F32 Epsilon / Max / Min Positive
; run: %fcopysign_f32x2([0x1.000000p-23 -0x1.000000p-23], [-0x0.0 0x0.0]) == [-0x1.000000p-23 0x1.000000p-23]
; run: %fcopysign_f32x2([0x1.fffffep127 -0x1.fffffep127], [-0x0.0 0x0.0]) == [-0x1.fffffep127 0x1.fffffep127]
; run: %fcopysign_f32x2([0x1.000000p-126 -0x1.000000p-126], [-0x0.0 0x0.0]) == [-0x1.000000p-126 0x1.000000p-126]
; F32 Subnormals
; run: %fcopysign_f32x2([0x0.800000p-126 -0x0.800000p-126], [-0x0.0 0x0.0]) == [-0x0.800000p-126 0x0.800000p-126]
; run: %fcopysign_f32x2([0x0.000002p-126 -0x0.000002p-126], [-0x0.0 0x0.0]) == [-0x0.000002p-126 0x0.000002p-126]
; F32 NaNs
; Unlike other operations, fcopysign is guaranteed to affect only the sign bit
; run: %fcopysign_f32x2([0x0.0 0x3.0], [-NaN +sNaN:0x1]) == [-0x0.0 0x3.0]
; run: %fcopysign_f32x2([Inf +NaN], [-NaN -NaN]) == [-Inf -NaN]
; run: %fcopysign_f32x2([-NaN +NaN:0x0], [+NaN -NaN]) == [+NaN -NaN:0x0]
; run: %fcopysign_f32x2([+NaN:0x1 +NaN:0x300001], [-NaN -NaN]) == [-NaN:0x1 -NaN:0x300001]
; run: %fcopysign_f32x2([-NaN:0x0 -NaN:0x1], [+NaN +NaN]) == [+NaN:0x0 +NaN:0x1]
; run: %fcopysign_f32x2([-NaN:0x300001 +sNaN:0x1], [+NaN -NaN]) == [+NaN:0x300001 -sNaN:0x1]
; run: %fcopysign_f32x2([-sNaN:0x1 +sNaN:0x200001], [+NaN -NaN]) == [+sNaN:0x1 -sNaN:0x200001]
; run: %fcopysign_f32x2([-sNaN:0x200001 -sNaN:0x200001], [+NaN +NaN]) == [+sNaN:0x200001 +sNaN:0x200001]


@@ -0,0 +1,63 @@
test interpret
test run
target s390x
target aarch64
; x86_64 does not support SIMD fcopysign.
function %fcopysign_f32x4(f32x4, f32x4) -> f32x4 {
block0(v0: f32x4, v1: f32x4):
v2 = fcopysign v0, v1
return v2
}
; run: %fcopysign_f32x4([0x9.0 -0x9.0 0x9.0 -0x9.0], [0x9.0 0x9.0 -0x9.0 -0x9.0]) == [0x9.0 0x9.0 -0x9.0 -0x9.0]
; run: %fcopysign_f32x4([0x0.0 -0x0.0 0x0.0 -0x0.0], [-0x0.0 0x0.0 -0x0.0 0x0.0]) == [-0x0.0 0x0.0 -0x0.0 0x0.0]
; F32 Inf
; run: %fcopysign_f32x4([Inf -Inf Inf -Inf], [Inf Inf -Inf -Inf]) == [Inf Inf -Inf -Inf]
; F32 Epsilon / Max / Min Positive
; run: %fcopysign_f32x4([0x1.000000p-23 -0x1.000000p-23 0x1.fffffep127 -0x1.fffffep127], [-0x0.0 0x0.0 -0x0.0 0x0.0]) == [-0x1.000000p-23 0x1.000000p-23 -0x1.fffffep127 0x1.fffffep127]
; run: %fcopysign_f32x4([0x1.000000p-126 -0x1.000000p-126 0x1.000000p-126 -0x1.000000p-126], [-0x0.0 0x0.0 -0x0.0 0x0.0]) == [-0x1.000000p-126 0x1.000000p-126 -0x1.000000p-126 0x1.000000p-126]
; F32 Subnormals
; run: %fcopysign_f32x4([0x0.800000p-126 -0x0.800000p-126 0x0.000002p-126 -0x0.000002p-126], [-0x0.0 0x0.0 -0x0.0 0x0.0]) == [-0x0.800000p-126 0x0.800000p-126 -0x0.000002p-126 0x0.000002p-126]
; F32 NaNs
; Unlike other operations, fcopysign is guaranteed to affect only the sign bit
; run: %fcopysign_f32x4([0x0.0 0x3.0 Inf +NaN], [-NaN +sNaN:0x1 -NaN -NaN]) == [-0x0.0 0x3.0 -Inf -NaN]
; run: %fcopysign_f32x4([-NaN +NaN:0x0 +NaN:0x1 +NaN:0x300001], [+NaN -NaN -NaN -NaN]) == [+NaN -NaN:0x0 -NaN:0x1 -NaN:0x300001]
; run: %fcopysign_f32x4([-NaN:0x0 -NaN:0x1 -NaN:0x300001 +sNaN:0x1], [+NaN +NaN +NaN -NaN]) == [+NaN:0x0 +NaN:0x1 +NaN:0x300001 -sNaN:0x1]
; run: %fcopysign_f32x4([-sNaN:0x1 +sNaN:0x200001 -sNaN:0x200001 -sNaN:0x200001], [+NaN -NaN +NaN +NaN]) == [+sNaN:0x1 -sNaN:0x200001 +sNaN:0x200001 +sNaN:0x200001]
function %fcopysign_f64x2(f64x2, f64x2) -> f64x2 {
block0(v0: f64x2, v1: f64x2):
v2 = fcopysign v0, v1
return v2
}
; run: %fcopysign_f64x2([0x9.0 -0x9.0], [0x9.0 0x9.0]) == [0x9.0 0x9.0]
; run: %fcopysign_f64x2([0x9.0 -0x9.0], [-0x9.0 -0x9.0]) == [-0x9.0 -0x9.0]
; run: %fcopysign_f64x2([0x0.0 -0x0.0], [-0x0.0 0x0.0]) == [-0x0.0 0x0.0]
; F64 Inf
; run: %fcopysign_f64x2([Inf -Inf], [Inf Inf]) == [Inf Inf]
; run: %fcopysign_f64x2([Inf -Inf], [-Inf -Inf]) == [-Inf -Inf]
; F64 Epsilon / Max / Min Positive
; run: %fcopysign_f64x2([0x1.0000000000000p-52 -0x1.0000000000000p-52], [-0x0.0 0x0.0]) == [-0x1.0000000000000p-52 0x1.0000000000000p-52]
; run: %fcopysign_f64x2([0x1.fffffffffffffp1023 -0x1.fffffffffffffp1023], [-0x0.0 0x0.0]) == [-0x1.fffffffffffffp1023 0x1.fffffffffffffp1023]
; run: %fcopysign_f64x2([0x1.0000000000000p-1022 -0x1.0000000000000p-1022], [-0x0.0 0x0.0]) == [-0x1.0000000000000p-1022 0x1.0000000000000p-1022]
; F64 Subnormals
; run: %fcopysign_f64x2([0x0.8000000000000p-1022 -0x0.8000000000000p-1022], [-0x0.0 0x0.0]) == [-0x0.8000000000000p-1022 0x0.8000000000000p-1022]
; run: %fcopysign_f64x2([0x0.0000000000001p-1022 -0x0.0000000000001p-1022], [-0x0.0 0x0.0]) == [-0x0.0000000000001p-1022 0x0.0000000000001p-1022]
; F64 NaNs
; Unlike other operations, fcopysign is guaranteed to affect only the sign bit
; run: %fcopysign_f64x2([0x0.0 0x3.0], [-NaN +sNaN:0x1]) == [-0x0.0 0x3.0]
; run: %fcopysign_f64x2([Inf +NaN], [-NaN -NaN]) == [-Inf -NaN]
; run: %fcopysign_f64x2([-NaN +NaN:0x0], [+NaN -NaN]) == [+NaN -NaN:0x0]
; run: %fcopysign_f64x2([+NaN:0x1 +NaN:0x4000000000001], [-NaN -NaN]) == [-NaN:0x1 -NaN:0x4000000000001]
; run: %fcopysign_f64x2([-NaN:0x0 -NaN:0x1], [+NaN +NaN]) == [+NaN:0x0 +NaN:0x1]
; run: %fcopysign_f64x2([-NaN:0x4000000000001 +sNaN:0x1], [+NaN -NaN]) == [+NaN:0x4000000000001 -sNaN:0x1]
; run: %fcopysign_f64x2([-sNaN:0x1 +sNaN:0x4000000000001], [+NaN -NaN]) == [+sNaN:0x1 -sNaN:0x4000000000001]
; run: %fcopysign_f64x2([-sNaN:0x4000000000001 -sNaN:0x4000000000001], [+NaN +NaN]) == [+sNaN:0x4000000000001 +sNaN:0x4000000000001]


@@ -808,7 +808,19 @@ where
}
Opcode::Fneg => assign(Value::neg(arg(0)?)?),
Opcode::Fabs => assign(Value::abs(arg(0)?)?),
-Opcode::Fcopysign => binary(Value::copysign, arg(0)?, arg(1)?)?,
+Opcode::Fcopysign => {
let arg0 = extractlanes(&arg(0)?, ctrl_ty)?;
let arg1 = extractlanes(&arg(1)?, ctrl_ty)?;
assign(vectorizelanes(
&arg0
.into_iter()
.zip(arg1.into_iter())
.map(|(x, y)| V::copysign(x, y))
.collect::<ValueResult<SimdVec<V>>>()?,
ctrl_ty,
)?)
}
Opcode::Fmin => assign(match (arg(0)?, arg(1)?) {
(a, _) if a.is_nan()? => a,
(_, b) if b.is_nan()? => b,