x64: Lower fcvt_to_{u,s}int{,_sat} in ISLE (#4704)

https://github.com/bytecodealliance/wasmtime/pull/4704
2022-08-16 09:03:50 -07:00
parent 2ce03cce08
commit 3c1490dd59
6 changed files with 446 additions and 281 deletions
--- a/cranelift/codegen/src/isa/x64/inst.isle
+++ b/cranelift/codegen/src/isa/x64/inst.isle
@@ -3047,6 +3047,10 @@
            (_ Unit (emit (MInst.GprToXmm (SseOpcode.Cvtsi2sd) x dst size))))
        dst))

+(decl x64_cvttps2dq (Type XmmMem) Xmm)
+(rule (x64_cvttps2dq ty x)
+      (xmm_unary_rm_r (SseOpcode.Cvttps2dq) x))
+
 (decl cvt_u64_to_float_seq (Type Gpr) Xmm)
 (rule (cvt_u64_to_float_seq ty src)
      (let ((size OperandSize (raw_operand_size_of_type ty))
@@ -3058,6 +3062,34 @@
            (_ Unit (emit (MInst.CvtUint64ToFloatSeq size src_copy dst tmp_gpr1 tmp_gpr2))))
        dst))

+(decl cvt_float_to_uint_seq (Type Value bool) Gpr)
+(rule (cvt_float_to_uint_seq out_ty src @ (value_type src_ty) is_saturating)
+      (let ((out_size OperandSize (raw_operand_size_of_type out_ty))
+            (src_size OperandSize (raw_operand_size_of_type src_ty))
+
+            (tmp WritableXmm (temp_writable_xmm))
+            (_ Unit (emit (gen_move src_ty tmp src)))
+
+            (dst WritableGpr (temp_writable_gpr))
+            (tmp_xmm WritableXmm (temp_writable_xmm))
+            (tmp_gpr WritableGpr (temp_writable_gpr))
+            (_ Unit (emit (MInst.CvtFloatToUintSeq out_size src_size is_saturating tmp dst tmp_gpr tmp_xmm))))
+        dst))
+
+(decl cvt_float_to_sint_seq (Type Value bool) Gpr)
+(rule (cvt_float_to_sint_seq out_ty src @ (value_type src_ty) is_saturating)
+      (let ((out_size OperandSize (raw_operand_size_of_type out_ty))
+            (src_size OperandSize (raw_operand_size_of_type src_ty))
+
+            (tmp WritableXmm (temp_writable_xmm))
+            (_ Unit (emit (gen_move src_ty tmp src)))
+
+            (dst WritableGpr (temp_writable_gpr))
+            (tmp_xmm WritableXmm (temp_writable_xmm))
+            (tmp_gpr WritableGpr (temp_writable_gpr))
+            (_ Unit (emit (MInst.CvtFloatToSintSeq out_size src_size is_saturating tmp dst tmp_gpr tmp_xmm))))
+        dst))
+
 (decl fcvt_uint_mask_const () VCodeConstant)
 (extern constructor fcvt_uint_mask_const fcvt_uint_mask_const)

--- a/cranelift/codegen/src/isa/x64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/x64/inst/mod.rs
@@ -408,58 +408,6 @@ impl Inst {
        Inst::XmmCmpRmR { op, src, dst }
    }

-    pub(crate) fn cvt_float_to_sint_seq(
-        src_size: OperandSize,
-        dst_size: OperandSize,
-        is_saturating: bool,
-        src: Writable<Reg>,
-        dst: Writable<Reg>,
-        tmp_gpr: Writable<Reg>,
-        tmp_xmm: Writable<Reg>,
-    ) -> Inst {
-        debug_assert!(src_size.is_one_of(&[OperandSize::Size32, OperandSize::Size64]));
-        debug_assert!(dst_size.is_one_of(&[OperandSize::Size32, OperandSize::Size64]));
-        debug_assert!(src.to_reg().class() == RegClass::Float);
-        debug_assert!(tmp_xmm.to_reg().class() == RegClass::Float);
-        debug_assert!(tmp_gpr.to_reg().class() == RegClass::Int);
-        debug_assert!(dst.to_reg().class() == RegClass::Int);
-        Inst::CvtFloatToSintSeq {
-            src_size,
-            dst_size,
-            is_saturating,
-            src: WritableXmm::from_writable_reg(src).unwrap(),
-            dst: WritableGpr::from_writable_reg(dst).unwrap(),
-            tmp_gpr: WritableGpr::from_writable_reg(tmp_gpr).unwrap(),
-            tmp_xmm: WritableXmm::from_writable_reg(tmp_xmm).unwrap(),
-        }
-    }
-
-    pub(crate) fn cvt_float_to_uint_seq(
-        src_size: OperandSize,
-        dst_size: OperandSize,
-        is_saturating: bool,
-        src: Writable<Reg>,
-        dst: Writable<Reg>,
-        tmp_gpr: Writable<Reg>,
-        tmp_xmm: Writable<Reg>,
-    ) -> Inst {
-        debug_assert!(src_size.is_one_of(&[OperandSize::Size32, OperandSize::Size64]));
-        debug_assert!(dst_size.is_one_of(&[OperandSize::Size32, OperandSize::Size64]));
-        debug_assert!(src.to_reg().class() == RegClass::Float);
-        debug_assert!(tmp_xmm.to_reg().class() == RegClass::Float);
-        debug_assert!(tmp_gpr.to_reg().class() == RegClass::Int);
-        debug_assert!(dst.to_reg().class() == RegClass::Int);
-        Inst::CvtFloatToUintSeq {
-            src_size,
-            dst_size,
-            is_saturating,
-            src: WritableXmm::from_writable_reg(src).unwrap(),
-            dst: WritableGpr::from_writable_reg(dst).unwrap(),
-            tmp_gpr: WritableGpr::from_writable_reg(tmp_gpr).unwrap(),
-            tmp_xmm: WritableXmm::from_writable_reg(tmp_xmm).unwrap(),
-        }
-    }
-
    #[allow(dead_code)]
    pub(crate) fn xmm_min_max_seq(
        size: OperandSize,
@@ -1257,7 +1205,7 @@ impl PrettyPrint for Inst {
                dst_size,
                tmp_xmm,
                tmp_gpr,
-                ..
+                is_saturating,
            } => {
                let src = pretty_print_reg(src.to_reg().to_reg(), src_size.to_bytes(), allocs);
                let dst = pretty_print_reg(dst.to_reg().to_reg(), dst_size.to_bytes(), allocs);
@@ -1266,9 +1214,10 @@ impl PrettyPrint for Inst {
                format!(
                    "{} {}, {}, {}, {}",
                    ljustify(format!(
-                        "cvt_float{}_to_sint{}_seq",
+                        "cvt_float{}_to_sint{}{}_seq",
                        src_size.to_bits(),
-                        dst_size.to_bits()
+                        dst_size.to_bits(),
+                        if *is_saturating { "_sat" } else { "" },
                    )),
                    src,
                    dst,
@@ -1284,7 +1233,7 @@ impl PrettyPrint for Inst {
                dst_size,
                tmp_gpr,
                tmp_xmm,
-                ..
+                is_saturating,
            } => {
                let src = pretty_print_reg(src.to_reg().to_reg(), src_size.to_bytes(), allocs);
                let dst = pretty_print_reg(dst.to_reg().to_reg(), dst_size.to_bytes(), allocs);
@@ -1293,9 +1242,10 @@ impl PrettyPrint for Inst {
                format!(
                    "{} {}, {}, {}, {}",
                    ljustify(format!(
-                        "cvt_float{}_to_uint{}_seq",
+                        "cvt_float{}_to_uint{}{}_seq",
                        src_size.to_bits(),
-                        dst_size.to_bits()
+                        dst_size.to_bits(),
+                        if *is_saturating { "_sat" } else { "" },
                    )),
                    src,
                    dst,
--- a/cranelift/codegen/src/isa/x64/lower.isle
+++ b/cranelift/codegen/src/isa/x64/lower.isle
@@ -3062,3 +3062,130 @@

        ;; add together the two converted values
        (x64_addps a_hi a_lo)))
+
+;; Rules for `fcvt_to_uint` and `fcvt_to_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower (has_type out_ty (fcvt_to_uint val @ (value_type (ty_scalar_float _)))))
+      (cvt_float_to_uint_seq out_ty val $false))
+
+(rule (lower (has_type out_ty (fcvt_to_uint_sat val @ (value_type (ty_scalar_float _)))))
+      (cvt_float_to_uint_seq out_ty val $true))
+
+(rule (lower (has_type out_ty (fcvt_to_sint val @ (value_type (ty_scalar_float _)))))
+      (cvt_float_to_sint_seq out_ty val $false))
+
+(rule (lower (has_type out_ty (fcvt_to_sint_sat val @ (value_type (ty_scalar_float _)))))
+      (cvt_float_to_sint_seq out_ty val $true))
+
+;; The x64 backend currently only supports these two type combinations.
+(rule (lower (has_type $I32X4 (fcvt_to_sint_sat val @ (value_type $F32X4))))
+      (let (;; Sets tmp to zero if float is NaN
+            (tmp Xmm (x64_cmpps val val (FcmpImm.Equal)))
+            (dst Xmm (x64_andps val tmp))
+
+            ;; Sets top bit of tmp if float is positive
+            ;; Setting up to set top bit on negative float values
+            (tmp Xmm (x64_pxor tmp dst))
+
+            ;; Convert the packed float to packed doubleword.
+            (dst Xmm (x64_cvttps2dq $F32X4 dst))
+
+            ;; Set top bit only if < 0
+            (tmp Xmm (x64_pand dst tmp))
+            (tmp Xmm (x64_psrad tmp (RegMemImm.Imm 31))))
+
+        ;; On overflow 0x80000000 is returned to a lane.
+        ;; Below sets positive overflow lanes to 0x7FFFFFFF
+        ;; Keeps negative overflow lanes as is.
+        (x64_pxor tmp dst)))
+
+;; The algorithm for converting floats to unsigned ints is a little tricky. The
+;; complication arises because we are converting from a signed 32-bit int with a positive
+;; integer range from 1..INT_MAX (0x1..0x7FFFFFFF) to an unsigned integer with an extended
+;; range from (INT_MAX+1)..UINT_MAX. It's this range from (INT_MAX+1)..UINT_MAX
+;; (0x80000000..0xFFFFFFFF) that needs to be accounted for as a special case since our
+;; conversion instruction (cvttps2dq) only converts as high as INT_MAX (0x7FFFFFFF), but
+;; which conveniently sets underflows and overflows (smaller than INT_MIN or larger than
+;; INT_MAX) to be INT_MAX+1 (0x80000000). Noting that the range (INT_MAX+1)..UINT_MAX includes
+;; precisely INT_MAX values we can correctly account for and convert every value in this range
+;; if we simply subtract INT_MAX+1 before doing the cvttps2dq conversion. After the subtraction
+;; every value originally (INT_MAX+1)..UINT_MAX is now the range (0..INT_MAX).
+;; After the conversion we add INT_MAX+1 back to this converted value, noting again that
+;; values we are trying to account for were already set to INT_MAX+1 during the original conversion.
+;; We simply have to create a mask and make sure we are adding together only the lanes that need
+;; to be accounted for. Putting it all together, the steps are:
+;;
+;; Step 1 - Account for NaN and negative floats by setting these src values to zero.
+;; Step 2 - Make a copy (tmp1) of the src value since we need to convert twice for
+;;          reasons described above.
+;; Step 3 - Convert the original src values. This will convert properly all floats up to INT_MAX
+;; Step 4 - Subtract INT_MAX+1 from the copy set (tmp1). Note, all zero and negative values are those
+;;          values that were originally in the range (0..INT_MAX). This will come in handy during
+;;          step 7 when we zero negative lanes.
+;; Step 5 - Create a bit mask for tmp1 that will correspond to all lanes originally less than
+;;          UINT_MAX that are now less than INT_MAX thanks to the subtraction.
+;; Step 6 - Convert the second set of values (tmp1)
+;; Step 7 - Prep the converted second set by zeroing out negative lanes (these have already been
+;;          converted correctly with the first set) and by setting overflow lanes to 0x7FFFFFFF
+;;          as this will allow us to properly saturate overflow lanes when adding to 0x80000000
+;; Step 8 - Add the original converted src and the converted tmp1 where float values originally less
+;;          than or equal to INT_MAX will be unchanged, float values originally between INT_MAX+1 and
+;;          UINT_MAX will add together (INT_MAX+1) + (SRC - (INT_MAX+1)), and float values originally
+;;          greater than UINT_MAX will be saturated to UINT_MAX (0xFFFFFFFF) after adding (0x80000000 + 0x7FFFFFFF).
+;;
+;;
+;; The table below illustrates the result after each step where it matters for the converted set.
+;; Note the original value range (original src set) is the final dst in Step 8:
+;;
+;; Original src set:
+;; | Original Value Range |    Step 1    |         Step 3         |          Step 8           |
+;; |  -FLT_MIN..FLT_MAX   | 0.0..FLT_MAX | 0..INT_MAX(w/overflow) | 0..UINT_MAX(w/saturation) |
+;;
+;; Copied src set (tmp1):
+;; |    Step 2    |                  Step 4                  |
+;; | 0.0..FLT_MAX | (0.0-(INT_MAX+1))..(FLT_MAX-(INT_MAX+1)) |
+;;
+;; |                       Step 6                        |                 Step 7                 |
+;; | (0-(INT_MAX+1))..(UINT_MAX-(INT_MAX+1))(w/overflow) | ((INT_MAX+1)-(INT_MAX+1))..(INT_MAX+1) |
+(rule (lower (has_type $I32X4 (fcvt_to_uint_sat val @ (value_type $F32X4))))
+      (let (;; Converting to unsigned int so if float src is negative or NaN
+            ;; will first set to zero.
+            (tmp2 Xmm (x64_pxor val val)) ;; make a zero
+            (dst Xmm (x64_maxps val tmp2))
+
+            ;; Set tmp2 to INT_MAX+1. It is important to note here that although it looks
+            ;; like we are only converting INT_MAX (0x7FFFFFFF), single precision
+            ;; IEEE-754 floats can only accurately represent contiguous integers up to
+            ;; 2^24; outside of this range a value rounds to the closest integer
+            ;; that can be represented. In the case of INT_MAX, this value gets
+            ;; represented as 0x4f000000 which is the integer value (INT_MAX+1).
+            (tmp2 Xmm (x64_pcmpeqd tmp2 tmp2))
+            (tmp2 Xmm (x64_psrld tmp2 (RegMemImm.Imm 1)))
+            (tmp2 Xmm (x64_cvtdq2ps tmp2))
+
+            ;; Make a copy of these lanes and then do the first conversion.
+            ;; Overflow lanes greater than the maximum allowed signed value will
+            ;; set to 0x80000000. Negative and NaN lanes will be 0x0
+            (tmp1 Xmm dst)
+            (dst Xmm (x64_cvttps2dq $F32X4 dst))
+
+            ;; Set lanes to src - max_signed_int
+            (tmp1 Xmm (x64_subps tmp1 tmp2))
+
+            ;; Create mask for all positive lanes to saturate (i.e. greater than
+            ;; or equal to the maximum allowable unsigned int).
+            (tmp2 Xmm (x64_cmpps tmp2 tmp1 (FcmpImm.LessThanOrEqual)))
+
+            ;; Convert the set of lanes that have the max_signed_int factored out.
+            (tmp1 Xmm (x64_cvttps2dq $F32X4 tmp1))
+
+            ;; Prepare converted lanes by zeroing negative lanes and prepping lanes
+            ;; that have positive overflow (based on the mask) by setting these lanes
+            ;; to 0x7FFFFFFF
+            (tmp1 Xmm (x64_pxor tmp1 tmp2))
+            (tmp2 Xmm (x64_pxor tmp2 tmp2)) ;; make another zero
+            (tmp1 Xmm (x64_pmaxsd tmp1 tmp2)))
+
+        ;; Add this second set of converted lanes to the original to properly handle
+        ;; values greater than max signed int.
+        (x64_paddd tmp1 dst)))
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -557,232 +557,14 @@ fn lower_insn_to_regs(
        | Opcode::SelectifSpectreGuard
        | Opcode::FcvtFromSint
        | Opcode::FcvtLowFromSint
-        | Opcode::FcvtFromUint => {
+        | Opcode::FcvtFromUint
+        | Opcode::FcvtToUint
+        | Opcode::FcvtToSint
+        | Opcode::FcvtToUintSat
+        | Opcode::FcvtToSintSat => {
            implemented_in_isle(ctx);
        }

-        Opcode::FcvtToUint | Opcode::FcvtToUintSat | Opcode::FcvtToSint | Opcode::FcvtToSintSat => {
-            let src = put_input_in_reg(ctx, inputs[0]);
-            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-
-            let input_ty = ctx.input_ty(insn, 0);
-            if !input_ty.is_vector() {
-                let src_size = if input_ty == types::F32 {
-                    OperandSize::Size32
-                } else {
-                    assert_eq!(input_ty, types::F64);
-                    OperandSize::Size64
-                };
-
-                let output_ty = ty.unwrap();
-                let dst_size = if output_ty == types::I32 {
-                    OperandSize::Size32
-                } else {
-                    assert_eq!(output_ty, types::I64);
-                    OperandSize::Size64
-                };
-
-                let to_signed = op == Opcode::FcvtToSint || op == Opcode::FcvtToSintSat;
-                let is_sat = op == Opcode::FcvtToUintSat || op == Opcode::FcvtToSintSat;
-
-                let src_copy = ctx.alloc_tmp(input_ty).only_reg().unwrap();
-                ctx.emit(Inst::gen_move(src_copy, src, input_ty));
-
-                let tmp_xmm = ctx.alloc_tmp(input_ty).only_reg().unwrap();
-                let tmp_gpr = ctx.alloc_tmp(output_ty).only_reg().unwrap();
-
-                if to_signed {
-                    ctx.emit(Inst::cvt_float_to_sint_seq(
-                        src_size, dst_size, is_sat, src_copy, dst, tmp_gpr, tmp_xmm,
-                    ));
-                } else {
-                    ctx.emit(Inst::cvt_float_to_uint_seq(
-                        src_size, dst_size, is_sat, src_copy, dst, tmp_gpr, tmp_xmm,
-                    ));
-                }
-            } else {
-                if op == Opcode::FcvtToSintSat {
-                    // Sets destination to zero if float is NaN
-                    assert_eq!(types::F32X4, ctx.input_ty(insn, 0));
-                    let tmp = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
-                    ctx.emit(Inst::xmm_unary_rm_r(
-                        SseOpcode::Movapd,
-                        RegMem::reg(src),
-                        tmp,
-                    ));
-                    ctx.emit(Inst::gen_move(dst, src, input_ty));
-                    let cond = FcmpImm::from(FloatCC::Equal);
-                    ctx.emit(Inst::xmm_rm_r_imm(
-                        SseOpcode::Cmpps,
-                        RegMem::reg(tmp.to_reg()),
-                        tmp,
-                        cond.encode(),
-                        OperandSize::Size32,
-                    ));
-                    ctx.emit(Inst::xmm_rm_r(
-                        SseOpcode::Andps,
-                        RegMem::reg(tmp.to_reg()),
-                        dst,
-                    ));
-
-                    // Sets top bit of tmp if float is positive
-                    // Setting up to set top bit on negative float values
-                    ctx.emit(Inst::xmm_rm_r(
-                        SseOpcode::Pxor,
-                        RegMem::reg(dst.to_reg()),
-                        tmp,
-                    ));
-
-                    // Convert the packed float to packed doubleword.
-                    ctx.emit(Inst::xmm_unary_rm_r(
-                        SseOpcode::Cvttps2dq,
-                        RegMem::reg(dst.to_reg()),
-                        dst,
-                    ));
-
-                    // Set top bit only if < 0
-                    // Saturate lane with sign (top) bit.
-                    ctx.emit(Inst::xmm_rm_r(
-                        SseOpcode::Pand,
-                        RegMem::reg(dst.to_reg()),
-                        tmp,
-                    ));
-                    ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrad, RegMemImm::imm(31), tmp));
-
-                    // On overflow 0x80000000 is returned to a lane.
-                    // Below sets positive overflow lanes to 0x7FFFFFFF
-                    // Keeps negative overflow lanes as is.
-                    ctx.emit(Inst::xmm_rm_r(
-                        SseOpcode::Pxor,
-                        RegMem::reg(tmp.to_reg()),
-                        dst,
-                    ));
-                } else if op == Opcode::FcvtToUintSat {
-                    // The algorithm for converting floats to unsigned ints is a little tricky. The
-                    // complication arises because we are converting from a signed 64-bit int with a positive
-                    // integer range from 1..INT_MAX (0x1..0x7FFFFFFF) to an unsigned integer with an extended
-                    // range from (INT_MAX+1)..UINT_MAX. It's this range from (INT_MAX+1)..UINT_MAX
-                    // (0x80000000..0xFFFFFFFF) that needs to be accounted for as a special case since our
-                    // conversion instruction (cvttps2dq) only converts as high as INT_MAX (0x7FFFFFFF), but
-                    // which conveniently setting underflows and overflows (smaller than MIN_INT or larger than
-                    // MAX_INT) to be INT_MAX+1 (0x80000000). Nothing that the range (INT_MAX+1)..UINT_MAX includes
-                    // precisely INT_MAX values we can correctly account for and convert every value in this range
-                    // if we simply subtract INT_MAX+1 before doing the cvttps2dq conversion. After the subtraction
-                    // every value originally (INT_MAX+1)..UINT_MAX is now the range (0..INT_MAX).
-                    // After the conversion we add INT_MAX+1 back to this converted value, noting again that
-                    // values we are trying to account for were already set to INT_MAX+1 during the original conversion.
-                    // We simply have to create a mask and make sure we are adding together only the lanes that need
-                    // to be accounted for. Digesting it all the steps then are:
-                    //
-                    // Step 1 - Account for NaN and negative floats by setting these src values to zero.
-                    // Step 2 - Make a copy (tmp1) of the src value since we need to convert twice for
-                    //          reasons described above.
-                    // Step 3 - Convert the original src values. This will convert properly all floats up to INT_MAX
-                    // Step 4 - Subtract INT_MAX from the copy set (tmp1). Note, all zero and negative values are those
-                    //          values that were originally in the range (0..INT_MAX). This will come in handy during
-                    //          step 7 when we zero negative lanes.
-                    // Step 5 - Create a bit mask for tmp1 that will correspond to all lanes originally less than
-                    //          UINT_MAX that are now less than INT_MAX thanks to the subtraction.
-                    // Step 6 - Convert the second set of values (tmp1)
-                    // Step 7 - Prep the converted second set by zeroing out negative lanes (these have already been
-                    //          converted correctly with the first set) and by setting overflow lanes to 0x7FFFFFFF
-                    //          as this will allow us to properly saturate overflow lanes when adding to 0x80000000
-                    // Step 8 - Add the orginal converted src and the converted tmp1 where float values originally less
-                    //          than and equal to INT_MAX will be unchanged, float values originally between INT_MAX+1 and
-                    //          UINT_MAX will add together (INT_MAX) + (SRC - INT_MAX), and float values originally
-                    //          greater than UINT_MAX will be saturated to UINT_MAX (0xFFFFFFFF) after adding (0x8000000 + 0x7FFFFFFF).
-                    //
-                    //
-                    // The table below illustrates the result after each step where it matters for the converted set.
-                    // Note the original value range (original src set) is the final dst in Step 8:
-                    //
-                    // Original src set:
-                    // | Original Value Range |    Step 1    |         Step 3         |          Step 8           |
-                    // |  -FLT_MIN..FLT_MAX   | 0.0..FLT_MAX | 0..INT_MAX(w/overflow) | 0..UINT_MAX(w/saturation) |
-                    //
-                    // Copied src set (tmp1):
-                    // |    Step 2    |                  Step 4                  |
-                    // | 0.0..FLT_MAX | (0.0-(INT_MAX+1))..(FLT_MAX-(INT_MAX+1)) |
-                    //
-                    // |                       Step 6                        |                 Step 7                 |
-                    // | (0-(INT_MAX+1))..(UINT_MAX-(INT_MAX+1))(w/overflow) | ((INT_MAX+1)-(INT_MAX+1))..(INT_MAX+1) |
-
-                    // Create temporaries
-                    assert_eq!(types::F32X4, ctx.input_ty(insn, 0));
-                    let tmp1 = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
-                    let tmp2 = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
-
-                    // Converting to unsigned int so if float src is negative or NaN
-                    // will first set to zero.
-                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp2), tmp2));
-                    ctx.emit(Inst::gen_move(dst, src, input_ty));
-                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Maxps, RegMem::from(tmp2), dst));
-
-                    // Set tmp2 to INT_MAX+1. It is important to note here that after it looks
-                    // like we are only converting INT_MAX (0x7FFFFFFF) but in fact because
-                    // single precision IEEE-754 floats can only accurately represent contingous
-                    // integers up to 2^23 and outside of this range it rounds to the closest
-                    // integer that it can represent. In the case of INT_MAX, this value gets
-                    // represented as 0x4f000000 which is the integer value (INT_MAX+1).
-
-                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Pcmpeqd, RegMem::from(tmp2), tmp2));
-                    ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrld, RegMemImm::imm(1), tmp2));
-                    ctx.emit(Inst::xmm_unary_rm_r(
-                        SseOpcode::Cvtdq2ps,
-                        RegMem::from(tmp2),
-                        tmp2,
-                    ));
-
-                    // Make a copy of these lanes and then do the first conversion.
-                    // Overflow lanes greater than the maximum allowed signed value will
-                    // set to 0x80000000. Negative and NaN lanes will be 0x0
-                    ctx.emit(Inst::xmm_mov(SseOpcode::Movaps, RegMem::from(dst), tmp1));
-                    ctx.emit(Inst::xmm_unary_rm_r(
-                        SseOpcode::Cvttps2dq,
-                        RegMem::from(dst),
-                        dst,
-                    ));
-
-                    // Set lanes to src - max_signed_int
-                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Subps, RegMem::from(tmp2), tmp1));
-
-                    // Create mask for all positive lanes to saturate (i.e. greater than
-                    // or equal to the maxmimum allowable unsigned int).
-                    let cond = FcmpImm::from(FloatCC::LessThanOrEqual);
-                    ctx.emit(Inst::xmm_rm_r_imm(
-                        SseOpcode::Cmpps,
-                        RegMem::from(tmp1),
-                        tmp2,
-                        cond.encode(),
-                        OperandSize::Size32,
-                    ));
-
-                    // Convert those set of lanes that have the max_signed_int factored out.
-                    ctx.emit(Inst::xmm_unary_rm_r(
-                        SseOpcode::Cvttps2dq,
-                        RegMem::from(tmp1),
-                        tmp1,
-                    ));
-
-                    // Prepare converted lanes by zeroing negative lanes and prepping lanes
-                    // that have positive overflow (based on the mask) by setting these lanes
-                    // to 0x7FFFFFFF
-                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp2), tmp1));
-                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp2), tmp2));
-                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmaxsd, RegMem::from(tmp2), tmp1));
-
-                    // Add this second set of converted lanes to the original to properly handle
-                    // values greater than max signed int.
-                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Paddd, RegMem::from(tmp1), dst));
-                } else {
-                    // Since this branch is also guarded by a check for vector types
-                    // neither Opcode::FcvtToUint nor Opcode::FcvtToSint can reach here
-                    // due to vector varients not existing. The first two branches will
-                    // cover all reachable cases.
-                    unreachable!();
-                }
-            }
-        }
        Opcode::IaddPairwise => {
            if let (Some(swiden_low), Some(swiden_high)) = (
                matches_input(ctx, inputs[0], Opcode::SwidenLow),
--- a/cranelift/filetests/filetests/isa/x64/fcvt.clif
+++ b/cranelift/filetests/filetests/isa/x64/fcvt.clif
@@ -200,3 +200,275 @@ block0(v0: i32x4):
 ;   popq    %rbp
 ;   ret

+function %f13(f32) -> i32 {
+block0(v0: f32):
+  v1 = fcvt_to_uint.i32 v0
+  return v1
+}
+
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   cvt_float32_to_uint32_seq %xmm0, %eax, %r10, %xmm6
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+
+function %f14(f32) -> i64 {
+block0(v0: f32):
+  v1 = fcvt_to_uint.i64 v0
+  return v1
+}
+
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   cvt_float32_to_uint64_seq %xmm0, %rax, %r10, %xmm6
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+
+function %f15(f64) -> i32 {
+block0(v0: f64):
+  v1 = fcvt_to_uint.i32 v0
+  return v1
+}
+
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   cvt_float64_to_uint32_seq %xmm0, %eax, %r10, %xmm6
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+
+function %f16(f64) -> i64 {
+block0(v0: f64):
+  v1 = fcvt_to_uint.i64 v0
+  return v1
+}
+
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   cvt_float64_to_uint64_seq %xmm0, %rax, %r10, %xmm6
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+
+function %f17(f32) -> i32 {
+block0(v0: f32):
+  v1 = fcvt_to_uint_sat.i32 v0
+  return v1
+}
+
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   cvt_float32_to_uint32_sat_seq %xmm0, %eax, %r10, %xmm6
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+
+function %f18(f32) -> i64 {
+block0(v0: f32):
+  v1 = fcvt_to_uint_sat.i64 v0
+  return v1
+}
+
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   cvt_float32_to_uint64_sat_seq %xmm0, %rax, %r10, %xmm6
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+
+function %f19(f64) -> i32 {
+block0(v0: f64):
+  v1 = fcvt_to_uint_sat.i32 v0
+  return v1
+}
+
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   cvt_float64_to_uint32_sat_seq %xmm0, %eax, %r10, %xmm6
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+
+function %f20(f64) -> i64 {
+block0(v0: f64):
+  v1 = fcvt_to_uint_sat.i64 v0
+  return v1
+}
+
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   cvt_float64_to_uint64_sat_seq %xmm0, %rax, %r10, %xmm6
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+
+function %f21(f32) -> i32 {
+block0(v0: f32):
+  v1 = fcvt_to_sint.i32 v0
+  return v1
+}
+
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   cvt_float32_to_sint32_seq %xmm0, %eax, %r10, %xmm6
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+
+function %f22(f32) -> i64 {
+block0(v0: f32):
+  v1 = fcvt_to_sint.i64 v0
+  return v1
+}
+
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   cvt_float32_to_sint64_seq %xmm0, %rax, %r10, %xmm6
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+
+function %f23(f64) -> i32 {
+block0(v0: f64):
+  v1 = fcvt_to_sint.i32 v0
+  return v1
+}
+
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   cvt_float64_to_sint32_seq %xmm0, %eax, %r10, %xmm6
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+
+function %f24(f64) -> i64 {
+block0(v0: f64):
+  v1 = fcvt_to_sint.i64 v0
+  return v1
+}
+
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   cvt_float64_to_sint64_seq %xmm0, %rax, %r10, %xmm6
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+
+function %f25(f32) -> i32 {
+block0(v0: f32):
+  v1 = fcvt_to_sint_sat.i32 v0
+  return v1
+}
+
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   cvt_float32_to_sint32_sat_seq %xmm0, %eax, %r10, %xmm6
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+
+function %f26(f32) -> i64 {
+block0(v0: f32):
+  v1 = fcvt_to_sint_sat.i64 v0
+  return v1
+}
+
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   cvt_float32_to_sint64_sat_seq %xmm0, %rax, %r10, %xmm6
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+
+function %f27(f64) -> i32 {
+block0(v0: f64):
+  v1 = fcvt_to_sint_sat.i32 v0
+  return v1
+}
+
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   cvt_float64_to_sint32_sat_seq %xmm0, %eax, %r10, %xmm6
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+
+function %f28(f64) -> i64 {
+block0(v0: f64):
+  v1 = fcvt_to_sint_sat.i64 v0
+  return v1
+}
+
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   cvt_float64_to_sint64_sat_seq %xmm0, %rax, %r10, %xmm6
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+
+function %f29(f32x4) -> i32x4 {
+block0(v0: f32x4):
+  v1 = fcvt_to_uint_sat.i32x4 v0
+  return v1
+}
+
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   pxor    %xmm3, %xmm3, %xmm3
+;   maxps   %xmm0, %xmm3, %xmm0
+;   pcmpeqd %xmm8, %xmm8, %xmm8
+;   psrld   %xmm8, $1, %xmm8
+;   cvtdq2ps %xmm8, %xmm14
+;   cvttps2dq %xmm0, %xmm13
+;   subps   %xmm0, %xmm14, %xmm0
+;   cmpps   $2, %xmm14, %xmm0, %xmm14
+;   cvttps2dq %xmm0, %xmm0
+;   pxor    %xmm0, %xmm14, %xmm0
+;   pxor    %xmm7, %xmm7, %xmm7
+;   pmaxsd  %xmm0, %xmm7, %xmm0
+;   paddd   %xmm0, %xmm13, %xmm0
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+
+function %f30(f32x4) -> i32x4 {
+block0(v0: f32x4):
+  v1 = fcvt_to_sint_sat.i32x4 v0
+  return v1
+}
+
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   movdqa  %xmm0, %xmm5
+;   cmpps   $0, %xmm5, %xmm0, %xmm5
+;   andps   %xmm0, %xmm5, %xmm0
+;   pxor    %xmm5, %xmm0, %xmm5
+;   cvttps2dq %xmm0, %xmm9
+;   movdqa  %xmm9, %xmm0
+;   pand    %xmm0, %xmm5, %xmm0
+;   psrad   %xmm0, $31, %xmm0
+;   pxor    %xmm0, %xmm9, %xmm0
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+
--- a/cranelift/filetests/filetests/runtests/simd-conversion.clif
+++ b/cranelift/filetests/filetests/runtests/simd-conversion.clif
@@ -28,6 +28,7 @@ block0(v0:f32x4):
 }
 ; run: %fcvt_to_sint_sat([0x0.0 -0x1.0 0x1.0 0x1.0p100]) == [0 -1 1 0x7FFFFFFF]
 ; run: %fcvt_to_sint_sat([-0x8.1 0x0.0 0x0.0 -0x1.0p100]) == [-8 0 0 0x80000000]
+; run: %fcvt_to_sint_sat([+NaN +NaN +NaN +NaN]) == [0 0 0 0]

 function %fcvt_to_uint_sat(f32x4) -> i32x4 {
 block0(v0:f32x4):
@@ -37,3 +38,4 @@ block0(v0:f32x4):
 ; run: %fcvt_to_uint_sat([0x1.0 0x4.2 0x4.6 0x1.0p100]) == [1 4 4 0xFFFFFFFF]
 ; run: %fcvt_to_uint_sat([-0x8.1 -0x0.0 0x0.0 -0x1.0p100]) == [0 0 0 0]
 ; run: %fcvt_to_uint_sat([0xB2D05E00.0 0.0 0.0 0.0]) == [3000000000 0 0 0]
+; run: %fcvt_to_uint_sat([+NaN +NaN +NaN +NaN]) == [0 0 0 0]