diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle index c270817564..8043a1059b 100644 --- a/cranelift/codegen/src/isa/x64/inst.isle +++ b/cranelift/codegen/src/isa/x64/inst.isle @@ -3047,6 +3047,10 @@ (_ Unit (emit (MInst.GprToXmm (SseOpcode.Cvtsi2sd) x dst size)))) dst)) +(decl x64_cvttps2dq (Type XmmMem) Xmm) +(rule (x64_cvttps2dq ty x) + (xmm_unary_rm_r (SseOpcode.Cvttps2dq) x)) + (decl cvt_u64_to_float_seq (Type Gpr) Xmm) (rule (cvt_u64_to_float_seq ty src) (let ((size OperandSize (raw_operand_size_of_type ty)) @@ -3058,6 +3062,34 @@ (_ Unit (emit (MInst.CvtUint64ToFloatSeq size src_copy dst tmp_gpr1 tmp_gpr2)))) dst)) +(decl cvt_float_to_uint_seq (Type Value bool) Gpr) +(rule (cvt_float_to_uint_seq out_ty src @ (value_type src_ty) is_saturating) + (let ((out_size OperandSize (raw_operand_size_of_type out_ty)) + (src_size OperandSize (raw_operand_size_of_type src_ty)) + + (tmp WritableXmm (temp_writable_xmm)) + (_ Unit (emit (gen_move src_ty tmp src))) + + (dst WritableGpr (temp_writable_gpr)) + (tmp_xmm WritableXmm (temp_writable_xmm)) + (tmp_gpr WritableGpr (temp_writable_gpr)) + (_ Unit (emit (MInst.CvtFloatToUintSeq out_size src_size is_saturating tmp dst tmp_gpr tmp_xmm)))) + dst)) + +(decl cvt_float_to_sint_seq (Type Value bool) Gpr) +(rule (cvt_float_to_sint_seq out_ty src @ (value_type src_ty) is_saturating) + (let ((out_size OperandSize (raw_operand_size_of_type out_ty)) + (src_size OperandSize (raw_operand_size_of_type src_ty)) + + (tmp WritableXmm (temp_writable_xmm)) + (_ Unit (emit (gen_move src_ty tmp src))) + + (dst WritableGpr (temp_writable_gpr)) + (tmp_xmm WritableXmm (temp_writable_xmm)) + (tmp_gpr WritableGpr (temp_writable_gpr)) + (_ Unit (emit (MInst.CvtFloatToSintSeq out_size src_size is_saturating tmp dst tmp_gpr tmp_xmm)))) + dst)) + (decl fcvt_uint_mask_const () VCodeConstant) (extern constructor fcvt_uint_mask_const fcvt_uint_mask_const) diff --git 
a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs index 5a51424a43..354be76df2 100644 --- a/cranelift/codegen/src/isa/x64/inst/mod.rs +++ b/cranelift/codegen/src/isa/x64/inst/mod.rs @@ -408,58 +408,6 @@ impl Inst { Inst::XmmCmpRmR { op, src, dst } } - pub(crate) fn cvt_float_to_sint_seq( - src_size: OperandSize, - dst_size: OperandSize, - is_saturating: bool, - src: Writable, - dst: Writable, - tmp_gpr: Writable, - tmp_xmm: Writable, - ) -> Inst { - debug_assert!(src_size.is_one_of(&[OperandSize::Size32, OperandSize::Size64])); - debug_assert!(dst_size.is_one_of(&[OperandSize::Size32, OperandSize::Size64])); - debug_assert!(src.to_reg().class() == RegClass::Float); - debug_assert!(tmp_xmm.to_reg().class() == RegClass::Float); - debug_assert!(tmp_gpr.to_reg().class() == RegClass::Int); - debug_assert!(dst.to_reg().class() == RegClass::Int); - Inst::CvtFloatToSintSeq { - src_size, - dst_size, - is_saturating, - src: WritableXmm::from_writable_reg(src).unwrap(), - dst: WritableGpr::from_writable_reg(dst).unwrap(), - tmp_gpr: WritableGpr::from_writable_reg(tmp_gpr).unwrap(), - tmp_xmm: WritableXmm::from_writable_reg(tmp_xmm).unwrap(), - } - } - - pub(crate) fn cvt_float_to_uint_seq( - src_size: OperandSize, - dst_size: OperandSize, - is_saturating: bool, - src: Writable, - dst: Writable, - tmp_gpr: Writable, - tmp_xmm: Writable, - ) -> Inst { - debug_assert!(src_size.is_one_of(&[OperandSize::Size32, OperandSize::Size64])); - debug_assert!(dst_size.is_one_of(&[OperandSize::Size32, OperandSize::Size64])); - debug_assert!(src.to_reg().class() == RegClass::Float); - debug_assert!(tmp_xmm.to_reg().class() == RegClass::Float); - debug_assert!(tmp_gpr.to_reg().class() == RegClass::Int); - debug_assert!(dst.to_reg().class() == RegClass::Int); - Inst::CvtFloatToUintSeq { - src_size, - dst_size, - is_saturating, - src: WritableXmm::from_writable_reg(src).unwrap(), - dst: WritableGpr::from_writable_reg(dst).unwrap(), - tmp_gpr: 
WritableGpr::from_writable_reg(tmp_gpr).unwrap(), - tmp_xmm: WritableXmm::from_writable_reg(tmp_xmm).unwrap(), - } - } - #[allow(dead_code)] pub(crate) fn xmm_min_max_seq( size: OperandSize, @@ -1257,7 +1205,7 @@ impl PrettyPrint for Inst { dst_size, tmp_xmm, tmp_gpr, - .. + is_saturating, } => { let src = pretty_print_reg(src.to_reg().to_reg(), src_size.to_bytes(), allocs); let dst = pretty_print_reg(dst.to_reg().to_reg(), dst_size.to_bytes(), allocs); @@ -1266,9 +1214,10 @@ impl PrettyPrint for Inst { format!( "{} {}, {}, {}, {}", ljustify(format!( - "cvt_float{}_to_sint{}_seq", + "cvt_float{}_to_sint{}{}_seq", src_size.to_bits(), - dst_size.to_bits() + dst_size.to_bits(), + if *is_saturating { "_sat" } else { "" }, )), src, dst, @@ -1284,7 +1233,7 @@ impl PrettyPrint for Inst { dst_size, tmp_gpr, tmp_xmm, - .. + is_saturating, } => { let src = pretty_print_reg(src.to_reg().to_reg(), src_size.to_bytes(), allocs); let dst = pretty_print_reg(dst.to_reg().to_reg(), dst_size.to_bytes(), allocs); @@ -1293,9 +1242,10 @@ impl PrettyPrint for Inst { format!( "{} {}, {}, {}, {}", ljustify(format!( - "cvt_float{}_to_uint{}_seq", + "cvt_float{}_to_uint{}{}_seq", src_size.to_bits(), - dst_size.to_bits() + dst_size.to_bits(), + if *is_saturating { "_sat" } else { "" }, )), src, dst, diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index 8f840a086f..759a956ec6 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -3062,3 +3062,130 @@ ;; add together the two converted values (x64_addps a_hi a_lo))) + +;; Rules for `fcvt_to_uint` and `fcvt_to_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (has_type out_ty (fcvt_to_uint val @ (value_type (ty_scalar_float _))))) + (cvt_float_to_uint_seq out_ty val $false)) + +(rule (lower (has_type out_ty (fcvt_to_uint_sat val @ (value_type (ty_scalar_float _))))) + (cvt_float_to_uint_seq out_ty val $true)) + +(rule (lower (has_type out_ty 
(fcvt_to_sint val @ (value_type (ty_scalar_float _))))) + (cvt_float_to_sint_seq out_ty val $false)) + +(rule (lower (has_type out_ty (fcvt_to_sint_sat val @ (value_type (ty_scalar_float _))))) + (cvt_float_to_sint_seq out_ty val $true)) + +;; The x64 backend currently only supports these two type combinations. +(rule (lower (has_type $I32X4 (fcvt_to_sint_sat val @ (value_type $F32X4)))) + (let (;; Sets tmp to zero if float is NaN + (tmp Xmm (x64_cmpps val val (FcmpImm.Equal))) + (dst Xmm (x64_andps val tmp)) + + ;; Sets top bit of tmp if float is positive + ;; Setting up to set top bit on negative float values + (tmp Xmm (x64_pxor tmp dst)) + + ;; Convert the packed float to packed doubleword. + (dst Xmm (x64_cvttps2dq $F32X4 dst)) + + ;; Set top bit only if < 0 + (tmp Xmm (x64_pand dst tmp)) + (tmp Xmm (x64_psrad tmp (RegMemImm.Imm 31)))) + + ;; On overflow 0x80000000 is returned to a lane. + ;; Below sets positive overflow lanes to 0x7FFFFFFF + ;; Keeps negative overflow lanes as is. + (x64_pxor tmp dst))) + +;; The algorithm for converting floats to unsigned ints is a little tricky. The +;; complication arises because we are converting from a signed 64-bit int with a positive +;; integer range from 1..INT_MAX (0x1..0x7FFFFFFF) to an unsigned integer with an extended +;; range from (INT_MAX+1)..UINT_MAX. It's this range from (INT_MAX+1)..UINT_MAX +;; (0x80000000..0xFFFFFFFF) that needs to be accounted for as a special case since our +;; conversion instruction (cvttps2dq) only converts as high as INT_MAX (0x7FFFFFFF), but +;; which conveniently setting underflows and overflows (smaller than MIN_INT or larger than +;; MAX_INT) to be INT_MAX+1 (0x80000000). Noting that the range (INT_MAX+1)..UINT_MAX includes +;; precisely INT_MAX values we can correctly account for and convert every value in this range +;; if we simply subtract INT_MAX+1 before doing the cvttps2dq conversion. 
After the subtraction +;; every value originally (INT_MAX+1)..UINT_MAX is now the range (0..INT_MAX). +;; After the conversion we add INT_MAX+1 back to this converted value, noting again that +;; values we are trying to account for were already set to INT_MAX+1 during the original conversion. +;; We simply have to create a mask and make sure we are adding together only the lanes that need +;; to be accounted for. Digesting it all the steps then are: +;; +;; Step 1 - Account for NaN and negative floats by setting these src values to zero. +;; Step 2 - Make a copy (tmp1) of the src value since we need to convert twice for +;; reasons described above. +;; Step 3 - Convert the original src values. This will convert properly all floats up to INT_MAX +;; Step 4 - Subtract INT_MAX from the copy set (tmp1). Note, all zero and negative values are those +;; values that were originally in the range (0..INT_MAX). This will come in handy during +;; step 7 when we zero negative lanes. +;; Step 5 - Create a bit mask for tmp1 that will correspond to all lanes originally less than +;; UINT_MAX that are now less than INT_MAX thanks to the subtraction. +;; Step 6 - Convert the second set of values (tmp1) +;; Step 7 - Prep the converted second set by zeroing out negative lanes (these have already been +;; converted correctly with the first set) and by setting overflow lanes to 0x7FFFFFFF +;; as this will allow us to properly saturate overflow lanes when adding to 0x80000000 +;; Step 8 - Add the original converted src and the converted tmp1 where float values originally less +;; than and equal to INT_MAX will be unchanged, float values originally between INT_MAX+1 and +;; UINT_MAX will add together (INT_MAX) + (SRC - INT_MAX), and float values originally +;; greater than UINT_MAX will be saturated to UINT_MAX (0xFFFFFFFF) after adding (0x80000000 + 0x7FFFFFFF). +;; +;; +;; The table below illustrates the result after each step where it matters for the converted set. 
+;; Note the original value range (original src set) is the final dst in Step 8: +;; +;; Original src set: +;; | Original Value Range | Step 1 | Step 3 | Step 8 | +;; | -FLT_MIN..FLT_MAX | 0.0..FLT_MAX | 0..INT_MAX(w/overflow) | 0..UINT_MAX(w/saturation) | +;; +;; Copied src set (tmp1): +;; | Step 2 | Step 4 | +;; | 0.0..FLT_MAX | (0.0-(INT_MAX+1))..(FLT_MAX-(INT_MAX+1)) | +;; +;; | Step 6 | Step 7 | +;; | (0-(INT_MAX+1))..(UINT_MAX-(INT_MAX+1))(w/overflow) | ((INT_MAX+1)-(INT_MAX+1))..(INT_MAX+1) | +(rule (lower (has_type $I32X4 (fcvt_to_uint_sat val @ (value_type $F32X4)))) + (let (;; Converting to unsigned int so if float src is negative or NaN + ;; will first set to zero. + (tmp2 Xmm (x64_pxor val val)) ;; make a zero + (dst Xmm (x64_maxps val tmp2)) + + ;; Set tmp2 to INT_MAX+1. It is important to note here that after it looks + ;; like we are only converting INT_MAX (0x7FFFFFFF) but in fact because + ;; single precision IEEE-754 floats can only accurately represent contiguous + ;; integers up to 2^23 and outside of this range it rounds to the closest + ;; integer that it can represent. In the case of INT_MAX, this value gets + ;; represented as 0x4f000000 which is the integer value (INT_MAX+1). + (tmp2 Xmm (x64_pcmpeqd tmp2 tmp2)) + (tmp2 Xmm (x64_psrld tmp2 (RegMemImm.Imm 1))) + (tmp2 Xmm (x64_cvtdq2ps tmp2)) + + ;; Make a copy of these lanes and then do the first conversion. + ;; Overflow lanes greater than the maximum allowed signed value will + ;; set to 0x80000000. Negative and NaN lanes will be 0x0 + (tmp1 Xmm dst) + (dst Xmm (x64_cvttps2dq $F32X4 dst)) + + ;; Set lanes to src - max_signed_int + (tmp1 Xmm (x64_subps tmp1 tmp2)) + + ;; Create mask for all positive lanes to saturate (i.e. greater than + ;; or equal to the maximum allowable unsigned int). + (tmp2 Xmm (x64_cmpps tmp2 tmp1 (FcmpImm.LessThanOrEqual))) + + ;; Convert those set of lanes that have the max_signed_int factored out. 
+ (tmp1 Xmm (x64_cvttps2dq $F32X4 tmp1)) + + ;; Prepare converted lanes by zeroing negative lanes and prepping lanes + ;; that have positive overflow (based on the mask) by setting these lanes + ;; to 0x7FFFFFFF + (tmp1 Xmm (x64_pxor tmp1 tmp2)) + (tmp2 Xmm (x64_pxor tmp2 tmp2)) ;; make another zero + (tmp1 Xmm (x64_pmaxsd tmp1 tmp2))) + + ;; Add this second set of converted lanes to the original to properly handle + ;; values greater than max signed int. + (x64_paddd tmp1 dst))) diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index 4da2fe1d1d..3e4decfda9 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -557,232 +557,14 @@ fn lower_insn_to_regs( | Opcode::SelectifSpectreGuard | Opcode::FcvtFromSint | Opcode::FcvtLowFromSint - | Opcode::FcvtFromUint => { + | Opcode::FcvtFromUint + | Opcode::FcvtToUint + | Opcode::FcvtToSint + | Opcode::FcvtToUintSat + | Opcode::FcvtToSintSat => { implemented_in_isle(ctx); } - Opcode::FcvtToUint | Opcode::FcvtToUintSat | Opcode::FcvtToSint | Opcode::FcvtToSintSat => { - let src = put_input_in_reg(ctx, inputs[0]); - let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - - let input_ty = ctx.input_ty(insn, 0); - if !input_ty.is_vector() { - let src_size = if input_ty == types::F32 { - OperandSize::Size32 - } else { - assert_eq!(input_ty, types::F64); - OperandSize::Size64 - }; - - let output_ty = ty.unwrap(); - let dst_size = if output_ty == types::I32 { - OperandSize::Size32 - } else { - assert_eq!(output_ty, types::I64); - OperandSize::Size64 - }; - - let to_signed = op == Opcode::FcvtToSint || op == Opcode::FcvtToSintSat; - let is_sat = op == Opcode::FcvtToUintSat || op == Opcode::FcvtToSintSat; - - let src_copy = ctx.alloc_tmp(input_ty).only_reg().unwrap(); - ctx.emit(Inst::gen_move(src_copy, src, input_ty)); - - let tmp_xmm = ctx.alloc_tmp(input_ty).only_reg().unwrap(); - let tmp_gpr = 
ctx.alloc_tmp(output_ty).only_reg().unwrap(); - - if to_signed { - ctx.emit(Inst::cvt_float_to_sint_seq( - src_size, dst_size, is_sat, src_copy, dst, tmp_gpr, tmp_xmm, - )); - } else { - ctx.emit(Inst::cvt_float_to_uint_seq( - src_size, dst_size, is_sat, src_copy, dst, tmp_gpr, tmp_xmm, - )); - } - } else { - if op == Opcode::FcvtToSintSat { - // Sets destination to zero if float is NaN - assert_eq!(types::F32X4, ctx.input_ty(insn, 0)); - let tmp = ctx.alloc_tmp(types::I32X4).only_reg().unwrap(); - ctx.emit(Inst::xmm_unary_rm_r( - SseOpcode::Movapd, - RegMem::reg(src), - tmp, - )); - ctx.emit(Inst::gen_move(dst, src, input_ty)); - let cond = FcmpImm::from(FloatCC::Equal); - ctx.emit(Inst::xmm_rm_r_imm( - SseOpcode::Cmpps, - RegMem::reg(tmp.to_reg()), - tmp, - cond.encode(), - OperandSize::Size32, - )); - ctx.emit(Inst::xmm_rm_r( - SseOpcode::Andps, - RegMem::reg(tmp.to_reg()), - dst, - )); - - // Sets top bit of tmp if float is positive - // Setting up to set top bit on negative float values - ctx.emit(Inst::xmm_rm_r( - SseOpcode::Pxor, - RegMem::reg(dst.to_reg()), - tmp, - )); - - // Convert the packed float to packed doubleword. - ctx.emit(Inst::xmm_unary_rm_r( - SseOpcode::Cvttps2dq, - RegMem::reg(dst.to_reg()), - dst, - )); - - // Set top bit only if < 0 - // Saturate lane with sign (top) bit. - ctx.emit(Inst::xmm_rm_r( - SseOpcode::Pand, - RegMem::reg(dst.to_reg()), - tmp, - )); - ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrad, RegMemImm::imm(31), tmp)); - - // On overflow 0x80000000 is returned to a lane. - // Below sets positive overflow lanes to 0x7FFFFFFF - // Keeps negative overflow lanes as is. - ctx.emit(Inst::xmm_rm_r( - SseOpcode::Pxor, - RegMem::reg(tmp.to_reg()), - dst, - )); - } else if op == Opcode::FcvtToUintSat { - // The algorithm for converting floats to unsigned ints is a little tricky. 
The - // complication arises because we are converting from a signed 64-bit int with a positive - // integer range from 1..INT_MAX (0x1..0x7FFFFFFF) to an unsigned integer with an extended - // range from (INT_MAX+1)..UINT_MAX. It's this range from (INT_MAX+1)..UINT_MAX - // (0x80000000..0xFFFFFFFF) that needs to be accounted for as a special case since our - // conversion instruction (cvttps2dq) only converts as high as INT_MAX (0x7FFFFFFF), but - // which conveniently setting underflows and overflows (smaller than MIN_INT or larger than - // MAX_INT) to be INT_MAX+1 (0x80000000). Nothing that the range (INT_MAX+1)..UINT_MAX includes - // precisely INT_MAX values we can correctly account for and convert every value in this range - // if we simply subtract INT_MAX+1 before doing the cvttps2dq conversion. After the subtraction - // every value originally (INT_MAX+1)..UINT_MAX is now the range (0..INT_MAX). - // After the conversion we add INT_MAX+1 back to this converted value, noting again that - // values we are trying to account for were already set to INT_MAX+1 during the original conversion. - // We simply have to create a mask and make sure we are adding together only the lanes that need - // to be accounted for. Digesting it all the steps then are: - // - // Step 1 - Account for NaN and negative floats by setting these src values to zero. - // Step 2 - Make a copy (tmp1) of the src value since we need to convert twice for - // reasons described above. - // Step 3 - Convert the original src values. This will convert properly all floats up to INT_MAX - // Step 4 - Subtract INT_MAX from the copy set (tmp1). Note, all zero and negative values are those - // values that were originally in the range (0..INT_MAX). This will come in handy during - // step 7 when we zero negative lanes. - // Step 5 - Create a bit mask for tmp1 that will correspond to all lanes originally less than - // UINT_MAX that are now less than INT_MAX thanks to the subtraction. 
- // Step 6 - Convert the second set of values (tmp1) - // Step 7 - Prep the converted second set by zeroing out negative lanes (these have already been - // converted correctly with the first set) and by setting overflow lanes to 0x7FFFFFFF - // as this will allow us to properly saturate overflow lanes when adding to 0x80000000 - // Step 8 - Add the orginal converted src and the converted tmp1 where float values originally less - // than and equal to INT_MAX will be unchanged, float values originally between INT_MAX+1 and - // UINT_MAX will add together (INT_MAX) + (SRC - INT_MAX), and float values originally - // greater than UINT_MAX will be saturated to UINT_MAX (0xFFFFFFFF) after adding (0x8000000 + 0x7FFFFFFF). - // - // - // The table below illustrates the result after each step where it matters for the converted set. - // Note the original value range (original src set) is the final dst in Step 8: - // - // Original src set: - // | Original Value Range | Step 1 | Step 3 | Step 8 | - // | -FLT_MIN..FLT_MAX | 0.0..FLT_MAX | 0..INT_MAX(w/overflow) | 0..UINT_MAX(w/saturation) | - // - // Copied src set (tmp1): - // | Step 2 | Step 4 | - // | 0.0..FLT_MAX | (0.0-(INT_MAX+1))..(FLT_MAX-(INT_MAX+1)) | - // - // | Step 6 | Step 7 | - // | (0-(INT_MAX+1))..(UINT_MAX-(INT_MAX+1))(w/overflow) | ((INT_MAX+1)-(INT_MAX+1))..(INT_MAX+1) | - - // Create temporaries - assert_eq!(types::F32X4, ctx.input_ty(insn, 0)); - let tmp1 = ctx.alloc_tmp(types::I32X4).only_reg().unwrap(); - let tmp2 = ctx.alloc_tmp(types::I32X4).only_reg().unwrap(); - - // Converting to unsigned int so if float src is negative or NaN - // will first set to zero. - ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp2), tmp2)); - ctx.emit(Inst::gen_move(dst, src, input_ty)); - ctx.emit(Inst::xmm_rm_r(SseOpcode::Maxps, RegMem::from(tmp2), dst)); - - // Set tmp2 to INT_MAX+1. 
It is important to note here that after it looks - // like we are only converting INT_MAX (0x7FFFFFFF) but in fact because - // single precision IEEE-754 floats can only accurately represent contingous - // integers up to 2^23 and outside of this range it rounds to the closest - // integer that it can represent. In the case of INT_MAX, this value gets - // represented as 0x4f000000 which is the integer value (INT_MAX+1). - - ctx.emit(Inst::xmm_rm_r(SseOpcode::Pcmpeqd, RegMem::from(tmp2), tmp2)); - ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrld, RegMemImm::imm(1), tmp2)); - ctx.emit(Inst::xmm_unary_rm_r( - SseOpcode::Cvtdq2ps, - RegMem::from(tmp2), - tmp2, - )); - - // Make a copy of these lanes and then do the first conversion. - // Overflow lanes greater than the maximum allowed signed value will - // set to 0x80000000. Negative and NaN lanes will be 0x0 - ctx.emit(Inst::xmm_mov(SseOpcode::Movaps, RegMem::from(dst), tmp1)); - ctx.emit(Inst::xmm_unary_rm_r( - SseOpcode::Cvttps2dq, - RegMem::from(dst), - dst, - )); - - // Set lanes to src - max_signed_int - ctx.emit(Inst::xmm_rm_r(SseOpcode::Subps, RegMem::from(tmp2), tmp1)); - - // Create mask for all positive lanes to saturate (i.e. greater than - // or equal to the maxmimum allowable unsigned int). - let cond = FcmpImm::from(FloatCC::LessThanOrEqual); - ctx.emit(Inst::xmm_rm_r_imm( - SseOpcode::Cmpps, - RegMem::from(tmp1), - tmp2, - cond.encode(), - OperandSize::Size32, - )); - - // Convert those set of lanes that have the max_signed_int factored out. 
- ctx.emit(Inst::xmm_unary_rm_r( - SseOpcode::Cvttps2dq, - RegMem::from(tmp1), - tmp1, - )); - - // Prepare converted lanes by zeroing negative lanes and prepping lanes - // that have positive overflow (based on the mask) by setting these lanes - // to 0x7FFFFFFF - ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp2), tmp1)); - ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp2), tmp2)); - ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmaxsd, RegMem::from(tmp2), tmp1)); - - // Add this second set of converted lanes to the original to properly handle - // values greater than max signed int. - ctx.emit(Inst::xmm_rm_r(SseOpcode::Paddd, RegMem::from(tmp1), dst)); - } else { - // Since this branch is also guarded by a check for vector types - // neither Opcode::FcvtToUint nor Opcode::FcvtToSint can reach here - // due to vector varients not existing. The first two branches will - // cover all reachable cases. - unreachable!(); - } - } - } Opcode::IaddPairwise => { if let (Some(swiden_low), Some(swiden_high)) = ( matches_input(ctx, inputs[0], Opcode::SwidenLow), diff --git a/cranelift/filetests/filetests/isa/x64/fcvt.clif b/cranelift/filetests/filetests/isa/x64/fcvt.clif index 09d7c80336..09c6093c54 100644 --- a/cranelift/filetests/filetests/isa/x64/fcvt.clif +++ b/cranelift/filetests/filetests/isa/x64/fcvt.clif @@ -200,3 +200,275 @@ block0(v0: i32x4): ; popq %rbp ; ret +function %f13(f32) -> i32 { +block0(v0: f32): + v1 = fcvt_to_uint.i32 v0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; cvt_float32_to_uint32_seq %xmm0, %eax, %r10, %xmm6 +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f14(f32) -> i64 { +block0(v0: f32): + v1 = fcvt_to_uint.i64 v0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; cvt_float32_to_uint64_seq %xmm0, %rax, %r10, %xmm6 +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f15(f64) -> i32 { +block0(v0: f64): + v1 = fcvt_to_uint.i32 v0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; 
cvt_float64_to_uint32_seq %xmm0, %eax, %r10, %xmm6 +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f16(f64) -> i64 { +block0(v0: f64): + v1 = fcvt_to_uint.i64 v0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; cvt_float64_to_uint64_seq %xmm0, %rax, %r10, %xmm6 +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f17(f32) -> i32 { +block0(v0: f32): + v1 = fcvt_to_uint_sat.i32 v0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; cvt_float32_to_uint32_sat_seq %xmm0, %eax, %r10, %xmm6 +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f18(f32) -> i64 { +block0(v0: f32): + v1 = fcvt_to_uint_sat.i64 v0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; cvt_float32_to_uint64_sat_seq %xmm0, %rax, %r10, %xmm6 +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f19(f64) -> i32 { +block0(v0: f64): + v1 = fcvt_to_uint_sat.i32 v0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; cvt_float64_to_uint32_sat_seq %xmm0, %eax, %r10, %xmm6 +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f20(f64) -> i64 { +block0(v0: f64): + v1 = fcvt_to_uint_sat.i64 v0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; cvt_float64_to_uint64_sat_seq %xmm0, %rax, %r10, %xmm6 +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f21(f32) -> i32 { +block0(v0: f32): + v1 = fcvt_to_sint.i32 v0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; cvt_float32_to_sint32_seq %xmm0, %eax, %r10, %xmm6 +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f22(f32) -> i64 { +block0(v0: f32): + v1 = fcvt_to_sint.i64 v0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; cvt_float32_to_sint64_seq %xmm0, %rax, %r10, %xmm6 +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f23(f64) -> i32 { +block0(v0: f64): + v1 = fcvt_to_sint.i32 v0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; cvt_float64_to_sint32_seq %xmm0, %eax, %r10, %xmm6 +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f24(f64) -> i64 { 
+block0(v0: f64): + v1 = fcvt_to_sint.i64 v0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; cvt_float64_to_sint64_seq %xmm0, %rax, %r10, %xmm6 +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f25(f32) -> i32 { +block0(v0: f32): + v1 = fcvt_to_sint_sat.i32 v0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; cvt_float32_to_sint32_sat_seq %xmm0, %eax, %r10, %xmm6 +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f26(f32) -> i64 { +block0(v0: f32): + v1 = fcvt_to_sint_sat.i64 v0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; cvt_float32_to_sint64_sat_seq %xmm0, %rax, %r10, %xmm6 +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f27(f64) -> i32 { +block0(v0: f64): + v1 = fcvt_to_sint_sat.i32 v0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; cvt_float64_to_sint32_sat_seq %xmm0, %eax, %r10, %xmm6 +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f28(f64) -> i64 { +block0(v0: f64): + v1 = fcvt_to_sint_sat.i64 v0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; cvt_float64_to_sint64_sat_seq %xmm0, %rax, %r10, %xmm6 +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f29(f32x4) -> i32x4 { +block0(v0: f32x4): + v1 = fcvt_to_uint_sat.i32x4 v0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pxor %xmm3, %xmm3, %xmm3 +; maxps %xmm0, %xmm3, %xmm0 +; pcmpeqd %xmm8, %xmm8, %xmm8 +; psrld %xmm8, $1, %xmm8 +; cvtdq2ps %xmm8, %xmm14 +; cvttps2dq %xmm0, %xmm13 +; subps %xmm0, %xmm14, %xmm0 +; cmpps $2, %xmm14, %xmm0, %xmm14 +; cvttps2dq %xmm0, %xmm0 +; pxor %xmm0, %xmm14, %xmm0 +; pxor %xmm7, %xmm7, %xmm7 +; pmaxsd %xmm0, %xmm7, %xmm0 +; paddd %xmm0, %xmm13, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f30(f32x4) -> i32x4 { +block0(v0: f32x4): + v1 = fcvt_to_sint_sat.i32x4 v0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movdqa %xmm0, %xmm5 +; cmpps $0, %xmm5, %xmm0, %xmm5 +; andps %xmm0, %xmm5, %xmm0 +; pxor %xmm5, %xmm0, %xmm5 +; cvttps2dq %xmm0, 
%xmm9 +; movdqa %xmm9, %xmm0 +; pand %xmm0, %xmm5, %xmm0 +; psrad %xmm0, $31, %xmm0 +; pxor %xmm0, %xmm9, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret + diff --git a/cranelift/filetests/filetests/runtests/simd-conversion.clif b/cranelift/filetests/filetests/runtests/simd-conversion.clif index da8bced238..9648acda8c 100644 --- a/cranelift/filetests/filetests/runtests/simd-conversion.clif +++ b/cranelift/filetests/filetests/runtests/simd-conversion.clif @@ -28,6 +28,7 @@ block0(v0:f32x4): } ; run: %fcvt_to_sint_sat([0x0.0 -0x1.0 0x1.0 0x1.0p100]) == [0 -1 1 0x7FFFFFFF] ; run: %fcvt_to_sint_sat([-0x8.1 0x0.0 0x0.0 -0x1.0p100]) == [-8 0 0 0x80000000] +; run: %fcvt_to_sint_sat([+NaN +NaN +NaN +NaN]) == [0 0 0 0] function %fcvt_to_uint_sat(f32x4) -> i32x4 { block0(v0:f32x4): @@ -37,3 +38,4 @@ block0(v0:f32x4): ; run: %fcvt_to_uint_sat([0x1.0 0x4.2 0x4.6 0x1.0p100]) == [1 4 4 0xFFFFFFFF] ; run: %fcvt_to_uint_sat([-0x8.1 -0x0.0 0x0.0 -0x1.0p100]) == [0 0 0 0] ; run: %fcvt_to_uint_sat([0xB2D05E00.0 0.0 0.0 0.0]) == [3000000000 0 0 0] +; run: %fcvt_to_uint_sat([+NaN +NaN +NaN +NaN]) == [0 0 0 0]