x64: Lower fcvt_from_uint in ISLE (#4684)

* Add a test for the existing behavior of fcvt_from_uint * Migrate the I8, I16, I32 cases of fcvt_from_uint * Implement the I64 case of fcvt_from_uint * Add a test for the existing behavior of fcvt_from_uint.f64x2 * Migrate fcvt_from_uint.f64x2 to ISLE * Lower the last case of `fcvt_from_uint` * Add a test for `fcvt_from_uint` * Finish lowering fcvt_from_uint * Format
2022-08-11 12:28:41 -07:00
parent c4fd6a95da
commit 0c2e0494bd
8 changed files with 223 additions and 280 deletions
--- a/cranelift/codegen/src/isa/x64/lower.isle
+++ b/cranelift/codegen/src/isa/x64/lower.isle
@@ -2985,3 +2985,76 @@

 (rule (lower (fcvt_low_from_sint a @ (value_type ty)))
      (x64_cvtdq2pd ty a))
+
+;; Rules for `fcvt_from_uint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Scalar conversion to $F32 from an integer type that fits in 32 bits.
+;; The input is zero-extended to a 64-bit GPR and then converted with
+;; `cvtsi2ss` operating on the $I64 value. After zero-extension the value is
+;; non-negative when read as a signed 64-bit integer, so the signed
+;; conversion yields the unsigned result.
+(rule (lower (has_type $F32 (fcvt_from_uint val @ (value_type (fits_in_32 (ty_int ty))))))
+      (x64_cvtsi2ss $I64 (extend_to_gpr val $I64 (ExtendKind.Zero))))
+
+;; Scalar conversion to $F64 from an integer type that fits in 32 bits.
+;; The input is zero-extended to a 64-bit GPR and then converted with
+;; `cvtsi2sd` operating on the $I64 value. After zero-extension the value is
+;; non-negative when read as a signed 64-bit integer, so the signed
+;; conversion yields the unsigned result.
+(rule (lower (has_type $F64 (fcvt_from_uint val @ (value_type (fits_in_32 (ty_int ty))))))
+      (x64_cvtsi2sd $I64 (extend_to_gpr val $I64 (ExtendKind.Zero))))
+
+;; $I64 inputs, for whatever result type `ty` is requested, are handed to
+;; `cvt_u64_to_float_seq`, which is defined outside this section.
+;; NOTE(review): presumably a dedicated sequence is needed because a full
+;; 64-bit unsigned value cannot simply be fed to a signed 64-bit convert --
+;; confirm against the helper's definition.
+(rule (lower (has_type ty (fcvt_from_uint val @ (value_type $I64))))
+      (cvt_u64_to_float_seq ty val))
+
+;; Algorithm uses unpcklps to help create a float that is equivalent to
+;; 0x1.0p52 + double(src). 0x1.0p52 is unique because at this exponent
+;; every value of the mantissa represents a corresponding uint32 number.
+;; When we subtract 0x1.0p52 we are left with double(src).
+;;
+;; This rule only matches when the operand of `fcvt_from_uint` is a
+;; `uwiden_low` of an $I32X4 value; the sequence operates directly on that
+;; un-widened `val`.
+(rule (lower (has_type $F64X2 (fcvt_from_uint (uwiden_low val @ (value_type $I32X4)))))
+      (let (;; constant interleaved with the input lanes
+            ;; NOTE(review): presumably the high 32 bits of 0x1.0p52 in each
+            ;; lane -- confirm against `fcvt_uint_mask_const`
+            (uint_mask Xmm (x64_xmm_load_const $I32X4 (fcvt_uint_mask_const)))
+            ;; interleave the low lanes of `val` with the constant's lanes
+            (res Xmm (x64_unpcklps val uint_mask))
+            ;; constant subtracted at the end
+            ;; NOTE(review): presumably 0x1.0p52 in each f64 lane -- confirm
+            ;; against `fcvt_uint_mask_high_const`
+            (uint_mask_high Xmm (x64_xmm_load_const $I32X4 (fcvt_uint_mask_high_const))))
+        (x64_subpd res uint_mask_high)))
+
+;; When AVX512VL and AVX512F are both available, `fcvt_from_uint` producing
+;; $F32X4 can be lowered to a single instruction (`vcvtudq2ps`).
+;;
+;; NOTE: the priority of 1 here is to break ties with the next rule for
+;; $F32X4: that rule does not require either of the avx512 extensions to be
+;; enabled, so it matches the same inputs as this one.
+(rule 1 (lower (has_type (and (avx512vl_enabled) (avx512f_enabled) $F32X4)
+                         (fcvt_from_uint src)))
+      (x64_vcvtudq2ps src))
+
+;; Converting packed unsigned integers to packed floats
+;; requires a few steps. This rule uses no single-instruction
+;; lowering for converting packed unsigned integers, but there
+;; is one for converting packed signed integers to float
+;; (cvtdq2ps). In the steps below we isolate the upper half
+;; (16 bits) and lower half (16 bits) of each lane and then
+;; convert each half separately using cvtdq2ps, which is meant
+;; for signed integers. For this to work for the upper half
+;; we must first shift those bits right by 1 (divide by 2) so
+;; that the most significant bit is 0 and the value is not
+;; treated as negative; after the conversion we double the
+;; value. Finally we add the converted values, where the
+;; addition will correctly round.
+;;
+;; Sequence (shown for an example lane value):
+;; -> A = 0xffffffff
+;; -> Ah = 0xffff0000
+;; -> Al = 0x0000ffff
+;; -> Convert(Al) // Convert int to float
+;; -> Ah = Ah >> 1 // Shift right 1 to ensure the Ah conversion isn't treated as signed
+;; -> Convert(Ah) // Convert; the shift lost no bits because the low 16 bits of Ah are zero
+;; -> Ah = Ah + Ah // Double Ah to account for the shift right before the conversion.
+;; -> dst = Ah + Al // Add the two floats together
+(rule (lower (has_type $F32X4 (fcvt_from_uint a)))
+      (let (;; get the low 16 bits of each lane: shift left by 16, then back
+            ;; right by 16
+            (a_lo Xmm (x64_pslld a (RegMemImm.Imm 16)))
+            (a_lo Xmm (x64_psrld a_lo (RegMemImm.Imm 16)))
+
+            ;; get the high 16 bits by subtracting the low half from the
+            ;; original value
+            (a_hi Xmm (x64_psubd a a_lo))
+
+            ;; convert the low 16 bits
+            (a_lo Xmm (x64_cvtdq2ps a_lo))
+
+            ;; shift the high bits right by 1, convert, and double to get the
+            ;; correct value
+            (a_hi Xmm (x64_psrld a_hi (RegMemImm.Imm 1)))
+            (a_hi Xmm (x64_cvtdq2ps a_hi))
+            (a_hi Xmm (x64_addps a_hi a_hi)))
+
+        ;; add together the two converted values
+        (x64_addps a_hi a_lo)))