x64: lower fcvt_from_uint to VCVTUDQ2PS when possible

When AVX512VL and AVX512F are available, use a single instruction (`VCVTUDQ2PS`) instead of a lengthy 9-instruction sequence. This optimization is a port from the legacy x86 backend.
2021-05-19 12:20:11 -07:00
parent 3b3b126fe2
commit 54b45d28a3
5 changed files with 93 additions and 63 deletions
--- a/cranelift/filetests/filetests/isa/x64/simd-conversion-run.clif
+++ b/cranelift/filetests/filetests/isa/x64/simd-conversion-run.clif
@@ -2,17 +2,21 @@ test run
 set enable_simd
 target x86_64 machinst

-function %fcvt_from_sint() -> b1 {
-block0:
-    v0 = vconst.i32x4 [-1 0 1 123456789]
+function %fcvt_from_sint(i32x4) -> f32x4 {
+block0(v0: i32x4):
    v1 = fcvt_from_sint.f32x4 v0
-
-    v2 = vconst.f32x4 [-0x1.0 0.0 0x1.0 0x75bcd18.0] ; 123456789 rounds to 123456792.0, an error of 3
-    v3 = fcmp eq v1, v2
-    v4 = vall_true v3
-    return v4
+    return v1
 }
-; run
+; run: %fcvt_from_sint([-1 0 1 123456789]) == [-0x1.0 0.0 0x1.0 0x75bcd18.0]
+; Note that 123456789 rounds to 123456792.0, an error of 3
+
+function %fcvt_from_uint(i32x4) -> f32x4 {
+block0(v0: i32x4):
+    v1 = fcvt_from_uint.f32x4 v0
+    return v1
+}
+; run: %fcvt_from_uint([0xFFFFFFFF 0 1 123456789]) == [0x100000000.0 0.0 0x1.0 0x75bcd18.0]
+; Note that 0xFFFFFFFF is decimal 4,294,967,295 and is rounded up by 1 to 4,294,967,296 when converted to f32.

 function %fcvt_to_sint_sat(f32x4) -> i32x4 {
 block0(v0:f32x4):