x64: lower fcvt_from_uint to VCVTUDQ2PS when possible

When AVX512VL and AVX512F are available, use a single instruction
(`VCVTUDQ2PS`) instead of a lengthy 9-instruction sequence. This
optimization is a port from the legacy x86 backend.
This commit is contained in:
Andrew Brown
2021-05-19 12:20:11 -07:00
parent 3b3b126fe2
commit 54b45d28a3
5 changed files with 93 additions and 63 deletions

View File

@@ -2,17 +2,21 @@ test run
set enable_simd
target x86_64 machinst
function %fcvt_from_sint() -> b1 {
block0:
v0 = vconst.i32x4 [-1 0 1 123456789]
function %fcvt_from_sint(i32x4) -> f32x4 {
block0(v0: i32x4):
v1 = fcvt_from_sint.f32x4 v0
v2 = vconst.f32x4 [-0x1.0 0.0 0x1.0 0x75bcd18.0] ; 123456789 rounds to 123456792.0, an error of 3
v3 = fcmp eq v1, v2
v4 = vall_true v3
return v4
return v1
}
; run
; run: %fcvt_from_sint([-1 0 1 123456789]) == [-0x1.0 0.0 0x1.0 0x75bcd18.0]
; Note that 123456789 rounds to 123456792.0, an error of 3
function %fcvt_from_uint(i32x4) -> f32x4 {
; Test function: converts each lane of an i32x4, treated as an unsigned
; 32-bit integer, to f32 and returns the resulting f32x4 unchanged.
; On x64 this conversion may lower to a single VCVTUDQ2PS when AVX512VL and
; AVX512F are available; otherwise a longer instruction sequence is used.
block0(v0: i32x4):
    v1 = fcvt_from_uint.f32x4 v0 ; values not exactly representable in f32 are rounded
    return v1
}
; run: %fcvt_from_uint([0xFFFFFFFF 0 1 123456789]) == [0x100000000.0 0.0 0x1.0 0x75bcd18.0]
; Note that 0xFFFFFFFF is decimal 4,294,967,295 and is rounded up 1 to 4,294,967,296 in f32x4.
function %fcvt_to_sint_sat(f32x4) -> i32x4 {
block0(v0:f32x4):