x64: Implement SIMD fma (#4474)
* x64: Add VEX Instruction Encoder This uses a similar builder pattern to the EVEX Encoder. Does not yet support memory accesses. * x64: Add FMA Flag * x64: Implement SIMD `fma` * x64: Use 4 register Vex Inst * x64: Reorder VEX pretty print args
This commit is contained in:
85
cranelift/filetests/filetests/runtests/simd-fma.clif
Normal file
85
cranelift/filetests/filetests/runtests/simd-fma.clif
Normal file
@@ -0,0 +1,85 @@
|
||||
test run
|
||||
target x86_64 has_avx has_fma
|
||||
|
||||
; Lane-wise fused multiply-add over three f32x4 vectors: for every lane the
; result is (first * second) + third.
function %fma_f32x4(f32x4, f32x4, f32x4) -> f32x4 {
block0(v10: f32x4, v11: f32x4, v12: f32x4):
    v13 = fma v10, v11, v12
    return v13
}
|
||||
; run: %fma_f32x4([0x9.0 0x83.0 0x1.99999ap-2 -0x1.4cccccp0], [0x9.0 0x2.68091p6 0x1.333334p-1 -0x1.666666p1], [0x9.0 0x9.88721p1 0x1.400000p1 -0x1.b33334p0]) == [0x1.680000p6 0x1.3b88e6p14 0x1.5eb852p1 0x1.f0a3d2p0]
|
||||
|
||||
; Zeroes
|
||||
; run: %fma_f32x4([0x0.0 0x0.0 0x0.0 -0x0.0], [0x0.0 0x0.0 -0x0.0 0x0.0], [0x0.0 -0x0.0 0x0.0 0x0.0]) == [0x0.0 0x0.0 0x0.0 0x0.0]
|
||||
|
||||
; Infinities
|
||||
; run: %fma_f32x4([-Inf Inf -Inf Inf], [-Inf -Inf Inf -Inf], [0x0.0 0x0.0 0x0.0 -Inf]) == [Inf -Inf -Inf -Inf]
|
||||
; run: %fma_f32x4([-Inf 0x0.0 0x0.0 0x0.0], [Inf 0x0.0 0x0.0 0x0.0], [-Inf 0x0.0 0x0.0 0x0.0]) == [-Inf 0x0.0 0x0.0 0x0.0]
|
||||
|
||||
; F32 Epsilon / Max / Min Positive
|
||||
; run: %fma_f32x4([0x1.000000p-23 0x0.0 0x1.fffffep127 0x0.0], [0x1.000000p-23 0x0.0 0x1.fffffep127 0x0.0], [0x1.000000p-23 0x1.000000p-23 0x1.fffffep127 0x1.fffffep127]) == [0x1.000002p-23 0x1.000000p-23 +Inf 0x1.fffffep127]
|
||||
; run: %fma_f32x4([0x1.000000p-126 0x0.0 0x0.0 0x0.0], [0x1.000000p-126 0x0.0 0x0.0 0x0.0], [0x1.000000p-126 0x1.000000p-126 0x0.0 0x0.0]) == [0x1.000000p-126 0x1.000000p-126 0x0.0 0x0.0]
|
||||
|
||||
; F32 Subnormals
|
||||
; run: %fma_f32x4([0x0.800000p-126 0x0.800000p-126 0x0.0 0x0.000002p-126], [0x0.800000p-126 0x0.800000p-126 0x0.0 0x0.000002p-126], [0x0.800000p-126 0x0.0 0x0.000002p-126 0x0.000002p-126]) == [0x0.800000p-126 0x0.0 0x0.000002p-126 0x0.000002p-126]
|
||||
; run: %fma_f32x4([0x0.000002p-126 0x0.0 0x0.0 0x0.0], [0x0.000002p-126 0x0.0 0x0.0 0x0.0], [0x0.0 0x0.000002p-126 0x0.0 0x0.0]) == [0x0.0 0x0.000002p-126 0x0.0 0x0.0]
|
||||
|
||||
|
||||
|
||||
;; The IEEE754 Standard does not make a lot of guarantees about what
|
||||
;; comes out of NaN-producing operations, so we just check whether the result is a NaN
|
||||
; Computes the f32x4 fused multiply-add and reports whether every lane of the
; result is a NaN, without depending on the exact NaN bit pattern.
function %fma_is_nan_f32x4(f32x4, f32x4, f32x4) -> b1 {
block0(v10: f32x4, v11: f32x4, v12: f32x4):
    v13 = fma v10, v11, v12
    ; Comparing the result against itself: a lane is `ne` to itself only
    ; when that lane holds a NaN.
    v14 = fcmp ne v13, v13
    ; Reduce the per-lane mask to one boolean that is true only when all
    ; lanes were NaN.
    v15 = vall_true v14
    return v15
}
|
||||
; run: %fma_is_nan_f32x4([Inf -Inf -Inf +NaN], [-Inf Inf -Inf 0x0.0], [Inf Inf -Inf 0x0.0]) == true
|
||||
; run: %fma_is_nan_f32x4([0x0.0 0x0.0 -NaN 0x0.0], [+NaN 0x0.0 0x0.0 -NaN], [0x0.0 +NaN 0x0.0 0x0.0]) == true
|
||||
; run: %fma_is_nan_f32x4([0x0.0 NaN NaN NaN], [0x0.0 NaN NaN NaN], [-NaN NaN NaN NaN]) == true
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
; Lane-wise fused multiply-add over three f64x2 vectors: for every lane the
; result is (first * second) + third.
function %fma_f64x2(f64x2, f64x2, f64x2) -> f64x2 {
block0(v10: f64x2, v11: f64x2, v12: f64x2):
    v13 = fma v10, v11, v12
    return v13
}
|
||||
; run: %fma_f64x2([0x9.0 0x1.3b88ea148dd4ap14], [0x9.0 0x2.680916809121p6], [0x9.0 0x9.887218721837p1]) == [0x1.680000p6 0x1.7ba6ebee17417p21]
|
||||
|
||||
; Zeroes
|
||||
; run: %fma_f64x2([0x0.0 0x0.0], [0x0.0 0x0.0], [0x0.0 -0x0.0]) == [0x0.0 0x0.0]
|
||||
; run: %fma_f64x2([0x0.0 -0x0.0], [-0x0.0 0x0.0], [0x0.0 0x0.0]) == [0x0.0 0x0.0]
|
||||
|
||||
; Infinities
|
||||
; run: %fma_f64x2([-Inf Inf], [-Inf -Inf], [0x0.0 0x0.0]) == [+Inf -Inf]
|
||||
; run: %fma_f64x2([-Inf Inf], [Inf -Inf], [0x0.0 -Inf]) == [-Inf -Inf]
|
||||
; run: %fma_f64x2([-Inf Inf], [Inf Inf], [-Inf Inf]) == [-Inf Inf]
|
||||
|
||||
; F64 Epsilon / Max / Min Positive
|
||||
; run: %fma_f64x2([0x1.0p-52 0x0.0], [0x1.0p-52 0x0.0], [0x1.0p-52 0x1.0p-52]) == [0x1.0000000000001p-52 0x1.0p-52]
|
||||
; run: %fma_f64x2([0x1.fffffffffffffp1023 0x0.0], [0x1.fffffffffffffp1023 0x0.0], [0x1.fffffffffffffp1023 0x1.fffffffffffffp1023]) == [+Inf 0x1.fffffffffffffp1023]
|
||||
; run: %fma_f64x2([0x1.0p-1022 0x0.0], [0x1.0p-1022 0x0.0], [0x1.0p-1022 0x1.0p-1022]) == [0x1.0p-1022 0x1.0p-1022]
|
||||
|
||||
; F64 Subnormals
|
||||
; run: %fma_f64x2([0x0.8p-1022 0x0.8p-1022], [0x0.8p-1022 0x0.8p-1022], [0x0.8p-1022 0x0.0]) == [0x0.8p-1022 0x0.0]
|
||||
; run: %fma_f64x2([0x0.0 0x0.0000000000001p-1022], [0x0.0 0x0.0000000000001p-1022], [0x0.8p-1022 0x0.0000000000001p-1022]) == [0x0.8p-1022 0x0.0000000000001p-1022]
|
||||
; run: %fma_f64x2([0x0.0000000000001p-1022 0x0.0], [0x0.0000000000001p-1022 0x0.0], [0x0.0 0x0.0000000000001p-1022]) == [0x0.0 0x0.0000000000001p-1022]
|
||||
|
||||
|
||||
;; The IEEE754 Standard does not make a lot of guarantees about what
|
||||
;; comes out of NaN-producing operations, so we just check whether the result is a NaN
|
||||
; Computes the f64x2 fused multiply-add and reports whether every lane of the
; result is a NaN, without depending on the exact NaN bit pattern.
function %fma_is_nan_f64x2(f64x2, f64x2, f64x2) -> b1 {
block0(v10: f64x2, v11: f64x2, v12: f64x2):
    v13 = fma v10, v11, v12
    ; Comparing the result against itself: a lane is `ne` to itself only
    ; when that lane holds a NaN.
    v14 = fcmp ne v13, v13
    ; Reduce the per-lane mask to one boolean that is true only when all
    ; lanes were NaN.
    v15 = vall_true v14
    return v15
}
|
||||
; run: %fma_is_nan_f64x2([Inf -Inf], [-Inf Inf], [Inf Inf]) == true
|
||||
; run: %fma_is_nan_f64x2([-Inf +NaN], [-Inf 0x0.0], [-Inf 0x0.0]) == true
|
||||
; run: %fma_is_nan_f64x2([0x0.0 0x0.0], [+NaN 0x0.0], [0x0.0 +NaN]) == true
|
||||
; run: %fma_is_nan_f64x2([-NaN 0x0.0], [0x0.0 -NaN], [0x0.0 0x0.0]) == true
|
||||
; run: %fma_is_nan_f64x2([0x0.0 NaN], [0x0.0 NaN], [-NaN NaN]) == true
|
||||
Reference in New Issue
Block a user