This commit adds lowerings to the AArch64 backend for the element-based `fmla` and `fmls` instructions. These instructions take one multiplicand as an implicit broadcast of a single lane of another register, which removes the `shuffle` or `dup` instructions that would otherwise be needed to materialize the broadcast.
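For example, where a `splat` feeds one multiplicand of a vector `fma`, as in the `%fma_*_splat*` functions in the test below, the backend can now select a single by-element multiply-accumulate. A sketch with hypothetical register assignments, not actual compiler output:

    ; before: materialize the broadcast, then use the vector-by-vector form
    dup  v2.4s, v1.s[0]
    fmla v0.4s, v2.4s, v3.4s

    ; after: a single fmla (by element)
    fmla v0.4s, v3.4s, v1.s[0]
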
test interpret
test run
set enable_simd
target x86_64 has_avx has_fma
target x86_64 has_avx=false has_fma=false
target aarch64

function %fma_f32x4(f32x4, f32x4, f32x4) -> f32x4 {
block0(v0: f32x4, v1: f32x4, v2: f32x4):
    v3 = fma v0, v1, v2
    return v3
}
; run: %fma_f32x4([0x9.0 0x83.0 0x1.99999ap-2 -0x1.4cccccp0], [0x9.0 0x2.68091p6 0x1.333334p-1 -0x1.666666p1], [0x9.0 0x9.88721p1 0x1.400000p1 -0x1.b33334p0]) == [0x1.680000p6 0x1.3b88e6p14 0x1.5eb852p1 0x1.f0a3d2p0]
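; note: lane 0 above is 9.0 * 9.0 + 9.0 == 90.0, i.e. 0x1.68p6 in hex float notation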

; Zeroes
; run: %fma_f32x4([0x0.0 0x0.0 0x0.0 -0x0.0], [0x0.0 0x0.0 -0x0.0 0x0.0], [0x0.0 -0x0.0 0x0.0 0x0.0]) == [0x0.0 0x0.0 0x0.0 0x0.0]

; Infinities
; run: %fma_f32x4([-Inf Inf -Inf Inf], [-Inf -Inf Inf -Inf], [0x0.0 0x0.0 0x0.0 -Inf]) == [Inf -Inf -Inf -Inf]
; run: %fma_f32x4([-Inf 0x0.0 0x0.0 0x0.0], [Inf 0x0.0 0x0.0 0x0.0], [-Inf 0x0.0 0x0.0 0x0.0]) == [-Inf 0x0.0 0x0.0 0x0.0]

; F32 Epsilon / Max / Min Positive
; run: %fma_f32x4([0x1.000000p-23 0x0.0 0x1.fffffep127 0x0.0], [0x1.000000p-23 0x0.0 0x1.fffffep127 0x0.0], [0x1.000000p-23 0x1.000000p-23 0x1.fffffep127 0x1.fffffep127]) == [0x1.000002p-23 0x1.000000p-23 +Inf 0x1.fffffep127]
; run: %fma_f32x4([0x1.000000p-126 0x0.0 0x0.0 0x0.0], [0x1.000000p-126 0x0.0 0x0.0 0x0.0], [0x1.000000p-126 0x1.000000p-126 0x0.0 0x0.0]) == [0x1.000000p-126 0x1.000000p-126 0x0.0 0x0.0]

; F32 Subnormals
; run: %fma_f32x4([0x0.800000p-126 0x0.800000p-126 0x0.0 0x0.000002p-126], [0x0.800000p-126 0x0.800000p-126 0x0.0 0x0.000002p-126], [0x0.800000p-126 0x0.0 0x0.000002p-126 0x0.000002p-126]) == [0x0.800000p-126 0x0.0 0x0.000002p-126 0x0.000002p-126]
; run: %fma_f32x4([0x0.000002p-126 0x0.0 0x0.0 0x0.0], [0x0.000002p-126 0x0.0 0x0.0 0x0.0], [0x0.0 0x0.000002p-126 0x0.0 0x0.0]) == [0x0.0 0x0.000002p-126 0x0.0 0x0.0]

;; The IEEE 754 standard does not make many guarantees about what comes out
;; of NaN-producing operations, so we only check that the result is a NaN.
function %fma_is_nan_f32x4(f32x4, f32x4, f32x4) -> i8 {
block0(v0: f32x4, v1: f32x4, v2: f32x4):
    v3 = fma v0, v1, v2
    ; a NaN is the only value that compares not-equal to itself
    v4 = fcmp ne v3, v3
    v5 = vall_true v4
    return v5
}
; run: %fma_is_nan_f32x4([Inf -Inf -Inf +NaN], [-Inf Inf -Inf 0x0.0], [Inf Inf -Inf 0x0.0]) == 1
; run: %fma_is_nan_f32x4([0x0.0 0x0.0 -NaN 0x0.0], [+NaN 0x0.0 0x0.0 -NaN], [0x0.0 +NaN 0x0.0 0x0.0]) == 1
; run: %fma_is_nan_f32x4([0x0.0 NaN NaN NaN], [0x0.0 NaN NaN NaN], [-NaN NaN NaN NaN]) == 1

function %fma_f64x2(f64x2, f64x2, f64x2) -> f64x2 {
block0(v0: f64x2, v1: f64x2, v2: f64x2):
    v3 = fma v0, v1, v2
    return v3
}
; run: %fma_f64x2([0x9.0 0x1.3b88ea148dd4ap14], [0x9.0 0x2.680916809121p6], [0x9.0 0x9.887218721837p1]) == [0x1.680000p6 0x1.7ba6ebee17417p21]

; Zeroes
; run: %fma_f64x2([0x0.0 0x0.0], [0x0.0 0x0.0], [0x0.0 -0x0.0]) == [0x0.0 0x0.0]
; run: %fma_f64x2([0x0.0 -0x0.0], [-0x0.0 0x0.0], [0x0.0 0x0.0]) == [0x0.0 0x0.0]

; Infinities
; run: %fma_f64x2([-Inf Inf], [-Inf -Inf], [0x0.0 0x0.0]) == [+Inf -Inf]
; run: %fma_f64x2([-Inf Inf], [Inf -Inf], [0x0.0 -Inf]) == [-Inf -Inf]
; run: %fma_f64x2([-Inf Inf], [Inf Inf], [-Inf Inf]) == [-Inf Inf]

; F64 Epsilon / Max / Min Positive
; run: %fma_f64x2([0x1.0p-52 0x0.0], [0x1.0p-52 0x0.0], [0x1.0p-52 0x1.0p-52]) == [0x1.0000000000001p-52 0x1.0p-52]
; run: %fma_f64x2([0x1.fffffffffffffp1023 0x0.0], [0x1.fffffffffffffp1023 0x0.0], [0x1.fffffffffffffp1023 0x1.fffffffffffffp1023]) == [+Inf 0x1.fffffffffffffp1023]
; run: %fma_f64x2([0x1.0p-1022 0x0.0], [0x1.0p-1022 0x0.0], [0x1.0p-1022 0x1.0p-1022]) == [0x1.0p-1022 0x1.0p-1022]

; F64 Subnormals
; run: %fma_f64x2([0x0.8p-1022 0x0.8p-1022], [0x0.8p-1022 0x0.8p-1022], [0x0.8p-1022 0x0.0]) == [0x0.8p-1022 0x0.0]
; run: %fma_f64x2([0x0.0 0x0.0000000000001p-1022], [0x0.0 0x0.0000000000001p-1022], [0x0.8p-1022 0x0.0000000000001p-1022]) == [0x0.8p-1022 0x0.0000000000001p-1022]
; run: %fma_f64x2([0x0.0000000000001p-1022 0x0.0], [0x0.0000000000001p-1022 0x0.0], [0x0.0 0x0.0000000000001p-1022]) == [0x0.0 0x0.0000000000001p-1022]

;; The IEEE 754 standard does not make many guarantees about what comes out
;; of NaN-producing operations, so we only check that the result is a NaN.
function %fma_is_nan_f64x2(f64x2, f64x2, f64x2) -> i8 {
block0(v0: f64x2, v1: f64x2, v2: f64x2):
    v3 = fma v0, v1, v2
    ; a NaN is the only value that compares not-equal to itself
    v4 = fcmp ne v3, v3
    v5 = vall_true v4
    return v5
}
; run: %fma_is_nan_f64x2([Inf -Inf], [-Inf Inf], [Inf Inf]) == 1
; run: %fma_is_nan_f64x2([-Inf +NaN], [-Inf 0x0.0], [-Inf 0x0.0]) == 1
; run: %fma_is_nan_f64x2([0x0.0 0x0.0], [+NaN 0x0.0], [0x0.0 +NaN]) == 1
; run: %fma_is_nan_f64x2([-NaN 0x0.0], [0x0.0 -NaN], [0x0.0 0x0.0]) == 1
; run: %fma_is_nan_f64x2([0x0.0 NaN], [0x0.0 NaN], [-NaN NaN]) == 1
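
;; The functions below exercise the new element-based lowerings: on AArch64,
;; a `splat` feeding one multiplicand of an `fma` should select the
;; FMLA (by element) form instead of a separate `dup` plus a vector FMLA.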
function %fma_f32x4_splat1(f32x4, f32, f32x4) -> f32x4 {
block0(v0: f32x4, v1: f32, v2: f32x4):
    v3 = splat.f32x4 v1
    v4 = fma v0, v3, v2
    return v4
}
; run: %fma_f32x4_splat1([0x9.0 0x9.0 0x9.0 0x9.0], 0x9.0, [0x9.0 0x9.0 0x9.0 0x9.0]) == [0x1.680000p6 0x1.680000p6 0x1.680000p6 0x1.680000p6]
; run: %fma_f32x4_splat1([0x1.0 0x2.0 0x3.0 0x4.0], 0x0.0, [0x5.0 0x6.0 0x7.0 0x8.0]) == [0x5.0 0x6.0 0x7.0 0x8.0]

function %fma_f32x4_splat2(f32, f32x4, f32x4) -> f32x4 {
block0(v0: f32, v1: f32x4, v2: f32x4):
    v3 = splat.f32x4 v0
    v4 = fma v3, v1, v2
    return v4
}
; run: %fma_f32x4_splat2(0x9.0, [0x9.0 0x9.0 0x9.0 0x9.0], [0x9.0 0x9.0 0x9.0 0x9.0]) == [0x1.680000p6 0x1.680000p6 0x1.680000p6 0x1.680000p6]
; run: %fma_f32x4_splat2(0x0.0, [0x1.0 0x2.0 0x3.0 0x4.0], [0x5.0 0x6.0 0x7.0 0x8.0]) == [0x5.0 0x6.0 0x7.0 0x8.0]
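
;; Same pattern on f64x2: the splat should fold into FMLA (by element)
;; on the .2d arrangement.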
function %fma_f64x2_splat1(f64x2, f64, f64x2) -> f64x2 {
block0(v0: f64x2, v1: f64, v2: f64x2):
    v3 = splat.f64x2 v1
    v4 = fma v0, v3, v2
    return v4
}
; run: %fma_f64x2_splat1([0x9.0 0x9.0], 0x9.0, [0x9.0 0x9.0]) == [0x1.680000p6 0x1.680000p6]
; run: %fma_f64x2_splat1([0x1.0 0x2.0], 0x0.0, [0x5.0 0x6.0]) == [0x5.0 0x6.0]

function %fma_f64x2_splat2(f64, f64x2, f64x2) -> f64x2 {
block0(v0: f64, v1: f64x2, v2: f64x2):
    v3 = splat.f64x2 v0
    v4 = fma v3, v1, v2
    return v4
}
; run: %fma_f64x2_splat2(0x9.0, [0x9.0 0x9.0], [0x9.0 0x9.0]) == [0x1.680000p6 0x1.680000p6]
; run: %fma_f64x2_splat2(0x0.0, [0x1.0 0x2.0], [0x5.0 0x6.0]) == [0x5.0 0x6.0]
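
;; Illustrative sketch: the kind of pattern the element-based `fmls` lowering
;; is aimed at, where the splatted multiplicand is negated before the `fma`.
;; Whether the backend folds the `fneg` into an FMLS (by element) here is an
;; assumption; the run directive below only checks the arithmetic.
function %fmls_f32x4_splat(f32x4, f32, f32x4) -> f32x4 {
block0(v0: f32x4, v1: f32, v2: f32x4):
    v3 = fneg v1
    v4 = splat.f32x4 v3
    v5 = fma v0, v4, v2
    return v5
}
; run: %fmls_f32x4_splat([0x2.0 0x2.0 0x2.0 0x2.0], 0x3.0, [0x7.0 0x7.0 0x7.0 0x7.0]) == [0x1.0 0x1.0 0x1.0 0x1.0]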