Convert fma, valltrue & vanytrue to ISLE (AArch64) (#4608)

* Convert `fma`, `valltrue` & `vanytrue` to ISLE (AArch64)

Ported the existing implementations of the following opcodes to ISLE on
AArch64:
- `fma`
  - Introduced missing support for `fma` on vector values, as required by
    the instruction's documented semantics.
- `valltrue`
- `vanytrue`

Also fixed `fcmp` on scalar values in the interpreter, and enabled
interpreter tests in `simd-fma.clif`.

This introduces the `FMLA` machine instruction.

Copyright (c) 2022 Arm Limited

* Add comments for `Fmla` and `Bsl`

Copyright (c) 2022 Arm Limited
This commit is contained in:
Damian Heaton
2022-08-05 17:47:56 +01:00
committed by GitHub
parent 1ed7b43e62
commit eb332b8369
19 changed files with 608 additions and 206 deletions

View File

@@ -910,3 +910,39 @@ block0(v0: f64x2):
; block0:
; frintn v0.2d, v0.2d
; ret
function %f78(f32x4, f32x4, f32x4) -> f32x4 {
block0(v0: f32x4, v1: f32x4, v2: f32x4):
; `fma` on f32x4 lowers to FMLA, which multiplies two operands and
; accumulates into its destination register; the addend (v2) is therefore
; moved into the result register before the multiply-accumulate.
v3 = fma v0, v1, v2
return v3
}
; block0:
; mov v17.16b, v0.16b
; mov v0.16b, v2.16b
; fmla v0.4s, v17.4s, v1.4s
; ret
function %f79(f32x2, f32x2, f32x2) -> f32x2 {
block0(v0: f32x2, v1: f32x2, v2: f32x2):
; Same lowering for the 64-bit vector case (two f32 lanes).
v3 = fma v0, v1, v2
return v3
}
; block0:
; mov v17.16b, v0.16b
; mov v0.16b, v2.16b
; fmla v0.2s, v17.2s, v1.2s
; ret
function %f80(f64x2, f64x2, f64x2) -> f64x2 {
block0(v0: f64x2, v1: f64x2, v2: f64x2):
; Same lowering for two f64 lanes.
v3 = fma v0, v1, v2
return v3
}
; block0:
; mov v17.16b, v0.16b
; mov v0.16b, v2.16b
; fmla v0.2d, v17.2d, v1.2d
; ret

View File

@@ -0,0 +1,94 @@
test compile precise-output
set unwind_info=false
target aarch64
; Codegen tests for `vall_true` across boolean vector types. Most lane
; widths reduce with an across-lanes unsigned minimum (UMINV) and then
; compare the result against zero; b32x2 and b64x2 take special paths
; (see the comments inside those functions).
function %fn0(b8x8) -> b1 {
block0(v0: b8x8):
v1 = vall_true v0
return v1
}
; block0:
; uminv b3, v0.8b
; mov x5, v3.d[0]
; subs xzr, x5, #0
; cset x0, ne
; ret
function %fn1(b8x16) -> b1 {
block0(v0: b8x16):
v1 = vall_true v0
return v1
}
; block0:
; uminv b3, v0.16b
; mov x5, v3.d[0]
; subs xzr, x5, #0
; cset x0, ne
; ret
function %fn2(b16x4) -> b1 {
block0(v0: b16x4):
v1 = vall_true v0
return v1
}
; block0:
; uminv h3, v0.4h
; mov x5, v3.d[0]
; subs xzr, x5, #0
; cset x0, ne
; ret
function %fn3(b16x8) -> b1 {
block0(v0: b16x8):
v1 = vall_true v0
return v1
}
; block0:
; uminv h3, v0.8h
; mov x5, v3.d[0]
; subs xzr, x5, #0
; cset x0, ne
; ret
function %fn4(b32x2) -> b1 {
block0(v0: b32x2):
; b32x2 fits in a single 64-bit GPR, so both lanes are checked with
; scalar compares (SUBS on the high lane, CCMP on the low lane) instead
; of a vector reduction.
v1 = vall_true v0
return v1
}
; block0:
; mov x3, v0.d[0]
; subs xzr, xzr, x3, LSR 32
; ccmp w3, #0, #nZcv, ne
; cset x0, ne
; ret
function %fn5(b32x4) -> b1 {
block0(v0: b32x4):
v1 = vall_true v0
return v1
}
; block0:
; uminv s3, v0.4s
; mov x5, v3.d[0]
; subs xzr, x5, #0
; cset x0, ne
; ret
function %fn6(b64x2) -> b1 {
block0(v0: b64x2):
; There is no 64-bit UMINV, so the two lanes are compared against zero
; (CMEQ), folded together with a pairwise add, and the combined value is
; tested via FCMP against itself.
v1 = vall_true v0
return v1
}
; block0:
; cmeq v3.2d, v0.2d, #0
; addp v5.2d, v3.2d, v3.2d
; fcmp d5, d5
; cset x0, eq
; ret

View File

@@ -0,0 +1,47 @@
test interpret
test run
target aarch64
; x86_64 panics: `not implemented: unable to move type: f32x2`
; Runtime tests for `fma` on 64-bit vectors (f32x2), covering signed zeros,
; infinities, F32 extremes (epsilon / max / min positive), and subnormals.
function %fma_f32x2(f32x2, f32x2, f32x2) -> f32x2 {
block0(v0: f32x2, v1: f32x2, v2: f32x2):
v3 = fma v0, v1, v2
return v3
}
; run: %fma_f32x2([0x9.0 0x83.0], [0x9.0 0x2.68091p6], [0x9.0 0x9.88721p1]) == [0x1.680000p6 0x1.3b88e6p14]
; run: %fma_f32x2([0x0.0 0x0.0], [0x0.0 0x0.0], [0x0.0 -0x0.0]) == [0x0.0 0x0.0]
; run: %fma_f32x2([0x0.0 0x0.0], [-0x0.0 -0x0.0], [0x0.0 0x0.0]) == [0x0.0 0x0.0]
; run: %fma_f32x2([-0x0.0 -0x0.0], [0x0.0 0x0.0], [0x0.0 0x0.0]) == [0x0.0 0x0.0]
; run: %fma_f32x2([-Inf Inf], [-Inf -Inf], [0x0.0 0x0.0]) == [+Inf -Inf]
; run: %fma_f32x2([-Inf Inf], [Inf -Inf], [0x0.0 -Inf]) == [-Inf -Inf]
; run: %fma_f32x2([-Inf -Inf], [Inf Inf], [-Inf -Inf]) == [-Inf -Inf]
; NOTE(review): the next run line duplicates the previous one — likely unintentional.
; run: %fma_f32x2([-Inf -Inf], [Inf Inf], [-Inf -Inf]) == [-Inf -Inf]
; F32 Epsilon / Max / Min Positive
; run: %fma_f32x2([0x1.000000p-23 0x0.0], [0x1.000000p-23 0x0.0], [0x1.000000p-23 0x1.000000p-23]) == [0x1.000002p-23 0x1.000000p-23]
; run: %fma_f32x2([0x1.fffffep127 0x0.0], [0x1.fffffep127 0x0.0], [0x1.fffffep127 0x1.fffffep127]) == [+Inf 0x1.fffffep127]
; run: %fma_f32x2([0x1.000000p-126 0x1.000000p-126], [0x1.000000p-126 0x1.000000p-126], [0x1.000000p-126 0x1.000000p-126]) == [0x1.000000p-126 0x1.000000p-126]
; run: %fma_f32x2([0x0.0 0x0.0], [0x0.0 0x0.0], [0x1.000000p-126 0x1.000000p-126]) == [0x1.000000p-126 0x1.000000p-126]
; F32 Subnormals
; run: %fma_f32x2([0x0.800000p-126 0x0.800000p-126], [0x0.800000p-126 0x0.800000p-126], [0x0.800000p-126 0x0.0]) == [0x0.800000p-126 0x0.0]
; run: %fma_f32x2([0x0.0 0x0.000002p-126], [0x0.0 0x0.000002p-126], [0x0.800000p-126 0x0.000002p-126]) == [0x0.800000p-126 0x0.000002p-126]
; run: %fma_f32x2([0x0.000002p-126 0x0.000002p-126], [0x0.000002p-126 0x0.000002p-126], [0x0.0 0x0.0]) == [0x0.0 0x0.0]
; run: %fma_f32x2([0x0.0 0x0.0], [0x0.0 0x0.0], [0x0.000002p-126 0x0.000002p-126]) == [0x0.000002p-126 0x0.000002p-126]
;; The IEEE754 Standard does not make a lot of guarantees about what
;; comes out of NaN producing operations, we just check if its a NaN
function %fma_is_nan_f32x2(f32x2, f32x2, f32x2) -> b1 {
block0(v0: f32x2, v1: f32x2, v2: f32x2):
; `fcmp ne v3, v3` is true only for NaN lanes; `vall_true` then requires
; every lane of the fma result to be a NaN.
v3 = fma v0, v1, v2
v4 = fcmp ne v3, v3
v5 = vall_true v4
return v5
}
; run: %fma_is_nan_f32x2([Inf -Inf], [-Inf Inf], [Inf Inf]) == true
; run: %fma_is_nan_f32x2([-Inf +NaN], [-Inf 0x0.0], [-Inf 0x0.0]) == true
; run: %fma_is_nan_f32x2([0x0.0 0x0.0], [+NaN 0x0.0], [0x0.0 +NaN]) == true
; run: %fma_is_nan_f32x2([-NaN 0x0.0], [0x0.0 -NaN], [0x0.0 0x0.0]) == true
; run: %fma_is_nan_f32x2([0x0.0 NaN], [0x0.0 NaN], [-NaN NaN]) == true
; run: %fma_is_nan_f32x2([NaN NaN], [NaN NaN], [NaN NaN]) == true

View File

@@ -1,5 +1,7 @@
test interpret
test run
target x86_64 has_avx has_fma
target aarch64
function %fma_f32x4(f32x4, f32x4, f32x4) -> f32x4 {
block0(v0: f32x4, v1: f32x4, v2: f32x4):

View File

@@ -0,0 +1,58 @@
test interpret
test run
target aarch64
; s390x and x86_64 do not support 64-bit vectors.
; Runtime tests for `vall_true` on 64-bit boolean vectors: each test splats
; a single boolean constant across every lane, so the all-lanes reduction
; must return exactly that boolean.
function %valltrue_b8x8_f() -> b1 {
block0:
v0 = bconst.b8 false
v1 = splat.b8x8 v0
v2 = vall_true v1
return v2
}
; run: %valltrue_b8x8_f() == false
function %valltrue_b8x8_t() -> b1 {
block0:
v0 = bconst.b8 true
v1 = splat.b8x8 v0
v2 = vall_true v1
return v2
}
; run: %valltrue_b8x8_t() == true
function %valltrue_b16x4_f() -> b1 {
block0:
v0 = bconst.b16 false
v1 = splat.b16x4 v0
v2 = vall_true v1
return v2
}
; run: %valltrue_b16x4_f() == false
function %valltrue_b16x4_t() -> b1 {
block0:
v0 = bconst.b16 true
v1 = splat.b16x4 v0
v2 = vall_true v1
return v2
}
; run: %valltrue_b16x4_t() == true
function %valltrue_b32x2_f() -> b1 {
block0:
v0 = bconst.b32 false
v1 = splat.b32x2 v0
v2 = vall_true v1
return v2
}
; run: %valltrue_b32x2_f() == false
function %valltrue_b32x2_t() -> b1 {
block0:
v0 = bconst.b32 true
v1 = splat.b32x2 v0
v2 = vall_true v1
return v2
}
; run: %valltrue_b32x2_t() == true

View File

@@ -0,0 +1,58 @@
test interpret
test run
target aarch64
; s390x and x86_64 do not support 64-bit vectors.
; Runtime tests for `vany_true` on 64-bit boolean vectors: each test splats
; a single boolean constant across every lane, so the any-lane reduction
; must return exactly that boolean.
function %vanytrue_b8x8_f() -> b1 {
block0:
v0 = bconst.b8 false
v1 = splat.b8x8 v0
v2 = vany_true v1
return v2
}
; run: %vanytrue_b8x8_f() == false
function %vanytrue_b8x8_t() -> b1 {
block0:
v0 = bconst.b8 true
v1 = splat.b8x8 v0
v2 = vany_true v1
return v2
}
; run: %vanytrue_b8x8_t() == true
function %vanytrue_b16x4_f() -> b1 {
block0:
v0 = bconst.b16 false
v1 = splat.b16x4 v0
v2 = vany_true v1
return v2
}
; run: %vanytrue_b16x4_f() == false
function %vanytrue_b16x4_t() -> b1 {
block0:
v0 = bconst.b16 true
v1 = splat.b16x4 v0
v2 = vany_true v1
return v2
}
; run: %vanytrue_b16x4_t() == true
function %vanytrue_b32x2_f() -> b1 {
block0:
v0 = bconst.b32 false
v1 = splat.b32x2 v0
v2 = vany_true v1
return v2
}
; run: %vanytrue_b32x2_f() == false
function %vanytrue_b32x2_t() -> b1 {
block0:
v0 = bconst.b32 true
v1 = splat.b32x2 v0
v2 = vany_true v1
return v2
}
; run: %vanytrue_b32x2_t() == true