aarch64: Add more lowerings for the CLIF fma (#6150)

This commit adds new lowerings to the AArch64 backend for the
element-based `fmla` and `fmls` instructions. These instructions take
one of the multiplicands as an implicit broadcast of a single lane of
another register, and using them can remove `shuffle` or `dup`
instructions that would otherwise be needed.
Author: Alex Crichton
Date: 2023-04-05 12:22:55 -05:00
Committed by: GitHub
Parent: bf741955f0
Commit: 967543eb43
8 changed files with 321 additions and 15 deletions
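
For reference, the element-based forms exercised in the tests below compute a fused multiply-add where one multiplicand is a single lane broadcast across the vector. A rough scalar model in Rust follows (an illustrative sketch only, not code from this commit; the function names and signatures are made up):

    // Sketch of fmla vD.4s, vN.4s, vM.s[lane]: every lane of the accumulator
    // gains a[i] * b[lane], computed with a single rounding (fused multiply-add).
    fn fmla_by_element(acc: &mut [f32; 4], a: [f32; 4], b: [f32; 4], lane: usize) {
        for i in 0..4 {
            acc[i] = a[i].mul_add(b[lane], acc[i]);
        }
    }

    // Sketch of fmls vD.4s, vN.4s, vM.s[lane]: same shape, but the product is
    // negated, i.e. acc[i] - a[i] * b[lane], still with a single rounding.
    fn fmls_by_element(acc: &mut [f32; 4], a: [f32; 4], b: [f32; 4], lane: usize) {
        for i in 0..4 {
            acc[i] = (-a[i]).mul_add(b[lane], acc[i]);
        }
    }

In the first test below, for example, `%f32x4_splat0` splats a scalar `f32` and feeds it to `fma`, and the lowering folds the splat into `fmla v0.4s, v1.4s, v5.s[0]` instead of emitting a separate `dup`.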


@@ -157,3 +157,152 @@ block0(v0: f64x2, v1: f64x2, v2: f64x2):
; fmls v0.2d, v5.2d, v1.2d
; ret

function %f32x4_splat0(f32, f32x4, f32x4) -> f32x4 {
block0(v0: f32, v1: f32x4, v2: f32x4):
v3 = splat.f32x4 v0
v4 = fma v3, v1, v2
return v4
}
; VCode:
; block0:
; mov v5.16b, v0.16b
; mov v0.16b, v2.16b
; fmla v0.4s, v0.4s, v1.4s, v5.s[0]
; ret
;
; Disassembled:
; block0: ; offset 0x0
; mov v5.16b, v0.16b
; mov v0.16b, v2.16b
; fmla v0.4s, v1.4s, v5.s[0]
; ret

function %f32x4_splat1(f32x4, f32, f32x4) -> f32x4 {
block0(v0: f32x4, v1: f32, v2: f32x4):
v3 = splat.f32x4 v1
v4 = fneg v0
v5 = fma v4, v3, v2
return v5
}
; VCode:
; block0:
; mov v5.16b, v0.16b
; mov v0.16b, v2.16b
; fmls v0.4s, v0.4s, v5.4s, v1.s[0]
; ret
;
; Disassembled:
; block0: ; offset 0x0
; mov v5.16b, v0.16b
; mov v0.16b, v2.16b
; fmls v0.4s, v5.4s, v1.s[0]
; ret

function %f32x4_splat2(f32x4, f32x4, f32x4) -> f32x4 {
block0(v0: f32x4, v1: f32x4, v2: f32x4):
v3 = bitcast.i8x16 little v0
v4 = shuffle v3, v3, 0x07060504_07060504_07060504_07060504
v5 = bitcast.f32x4 little v4
v6 = fma v5, v1, v2
return v6
}
; VCode:
; block0:
; mov v5.16b, v0.16b
; mov v0.16b, v2.16b
; fmla v0.4s, v0.4s, v1.4s, v5.s[1]
; ret
;
; Disassembled:
; block0: ; offset 0x0
; mov v5.16b, v0.16b
; mov v0.16b, v2.16b
; fmla v0.4s, v1.4s, v5.s[1]
; ret

function %f32x4_splat3(f32x4, f32x4, f32x4) -> f32x4 {
block0(v0: f32x4, v1: f32x4, v2: f32x4):
v3 = bitcast.i8x16 little v1
v4 = shuffle v3, v3, 0x0f0e0d0c_0f0e0d0c_0f0e0d0c_0f0e0d0c
v5 = bitcast.f32x4 little v4
v6 = fneg v5
v7 = fma v0, v6, v2
return v7
}
; VCode:
; block0:
; mov v5.16b, v0.16b
; mov v0.16b, v2.16b
; fmls v0.4s, v0.4s, v5.4s, v1.s[3]
; ret
;
; Disassembled:
; block0: ; offset 0x0
; mov v5.16b, v0.16b
; mov v0.16b, v2.16b
; fmls v0.4s, v5.4s, v1.s[3]
; ret

function %f32x4_splat4(f32x4, f32x4, f32x4) -> f32x4 {
block0(v0: f32x4, v1: f32x4, v2: f32x4):
v3 = bitcast.i8x16 little v1
v4 = shuffle v3, v3, 0x1f1e1d1c_1f1e1d1c_1f1e1d1c_1f1e1d1c
v5 = bitcast.f32x4 little v4
v6 = fma v0, v5, v2
return v6
}
; VCode:
; block0:
; mov v31.16b, v1.16b
; movz w6, #7452
; movk w6, w6, #7966, LSL #16
; dup v17.4s, w6
; mov v30.16b, v31.16b
; tbl v19.16b, { v30.16b, v31.16b }, v17.16b
; mov v23.16b, v0.16b
; mov v0.16b, v2.16b
; fmla v0.4s, v0.4s, v23.4s, v19.4s
; ret
;
; Disassembled:
; block0: ; offset 0x0
; mov v31.16b, v1.16b
; mov w6, #0x1d1c
; movk w6, #0x1f1e, lsl #16
; dup v17.4s, w6
; mov v30.16b, v31.16b
; tbl v19.16b, {v30.16b, v31.16b}, v17.16b
; mov v23.16b, v0.16b
; mov v0.16b, v2.16b
; fmla v0.4s, v23.4s, v19.4s
; ret

function %f64x2_splat0(f64x2, f64x2, f64x2) -> f64x2 {
block0(v0: f64x2, v1: f64x2, v2: f64x2):
v3 = bitcast.i8x16 little v1
v4 = shuffle v3, v3, 0x0f0e0d0c0b0a0908_0f0e0d0c0b0a0908
v5 = bitcast.f64x2 little v4
v6 = fneg v5
v7 = fma v0, v6, v2
return v7
}
; VCode:
; block0:
; mov v5.16b, v0.16b
; mov v0.16b, v2.16b
; fmls v0.2d, v0.2d, v5.2d, v1.d[1]
; ret
;
; Disassembled:
; block0: ; offset 0x0
; mov v5.16b, v0.16b
; mov v0.16b, v2.16b
; fmls v0.2d, v5.2d, v1.d[1]
; ret


@@ -87,3 +87,39 @@ block0(v0: f64x2, v1: f64x2, v2: f64x2):
; run: %fma_is_nan_f64x2([0x0.0 0x0.0], [+NaN 0x0.0], [0x0.0 +NaN]) == 1
; run: %fma_is_nan_f64x2([-NaN 0x0.0], [0x0.0 -NaN], [0x0.0 0x0.0]) == 1
; run: %fma_is_nan_f64x2([0x0.0 NaN], [0x0.0 NaN], [-NaN NaN]) == 1

function %fma_f32x4_splat1(f32x4, f32, f32x4) -> f32x4 {
block0(v0: f32x4, v1: f32, v2: f32x4):
v3 = splat.f32x4 v1
v4 = fma v0, v3, v2
return v4
}
; run: %fma_f32x4_splat1([0x9.0 0x9.0 0x9.0 0x9.0], 0x9.0, [0x9.0 0x9.0 0x9.0 0x9.0]) == [0x1.680000p6 0x1.680000p6 0x1.680000p6 0x1.680000p6]
; run: %fma_f32x4_splat1([0x1.0 0x2.0 0x3.0 0x4.0], 0x0.0, [0x5.0 0x6.0 0x7.0 0x8.0]) == [0x5.0 0x6.0 0x7.0 0x8.0]

function %fma_f32x4_splat2(f32, f32x4, f32x4) -> f32x4 {
block0(v0: f32, v1: f32x4, v2: f32x4):
v3 = splat.f32x4 v0
v4 = fma v3, v1, v2
return v4
}
; run: %fma_f32x4_splat2(0x9.0, [0x9.0 0x9.0 0x9.0 0x9.0], [0x9.0 0x9.0 0x9.0 0x9.0]) == [0x1.680000p6 0x1.680000p6 0x1.680000p6 0x1.680000p6]
; run: %fma_f32x4_splat2(0x0.0, [0x1.0 0x2.0 0x3.0 0x4.0], [0x5.0 0x6.0 0x7.0 0x8.0]) == [0x5.0 0x6.0 0x7.0 0x8.0]

function %fma_f64x2_splat1(f64x2, f64, f64x2) -> f64x2 {
block0(v0: f64x2, v1: f64, v2: f64x2):
v3 = splat.f64x2 v1
v4 = fma v0, v3, v2
return v4
}
; run: %fma_f64x2_splat1([0x9.0 0x9.0], 0x9.0, [0x9.0 0x9.0]) == [0x1.680000p6 0x1.680000p6]
; run: %fma_f64x2_splat1([0x1.0 0x2.0], 0x0.0, [0x5.0 0x6.0]) == [0x5.0 0x6.0]

function %fma_f64x2_splat2(f64, f64x2, f64x2) -> f64x2 {
block0(v0: f64, v1: f64x2, v2: f64x2):
v3 = splat.f64x2 v0
v4 = fma v3, v1, v2
return v4
}
; run: %fma_f64x2_splat2(0x9.0, [0x9.0 0x9.0], [0x9.0 0x9.0]) == [0x1.680000p6 0x1.680000p6]
; run: %fma_f64x2_splat2(0x0.0, [0x1.0 0x2.0], [0x5.0 0x6.0]) == [0x5.0 0x6.0]