aarch64: Add support for the fmls instruction (#5895)

This commit adds lowerings to the AArch64 backend for the `fmls`
instruction, which is intended to be leveraged by the relaxed-simd
proposal for WebAssembly. When one multiplicand of a vector `fma` is
negated, the backend now emits a single `fmls` instead of an `fmla`
plus a separate negation instruction, which should make codegen for
this operator slightly more efficient.
Author: Alex Crichton
Date: 2023-03-01 23:45:58 -06:00
Committed by: GitHub
Parent: 52b4c48a1b
Commit: 9984e959cd
5 changed files with 173 additions and 2 deletions
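For context on why the rewrite is sound: the AArch64 `fmls` instruction
computes `acc - a * b` with a single rounding, which is exactly the value of
an `fma` whose first (or, since the product commutes, second) multiplicand is
negated. The Rust sketch below models that identity with the standard
library's `mul_add`; `fmls_model` is a hypothetical name used only for
illustration, not code from this commit.

// A scalar model of the identity this lowering relies on (a sketch,
// not code from this commit): fmls computes acc - a * b with a single
// rounding, the same value as fma(-a, b, acc) and fma(a, -b, acc).
fn fma(a: f32, b: f32, c: f32) -> f32 {
    a.mul_add(b, c) // fused multiply-add from Rust's standard library
}

fn fmls_model(acc: f32, a: f32, b: f32) -> f32 {
    (-a).mul_add(b, acc) // acc - a * b, still a single rounding
}

fn main() {
    let (a, b, acc) = (1.5f32, 2.25, 10.0);
    // Negating either multiplicand of the fma yields the fmls result.
    assert_eq!(fma(-a, b, acc), fmls_model(acc, a, b));
    assert_eq!(fma(a, -b, acc), fmls_model(acc, a, b));
}

This is also why the tests below exercise `fneg` on either multiplicand of
the `fma`: both shapes fold into a single `fmls`.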


@@ -0,0 +1,159 @@
test compile precise-output
target aarch64

function %fma_f32(f32, f32, f32) -> f32 {
block0(v0: f32, v1: f32, v2: f32):
    v3 = fma v0, v1, v2
    return v3
}

; VCode:
; block0:
;   fmadd s0, s0, s1, s2
;   ret
;
; Disassembled:
; block0: ; offset 0x0
;   fmadd s0, s0, s1, s2
;   ret

function %fma_f64(f64, f64, f64) -> f64 {
block0(v0: f64, v1: f64, v2: f64):
    v3 = fma v0, v1, v2
    return v3
}

; VCode:
; block0:
;   fmadd d0, d0, d1, d2
;   ret
;
; Disassembled:
; block0: ; offset 0x0
;   fmadd d0, d0, d1, d2
;   ret

function %fma_f32x4(f32x4, f32x4, f32x4) -> f32x4 {
block0(v0: f32x4, v1: f32x4, v2: f32x4):
    v3 = fma v0, v1, v2
    return v3
}

; VCode:
; block0:
;   mov v5.16b, v0.16b
;   mov v0.16b, v2.16b
;   fmla v0.4s, v0.4s, v5.4s, v1.4s
;   ret
;
; Disassembled:
; block0: ; offset 0x0
;   mov v5.16b, v0.16b
;   mov v0.16b, v2.16b
;   fmla v0.4s, v5.4s, v1.4s
;   ret

function %fma_f64x2(f64x2, f64x2, f64x2) -> f64x2 {
block0(v0: f64x2, v1: f64x2, v2: f64x2):
    v3 = fma v0, v1, v2
    return v3
}

; VCode:
; block0:
;   mov v5.16b, v0.16b
;   mov v0.16b, v2.16b
;   fmla v0.2d, v0.2d, v5.2d, v1.2d
;   ret
;
; Disassembled:
; block0: ; offset 0x0
;   mov v5.16b, v0.16b
;   mov v0.16b, v2.16b
;   fmla v0.2d, v5.2d, v1.2d
;   ret

function %fma_neg_f32x4(f32x4, f32x4, f32x4) -> f32x4 {
block0(v0: f32x4, v1: f32x4, v2: f32x4):
    v3 = fneg v0
    v4 = fma v3, v1, v2
    return v4
}

; VCode:
; block0:
;   mov v5.16b, v0.16b
;   mov v0.16b, v2.16b
;   fmls v0.4s, v0.4s, v5.4s, v1.4s
;   ret
;
; Disassembled:
; block0: ; offset 0x0
;   mov v5.16b, v0.16b
;   mov v0.16b, v2.16b
;   fmls v0.4s, v5.4s, v1.4s
;   ret

function %fma_neg_f64x2(f64x2, f64x2, f64x2) -> f64x2 {
block0(v0: f64x2, v1: f64x2, v2: f64x2):
    v3 = fneg v0
    v4 = fma v3, v1, v2
    return v4
}

; VCode:
; block0:
;   mov v5.16b, v0.16b
;   mov v0.16b, v2.16b
;   fmls v0.2d, v0.2d, v5.2d, v1.2d
;   ret
;
; Disassembled:
; block0: ; offset 0x0
;   mov v5.16b, v0.16b
;   mov v0.16b, v2.16b
;   fmls v0.2d, v5.2d, v1.2d
;   ret

function %fma_neg_other_f32x4(f32x4, f32x4, f32x4) -> f32x4 {
block0(v0: f32x4, v1: f32x4, v2: f32x4):
    v3 = fneg v1
    v4 = fma v0, v3, v2
    return v4
}

; VCode:
; block0:
;   mov v5.16b, v0.16b
;   mov v0.16b, v2.16b
;   fmls v0.4s, v0.4s, v5.4s, v1.4s
;   ret
;
; Disassembled:
; block0: ; offset 0x0
;   mov v5.16b, v0.16b
;   mov v0.16b, v2.16b
;   fmls v0.4s, v5.4s, v1.4s
;   ret

function %fma_neg_other_f64x2(f64x2, f64x2, f64x2) -> f64x2 {
block0(v0: f64x2, v1: f64x2, v2: f64x2):
    v3 = fneg v1
    v4 = fma v0, v3, v2
    return v4
}

; VCode:
; block0:
;   mov v5.16b, v0.16b
;   mov v0.16b, v2.16b
;   fmls v0.2d, v0.2d, v5.2d, v1.2d
;   ret
;
; Disassembled:
; block0: ; offset 0x0
;   mov v5.16b, v0.16b
;   mov v0.16b, v2.16b
;   fmls v0.2d, v5.2d, v1.2d
;   ret