The relaxed-simd proposal for WebAssembly adds a fused multiply-add operation for `v128` types, so I was poking around at Cranelift's existing support for its `fma` instruction. I was also poking around at the x86_64 ISA's offerings for FMA and ended up with this PR, which improves the lowering of the `fma` instruction in the x64 backend in a number of ways:

* A libcall-based fallback is now provided for the `f32x4` and `f64x2` types in preparation for eventual support of the relaxed-simd proposal. These lowerings are horribly slow, but if FMA semantics must be guaranteed then they're the best that can be done without the `fma` target feature. Otherwise it'll be up to producers (e.g. Wasmtime embedders) whether wasm-level FMA operations should be a true FMA or a multiply-then-add.
* In addition to the existing `vfmadd213*` instructions, opcodes were added for the `vfmadd132*` variants. The `132` variant is selected based on which argument has a sinkable load.
* Any argument of the `fma` CLIF instruction can now be a `sinkable_load`, and it will still generate a single FMA instruction.
* All of the `vfnmadd*` opcodes were added as well. These are pattern-matched when one of the arguments of the CLIF instruction is an `fneg` (see the sketch below). I opted not to add a new CLIF instruction here since the pattern matching seemed easy enough, but I'm not intimately familiar with the semantics, so if a dedicated instruction is the preferred approach I can do that too.
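To illustrate the shapes the new lowerings look for, here's a minimal CLIF sketch (not taken from the PR; the function names are made up) of the two cases above when the `fma` feature is available: an `fma` whose argument comes through an `fneg` (a `vfnmadd*` candidate) and an `fma` fed directly by a load (a candidate for sinking the load and choosing between the `132` and `213` operand forms):

```
;; Illustrative only -- not part of the PR's test suite.
test compile
target x86_64 has_fma=true

function %fnma_f32(f32, f32, f32) -> f32 {
block0(v0: f32, v1: f32, v2: f32):
    ;; fneg feeding fma: pattern-matched into a vfnmadd*-style lowering
    ;; rather than being given a dedicated CLIF instruction
    v3 = fneg v0
    v4 = fma v3, v1, v2
    return v4
}

function %fma_load_f32(i64, f32, f32) -> f32 {
block0(v0: i64, v1: f32, v2: f32):
    ;; a sinkable load as one of the fma arguments, so the load can be
    ;; folded into a single FMA instruction
    v3 = load.f32 v0
    v4 = fma v3, v1, v2
    return v4
}
```

The precise-output test below covers the opposite end: with `has_avx=false has_fma=false`, scalar `fma` lowers to a call to the `FmaF32`/`FmaF64` libcalls, and the `f32x4`/`f64x2` cases are scalarized into one libcall per lane before the results are recombined.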
test compile precise-output
target x86_64 has_avx=false has_fma=false

function %fma_f32(f32, f32, f32) -> f32 {
block0(v0: f32, v1: f32, v2: f32):
    v3 = fma v0, v1, v2
    return v3
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; load_ext_name %FmaF32+0, %r8
; call *%r8
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movabsq $0, %r8 ; reloc_external Abs8 %FmaF32 0
; callq *%r8
; movq %rbp, %rsp
; popq %rbp
; retq

function %fma_f64(f64, f64, f64) -> f64 {
block0(v0: f64, v1: f64, v2: f64):
    v3 = fma v0, v1, v2
    return v3
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; load_ext_name %FmaF64+0, %r8
; call *%r8
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movabsq $0, %r8 ; reloc_external Abs8 %FmaF64 0
; callq *%r8
; movq %rbp, %rsp
; popq %rbp
; retq

function %fma_f32x4(f32x4, f32x4, f32x4) -> f32x4 {
block0(v0: f32x4, v1: f32x4, v2: f32x4):
    v3 = fma v0, v1, v2
    return v3
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; subq %rsp, $96, %rsp
; block0:
; movdqu %xmm0, rsp(0 + virtual offset)
; movdqu %xmm1, rsp(16 + virtual offset)
; movdqu %xmm2, rsp(32 + virtual offset)
; load_ext_name %FmaF32+0, %r8
; movdqu rsp(0 + virtual offset), %xmm0
; movdqu rsp(16 + virtual offset), %xmm1
; movdqu rsp(32 + virtual offset), %xmm2
; call *%r8
; movdqu %xmm0, rsp(48 + virtual offset)
; movdqu rsp(0 + virtual offset), %xmm4
; pshufd $1, %xmm4, %xmm0
; movdqu rsp(16 + virtual offset), %xmm2
; pshufd $1, %xmm2, %xmm1
; movdqu rsp(32 + virtual offset), %xmm3
; pshufd $1, %xmm3, %xmm2
; load_ext_name %FmaF32+0, %r9
; call *%r9
; movdqu %xmm0, rsp(64 + virtual offset)
; movdqu rsp(0 + virtual offset), %xmm14
; pshufd $2, %xmm14, %xmm0
; movdqu rsp(16 + virtual offset), %xmm13
; pshufd $2, %xmm13, %xmm1
; movdqu rsp(32 + virtual offset), %xmm15
; pshufd $2, %xmm15, %xmm2
; load_ext_name %FmaF32+0, %r10
; call *%r10
; movdqu %xmm0, rsp(80 + virtual offset)
; movdqu rsp(0 + virtual offset), %xmm14
; pshufd $3, %xmm14, %xmm0
; movdqu rsp(16 + virtual offset), %xmm1
; pshufd $3, %xmm1, %xmm1
; movdqu rsp(32 + virtual offset), %xmm2
; pshufd $3, %xmm2, %xmm2
; load_ext_name %FmaF32+0, %r11
; call *%r11
; movdqa %xmm0, %xmm13
; movdqu rsp(64 + virtual offset), %xmm4
; movdqu rsp(48 + virtual offset), %xmm0
; insertps $16, %xmm0, %xmm4, %xmm0
; movdqu rsp(80 + virtual offset), %xmm10
; insertps $32, %xmm0, %xmm10, %xmm0
; movdqa %xmm13, %xmm1
; insertps $48, %xmm0, %xmm1, %xmm0
; addq %rsp, $96, %rsp
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; subq $0x60, %rsp
; block1: ; offset 0x8
; movdqu %xmm0, (%rsp)
; movdqu %xmm1, 0x10(%rsp)
; movdqu %xmm2, 0x20(%rsp)
; movabsq $0, %r8 ; reloc_external Abs8 %FmaF32 0
; movdqu (%rsp), %xmm0
; movdqu 0x10(%rsp), %xmm1
; movdqu 0x20(%rsp), %xmm2
; callq *%r8
; movdqu %xmm0, 0x30(%rsp)
; movdqu (%rsp), %xmm4
; pshufd $1, %xmm4, %xmm0
; movdqu 0x10(%rsp), %xmm2
; pshufd $1, %xmm2, %xmm1
; movdqu 0x20(%rsp), %xmm3
; pshufd $1, %xmm3, %xmm2
; movabsq $0, %r9 ; reloc_external Abs8 %FmaF32 0
; callq *%r9
; movdqu %xmm0, 0x40(%rsp)
; movdqu (%rsp), %xmm14
; pshufd $2, %xmm14, %xmm0
; movdqu 0x10(%rsp), %xmm13
; pshufd $2, %xmm13, %xmm1
; movdqu 0x20(%rsp), %xmm15
; pshufd $2, %xmm15, %xmm2
; movabsq $0, %r10 ; reloc_external Abs8 %FmaF32 0
; callq *%r10
; movdqu %xmm0, 0x50(%rsp)
; movdqu (%rsp), %xmm14
; pshufd $3, %xmm14, %xmm0
; movdqu 0x10(%rsp), %xmm1
; pshufd $3, %xmm1, %xmm1
; movdqu 0x20(%rsp), %xmm2
; pshufd $3, %xmm2, %xmm2
; movabsq $0, %r11 ; reloc_external Abs8 %FmaF32 0
; callq *%r11
; movdqa %xmm0, %xmm13
; movdqu 0x40(%rsp), %xmm4
; movdqu 0x30(%rsp), %xmm0
; insertps $0x10, %xmm4, %xmm0
; movdqu 0x50(%rsp), %xmm10
; insertps $0x20, %xmm10, %xmm0
; movdqa %xmm13, %xmm1
; insertps $0x30, %xmm1, %xmm0
; addq $0x60, %rsp
; movq %rbp, %rsp
; popq %rbp
; retq

function %fma_f64x2(f64x2, f64x2, f64x2) -> f64x2 {
block0(v0: f64x2, v1: f64x2, v2: f64x2):
    v3 = fma v0, v1, v2
    return v3
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; subq %rsp, $64, %rsp
; block0:
; movdqu %xmm0, rsp(0 + virtual offset)
; movdqu %xmm1, rsp(16 + virtual offset)
; movdqu %xmm2, rsp(32 + virtual offset)
; load_ext_name %FmaF64+0, %r8
; movdqu rsp(0 + virtual offset), %xmm0
; movdqu rsp(16 + virtual offset), %xmm1
; movdqu rsp(32 + virtual offset), %xmm2
; call *%r8
; movdqu %xmm0, rsp(48 + virtual offset)
; movdqu rsp(0 + virtual offset), %xmm0
; pshufd $238, %xmm0, %xmm0
; movdqu rsp(16 + virtual offset), %xmm1
; pshufd $238, %xmm1, %xmm1
; movdqu rsp(32 + virtual offset), %xmm2
; pshufd $238, %xmm2, %xmm2
; load_ext_name %FmaF64+0, %r9
; call *%r9
; movdqa %xmm0, %xmm14
; movdqu rsp(48 + virtual offset), %xmm0
; movlhps %xmm0, %xmm14, %xmm0
; addq %rsp, $64, %rsp
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; subq $0x40, %rsp
; block1: ; offset 0x8
; movdqu %xmm0, (%rsp)
; movdqu %xmm1, 0x10(%rsp)
; movdqu %xmm2, 0x20(%rsp)
; movabsq $0, %r8 ; reloc_external Abs8 %FmaF64 0
; movdqu (%rsp), %xmm0
; movdqu 0x10(%rsp), %xmm1
; movdqu 0x20(%rsp), %xmm2
; callq *%r8
; movdqu %xmm0, 0x30(%rsp)
; movdqu (%rsp), %xmm0
; pshufd $0xee, %xmm0, %xmm0
; movdqu 0x10(%rsp), %xmm1
; pshufd $0xee, %xmm1, %xmm1
; movdqu 0x20(%rsp), %xmm2
; pshufd $0xee, %xmm2, %xmm2
; movabsq $0, %r9 ; reloc_external Abs8 %FmaF64 0
; callq *%r9
; movdqa %xmm0, %xmm14
; movdqu 0x30(%rsp), %xmm0
; movlhps %xmm14, %xmm0
; addq $0x40, %rsp
; movq %rbp, %rsp
; popq %rbp
; retq