x64: Add more fma instruction lowerings (#5846)
The relaxed-simd proposal for WebAssembly adds a fused multiply-add operation for `v128` types, so I was poking around at Cranelift's existing support for its `fma` instruction. I was also poking around at the x86_64 ISA's offerings for the FMA operation, and ended up with this PR, which improves the lowering of the `fma` instruction in the x64 backend in a number of ways:

* A libcall-based fallback is now provided for `f32x4` and `f64x2` types in preparation for eventual support of the relaxed-simd proposal. These encodings are horribly slow, but it's expected that if FMA semantics must be guaranteed then this is the best that can be done without the `fma` feature. Otherwise it's up to producers (e.g. Wasmtime embedders) whether wasm-level FMA operations should be a true FMA or a multiply-then-add (a small sketch of the semantic difference follows below).
* In addition to the existing `vfmadd213*` instructions, opcodes were added for `vfmadd132*`. The `132` variant is selected based on which argument can have a sinkable load.
* Any argument of the `fma` CLIF instruction can now have a `sinkable_load`, and a single FMA instruction will still be generated.
* All `vfnmadd*` opcodes were added as well. These are pattern-matched where one of the arguments to the CLIF instruction is an `fneg`. I opted not to add a new CLIF instruction here since pattern matching seemed easy enough, but I'm also not intimately familiar with the semantics here, so if a dedicated instruction is the preferred approach I can do that too.
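As a quick illustration of the semantic difference the libcall fallback is preserving (a standalone sketch, not code from this PR): a fused multiply-add rounds once, while multiply-then-add rounds the intermediate product first, and Rust's `f32::mul_add` is enough to show the two diverging:

```rust
fn main() {
    let (a, b, c) = (0.1_f32, 10.0_f32, -1.0_f32);

    // Fused: `a * b + c` with a single rounding step, the semantics the
    // CLIF `fma` instruction (and the libcall fallback) guarantees.
    let fused = a.mul_add(b, c);

    // Unfused: the product is rounded to `f32` before the add, which is
    // what a plain multiply-then-add lowering would produce.
    let unfused = a * b + c;

    // The `vfnmadd*` pattern mentioned above corresponds to a CLIF `fma`
    // whose multiplicand is an `fneg`, i.e. `-(a * b) + c`.
    let negated = (-a).mul_add(b, c);

    println!("fused = {fused:e}, unfused = {unfused:e}, negated = {negated:e}");
}
```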
@@ -1515,7 +1515,19 @@ impl AvxOpcode {
             AvxOpcode::Vfmadd213ss
             | AvxOpcode::Vfmadd213sd
             | AvxOpcode::Vfmadd213ps
-            | AvxOpcode::Vfmadd213pd => smallvec![InstructionSet::FMA],
+            | AvxOpcode::Vfmadd213pd
+            | AvxOpcode::Vfmadd132ss
+            | AvxOpcode::Vfmadd132sd
+            | AvxOpcode::Vfmadd132ps
+            | AvxOpcode::Vfmadd132pd
+            | AvxOpcode::Vfnmadd213ss
+            | AvxOpcode::Vfnmadd213sd
+            | AvxOpcode::Vfnmadd213ps
+            | AvxOpcode::Vfnmadd213pd
+            | AvxOpcode::Vfnmadd132ss
+            | AvxOpcode::Vfnmadd132sd
+            | AvxOpcode::Vfnmadd132ps
+            | AvxOpcode::Vfnmadd132pd => smallvec![InstructionSet::FMA],
             AvxOpcode::Vminps
             | AvxOpcode::Vminpd
             | AvxOpcode::Vmaxps
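For readers less familiar with the AVX FMA operand forms listed above: the `213`/`132` digits say which operands are multiplied and which is added (`213` is operand2 * operand1 + operand3, `132` is operand1 * operand3 + operand2), and the third operand is the one that may come from memory, which is why the lowering can pick the `132` form when the sinkable load feeds a multiplicand rather than the addend. A scalar sketch of the forms, assuming the Intel SDM definitions (these helper functions are illustrative, not part of the PR):

```rust
// `a` is the destination register (also the first source); `c` is the
// operand that may be a memory operand (the sinkable load).
fn vfmadd213(a: f64, b: f64, c: f64) -> f64 {
    b.mul_add(a, c) // a := (b * a) + c
}

fn vfmadd132(a: f64, b: f64, c: f64) -> f64 {
    a.mul_add(c, b) // a := (a * c) + b
}

fn vfnmadd213(a: f64, b: f64, c: f64) -> f64 {
    (-b).mul_add(a, c) // a := -(b * a) + c
}

fn vfnmadd132(a: f64, b: f64, c: f64) -> f64 {
    (-a).mul_add(c, b) // a := -(a * c) + b
}
```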
@@ -2281,32 +2281,46 @@ pub(crate) fn emit(
             let dst = allocs.next(dst.to_reg().to_reg());
             debug_assert_eq!(src1, dst);
             let src2 = allocs.next(src2.to_reg());
-            let src3 = src3.clone().to_reg_mem().with_allocs(allocs);
+            let src3 = match src3.clone().to_reg_mem().with_allocs(allocs) {
+                RegMem::Reg { reg } => {
+                    RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
+                }
+                RegMem::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)),
+            };

             let (w, map, opcode) = match op {
+                AvxOpcode::Vfmadd132ss => (false, OpcodeMap::_0F38, 0x99),
                 AvxOpcode::Vfmadd213ss => (false, OpcodeMap::_0F38, 0xA9),
+                AvxOpcode::Vfnmadd132ss => (false, OpcodeMap::_0F38, 0x9D),
+                AvxOpcode::Vfnmadd213ss => (false, OpcodeMap::_0F38, 0xAD),
+                AvxOpcode::Vfmadd132sd => (true, OpcodeMap::_0F38, 0x99),
                 AvxOpcode::Vfmadd213sd => (true, OpcodeMap::_0F38, 0xA9),
+                AvxOpcode::Vfnmadd132sd => (true, OpcodeMap::_0F38, 0x9D),
+                AvxOpcode::Vfnmadd213sd => (true, OpcodeMap::_0F38, 0xAD),
+                AvxOpcode::Vfmadd132ps => (false, OpcodeMap::_0F38, 0x98),
                 AvxOpcode::Vfmadd213ps => (false, OpcodeMap::_0F38, 0xA8),
+                AvxOpcode::Vfnmadd132ps => (false, OpcodeMap::_0F38, 0x9C),
+                AvxOpcode::Vfnmadd213ps => (false, OpcodeMap::_0F38, 0xAC),
+                AvxOpcode::Vfmadd132pd => (true, OpcodeMap::_0F38, 0x98),
                 AvxOpcode::Vfmadd213pd => (true, OpcodeMap::_0F38, 0xA8),
+                AvxOpcode::Vfnmadd132pd => (true, OpcodeMap::_0F38, 0x9C),
+                AvxOpcode::Vfnmadd213pd => (true, OpcodeMap::_0F38, 0xAC),
                 AvxOpcode::Vblendvps => (false, OpcodeMap::_0F3A, 0x4A),
                 AvxOpcode::Vblendvpd => (false, OpcodeMap::_0F3A, 0x4B),
                 AvxOpcode::Vpblendvb => (false, OpcodeMap::_0F3A, 0x4C),
                 _ => unreachable!(),
             };

-            match src3 {
-                RegMem::Reg { reg: src } => VexInstruction::new()
-                    .length(VexVectorLength::V128)
-                    .prefix(LegacyPrefixes::_66)
-                    .map(map)
-                    .w(w)
-                    .opcode(opcode)
-                    .reg(dst.to_real_reg().unwrap().hw_enc())
-                    .rm(src.to_real_reg().unwrap().hw_enc())
-                    .vvvv(src2.to_real_reg().unwrap().hw_enc())
-                    .encode(sink),
-                _ => todo!(),
-            };
+            VexInstruction::new()
+                .length(VexVectorLength::V128)
+                .prefix(LegacyPrefixes::_66)
+                .map(map)
+                .w(w)
+                .opcode(opcode)
+                .reg(dst.to_real_reg().unwrap().hw_enc())
+                .rm(src3)
+                .vvvv(src2.to_real_reg().unwrap().hw_enc())
+                .encode(sink);
         }

         Inst::XmmRmRBlendVex {
@@ -1944,23 +1944,12 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
            src2.get_operands(collector);
        }
        Inst::XmmRmRVex3 {
-            op,
            src1,
            src2,
            src3,
            dst,
            ..
        } => {
-            // Vfmadd uses and defs the dst reg, that is not the case with all
-            // AVX's ops, if you're adding a new op, make sure to correctly define
-            // register uses.
-            assert!(
-                *op == AvxOpcode::Vfmadd213ss
-                    || *op == AvxOpcode::Vfmadd213sd
-                    || *op == AvxOpcode::Vfmadd213ps
-                    || *op == AvxOpcode::Vfmadd213pd
-            );
-
            collector.reg_use(src1.to_reg());
            collector.reg_reuse_def(dst.to_writable_reg(), 0);
            collector.reg_use(src2.to_reg());