x64: Add more fma instruction lowerings (#5846)

The relaxed-simd proposal for WebAssembly adds a fused multiply-add
operation for `v128` types, so I was poking around at Cranelift's
existing support for its `fma` instruction as well as the x86_64 ISA's
offerings for FMA. I ended up with this PR, which improves the lowering
of the `fma` instruction in the x64 backend in a number of ways:

* A libcall-based fallback is now provided for `f32x4` and `f64x2` types
  in preparation for eventual support of the relaxed-simd proposal.
  These lowerings are horribly slow, but if FMA semantics must be
  guaranteed then they're the best that can be done without the `fma`
  feature. Otherwise it's up to producers (e.g. Wasmtime embedders)
  whether wasm-level FMA operations should be a true fused multiply-add
  or a multiply-then-add, which can round differently (see the sketch
  after this list).

* In addition to the existing `vfmadd213*` instructions, opcodes were
  added for `vfmadd132*`. The `132` variant is selected based on which
  argument carries a sinkable load (the operand forms are sketched after
  the diff below).

* Any argument of the `fma` CLIF instruction can now be a
  `sinkable_load`, and the lowering still generates a single FMA
  instruction.

* All `vfnmadd*` opcodes were added as well. These are pattern-matched
  when one of the arguments to the CLIF instruction is an `fneg`. I
  opted not to add a new CLIF instruction here since the pattern
  matching seemed easy enough, but I'm not intimately familiar with the
  semantics here, so if a dedicated instruction is the preferred
  approach I can do that too.
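
As a side note on the FMA-vs-multiply-then-add distinction above, here's
a minimal standalone Rust sketch (not part of this PR) showing why the
two can't be freely interchanged: a fused multiply-add rounds only once,
so it can preserve low bits that a separate multiply throws away.

```rust
fn main() {
    // `a` is exactly representable in f32; a * a = 1 + 2^-11 + 2^-24, and
    // the 2^-24 term is rounded away when the multiply is performed alone.
    let a = 1.0f32 + 2f32.powi(-12);

    // Separate multiply then add: the low bits of the product are already gone.
    let separate = a * a - 1.0;
    // Fused multiply-add (single rounding): the low bits survive the cancellation.
    let fused = a.mul_add(a, -1.0);

    assert_ne!(separate, fused);
    println!("separate = {separate:e}, fused = {fused:e}");
}
```

This difference is what the libcall fallback preserves when the `fma`
target feature isn't available.
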
Author: Alex Crichton
Date: 2023-02-21 14:51:22 -06:00 (committed by GitHub)
Parent: d82ebcc102
Commit: bd3dcd313d
9 changed files with 718 additions and 77 deletions


@@ -2281,32 +2281,46 @@ pub(crate) fn emit(
             let dst = allocs.next(dst.to_reg().to_reg());
             debug_assert_eq!(src1, dst);
             let src2 = allocs.next(src2.to_reg());
-            let src3 = src3.clone().to_reg_mem().with_allocs(allocs);
+            let src3 = match src3.clone().to_reg_mem().with_allocs(allocs) {
+                RegMem::Reg { reg } => {
+                    RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
+                }
+                RegMem::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)),
+            };
             let (w, map, opcode) = match op {
+                AvxOpcode::Vfmadd132ss => (false, OpcodeMap::_0F38, 0x99),
                 AvxOpcode::Vfmadd213ss => (false, OpcodeMap::_0F38, 0xA9),
+                AvxOpcode::Vfnmadd132ss => (false, OpcodeMap::_0F38, 0x9D),
+                AvxOpcode::Vfnmadd213ss => (false, OpcodeMap::_0F38, 0xAD),
+                AvxOpcode::Vfmadd132sd => (true, OpcodeMap::_0F38, 0x99),
                 AvxOpcode::Vfmadd213sd => (true, OpcodeMap::_0F38, 0xA9),
+                AvxOpcode::Vfnmadd132sd => (true, OpcodeMap::_0F38, 0x9D),
+                AvxOpcode::Vfnmadd213sd => (true, OpcodeMap::_0F38, 0xAD),
+                AvxOpcode::Vfmadd132ps => (false, OpcodeMap::_0F38, 0x98),
                 AvxOpcode::Vfmadd213ps => (false, OpcodeMap::_0F38, 0xA8),
+                AvxOpcode::Vfnmadd132ps => (false, OpcodeMap::_0F38, 0x9C),
+                AvxOpcode::Vfnmadd213ps => (false, OpcodeMap::_0F38, 0xAC),
+                AvxOpcode::Vfmadd132pd => (true, OpcodeMap::_0F38, 0x98),
                 AvxOpcode::Vfmadd213pd => (true, OpcodeMap::_0F38, 0xA8),
+                AvxOpcode::Vfnmadd132pd => (true, OpcodeMap::_0F38, 0x9C),
+                AvxOpcode::Vfnmadd213pd => (true, OpcodeMap::_0F38, 0xAC),
                 AvxOpcode::Vblendvps => (false, OpcodeMap::_0F3A, 0x4A),
                 AvxOpcode::Vblendvpd => (false, OpcodeMap::_0F3A, 0x4B),
                 AvxOpcode::Vpblendvb => (false, OpcodeMap::_0F3A, 0x4C),
                 _ => unreachable!(),
             };
-            match src3 {
-                RegMem::Reg { reg: src } => VexInstruction::new()
-                    .length(VexVectorLength::V128)
-                    .prefix(LegacyPrefixes::_66)
-                    .map(map)
-                    .w(w)
-                    .opcode(opcode)
-                    .reg(dst.to_real_reg().unwrap().hw_enc())
-                    .rm(src.to_real_reg().unwrap().hw_enc())
-                    .vvvv(src2.to_real_reg().unwrap().hw_enc())
-                    .encode(sink),
-                _ => todo!(),
-            };
+            VexInstruction::new()
+                .length(VexVectorLength::V128)
+                .prefix(LegacyPrefixes::_66)
+                .map(map)
+                .w(w)
+                .opcode(opcode)
+                .reg(dst.to_real_reg().unwrap().hw_enc())
+                .rm(src3)
+                .vvvv(src2.to_real_reg().unwrap().hw_enc())
+                .encode(sink);
         }
         Inst::XmmRmRBlendVex {
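
A note for readers less familiar with the x86 FMA mnemonics in the
opcode table above: the digits name which operand positions are
multiplied and which is added, and only the third (`rm`) operand, the
one routed through `RegisterOrAmode` here, may come from memory. Below
is a rough Rust sketch of the resulting 132-vs-213 choice; the names are
purely illustrative and are not the actual lowering helpers.

```rust
/// Which FMA operand form to emit for `fma x, y, z` (i.e. x * y + z).
/// Illustrative only; these are not the real lowering helpers.
enum FmaForm {
    /// vfmadd132: dst = dst * src3 + src2, so a multiplicand may be the
    /// memory operand.
    Form132,
    /// vfmadd213: dst = src2 * dst + src3, so the addend may be the
    /// memory operand.
    Form213,
}

/// Pick a form based on which CLIF argument carries the sinkable load.
fn pick_form(load_is_addend: bool) -> FmaForm {
    if load_is_addend {
        FmaForm::Form213 // the load lands in the src3 (addend) slot
    } else {
        FmaForm::Form132 // the load lands in the src3 (multiplicand) slot
    }
}

fn main() {
    // e.g. `fma x, y, (load addr)`: the addend carries the load, so 213.
    match pick_form(true) {
        FmaForm::Form213 => println!("emit vfmadd213*"),
        FmaForm::Form132 => println!("emit vfmadd132*"),
    }
}
```

The `vfnmadd*` variants use the same operand forms but compute
`-(a * b) + c`, which is why an `fneg` on one of the multiplied
arguments can be folded into them.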