x64: Implement SIMD fma (#4474)
* x64: Add a VEX instruction encoder. It uses a builder pattern similar to the EVEX encoder's; memory accesses are not yet supported.
* x64: Add an FMA feature flag.
* x64: Implement SIMD `fma`.
* x64: Use a 4-register VEX instruction form.
* x64: Reorder the VEX pretty-print arguments.
This commit is contained in:
@@ -52,6 +52,12 @@ fn define_settings(shared: &SettingGroup) -> SettingGroup {
|
||||
"AVX2: CPUID.07H:EBX.AVX2[bit 5]",
|
||||
false,
|
||||
);
|
||||
let has_fma = settings.add_bool(
|
||||
"has_fma",
|
||||
"Has support for FMA.",
|
||||
"FMA: CPUID.01H:ECX.FMA[bit 12]",
|
||||
false,
|
||||
);
|
||||
let has_avx512bitalg = settings.add_bool(
|
||||
"has_avx512bitalg",
|
||||
"Has support for AVX512BITALG.",
|
||||
@@ -116,6 +122,7 @@ fn define_settings(shared: &SettingGroup) -> SettingGroup {
|
||||
settings.add_predicate("use_ssse3", predicate!(has_ssse3));
|
||||
settings.add_predicate("use_sse41", predicate!(has_sse41));
|
||||
settings.add_predicate("use_sse42", predicate!(has_sse41 && has_sse42));
|
||||
settings.add_predicate("use_fma", predicate!(has_avx && has_fma));
|
||||
|
||||
settings.add_predicate(
|
||||
"use_ssse3_simd",
|
||||
@@ -195,7 +202,7 @@ fn define_settings(shared: &SettingGroup) -> SettingGroup {
|
||||
let broadwell = settings.add_preset(
|
||||
"broadwell",
|
||||
"Broadwell microarchitecture.",
|
||||
preset!(haswell),
|
||||
preset!(haswell && has_fma),
|
||||
);
|
||||
let skylake = settings.add_preset("skylake", "Skylake microarchitecture.", preset!(broadwell));
|
||||
let cannonlake = settings.add_preset(
|
||||
|
||||
@@ -159,6 +159,7 @@ impl From<(OperandSize, Reg)> for RexFlags {
|
||||
/// Allows using the same opcode byte in different "opcode maps" to allow for more instruction
|
||||
/// encodings. See appendix A in the Intel Software Developer's Manual, volume 2A, for more details.
|
||||
#[allow(missing_docs)]
|
||||
#[derive(PartialEq)]
|
||||
pub enum OpcodeMap {
|
||||
None,
|
||||
_0F,
|
||||
@@ -168,7 +169,7 @@ pub enum OpcodeMap {
|
||||
|
||||
impl OpcodeMap {
|
||||
/// Normally the opcode map is specified as bytes in the instruction, but some x64 encoding
|
||||
/// formats pack this information as bits in a prefix (e.g. EVEX).
|
||||
/// formats pack this information as bits in a prefix (e.g. VEX / EVEX).
|
||||
pub(crate) fn bits(&self) -> u8 {
|
||||
match self {
|
||||
OpcodeMap::None => 0b00,
|
||||
@@ -187,6 +188,7 @@ impl Default for OpcodeMap {
|
||||
|
||||
/// We may need to include one or more legacy prefix bytes before the REX prefix. This enum
|
||||
/// covers only the small set of possibilities that we actually need.
|
||||
#[derive(PartialEq)]
|
||||
pub enum LegacyPrefixes {
|
||||
/// No prefix bytes.
|
||||
None,
|
||||
|
||||
@@ -1,2 +1,357 @@
|
||||
//! Encodes VEX instructions. These instructions are those added by the Advanced Vector Extensions
|
||||
//! (AVX).
|
||||
|
||||
use super::evex::Register;
|
||||
use super::rex::{LegacyPrefixes, OpcodeMap};
|
||||
use super::ByteSink;
|
||||
use crate::isa::x64::encoding::rex::encode_modrm;
|
||||
|
||||
/// Constructs a VEX-encoded instruction using a builder pattern. This approach makes it visually
/// easier to transform something from the manual's syntax, `VEX.128.66.0F 73 /7 ib`, to code:
/// `VexInstruction::new().length(...).prefix(...).map(...).w(true).opcode(0x1F).reg(...).rm(...)`.
pub struct VexInstruction {
    // The `L` bit: 128-bit vs. 256-bit vector length.
    length: VexVectorLength,
    // Packed legacy prefix (`pp` field): None | 66 | F2 | F3.
    prefix: LegacyPrefixes,
    // Packed opcode map (`m-mmmm` field): None | 0F | 0F38 | 0F3A.
    map: OpcodeMap,
    // The single opcode byte emitted after the VEX prefix.
    opcode: u8,
    // The `W` bit from the `.W0`/`.W1` opcode notation.
    w: bool,
    // Value for the ModRM.reg field; also used for `/n` opcode extensions.
    reg: u8,
    // Register for the ModRM.rm field (memory addressing not yet supported).
    rm: Register,
    // Optional extra source register packed into the prefix's `vvvv` field.
    vvvv: Option<Register>,
    // Optional trailing immediate byte.
    imm: Option<u8>,
}
|
||||
|
||||
impl Default for VexInstruction {
    fn default() -> Self {
        // Neutral defaults: no legacy prefix, no opcode map, W=0, no vvvv,
        // no immediate; callers set only the fields their encoding needs.
        Self {
            length: VexVectorLength::default(),
            prefix: LegacyPrefixes::None,
            map: OpcodeMap::None,
            opcode: 0x00,
            w: false,
            reg: 0x00,
            rm: Register::default(),
            vvvv: None,
            imm: None,
        }
    }
}
|
||||
|
||||
impl VexInstruction {
    /// Construct a default VEX instruction.
    pub fn new() -> Self {
        Self::default()
    }

    /// Set the length of the instruction.
    #[inline(always)]
    pub fn length(mut self, length: VexVectorLength) -> Self {
        self.length = length;
        self
    }

    /// Set the legacy prefix byte of the instruction: None | 66 | F2 | F3. VEX instructions
    /// pack these into the prefix, not as separate bytes.
    #[inline(always)]
    pub fn prefix(mut self, prefix: LegacyPrefixes) -> Self {
        // Only these four prefixes have a `pp` encoding in the VEX prefix;
        // passing any other legacy prefix is a programming error.
        debug_assert!(
            prefix == LegacyPrefixes::None
                || prefix == LegacyPrefixes::_66
                || prefix == LegacyPrefixes::_F2
                || prefix == LegacyPrefixes::_F3
        );

        self.prefix = prefix;
        self
    }

    /// Set the opcode map byte of the instruction: None | 0F | 0F38 | 0F3A. VEX instructions pack
    /// these into the prefix, not as separate bytes.
    #[inline(always)]
    pub fn map(mut self, map: OpcodeMap) -> Self {
        self.map = map;
        self
    }

    /// Set the W bit, denoted by `.W1` or `.W0` in the instruction string.
    /// Typically used to indicate an instruction using 64 bits of an operand (e.g.
    /// 64 bit lanes). VEX packs this bit in the VEX prefix; previous encodings used the REX
    /// prefix.
    #[inline(always)]
    pub fn w(mut self, w: bool) -> Self {
        self.w = w;
        self
    }

    /// Set the instruction opcode byte.
    #[inline(always)]
    pub fn opcode(mut self, opcode: u8) -> Self {
        self.opcode = opcode;
        self
    }

    /// Set the register to use for the `reg` bits; many instructions use this as the write operand.
    #[inline(always)]
    pub fn reg(mut self, reg: impl Into<Register>) -> Self {
        self.reg = reg.into().into();
        self
    }

    /// Some instructions use the ModRM.reg field as an opcode extension. This is usually denoted by
    /// a `/n` field in the manual.
    #[inline(always)]
    pub fn opcode_ext(mut self, n: u8) -> Self {
        self.reg = n;
        self
    }

    /// Set the register to use for the `rm` bits; many instructions use this as the "read from
    /// register/memory" operand. Currently this does not support memory addressing (TODO). Setting
    /// this affects both the ModRM byte (`rm` section) and the VEX prefix (the extension bits for
    /// register encodings > 8).
    #[inline(always)]
    pub fn rm(mut self, reg: impl Into<Register>) -> Self {
        self.rm = reg.into();
        self
    }

    /// Set the `vvvv` register; some instructions allow using this as a second, non-destructive
    /// source register in 3-operand instructions (e.g. 2 read, 1 write).
    #[allow(dead_code)]
    #[inline(always)]
    pub fn vvvv(mut self, reg: impl Into<Register>) -> Self {
        self.vvvv = Some(reg.into());
        self
    }

    /// Set the imm byte when used for a register. The reg bits are stored in `imm8[7:4]` with
    /// the lower bits unused. Overrides a previously set [Self::imm] field.
    #[inline(always)]
    pub fn imm_reg(mut self, reg: impl Into<Register>) -> Self {
        let reg: u8 = reg.into().into();
        self.imm = Some((reg & 0xf) << 4);
        self
    }

    /// Set the imm byte.
    /// Overrides a previously set [Self::imm_reg] field.
    #[inline(always)]
    pub fn imm(mut self, imm: u8) -> Self {
        self.imm = Some(imm);
        self
    }

    /// The R bit in encoded format (inverted).
    #[inline(always)]
    fn r_bit(&self) -> u8 {
        // R extends ModRM.reg to the upper register bank; stored inverted in the prefix.
        (!(self.reg >> 3)) & 1
    }

    /// The X bit in encoded format (inverted).
    #[inline(always)]
    fn x_bit(&self) -> u8 {
        // TODO: will carry the SIB index-register extension once memory addressing is
        // supported; until then it is always 1 (the inverted form of 0).
        (!0) & 1
    }

    /// The B bit in encoded format (inverted).
    #[inline(always)]
    fn b_bit(&self) -> u8 {
        // B extends ModRM.rm to the upper register bank; stored inverted in the prefix.
        let rm: u8 = self.rm.into();
        (!(rm >> 3)) & 1
    }

    /// Is the 2 byte prefix available for this instruction?
    /// We essentially just check if we need any of the bits that are only available
    /// in the 3 byte instruction
    #[inline(always)]
    fn use_2byte_prefix(&self) -> bool {
        // These bits are only represented on the 3 byte prefix, so their presence
        // implies the use of the 3 byte prefix
        self.b_bit() == 1 && self.x_bit() == 1 &&
        // The presence of W1 in the opcode column implies the opcode must be encoded using the
        // 3-byte form of the VEX prefix.
        self.w == false &&
        // The presence of 0F3A and 0F38 in the opcode column implies that opcode can only be
        // encoded by the three-byte form of VEX
        !(self.map == OpcodeMap::_0F3A || self.map == OpcodeMap::_0F38)
    }

    /// The last byte of the 2byte and 3byte prefixes is mostly the same, share the common
    /// encoding logic here.
    #[inline(always)]
    fn prefix_last_byte(&self) -> u8 {
        // An unused vvvv field must be encoded as 0b1111, i.e. the inverted form of 0.
        let vvvv = self.vvvv.map(|r| r.into()).unwrap_or(0x00);

        let mut byte = 0x00;
        byte |= self.prefix.bits();
        byte |= self.length.bits() << 2;
        byte |= ((!vvvv) & 0xF) << 3;
        byte
    }

    /// Encode the 2 byte prefix
    #[inline(always)]
    fn encode_2byte_prefix<CS: ByteSink + ?Sized>(&self, sink: &mut CS) {
        //  2 bytes:
        //    +-----+ +-------------------+
        //    | C5h | | R | vvvv | L | pp |
        //    +-----+ +-------------------+

        let last_byte = self.prefix_last_byte() | (self.r_bit() << 7);

        sink.put1(0xC5);
        sink.put1(last_byte);
    }

    /// Encode the 3 byte prefix
    #[inline(always)]
    fn encode_3byte_prefix<CS: ByteSink + ?Sized>(&self, sink: &mut CS) {
        //  3 bytes:
        //    +-----+ +--------------+ +-------------------+
        //    | C4h | | RXB | m-mmmm | | W | vvvv | L | pp |
        //    +-----+ +--------------+ +-------------------+

        let mut second_byte = 0x00;
        second_byte |= self.map.bits(); // m-mmmm field
        second_byte |= self.b_bit() << 5;
        second_byte |= self.x_bit() << 6;
        second_byte |= self.r_bit() << 7;

        let w_bit = self.w as u8;
        let last_byte = self.prefix_last_byte() | (w_bit << 7);

        sink.put1(0xC4);
        sink.put1(second_byte);
        sink.put1(last_byte);
    }

    /// Emit the VEX-encoded instruction to the code sink:
    pub fn encode<CS: ByteSink + ?Sized>(&self, sink: &mut CS) {
        // 2/3 byte prefix
        if self.use_2byte_prefix() {
            self.encode_2byte_prefix(sink);
        } else {
            self.encode_3byte_prefix(sink);
        }

        // 1 Byte Opcode
        sink.put1(self.opcode);

        // 1 ModRM Byte (mode 0b11: register-direct, since memory forms are not yet supported).
        // Not all instructions use Reg as a reg, some use it as an extension of the opcode.
        let rm: u8 = self.rm.into();
        sink.put1(encode_modrm(3, self.reg & 7, rm & 7));

        // TODO: 0/1 byte SIB
        // TODO: 0/1/2/4 bytes DISP

        // Optional 1 Byte imm
        if let Some(imm) = self.imm {
            sink.put1(imm);
        }
    }
}
|
||||
|
||||
/// The VEX format allows choosing a vector length in the `L` bit.
#[allow(dead_code, missing_docs)] // Wider-length vectors are not yet used.
pub enum VexVectorLength {
    // 128-bit (XMM) operation: `L = 0`.
    V128,
    // 256-bit (YMM) operation: `L = 1`.
    V256,
}
|
||||
|
||||
impl VexVectorLength {
|
||||
/// Encode the `L` bit.
|
||||
fn bits(&self) -> u8 {
|
||||
match self {
|
||||
Self::V128 => 0b0,
|
||||
Self::V256 => 0b1,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for VexVectorLength {
|
||||
fn default() -> Self {
|
||||
Self::V128
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::isa::x64::inst::regs;
    use std::vec::Vec;

    // Exercises an opcode extension in ModRM.reg (`/7`) plus a literal immediate byte.
    #[test]
    fn vpslldq() {
        // VEX.128.66.0F 73 /7 ib
        // VPSLLDQ xmm1, xmm2, imm8

        let dst = regs::xmm1().to_real_reg().unwrap().hw_enc();
        let src = regs::xmm2().to_real_reg().unwrap().hw_enc();
        let mut sink0 = Vec::new();

        VexInstruction::new()
            .length(VexVectorLength::V128)
            .prefix(LegacyPrefixes::_66)
            .map(OpcodeMap::_0F)
            .opcode(0x73)
            .opcode_ext(7)
            .vvvv(dst)
            .rm(src)
            .imm(0x17)
            .encode(&mut sink0);

        // Low registers, map 0F, and W0 permit the compact 2-byte (0xC5) prefix form.
        assert_eq!(sink0, vec![0xc5, 0xf1, 0x73, 0xfa, 0x17]);
    }

    // Exercises the fourth register operand packed into the immediate byte (`/is4`).
    #[test]
    fn vblendvpd() {
        // A four operand instruction
        // VEX.128.66.0F3A.W0 4B /r /is4
        // VBLENDVPD xmm1, xmm2, xmm3, xmm4

        let dst = regs::xmm1().to_real_reg().unwrap().hw_enc();
        let a = regs::xmm2().to_real_reg().unwrap().hw_enc();
        let b = regs::xmm3().to_real_reg().unwrap().hw_enc();
        let c = regs::xmm4().to_real_reg().unwrap().hw_enc();
        let mut sink0 = Vec::new();

        VexInstruction::new()
            .length(VexVectorLength::V128)
            .prefix(LegacyPrefixes::_66)
            .map(OpcodeMap::_0F3A)
            .w(false)
            .opcode(0x4B)
            .reg(dst)
            .vvvv(a)
            .rm(b)
            .imm_reg(c)
            .encode(&mut sink0);

        // Map 0F3A forces the 3-byte (0xC4) prefix form.
        assert_eq!(sink0, vec![0xc4, 0xe3, 0x69, 0x4b, 0xcb, 0x40]);
    }

    // Exercises extended registers (>= xmm8) and the 256-bit vector length.
    #[test]
    fn vcmpps() {
        // VEX.128.0F.WIG C2 /r ib
        // VCMPPS ymm10, ymm11, ymm12, 4 // neq

        let dst = regs::xmm10().to_real_reg().unwrap().hw_enc();
        let a = regs::xmm11().to_real_reg().unwrap().hw_enc();
        let b = regs::xmm12().to_real_reg().unwrap().hw_enc();
        let mut sink0 = Vec::new();

        VexInstruction::new()
            .length(VexVectorLength::V256)
            .prefix(LegacyPrefixes::None)
            .map(OpcodeMap::_0F)
            .opcode(0xC2)
            .reg(dst)
            .vvvv(a)
            .rm(b)
            .imm(4)
            .encode(&mut sink0);

        // The extended `rm` register clears the (inverted) B bit, requiring the 3-byte prefix.
        assert_eq!(sink0, vec![0xc4, 0x41, 0x24, 0xc2, 0xd4, 0x04]);
    }
}
|
||||
|
||||
@@ -193,6 +193,13 @@
|
||||
(src2 XmmMem)
|
||||
(dst WritableXmm))
|
||||
|
||||
;; XMM (scalar or vector) binary op that relies on the VEX prefix.
|
||||
(XmmRmRVex (op AvxOpcode)
|
||||
(src1 Xmm)
|
||||
(src2 Xmm)
|
||||
(src3 XmmMem)
|
||||
(dst WritableXmm))
|
||||
|
||||
;; XMM (scalar or vector) binary op that relies on the EVEX prefix.
|
||||
(XmmRmREvex (op Avx512Opcode)
|
||||
(src1 XmmMem)
|
||||
@@ -1042,6 +1049,10 @@
|
||||
(decl intcc_to_cc (IntCC) CC)
|
||||
(extern constructor intcc_to_cc intcc_to_cc)
|
||||
|
||||
(type AvxOpcode extern
|
||||
(enum Vfmadd213ps
|
||||
Vfmadd213pd))
|
||||
|
||||
(type Avx512Opcode extern
|
||||
(enum Vcvtudq2ps
|
||||
Vpabsq
|
||||
@@ -2839,6 +2850,28 @@
|
||||
dst))
|
||||
|
||||
|
||||
;; Helper for creating `MInst.XmmRmRVex` instructions.
|
||||
(decl xmm_rmr_vex (AvxOpcode Xmm Xmm XmmMem) Xmm)
|
||||
(rule (xmm_rmr_vex op src1 src2 src3)
|
||||
(let ((dst WritableXmm (temp_writable_xmm))
|
||||
(_ Unit (emit (MInst.XmmRmRVex op
|
||||
src1
|
||||
src2
|
||||
src3
|
||||
dst))))
|
||||
dst))
|
||||
|
||||
;; Helper for creating `vfmadd213ps` instructions.
|
||||
(decl x64_vfmadd213ps (Xmm Xmm XmmMem) Xmm)
|
||||
(rule (x64_vfmadd213ps x y z)
|
||||
(xmm_rmr_vex (AvxOpcode.Vfmadd213ps) x y z))
|
||||
|
||||
;; Helper for creating `vfmadd213pd` instructions.
|
||||
(decl x64_vfmadd213pd (Xmm Xmm XmmMem) Xmm)
|
||||
(rule (x64_vfmadd213pd x y z)
|
||||
(xmm_rmr_vex (AvxOpcode.Vfmadd213pd) x y z))
|
||||
|
||||
|
||||
;; Helper for creating `sqrtss` instructions.
|
||||
(decl x64_sqrtss (Xmm) Xmm)
|
||||
(rule (x64_sqrtss x)
|
||||
|
||||
@@ -794,6 +794,7 @@ pub(crate) enum InstructionSet {
|
||||
BMI1,
|
||||
#[allow(dead_code)] // never constructed (yet).
|
||||
BMI2,
|
||||
FMA,
|
||||
AVX512BITALG,
|
||||
AVX512DQ,
|
||||
AVX512F,
|
||||
@@ -1386,6 +1387,38 @@ impl fmt::Display for SseOpcode {
|
||||
}
|
||||
}
|
||||
|
||||
// AVX opcodes that are encoded with the VEX prefix. Currently only the
// fused-multiply-add forms used to lower CLIF `fma` are listed.
#[derive(Clone, PartialEq)]
pub enum AvxOpcode {
    // Packed single-precision FMA: dst = src2 * dst + src3.
    Vfmadd213ps,
    // Packed double-precision FMA: dst = src2 * dst + src3.
    Vfmadd213pd,
}
|
||||
|
||||
impl AvxOpcode {
|
||||
/// Which `InstructionSet`s support the opcode?
|
||||
pub(crate) fn available_from(&self) -> SmallVec<[InstructionSet; 2]> {
|
||||
match self {
|
||||
AvxOpcode::Vfmadd213ps => smallvec![InstructionSet::FMA],
|
||||
AvxOpcode::Vfmadd213pd => smallvec![InstructionSet::FMA],
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for AvxOpcode {
|
||||
fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
|
||||
let name = match self {
|
||||
AvxOpcode::Vfmadd213ps => "vfmadd213ps",
|
||||
AvxOpcode::Vfmadd213pd => "vfmadd213pd",
|
||||
};
|
||||
write!(fmt, "{}", name)
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for AvxOpcode {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fmt::Debug::fmt(self, f)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, PartialEq)]
|
||||
pub enum Avx512Opcode {
|
||||
Vcvtudq2ps,
|
||||
|
||||
@@ -8,6 +8,7 @@ use crate::isa::x64::encoding::rex::{
|
||||
low8_will_sign_extend_to_32, low8_will_sign_extend_to_64, reg_enc, LegacyPrefixes, OpcodeMap,
|
||||
RexFlags,
|
||||
};
|
||||
use crate::isa::x64::encoding::vex::{VexInstruction, VexVectorLength};
|
||||
use crate::isa::x64::inst::args::*;
|
||||
use crate::isa::x64::inst::*;
|
||||
use crate::machinst::{inst_common, MachBuffer, MachInstEmit, MachLabel, Reg, Writable};
|
||||
@@ -119,6 +120,7 @@ pub(crate) fn emit(
|
||||
InstructionSet::Lzcnt => info.isa_flags.use_lzcnt(),
|
||||
InstructionSet::BMI1 => info.isa_flags.use_bmi1(),
|
||||
InstructionSet::BMI2 => info.isa_flags.has_bmi2(),
|
||||
InstructionSet::FMA => info.isa_flags.has_fma(),
|
||||
InstructionSet::AVX512BITALG => info.isa_flags.has_avx512bitalg(),
|
||||
InstructionSet::AVX512DQ => info.isa_flags.has_avx512dq(),
|
||||
InstructionSet::AVX512F => info.isa_flags.has_avx512f(),
|
||||
@@ -1689,6 +1691,39 @@ pub(crate) fn emit(
|
||||
}
|
||||
}
|
||||
|
||||
Inst::XmmRmRVex {
|
||||
op,
|
||||
src1,
|
||||
src2,
|
||||
src3,
|
||||
dst,
|
||||
} => {
|
||||
let src1 = allocs.next(src1.to_reg());
|
||||
let dst = allocs.next(dst.to_reg().to_reg());
|
||||
debug_assert_eq!(src1, dst);
|
||||
let src2 = allocs.next(src2.to_reg());
|
||||
let src3 = src3.clone().to_reg_mem().with_allocs(allocs);
|
||||
|
||||
let (w, opcode) = match op {
|
||||
AvxOpcode::Vfmadd213ps => (false, 0xA8),
|
||||
AvxOpcode::Vfmadd213pd => (true, 0xA8),
|
||||
};
|
||||
|
||||
match src3 {
|
||||
RegMem::Reg { reg: src } => VexInstruction::new()
|
||||
.length(VexVectorLength::V128)
|
||||
.prefix(LegacyPrefixes::_66)
|
||||
.map(OpcodeMap::_0F38)
|
||||
.w(w)
|
||||
.opcode(opcode)
|
||||
.reg(dst.to_real_reg().unwrap().hw_enc())
|
||||
.rm(src.to_real_reg().unwrap().hw_enc())
|
||||
.vvvv(src2.to_real_reg().unwrap().hw_enc())
|
||||
.encode(sink),
|
||||
_ => todo!(),
|
||||
};
|
||||
}
|
||||
|
||||
Inst::XmmRmREvex {
|
||||
op,
|
||||
src1,
|
||||
|
||||
@@ -3701,6 +3701,21 @@ fn test_x64_emit() {
|
||||
"jmp *321(%r10,%rdx,4)",
|
||||
));
|
||||
|
||||
// ========================================================
|
||||
// XMM FMA
|
||||
|
||||
insns.push((
|
||||
Inst::xmm_rm_r_vex(AvxOpcode::Vfmadd213ps, RegMem::reg(xmm2), xmm1, w_xmm0),
|
||||
"C4E271A8C2",
|
||||
"vfmadd213ps %xmm0, %xmm1, %xmm2, %xmm0",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::xmm_rm_r_vex(AvxOpcode::Vfmadd213pd, RegMem::reg(xmm5), xmm4, w_xmm3),
|
||||
"C4E2D9A8DD",
|
||||
"vfmadd213pd %xmm3, %xmm4, %xmm5, %xmm3",
|
||||
));
|
||||
|
||||
// ========================================================
|
||||
// XMM_CMP_RM_R
|
||||
|
||||
@@ -4866,6 +4881,7 @@ fn test_x64_emit() {
|
||||
let mut isa_flag_builder = x64::settings::builder();
|
||||
isa_flag_builder.enable("has_ssse3").unwrap();
|
||||
isa_flag_builder.enable("has_sse41").unwrap();
|
||||
isa_flag_builder.enable("has_fma").unwrap();
|
||||
isa_flag_builder.enable("has_avx512bitalg").unwrap();
|
||||
isa_flag_builder.enable("has_avx512dq").unwrap();
|
||||
isa_flag_builder.enable("has_avx512f").unwrap();
|
||||
|
||||
@@ -129,6 +129,8 @@ impl Inst {
|
||||
| Inst::XmmUnaryRmR { op, .. } => smallvec![op.available_from()],
|
||||
|
||||
Inst::XmmUnaryRmREvex { op, .. } | Inst::XmmRmREvex { op, .. } => op.available_from(),
|
||||
|
||||
Inst::XmmRmRVex { op, .. } => op.available_from(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -324,6 +326,20 @@ impl Inst {
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) fn xmm_rm_r_vex(op: AvxOpcode, src3: RegMem, src2: Reg, dst: Writable<Reg>) -> Self {
|
||||
src3.assert_regclass_is(RegClass::Float);
|
||||
debug_assert!(src2.class() == RegClass::Float);
|
||||
debug_assert!(dst.to_reg().class() == RegClass::Float);
|
||||
Inst::XmmRmRVex {
|
||||
op,
|
||||
src3: XmmMem::new(src3).unwrap(),
|
||||
src2: Xmm::new(src2).unwrap(),
|
||||
src1: Xmm::new(dst.to_reg()).unwrap(),
|
||||
dst: WritableXmm::from_writable_reg(dst).unwrap(),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn xmm_rm_r_evex(
|
||||
op: Avx512Opcode,
|
||||
src1: RegMem,
|
||||
@@ -1136,6 +1152,29 @@ impl PrettyPrint for Inst {
|
||||
format!("{} {}, {}, {}", ljustify(op.to_string()), src1, src2, dst)
|
||||
}
|
||||
|
||||
Inst::XmmRmRVex {
|
||||
op,
|
||||
src1,
|
||||
src2,
|
||||
src3,
|
||||
dst,
|
||||
..
|
||||
} => {
|
||||
let src1 = pretty_print_reg(src1.to_reg(), 8, allocs);
|
||||
let dst = pretty_print_reg(dst.to_reg().to_reg(), 8, allocs);
|
||||
let src2 = pretty_print_reg(src2.to_reg(), 8, allocs);
|
||||
let src3 = src3.pretty_print(8, allocs);
|
||||
|
||||
format!(
|
||||
"{} {}, {}, {}, {}",
|
||||
ljustify(op.to_string()),
|
||||
src1,
|
||||
src2,
|
||||
src3,
|
||||
dst
|
||||
)
|
||||
}
|
||||
|
||||
Inst::XmmRmREvex {
|
||||
op,
|
||||
src1,
|
||||
@@ -1840,6 +1879,24 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
|
||||
}
|
||||
}
|
||||
}
|
||||
Inst::XmmRmRVex {
|
||||
op,
|
||||
src1,
|
||||
src2,
|
||||
src3,
|
||||
dst,
|
||||
..
|
||||
} => {
|
||||
// Vfmadd uses and defs the dst reg, that is not the case with all
|
||||
// AVX's ops, if you're adding a new op, make sure to correctly define
|
||||
// register uses.
|
||||
assert!(*op == AvxOpcode::Vfmadd213ps || *op == AvxOpcode::Vfmadd213pd);
|
||||
|
||||
collector.reg_use(src1.to_reg());
|
||||
collector.reg_reuse_def(dst.to_writable_reg(), 0);
|
||||
collector.reg_use(src2.to_reg());
|
||||
src3.get_operands(collector);
|
||||
}
|
||||
Inst::XmmRmREvex {
|
||||
op,
|
||||
src1,
|
||||
|
||||
@@ -2566,6 +2566,13 @@
|
||||
(rule (lower (has_type $F64X2 (fmax_pseudo x y)))
|
||||
(x64_maxpd y x))
|
||||
|
||||
;; Rules for `fma` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
(rule (lower (has_type $F32X4 (fma x y z)))
|
||||
(x64_vfmadd213ps x y z))
|
||||
(rule (lower (has_type $F64X2 (fma x y z)))
|
||||
(x64_vfmadd213pd x y z))
|
||||
|
||||
;; Rules for `load*` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;; In order to load a value from memory to a GPR register, we may need to extend
|
||||
|
||||
@@ -2832,7 +2832,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
||||
|
||||
Opcode::Cls => unimplemented!("Cls not supported"),
|
||||
|
||||
Opcode::Fma => unimplemented!("Fma not supported"),
|
||||
Opcode::Fma => implemented_in_isle(ctx),
|
||||
|
||||
Opcode::BorNot | Opcode::BxorNot => {
|
||||
unimplemented!("or-not / xor-not opcodes not implemented");
|
||||
|
||||
85
cranelift/filetests/filetests/runtests/simd-fma.clif
Normal file
85
cranelift/filetests/filetests/runtests/simd-fma.clif
Normal file
@@ -0,0 +1,85 @@
|
||||
test run
|
||||
target x86_64 has_avx has_fma
|
||||
|
||||
function %fma_f32x4(f32x4, f32x4, f32x4) -> f32x4 {
|
||||
block0(v0: f32x4, v1: f32x4, v2: f32x4):
|
||||
v3 = fma v0, v1, v2
|
||||
return v3
|
||||
}
|
||||
; run: %fma_f32x4([0x9.0 0x83.0 0x1.99999ap-2 -0x1.4cccccp0], [0x9.0 0x2.68091p6 0x1.333334p-1 -0x1.666666p1], [0x9.0 0x9.88721p1 0x1.400000p1 -0x1.b33334p0]) == [0x1.680000p6 0x1.3b88e6p14 0x1.5eb852p1 0x1.f0a3d2p0]
|
||||
|
||||
; Zeroes
|
||||
; run: %fma_f32x4([0x0.0 0x0.0 0x0.0 -0x0.0], [0x0.0 0x0.0 -0x0.0 0x0.0], [0x0.0 -0x0.0 0x0.0 0x0.0]) == [0x0.0 0x0.0 0x0.0 0x0.0]
|
||||
|
||||
; Infinites
|
||||
; run: %fma_f32x4([-Inf Inf -Inf Inf], [-Inf -Inf Inf -Inf], [0x0.0 0x0.0 0x0.0 -Inf]) == [Inf -Inf -Inf -Inf]
|
||||
; run: %fma_f32x4([-Inf 0x0.0 0x0.0 0x0.0], [Inf 0x0.0 0x0.0 0x0.0], [-Inf 0x0.0 0x0.0 0x0.0]) == [-Inf 0x0.0 0x0.0 0x0.0]
|
||||
|
||||
; F32 Epsilon / Max / Min Positive
|
||||
; run: %fma_f32x4([0x1.000000p-23 0x0.0 0x1.fffffep127 0x0.0], [0x1.000000p-23 0x0.0 0x1.fffffep127 0x0.0], [0x1.000000p-23 0x1.000000p-23 0x1.fffffep127 0x1.fffffep127]) == [0x1.000002p-23 0x1.000000p-23 +Inf 0x1.fffffep127]
|
||||
; run: %fma_f32x4([0x1.000000p-126 0x0.0 0x0.0 0x0.0], [0x1.000000p-126 0x0.0 0x0.0 0x0.0], [0x1.000000p-126 0x1.000000p-126 0x0.0 0x0.0]) == [0x1.000000p-126 0x1.000000p-126 0x0.0 0x0.0]
|
||||
|
||||
; F32 Subnormals
|
||||
; run: %fma_f32x4([0x0.800000p-126 0x0.800000p-126 0x0.0 0x0.000002p-126], [0x0.800000p-126 0x0.800000p-126 0x0.0 0x0.000002p-126], [0x0.800000p-126 0x0.0 0x0.000002p-126 0x0.000002p-126]) == [0x0.800000p-126 0x0.0 0x0.000002p-126 0x0.000002p-126]
|
||||
; run: %fma_f32x4([0x0.000002p-126 0x0.0 0x0.0 0x0.0], [0x0.000002p-126 0x0.0 0x0.0 0x0.0], [0x0.0 0x0.000002p-126 0x0.0 0x0.0]) == [0x0.0 0x0.000002p-126 0x0.0 0x0.0]
|
||||
|
||||
|
||||
|
||||
;; The IEEE754 Standard does not make a lot of guarantees about what
|
||||
;; comes out of NaN producing operations, we just check if its a NaN
|
||||
function %fma_is_nan_f32x4(f32x4, f32x4, f32x4) -> b1 {
|
||||
block0(v0: f32x4, v1: f32x4, v2: f32x4):
|
||||
v3 = fma v0, v1, v2
|
||||
v4 = fcmp ne v3, v3
|
||||
v5 = vall_true v4
|
||||
return v5
|
||||
}
|
||||
; run: %fma_is_nan_f32x4([Inf -Inf -Inf +NaN], [-Inf Inf -Inf 0x0.0], [Inf Inf -Inf 0x0.0]) == true
|
||||
; run: %fma_is_nan_f32x4([0x0.0 0x0.0 -NaN 0x0.0], [+NaN 0x0.0 0x0.0 -NaN], [0x0.0 +NaN 0x0.0 0x0.0]) == true
|
||||
; run: %fma_is_nan_f32x4([0x0.0 NaN NaN NaN], [0x0.0 NaN NaN NaN], [-NaN NaN NaN NaN]) == true
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
function %fma_f64x2(f64x2, f64x2, f64x2) -> f64x2 {
|
||||
block0(v0: f64x2, v1: f64x2, v2: f64x2):
|
||||
v3 = fma v0, v1, v2
|
||||
return v3
|
||||
}
|
||||
; run: %fma_f64x2([0x9.0 0x1.3b88ea148dd4ap14], [0x9.0 0x2.680916809121p6], [0x9.0 0x9.887218721837p1]) == [0x1.680000p6 0x1.7ba6ebee17417p21]
|
||||
|
||||
; Zeroes
|
||||
; run: %fma_f64x2([0x0.0 0x0.0], [0x0.0 0x0.0], [0x0.0 -0x0.0]) == [0x0.0 0x0.0]
|
||||
; run: %fma_f64x2([0x0.0 -0x0.0], [-0x0.0 0x0.0], [0x0.0 0x0.0]) == [0x0.0 0x0.0]
|
||||
|
||||
; Infinites
|
||||
; run: %fma_f64x2([-Inf Inf], [-Inf -Inf], [0x0.0 0x0.0]) == [+Inf -Inf]
|
||||
; run: %fma_f64x2([-Inf Inf], [Inf -Inf], [0x0.0 -Inf]) == [-Inf -Inf]
|
||||
; run: %fma_f64x2([-Inf Inf], [Inf Inf], [-Inf Inf]) == [-Inf Inf]
|
||||
|
||||
; F64 Epsilon / Max / Min Positive
|
||||
; run: %fma_f64x2([0x1.0p-52 0x0.0], [0x1.0p-52 0x0.0], [0x1.0p-52 0x1.0p-52]) == [0x1.0000000000001p-52 0x1.0p-52]
|
||||
; run: %fma_f64x2([0x1.fffffffffffffp1023 0x0.0], [0x1.fffffffffffffp1023 0x0.0], [0x1.fffffffffffffp1023 0x1.fffffffffffffp1023]) == [+Inf 0x1.fffffffffffffp1023]
|
||||
; run: %fma_f64x2([0x1.0p-1022 0x0.0], [0x1.0p-1022 0x0.0], [0x1.0p-1022 0x1.0p-1022]) == [0x1.0p-1022 0x1.0p-1022]
|
||||
|
||||
; F64 Subnormals
|
||||
; run: %fma_f64x2([0x0.8p-1022 0x0.8p-1022], [0x0.8p-1022 0x0.8p-1022], [0x0.8p-1022 0x0.0]) == [0x0.8p-1022 0x0.0]
|
||||
; run: %fma_f64x2([0x0.0 0x0.0000000000001p-1022], [0x0.0 0x0.0000000000001p-1022], [0x0.8p-1022 0x0.0000000000001p-1022]) == [0x0.8p-1022 0x0.0000000000001p-1022]
|
||||
; run: %fma_f64x2([0x0.0000000000001p-1022 0x0.0], [0x0.0000000000001p-1022 0x0.0], [0x0.0 0x0.0000000000001p-1022]) == [0x0.0 0x0.0000000000001p-1022]
|
||||
|
||||
|
||||
;; The IEEE754 Standard does not make a lot of guarantees about what
|
||||
;; comes out of NaN producing operations, we just check if its a NaN
|
||||
function %fma_is_nan_f64x2(f64x2, f64x2, f64x2) -> b1 {
|
||||
block0(v0: f64x2, v1: f64x2, v2: f64x2):
|
||||
v3 = fma v0, v1, v2
|
||||
v4 = fcmp ne v3, v3
|
||||
v5 = vall_true v4
|
||||
return v5
|
||||
}
|
||||
; run: %fma_is_nan_f64x2([Inf -Inf], [-Inf Inf], [Inf Inf]) == true
|
||||
; run: %fma_is_nan_f64x2([-Inf +NaN], [-Inf 0x0.0], [-Inf 0x0.0]) == true
|
||||
; run: %fma_is_nan_f64x2([0x0.0 0x0.0], [+NaN 0x0.0], [0x0.0 +NaN]) == true
|
||||
; run: %fma_is_nan_f64x2([-NaN 0x0.0], [0x0.0 -NaN], [0x0.0 0x0.0]) == true
|
||||
; run: %fma_is_nan_f64x2([0x0.0 NaN], [0x0.0 NaN], [-NaN NaN]) == true
|
||||
@@ -92,6 +92,9 @@ pub fn builder_with_options(infer_native_flags: bool) -> Result<isa::Builder, &'
|
||||
if std::is_x86_feature_detected!("avx2") {
|
||||
isa_builder.enable("has_avx2").unwrap();
|
||||
}
|
||||
if std::is_x86_feature_detected!("fma") {
|
||||
isa_builder.enable("has_fma").unwrap();
|
||||
}
|
||||
if std::is_x86_feature_detected!("bmi1") {
|
||||
isa_builder.enable("has_bmi1").unwrap();
|
||||
}
|
||||
|
||||
@@ -111,6 +111,7 @@ impl<'a> Arbitrary<'a> for CodegenSettings {
|
||||
std:"popcnt" => clif:"has_popcnt",
|
||||
std:"avx" => clif:"has_avx",
|
||||
std:"avx2" => clif:"has_avx2",
|
||||
std:"fma" => clif:"has_fma",
|
||||
std:"bmi1" => clif:"has_bmi1",
|
||||
std:"bmi2" => clif:"has_bmi2",
|
||||
std:"lzcnt" => clif:"has_lzcnt",
|
||||
|
||||
@@ -486,6 +486,7 @@ impl Engine {
|
||||
"has_popcnt" => Some(std::is_x86_feature_detected!("popcnt")),
|
||||
"has_avx" => Some(std::is_x86_feature_detected!("avx")),
|
||||
"has_avx2" => Some(std::is_x86_feature_detected!("avx2")),
|
||||
"has_fma" => Some(std::is_x86_feature_detected!("fma")),
|
||||
"has_bmi1" => Some(std::is_x86_feature_detected!("bmi1")),
|
||||
"has_bmi2" => Some(std::is_x86_feature_detected!("bmi2")),
|
||||
"has_avx512bitalg" => Some(std::is_x86_feature_detected!("avx512bitalg")),
|
||||
|
||||
@@ -155,6 +155,8 @@ mod test {
|
||||
"--cranelift-enable",
|
||||
"has_avx2",
|
||||
"--cranelift-enable",
|
||||
"has_fma",
|
||||
"--cranelift-enable",
|
||||
"has_avx512dq",
|
||||
"--cranelift-enable",
|
||||
"has_avx512vl",
|
||||
|
||||
Reference in New Issue
Block a user