diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle index bd0a841a9c..c270817564 100644 --- a/cranelift/codegen/src/isa/x64/inst.isle +++ b/cranelift/codegen/src/isa/x64/inst.isle @@ -1095,7 +1095,9 @@ (extern extractor cc_nz_or_z cc_nz_or_z) (type AvxOpcode extern - (enum Vfmadd213ps + (enum Vfmadd213ss + Vfmadd213sd + Vfmadd213ps Vfmadd213pd)) (type Avx512Opcode extern @@ -1389,6 +1391,9 @@ (decl use_popcnt () Type) (extern extractor use_popcnt use_popcnt) +(decl use_fma () Type) +(extern extractor use_fma use_fma) + ;;;; Helpers for Merging and Sinking Immediates/Loads ;;;;;;;;;;;;;;;;;;;;;;;;; ;; Extract a constant `Imm8Reg.Imm8` from a value operand. @@ -2935,6 +2940,16 @@ dst)))) dst)) +;; Helper for creating `vfmadd213ss` instructions. +(decl x64_vfmadd213ss (Xmm Xmm XmmMem) Xmm) +(rule (x64_vfmadd213ss x y z) + (xmm_rmr_vex (AvxOpcode.Vfmadd213ss) x y z)) + +;; Helper for creating `vfmadd213sd` instructions. +(decl x64_vfmadd213sd (Xmm Xmm XmmMem) Xmm) +(rule (x64_vfmadd213sd x y z) + (xmm_rmr_vex (AvxOpcode.Vfmadd213sd) x y z)) + ;; Helper for creating `vfmadd213ps` instructions. (decl x64_vfmadd213ps (Xmm Xmm XmmMem) Xmm) (rule (x64_vfmadd213ps x y z) diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs index 99d88c0cc5..8baf4061b4 100644 --- a/cranelift/codegen/src/isa/x64/inst/args.rs +++ b/cranelift/codegen/src/isa/x64/inst/args.rs @@ -1383,6 +1383,8 @@ impl fmt::Display for SseOpcode { #[derive(Clone, PartialEq)] pub enum AvxOpcode { + Vfmadd213ss, + Vfmadd213sd, Vfmadd213ps, Vfmadd213pd, } @@ -1391,8 +1393,10 @@ impl AvxOpcode { /// Which `InstructionSet`s support the opcode? 
pub(crate) fn available_from(&self) -> SmallVec<[InstructionSet; 2]> { match self { - AvxOpcode::Vfmadd213ps => smallvec![InstructionSet::FMA], - AvxOpcode::Vfmadd213pd => smallvec![InstructionSet::FMA], + AvxOpcode::Vfmadd213ss + | AvxOpcode::Vfmadd213sd + | AvxOpcode::Vfmadd213ps + | AvxOpcode::Vfmadd213pd => smallvec![InstructionSet::FMA], } } } @@ -1400,6 +1404,8 @@ impl AvxOpcode { impl fmt::Debug for AvxOpcode { fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { let name = match self { + AvxOpcode::Vfmadd213ss => "vfmadd213ss", + AvxOpcode::Vfmadd213sd => "vfmadd213sd", AvxOpcode::Vfmadd213ps => "vfmadd213ps", AvxOpcode::Vfmadd213pd => "vfmadd213pd", }; diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index 64baebd95c..c6b8aad386 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -1742,6 +1742,8 @@ pub(crate) fn emit( let src3 = src3.clone().to_reg_mem().with_allocs(allocs); let (w, opcode) = match op { + AvxOpcode::Vfmadd213ss => (false, 0xA9), + AvxOpcode::Vfmadd213sd => (true, 0xA9), AvxOpcode::Vfmadd213ps => (false, 0xA8), AvxOpcode::Vfmadd213pd => (true, 0xA8), }; diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs index ccc6d396d0..087613d52c 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs @@ -3531,6 +3531,18 @@ fn test_x64_emit() { // ======================================================== // XMM FMA + insns.push(( + Inst::xmm_rm_r_vex(AvxOpcode::Vfmadd213ss, RegMem::reg(xmm2), xmm1, w_xmm0), + "C4E271A9C2", + "vfmadd213ss %xmm0, %xmm1, %xmm2, %xmm0", + )); + + insns.push(( + Inst::xmm_rm_r_vex(AvxOpcode::Vfmadd213sd, RegMem::reg(xmm5), xmm4, w_xmm3), + "C4E2D9A9DD", + "vfmadd213sd %xmm3, %xmm4, %xmm5, %xmm3", + )); + insns.push(( Inst::xmm_rm_r_vex(AvxOpcode::Vfmadd213ps, RegMem::reg(xmm2), xmm1, w_xmm0), 
"C4E271A8C2", diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs index 18ec33c793..581a55ef7c 100644 --- a/cranelift/codegen/src/isa/x64/inst/mod.rs +++ b/cranelift/codegen/src/isa/x64/inst/mod.rs @@ -1847,7 +1847,12 @@ fn x64_get_operands VReg>(inst: &Inst, collector: &mut OperandCol // Vfmadd uses and defs the dst reg, that is not the case with all // AVX's ops, if you're adding a new op, make sure to correctly define // register uses. - assert!(*op == AvxOpcode::Vfmadd213ps || *op == AvxOpcode::Vfmadd213pd); + assert!( + *op == AvxOpcode::Vfmadd213ss + || *op == AvxOpcode::Vfmadd213sd + || *op == AvxOpcode::Vfmadd213ps + || *op == AvxOpcode::Vfmadd213pd + ); collector.reg_use(src1.to_reg()); collector.reg_reuse_def(dst.to_writable_reg(), 0); diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index 355e467b63..8f840a086f 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -2504,9 +2504,13 @@ (libcall_3 (LibCall.FmaF32) x y z)) (rule (lower (has_type $F64 (fma x y z))) (libcall_3 (LibCall.FmaF64) x y z)) -(rule (lower (has_type $F32X4 (fma x y z))) +(rule 1 (lower (has_type (and (use_fma) $F32) (fma x y z))) + (x64_vfmadd213ss x y z)) +(rule 1 (lower (has_type (and (use_fma) $F64) (fma x y z))) + (x64_vfmadd213sd x y z)) +(rule (lower (has_type (and (use_fma) $F32X4) (fma x y z))) (x64_vfmadd213ps x y z)) -(rule (lower (has_type $F64X2 (fma x y z))) +(rule (lower (has_type (and (use_fma) $F64X2) (fma x y z))) (x64_vfmadd213pd x y z)) ;; Rules for `load*` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/cranelift/codegen/src/isa/x64/lower/isle.rs b/cranelift/codegen/src/isa/x64/lower/isle.rs index 8fffd3857f..93aa383df6 100644 --- a/cranelift/codegen/src/isa/x64/lower/isle.rs +++ b/cranelift/codegen/src/isa/x64/lower/isle.rs @@ -248,6 +248,15 @@ where } } + #[inline] + fn use_fma(&mut self, _: 
Type) -> Option<()> { + if self.isa_flags.use_fma() { + Some(()) + } else { + None + } + } + #[inline] fn imm8_from_value(&mut self, val: Value) -> Option<Imm8Reg> { let inst = self.lower_ctx.dfg().value_def(val).inst()?; diff --git a/cranelift/filetests/filetests/isa/x64/fma-call.clif b/cranelift/filetests/filetests/isa/x64/fma-call.clif new file mode 100644 index 0000000000..7de6624f3e --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/fma-call.clif @@ -0,0 +1,33 @@ +test compile precise-output +target x86_64 has_avx=false has_fma=false + +function %fma_f32(f32, f32, f32) -> f32 { +block0(v0: f32, v1: f32, v2: f32): + v3 = fma v0, v1, v2 + return v3 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; load_ext_name %FmaF32+0, %rax +; call *%rax +; movq %rbp, %rsp +; popq %rbp +; ret + +function %fma_f64(f64, f64, f64) -> f64 { +block0(v0: f64, v1: f64, v2: f64): + v3 = fma v0, v1, v2 + return v3 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; load_ext_name %FmaF64+0, %rax +; call *%rax +; movq %rbp, %rsp +; popq %rbp +; ret + diff --git a/cranelift/filetests/filetests/isa/x64/fma-inst.clif b/cranelift/filetests/filetests/isa/x64/fma-inst.clif new file mode 100644 index 0000000000..dd8b40081a --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/fma-inst.clif @@ -0,0 +1,31 @@ +test compile precise-output +target x86_64 has_avx=true has_fma=true + +function %fma_f32(f32, f32, f32) -> f32 { +block0(v0: f32, v1: f32, v2: f32): + v3 = fma v0, v1, v2 + return v3 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vfmadd213ss %xmm0, %xmm1, %xmm2, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret + +function %fma_f64(f64, f64, f64) -> f64 { +block0(v0: f64, v1: f64, v2: f64): + v3 = fma v0, v1, v2 + return v3 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vfmadd213sd %xmm0, %xmm1, %xmm2, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret + diff --git a/cranelift/filetests/filetests/runtests/fma.clif b/cranelift/filetests/filetests/runtests/fma.clif index 
4f11b48f9c..7da28aa380 100644 --- a/cranelift/filetests/filetests/runtests/fma.clif +++ b/cranelift/filetests/filetests/runtests/fma.clif @@ -2,6 +2,7 @@ test interpret test run target aarch64 target s390x +target x86_64 has_avx has_fma target x86_64 has_avx=false has_fma=false function %fma_f32(f32, f32, f32) -> f32 {