x64: Add native lowering for scalar fma (#4539)
Use `vfmadd213{ss,sd}` for these lowerings.
This commit is contained in:
@@ -1095,7 +1095,9 @@
|
|||||||
(extern extractor cc_nz_or_z cc_nz_or_z)
|
(extern extractor cc_nz_or_z cc_nz_or_z)
|
||||||
|
|
||||||
(type AvxOpcode extern
|
(type AvxOpcode extern
|
||||||
(enum Vfmadd213ps
|
(enum Vfmadd213ss
|
||||||
|
Vfmadd213sd
|
||||||
|
Vfmadd213ps
|
||||||
Vfmadd213pd))
|
Vfmadd213pd))
|
||||||
|
|
||||||
(type Avx512Opcode extern
|
(type Avx512Opcode extern
|
||||||
@@ -1389,6 +1391,9 @@
|
|||||||
(decl use_popcnt () Type)
|
(decl use_popcnt () Type)
|
||||||
(extern extractor use_popcnt use_popcnt)
|
(extern extractor use_popcnt use_popcnt)
|
||||||
|
|
||||||
|
(decl use_fma () Type)
|
||||||
|
(extern extractor use_fma use_fma)
|
||||||
|
|
||||||
;;;; Helpers for Merging and Sinking Immediates/Loads ;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;; Helpers for Merging and Sinking Immediates/Loads ;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
;; Extract a constant `Imm8Reg.Imm8` from a value operand.
|
;; Extract a constant `Imm8Reg.Imm8` from a value operand.
|
||||||
@@ -2935,6 +2940,16 @@
|
|||||||
dst))))
|
dst))))
|
||||||
dst))
|
dst))
|
||||||
|
|
||||||
|
;; Helper for creating `vfmadd213ss` instructions.
|
||||||
|
(decl x64_vfmadd213ss (Xmm Xmm XmmMem) Xmm)
|
||||||
|
(rule (x64_vfmadd213ss x y z)
|
||||||
|
(xmm_rmr_vex (AvxOpcode.Vfmadd213ss) x y z))
|
||||||
|
|
||||||
|
;; Helper for creating `vfmadd213sd` instructions.
|
||||||
|
(decl x64_vfmadd213sd (Xmm Xmm XmmMem) Xmm)
|
||||||
|
(rule (x64_vfmadd213sd x y z)
|
||||||
|
(xmm_rmr_vex (AvxOpcode.Vfmadd213sd) x y z))
|
||||||
|
|
||||||
;; Helper for creating `vfmadd213ps` instructions.
|
;; Helper for creating `vfmadd213ps` instructions.
|
||||||
(decl x64_vfmadd213ps (Xmm Xmm XmmMem) Xmm)
|
(decl x64_vfmadd213ps (Xmm Xmm XmmMem) Xmm)
|
||||||
(rule (x64_vfmadd213ps x y z)
|
(rule (x64_vfmadd213ps x y z)
|
||||||
|
|||||||
@@ -1383,6 +1383,8 @@ impl fmt::Display for SseOpcode {
|
|||||||
|
|
||||||
#[derive(Clone, PartialEq)]
|
#[derive(Clone, PartialEq)]
|
||||||
pub enum AvxOpcode {
|
pub enum AvxOpcode {
|
||||||
|
Vfmadd213ss,
|
||||||
|
Vfmadd213sd,
|
||||||
Vfmadd213ps,
|
Vfmadd213ps,
|
||||||
Vfmadd213pd,
|
Vfmadd213pd,
|
||||||
}
|
}
|
||||||
@@ -1391,8 +1393,10 @@ impl AvxOpcode {
|
|||||||
/// Which `InstructionSet`s support the opcode?
|
/// Which `InstructionSet`s support the opcode?
|
||||||
pub(crate) fn available_from(&self) -> SmallVec<[InstructionSet; 2]> {
|
pub(crate) fn available_from(&self) -> SmallVec<[InstructionSet; 2]> {
|
||||||
match self {
|
match self {
|
||||||
AvxOpcode::Vfmadd213ps => smallvec![InstructionSet::FMA],
|
AvxOpcode::Vfmadd213ss
|
||||||
AvxOpcode::Vfmadd213pd => smallvec![InstructionSet::FMA],
|
| AvxOpcode::Vfmadd213sd
|
||||||
|
| AvxOpcode::Vfmadd213ps
|
||||||
|
| AvxOpcode::Vfmadd213pd => smallvec![InstructionSet::FMA],
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1400,6 +1404,8 @@ impl AvxOpcode {
|
|||||||
impl fmt::Debug for AvxOpcode {
|
impl fmt::Debug for AvxOpcode {
|
||||||
fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
|
fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
|
||||||
let name = match self {
|
let name = match self {
|
||||||
|
AvxOpcode::Vfmadd213ss => "vfmadd213ss",
|
||||||
|
AvxOpcode::Vfmadd213sd => "vfmadd213sd",
|
||||||
AvxOpcode::Vfmadd213ps => "vfmadd213ps",
|
AvxOpcode::Vfmadd213ps => "vfmadd213ps",
|
||||||
AvxOpcode::Vfmadd213pd => "vfmadd213pd",
|
AvxOpcode::Vfmadd213pd => "vfmadd213pd",
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -1742,6 +1742,8 @@ pub(crate) fn emit(
|
|||||||
let src3 = src3.clone().to_reg_mem().with_allocs(allocs);
|
let src3 = src3.clone().to_reg_mem().with_allocs(allocs);
|
||||||
|
|
||||||
let (w, opcode) = match op {
|
let (w, opcode) = match op {
|
||||||
|
AvxOpcode::Vfmadd213ss => (false, 0xA9),
|
||||||
|
AvxOpcode::Vfmadd213sd => (true, 0xA9),
|
||||||
AvxOpcode::Vfmadd213ps => (false, 0xA8),
|
AvxOpcode::Vfmadd213ps => (false, 0xA8),
|
||||||
AvxOpcode::Vfmadd213pd => (true, 0xA8),
|
AvxOpcode::Vfmadd213pd => (true, 0xA8),
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -3531,6 +3531,18 @@ fn test_x64_emit() {
|
|||||||
// ========================================================
|
// ========================================================
|
||||||
// XMM FMA
|
// XMM FMA
|
||||||
|
|
||||||
|
insns.push((
|
||||||
|
Inst::xmm_rm_r_vex(AvxOpcode::Vfmadd213ss, RegMem::reg(xmm2), xmm1, w_xmm0),
|
||||||
|
"C4E271A9C2",
|
||||||
|
"vfmadd213ss %xmm0, %xmm1, %xmm2, %xmm0",
|
||||||
|
));
|
||||||
|
|
||||||
|
insns.push((
|
||||||
|
Inst::xmm_rm_r_vex(AvxOpcode::Vfmadd213sd, RegMem::reg(xmm5), xmm4, w_xmm3),
|
||||||
|
"C4E2D9A9DD",
|
||||||
|
"vfmadd213sd %xmm3, %xmm4, %xmm5, %xmm3",
|
||||||
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r_vex(AvxOpcode::Vfmadd213ps, RegMem::reg(xmm2), xmm1, w_xmm0),
|
Inst::xmm_rm_r_vex(AvxOpcode::Vfmadd213ps, RegMem::reg(xmm2), xmm1, w_xmm0),
|
||||||
"C4E271A8C2",
|
"C4E271A8C2",
|
||||||
|
|||||||
@@ -1847,7 +1847,12 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
|
|||||||
// Vfmadd uses and defs the dst reg, that is not the case with all
|
// Vfmadd uses and defs the dst reg, that is not the case with all
|
||||||
// AVX's ops, if you're adding a new op, make sure to correctly define
|
// AVX's ops, if you're adding a new op, make sure to correctly define
|
||||||
// register uses.
|
// register uses.
|
||||||
assert!(*op == AvxOpcode::Vfmadd213ps || *op == AvxOpcode::Vfmadd213pd);
|
assert!(
|
||||||
|
*op == AvxOpcode::Vfmadd213ss
|
||||||
|
|| *op == AvxOpcode::Vfmadd213sd
|
||||||
|
|| *op == AvxOpcode::Vfmadd213ps
|
||||||
|
|| *op == AvxOpcode::Vfmadd213pd
|
||||||
|
);
|
||||||
|
|
||||||
collector.reg_use(src1.to_reg());
|
collector.reg_use(src1.to_reg());
|
||||||
collector.reg_reuse_def(dst.to_writable_reg(), 0);
|
collector.reg_reuse_def(dst.to_writable_reg(), 0);
|
||||||
|
|||||||
@@ -2504,9 +2504,13 @@
|
|||||||
(libcall_3 (LibCall.FmaF32) x y z))
|
(libcall_3 (LibCall.FmaF32) x y z))
|
||||||
(rule (lower (has_type $F64 (fma x y z)))
|
(rule (lower (has_type $F64 (fma x y z)))
|
||||||
(libcall_3 (LibCall.FmaF64) x y z))
|
(libcall_3 (LibCall.FmaF64) x y z))
|
||||||
(rule (lower (has_type $F32X4 (fma x y z)))
|
(rule 1 (lower (has_type (and (use_fma) $F32) (fma x y z)))
|
||||||
|
(x64_vfmadd213ss x y z))
|
||||||
|
(rule 1 (lower (has_type (and (use_fma) $F64) (fma x y z)))
|
||||||
|
(x64_vfmadd213sd x y z))
|
||||||
|
(rule (lower (has_type (and (use_fma) $F32X4) (fma x y z)))
|
||||||
(x64_vfmadd213ps x y z))
|
(x64_vfmadd213ps x y z))
|
||||||
(rule (lower (has_type $F64X2 (fma x y z)))
|
(rule (lower (has_type (and (use_fma) $F64X2) (fma x y z)))
|
||||||
(x64_vfmadd213pd x y z))
|
(x64_vfmadd213pd x y z))
|
||||||
|
|
||||||
;; Rules for `load*` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;; Rules for `load*` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|||||||
@@ -248,6 +248,15 @@ where
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn use_fma(&mut self, _: Type) -> Option<()> {
|
||||||
|
if self.isa_flags.use_fma() {
|
||||||
|
Some(())
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
fn imm8_from_value(&mut self, val: Value) -> Option<Imm8Reg> {
|
fn imm8_from_value(&mut self, val: Value) -> Option<Imm8Reg> {
|
||||||
let inst = self.lower_ctx.dfg().value_def(val).inst()?;
|
let inst = self.lower_ctx.dfg().value_def(val).inst()?;
|
||||||
|
|||||||
33
cranelift/filetests/filetests/isa/x64/fma-call.clif
Normal file
33
cranelift/filetests/filetests/isa/x64/fma-call.clif
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
test compile precise-output
|
||||||
|
target x86_64 has_avx=false has_fma=false
|
||||||
|
|
||||||
|
function %fma_f32(f32, f32, f32) -> f32 {
|
||||||
|
block0(v0: f32, v1: f32, v2: f32):
|
||||||
|
v3 = fma v0, v1, v2
|
||||||
|
return v3
|
||||||
|
}
|
||||||
|
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; load_ext_name %FmaF32+0, %rax
|
||||||
|
; call *%rax
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
|
||||||
|
function %fma_f64(f64, f64, f64) -> f64 {
|
||||||
|
block0(v0: f64, v1: f64, v2: f64):
|
||||||
|
v3 = fma v0, v1, v2
|
||||||
|
return v3
|
||||||
|
}
|
||||||
|
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; load_ext_name %FmaF64+0, %rax
|
||||||
|
; call *%rax
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
|
||||||
31
cranelift/filetests/filetests/isa/x64/fma-inst.clif
Normal file
31
cranelift/filetests/filetests/isa/x64/fma-inst.clif
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
test compile precise-output
|
||||||
|
target x86_64 has_avx=true has_fma=true
|
||||||
|
|
||||||
|
function %fma_f32(f32, f32, f32) -> f32 {
|
||||||
|
block0(v0: f32, v1: f32, v2: f32):
|
||||||
|
v3 = fma v0, v1, v2
|
||||||
|
return v3
|
||||||
|
}
|
||||||
|
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; vfmadd213ss %xmm0, %xmm1, %xmm2, %xmm0
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
|
||||||
|
function %fma_f64(f64, f64, f64) -> f64 {
|
||||||
|
block0(v0: f64, v1: f64, v2: f64):
|
||||||
|
v3 = fma v0, v1, v2
|
||||||
|
return v3
|
||||||
|
}
|
||||||
|
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; vfmadd213sd %xmm0, %xmm1, %xmm2, %xmm0
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
|
||||||
@@ -2,6 +2,7 @@ test interpret
|
|||||||
test run
|
test run
|
||||||
target aarch64
|
target aarch64
|
||||||
target s390x
|
target s390x
|
||||||
|
target x86_64 has_avx has_fma
|
||||||
target x86_64 has_avx=false has_fma=false
|
target x86_64 has_avx=false has_fma=false
|
||||||
|
|
||||||
function %fma_f32(f32, f32, f32) -> f32 {
|
function %fma_f32(f32, f32, f32) -> f32 {
|
||||||
|
|||||||
Reference in New Issue
Block a user