x64: Add more fma instruction lowerings (#5846)
The relaxed-simd proposal for WebAssembly adds a fused multiply-add operation for `v128` types, so I was poking around at Cranelift's existing support for its `fma` instruction as well as the x86_64 ISA's offerings for FMA. The result is this PR, which improves the lowering of the `fma` instruction in the x64 backend in a number of ways:

* A libcall-based fallback is now provided for the `f32x4` and `f64x2` types in preparation for eventual support of the relaxed-simd proposal. These lowerings are horribly slow, but if FMA semantics must be guaranteed then they're the best that can be done without the `fma` feature. Otherwise it'll be up to producers (e.g. Wasmtime embedders) whether wasm-level FMA operations should be true FMA or multiply-then-add.

* In addition to the existing `vfmadd213*` instructions, opcodes were added for `vfmadd132*`. The `132` variant is selected based on which argument has a sinkable load (see the operand-ordering sketch after this list).

* Any argument of the `fma` CLIF instruction can now be a `sinkable_load`, and a single FMA instruction will still be generated.

* All `vfnmadd*` opcodes were added as well. These are pattern-matched when one of the multiplicand arguments of the CLIF instruction is an `fneg`. I opted not to add a new CLIF instruction here since pattern matching seemed easy enough, but I'm not intimately familiar with the semantics, so if a dedicated instruction is the preferred approach I can do that too.
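Since the `132`/`213` suffixes can be confusing, here's a quick reference for how the operand forms differ (my paraphrase of the Intel docs, using Intel operand order with the destination first; this is just a note, not part of the diff below). Operand 3 is the one that may be a memory operand, which is why the `132` form is used when a multiplicand has a sinkable load and the `213` form when the addend does:

```
;; dst is operand 1; operand 3 is the one that may come from memory.
;; vfmadd213  dst, b, c   =>  dst = (b * dst) + c    ;; addend can be a sunk load
;; vfmadd132  dst, b, c   =>  dst = (dst * c) + b    ;; a multiplicand can be a sunk load
;; vfnmadd213 dst, b, c   =>  dst = -(b * dst) + c
;; vfnmadd132 dst, b, c   =>  dst = -(dst * c) + b
```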
@@ -1199,6 +1199,18 @@
Vfmadd213sd
Vfmadd213ps
Vfmadd213pd
Vfmadd132ss
Vfmadd132sd
Vfmadd132ps
Vfmadd132pd
Vfnmadd213ss
Vfnmadd213sd
Vfnmadd213ps
Vfnmadd213pd
Vfnmadd132ss
Vfnmadd132sd
Vfnmadd132ps
Vfnmadd132pd
Vcmpps
Vcmppd
Vpsrlw

@@ -1623,8 +1635,8 @@
(decl use_popcnt (bool) Type)
(extern extractor infallible use_popcnt use_popcnt)

(decl use_fma (bool) Type)
(extern extractor infallible use_fma use_fma)
(decl pure use_fma () bool)
(extern constructor use_fma use_fma)

(decl use_sse41 (bool) Type)
(extern extractor infallible use_sse41 use_sse41)

@@ -3598,34 +3610,33 @@
(_ Unit (emit (MInst.XmmRmRVex3 op src1 src2 src3 dst))))
dst))

;; Helper for creating `vfmadd213ss` instructions.
; TODO: This should have the (Xmm Xmm XmmMem) signature
; but we don't support VEX memory encodings yet
(decl x64_vfmadd213ss (Xmm Xmm Xmm) Xmm)
(rule (x64_vfmadd213ss x y z)
(xmm_rmr_vex3 (AvxOpcode.Vfmadd213ss) x y z))
;; Helper for creating `vfmadd213*` instructions
(decl x64_vfmadd213 (Type Xmm Xmm XmmMem) Xmm)
(rule (x64_vfmadd213 $F32 a b c) (xmm_rmr_vex3 (AvxOpcode.Vfmadd213ss) a b c))
(rule (x64_vfmadd213 $F64 a b c) (xmm_rmr_vex3 (AvxOpcode.Vfmadd213sd) a b c))
(rule (x64_vfmadd213 $F32X4 a b c) (xmm_rmr_vex3 (AvxOpcode.Vfmadd213ps) a b c))
(rule (x64_vfmadd213 $F64X2 a b c) (xmm_rmr_vex3 (AvxOpcode.Vfmadd213pd) a b c))

;; Helper for creating `vfmadd213sd` instructions.
; TODO: This should have the (Xmm Xmm XmmMem) signature
; but we don't support VEX memory encodings yet
(decl x64_vfmadd213sd (Xmm Xmm Xmm) Xmm)
(rule (x64_vfmadd213sd x y z)
(xmm_rmr_vex3 (AvxOpcode.Vfmadd213sd) x y z))
;; Helper for creating `vfmadd132*` instructions
(decl x64_vfmadd132 (Type Xmm Xmm XmmMem) Xmm)
(rule (x64_vfmadd132 $F32 a b c) (xmm_rmr_vex3 (AvxOpcode.Vfmadd132ss) a b c))
(rule (x64_vfmadd132 $F64 a b c) (xmm_rmr_vex3 (AvxOpcode.Vfmadd132sd) a b c))
(rule (x64_vfmadd132 $F32X4 a b c) (xmm_rmr_vex3 (AvxOpcode.Vfmadd132ps) a b c))
(rule (x64_vfmadd132 $F64X2 a b c) (xmm_rmr_vex3 (AvxOpcode.Vfmadd132pd) a b c))

;; Helper for creating `vfmadd213ps` instructions.
; TODO: This should have the (Xmm Xmm XmmMem) signature
; but we don't support VEX memory encodings yet
(decl x64_vfmadd213ps (Xmm Xmm Xmm) Xmm)
(rule (x64_vfmadd213ps x y z)
(xmm_rmr_vex3 (AvxOpcode.Vfmadd213ps) x y z))

;; Helper for creating `vfmadd213pd` instructions.
; TODO: This should have the (Xmm Xmm XmmMem) signature
; but we don't support VEX memory encodings yet
(decl x64_vfmadd213pd (Xmm Xmm Xmm) Xmm)
(rule (x64_vfmadd213pd x y z)
(xmm_rmr_vex3 (AvxOpcode.Vfmadd213pd) x y z))
;; Helper for creating `vfnmadd213*` instructions
(decl x64_vfnmadd213 (Type Xmm Xmm XmmMem) Xmm)
(rule (x64_vfnmadd213 $F32 a b c) (xmm_rmr_vex3 (AvxOpcode.Vfnmadd213ss) a b c))
(rule (x64_vfnmadd213 $F64 a b c) (xmm_rmr_vex3 (AvxOpcode.Vfnmadd213sd) a b c))
(rule (x64_vfnmadd213 $F32X4 a b c) (xmm_rmr_vex3 (AvxOpcode.Vfnmadd213ps) a b c))
(rule (x64_vfnmadd213 $F64X2 a b c) (xmm_rmr_vex3 (AvxOpcode.Vfnmadd213pd) a b c))

;; Helper for creating `vfnmadd132*` instructions
(decl x64_vfnmadd132 (Type Xmm Xmm XmmMem) Xmm)
(rule (x64_vfnmadd132 $F32 a b c) (xmm_rmr_vex3 (AvxOpcode.Vfnmadd132ss) a b c))
(rule (x64_vfnmadd132 $F64 a b c) (xmm_rmr_vex3 (AvxOpcode.Vfnmadd132sd) a b c))
(rule (x64_vfnmadd132 $F32X4 a b c) (xmm_rmr_vex3 (AvxOpcode.Vfnmadd132ps) a b c))
(rule (x64_vfnmadd132 $F64X2 a b c) (xmm_rmr_vex3 (AvxOpcode.Vfnmadd132pd) a b c))

;; Helper for creating `sqrtss` instructions.
(decl x64_sqrtss (XmmMem) Xmm)

@@ -1515,7 +1515,19 @@ impl AvxOpcode {
AvxOpcode::Vfmadd213ss
| AvxOpcode::Vfmadd213sd
| AvxOpcode::Vfmadd213ps
| AvxOpcode::Vfmadd213pd => smallvec![InstructionSet::FMA],
| AvxOpcode::Vfmadd213pd
| AvxOpcode::Vfmadd132ss
| AvxOpcode::Vfmadd132sd
| AvxOpcode::Vfmadd132ps
| AvxOpcode::Vfmadd132pd
| AvxOpcode::Vfnmadd213ss
| AvxOpcode::Vfnmadd213sd
| AvxOpcode::Vfnmadd213ps
| AvxOpcode::Vfnmadd213pd
| AvxOpcode::Vfnmadd132ss
| AvxOpcode::Vfnmadd132sd
| AvxOpcode::Vfnmadd132ps
| AvxOpcode::Vfnmadd132pd => smallvec![InstructionSet::FMA],
AvxOpcode::Vminps
| AvxOpcode::Vminpd
| AvxOpcode::Vmaxps

@@ -2281,32 +2281,46 @@ pub(crate) fn emit(
let dst = allocs.next(dst.to_reg().to_reg());
debug_assert_eq!(src1, dst);
let src2 = allocs.next(src2.to_reg());
let src3 = src3.clone().to_reg_mem().with_allocs(allocs);
let src3 = match src3.clone().to_reg_mem().with_allocs(allocs) {
RegMem::Reg { reg } => {
RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
}
RegMem::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)),
};

let (w, map, opcode) = match op {
AvxOpcode::Vfmadd132ss => (false, OpcodeMap::_0F38, 0x99),
AvxOpcode::Vfmadd213ss => (false, OpcodeMap::_0F38, 0xA9),
AvxOpcode::Vfnmadd132ss => (false, OpcodeMap::_0F38, 0x9D),
AvxOpcode::Vfnmadd213ss => (false, OpcodeMap::_0F38, 0xAD),
AvxOpcode::Vfmadd132sd => (true, OpcodeMap::_0F38, 0x99),
AvxOpcode::Vfmadd213sd => (true, OpcodeMap::_0F38, 0xA9),
AvxOpcode::Vfnmadd132sd => (true, OpcodeMap::_0F38, 0x9D),
AvxOpcode::Vfnmadd213sd => (true, OpcodeMap::_0F38, 0xAD),
AvxOpcode::Vfmadd132ps => (false, OpcodeMap::_0F38, 0x98),
AvxOpcode::Vfmadd213ps => (false, OpcodeMap::_0F38, 0xA8),
AvxOpcode::Vfnmadd132ps => (false, OpcodeMap::_0F38, 0x9C),
AvxOpcode::Vfnmadd213ps => (false, OpcodeMap::_0F38, 0xAC),
AvxOpcode::Vfmadd132pd => (true, OpcodeMap::_0F38, 0x98),
AvxOpcode::Vfmadd213pd => (true, OpcodeMap::_0F38, 0xA8),
AvxOpcode::Vfnmadd132pd => (true, OpcodeMap::_0F38, 0x9C),
AvxOpcode::Vfnmadd213pd => (true, OpcodeMap::_0F38, 0xAC),
AvxOpcode::Vblendvps => (false, OpcodeMap::_0F3A, 0x4A),
AvxOpcode::Vblendvpd => (false, OpcodeMap::_0F3A, 0x4B),
AvxOpcode::Vpblendvb => (false, OpcodeMap::_0F3A, 0x4C),
_ => unreachable!(),
};

match src3 {
RegMem::Reg { reg: src } => VexInstruction::new()
.length(VexVectorLength::V128)
.prefix(LegacyPrefixes::_66)
.map(map)
.w(w)
.opcode(opcode)
.reg(dst.to_real_reg().unwrap().hw_enc())
.rm(src.to_real_reg().unwrap().hw_enc())
.vvvv(src2.to_real_reg().unwrap().hw_enc())
.encode(sink),
_ => todo!(),
};
VexInstruction::new()
.length(VexVectorLength::V128)
.prefix(LegacyPrefixes::_66)
.map(map)
.w(w)
.opcode(opcode)
.reg(dst.to_real_reg().unwrap().hw_enc())
.rm(src3)
.vvvv(src2.to_real_reg().unwrap().hw_enc())
.encode(sink);
}

Inst::XmmRmRBlendVex {

@@ -1944,23 +1944,12 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
src2.get_operands(collector);
}
Inst::XmmRmRVex3 {
op,
src1,
src2,
src3,
dst,
..
} => {
// Vfmadd uses and defs the dst reg, that is not the case with all
// AVX's ops, if you're adding a new op, make sure to correctly define
// register uses.
assert!(
*op == AvxOpcode::Vfmadd213ss
|| *op == AvxOpcode::Vfmadd213sd
|| *op == AvxOpcode::Vfmadd213ps
|| *op == AvxOpcode::Vfmadd213pd
);

collector.reg_use(src1.to_reg());
collector.reg_reuse_def(dst.to_writable_reg(), 0);
collector.reg_use(src2.to_reg());

@@ -2167,13 +2167,13 @@
;; The above rules automatically sink loads for rhs operands, so additionally
;; add rules for sinking loads with lhs operands.
(rule 1 (lower (has_type $F32 (fadd (sinkable_load x) y)))
(x64_addss y (sink_load x)))
(x64_addss y x))
(rule 1 (lower (has_type $F64 (fadd (sinkable_load x) y)))
(x64_addsd y (sink_load x)))
(x64_addsd y x))
(rule 1 (lower (has_type $F32X4 (fadd (sinkable_load x) y)))
(x64_addps y (sink_load x)))
(x64_addps y x))
(rule 1 (lower (has_type $F64X2 (fadd (sinkable_load x) y)))
(x64_addpd y (sink_load x)))
(x64_addpd y x))

;; Rules for `fsub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

@@ -2200,13 +2200,13 @@
;; The above rules automatically sink loads for rhs operands, so additionally
;; add rules for sinking loads with lhs operands.
(rule 1 (lower (has_type $F32 (fmul (sinkable_load x) y)))
(x64_mulss y (sink_load x)))
(x64_mulss y x))
(rule 1 (lower (has_type $F64 (fmul (sinkable_load x) y)))
(x64_mulsd y (sink_load x)))
(x64_mulsd y x))
(rule 1 (lower (has_type $F32X4 (fmul (sinkable_load x) y)))
(x64_mulps y (sink_load x)))
(x64_mulps y x))
(rule 1 (lower (has_type $F64X2 (fmul (sinkable_load x) y)))
(x64_mulpd y (sink_load x)))
(x64_mulpd y x))

;; Rules for `fdiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

@@ -2438,18 +2438,83 @@

;; Rules for `fma` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Base case for fma is to call out to one of two libcalls. For vectors they
;; need to be decomposed, handle each element individually, and then recomposed.

(rule (lower (has_type $F32 (fma x y z)))
(libcall_3 (LibCall.FmaF32) x y z))
(rule (lower (has_type $F64 (fma x y z)))
(libcall_3 (LibCall.FmaF64) x y z))
(rule 1 (lower (has_type (and (use_fma $true) $F32) (fma x y z)))
(x64_vfmadd213ss x y z))
(rule 1 (lower (has_type (and (use_fma $true) $F64) (fma x y z)))
(x64_vfmadd213sd x y z))
(rule (lower (has_type (and (use_fma $true) $F32X4) (fma x y z)))
(x64_vfmadd213ps x y z))
(rule (lower (has_type (and (use_fma $true) $F64X2) (fma x y z)))
(x64_vfmadd213pd x y z))

(rule (lower (has_type $F32X4 (fma x y z)))
(let (
(x Xmm (put_in_xmm x))
(y Xmm (put_in_xmm y))
(z Xmm (put_in_xmm z))
(x0 Xmm (libcall_3 (LibCall.FmaF32) x y z))
(x1 Xmm (libcall_3 (LibCall.FmaF32)
(x64_pshufd x 1)
(x64_pshufd y 1)
(x64_pshufd z 1)))
(x2 Xmm (libcall_3 (LibCall.FmaF32)
(x64_pshufd x 2)
(x64_pshufd y 2)
(x64_pshufd z 2)))
(x3 Xmm (libcall_3 (LibCall.FmaF32)
(x64_pshufd x 3)
(x64_pshufd y 3)
(x64_pshufd z 3)))

(tmp Xmm (vec_insert_lane $F32X4 x0 x1 1))
(tmp Xmm (vec_insert_lane $F32X4 tmp x2 2))
(tmp Xmm (vec_insert_lane $F32X4 tmp x3 3))
)
tmp))
(rule (lower (has_type $F64X2 (fma x y z)))
(let (
(x Xmm (put_in_xmm x))
(y Xmm (put_in_xmm y))
(z Xmm (put_in_xmm z))
(x0 Xmm (libcall_3 (LibCall.FmaF64) x y z))
(x1 Xmm (libcall_3 (LibCall.FmaF64)
(x64_pshufd x 0xee)
(x64_pshufd y 0xee)
(x64_pshufd z 0xee)))
)
(vec_insert_lane $F64X2 x0 x1 1)))


;; Special case for when the `fma` feature is active and a native instruction
;; can be used.
(rule 1 (lower (has_type ty (fma x y z)))
(if-let $true (use_fma))
(fmadd ty x y z))

(decl fmadd (Type Value Value Value) Xmm)
(decl fnmadd (Type Value Value Value) Xmm)

;; Base case. Note that this will automatically sink a load with `z`, the value
;; to add.
(rule (fmadd ty x y z) (x64_vfmadd213 ty x y z))

;; Allow sinking loads with one of the two values being multiplied in addition
;; to the value being added. Note that both x and y can be sunk here due to
;; multiplication being commutative.
(rule 1 (fmadd ty (sinkable_load x) y z) (x64_vfmadd132 ty y z x))
(rule 2 (fmadd ty x (sinkable_load y) z) (x64_vfmadd132 ty x z y))

;; If one of the values being multiplied is negated then use a `vfnmadd*`
;; instruction instead
(rule 3 (fmadd ty (fneg x) y z) (fnmadd ty x y z))
(rule 4 (fmadd ty x (fneg y) z) (fnmadd ty x y z))

(rule (fnmadd ty x y z) (x64_vfnmadd213 ty x y z))
(rule 1 (fnmadd ty (sinkable_load x) y z) (x64_vfnmadd132 ty y z x))
(rule 2 (fnmadd ty x (sinkable_load y) z) (x64_vfnmadd132 ty x z y))

;; Like `fmadd` if one argument is negated switch which one is being codegen'd
(rule 3 (fnmadd ty (fneg x) y z) (fmadd ty x y z))
(rule 4 (fnmadd ty x (fneg y) z) (fmadd ty x y z))

;; Rules for `load*` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

@@ -213,7 +213,7 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> {
}

#[inline]
fn use_fma(&mut self, _: Type) -> bool {
fn use_fma(&mut self) -> bool {
self.backend.x64_flags.use_fma()
}

@@ -55,3 +55,183 @@ block0(v0: f64, v1: f64, v2: f64):
; popq %rbp
; retq

function %fma_f32x4(f32x4, f32x4, f32x4) -> f32x4 {
block0(v0: f32x4, v1: f32x4, v2: f32x4):
v3 = fma v0, v1, v2
return v3
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; subq %rsp, $96, %rsp
; block0:
; movdqu %xmm0, rsp(0 + virtual offset)
; movdqu %xmm1, rsp(16 + virtual offset)
; movdqu %xmm2, rsp(32 + virtual offset)
; load_ext_name %FmaF32+0, %r8
; movdqu rsp(0 + virtual offset), %xmm0
; movdqu rsp(16 + virtual offset), %xmm1
; movdqu rsp(32 + virtual offset), %xmm2
; call *%r8
; movdqu %xmm0, rsp(48 + virtual offset)
; movdqu rsp(0 + virtual offset), %xmm4
; pshufd $1, %xmm4, %xmm0
; movdqu rsp(16 + virtual offset), %xmm2
; pshufd $1, %xmm2, %xmm1
; movdqu rsp(32 + virtual offset), %xmm3
; pshufd $1, %xmm3, %xmm2
; load_ext_name %FmaF32+0, %r9
; call *%r9
; movdqu %xmm0, rsp(64 + virtual offset)
; movdqu rsp(0 + virtual offset), %xmm14
; pshufd $2, %xmm14, %xmm0
; movdqu rsp(16 + virtual offset), %xmm13
; pshufd $2, %xmm13, %xmm1
; movdqu rsp(32 + virtual offset), %xmm15
; pshufd $2, %xmm15, %xmm2
; load_ext_name %FmaF32+0, %r10
; call *%r10
; movdqu %xmm0, rsp(80 + virtual offset)
; movdqu rsp(0 + virtual offset), %xmm14
; pshufd $3, %xmm14, %xmm0
; movdqu rsp(16 + virtual offset), %xmm1
; pshufd $3, %xmm1, %xmm1
; movdqu rsp(32 + virtual offset), %xmm2
; pshufd $3, %xmm2, %xmm2
; load_ext_name %FmaF32+0, %r11
; call *%r11
; movdqa %xmm0, %xmm13
; movdqu rsp(64 + virtual offset), %xmm4
; movdqu rsp(48 + virtual offset), %xmm0
; insertps $16, %xmm0, %xmm4, %xmm0
; movdqu rsp(80 + virtual offset), %xmm10
; insertps $32, %xmm0, %xmm10, %xmm0
; movdqa %xmm13, %xmm1
; insertps $48, %xmm0, %xmm1, %xmm0
; addq %rsp, $96, %rsp
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; subq $0x60, %rsp
; block1: ; offset 0x8
; movdqu %xmm0, (%rsp)
; movdqu %xmm1, 0x10(%rsp)
; movdqu %xmm2, 0x20(%rsp)
; movabsq $0, %r8 ; reloc_external Abs8 %FmaF32 0
; movdqu (%rsp), %xmm0
; movdqu 0x10(%rsp), %xmm1
; movdqu 0x20(%rsp), %xmm2
; callq *%r8
; movdqu %xmm0, 0x30(%rsp)
; movdqu (%rsp), %xmm4
; pshufd $1, %xmm4, %xmm0
; movdqu 0x10(%rsp), %xmm2
; pshufd $1, %xmm2, %xmm1
; movdqu 0x20(%rsp), %xmm3
; pshufd $1, %xmm3, %xmm2
; movabsq $0, %r9 ; reloc_external Abs8 %FmaF32 0
; callq *%r9
; movdqu %xmm0, 0x40(%rsp)
; movdqu (%rsp), %xmm14
; pshufd $2, %xmm14, %xmm0
; movdqu 0x10(%rsp), %xmm13
; pshufd $2, %xmm13, %xmm1
; movdqu 0x20(%rsp), %xmm15
; pshufd $2, %xmm15, %xmm2
; movabsq $0, %r10 ; reloc_external Abs8 %FmaF32 0
; callq *%r10
; movdqu %xmm0, 0x50(%rsp)
; movdqu (%rsp), %xmm14
; pshufd $3, %xmm14, %xmm0
; movdqu 0x10(%rsp), %xmm1
; pshufd $3, %xmm1, %xmm1
; movdqu 0x20(%rsp), %xmm2
; pshufd $3, %xmm2, %xmm2
; movabsq $0, %r11 ; reloc_external Abs8 %FmaF32 0
; callq *%r11
; movdqa %xmm0, %xmm13
; movdqu 0x40(%rsp), %xmm4
; movdqu 0x30(%rsp), %xmm0
; insertps $0x10, %xmm4, %xmm0
; movdqu 0x50(%rsp), %xmm10
; insertps $0x20, %xmm10, %xmm0
; movdqa %xmm13, %xmm1
; insertps $0x30, %xmm1, %xmm0
; addq $0x60, %rsp
; movq %rbp, %rsp
; popq %rbp
; retq

function %fma_f64x2(f64x2, f64x2, f64x2) -> f64x2 {
block0(v0: f64x2, v1: f64x2, v2: f64x2):
v3 = fma v0, v1, v2
return v3
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; subq %rsp, $64, %rsp
; block0:
; movdqu %xmm0, rsp(0 + virtual offset)
; movdqu %xmm1, rsp(16 + virtual offset)
; movdqu %xmm2, rsp(32 + virtual offset)
; load_ext_name %FmaF64+0, %r8
; movdqu rsp(0 + virtual offset), %xmm0
; movdqu rsp(16 + virtual offset), %xmm1
; movdqu rsp(32 + virtual offset), %xmm2
; call *%r8
; movdqu %xmm0, rsp(48 + virtual offset)
; movdqu rsp(0 + virtual offset), %xmm0
; pshufd $238, %xmm0, %xmm0
; movdqu rsp(16 + virtual offset), %xmm1
; pshufd $238, %xmm1, %xmm1
; movdqu rsp(32 + virtual offset), %xmm2
; pshufd $238, %xmm2, %xmm2
; load_ext_name %FmaF64+0, %r9
; call *%r9
; movdqa %xmm0, %xmm14
; movdqu rsp(48 + virtual offset), %xmm0
; movlhps %xmm0, %xmm14, %xmm0
; addq %rsp, $64, %rsp
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; subq $0x40, %rsp
; block1: ; offset 0x8
; movdqu %xmm0, (%rsp)
; movdqu %xmm1, 0x10(%rsp)
; movdqu %xmm2, 0x20(%rsp)
; movabsq $0, %r8 ; reloc_external Abs8 %FmaF64 0
; movdqu (%rsp), %xmm0
; movdqu 0x10(%rsp), %xmm1
; movdqu 0x20(%rsp), %xmm2
; callq *%r8
; movdqu %xmm0, 0x30(%rsp)
; movdqu (%rsp), %xmm0
; pshufd $0xee, %xmm0, %xmm0
; movdqu 0x10(%rsp), %xmm1
; pshufd $0xee, %xmm1, %xmm1
; movdqu 0x20(%rsp), %xmm2
; pshufd $0xee, %xmm2, %xmm2
; movabsq $0, %r9 ; reloc_external Abs8 %FmaF64 0
; callq *%r9
; movdqa %xmm0, %xmm14
; movdqu 0x30(%rsp), %xmm0
; movlhps %xmm14, %xmm0
; addq $0x40, %rsp
; movq %rbp, %rsp
; popq %rbp
; retq

@@ -1,7 +1,7 @@
test compile precise-output
target x86_64 has_avx=true has_fma=true

function %fma_f32(f32, f32, f32) -> f32 {
function %vfmadd213ss(f32, f32, f32) -> f32 {
block0(v0: f32, v1: f32, v2: f32):
v3 = fma v0, v1, v2
return v3
@@ -26,17 +26,18 @@ block0(v0: f32, v1: f32, v2: f32):
; popq %rbp
; retq

function %fma_f64(f64, f64, f64) -> f64 {
block0(v0: f64, v1: f64, v2: f64):
v3 = fma v0, v1, v2
return v3
function %vfmadd213sd(f64, f64, i64) -> f64 {
block0(v0: f64, v1: f64, v2: i64):
v3 = load.f64 v2
v4 = fma v0, v1, v3
return v4
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vfmadd213sd %xmm0, %xmm1, %xmm2, %xmm0
; vfmadd213sd %xmm0, %xmm1, 0(%rdi), %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
@@ -46,7 +47,375 @@ block0(v0: f64, v1: f64, v2: f64):
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vfmadd213sd %xmm2, %xmm1, %xmm0
; vfmadd213sd (%rdi), %xmm1, %xmm0 ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq

function %vfmadd213ps(f32x4, f32x4, f32x4) -> f32x4 {
block0(v0: f32x4, v1: f32x4, v2: f32x4):
v3 = fma v0, v1, v2
return v3
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vfmadd213ps %xmm0, %xmm1, %xmm2, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vfmadd213ps %xmm2, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

function %vfmadd213pd(f64x2, f64x2, f64x2) -> f64x2 {
block0(v0: f64x2, v1: f64x2, v2: f64x2):
v3 = fma v0, v1, v2
return v3
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vfmadd213pd %xmm0, %xmm1, %xmm2, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vfmadd213pd %xmm2, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

function %vfmadd132ss(f32, i64, f32) -> f32 {
block0(v0: f32, v1: i64, v2: f32):
v3 = load.f32 v1
v4 = fma v0, v3, v2
return v4
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vfmadd132ss %xmm0, %xmm1, 0(%rdi), %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vfmadd132ss (%rdi), %xmm1, %xmm0 ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq

function %vfmadd132sd(i64, f64, f64) -> f64 {
block0(v0: i64, v1: f64, v2: f64):
v3 = load.f64 v0
v4 = fma v3, v1, v2
return v4
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vfmadd132sd %xmm0, %xmm1, 0(%rdi), %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vfmadd132sd (%rdi), %xmm1, %xmm0 ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq

function %vfmadd132ps(f32x4, i64, f32x4) -> f32x4 {
block0(v0: f32x4, v1: i64, v2: f32x4):
v3 = load.f32x4 v1
v4 = fma v0, v3, v2
return v4
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vfmadd132ps %xmm0, %xmm1, 0(%rdi), %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vfmadd132ps (%rdi), %xmm1, %xmm0 ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq

function %vfmadd132pd(i64, f64x2, f64x2) -> f64x2 {
block0(v0: i64, v1: f64x2, v2: f64x2):
v3 = load.f64x2 v0
v4 = fma v3, v1, v2
return v4
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vfmadd132pd %xmm0, %xmm1, 0(%rdi), %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vfmadd132pd (%rdi), %xmm1, %xmm0 ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq

function %vfnmadd213ss(f32, f32, f32) -> f32 {
block0(v0: f32, v1: f32, v2: f32):
v3 = fneg v0
v4 = fma v3, v1, v2
return v4
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vfnmadd213ss %xmm0, %xmm1, %xmm2, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vfnmadd213ss %xmm2, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

function %vfnmadd213sd(f64, f64, f64) -> f64 {
block0(v0: f64, v1: f64, v2: f64):
v3 = fneg v1
v4 = fma v0, v3, v2
return v4
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vfnmadd213sd %xmm0, %xmm1, %xmm2, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vfnmadd213sd %xmm2, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

function %vfnmadd213ps(f32x4, f32x4, i64) -> f32x4 {
block0(v0: f32x4, v1: f32x4, v2: i64):
v3 = fneg v0
v4 = load.f32x4 v2
v5 = fma v3, v1, v4
return v5
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vfnmadd213ps %xmm0, %xmm1, 0(%rdi), %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vfnmadd213ps (%rdi), %xmm1, %xmm0 ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq

function %vfnmadd213pd(f64x2, f64x2, i64) -> f64x2 {
block0(v0: f64x2, v1: f64x2, v2: i64):
v3 = fneg v1
v4 = load.f64x2 v2
v5 = fma v0, v3, v4
return v5
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vfnmadd213pd %xmm0, %xmm1, 0(%rdi), %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vfnmadd213pd (%rdi), %xmm1, %xmm0 ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq

function %vfnmadd132ss(f32, i64, f32) -> f32 {
block0(v0: f32, v1: i64, v2: f32):
v3 = fneg v0
v4 = load.f32 v1
v5 = fma v3, v4, v2
return v5
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vfnmadd132ss %xmm0, %xmm1, 0(%rdi), %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vfnmadd132ss (%rdi), %xmm1, %xmm0 ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq

function %vfnmadd132sd(i64, f64, f64) -> f64 {
block0(v0: i64, v1: f64, v2: f64):
v3 = fneg v1
v4 = load.f64 v0
v5 = fma v4, v3, v2
return v5
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vfnmadd132sd %xmm0, %xmm1, 0(%rdi), %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vfnmadd132sd (%rdi), %xmm1, %xmm0 ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq

function %vfnmadd132ps(i64, f32x4, f32x4) -> f32x4 {
block0(v0: i64, v1: f32x4, v2: f32x4):
v3 = load.f32x4 v0
v4 = fneg v3
v5 = fma v4, v1, v2
return v5
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vfnmadd132ps %xmm0, %xmm1, 0(%rdi), %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vfnmadd132ps (%rdi), %xmm1, %xmm0 ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq

function %vfnmadd132pd(f64x2, i64, f64x2) -> f64x2 {
block0(v0: f64x2, v1: i64, v2: f64x2):
v3 = load.f64x2 v1
v4 = fneg v3
v5 = fma v0, v4, v2
return v5
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vfnmadd132pd %xmm0, %xmm1, 0(%rdi), %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vfnmadd132pd (%rdi), %xmm1, %xmm0 ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq

@@ -1,6 +1,7 @@
test interpret
test run
target x86_64 has_avx has_fma
target x86_64 has_avx=false has_fma=false
target aarch64

function %fma_f32x4(f32x4, f32x4, f32x4) -> f32x4 {