x64: Improve memory support in {insert,extract}lane (#5982)
* x64: Improve memory support in `{insert,extract}lane`
This commit adds support to Cranelift for emitting `pextr{b,w,d,q}`
with a memory destination, merging a store-of-extract operation into
one instruction. Additionally, AVX support is added for the `pextr*`
instructions.

I've also tried to ensure that codegen tests and runtests exist for
all forms of these instructions.
* Add missing commas
* Fix tests
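As an illustrative sketch (the lane index, registers, and value numbers
below are arbitrary), a store of an extracted lane such as

    v2 = extractlane v0, 0
    store v2, v1

can now lower to a single memory-destination instruction like
`pextrd $0, %xmm0, (%rdi)` (or `vpextrd` when AVX is available), instead
of extracting into a general-purpose register and storing it with a
separate instruction.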
@@ -290,8 +290,18 @@
 ;; XMM (scalar or vector) unary op (from xmm to reg/mem) using the
 ;; VEX prefix
 (XmmMovRMVex (op AvxOpcode)
-             (src Reg)
+             (src Xmm)
              (dst SyntheticAmode))
+(XmmMovRMImmVex (op AvxOpcode)
+                (src Xmm)
+                (dst SyntheticAmode)
+                (imm u8))
+
+;; XMM (scalar) unary op (from xmm to integer reg): vpextr{w,b,d,q}
+(XmmToGprImmVex (op AvxOpcode)
+                (src Xmm)
+                (dst WritableGpr)
+                (imm u8))

 ;; XMM (scalar or vector) binary op that relies on the EVEX
 ;; prefix. Takes two inputs.
@@ -343,8 +353,12 @@
 ;; XMM (scalar or vector) unary op (from xmm to reg/mem): stores, movd,
 ;; movq
 (XmmMovRM (op SseOpcode)
-          (src Reg)
+          (src Xmm)
           (dst SyntheticAmode))
+(XmmMovRMImm (op SseOpcode)
+             (src Xmm)
+             (dst SyntheticAmode)
+             (imm u8))

 ;; XMM (scalar) unary op (from xmm to integer reg): movd, movq,
 ;; cvtts{s,d}2si
@@ -1364,6 +1378,10 @@
             Vmovups
             Vmovupd
             Vmovdqu
+            Vpextrb
+            Vpextrw
+            Vpextrd
+            Vpextrq
 ))

 (type Avx512Opcode extern
@@ -2043,10 +2061,18 @@
 (rule (xmm_movrm op addr data)
       (SideEffectNoResult.Inst (MInst.XmmMovRM op data addr)))

+(decl xmm_movrm_imm (SseOpcode SyntheticAmode Xmm u8) SideEffectNoResult)
+(rule (xmm_movrm_imm op addr data imm)
+      (SideEffectNoResult.Inst (MInst.XmmMovRMImm op data addr imm)))
+
 (decl xmm_movrm_vex (AvxOpcode SyntheticAmode Xmm) SideEffectNoResult)
 (rule (xmm_movrm_vex op addr data)
       (SideEffectNoResult.Inst (MInst.XmmMovRMVex op data addr)))

+(decl xmm_movrm_imm_vex (AvxOpcode SyntheticAmode Xmm u8) SideEffectNoResult)
+(rule (xmm_movrm_imm_vex op addr data imm)
+      (SideEffectNoResult.Inst (MInst.XmmMovRMImmVex op data addr imm)))
+
 ;; Load a constant into an XMM register.
 (decl x64_xmm_load_const (Type VCodeConstant) Xmm)
 (rule (x64_xmm_load_const ty const)
@@ -3603,21 +3629,61 @@
 (decl x64_pextrb (Xmm u8) Gpr)
 (rule (x64_pextrb src lane)
       (xmm_to_gpr_imm (SseOpcode.Pextrb) src lane))
+(rule 1 (x64_pextrb src lane)
+      (if-let $true (has_avx))
+      (xmm_to_gpr_imm_vex (AvxOpcode.Vpextrb) src lane))
+
+(decl x64_pextrb_store (SyntheticAmode Xmm u8) SideEffectNoResult)
+(rule (x64_pextrb_store addr src lane)
+      (xmm_movrm_imm (SseOpcode.Pextrb) addr src lane))
+(rule 1 (x64_pextrb_store addr src lane)
+      (if-let $true (has_avx))
+      (xmm_movrm_imm_vex (AvxOpcode.Vpextrb) addr src lane))

 ;; Helper for creating `pextrw` instructions.
 (decl x64_pextrw (Xmm u8) Gpr)
 (rule (x64_pextrw src lane)
       (xmm_to_gpr_imm (SseOpcode.Pextrw) src lane))
+(rule 1 (x64_pextrw src lane)
+      (if-let $true (has_avx))
+      (xmm_to_gpr_imm_vex (AvxOpcode.Vpextrw) src lane))
+
+(decl x64_pextrw_store (SyntheticAmode Xmm u8) SideEffectNoResult)
+(rule (x64_pextrw_store addr src lane)
+      (xmm_movrm_imm (SseOpcode.Pextrw) addr src lane))
+(rule 1 (x64_pextrw_store addr src lane)
+      (if-let $true (has_avx))
+      (xmm_movrm_imm_vex (AvxOpcode.Vpextrw) addr src lane))

 ;; Helper for creating `pextrd` instructions.
 (decl x64_pextrd (Xmm u8) Gpr)
 (rule (x64_pextrd src lane)
       (xmm_to_gpr_imm (SseOpcode.Pextrd) src lane))
+(rule 1 (x64_pextrd src lane)
+      (if-let $true (has_avx))
+      (xmm_to_gpr_imm_vex (AvxOpcode.Vpextrd) src lane))
+
+(decl x64_pextrd_store (SyntheticAmode Xmm u8) SideEffectNoResult)
+(rule (x64_pextrd_store addr src lane)
+      (xmm_movrm_imm (SseOpcode.Pextrd) addr src lane))
+(rule 1 (x64_pextrd_store addr src lane)
+      (if-let $true (has_avx))
+      (xmm_movrm_imm_vex (AvxOpcode.Vpextrd) addr src lane))

 ;; Helper for creating `pextrq` instructions.
 (decl x64_pextrq (Xmm u8) Gpr)
 (rule (x64_pextrq src lane)
       (xmm_to_gpr_imm (SseOpcode.Pextrq) src lane))
+(rule 1 (x64_pextrq src lane)
+      (if-let $true (has_avx))
+      (xmm_to_gpr_imm_vex (AvxOpcode.Vpextrq) src lane))
+
+(decl x64_pextrq_store (SyntheticAmode Xmm u8) SideEffectNoResult)
+(rule (x64_pextrq_store addr src lane)
+      (xmm_movrm_imm (SseOpcode.Pextrq) addr src lane))
+(rule 1 (x64_pextrq_store addr src lane)
+      (if-let $true (has_avx))
+      (xmm_movrm_imm_vex (AvxOpcode.Vpextrq) addr src lane))

 ;; Helper for creating `MInst.XmmToGpr` instructions.
 (decl xmm_to_gpr (SseOpcode Xmm OperandSize) Gpr)
@@ -3626,13 +3692,20 @@
         (_ Unit (emit (MInst.XmmToGpr op src dst size))))
     dst))

-;; Helper for creating `MInst.XmmToGpr` instructions.
+;; Helper for creating `MInst.XmmToGprImm` instructions.
 (decl xmm_to_gpr_imm (SseOpcode Xmm u8) Gpr)
 (rule (xmm_to_gpr_imm op src imm)
   (let ((dst WritableGpr (temp_writable_gpr))
         (_ Unit (emit (MInst.XmmToGprImm op src dst imm))))
     dst))

+;; Helper for creating `MInst.XmmToGprImmVex` instructions.
+(decl xmm_to_gpr_imm_vex (AvxOpcode Xmm u8) Gpr)
+(rule (xmm_to_gpr_imm_vex op src imm)
+  (let ((dst WritableGpr (temp_writable_gpr))
+        (_ Unit (emit (MInst.XmmToGprImmVex op src dst imm))))
+    dst))
+
 ;; Helper for creating `pmovmskb` instructions.
 (decl x64_pmovmskb (OperandSize Xmm) Gpr)
 (rule (x64_pmovmskb size src)
@@ -1699,7 +1699,11 @@ impl AvxOpcode {
            | AvxOpcode::Vmovsd
            | AvxOpcode::Vmovups
            | AvxOpcode::Vmovupd
-            | AvxOpcode::Vmovdqu => {
+            | AvxOpcode::Vmovdqu
+            | AvxOpcode::Vpextrb
+            | AvxOpcode::Vpextrw
+            | AvxOpcode::Vpextrd
+            | AvxOpcode::Vpextrq => {
                smallvec![InstructionSet::AVX]
            }
        }
@@ -2430,7 +2430,7 @@ pub(crate) fn emit(
        }

        Inst::XmmMovRMVex { op, src, dst } => {
-            let src = allocs.next(*src);
+            let src = allocs.next(src.to_reg());
            let dst = dst.with_allocs(allocs).finalize(state, sink);

            let (prefix, map, opcode) = match op {
@@ -2451,6 +2451,52 @@ pub(crate) fn emit(
                .encode(sink);
        }

+        Inst::XmmMovRMImmVex { op, src, dst, imm } => {
+            let src = allocs.next(src.to_reg());
+            let dst = dst.with_allocs(allocs).finalize(state, sink);
+
+            let (w, prefix, map, opcode) = match op {
+                AvxOpcode::Vpextrb => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x14),
+                AvxOpcode::Vpextrw => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x15),
+                AvxOpcode::Vpextrd => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x16),
+                AvxOpcode::Vpextrq => (true, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x16),
+                _ => unimplemented!("Opcode {:?} not implemented", op),
+            };
+            VexInstruction::new()
+                .length(VexVectorLength::V128)
+                .w(w)
+                .prefix(prefix)
+                .map(map)
+                .opcode(opcode)
+                .rm(dst)
+                .reg(src.to_real_reg().unwrap().hw_enc())
+                .imm(*imm)
+                .encode(sink);
+        }
+
+        Inst::XmmToGprImmVex { op, src, dst, imm } => {
+            let src = allocs.next(src.to_reg());
+            let dst = allocs.next(dst.to_reg().to_reg());
+
+            let (w, prefix, map, opcode) = match op {
+                AvxOpcode::Vpextrb => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x14),
+                AvxOpcode::Vpextrw => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x15),
+                AvxOpcode::Vpextrd => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x16),
+                AvxOpcode::Vpextrq => (true, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x16),
+                _ => unimplemented!("Opcode {:?} not implemented", op),
+            };
+            VexInstruction::new()
+                .length(VexVectorLength::V128)
+                .w(w)
+                .prefix(prefix)
+                .map(map)
+                .opcode(opcode)
+                .rm(dst.to_real_reg().unwrap().hw_enc())
+                .reg(src.to_real_reg().unwrap().hw_enc())
+                .imm(*imm)
+                .encode(sink);
+        }
+
        Inst::XmmRmREvex {
            op,
            src1,
@@ -2649,7 +2695,7 @@ pub(crate) fn emit(
        }

        Inst::XmmMovRM { op, src, dst } => {
-            let src = allocs.next(*src);
+            let src = allocs.next(src.to_reg());
            let dst = dst.with_allocs(allocs);

            let (prefix, opcode) = match op {
@@ -2666,6 +2712,27 @@ pub(crate) fn emit(
            emit_std_reg_mem(sink, prefix, opcode, 2, src, dst, RexFlags::clear_w(), 0);
        }

+        Inst::XmmMovRMImm { op, src, dst, imm } => {
+            let src = allocs.next(src.to_reg());
+            let dst = dst.with_allocs(allocs);
+
+            let (w, prefix, opcode) = match op {
+                SseOpcode::Pextrb => (false, LegacyPrefixes::_66, 0x0F3A14),
+                SseOpcode::Pextrw => (false, LegacyPrefixes::_66, 0x0F3A15),
+                SseOpcode::Pextrd => (false, LegacyPrefixes::_66, 0x0F3A16),
+                SseOpcode::Pextrq => (true, LegacyPrefixes::_66, 0x0F3A16),
+                _ => unimplemented!("Opcode {:?} not implemented", op),
+            };
+            let rex = if w {
+                RexFlags::set_w()
+            } else {
+                RexFlags::clear_w()
+            };
+            let dst = &dst.finalize(state, sink);
+            emit_std_reg_mem(sink, prefix, opcode, 3, src, dst, rex, 1);
+            sink.put1(*imm);
+        }
+
        Inst::XmmToGpr {
            op,
            src,
@@ -4871,7 +4871,7 @@ fn test_x64_emit() {
            imm: 2,
        },
        "C4430920EF02",
-        "vpinsrb $2 %xmm14, %r15, %xmm13",
+        "vpinsrb $2, %xmm14, %r15, %xmm13",
    ));

    // ========================================================
@@ -131,6 +131,7 @@ impl Inst {
            // These use dynamic SSE opcodes.
            Inst::GprToXmm { op, .. }
            | Inst::XmmMovRM { op, .. }
+            | Inst::XmmMovRMImm { op, .. }
            | Inst::XmmRmiReg { opcode: op, .. }
            | Inst::XmmRmR { op, .. }
            | Inst::XmmRmRUnaligned { op, .. }
@@ -153,7 +154,9 @@ impl Inst {
            | Inst::XmmVexPinsr { op, .. }
            | Inst::XmmUnaryRmRVex { op, .. }
            | Inst::XmmUnaryRmRImmVex { op, .. }
-            | Inst::XmmMovRMVex { op, .. } => op.available_from(),
+            | Inst::XmmMovRMVex { op, .. }
+            | Inst::XmmMovRMImmVex { op, .. }
+            | Inst::XmmToGprImmVex { op, .. } => op.available_from(),
        }
    }
}
@@ -331,7 +334,7 @@ impl Inst {
        debug_assert!(src.class() == RegClass::Float);
        Inst::XmmMovRM {
            op,
-            src,
+            src: Xmm::new(src).unwrap(),
            dst: dst.into(),
        }
    }
@@ -933,17 +936,33 @@ impl PrettyPrint for Inst {
            }

            Inst::XmmMovRM { op, src, dst, .. } => {
-                let src = pretty_print_reg(*src, 8, allocs);
+                let src = pretty_print_reg(src.to_reg(), 8, allocs);
                let dst = dst.pretty_print(8, allocs);
                format!("{} {}, {}", ljustify(op.to_string()), src, dst)
            }

            Inst::XmmMovRMVex { op, src, dst, .. } => {
-                let src = pretty_print_reg(*src, 8, allocs);
+                let src = pretty_print_reg(src.to_reg(), 8, allocs);
                let dst = dst.pretty_print(8, allocs);
                format!("{} {}, {}", ljustify(op.to_string()), src, dst)
            }

+            Inst::XmmMovRMImm {
+                op, src, dst, imm, ..
+            } => {
+                let src = pretty_print_reg(src.to_reg(), 8, allocs);
+                let dst = dst.pretty_print(8, allocs);
+                format!("{} ${imm}, {}, {}", ljustify(op.to_string()), src, dst)
+            }
+
+            Inst::XmmMovRMImmVex {
+                op, src, dst, imm, ..
+            } => {
+                let src = pretty_print_reg(src.to_reg(), 8, allocs);
+                let dst = dst.pretty_print(8, allocs);
+                format!("{} ${imm}, {}, {}", ljustify(op.to_string()), src, dst)
+            }
+
            Inst::XmmRmR {
                op,
                src1,
@@ -1023,7 +1042,7 @@ impl PrettyPrint for Inst {
                let src1 = pretty_print_reg(src1.to_reg(), 8, allocs);
                let src2 = src2.pretty_print(8, allocs);

-                format!("{} ${imm} {src1}, {src2}, {dst}", ljustify(op.to_string()))
+                format!("{} ${imm}, {src1}, {src2}, {dst}", ljustify(op.to_string()))
            }

            Inst::XmmVexPinsr {
@@ -1038,7 +1057,7 @@ impl PrettyPrint for Inst {
                let src1 = pretty_print_reg(src1.to_reg(), 8, allocs);
                let src2 = src2.pretty_print(8, allocs);

-                format!("{} ${imm} {src1}, {src2}, {dst}", ljustify(op.to_string()))
+                format!("{} ${imm}, {src1}, {src2}, {dst}", ljustify(op.to_string()))
            }

            Inst::XmmRmRVex3 {
@@ -1190,6 +1209,12 @@ impl PrettyPrint for Inst {
                format!("{} ${imm}, {}, {}", ljustify(op.to_string()), src, dst)
            }

+            Inst::XmmToGprImmVex { op, src, dst, imm } => {
+                let src = pretty_print_reg(src.to_reg(), 8, allocs);
+                let dst = pretty_print_reg(dst.to_reg().to_reg(), 8, allocs);
+                format!("{} ${imm}, {}, {}", ljustify(op.to_string()), src, dst)
+            }
+
            Inst::GprToXmm {
                op,
                src,
@@ -2033,8 +2058,11 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
            collector.reg_reuse_def(dst.to_writable_reg(), 0); // Reuse RHS.
            src2.get_operands(collector);
        }
-        Inst::XmmMovRM { src, dst, .. } | Inst::XmmMovRMVex { src, dst, .. } => {
-            collector.reg_use(*src);
+        Inst::XmmMovRM { src, dst, .. }
+        | Inst::XmmMovRMVex { src, dst, .. }
+        | Inst::XmmMovRMImm { src, dst, .. }
+        | Inst::XmmMovRMImmVex { src, dst, .. } => {
+            collector.reg_use(src.to_reg());
            dst.get_operands(collector);
        }
        Inst::XmmCmpRmR { src, dst, .. } => {
@@ -2058,7 +2086,9 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
            collector.reg_use(src.to_reg());
            collector.reg_fixed_nonallocatable(*dst);
        }
-        Inst::XmmToGpr { src, dst, .. } | Inst::XmmToGprImm { src, dst, .. } => {
+        Inst::XmmToGpr { src, dst, .. }
+        | Inst::XmmToGprImm { src, dst, .. }
+        | Inst::XmmToGprImmVex { src, dst, .. } => {
            collector.reg_use(src.to_reg());
            collector.reg_def(dst.to_writable_reg());
        }
@@ -2659,17 +2659,41 @@
 ;; f32 or f64 despite the source perhaps being an integer vector since the
 ;; result of the instruction is the same.
 (rule 2 (lower (store flags
-                      (has_type (ty_32 _) (extractlane value (u8_from_uimm8 0)))
+                      (has_type $F32 (extractlane value (u8_from_uimm8 0)))
                       address
                       offset))
       (side_effect
         (x64_movss_store (to_amode flags address offset) value)))
-(rule 3 (lower (store flags
-                      (has_type (ty_64 _) (extractlane value (u8_from_uimm8 0)))
+(rule 2 (lower (store flags
+                      (has_type $F64 (extractlane value (u8_from_uimm8 0)))
                       address
                       offset))
       (side_effect
         (x64_movsd_store (to_amode flags address offset) value)))
+(rule 2 (lower (store flags
+                      (has_type $I8 (extractlane value (u8_from_uimm8 n)))
+                      address
+                      offset))
+      (side_effect
+        (x64_pextrb_store (to_amode flags address offset) value n)))
+(rule 2 (lower (store flags
+                      (has_type $I16 (extractlane value (u8_from_uimm8 n)))
+                      address
+                      offset))
+      (side_effect
+        (x64_pextrw_store (to_amode flags address offset) value n)))
+(rule 2 (lower (store flags
+                      (has_type $I32 (extractlane value (u8_from_uimm8 n)))
+                      address
+                      offset))
+      (side_effect
+        (x64_pextrd_store (to_amode flags address offset) value n)))
+(rule 2 (lower (store flags
+                      (has_type $I64 (extractlane value (u8_from_uimm8 n)))
+                      address
+                      offset))
+      (side_effect
+        (x64_pextrq_store (to_amode flags address offset) value n)))

 ;; Rules for `load*` + ALU op + `store*` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

cranelift/filetests/filetests/isa/x64/extractlane-avx.clif (new file, 309 lines)
@@ -0,0 +1,309 @@
|
|||||||
|
test compile precise-output
|
||||||
|
target x86_64 has_avx
|
||||||
|
|
||||||
|
function %f1(i8x16) -> i8 {
|
||||||
|
block0(v0: i8x16):
|
||||||
|
v1 = extractlane v0, 1
|
||||||
|
return v1
|
||||||
|
}
|
||||||
|
|
||||||
|
; VCode:
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; vpextrb $1, %xmm0, %rax
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
;
|
||||||
|
; Disassembled:
|
||||||
|
; block0: ; offset 0x0
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block1: ; offset 0x4
|
||||||
|
; vpextrb $1, %xmm0, %eax
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; retq
|
||||||
|
|
||||||
|
function %f2(i16x8) -> i16 {
|
||||||
|
block0(v0: i16x8):
|
||||||
|
v1 = extractlane v0, 1
|
||||||
|
return v1
|
||||||
|
}
|
||||||
|
|
||||||
|
; VCode:
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; vpextrw $1, %xmm0, %rax
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
;
|
||||||
|
; Disassembled:
|
||||||
|
; block0: ; offset 0x0
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block1: ; offset 0x4
|
||||||
|
; vpextrw $1, %xmm0, %eax
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; retq
|
||||||
|
|
||||||
|
function %f3(i32x4) -> i32 {
|
||||||
|
block0(v0: i32x4):
|
||||||
|
v1 = extractlane v0, 1
|
||||||
|
return v1
|
||||||
|
}
|
||||||
|
|
||||||
|
; VCode:
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; vpextrd $1, %xmm0, %rax
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
;
|
||||||
|
; Disassembled:
|
||||||
|
; block0: ; offset 0x0
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block1: ; offset 0x4
|
||||||
|
; vpextrd $1, %xmm0, %eax
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; retq
|
||||||
|
|
||||||
|
function %f4(i64x2) -> i64 {
|
||||||
|
block0(v0: i64x2):
|
||||||
|
v1 = extractlane v0, 1
|
||||||
|
return v1
|
||||||
|
}
|
||||||
|
|
||||||
|
; VCode:
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; vpextrq $1, %xmm0, %rax
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
;
|
||||||
|
; Disassembled:
|
||||||
|
; block0: ; offset 0x0
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block1: ; offset 0x4
|
||||||
|
; vpextrq $1, %xmm0, %rax
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; retq
|
||||||
|
|
||||||
|
function %f5(f32x4) -> f32 {
|
||||||
|
block0(v0: f32x4):
|
||||||
|
v1 = extractlane v0, 1
|
||||||
|
return v1
|
||||||
|
}
|
||||||
|
|
||||||
|
; VCode:
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; vpshufd $1, %xmm0, %xmm0
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
;
|
||||||
|
; Disassembled:
|
||||||
|
; block0: ; offset 0x0
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block1: ; offset 0x4
|
||||||
|
; vpshufd $1, %xmm0, %xmm0
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; retq
|
||||||
|
|
||||||
|
function %f6(f64x2) -> f64 {
|
||||||
|
block0(v0: f64x2):
|
||||||
|
v1 = extractlane v0, 1
|
||||||
|
return v1
|
||||||
|
}
|
||||||
|
|
||||||
|
; VCode:
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; vpshufd $238, %xmm0, %xmm0
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
;
|
||||||
|
; Disassembled:
|
||||||
|
; block0: ; offset 0x0
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block1: ; offset 0x4
|
||||||
|
; vpshufd $0xee, %xmm0, %xmm0
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; retq
|
||||||
|
|
||||||
|
function %extract_i8x16_lane0_to_memory(i8x16, i64) {
|
||||||
|
block0(v0: i8x16, v1: i64):
|
||||||
|
v2 = extractlane v0, 0
|
||||||
|
store v2, v1
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
; VCode:
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; vpextrb $0, %xmm0, 0(%rdi)
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
;
|
||||||
|
; Disassembled:
|
||||||
|
; block0: ; offset 0x0
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block1: ; offset 0x4
|
||||||
|
; vpextrb $0, %xmm0, (%rdi) ; trap: heap_oob
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; retq
|
||||||
|
|
||||||
|
function %extract_i16x8_lane0_to_memory(i16x8, i64) {
|
||||||
|
block0(v0: i16x8, v1: i64):
|
||||||
|
v2 = extractlane v0, 0
|
||||||
|
store v2, v1
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
; VCode:
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; vpextrw $0, %xmm0, 0(%rdi)
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
;
|
||||||
|
; Disassembled:
|
||||||
|
; block0: ; offset 0x0
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block1: ; offset 0x4
|
||||||
|
; vpextrw $0, %xmm0, (%rdi) ; trap: heap_oob
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; retq
|
||||||
|
|
||||||
|
function %extract_i32x4_lane0_to_memory(i32x4, i64) {
|
||||||
|
block0(v0: i32x4, v1: i64):
|
||||||
|
v2 = extractlane v0, 0
|
||||||
|
store v2, v1
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
; VCode:
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; vpextrd $0, %xmm0, 0(%rdi)
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
;
|
||||||
|
; Disassembled:
|
||||||
|
; block0: ; offset 0x0
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block1: ; offset 0x4
|
||||||
|
; vpextrd $0, %xmm0, (%rdi) ; trap: heap_oob
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; retq
|
||||||
|
|
||||||
|
function %extract_f32x4_lane0_to_memory(f32x4, i64) {
|
||||||
|
block0(v0: f32x4, v1: i64):
|
||||||
|
v2 = extractlane v0, 0
|
||||||
|
store v2, v1
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
; VCode:
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; vmovss %xmm0, 0(%rdi)
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
;
|
||||||
|
; Disassembled:
|
||||||
|
; block0: ; offset 0x0
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block1: ; offset 0x4
|
||||||
|
; vmovss %xmm0, (%rdi) ; trap: heap_oob
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; retq
|
||||||
|
|
||||||
|
function %extract_i64x2_lane0_to_memory(i64x2, i64) {
|
||||||
|
block0(v0: i64x2, v1: i64):
|
||||||
|
v2 = extractlane v0, 0
|
||||||
|
store v2, v1
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
; VCode:
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; vpextrq $0, %xmm0, 0(%rdi)
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
;
|
||||||
|
; Disassembled:
|
||||||
|
; block0: ; offset 0x0
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block1: ; offset 0x4
|
||||||
|
; vpextrq $0, %xmm0, (%rdi) ; trap: heap_oob
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; retq
|
||||||
|
|
||||||
|
function %extract_f64x2_lane0_to_memory(f64x2, i64) {
|
||||||
|
block0(v0: f64x2, v1: i64):
|
||||||
|
v2 = extractlane v0, 0
|
||||||
|
store v2, v1
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
; VCode:
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; vmovsd %xmm0, 0(%rdi)
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
;
|
||||||
|
; Disassembled:
|
||||||
|
; block0: ; offset 0x0
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block1: ; offset 0x4
|
||||||
|
; vmovsd %xmm0, (%rdi) ; trap: heap_oob
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; retq
|
||||||
|
|
||||||
@@ -151,6 +151,58 @@ block0(v0: f64x2):
|
|||||||
; popq %rbp
|
; popq %rbp
|
||||||
; retq
|
; retq
|
||||||
|
|
||||||
|
function %extract_i8x16_lane0_to_memory(i8x16, i64) {
|
||||||
|
block0(v0: i8x16, v1: i64):
|
||||||
|
v2 = extractlane v0, 0
|
||||||
|
store v2, v1
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
; VCode:
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; pextrb $0, %xmm0, 0(%rdi)
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
;
|
||||||
|
; Disassembled:
|
||||||
|
; block0: ; offset 0x0
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block1: ; offset 0x4
|
||||||
|
; pextrb $0, %xmm0, (%rdi) ; trap: heap_oob
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; retq
|
||||||
|
|
||||||
|
function %extract_i16x8_lane0_to_memory(i16x8, i64) {
|
||||||
|
block0(v0: i16x8, v1: i64):
|
||||||
|
v2 = extractlane v0, 0
|
||||||
|
store v2, v1
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
; VCode:
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; pextrw $0, %xmm0, 0(%rdi)
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
;
|
||||||
|
; Disassembled:
|
||||||
|
; block0: ; offset 0x0
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block1: ; offset 0x4
|
||||||
|
; pextrw $0, %xmm0, (%rdi) ; trap: heap_oob
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; retq
|
||||||
|
|
||||||
function %extract_i32x4_lane0_to_memory(i32x4, i64) {
|
function %extract_i32x4_lane0_to_memory(i32x4, i64) {
|
||||||
block0(v0: i32x4, v1: i64):
|
block0(v0: i32x4, v1: i64):
|
||||||
v2 = extractlane v0, 0
|
v2 = extractlane v0, 0
|
||||||
@@ -162,7 +214,7 @@ block0(v0: i32x4, v1: i64):
|
|||||||
; pushq %rbp
|
; pushq %rbp
|
||||||
; movq %rsp, %rbp
|
; movq %rsp, %rbp
|
||||||
; block0:
|
; block0:
|
||||||
; movss %xmm0, 0(%rdi)
|
; pextrd $0, %xmm0, 0(%rdi)
|
||||||
; movq %rbp, %rsp
|
; movq %rbp, %rsp
|
||||||
; popq %rbp
|
; popq %rbp
|
||||||
; ret
|
; ret
|
||||||
@@ -172,7 +224,7 @@ block0(v0: i32x4, v1: i64):
|
|||||||
; pushq %rbp
|
; pushq %rbp
|
||||||
; movq %rsp, %rbp
|
; movq %rsp, %rbp
|
||||||
; block1: ; offset 0x4
|
; block1: ; offset 0x4
|
||||||
; movss %xmm0, (%rdi) ; trap: heap_oob
|
; pextrd $0, %xmm0, (%rdi) ; trap: heap_oob
|
||||||
; movq %rbp, %rsp
|
; movq %rbp, %rsp
|
||||||
; popq %rbp
|
; popq %rbp
|
||||||
; retq
|
; retq
|
||||||
@@ -214,7 +266,7 @@ block0(v0: i64x2, v1: i64):
|
|||||||
; pushq %rbp
|
; pushq %rbp
|
||||||
; movq %rsp, %rbp
|
; movq %rsp, %rbp
|
||||||
; block0:
|
; block0:
|
||||||
; movsd %xmm0, 0(%rdi)
|
; pextrq $0, %xmm0, 0(%rdi)
|
||||||
; movq %rbp, %rsp
|
; movq %rbp, %rsp
|
||||||
; popq %rbp
|
; popq %rbp
|
||||||
; ret
|
; ret
|
||||||
@@ -224,7 +276,7 @@ block0(v0: i64x2, v1: i64):
|
|||||||
; pushq %rbp
|
; pushq %rbp
|
||||||
; movq %rsp, %rbp
|
; movq %rsp, %rbp
|
||||||
; block1: ; offset 0x4
|
; block1: ; offset 0x4
|
||||||
; movsd %xmm0, (%rdi) ; trap: heap_oob
|
; pextrq $0, %xmm0, (%rdi) ; trap: heap_oob
|
||||||
; movq %rbp, %rsp
|
; movq %rbp, %rsp
|
||||||
; popq %rbp
|
; popq %rbp
|
||||||
; retq
|
; retq
|
||||||
|
|||||||
@@ -526,7 +526,7 @@ block0(v0: f32x4):
|
|||||||
; pushq %rbp
|
; pushq %rbp
|
||||||
; movq %rsp, %rbp
|
; movq %rsp, %rbp
|
||||||
; block0:
|
; block0:
|
||||||
; vcmpps $0 %xmm0, %xmm0, %xmm2
|
; vcmpps $0, %xmm0, %xmm0, %xmm2
|
||||||
; vandps %xmm0, %xmm2, %xmm4
|
; vandps %xmm0, %xmm2, %xmm4
|
||||||
; vpxor %xmm2, %xmm4, %xmm6
|
; vpxor %xmm2, %xmm4, %xmm6
|
||||||
; vcvttps2dq %xmm4, %xmm8
|
; vcvttps2dq %xmm4, %xmm8
|
||||||
@@ -565,7 +565,7 @@ block0(v0: f64x2):
|
|||||||
; pushq %rbp
|
; pushq %rbp
|
||||||
; movq %rsp, %rbp
|
; movq %rsp, %rbp
|
||||||
; block0:
|
; block0:
|
||||||
; vcmppd $0 %xmm0, %xmm0, %xmm2
|
; vcmppd $0, %xmm0, %xmm0, %xmm2
|
||||||
; vandps %xmm2, const(0), %xmm4
|
; vandps %xmm2, const(0), %xmm4
|
||||||
; vminpd %xmm0, %xmm4, %xmm6
|
; vminpd %xmm0, %xmm4, %xmm6
|
||||||
; vcvttpd2dq %xmm6, %xmm0
|
; vcvttpd2dq %xmm6, %xmm0
|
||||||
cranelift/filetests/filetests/isa/x64/insertlane-avx.clif (new file, 190 lines)
@@ -0,0 +1,190 @@
|
|||||||
|
test compile precise-output
|
||||||
|
set enable_simd
|
||||||
|
target x86_64 has_avx
|
||||||
|
|
||||||
|
function %insertlane_f64x2_zero(f64x2, f64) -> f64x2 {
|
||||||
|
block0(v0: f64x2, v1: f64):
|
||||||
|
v2 = insertlane v0, v1, 0
|
||||||
|
return v2
|
||||||
|
}
|
||||||
|
|
||||||
|
; VCode:
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; vmovsd %xmm0, %xmm1, %xmm0
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
;
|
||||||
|
; Disassembled:
|
||||||
|
; block0: ; offset 0x0
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block1: ; offset 0x4
|
||||||
|
; vmovsd %xmm1, %xmm0, %xmm0
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; retq
|
||||||
|
|
||||||
|
function %insertlane_f64x2_one(f64x2, f64) -> f64x2 {
|
||||||
|
block0(v0: f64x2, v1: f64):
|
||||||
|
v2 = insertlane v0, v1, 1
|
||||||
|
return v2
|
||||||
|
}
|
||||||
|
|
||||||
|
; VCode:
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; vmovlhps %xmm0, %xmm1, %xmm0
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
;
|
||||||
|
; Disassembled:
|
||||||
|
; block0: ; offset 0x0
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block1: ; offset 0x4
|
||||||
|
; vmovlhps %xmm1, %xmm0, %xmm0
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; retq
|
||||||
|
|
||||||
|
function %insertlane_f64x2_zero_with_load(f64x2, i64) -> f64x2 {
|
||||||
|
block0(v0: f64x2, v1: i64):
|
||||||
|
v2 = load.f64 v1
|
||||||
|
v3 = insertlane v0, v2, 0
|
||||||
|
return v3
|
||||||
|
}
|
||||||
|
|
||||||
|
; VCode:
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; vmovsd 0(%rdi), %xmm3
|
||||||
|
; vmovsd %xmm0, %xmm3, %xmm0
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
;
|
||||||
|
; Disassembled:
|
||||||
|
; block0: ; offset 0x0
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block1: ; offset 0x4
|
||||||
|
; vmovsd (%rdi), %xmm3 ; trap: heap_oob
|
||||||
|
; vmovsd %xmm3, %xmm0, %xmm0
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; retq
|
||||||
|
|
||||||
|
function %insertlane_i8x16_one_load(i8x16, i64) -> i8x16 {
|
||||||
|
block0(v0: i8x16, v1: i64):
|
||||||
|
v2 = load.i8 v1
|
||||||
|
v3 = insertlane v0, v2, 1
|
||||||
|
return v3
|
||||||
|
}
|
||||||
|
|
||||||
|
; VCode:
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; movzbq 0(%rdi), %rdx
|
||||||
|
; vpinsrb $1, %xmm0, %rdx, %xmm0
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
;
|
||||||
|
; Disassembled:
|
||||||
|
; block0: ; offset 0x0
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block1: ; offset 0x4
|
||||||
|
; movzbq (%rdi), %rdx ; trap: heap_oob
|
||||||
|
; vpinsrb $1, %edx, %xmm0, %xmm0
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; retq
|
||||||
|
|
||||||
|
function %insertlane_i16x8_one_load(i16x8, i64) -> i16x8 {
|
||||||
|
block0(v0: i16x8, v1: i64):
|
||||||
|
v2 = load.i16 v1
|
||||||
|
v3 = insertlane v0, v2, 1
|
||||||
|
return v3
|
||||||
|
}
|
||||||
|
|
||||||
|
; VCode:
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; movzwq 0(%rdi), %rdx
|
||||||
|
; vpinsrw $1, %xmm0, %rdx, %xmm0
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
;
|
||||||
|
; Disassembled:
|
||||||
|
; block0: ; offset 0x0
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block1: ; offset 0x4
|
||||||
|
; movzwq (%rdi), %rdx ; trap: heap_oob
|
||||||
|
; vpinsrw $1, %edx, %xmm0, %xmm0
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; retq
|
||||||
|
|
||||||
|
function %insertlane_i32x4_one_load(i32x4, i64) -> i32x4 {
|
||||||
|
block0(v0: i32x4, v1: i64):
|
||||||
|
v2 = load.i32 v1
|
||||||
|
v3 = insertlane v0, v2, 1
|
||||||
|
return v3
|
||||||
|
}
|
||||||
|
|
||||||
|
; VCode:
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; vpinsrd $1, %xmm0, 0(%rdi), %xmm0
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
;
|
||||||
|
; Disassembled:
|
||||||
|
; block0: ; offset 0x0
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block1: ; offset 0x4
|
||||||
|
; vpinsrd $1, (%rdi), %xmm0, %xmm0 ; trap: heap_oob
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; retq
|
||||||
|
|
||||||
|
function %insertlane_i64x2_one_load(i64x2, i64) -> i64x2 {
|
||||||
|
block0(v0: i64x2, v1: i64):
|
||||||
|
v2 = load.i64 v1
|
||||||
|
v3 = insertlane v0, v2, 1
|
||||||
|
return v3
|
||||||
|
}
|
||||||
|
|
||||||
|
; VCode:
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; vpinsrq $1, %xmm0, 0(%rdi), %xmm0
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
;
|
||||||
|
; Disassembled:
|
||||||
|
; block0: ; offset 0x0
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block1: ; offset 0x4
|
||||||
|
; vpinsrq $1, (%rdi), %xmm0, %xmm0 ; trap: heap_oob
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; retq
|
||||||
|
|
||||||
@@ -1,6 +1,6 @@
|
|||||||
test compile precise-output
|
test compile precise-output
|
||||||
set enable_simd
|
set enable_simd
|
||||||
target x86_64 has_avx
|
target x86_64
|
||||||
|
|
||||||
function %insertlane_f64x2_zero(f64x2, f64) -> f64x2 {
|
function %insertlane_f64x2_zero(f64x2, f64) -> f64x2 {
|
||||||
block0(v0: f64x2, v1: f64):
|
block0(v0: f64x2, v1: f64):
|
||||||
@@ -12,7 +12,7 @@ block0(v0: f64x2, v1: f64):
|
|||||||
; pushq %rbp
|
; pushq %rbp
|
||||||
; movq %rsp, %rbp
|
; movq %rsp, %rbp
|
||||||
; block0:
|
; block0:
|
||||||
; vmovsd %xmm0, %xmm1, %xmm0
|
; movsd %xmm0, %xmm1, %xmm0
|
||||||
; movq %rbp, %rsp
|
; movq %rbp, %rsp
|
||||||
; popq %rbp
|
; popq %rbp
|
||||||
; ret
|
; ret
|
||||||
@@ -22,7 +22,7 @@ block0(v0: f64x2, v1: f64):
|
|||||||
; pushq %rbp
|
; pushq %rbp
|
||||||
; movq %rsp, %rbp
|
; movq %rsp, %rbp
|
||||||
; block1: ; offset 0x4
|
; block1: ; offset 0x4
|
||||||
; vmovsd %xmm1, %xmm0, %xmm0
|
; movsd %xmm1, %xmm0
|
||||||
; movq %rbp, %rsp
|
; movq %rbp, %rsp
|
||||||
; popq %rbp
|
; popq %rbp
|
||||||
; retq
|
; retq
|
||||||
@@ -37,7 +37,7 @@ block0(v0: f64x2, v1: f64):
|
|||||||
; pushq %rbp
|
; pushq %rbp
|
||||||
; movq %rsp, %rbp
|
; movq %rsp, %rbp
|
||||||
; block0:
|
; block0:
|
||||||
; vmovlhps %xmm0, %xmm1, %xmm0
|
; movlhps %xmm0, %xmm1, %xmm0
|
||||||
; movq %rbp, %rsp
|
; movq %rbp, %rsp
|
||||||
; popq %rbp
|
; popq %rbp
|
||||||
; ret
|
; ret
|
||||||
@@ -47,7 +47,7 @@ block0(v0: f64x2, v1: f64):
|
|||||||
; pushq %rbp
|
; pushq %rbp
|
||||||
; movq %rsp, %rbp
|
; movq %rsp, %rbp
|
||||||
; block1: ; offset 0x4
|
; block1: ; offset 0x4
|
||||||
; vmovlhps %xmm1, %xmm0, %xmm0
|
; movlhps %xmm1, %xmm0
|
||||||
; movq %rbp, %rsp
|
; movq %rbp, %rsp
|
||||||
; popq %rbp
|
; popq %rbp
|
||||||
; retq
|
; retq
|
||||||
@@ -63,8 +63,8 @@ block0(v0: f64x2, v1: i64):
|
|||||||
; pushq %rbp
|
; pushq %rbp
|
||||||
; movq %rsp, %rbp
|
; movq %rsp, %rbp
|
||||||
; block0:
|
; block0:
|
||||||
; vmovsd 0(%rdi), %xmm3
|
; movsd 0(%rdi), %xmm3
|
||||||
; vmovsd %xmm0, %xmm3, %xmm0
|
; movsd %xmm0, %xmm3, %xmm0
|
||||||
; movq %rbp, %rsp
|
; movq %rbp, %rsp
|
||||||
; popq %rbp
|
; popq %rbp
|
||||||
; ret
|
; ret
|
||||||
@@ -74,8 +74,116 @@ block0(v0: f64x2, v1: i64):
|
|||||||
; pushq %rbp
|
; pushq %rbp
|
||||||
; movq %rsp, %rbp
|
; movq %rsp, %rbp
|
||||||
; block1: ; offset 0x4
|
; block1: ; offset 0x4
|
||||||
; vmovsd (%rdi), %xmm3 ; trap: heap_oob
|
; movsd (%rdi), %xmm3 ; trap: heap_oob
|
||||||
; vmovsd %xmm3, %xmm0, %xmm0
|
; movsd %xmm3, %xmm0
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; retq
|
||||||
|
|
||||||
|
function %insertlane_i8x16_one_load(i8x16, i64) -> i8x16 {
|
||||||
|
block0(v0: i8x16, v1: i64):
|
||||||
|
v2 = load.i8 v1
|
||||||
|
v3 = insertlane v0, v2, 1
|
||||||
|
return v3
|
||||||
|
}
|
||||||
|
|
||||||
|
; VCode:
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; movzbq 0(%rdi), %rdx
|
||||||
|
; pinsrb $1, %xmm0, %rdx, %xmm0
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
;
|
||||||
|
; Disassembled:
|
||||||
|
; block0: ; offset 0x0
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block1: ; offset 0x4
|
||||||
|
; movzbq (%rdi), %rdx ; trap: heap_oob
|
||||||
|
; pinsrb $1, %edx, %xmm0
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; retq
|
||||||
|
|
||||||
|
function %insertlane_i16x8_one_load(i16x8, i64) -> i16x8 {
|
||||||
|
block0(v0: i16x8, v1: i64):
|
||||||
|
v2 = load.i16 v1
|
||||||
|
v3 = insertlane v0, v2, 1
|
||||||
|
return v3
|
||||||
|
}
|
||||||
|
|
||||||
|
; VCode:
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; movzwq 0(%rdi), %rdx
|
||||||
|
; pinsrw $1, %xmm0, %rdx, %xmm0
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
;
|
||||||
|
; Disassembled:
|
||||||
|
; block0: ; offset 0x0
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block1: ; offset 0x4
|
||||||
|
; movzwq (%rdi), %rdx ; trap: heap_oob
|
||||||
|
; pinsrw $1, %edx, %xmm0
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; retq
|
||||||
|
|
||||||
|
function %insertlane_i32x4_one_load(i32x4, i64) -> i32x4 {
|
||||||
|
block0(v0: i32x4, v1: i64):
|
||||||
|
v2 = load.i32 v1
|
||||||
|
v3 = insertlane v0, v2, 1
|
||||||
|
return v3
|
||||||
|
}
|
||||||
|
|
||||||
|
; VCode:
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; pinsrd $1, %xmm0, 0(%rdi), %xmm0
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
;
|
||||||
|
; Disassembled:
|
||||||
|
; block0: ; offset 0x0
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block1: ; offset 0x4
|
||||||
|
; pinsrd $1, (%rdi), %xmm0 ; trap: heap_oob
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; retq
|
||||||
|
|
||||||
|
function %insertlane_i64x2_one_load(i64x2, i64) -> i64x2 {
|
||||||
|
block0(v0: i64x2, v1: i64):
|
||||||
|
v2 = load.i64 v1
|
||||||
|
v3 = insertlane v0, v2, 1
|
||||||
|
return v3
|
||||||
|
}
|
||||||
|
|
||||||
|
; VCode:
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; pinsrd.w $1, %xmm0, 0(%rdi), %xmm0
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
;
|
||||||
|
; Disassembled:
|
||||||
|
; block0: ; offset 0x0
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block1: ; offset 0x4
|
||||||
|
; pinsrq $1, (%rdi), %xmm0 ; trap: heap_oob
|
||||||
; movq %rbp, %rsp
|
; movq %rbp, %rsp
|
||||||
; popq %rbp
|
; popq %rbp
|
||||||
; retq
|
; retq
|
||||||
|
|||||||
@@ -1204,7 +1204,7 @@ block0(v0: i8x16):
|
|||||||
; pushq %rbp
|
; pushq %rbp
|
||||||
; movq %rsp, %rbp
|
; movq %rsp, %rbp
|
||||||
; block0:
|
; block0:
|
||||||
; vpalignr $8 %xmm0, %xmm0, %xmm2
|
; vpalignr $8, %xmm0, %xmm0, %xmm2
|
||||||
; vpmovzxbw %xmm2, %xmm0
|
; vpmovzxbw %xmm2, %xmm0
|
||||||
; movq %rbp, %rsp
|
; movq %rbp, %rsp
|
||||||
; popq %rbp
|
; popq %rbp
|
||||||
@@ -1316,7 +1316,7 @@ block0(v0: i8):
|
|||||||
; movq %rsp, %rbp
|
; movq %rsp, %rbp
|
||||||
; block0:
|
; block0:
|
||||||
; uninit %xmm2
|
; uninit %xmm2
|
||||||
; vpinsrb $0 %xmm2, %rdi, %xmm4
|
; vpinsrb $0, %xmm2, %rdi, %xmm4
|
||||||
; uninit %xmm6
|
; uninit %xmm6
|
||||||
; vpxor %xmm6, %xmm6, %xmm8
|
; vpxor %xmm6, %xmm6, %xmm8
|
||||||
; vpshufb %xmm4, %xmm8, %xmm0
|
; vpshufb %xmm4, %xmm8, %xmm0
|
||||||
@@ -1354,7 +1354,7 @@ block0(v0: f64x2):
|
|||||||
; vminpd %xmm6, const(0), %xmm8
|
; vminpd %xmm6, const(0), %xmm8
|
||||||
; vroundpd $3, %xmm8, %xmm10
|
; vroundpd $3, %xmm8, %xmm10
|
||||||
; vaddpd %xmm10, const(1), %xmm12
|
; vaddpd %xmm10, const(1), %xmm12
|
||||||
; vshufps $136 %xmm12, %xmm4, %xmm0
|
; vshufps $136, %xmm12, %xmm4, %xmm0
|
||||||
; movq %rbp, %rsp
|
; movq %rbp, %rsp
|
||||||
; popq %rbp
|
; popq %rbp
|
||||||
; ret
|
; ret
|
||||||
|
|||||||
@@ -529,7 +529,7 @@ block0(v0: f32x4, v1: f32):
|
|||||||
; pushq %rbp
|
; pushq %rbp
|
||||||
; movq %rsp, %rbp
|
; movq %rsp, %rbp
|
||||||
; block0:
|
; block0:
|
||||||
; vinsertps $16 %xmm0, %xmm1, %xmm0
|
; vinsertps $16, %xmm0, %xmm1, %xmm0
|
||||||
; movq %rbp, %rsp
|
; movq %rbp, %rsp
|
||||||
; popq %rbp
|
; popq %rbp
|
||||||
; ret
|
; ret
|
||||||
@@ -579,7 +579,7 @@ block0(v0: i8x16, v1: i8):
|
|||||||
; pushq %rbp
|
; pushq %rbp
|
||||||
; movq %rsp, %rbp
|
; movq %rsp, %rbp
|
||||||
; block0:
|
; block0:
|
||||||
; vpinsrb $1 %xmm0, %rdi, %xmm0
|
; vpinsrb $1, %xmm0, %rdi, %xmm0
|
||||||
; movq %rbp, %rsp
|
; movq %rbp, %rsp
|
||||||
; popq %rbp
|
; popq %rbp
|
||||||
; ret
|
; ret
|
||||||
@@ -604,7 +604,7 @@ block0(v0: i16x8, v1: i16):
|
|||||||
; pushq %rbp
|
; pushq %rbp
|
||||||
; movq %rsp, %rbp
|
; movq %rsp, %rbp
|
||||||
; block0:
|
; block0:
|
||||||
; vpinsrw $1 %xmm0, %rdi, %xmm0
|
; vpinsrw $1, %xmm0, %rdi, %xmm0
|
||||||
; movq %rbp, %rsp
|
; movq %rbp, %rsp
|
||||||
; popq %rbp
|
; popq %rbp
|
||||||
; ret
|
; ret
|
||||||
@@ -629,7 +629,7 @@ block0(v0: i32x4, v1: i32):
|
|||||||
; pushq %rbp
|
; pushq %rbp
|
||||||
; movq %rsp, %rbp
|
; movq %rsp, %rbp
|
||||||
; block0:
|
; block0:
|
||||||
; vpinsrd $1 %xmm0, %rdi, %xmm0
|
; vpinsrd $1, %xmm0, %rdi, %xmm0
|
||||||
; movq %rbp, %rsp
|
; movq %rbp, %rsp
|
||||||
; popq %rbp
|
; popq %rbp
|
||||||
; ret
|
; ret
|
||||||
@@ -654,7 +654,7 @@ block0(v0: i64x2, v1: i64):
|
|||||||
; pushq %rbp
|
; pushq %rbp
|
||||||
; movq %rsp, %rbp
|
; movq %rsp, %rbp
|
||||||
; block0:
|
; block0:
|
||||||
; vpinsrq $1 %xmm0, %rdi, %xmm0
|
; vpinsrq $1, %xmm0, %rdi, %xmm0
|
||||||
; movq %rbp, %rsp
|
; movq %rbp, %rsp
|
||||||
; popq %rbp
|
; popq %rbp
|
||||||
; ret
|
; ret
|
||||||
|
|||||||
@@ -215,7 +215,7 @@ block0(v0: f32x4, v1: f32x4):
|
|||||||
; vminps %xmm0, %xmm1, %xmm3
|
; vminps %xmm0, %xmm1, %xmm3
|
||||||
; vminps %xmm1, %xmm0, %xmm5
|
; vminps %xmm1, %xmm0, %xmm5
|
||||||
; vorps %xmm3, %xmm5, %xmm7
|
; vorps %xmm3, %xmm5, %xmm7
|
||||||
; vcmpps $3 %xmm7, %xmm5, %xmm9
|
; vcmpps $3, %xmm7, %xmm5, %xmm9
|
||||||
; vorps %xmm7, %xmm9, %xmm11
|
; vorps %xmm7, %xmm9, %xmm11
|
||||||
; vpsrld %xmm9, $10, %xmm13
|
; vpsrld %xmm9, $10, %xmm13
|
||||||
; vandnps %xmm13, %xmm11, %xmm0
|
; vandnps %xmm13, %xmm11, %xmm0
|
||||||
@@ -252,7 +252,7 @@ block0(v0: f64x2, v1: f64x2):
|
|||||||
; vminpd %xmm0, %xmm1, %xmm3
|
; vminpd %xmm0, %xmm1, %xmm3
|
||||||
; vminpd %xmm1, %xmm0, %xmm5
|
; vminpd %xmm1, %xmm0, %xmm5
|
||||||
; vorpd %xmm3, %xmm5, %xmm7
|
; vorpd %xmm3, %xmm5, %xmm7
|
||||||
; vcmppd $3 %xmm3, %xmm5, %xmm9
|
; vcmppd $3, %xmm3, %xmm5, %xmm9
|
||||||
; vorpd %xmm7, %xmm9, %xmm11
|
; vorpd %xmm7, %xmm9, %xmm11
|
||||||
; vpsrlq %xmm9, $13, %xmm13
|
; vpsrlq %xmm9, $13, %xmm13
|
||||||
; vandnpd %xmm13, %xmm11, %xmm0
|
; vandnpd %xmm13, %xmm11, %xmm0
|
||||||
@@ -291,7 +291,7 @@ block0(v0: f32x4, v1: f32x4):
|
|||||||
; vxorps %xmm3, %xmm5, %xmm7
|
; vxorps %xmm3, %xmm5, %xmm7
|
||||||
; vorps %xmm3, %xmm7, %xmm9
|
; vorps %xmm3, %xmm7, %xmm9
|
||||||
; vsubps %xmm9, %xmm7, %xmm11
|
; vsubps %xmm9, %xmm7, %xmm11
|
||||||
; vcmpps $3 %xmm9, %xmm9, %xmm13
|
; vcmpps $3, %xmm9, %xmm9, %xmm13
|
||||||
; vpsrld %xmm13, $10, %xmm15
|
; vpsrld %xmm13, $10, %xmm15
|
||||||
; vandnps %xmm15, %xmm11, %xmm0
|
; vandnps %xmm15, %xmm11, %xmm0
|
||||||
; movq %rbp, %rsp
|
; movq %rbp, %rsp
|
||||||
@@ -330,7 +330,7 @@ block0(v0: f64x2, v1: f64x2):
|
|||||||
; vxorpd %xmm3, %xmm5, %xmm7
|
; vxorpd %xmm3, %xmm5, %xmm7
|
||||||
; vorpd %xmm3, %xmm7, %xmm9
|
; vorpd %xmm3, %xmm7, %xmm9
|
||||||
; vsubpd %xmm9, %xmm7, %xmm11
|
; vsubpd %xmm9, %xmm7, %xmm11
|
||||||
; vcmppd $3 %xmm9, %xmm9, %xmm13
|
; vcmppd $3, %xmm9, %xmm9, %xmm13
|
||||||
; vpsrlq %xmm13, $13, %xmm15
|
; vpsrlq %xmm13, $13, %xmm15
|
||||||
; vandnpd %xmm15, %xmm11, %xmm0
|
; vandnpd %xmm15, %xmm11, %xmm0
|
||||||
; movq %rbp, %rsp
|
; movq %rbp, %rsp
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ target aarch64
|
|||||||
target s390x
|
target s390x
|
||||||
set enable_simd
|
set enable_simd
|
||||||
target x86_64 has_sse3 has_ssse3 has_sse41
|
target x86_64 has_sse3 has_ssse3 has_sse41
|
||||||
|
target x86_64 has_sse3 has_ssse3 has_sse41 has_avx
|
||||||
|
|
||||||
function %extractlane_4(i8x16) -> i8 {
|
function %extractlane_4(i8x16) -> i8 {
|
||||||
block0(v0: i8x16):
|
block0(v0: i8x16):
|
||||||
@@ -33,3 +34,69 @@ block0(v0: i64x2):
|
|||||||
return v1
|
return v1
|
||||||
}
|
}
|
||||||
; run: %extractlane_1([0 4294967297]) == 4294967297
|
; run: %extractlane_1([0 4294967297]) == 4294967297
|
||||||
|
|
||||||
|
function %extractlane_i8x16_through_stack(i8x16) -> i8 {
|
||||||
|
ss0 = explicit_slot 8
|
||||||
|
block0(v0: i8x16):
|
||||||
|
v2 = stack_addr.i64 ss0
|
||||||
|
v3 = extractlane v0, 1
|
||||||
|
store v3, v2
|
||||||
|
v4 = load.i8 v2
|
||||||
|
return v4
|
||||||
|
}
|
||||||
|
; run: %extractlane_i8x16_through_stack([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]) == 2
|
||||||
|
|
||||||
|
function %extractlane_i16x8_through_stack(i16x8) -> i16 {
|
||||||
|
ss0 = explicit_slot 8
|
||||||
|
block0(v0: i16x8):
|
||||||
|
v2 = stack_addr.i64 ss0
|
||||||
|
v3 = extractlane v0, 2
|
||||||
|
store v3, v2
|
||||||
|
v4 = load.i16 v2
|
||||||
|
return v4
|
||||||
|
}
|
||||||
|
; run: %extractlane_i16x8_through_stack([1 2 3 4 5 6 7 8]) == 3
|
||||||
|
|
||||||
|
function %extractlane_i32x4_through_stack(i32x4) -> i32 {
|
||||||
|
ss0 = explicit_slot 8
|
||||||
|
block0(v0: i32x4):
|
||||||
|
v2 = stack_addr.i64 ss0
|
||||||
|
v3 = extractlane v0, 3
|
||||||
|
store v3, v2
|
||||||
|
v4 = load.i32 v2
|
||||||
|
return v4
|
||||||
|
}
|
||||||
|
; run: %extractlane_i32x4_through_stack([1 2 3 4]) == 4
|
||||||
|
|
||||||
|
function %extractlane_i64x2_through_stack(i64x2) -> i64 {
|
||||||
|
ss0 = explicit_slot 8
|
||||||
|
block0(v0: i64x2):
|
||||||
|
v2 = stack_addr.i64 ss0
|
||||||
|
v3 = extractlane v0, 0
|
||||||
|
store v3, v2
|
||||||
|
v4 = load.i64 v2
|
||||||
|
return v4
|
||||||
|
}
|
||||||
|
; run: %extractlane_i64x2_through_stack([1 2]) == 1
|
||||||
|
|
||||||
|
function %extractlane_f32x4_through_stack(f32x4) -> f32 {
|
||||||
|
ss0 = explicit_slot 8
|
||||||
|
block0(v0: f32x4):
|
||||||
|
v2 = stack_addr.i64 ss0
|
||||||
|
v3 = extractlane v0, 3
|
||||||
|
store v3, v2
|
||||||
|
v4 = load.f32 v2
|
||||||
|
return v4
|
||||||
|
}
|
||||||
|
; run: %extractlane_f32x4_through_stack([0x1.0 0x2.0 0x3.0 0x4.0]) == 0x4.0
|
||||||
|
|
||||||
|
function %extractlane_f64x2_through_stack(f64x2) -> f64 {
|
||||||
|
ss0 = explicit_slot 8
|
||||||
|
block0(v0: f64x2):
|
||||||
|
v2 = stack_addr.i64 ss0
|
||||||
|
v3 = extractlane v0, 0
|
||||||
|
store v3, v2
|
||||||
|
v4 = load.f64 v2
|
||||||
|
return v4
|
||||||
|
}
|
||||||
|
; run: %extractlane_f64x2_through_stack([0x1.0 0x2.0]) == 0x1.0
|
||||||
|
|||||||
@@ -47,3 +47,91 @@ block0(v0: f64x2, v1: f64):
|
|||||||
return v2
|
return v2
|
||||||
}
|
}
|
||||||
; run: %insertlane_1_in_f64x2([0x1.0 0x2.0], 0x3.0) == [0x1.0 0x3.0]
|
; run: %insertlane_1_in_f64x2([0x1.0 0x2.0], 0x3.0) == [0x1.0 0x3.0]
|
||||||
|
|
||||||
|
function %insertlane_i8x16_through_stack(i8x16, i8) -> i8x16 {
|
||||||
|
ss0 = explicit_slot 8
|
||||||
|
block0(v0: i8x16, v1: i8):
|
||||||
|
v2 = stack_addr.i64 ss0
|
||||||
|
store v1, v2
|
||||||
|
v3 = load.i8 v2
|
||||||
|
v4 = insertlane v0, v3, 1
|
||||||
|
return v4
|
||||||
|
}
|
||||||
|
; run: %insertlane_i8x16_through_stack([1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1], 2) == [1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
|
||||||
|
|
||||||
|
function %insertlane_i16x8_through_stack(i16x8, i16) -> i16x8 {
|
||||||
|
ss0 = explicit_slot 8
|
||||||
|
block0(v0: i16x8, v1: i16):
|
||||||
|
v2 = stack_addr.i64 ss0
|
||||||
|
store v1, v2
|
||||||
|
v3 = load.i16 v2
|
||||||
|
v4 = insertlane v0, v3, 2
|
||||||
|
return v4
|
||||||
|
}
|
||||||
|
; run: %insertlane_i16x8_through_stack([1 1 1 1 1 1 1 1], 2) == [1 1 2 1 1 1 1 1]
|
||||||
|
|
||||||
|
function %insertlane_i32x4_through_stack(i32x4, i32) -> i32x4 {
|
||||||
|
ss0 = explicit_slot 8
|
||||||
|
block0(v0: i32x4, v1: i32):
|
||||||
|
v2 = stack_addr.i64 ss0
|
||||||
|
store v1, v2
|
||||||
|
v3 = load.i32 v2
|
||||||
|
v4 = insertlane v0, v3, 3
|
||||||
|
return v4
|
||||||
|
}
|
||||||
|
; run: %insertlane_i32x4_through_stack([1 1 1 1], 2) == [1 1 1 2]
|
||||||
|
|
||||||
|
function %insertlane_i64x2_through_stack(i64x2, i64) -> i64x2 {
|
||||||
|
ss0 = explicit_slot 8
|
||||||
|
block0(v0: i64x2, v1: i64):
|
||||||
|
v2 = stack_addr.i64 ss0
|
||||||
|
store v1, v2
|
||||||
|
v3 = load.i64 v2
|
||||||
|
v4 = insertlane v0, v3, 0
|
||||||
|
return v4
|
||||||
|
}
|
||||||
|
; run: %insertlane_i64x2_through_stack([1 1], 2) == [2 1]
|
||||||
|
|
||||||
|
function %insertlane_f32x4_through_stack(f32x4, f32) -> f32x4 {
|
||||||
|
ss0 = explicit_slot 8
|
||||||
|
block0(v0: f32x4, v1: f32):
|
||||||
|
v2 = stack_addr.i64 ss0
|
||||||
|
store v1, v2
|
||||||
|
v3 = load.f32 v2
|
||||||
|
v4 = insertlane v0, v3, 3
|
||||||
|
return v4
|
||||||
|
}
|
||||||
|
; run: %insertlane_f32x4_through_stack([0x1.0 0x1.0 0x1.0 0x1.0], 0x2.0) == [0x1.0 0x1.0 0x1.0 0x2.0]
|
||||||
|
|
||||||
|
function %insertlane_f32x4_through_stack2(f32x4, f32) -> f32x4 {
|
||||||
|
ss0 = explicit_slot 8
|
||||||
|
block0(v0: f32x4, v1: f32):
|
||||||
|
v2 = stack_addr.i64 ss0
|
||||||
|
store v1, v2
|
||||||
|
v3 = load.f32 v2
|
||||||
|
v4 = insertlane v0, v3, 0
|
||||||
|
return v4
|
||||||
|
}
|
||||||
|
; run: %insertlane_f32x4_through_stack2([0x1.0 0x1.0 0x1.0 0x1.0], 0x2.0) == [0x2.0 0x1.0 0x1.0 0x1.0]
|
||||||
|
|
||||||
|
function %insertlane_f64x2_through_stack(f64x2, f64) -> f64x2 {
|
||||||
|
ss0 = explicit_slot 8
|
||||||
|
block0(v0: f64x2, v1: f64):
|
||||||
|
v2 = stack_addr.i64 ss0
|
||||||
|
store v1, v2
|
||||||
|
v3 = load.f64 v2
|
||||||
|
v4 = insertlane v0, v3, 0
|
||||||
|
return v4
|
||||||
|
}
|
||||||
|
; run: %insertlane_f64x2_through_stack([0x1.0 0x1.0], 0x2.0) == [0x2.0 0x1.0]
|
||||||
|
|
||||||
|
function %insertlane_f64x2_through_stack2(f64x2, f64) -> f64x2 {
|
||||||
|
ss0 = explicit_slot 8
|
||||||
|
block0(v0: f64x2, v1: f64):
|
||||||
|
v2 = stack_addr.i64 ss0
|
||||||
|
store v1, v2
|
||||||
|
v3 = load.f64 v2
|
||||||
|
v4 = insertlane v0, v3, 1
|
||||||
|
return v4
|
||||||
|
}
|
||||||
|
; run: %insertlane_f64x2_through_stack2([0x1.0 0x1.0], 0x2.0) == [0x1.0 0x2.0]
|
||||||
|
|||||||
@@ -44,7 +44,7 @@
|
|||||||
;; movq %rsp, %rbp
|
;; movq %rsp, %rbp
|
||||||
;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 }
|
;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 }
|
||||||
;; block0:
|
;; block0:
|
||||||
;; vcmpps $0 %xmm0, %xmm0, %xmm3
|
;; vcmpps $0, %xmm0, %xmm0, %xmm3
|
||||||
;; vandps %xmm0, %xmm3, %xmm5
|
;; vandps %xmm0, %xmm3, %xmm5
|
||||||
;; vpxor %xmm3, %xmm5, %xmm7
|
;; vpxor %xmm3, %xmm5, %xmm7
|
||||||
;; vcvttps2dq %xmm5, %xmm9
|
;; vcvttps2dq %xmm5, %xmm9
|
||||||
@@ -71,7 +71,7 @@
|
|||||||
;; vcvtdq2ps %xmm11, %xmm13
|
;; vcvtdq2ps %xmm11, %xmm13
|
||||||
;; vcvttps2dq %xmm7, %xmm15
|
;; vcvttps2dq %xmm7, %xmm15
|
||||||
;; vsubps %xmm7, %xmm13, %xmm1
|
;; vsubps %xmm7, %xmm13, %xmm1
|
||||||
;; vcmpps $2 %xmm13, %xmm1, %xmm3
|
;; vcmpps $2, %xmm13, %xmm1, %xmm3
|
||||||
;; vcvttps2dq %xmm1, %xmm5
|
;; vcvttps2dq %xmm1, %xmm5
|
||||||
;; vpxor %xmm5, %xmm3, %xmm7
|
;; vpxor %xmm5, %xmm3, %xmm7
|
||||||
;; uninit %xmm9
|
;; uninit %xmm9
|
||||||
@@ -90,7 +90,7 @@
|
|||||||
;; movq %rsp, %rbp
|
;; movq %rsp, %rbp
|
||||||
;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 }
|
;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 }
|
||||||
;; block0:
|
;; block0:
|
||||||
;; vcmppd $0 %xmm0, %xmm0, %xmm3
|
;; vcmppd $0, %xmm0, %xmm0, %xmm3
|
||||||
;; vandps %xmm3, const(0), %xmm5
|
;; vandps %xmm3, const(0), %xmm5
|
||||||
;; vminpd %xmm0, %xmm5, %xmm7
|
;; vminpd %xmm0, %xmm5, %xmm7
|
||||||
;; vcvttpd2dq %xmm7, %xmm0
|
;; vcvttpd2dq %xmm7, %xmm0
|
||||||
@@ -112,7 +112,7 @@
|
|||||||
;; vminpd %xmm7, const(0), %xmm9
|
;; vminpd %xmm7, const(0), %xmm9
|
||||||
;; vroundpd $3, %xmm9, %xmm11
|
;; vroundpd $3, %xmm9, %xmm11
|
||||||
;; vaddpd %xmm11, const(1), %xmm13
|
;; vaddpd %xmm11, const(1), %xmm13
|
||||||
;; vshufps $136 %xmm13, %xmm5, %xmm0
|
;; vshufps $136, %xmm13, %xmm5, %xmm0
|
||||||
;; jmp label1
|
;; jmp label1
|
||||||
;; block1:
|
;; block1:
|
||||||
;; movq %rbp, %rsp
|
;; movq %rbp, %rsp
|
||||||
@@ -128,9 +128,9 @@
|
|||||||
;; vpmovsxbw %xmm0, %xmm10
|
;; vpmovsxbw %xmm0, %xmm10
|
||||||
;; vpmovsxbw %xmm1, %xmm12
|
;; vpmovsxbw %xmm1, %xmm12
|
||||||
;; vpmullw %xmm10, %xmm12, %xmm14
|
;; vpmullw %xmm10, %xmm12, %xmm14
|
||||||
;; vpalignr $8 %xmm0, %xmm0, %xmm8
|
;; vpalignr $8, %xmm0, %xmm0, %xmm8
|
||||||
;; vpmovsxbw %xmm8, %xmm10
|
;; vpmovsxbw %xmm8, %xmm10
|
||||||
;; vpalignr $8 %xmm1, %xmm1, %xmm12
|
;; vpalignr $8, %xmm1, %xmm1, %xmm12
|
||||||
;; vpmovsxbw %xmm12, %xmm15
|
;; vpmovsxbw %xmm12, %xmm15
|
||||||
;; vpmullw %xmm10, %xmm15, %xmm0
|
;; vpmullw %xmm10, %xmm15, %xmm0
|
||||||
;; vphaddw %xmm14, %xmm0, %xmm0
|
;; vphaddw %xmm14, %xmm0, %xmm0
|
||||||
@@ -149,9 +149,9 @@
|
|||||||
;; vpmovsxbw %xmm0, %xmm13
|
;; vpmovsxbw %xmm0, %xmm13
|
||||||
;; vpmovsxbw %xmm1, %xmm15
|
;; vpmovsxbw %xmm1, %xmm15
|
||||||
;; vpmullw %xmm13, %xmm15, %xmm3
|
;; vpmullw %xmm13, %xmm15, %xmm3
|
||||||
;; vpalignr $8 %xmm0, %xmm0, %xmm11
|
;; vpalignr $8, %xmm0, %xmm0, %xmm11
|
||||||
;; vpmovsxbw %xmm11, %xmm13
|
;; vpmovsxbw %xmm11, %xmm13
|
||||||
;; vpalignr $8 %xmm1, %xmm1, %xmm15
|
;; vpalignr $8, %xmm1, %xmm1, %xmm15
|
||||||
;; vpmovsxbw %xmm15, %xmm1
|
;; vpmovsxbw %xmm15, %xmm1
|
||||||
;; vpmullw %xmm13, %xmm1, %xmm4
|
;; vpmullw %xmm13, %xmm1, %xmm4
|
||||||
;; vphaddw %xmm3, %xmm4, %xmm15
|
;; vphaddw %xmm3, %xmm4, %xmm15
|
||||||