x64: Improve memory support in {insert,extract}lane (#5982)
* x64: Improve memory support in `{insert,extract}lane`
This commit adds support to Cranelift for emitting `pextr{b,w,d,q}`
with a memory destination, merging a store-of-extract operation into a
single instruction. Additionally, AVX support is added for the
`pextr*` instructions.
I've additionally tried to ensure that codegen tests and runtests exist
for all forms of these instructions too.
* Add missing commas
* Fix tests
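
As an illustrative sketch of the pattern this change targets (a hypothetical
function, not copied from the PR's test files), a store whose value comes from
an `extractlane` can now lower to a single `pextrb` with a memory destination
(or `vpextrb` when AVX is enabled), rather than an extract-to-register followed
by a separate store:

function %store_lane(i8x16, i64) {
block0(v0: i8x16, v1: i64):
    v2 = extractlane v0, 1
    store v2, v1
    return
}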
@@ -290,8 +290,18 @@
;; XMM (scalar or vector) unary op (from xmm to reg/mem) using the
;; VEX prefix
(XmmMovRMVex (op AvxOpcode)
(src Reg)
(src Xmm)
(dst SyntheticAmode))
(XmmMovRMImmVex (op AvxOpcode)
(src Xmm)
(dst SyntheticAmode)
(imm u8))

;; XMM (scalar) unary op (from xmm to integer reg): vpextr{w,b,d,q}
(XmmToGprImmVex (op AvxOpcode)
(src Xmm)
(dst WritableGpr)
(imm u8))

;; XMM (scalar or vector) binary op that relies on the EVEX
;; prefix. Takes two inputs.

@@ -343,8 +353,12 @@
;; XMM (scalar or vector) unary op (from xmm to reg/mem): stores, movd,
;; movq
(XmmMovRM (op SseOpcode)
(src Reg)
(src Xmm)
(dst SyntheticAmode))
(XmmMovRMImm (op SseOpcode)
(src Xmm)
(dst SyntheticAmode)
(imm u8))

;; XMM (scalar) unary op (from xmm to integer reg): movd, movq,
;; cvtts{s,d}2si

@@ -1364,6 +1378,10 @@
Vmovups
Vmovupd
Vmovdqu
Vpextrb
Vpextrw
Vpextrd
Vpextrq
))

(type Avx512Opcode extern

@@ -2043,10 +2061,18 @@
(rule (xmm_movrm op addr data)
(SideEffectNoResult.Inst (MInst.XmmMovRM op data addr)))

(decl xmm_movrm_imm (SseOpcode SyntheticAmode Xmm u8) SideEffectNoResult)
(rule (xmm_movrm_imm op addr data imm)
(SideEffectNoResult.Inst (MInst.XmmMovRMImm op data addr imm)))

(decl xmm_movrm_vex (AvxOpcode SyntheticAmode Xmm) SideEffectNoResult)
(rule (xmm_movrm_vex op addr data)
(SideEffectNoResult.Inst (MInst.XmmMovRMVex op data addr)))

(decl xmm_movrm_imm_vex (AvxOpcode SyntheticAmode Xmm u8) SideEffectNoResult)
(rule (xmm_movrm_imm_vex op addr data imm)
(SideEffectNoResult.Inst (MInst.XmmMovRMImmVex op data addr imm)))

;; Load a constant into an XMM register.
(decl x64_xmm_load_const (Type VCodeConstant) Xmm)
(rule (x64_xmm_load_const ty const)

@@ -3603,21 +3629,61 @@
(decl x64_pextrb (Xmm u8) Gpr)
(rule (x64_pextrb src lane)
(xmm_to_gpr_imm (SseOpcode.Pextrb) src lane))
(rule 1 (x64_pextrb src lane)
(if-let $true (has_avx))
(xmm_to_gpr_imm_vex (AvxOpcode.Vpextrb) src lane))

(decl x64_pextrb_store (SyntheticAmode Xmm u8) SideEffectNoResult)
(rule (x64_pextrb_store addr src lane)
(xmm_movrm_imm (SseOpcode.Pextrb) addr src lane))
(rule 1 (x64_pextrb_store addr src lane)
(if-let $true (has_avx))
(xmm_movrm_imm_vex (AvxOpcode.Vpextrb) addr src lane))

;; Helper for creating `pextrw` instructions.
(decl x64_pextrw (Xmm u8) Gpr)
(rule (x64_pextrw src lane)
(xmm_to_gpr_imm (SseOpcode.Pextrw) src lane))
(rule 1 (x64_pextrw src lane)
(if-let $true (has_avx))
(xmm_to_gpr_imm_vex (AvxOpcode.Vpextrw) src lane))

(decl x64_pextrw_store (SyntheticAmode Xmm u8) SideEffectNoResult)
(rule (x64_pextrw_store addr src lane)
(xmm_movrm_imm (SseOpcode.Pextrw) addr src lane))
(rule 1 (x64_pextrw_store addr src lane)
(if-let $true (has_avx))
(xmm_movrm_imm_vex (AvxOpcode.Vpextrw) addr src lane))

;; Helper for creating `pextrd` instructions.
(decl x64_pextrd (Xmm u8) Gpr)
(rule (x64_pextrd src lane)
(xmm_to_gpr_imm (SseOpcode.Pextrd) src lane))
(rule 1 (x64_pextrd src lane)
(if-let $true (has_avx))
(xmm_to_gpr_imm_vex (AvxOpcode.Vpextrd) src lane))

(decl x64_pextrd_store (SyntheticAmode Xmm u8) SideEffectNoResult)
(rule (x64_pextrd_store addr src lane)
(xmm_movrm_imm (SseOpcode.Pextrd) addr src lane))
(rule 1 (x64_pextrd_store addr src lane)
(if-let $true (has_avx))
(xmm_movrm_imm_vex (AvxOpcode.Vpextrd) addr src lane))

;; Helper for creating `pextrq` instructions.
(decl x64_pextrq (Xmm u8) Gpr)
(rule (x64_pextrq src lane)
(xmm_to_gpr_imm (SseOpcode.Pextrq) src lane))
(rule 1 (x64_pextrq src lane)
(if-let $true (has_avx))
(xmm_to_gpr_imm_vex (AvxOpcode.Vpextrq) src lane))

(decl x64_pextrq_store (SyntheticAmode Xmm u8) SideEffectNoResult)
(rule (x64_pextrq_store addr src lane)
(xmm_movrm_imm (SseOpcode.Pextrq) addr src lane))
(rule 1 (x64_pextrq_store addr src lane)
(if-let $true (has_avx))
(xmm_movrm_imm_vex (AvxOpcode.Vpextrq) addr src lane))

;; Helper for creating `MInst.XmmToGpr` instructions.
(decl xmm_to_gpr (SseOpcode Xmm OperandSize) Gpr)

@@ -3626,13 +3692,20 @@
(_ Unit (emit (MInst.XmmToGpr op src dst size))))
dst))

;; Helper for creating `MInst.XmmToGpr` instructions.
;; Helper for creating `MInst.XmmToGprImm` instructions.
(decl xmm_to_gpr_imm (SseOpcode Xmm u8) Gpr)
(rule (xmm_to_gpr_imm op src imm)
(let ((dst WritableGpr (temp_writable_gpr))
(_ Unit (emit (MInst.XmmToGprImm op src dst imm))))
dst))

;; Helper for creating `MInst.XmmToGprImmVex` instructions.
(decl xmm_to_gpr_imm_vex (AvxOpcode Xmm u8) Gpr)
(rule (xmm_to_gpr_imm_vex op src imm)
(let ((dst WritableGpr (temp_writable_gpr))
(_ Unit (emit (MInst.XmmToGprImmVex op src dst imm))))
dst))

;; Helper for creating `pmovmskb` instructions.
(decl x64_pmovmskb (OperandSize Xmm) Gpr)
(rule (x64_pmovmskb size src)

@@ -1699,7 +1699,11 @@ impl AvxOpcode {
| AvxOpcode::Vmovsd
| AvxOpcode::Vmovups
| AvxOpcode::Vmovupd
| AvxOpcode::Vmovdqu => {
| AvxOpcode::Vmovdqu
| AvxOpcode::Vpextrb
| AvxOpcode::Vpextrw
| AvxOpcode::Vpextrd
| AvxOpcode::Vpextrq => {
smallvec![InstructionSet::AVX]
}
}

@@ -2430,7 +2430,7 @@ pub(crate) fn emit(
}

Inst::XmmMovRMVex { op, src, dst } => {
let src = allocs.next(*src);
let src = allocs.next(src.to_reg());
let dst = dst.with_allocs(allocs).finalize(state, sink);

let (prefix, map, opcode) = match op {

@@ -2451,6 +2451,52 @@ pub(crate) fn emit(
.encode(sink);
}

Inst::XmmMovRMImmVex { op, src, dst, imm } => {
let src = allocs.next(src.to_reg());
let dst = dst.with_allocs(allocs).finalize(state, sink);

let (w, prefix, map, opcode) = match op {
AvxOpcode::Vpextrb => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x14),
AvxOpcode::Vpextrw => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x15),
AvxOpcode::Vpextrd => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x16),
AvxOpcode::Vpextrq => (true, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x16),
_ => unimplemented!("Opcode {:?} not implemented", op),
};
VexInstruction::new()
.length(VexVectorLength::V128)
.w(w)
.prefix(prefix)
.map(map)
.opcode(opcode)
.rm(dst)
.reg(src.to_real_reg().unwrap().hw_enc())
.imm(*imm)
.encode(sink);
}

Inst::XmmToGprImmVex { op, src, dst, imm } => {
let src = allocs.next(src.to_reg());
let dst = allocs.next(dst.to_reg().to_reg());

let (w, prefix, map, opcode) = match op {
AvxOpcode::Vpextrb => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x14),
AvxOpcode::Vpextrw => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x15),
AvxOpcode::Vpextrd => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x16),
AvxOpcode::Vpextrq => (true, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x16),
_ => unimplemented!("Opcode {:?} not implemented", op),
};
VexInstruction::new()
.length(VexVectorLength::V128)
.w(w)
.prefix(prefix)
.map(map)
.opcode(opcode)
.rm(dst.to_real_reg().unwrap().hw_enc())
.reg(src.to_real_reg().unwrap().hw_enc())
.imm(*imm)
.encode(sink);
}

Inst::XmmRmREvex {
op,
src1,

@@ -2649,7 +2695,7 @@ pub(crate) fn emit(
}

Inst::XmmMovRM { op, src, dst } => {
let src = allocs.next(*src);
let src = allocs.next(src.to_reg());
let dst = dst.with_allocs(allocs);

let (prefix, opcode) = match op {

@@ -2666,6 +2712,27 @@ pub(crate) fn emit(
emit_std_reg_mem(sink, prefix, opcode, 2, src, dst, RexFlags::clear_w(), 0);
}

Inst::XmmMovRMImm { op, src, dst, imm } => {
let src = allocs.next(src.to_reg());
let dst = dst.with_allocs(allocs);

let (w, prefix, opcode) = match op {
SseOpcode::Pextrb => (false, LegacyPrefixes::_66, 0x0F3A14),
SseOpcode::Pextrw => (false, LegacyPrefixes::_66, 0x0F3A15),
SseOpcode::Pextrd => (false, LegacyPrefixes::_66, 0x0F3A16),
SseOpcode::Pextrq => (true, LegacyPrefixes::_66, 0x0F3A16),
_ => unimplemented!("Opcode {:?} not implemented", op),
};
let rex = if w {
RexFlags::set_w()
} else {
RexFlags::clear_w()
};
let dst = &dst.finalize(state, sink);
emit_std_reg_mem(sink, prefix, opcode, 3, src, dst, rex, 1);
sink.put1(*imm);
}

Inst::XmmToGpr {
op,
src,

@@ -4871,7 +4871,7 @@ fn test_x64_emit() {
imm: 2,
},
"C4430920EF02",
"vpinsrb $2 %xmm14, %r15, %xmm13",
"vpinsrb $2, %xmm14, %r15, %xmm13",
));

// ========================================================

@@ -131,6 +131,7 @@ impl Inst {
// These use dynamic SSE opcodes.
Inst::GprToXmm { op, .. }
| Inst::XmmMovRM { op, .. }
| Inst::XmmMovRMImm { op, .. }
| Inst::XmmRmiReg { opcode: op, .. }
| Inst::XmmRmR { op, .. }
| Inst::XmmRmRUnaligned { op, .. }

@@ -153,7 +154,9 @@ impl Inst {
| Inst::XmmVexPinsr { op, .. }
| Inst::XmmUnaryRmRVex { op, .. }
| Inst::XmmUnaryRmRImmVex { op, .. }
| Inst::XmmMovRMVex { op, .. } => op.available_from(),
| Inst::XmmMovRMVex { op, .. }
| Inst::XmmMovRMImmVex { op, .. }
| Inst::XmmToGprImmVex { op, .. } => op.available_from(),
}
}
}

@@ -331,7 +334,7 @@ impl Inst {
debug_assert!(src.class() == RegClass::Float);
Inst::XmmMovRM {
op,
src,
src: Xmm::new(src).unwrap(),
dst: dst.into(),
}
}

@@ -933,17 +936,33 @@ impl PrettyPrint for Inst {
}

Inst::XmmMovRM { op, src, dst, .. } => {
let src = pretty_print_reg(*src, 8, allocs);
let src = pretty_print_reg(src.to_reg(), 8, allocs);
let dst = dst.pretty_print(8, allocs);
format!("{} {}, {}", ljustify(op.to_string()), src, dst)
}

Inst::XmmMovRMVex { op, src, dst, .. } => {
let src = pretty_print_reg(*src, 8, allocs);
let src = pretty_print_reg(src.to_reg(), 8, allocs);
let dst = dst.pretty_print(8, allocs);
format!("{} {}, {}", ljustify(op.to_string()), src, dst)
}

Inst::XmmMovRMImm {
op, src, dst, imm, ..
} => {
let src = pretty_print_reg(src.to_reg(), 8, allocs);
let dst = dst.pretty_print(8, allocs);
format!("{} ${imm}, {}, {}", ljustify(op.to_string()), src, dst)
}

Inst::XmmMovRMImmVex {
op, src, dst, imm, ..
} => {
let src = pretty_print_reg(src.to_reg(), 8, allocs);
let dst = dst.pretty_print(8, allocs);
format!("{} ${imm}, {}, {}", ljustify(op.to_string()), src, dst)
}

Inst::XmmRmR {
op,
src1,

@@ -1023,7 +1042,7 @@ impl PrettyPrint for Inst {
let src1 = pretty_print_reg(src1.to_reg(), 8, allocs);
let src2 = src2.pretty_print(8, allocs);

format!("{} ${imm} {src1}, {src2}, {dst}", ljustify(op.to_string()))
format!("{} ${imm}, {src1}, {src2}, {dst}", ljustify(op.to_string()))
}

Inst::XmmVexPinsr {

@@ -1038,7 +1057,7 @@ impl PrettyPrint for Inst {
let src1 = pretty_print_reg(src1.to_reg(), 8, allocs);
let src2 = src2.pretty_print(8, allocs);

format!("{} ${imm} {src1}, {src2}, {dst}", ljustify(op.to_string()))
format!("{} ${imm}, {src1}, {src2}, {dst}", ljustify(op.to_string()))
}

Inst::XmmRmRVex3 {

@@ -1190,6 +1209,12 @@ impl PrettyPrint for Inst {
format!("{} ${imm}, {}, {}", ljustify(op.to_string()), src, dst)
}

Inst::XmmToGprImmVex { op, src, dst, imm } => {
let src = pretty_print_reg(src.to_reg(), 8, allocs);
let dst = pretty_print_reg(dst.to_reg().to_reg(), 8, allocs);
format!("{} ${imm}, {}, {}", ljustify(op.to_string()), src, dst)
}

Inst::GprToXmm {
op,
src,

@@ -2033,8 +2058,11 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
collector.reg_reuse_def(dst.to_writable_reg(), 0); // Reuse RHS.
src2.get_operands(collector);
}
Inst::XmmMovRM { src, dst, .. } | Inst::XmmMovRMVex { src, dst, .. } => {
collector.reg_use(*src);
Inst::XmmMovRM { src, dst, .. }
| Inst::XmmMovRMVex { src, dst, .. }
| Inst::XmmMovRMImm { src, dst, .. }
| Inst::XmmMovRMImmVex { src, dst, .. } => {
collector.reg_use(src.to_reg());
dst.get_operands(collector);
}
Inst::XmmCmpRmR { src, dst, .. } => {

@@ -2058,7 +2086,9 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
collector.reg_use(src.to_reg());
collector.reg_fixed_nonallocatable(*dst);
}
Inst::XmmToGpr { src, dst, .. } | Inst::XmmToGprImm { src, dst, .. } => {
Inst::XmmToGpr { src, dst, .. }
| Inst::XmmToGprImm { src, dst, .. }
| Inst::XmmToGprImmVex { src, dst, .. } => {
collector.reg_use(src.to_reg());
collector.reg_def(dst.to_writable_reg());
}

@@ -2659,17 +2659,41 @@
;; f32 or f64 despite the source perhaps being an integer vector since the
;; result of the instruction is the same.
(rule 2 (lower (store flags
(has_type (ty_32 _) (extractlane value (u8_from_uimm8 0)))
(has_type $F32 (extractlane value (u8_from_uimm8 0)))
address
offset))
(side_effect
(x64_movss_store (to_amode flags address offset) value)))
(rule 3 (lower (store flags
(has_type (ty_64 _) (extractlane value (u8_from_uimm8 0)))
(rule 2 (lower (store flags
(has_type $F64 (extractlane value (u8_from_uimm8 0)))
address
offset))
(side_effect
(x64_movsd_store (to_amode flags address offset) value)))
(rule 2 (lower (store flags
(has_type $I8 (extractlane value (u8_from_uimm8 n)))
address
offset))
(side_effect
(x64_pextrb_store (to_amode flags address offset) value n)))
(rule 2 (lower (store flags
(has_type $I16 (extractlane value (u8_from_uimm8 n)))
address
offset))
(side_effect
(x64_pextrw_store (to_amode flags address offset) value n)))
(rule 2 (lower (store flags
(has_type $I32 (extractlane value (u8_from_uimm8 n)))
address
offset))
(side_effect
(x64_pextrd_store (to_amode flags address offset) value n)))
(rule 2 (lower (store flags
(has_type $I64 (extractlane value (u8_from_uimm8 n)))
address
offset))
(side_effect
(x64_pextrq_store (to_amode flags address offset) value n)))
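
As an illustrative sketch of how these rules apply (hypothetical functions, not
copied from the PR's tests): storing lane 0 of an `f32x4` matches the `$F32`
rule above and lowers to a `movss` store, while storing lane 1 of an `i32x4`
matches the `$I32` rule and lowers to a `pextrd` with a memory destination
(`vpextrd` when AVX is enabled).

function %store_f32_lane0(f32x4, i64) {
block0(v0: f32x4, v1: i64):
    v2 = extractlane v0, 0
    store v2, v1
    return
}

function %store_i32_lane1(i32x4, i64) {
block0(v0: i32x4, v1: i64):
    v2 = extractlane v0, 1
    store v2, v1
    return
}
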
;; Rules for `load*` + ALU op + `store*` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;