[machinst x64]: implement bitmask
This commit is contained in:
@@ -393,6 +393,8 @@ pub enum SseOpcode {
|
||||
Movdqa,
|
||||
Movdqu,
|
||||
Movlhps,
|
||||
Movmskps,
|
||||
Movmskpd,
|
||||
Movq,
|
||||
Movss,
|
||||
Movsd,
|
||||
@@ -407,6 +409,7 @@ pub enum SseOpcode {
|
||||
Pabsb,
|
||||
Pabsw,
|
||||
Pabsd,
|
||||
Packsswb,
|
||||
Paddb,
|
||||
Paddd,
|
||||
Paddq,
|
||||
@@ -445,6 +448,7 @@ pub enum SseOpcode {
|
||||
Pminub,
|
||||
Pminuw,
|
||||
Pminud,
|
||||
Pmovmskb,
|
||||
Pmulld,
|
||||
Pmullw,
|
||||
Pmuludq,
|
||||
@@ -510,6 +514,7 @@ impl SseOpcode {
|
||||
| SseOpcode::Minss
|
||||
| SseOpcode::Movaps
|
||||
| SseOpcode::Movlhps
|
||||
| SseOpcode::Movmskps
|
||||
| SseOpcode::Movss
|
||||
| SseOpcode::Movups
|
||||
| SseOpcode::Mulps
|
||||
@@ -546,6 +551,7 @@ impl SseOpcode {
|
||||
| SseOpcode::Minsd
|
||||
| SseOpcode::Movapd
|
||||
| SseOpcode::Movd
|
||||
| SseOpcode::Movmskpd
|
||||
| SseOpcode::Movq
|
||||
| SseOpcode::Movsd
|
||||
| SseOpcode::Movupd
|
||||
@@ -554,6 +560,7 @@ impl SseOpcode {
|
||||
| SseOpcode::Mulpd
|
||||
| SseOpcode::Mulsd
|
||||
| SseOpcode::Orpd
|
||||
| SseOpcode::Packsswb
|
||||
| SseOpcode::Paddb
|
||||
| SseOpcode::Paddd
|
||||
| SseOpcode::Paddq
|
||||
@@ -578,6 +585,7 @@ impl SseOpcode {
|
||||
| SseOpcode::Pmaxub
|
||||
| SseOpcode::Pminsw
|
||||
| SseOpcode::Pminub
|
||||
| SseOpcode::Pmovmskb
|
||||
| SseOpcode::Pmullw
|
||||
| SseOpcode::Pmuludq
|
||||
| SseOpcode::Por
|
||||
@@ -686,6 +694,8 @@ impl fmt::Debug for SseOpcode {
|
||||
SseOpcode::Movdqa => "movdqa",
|
||||
SseOpcode::Movdqu => "movdqu",
|
||||
SseOpcode::Movlhps => "movlhps",
|
||||
SseOpcode::Movmskps => "movmskps",
|
||||
SseOpcode::Movmskpd => "movmskpd",
|
||||
SseOpcode::Movq => "movq",
|
||||
SseOpcode::Movss => "movss",
|
||||
SseOpcode::Movsd => "movsd",
|
||||
@@ -700,6 +710,7 @@ impl fmt::Debug for SseOpcode {
|
||||
SseOpcode::Pabsb => "pabsb",
|
||||
SseOpcode::Pabsw => "pabsw",
|
||||
SseOpcode::Pabsd => "pabsd",
|
||||
SseOpcode::Packsswb => "packsswb",
|
||||
SseOpcode::Paddb => "paddb",
|
||||
SseOpcode::Paddd => "paddd",
|
||||
SseOpcode::Paddq => "paddq",
|
||||
@@ -738,6 +749,7 @@ impl fmt::Debug for SseOpcode {
|
||||
SseOpcode::Pminub => "pminub",
|
||||
SseOpcode::Pminuw => "pminuw",
|
||||
SseOpcode::Pminud => "pminud",
|
||||
SseOpcode::Pmovmskb => "pmovmskb",
|
||||
SseOpcode::Pmulld => "pmulld",
|
||||
SseOpcode::Pmullw => "pmullw",
|
||||
SseOpcode::Pmuludq => "pmuludq",
|
||||
|
||||
@@ -1762,6 +1762,7 @@ pub(crate) fn emit(
|
||||
SseOpcode::Mulsd => (LegacyPrefixes::_F2, 0x0F59, 2),
|
||||
SseOpcode::Orpd => (LegacyPrefixes::_66, 0x0F56, 2),
|
||||
SseOpcode::Orps => (LegacyPrefixes::None, 0x0F56, 2),
|
||||
SseOpcode::Packsswb => (LegacyPrefixes::_66, 0x0F63, 2),
|
||||
SseOpcode::Paddb => (LegacyPrefixes::_66, 0x0FFC, 2),
|
||||
SseOpcode::Paddd => (LegacyPrefixes::_66, 0x0FFE, 2),
|
||||
SseOpcode::Paddq => (LegacyPrefixes::_66, 0x0FD4, 2),
|
||||
@@ -2040,11 +2041,14 @@ pub(crate) fn emit(
|
||||
dst_size,
|
||||
} => {
|
||||
let (prefix, opcode, dst_first) = match op {
|
||||
SseOpcode::Cvttss2si => (LegacyPrefixes::_F3, 0x0F2C, true),
|
||||
SseOpcode::Cvttsd2si => (LegacyPrefixes::_F2, 0x0F2C, true),
|
||||
// Movd and movq use the same opcode; the presence of the REX prefix (set below)
|
||||
// actually determines which is used.
|
||||
SseOpcode::Movd | SseOpcode::Movq => (LegacyPrefixes::_66, 0x0F7E, false),
|
||||
SseOpcode::Cvttss2si => (LegacyPrefixes::_F3, 0x0F2C, true),
|
||||
SseOpcode::Cvttsd2si => (LegacyPrefixes::_F2, 0x0F2C, true),
|
||||
SseOpcode::Movmskps => (LegacyPrefixes::None, 0x0F50, true),
|
||||
SseOpcode::Movmskpd => (LegacyPrefixes::_66, 0x0F50, true),
|
||||
SseOpcode::Pmovmskb => (LegacyPrefixes::_66, 0x0FD7, true),
|
||||
_ => panic!("unexpected opcode {:?}", op),
|
||||
};
|
||||
let rex = match dst_size {
|
||||
|
||||
@@ -3292,6 +3292,12 @@ fn test_x64_emit() {
|
||||
"pshufb %xmm11, %xmm2",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::xmm_rm_r(SseOpcode::Packsswb, RegMem::reg(xmm11), w_xmm2, None),
|
||||
"66410F63D3",
|
||||
"packsswb %xmm11, %xmm2",
|
||||
));
|
||||
|
||||
// ========================================================
|
||||
// XMM_RM_R: Integer Conversion
|
||||
insns.push((
|
||||
@@ -3422,6 +3428,22 @@ fn test_x64_emit() {
|
||||
"cvttsd2si %xmm0, %r15",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::xmm_to_gpr(SseOpcode::Pmovmskb, xmm10, w_rax, OperandSize::Size32),
|
||||
"66410FD7C2",
|
||||
"pmovmskb %xmm10, %eax",
|
||||
));
|
||||
insns.push((
|
||||
Inst::xmm_to_gpr(SseOpcode::Movmskps, xmm2, w_rax, OperandSize::Size32),
|
||||
"0F50C2",
|
||||
"movmskps %xmm2, %eax",
|
||||
));
|
||||
insns.push((
|
||||
Inst::xmm_to_gpr(SseOpcode::Movmskpd, xmm0, w_rcx, OperandSize::Size32),
|
||||
"660F50C8",
|
||||
"movmskpd %xmm0, %ecx",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::gpr_to_xmm(
|
||||
SseOpcode::Movd,
|
||||
|
||||
@@ -3657,6 +3657,58 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
||||
ctx.emit(Inst::setcc(CC::Z, dst));
|
||||
}
|
||||
|
||||
Opcode::VhighBits => {
|
||||
let src = put_input_in_reg(ctx, inputs[0]);
|
||||
let src_ty = ctx.input_ty(insn, 0);
|
||||
debug_assert!(src_ty.is_vector() && src_ty.bits() == 128);
|
||||
let dst = get_output_reg(ctx, outputs[0]);
|
||||
debug_assert!(dst.to_reg().get_class() == RegClass::I64);
|
||||
|
||||
// The Intel specification allows using both 32-bit and 64-bit GPRs as destination for
|
||||
// the "move mask" instructions. REX.R only extends the reachable registers: "In 64-bit mode,
|
||||
// the instruction can access additional registers when used with a REX.R prefix. The
|
||||
// default operand size is 64-bit in 64-bit mode" (PMOVMSKB in IA Software Development
|
||||
// Manual, vol. 2). This being the case, we will always clear REX.W since its use is
|
||||
// unnecessary (`OperandSize` is used for setting/clearing REX.W).
|
||||
let size = OperandSize::Size32;
|
||||
|
||||
match src_ty {
|
||||
types::I8X16 | types::B8X16 => {
|
||||
ctx.emit(Inst::xmm_to_gpr(SseOpcode::Pmovmskb, src, dst, size))
|
||||
}
|
||||
types::I32X4 | types::B32X4 | types::F32X4 => {
|
||||
ctx.emit(Inst::xmm_to_gpr(SseOpcode::Movmskps, src, dst, size))
|
||||
}
|
||||
types::I64X2 | types::B64X2 | types::F64X2 => {
|
||||
ctx.emit(Inst::xmm_to_gpr(SseOpcode::Movmskpd, src, dst, size))
|
||||
}
|
||||
types::I16X8 | types::B16X8 => {
|
||||
// There is no x86 instruction for extracting the high bit of 16-bit lanes so
|
||||
// here we:
|
||||
// - duplicate the 16-bit lanes of `src` into 8-bit lanes:
|
||||
// PACKSSWB([x1, x2, ...], [x1, x2, ...]) = [x1', x2', ..., x1', x2', ...]
|
||||
// - use PMOVMSKB to gather the high bits; now we have duplicates, though
|
||||
// - shift away the bottom 8 high bits to remove the duplicates.
|
||||
let tmp = ctx.alloc_tmp(RegClass::V128, src_ty);
|
||||
ctx.emit(Inst::gen_move(tmp, src, src_ty));
|
||||
ctx.emit(Inst::xmm_rm_r(
|
||||
SseOpcode::Packsswb,
|
||||
RegMem::reg(src),
|
||||
tmp,
|
||||
None,
|
||||
));
|
||||
ctx.emit(Inst::xmm_to_gpr(
|
||||
SseOpcode::Pmovmskb,
|
||||
tmp.to_reg(),
|
||||
dst,
|
||||
size,
|
||||
));
|
||||
ctx.emit(Inst::shift_r(8, ShiftKind::ShiftRightLogical, Some(8), dst));
|
||||
}
|
||||
_ => unimplemented!("unknown input type {} for {}", src_ty, op),
|
||||
}
|
||||
}
|
||||
|
||||
Opcode::IaddImm
|
||||
| Opcode::ImulImm
|
||||
| Opcode::UdivImm
|
||||
|
||||
Reference in New Issue
Block a user