[machinst x64]: implement bitmask

This commit is contained in:
Andrew Brown
2020-10-28 13:18:20 -07:00
parent 5b9a21e099
commit 6725b6b129
5 changed files with 100 additions and 4 deletions

View File

@@ -182,6 +182,7 @@ fn experimental_x64_should_panic(testsuite: &str, testname: &str, strategy: &str
match (testsuite, testname) {
("simd", "simd_address") => return false,
("simd", "simd_bitwise") => return false,
("simd", "simd_boolean") => return false,
("simd", "simd_const") => return false,
("simd", "simd_i8x16_arith") => return false,
("simd", "simd_i8x16_arith2") => return false,
@@ -229,9 +230,14 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
return env::var("CARGO_CFG_TARGET_ARCH").unwrap() != "x86_64";
}
// These are only implemented on aarch64 and x64.
("simd", "simd_boolean") => {
return !(cfg!(feature = "experimental_x64")
|| env::var("CARGO_CFG_TARGET_ARCH").unwrap() == "aarch64")
}
// These are only implemented on aarch64.
("simd", "simd_boolean")
| ("simd", "simd_f32x4_pmin_pmax")
("simd", "simd_f32x4_pmin_pmax")
| ("simd", "simd_f32x4_rounding")
| ("simd", "simd_f64x2_pmin_pmax")
| ("simd", "simd_f64x2_rounding") => {

View File

@@ -393,6 +393,8 @@ pub enum SseOpcode {
Movdqa,
Movdqu,
Movlhps,
Movmskps,
Movmskpd,
Movq,
Movss,
Movsd,
@@ -407,6 +409,7 @@ pub enum SseOpcode {
Pabsb,
Pabsw,
Pabsd,
Packsswb,
Paddb,
Paddd,
Paddq,
@@ -445,6 +448,7 @@ pub enum SseOpcode {
Pminub,
Pminuw,
Pminud,
Pmovmskb,
Pmulld,
Pmullw,
Pmuludq,
@@ -510,6 +514,7 @@ impl SseOpcode {
| SseOpcode::Minss
| SseOpcode::Movaps
| SseOpcode::Movlhps
| SseOpcode::Movmskps
| SseOpcode::Movss
| SseOpcode::Movups
| SseOpcode::Mulps
@@ -546,6 +551,7 @@ impl SseOpcode {
| SseOpcode::Minsd
| SseOpcode::Movapd
| SseOpcode::Movd
| SseOpcode::Movmskpd
| SseOpcode::Movq
| SseOpcode::Movsd
| SseOpcode::Movupd
@@ -554,6 +560,7 @@ impl SseOpcode {
| SseOpcode::Mulpd
| SseOpcode::Mulsd
| SseOpcode::Orpd
| SseOpcode::Packsswb
| SseOpcode::Paddb
| SseOpcode::Paddd
| SseOpcode::Paddq
@@ -578,6 +585,7 @@ impl SseOpcode {
| SseOpcode::Pmaxub
| SseOpcode::Pminsw
| SseOpcode::Pminub
| SseOpcode::Pmovmskb
| SseOpcode::Pmullw
| SseOpcode::Pmuludq
| SseOpcode::Por
@@ -686,6 +694,8 @@ impl fmt::Debug for SseOpcode {
SseOpcode::Movdqa => "movdqa",
SseOpcode::Movdqu => "movdqu",
SseOpcode::Movlhps => "movlhps",
SseOpcode::Movmskps => "movmskps",
SseOpcode::Movmskpd => "movmskpd",
SseOpcode::Movq => "movq",
SseOpcode::Movss => "movss",
SseOpcode::Movsd => "movsd",
@@ -700,6 +710,7 @@ impl fmt::Debug for SseOpcode {
SseOpcode::Pabsb => "pabsb",
SseOpcode::Pabsw => "pabsw",
SseOpcode::Pabsd => "pabsd",
SseOpcode::Packsswb => "packsswb",
SseOpcode::Paddb => "paddb",
SseOpcode::Paddd => "paddd",
SseOpcode::Paddq => "paddq",
@@ -738,6 +749,7 @@ impl fmt::Debug for SseOpcode {
SseOpcode::Pminub => "pminub",
SseOpcode::Pminuw => "pminuw",
SseOpcode::Pminud => "pminud",
SseOpcode::Pmovmskb => "pmovmskb",
SseOpcode::Pmulld => "pmulld",
SseOpcode::Pmullw => "pmullw",
SseOpcode::Pmuludq => "pmuludq",

View File

@@ -1762,6 +1762,7 @@ pub(crate) fn emit(
SseOpcode::Mulsd => (LegacyPrefixes::_F2, 0x0F59, 2),
SseOpcode::Orpd => (LegacyPrefixes::_66, 0x0F56, 2),
SseOpcode::Orps => (LegacyPrefixes::None, 0x0F56, 2),
SseOpcode::Packsswb => (LegacyPrefixes::_66, 0x0F63, 2),
SseOpcode::Paddb => (LegacyPrefixes::_66, 0x0FFC, 2),
SseOpcode::Paddd => (LegacyPrefixes::_66, 0x0FFE, 2),
SseOpcode::Paddq => (LegacyPrefixes::_66, 0x0FD4, 2),
@@ -2040,11 +2041,14 @@ pub(crate) fn emit(
dst_size,
} => {
let (prefix, opcode, dst_first) = match op {
SseOpcode::Cvttss2si => (LegacyPrefixes::_F3, 0x0F2C, true),
SseOpcode::Cvttsd2si => (LegacyPrefixes::_F2, 0x0F2C, true),
// Movd and movq use the same opcode; the presence of the REX prefix (set below)
// actually determines which is used.
SseOpcode::Movd | SseOpcode::Movq => (LegacyPrefixes::_66, 0x0F7E, false),
SseOpcode::Cvttss2si => (LegacyPrefixes::_F3, 0x0F2C, true),
SseOpcode::Cvttsd2si => (LegacyPrefixes::_F2, 0x0F2C, true),
SseOpcode::Movmskps => (LegacyPrefixes::None, 0x0F50, true),
SseOpcode::Movmskpd => (LegacyPrefixes::_66, 0x0F50, true),
SseOpcode::Pmovmskb => (LegacyPrefixes::_66, 0x0FD7, true),
_ => panic!("unexpected opcode {:?}", op),
};
let rex = match dst_size {

View File

@@ -3292,6 +3292,12 @@ fn test_x64_emit() {
"pshufb %xmm11, %xmm2",
));
insns.push((
Inst::xmm_rm_r(SseOpcode::Packsswb, RegMem::reg(xmm11), w_xmm2, None),
"66410F63D3",
"packsswb %xmm11, %xmm2",
));
// ========================================================
// XMM_RM_R: Integer Conversion
insns.push((
@@ -3422,6 +3428,22 @@ fn test_x64_emit() {
"cvttsd2si %xmm0, %r15",
));
insns.push((
Inst::xmm_to_gpr(SseOpcode::Pmovmskb, xmm10, w_rax, OperandSize::Size32),
"66410FD7C2",
"pmovmskb %xmm10, %eax",
));
insns.push((
Inst::xmm_to_gpr(SseOpcode::Movmskps, xmm2, w_rax, OperandSize::Size32),
"0F50C2",
"movmskps %xmm2, %eax",
));
insns.push((
Inst::xmm_to_gpr(SseOpcode::Movmskpd, xmm0, w_rcx, OperandSize::Size32),
"660F50C8",
"movmskpd %xmm0, %ecx",
));
insns.push((
Inst::gpr_to_xmm(
SseOpcode::Movd,

View File

@@ -3657,6 +3657,58 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
ctx.emit(Inst::setcc(CC::Z, dst));
}
Opcode::VhighBits => {
let src = put_input_in_reg(ctx, inputs[0]);
let src_ty = ctx.input_ty(insn, 0);
debug_assert!(src_ty.is_vector() && src_ty.bits() == 128);
let dst = get_output_reg(ctx, outputs[0]);
debug_assert!(dst.to_reg().get_class() == RegClass::I64);
// The Intel specification allows using both 32-bit and 64-bit GPRs as destination for
// the "move mask" instructions. This is controlled by the REX.R bit: "In 64-bit mode,
// the instruction can access additional registers when used with a REX.R prefix. The
// default operand size is 64-bit in 64-bit mode" (PMOVMSKB in IA Software Development
// Manual, vol. 2). This being the case, we will always clear REX.W since its use is
// unnecessary (`OperandSize` is used for setting/clearing REX.W).
let size = OperandSize::Size32;
match src_ty {
types::I8X16 | types::B8X16 => {
ctx.emit(Inst::xmm_to_gpr(SseOpcode::Pmovmskb, src, dst, size))
}
types::I32X4 | types::B32X4 | types::F32X4 => {
ctx.emit(Inst::xmm_to_gpr(SseOpcode::Movmskps, src, dst, size))
}
types::I64X2 | types::B64X2 | types::F64X2 => {
ctx.emit(Inst::xmm_to_gpr(SseOpcode::Movmskpd, src, dst, size))
}
types::I16X8 | types::B16X8 => {
// There is no x86 instruction for extracting the high bit of 16-bit lanes so
// here we:
// - duplicate the 16-bit lanes of `src` into 8-bit lanes:
// PACKSSWB([x1, x2, ...], [x1, x2, ...]) = [x1', x2', ..., x1', x2', ...]
// - use PMOVMSKB to gather the high bits; now we have duplicates, though
// - shift away the bottom 8 high bits to remove the duplicates.
let tmp = ctx.alloc_tmp(RegClass::V128, src_ty);
ctx.emit(Inst::gen_move(tmp, src, src_ty));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Packsswb,
RegMem::reg(src),
tmp,
None,
));
ctx.emit(Inst::xmm_to_gpr(
SseOpcode::Pmovmskb,
tmp.to_reg(),
dst,
size,
));
ctx.emit(Inst::shift_r(8, ShiftKind::ShiftRightLogical, Some(8), dst));
}
_ => unimplemented!("unknown input type {} for {}", src_ty, op),
}
}
Opcode::IaddImm
| Opcode::ImulImm
| Opcode::UdivImm