[machinst x64]: implement bitmask

This commit is contained in:
Andrew Brown
2020-10-28 13:18:20 -07:00
parent 5b9a21e099
commit 6725b6b129
5 changed files with 100 additions and 4 deletions

View File

@@ -182,6 +182,7 @@ fn experimental_x64_should_panic(testsuite: &str, testname: &str, strategy: &str
match (testsuite, testname) { match (testsuite, testname) {
("simd", "simd_address") => return false, ("simd", "simd_address") => return false,
("simd", "simd_bitwise") => return false, ("simd", "simd_bitwise") => return false,
("simd", "simd_boolean") => return false,
("simd", "simd_const") => return false, ("simd", "simd_const") => return false,
("simd", "simd_i8x16_arith") => return false, ("simd", "simd_i8x16_arith") => return false,
("simd", "simd_i8x16_arith2") => return false, ("simd", "simd_i8x16_arith2") => return false,
@@ -229,9 +230,14 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
return env::var("CARGO_CFG_TARGET_ARCH").unwrap() != "x86_64"; return env::var("CARGO_CFG_TARGET_ARCH").unwrap() != "x86_64";
} }
// These are only implemented on aarch64 and x64.
("simd", "simd_boolean") => {
return !(cfg!(feature = "experimental_x64")
|| env::var("CARGO_CFG_TARGET_ARCH").unwrap() == "aarch64")
}
// These are only implemented on aarch64. // These are only implemented on aarch64.
("simd", "simd_boolean") ("simd", "simd_f32x4_pmin_pmax")
| ("simd", "simd_f32x4_pmin_pmax")
| ("simd", "simd_f32x4_rounding") | ("simd", "simd_f32x4_rounding")
| ("simd", "simd_f64x2_pmin_pmax") | ("simd", "simd_f64x2_pmin_pmax")
| ("simd", "simd_f64x2_rounding") => { | ("simd", "simd_f64x2_rounding") => {

View File

@@ -393,6 +393,8 @@ pub enum SseOpcode {
Movdqa, Movdqa,
Movdqu, Movdqu,
Movlhps, Movlhps,
Movmskps,
Movmskpd,
Movq, Movq,
Movss, Movss,
Movsd, Movsd,
@@ -407,6 +409,7 @@ pub enum SseOpcode {
Pabsb, Pabsb,
Pabsw, Pabsw,
Pabsd, Pabsd,
Packsswb,
Paddb, Paddb,
Paddd, Paddd,
Paddq, Paddq,
@@ -445,6 +448,7 @@ pub enum SseOpcode {
Pminub, Pminub,
Pminuw, Pminuw,
Pminud, Pminud,
Pmovmskb,
Pmulld, Pmulld,
Pmullw, Pmullw,
Pmuludq, Pmuludq,
@@ -510,6 +514,7 @@ impl SseOpcode {
| SseOpcode::Minss | SseOpcode::Minss
| SseOpcode::Movaps | SseOpcode::Movaps
| SseOpcode::Movlhps | SseOpcode::Movlhps
| SseOpcode::Movmskps
| SseOpcode::Movss | SseOpcode::Movss
| SseOpcode::Movups | SseOpcode::Movups
| SseOpcode::Mulps | SseOpcode::Mulps
@@ -546,6 +551,7 @@ impl SseOpcode {
| SseOpcode::Minsd | SseOpcode::Minsd
| SseOpcode::Movapd | SseOpcode::Movapd
| SseOpcode::Movd | SseOpcode::Movd
| SseOpcode::Movmskpd
| SseOpcode::Movq | SseOpcode::Movq
| SseOpcode::Movsd | SseOpcode::Movsd
| SseOpcode::Movupd | SseOpcode::Movupd
@@ -554,6 +560,7 @@ impl SseOpcode {
| SseOpcode::Mulpd | SseOpcode::Mulpd
| SseOpcode::Mulsd | SseOpcode::Mulsd
| SseOpcode::Orpd | SseOpcode::Orpd
| SseOpcode::Packsswb
| SseOpcode::Paddb | SseOpcode::Paddb
| SseOpcode::Paddd | SseOpcode::Paddd
| SseOpcode::Paddq | SseOpcode::Paddq
@@ -578,6 +585,7 @@ impl SseOpcode {
| SseOpcode::Pmaxub | SseOpcode::Pmaxub
| SseOpcode::Pminsw | SseOpcode::Pminsw
| SseOpcode::Pminub | SseOpcode::Pminub
| SseOpcode::Pmovmskb
| SseOpcode::Pmullw | SseOpcode::Pmullw
| SseOpcode::Pmuludq | SseOpcode::Pmuludq
| SseOpcode::Por | SseOpcode::Por
@@ -686,6 +694,8 @@ impl fmt::Debug for SseOpcode {
SseOpcode::Movdqa => "movdqa", SseOpcode::Movdqa => "movdqa",
SseOpcode::Movdqu => "movdqu", SseOpcode::Movdqu => "movdqu",
SseOpcode::Movlhps => "movlhps", SseOpcode::Movlhps => "movlhps",
SseOpcode::Movmskps => "movmskps",
SseOpcode::Movmskpd => "movmskpd",
SseOpcode::Movq => "movq", SseOpcode::Movq => "movq",
SseOpcode::Movss => "movss", SseOpcode::Movss => "movss",
SseOpcode::Movsd => "movsd", SseOpcode::Movsd => "movsd",
@@ -700,6 +710,7 @@ impl fmt::Debug for SseOpcode {
SseOpcode::Pabsb => "pabsb", SseOpcode::Pabsb => "pabsb",
SseOpcode::Pabsw => "pabsw", SseOpcode::Pabsw => "pabsw",
SseOpcode::Pabsd => "pabsd", SseOpcode::Pabsd => "pabsd",
SseOpcode::Packsswb => "packsswb",
SseOpcode::Paddb => "paddb", SseOpcode::Paddb => "paddb",
SseOpcode::Paddd => "paddd", SseOpcode::Paddd => "paddd",
SseOpcode::Paddq => "paddq", SseOpcode::Paddq => "paddq",
@@ -738,6 +749,7 @@ impl fmt::Debug for SseOpcode {
SseOpcode::Pminub => "pminub", SseOpcode::Pminub => "pminub",
SseOpcode::Pminuw => "pminuw", SseOpcode::Pminuw => "pminuw",
SseOpcode::Pminud => "pminud", SseOpcode::Pminud => "pminud",
SseOpcode::Pmovmskb => "pmovmskb",
SseOpcode::Pmulld => "pmulld", SseOpcode::Pmulld => "pmulld",
SseOpcode::Pmullw => "pmullw", SseOpcode::Pmullw => "pmullw",
SseOpcode::Pmuludq => "pmuludq", SseOpcode::Pmuludq => "pmuludq",

View File

@@ -1762,6 +1762,7 @@ pub(crate) fn emit(
SseOpcode::Mulsd => (LegacyPrefixes::_F2, 0x0F59, 2), SseOpcode::Mulsd => (LegacyPrefixes::_F2, 0x0F59, 2),
SseOpcode::Orpd => (LegacyPrefixes::_66, 0x0F56, 2), SseOpcode::Orpd => (LegacyPrefixes::_66, 0x0F56, 2),
SseOpcode::Orps => (LegacyPrefixes::None, 0x0F56, 2), SseOpcode::Orps => (LegacyPrefixes::None, 0x0F56, 2),
SseOpcode::Packsswb => (LegacyPrefixes::_66, 0x0F63, 2),
SseOpcode::Paddb => (LegacyPrefixes::_66, 0x0FFC, 2), SseOpcode::Paddb => (LegacyPrefixes::_66, 0x0FFC, 2),
SseOpcode::Paddd => (LegacyPrefixes::_66, 0x0FFE, 2), SseOpcode::Paddd => (LegacyPrefixes::_66, 0x0FFE, 2),
SseOpcode::Paddq => (LegacyPrefixes::_66, 0x0FD4, 2), SseOpcode::Paddq => (LegacyPrefixes::_66, 0x0FD4, 2),
@@ -2040,11 +2041,14 @@ pub(crate) fn emit(
dst_size, dst_size,
} => { } => {
let (prefix, opcode, dst_first) = match op { let (prefix, opcode, dst_first) = match op {
SseOpcode::Cvttss2si => (LegacyPrefixes::_F3, 0x0F2C, true),
SseOpcode::Cvttsd2si => (LegacyPrefixes::_F2, 0x0F2C, true),
// Movd and movq use the same opcode; the presence of the REX prefix (set below) // Movd and movq use the same opcode; the presence of the REX prefix (set below)
// actually determines which is used. // actually determines which is used.
SseOpcode::Movd | SseOpcode::Movq => (LegacyPrefixes::_66, 0x0F7E, false), SseOpcode::Movd | SseOpcode::Movq => (LegacyPrefixes::_66, 0x0F7E, false),
SseOpcode::Cvttss2si => (LegacyPrefixes::_F3, 0x0F2C, true), SseOpcode::Movmskps => (LegacyPrefixes::None, 0x0F50, true),
SseOpcode::Cvttsd2si => (LegacyPrefixes::_F2, 0x0F2C, true), SseOpcode::Movmskpd => (LegacyPrefixes::_66, 0x0F50, true),
SseOpcode::Pmovmskb => (LegacyPrefixes::_66, 0x0FD7, true),
_ => panic!("unexpected opcode {:?}", op), _ => panic!("unexpected opcode {:?}", op),
}; };
let rex = match dst_size { let rex = match dst_size {

View File

@@ -3292,6 +3292,12 @@ fn test_x64_emit() {
"pshufb %xmm11, %xmm2", "pshufb %xmm11, %xmm2",
)); ));
insns.push((
Inst::xmm_rm_r(SseOpcode::Packsswb, RegMem::reg(xmm11), w_xmm2, None),
"66410F63D3",
"packsswb %xmm11, %xmm2",
));
// ======================================================== // ========================================================
// XMM_RM_R: Integer Conversion // XMM_RM_R: Integer Conversion
insns.push(( insns.push((
@@ -3422,6 +3428,22 @@ fn test_x64_emit() {
"cvttsd2si %xmm0, %r15", "cvttsd2si %xmm0, %r15",
)); ));
insns.push((
Inst::xmm_to_gpr(SseOpcode::Pmovmskb, xmm10, w_rax, OperandSize::Size32),
"66410FD7C2",
"pmovmskb %xmm10, %eax",
));
insns.push((
Inst::xmm_to_gpr(SseOpcode::Movmskps, xmm2, w_rax, OperandSize::Size32),
"0F50C2",
"movmskps %xmm2, %eax",
));
insns.push((
Inst::xmm_to_gpr(SseOpcode::Movmskpd, xmm0, w_rcx, OperandSize::Size32),
"660F50C8",
"movmskpd %xmm0, %ecx",
));
insns.push(( insns.push((
Inst::gpr_to_xmm( Inst::gpr_to_xmm(
SseOpcode::Movd, SseOpcode::Movd,

View File

@@ -3657,6 +3657,58 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
ctx.emit(Inst::setcc(CC::Z, dst)); ctx.emit(Inst::setcc(CC::Z, dst));
} }
Opcode::VhighBits => {
let src = put_input_in_reg(ctx, inputs[0]);
let src_ty = ctx.input_ty(insn, 0);
debug_assert!(src_ty.is_vector() && src_ty.bits() == 128);
let dst = get_output_reg(ctx, outputs[0]);
debug_assert!(dst.to_reg().get_class() == RegClass::I64);
// The Intel specification allows using both 32-bit and 64-bit GPRs as destination for
// the "move mask" instructions: "In 64-bit mode, the instruction can access additional
// registers when used with a REX.R prefix. The default operand size is 64-bit in 64-bit
// mode" (PMOVMSKB in IA Software Development Manual, vol. 2). Note that REX.R only
// extends the register encoding (it gives access to r8-r15); it does not select the
// operand size. The mask always fits in the low bits of the destination, so a 64-bit
// destination is never needed: we always clear REX.W since its use is unnecessary
// (`OperandSize` is used for setting/clearing REX.W).
let size = OperandSize::Size32;
match src_ty {
types::I8X16 | types::B8X16 => {
ctx.emit(Inst::xmm_to_gpr(SseOpcode::Pmovmskb, src, dst, size))
}
types::I32X4 | types::B32X4 | types::F32X4 => {
ctx.emit(Inst::xmm_to_gpr(SseOpcode::Movmskps, src, dst, size))
}
types::I64X2 | types::B64X2 | types::F64X2 => {
ctx.emit(Inst::xmm_to_gpr(SseOpcode::Movmskpd, src, dst, size))
}
types::I16X8 | types::B16X8 => {
// There is no x86 instruction for extracting the high bit of 16-bit lanes so
// here we:
// - narrow the 16-bit lanes of `src` into 8-bit lanes, packing `src` with itself so
// that every lane appears twice (the signed saturation of PACKSSWB preserves each
// lane's high bit):
// PACKSSWB([x1, x2, ...], [x1, x2, ...]) = [x1', x2', ..., x1', x2', ...]
// - use PMOVMSKB to gather the high bits; now we have duplicates, though
// - shift the resulting mask right by 8 to drop the duplicated low 8 bits.
let tmp = ctx.alloc_tmp(RegClass::V128, src_ty);
ctx.emit(Inst::gen_move(tmp, src, src_ty));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Packsswb,
RegMem::reg(src),
tmp,
None,
));
ctx.emit(Inst::xmm_to_gpr(
SseOpcode::Pmovmskb,
tmp.to_reg(),
dst,
size,
));
ctx.emit(Inst::shift_r(8, ShiftKind::ShiftRightLogical, Some(8), dst));
}
_ => unimplemented!("unknown input type {} for {}", src_ty, op),
}
}
Opcode::IaddImm Opcode::IaddImm
| Opcode::ImulImm | Opcode::ImulImm
| Opcode::UdivImm | Opcode::UdivImm