[machinst x64]: implement bitmask

2020-10-28 13:18:20 -07:00
parent 5b9a21e099
commit 6725b6b129
5 changed files with 100 additions and 4 deletions
--- a/build.rs
+++ b/build.rs
@@ -182,6 +182,7 @@ fn experimental_x64_should_panic(testsuite: &str, testname: &str, strategy: &str
    match (testsuite, testname) {
        ("simd", "simd_address") => return false,
        ("simd", "simd_bitwise") => return false,
+        ("simd", "simd_boolean") => return false,
        ("simd", "simd_const") => return false,
        ("simd", "simd_i8x16_arith") => return false,
        ("simd", "simd_i8x16_arith2") => return false,
@@ -229,9 +230,14 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
                return env::var("CARGO_CFG_TARGET_ARCH").unwrap() != "x86_64";
            }

+            // These are only implemented on aarch64 and x64.
+            ("simd", "simd_boolean") => {
+                return !(cfg!(feature = "experimental_x64")
+                    || env::var("CARGO_CFG_TARGET_ARCH").unwrap() == "aarch64")
+            }
+
            // These are only implemented on aarch64.
-            ("simd", "simd_boolean")
-            | ("simd", "simd_f32x4_pmin_pmax")
+            ("simd", "simd_f32x4_pmin_pmax")
            | ("simd", "simd_f32x4_rounding")
            | ("simd", "simd_f64x2_pmin_pmax")
            | ("simd", "simd_f64x2_rounding") => {
--- a/cranelift/codegen/src/isa/x64/inst/args.rs
+++ b/cranelift/codegen/src/isa/x64/inst/args.rs
@@ -393,6 +393,8 @@ pub enum SseOpcode {
    Movdqa,
    Movdqu,
    Movlhps,
+    Movmskps,
+    Movmskpd,
    Movq,
    Movss,
    Movsd,
@@ -407,6 +409,7 @@ pub enum SseOpcode {
    Pabsb,
    Pabsw,
    Pabsd,
+    Packsswb,
    Paddb,
    Paddd,
    Paddq,
@@ -445,6 +448,7 @@ pub enum SseOpcode {
    Pminub,
    Pminuw,
    Pminud,
+    Pmovmskb,
    Pmulld,
    Pmullw,
    Pmuludq,
@@ -510,6 +514,7 @@ impl SseOpcode {
            | SseOpcode::Minss
            | SseOpcode::Movaps
            | SseOpcode::Movlhps
+            | SseOpcode::Movmskps
            | SseOpcode::Movss
            | SseOpcode::Movups
            | SseOpcode::Mulps
@@ -546,6 +551,7 @@ impl SseOpcode {
            | SseOpcode::Minsd
            | SseOpcode::Movapd
            | SseOpcode::Movd
+            | SseOpcode::Movmskpd
            | SseOpcode::Movq
            | SseOpcode::Movsd
            | SseOpcode::Movupd
@@ -554,6 +560,7 @@ impl SseOpcode {
            | SseOpcode::Mulpd
            | SseOpcode::Mulsd
            | SseOpcode::Orpd
+            | SseOpcode::Packsswb
            | SseOpcode::Paddb
            | SseOpcode::Paddd
            | SseOpcode::Paddq
@@ -578,6 +585,7 @@ impl SseOpcode {
            | SseOpcode::Pmaxub
            | SseOpcode::Pminsw
            | SseOpcode::Pminub
+            | SseOpcode::Pmovmskb
            | SseOpcode::Pmullw
            | SseOpcode::Pmuludq
            | SseOpcode::Por
@@ -686,6 +694,8 @@ impl fmt::Debug for SseOpcode {
            SseOpcode::Movdqa => "movdqa",
            SseOpcode::Movdqu => "movdqu",
            SseOpcode::Movlhps => "movlhps",
+            SseOpcode::Movmskps => "movmskps",
+            SseOpcode::Movmskpd => "movmskpd",
            SseOpcode::Movq => "movq",
            SseOpcode::Movss => "movss",
            SseOpcode::Movsd => "movsd",
@@ -700,6 +710,7 @@ impl fmt::Debug for SseOpcode {
            SseOpcode::Pabsb => "pabsb",
            SseOpcode::Pabsw => "pabsw",
            SseOpcode::Pabsd => "pabsd",
+            SseOpcode::Packsswb => "packsswb",
            SseOpcode::Paddb => "paddb",
            SseOpcode::Paddd => "paddd",
            SseOpcode::Paddq => "paddq",
@@ -738,6 +749,7 @@ impl fmt::Debug for SseOpcode {
            SseOpcode::Pminub => "pminub",
            SseOpcode::Pminuw => "pminuw",
            SseOpcode::Pminud => "pminud",
+            SseOpcode::Pmovmskb => "pmovmskb",
            SseOpcode::Pmulld => "pmulld",
            SseOpcode::Pmullw => "pmullw",
            SseOpcode::Pmuludq => "pmuludq",
--- a/cranelift/codegen/src/isa/x64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -1762,6 +1762,7 @@ pub(crate) fn emit(
                SseOpcode::Mulsd => (LegacyPrefixes::_F2, 0x0F59, 2),
                SseOpcode::Orpd => (LegacyPrefixes::_66, 0x0F56, 2),
                SseOpcode::Orps => (LegacyPrefixes::None, 0x0F56, 2),
+                SseOpcode::Packsswb => (LegacyPrefixes::_66, 0x0F63, 2),
                SseOpcode::Paddb => (LegacyPrefixes::_66, 0x0FFC, 2),
                SseOpcode::Paddd => (LegacyPrefixes::_66, 0x0FFE, 2),
                SseOpcode::Paddq => (LegacyPrefixes::_66, 0x0FD4, 2),
@@ -2040,11 +2041,14 @@ pub(crate) fn emit(
            dst_size,
        } => {
            let (prefix, opcode, dst_first) = match op {
+                SseOpcode::Cvttss2si => (LegacyPrefixes::_F3, 0x0F2C, true),
+                SseOpcode::Cvttsd2si => (LegacyPrefixes::_F2, 0x0F2C, true),
                // Movd and movq use the same opcode; the presence of the REX prefix (set below)
                // actually determines which is used.
                SseOpcode::Movd | SseOpcode::Movq => (LegacyPrefixes::_66, 0x0F7E, false),
-                SseOpcode::Cvttss2si => (LegacyPrefixes::_F3, 0x0F2C, true),
-                SseOpcode::Cvttsd2si => (LegacyPrefixes::_F2, 0x0F2C, true),
+                SseOpcode::Movmskps => (LegacyPrefixes::None, 0x0F50, true),
+                SseOpcode::Movmskpd => (LegacyPrefixes::_66, 0x0F50, true),
+                SseOpcode::Pmovmskb => (LegacyPrefixes::_66, 0x0FD7, true),
                _ => panic!("unexpected opcode {:?}", op),
            };
            let rex = match dst_size {
--- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
@@ -3292,6 +3292,12 @@ fn test_x64_emit() {
        "pshufb  %xmm11, %xmm2",
    ));

+    insns.push((
+        Inst::xmm_rm_r(SseOpcode::Packsswb, RegMem::reg(xmm11), w_xmm2, None),
+        "66410F63D3",
+        "packsswb %xmm11, %xmm2",
+    ));
+
    // ========================================================
    // XMM_RM_R: Integer Conversion
    insns.push((
@@ -3422,6 +3428,22 @@ fn test_x64_emit() {
        "cvttsd2si %xmm0, %r15",
    ));

+    insns.push((
+        Inst::xmm_to_gpr(SseOpcode::Pmovmskb, xmm10, w_rax, OperandSize::Size32),
+        "66410FD7C2",
+        "pmovmskb %xmm10, %eax",
+    ));
+    insns.push((
+        Inst::xmm_to_gpr(SseOpcode::Movmskps, xmm2, w_rax, OperandSize::Size32),
+        "0F50C2",
+        "movmskps %xmm2, %eax",
+    ));
+    insns.push((
+        Inst::xmm_to_gpr(SseOpcode::Movmskpd, xmm0, w_rcx, OperandSize::Size32),
+        "660F50C8",
+        "movmskpd %xmm0, %ecx",
+    ));
+
    insns.push((
        Inst::gpr_to_xmm(
            SseOpcode::Movd,
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -3657,6 +3657,58 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
            ctx.emit(Inst::setcc(CC::Z, dst));
        }

+        Opcode::VhighBits => {
+            let src = put_input_in_reg(ctx, inputs[0]);
+            let src_ty = ctx.input_ty(insn, 0);
+            debug_assert!(src_ty.is_vector() && src_ty.bits() == 128);
+            let dst = get_output_reg(ctx, outputs[0]);
+            debug_assert!(dst.to_reg().get_class() == RegClass::I64);
+
+            // The Intel specification allows using both 32-bit and 64-bit GPRs as destination for
+            // the "move mask" instructions; REX.R only extends the register encoding: "In 64-bit
+            // mode, the instruction can access additional registers when used with a REX.R prefix.
+            // The default operand size is 64-bit in 64-bit mode" (PMOVMSKB in IA Software
+            // Development Manual, vol. 2). This being the case, we will always clear REX.W since
+            // its use is unnecessary (`OperandSize` is used for setting/clearing REX.W).
+            let size = OperandSize::Size32;
+
+            match src_ty {
+                types::I8X16 | types::B8X16 => {
+                    ctx.emit(Inst::xmm_to_gpr(SseOpcode::Pmovmskb, src, dst, size))
+                }
+                types::I32X4 | types::B32X4 | types::F32X4 => {
+                    ctx.emit(Inst::xmm_to_gpr(SseOpcode::Movmskps, src, dst, size))
+                }
+                types::I64X2 | types::B64X2 | types::F64X2 => {
+                    ctx.emit(Inst::xmm_to_gpr(SseOpcode::Movmskpd, src, dst, size))
+                }
+                types::I16X8 | types::B16X8 => {
+                    // There is no x86 instruction for extracting the high bit of 16-bit lanes so
+                    // here we:
+                    // - duplicate the 16-bit lanes of `src` into 8-bit lanes:
+                    //     PACKSSWB([x1, x2, ...], [x1, x2, ...]) = [x1', x2', ..., x1', x2', ...]
+                    // - use PMOVMSKB to gather the high bits; now we have duplicates, though
+                    // - shift away the bottom 8 high bits to remove the duplicates.
+                    let tmp = ctx.alloc_tmp(RegClass::V128, src_ty);
+                    ctx.emit(Inst::gen_move(tmp, src, src_ty));
+                    ctx.emit(Inst::xmm_rm_r(
+                        SseOpcode::Packsswb,
+                        RegMem::reg(src),
+                        tmp,
+                        None,
+                    ));
+                    ctx.emit(Inst::xmm_to_gpr(
+                        SseOpcode::Pmovmskb,
+                        tmp.to_reg(),
+                        dst,
+                        size,
+                    ));
+                    ctx.emit(Inst::shift_r(8, ShiftKind::ShiftRightLogical, Some(8), dst));
+                }
+                _ => unimplemented!("unknown input type {} for {}", src_ty, op),
+            }
+        }
+
        Opcode::IaddImm
        | Opcode::ImulImm
        | Opcode::UdivImm