x64: implement vselect with variable blend instructions

This change implements `vselect` using SSE4.1's `BLENDVPS`, `BLENDVPD`, and `PBLENDVB`. `vselect` is a lane-selecting instruction that is used by [simple_preopt.rs](fa1faf5d22/cranelift/codegen/src/simple_preopt.rs#L947-L999) to lower `bitselect` to a single x86 instruction when the condition mask is known to be boolean (all 1s or all 0s, e.g., from a conversion). This is better than the general `bitselect` lowering, which takes 4-5 instructions. The old backend had the `vselect` lowering; this simply introduces it to the new backend.
2021-05-13 20:04:40 -07:00
parent 0742bb4699
commit 7ef3ae2903
7 changed files with 93 additions and 2 deletions
--- a/cranelift/codegen/src/isa/x64/inst/args.rs
+++ b/cranelift/codegen/src/isa/x64/inst/args.rs
@@ -478,6 +478,7 @@ pub enum SseOpcode {
    Andnps,
    Andnpd,
    Blendvpd,
+    Blendvps,
    Comiss,
    Comisd,
    Cmpps,
@@ -547,6 +548,7 @@ pub enum SseOpcode {
    Pandn,
    Pavgb,
    Pavgw,
+    Pblendvb,
    Pcmpeqb,
    Pcmpeqw,
    Pcmpeqd,
@@ -769,8 +771,10 @@ impl SseOpcode {
            | SseOpcode::Pshufb => SSSE3,

            SseOpcode::Blendvpd
+            | SseOpcode::Blendvps
            | SseOpcode::Insertps
            | SseOpcode::Packusdw
+            | SseOpcode::Pblendvb
            | SseOpcode::Pcmpeqq
            | SseOpcode::Pextrb
            | SseOpcode::Pextrd
@@ -828,6 +832,7 @@ impl fmt::Debug for SseOpcode {
            SseOpcode::Andnps => "andnps",
            SseOpcode::Andnpd => "andnpd",
            SseOpcode::Blendvpd => "blendvpd",
+            SseOpcode::Blendvps => "blendvps",
            SseOpcode::Cmpps => "cmpps",
            SseOpcode::Cmppd => "cmppd",
            SseOpcode::Cmpss => "cmpss",
@@ -897,6 +902,7 @@ impl fmt::Debug for SseOpcode {
            SseOpcode::Pandn => "pandn",
            SseOpcode::Pavgb => "pavgb",
            SseOpcode::Pavgw => "pavgw",
+            SseOpcode::Pblendvb => "pblendvb",
            SseOpcode::Pcmpeqb => "pcmpeqb",
            SseOpcode::Pcmpeqw => "pcmpeqw",
            SseOpcode::Pcmpeqd => "pcmpeqd",
--- a/cranelift/codegen/src/isa/x64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -1441,6 +1441,7 @@ pub(crate) fn emit(
                SseOpcode::Andpd => (LegacyPrefixes::_66, 0x0F54, 2),
                SseOpcode::Andnps => (LegacyPrefixes::None, 0x0F55, 2),
                SseOpcode::Andnpd => (LegacyPrefixes::_66, 0x0F55, 2),
+                SseOpcode::Blendvps => (LegacyPrefixes::_66, 0x0F3814, 3),
                SseOpcode::Blendvpd => (LegacyPrefixes::_66, 0x0F3815, 3),
                SseOpcode::Cvttps2dq => (LegacyPrefixes::_F3, 0x0F5B, 2),
                SseOpcode::Cvtdq2ps => (LegacyPrefixes::None, 0x0F5B, 2),
@@ -1480,6 +1481,7 @@ pub(crate) fn emit(
                SseOpcode::Pandn => (LegacyPrefixes::_66, 0x0FDF, 2),
                SseOpcode::Pavgb => (LegacyPrefixes::_66, 0x0FE0, 2),
                SseOpcode::Pavgw => (LegacyPrefixes::_66, 0x0FE3, 2),
+                SseOpcode::Pblendvb => (LegacyPrefixes::_66, 0x0F3810, 3),
                SseOpcode::Pcmpeqb => (LegacyPrefixes::_66, 0x0F74, 2),
                SseOpcode::Pcmpeqw => (LegacyPrefixes::_66, 0x0F75, 2),
                SseOpcode::Pcmpeqd => (LegacyPrefixes::_66, 0x0F76, 2),
--- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
@@ -3432,6 +3432,18 @@ fn test_x64_emit() {
        "blendvpd %xmm15, %xmm4",
    ));

+    insns.push((
+        Inst::xmm_rm_r(SseOpcode::Blendvps, RegMem::reg(xmm2), w_xmm3),
+        "660F3814DA",
+        "blendvps %xmm2, %xmm3",
+    ));
+
+    insns.push((
+        Inst::xmm_rm_r(SseOpcode::Pblendvb, RegMem::reg(xmm12), w_xmm13),
+        "66450F3810EC",
+        "pblendvb %xmm12, %xmm13",
+    ));
+
    // ========================================================
    // XMM_RM_R: Integer Packed

--- a/cranelift/codegen/src/isa/x64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/x64/inst/mod.rs
@@ -1927,13 +1927,20 @@ fn x64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
            src.get_regs_as_uses(collector);
            collector.add_def(*dst);
        }
-        Inst::XmmRmR { src, dst, .. } => {
+        Inst::XmmRmR { src, dst, op, .. } => {
            if inst.produces_const() {
                // No need to account for src, since src == dst.
                collector.add_def(*dst);
            } else {
                src.get_regs_as_uses(collector);
                collector.add_mod(*dst);
+                // Some instructions have an implicit use of XMM0.
+                if *op == SseOpcode::Blendvpd
+                    || *op == SseOpcode::Blendvps
+                    || *op == SseOpcode::Pblendvb
+                {
+                    collector.add_use(regs::xmm0());
+                }
            }
        }
        Inst::XmmRmREvex {
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -2029,7 +2029,50 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                ctx.emit(Inst::gen_move(dst, tmp2.to_reg(), ty));
                ctx.emit(Inst::or(ty, RegMem::from(tmp1), dst));
            } else {
-                unimplemented!("scalar bitselect")
+                unimplemented!("no lowering for scalar bitselect instruction")
+            }
+        }
+
+        Opcode::Vselect => {
+            let ty = ty.unwrap();
+            let condition = put_input_in_reg(ctx, inputs[0]);
+            let condition_ty = ctx.input_ty(insn, 0);
+            let if_true = input_to_reg_mem(ctx, inputs[1]);
+            let if_false = put_input_in_reg(ctx, inputs[2]);
+            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
+
+            if ty.is_vector() {
+                // `vselect` relies on the bit representation of the condition:
+                // vector boolean types are defined in Cranelift to be all 1s or
+                // all 0s. This lowering relies on that fact to use x86's
+                // variable blend instructions, which look at the _high bit_ of
+                // the condition mask. All the bits of vector booleans will
+                // match (all 1s or all 0s), so we can just use the high bit.
+                assert!(condition_ty.lane_type().is_bool());
+
+                // Variable blend instructions expect the condition mask to be
+                // in XMM0.
+                let xmm0 = Writable::from_reg(regs::xmm0());
+                ctx.emit(Inst::gen_move(xmm0, condition, ty));
+
+                // Match up the source and destination registers for regalloc.
+                ctx.emit(Inst::gen_move(dst, if_false, ty));
+
+                // Technically PBLENDVB would work in all cases: every byte of
+                // a boolean lane is all 1s or all 0s, so blending byte-by-byte
+                // gives the same result as blending lane-by-lane. The
+                // type-specialized versions are used here for clarity when
+                // troubleshooting and due to slight improvements in
+                // latency/throughput on certain processor families.
+                let opcode = match condition_ty {
+                    types::B64X2 => SseOpcode::Blendvpd,
+                    types::B32X4 => SseOpcode::Blendvps,
+                    types::B16X8 | types::B8X16 => SseOpcode::Pblendvb,
+                    _ => unimplemented!("unable lower vselect for type: {}", condition_ty),
+                };
+                ctx.emit(Inst::xmm_rm_r(opcode, if_true, dst));
+            } else {
+                unimplemented!("no lowering for scalar vselect instruction")
            }
        }

--- a/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif
+++ b/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif
@@ -15,6 +15,16 @@ block0:
 ; nextln: por     %xmm1, %xmm0
 ; not:    movdqa

+function %vselect_i16x8() -> i16x8 {
+block0:
+    v0 = vconst.b16x8 [false true false true false true false true]
+    v1 = vconst.i16x8 [0 0 0 0 0 0 0 0]
+    v2 = vconst.i16x8 [0 0 0 0 0 0 0 0]
+    v3 = vselect v0, v1, v2
+    return v3
+}
+; check:  pblendvb %xmm1, %xmm2
+


 ; 8x16 shifts: these lower to complex sequences of instructions
--- a/cranelift/filetests/filetests/isa/x64/simd-bitwise-run.clif
+++ b/cranelift/filetests/filetests/isa/x64/simd-bitwise-run.clif
@@ -10,6 +10,17 @@ block0(v0: i8x16, v1: i8x16, v2: i8x16):
 ; Remember that bitselect accepts: 1) the selector vector, 2) the "if true" vector, and 3) the "if false" vector.
 ; run: %bitselect_i8x16([0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 255], [127 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42], [42 0 0 0 0 0 0 0 0 0 0 0 0 0 0 127]) == [42 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42]

+function %vselect_i32x4(i32x4, i32x4) -> i32x4 {
+block0(v1: i32x4, v2: i32x4):
+    ; `make_trampoline` still does not know how to convert boolean vector types
+    ; so we load the value directly here.
+    v0 = vconst.b32x4 [true true false false]
+    v3 = vselect v0, v1, v2
+    return v3
+}
+; Remember that vselect accepts: 1) the selector vector, 2) the "if true" vector, and 3) the "if false" vector.
+; run: %vselect_i32x4([1 2 -1 -1], [-1 -1 3 4]) == [1 2 3 4]
+


 ; shift left