x64: implement vselect with variable blend instructions

This change implements `vselect` using SSE4.1's `BLENDVPS`, `BLENDVPD`, and `PBLENDVB`. `vselect` is a lane-selecting instruction that is used by [simple_preopt.rs](fa1faf5d22/cranelift/codegen/src/simple_preopt.rs#L947-L999) to lower `bitselect` to a single x86 instruction when the condition mask is known to be boolean (all 1s or all 0s, e.g., from a conversion). This is better than `bitselect` in general, which lowers to 4-5 instructions. The old backend had the `vselect` lowering; this simply introduces it to the new backend.
2021-05-13 20:04:40 -07:00
parent 0742bb4699
commit 7ef3ae2903
7 changed files with 93 additions and 2 deletions
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -2029,7 +2029,50 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                ctx.emit(Inst::gen_move(dst, tmp2.to_reg(), ty));
                ctx.emit(Inst::or(ty, RegMem::from(tmp1), dst));
            } else {
-                unimplemented!("scalar bitselect")
+                unimplemented!("no lowering for scalar bitselect instruction")
+            }
+        }
+
+        Opcode::Vselect => {
+            let ty = ty.unwrap();
+            let condition = put_input_in_reg(ctx, inputs[0]);
+            let condition_ty = ctx.input_ty(insn, 0);
+            let if_true = input_to_reg_mem(ctx, inputs[1]);
+            let if_false = put_input_in_reg(ctx, inputs[2]);
+            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
+
+            if ty.is_vector() {
+                // `vselect` relies on the bit representation of the condition:
+                // vector boolean types are defined in Cranelift to be all 1s or
+                // all 0s. This lowering relies on that fact to use x86's
+                // variable blend instructions, which look at the _high bit_ of
+                // the condition mask. All the bits of vector booleans will
+                // match (all 1s or all 0s), so we can just use the high bit.
+                assert!(condition_ty.lane_type().is_bool());
+
+                // Variable blend instructions expect the condition mask to be
+                // in XMM0.
+                let xmm0 = Writable::from_reg(regs::xmm0());
+                ctx.emit(Inst::gen_move(xmm0, condition, ty));
+
+                // Match up the source and destination registers for regalloc.
+                ctx.emit(Inst::gen_move(dst, if_false, ty));
+
+                // Technically PBLENDVB would work in all cases: every byte of
+                // the mask is all 1s or all 0s, so blending byte-by-byte gives
+                // the same result as blending by wider lanes (e.g.,
+                // word-by-word). The type-specialized versions are used here
+                // anyway, for clarity when troubleshooting and for slight
+                // improvements in latency/throughput on certain processor families.
+                let opcode = match condition_ty {
+                    types::B64X2 => SseOpcode::Blendvpd,
+                    types::B32X4 => SseOpcode::Blendvps,
+                    types::B16X8 | types::B8X16 => SseOpcode::Pblendvb,
+                    _ => unimplemented!("unable lower vselect for type: {}", condition_ty),
+                };
+                ctx.emit(Inst::xmm_rm_r(opcode, if_true, dst));
+            } else {
+                unimplemented!("no lowering for scalar vselect instruction")
            }
        }