Improve bitselect codegen with knowledge of operand origin (#1783)

* Encode vselect using BLEND instructions on x86 * Legalize vselect to bitselect * Optimize bitselect to vselect for some operands * Add run tests for bitselect-vselect optimization * Address review feedback
2020-05-30 04:53:11 +02:00
parent 16afca4451
commit e430984ac4
11 changed files with 341 additions and 1 deletions
--- a/cranelift/codegen/meta/src/isa/x86/encodings.rs
+++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs
@@ -1634,6 +1634,7 @@ fn define_simd(
    let ushr_imm = shared.by_name("ushr_imm");
    let usub_sat = shared.by_name("usub_sat");
    let vconst = shared.by_name("vconst");
+    let vselect = shared.by_name("vselect");
    let x86_insertps = x86.by_name("x86_insertps");
    let x86_movlhps = x86.by_name("x86_movlhps");
    let x86_movsd = x86.by_name("x86_movsd");
@@ -1654,6 +1655,7 @@ fn define_simd(
    let x86_punpckl = x86.by_name("x86_punpckl");

    // Shorthands for recipes.
+    let rec_blend = r.template("blend");
    let rec_evex_reg_vvvv_rm_128 = r.template("evex_reg_vvvv_rm_128");
    let rec_f_ib = r.template("f_ib");
    let rec_fa = r.template("fa");
@@ -1723,6 +1725,20 @@ fn define_simd(
        e.enc_both_inferred(instruction, template);
    }

+    // SIMD vselect; controlling value of vselect is a boolean vector, so each lane should be
+    // either all ones or all zeroes - it makes it possible to always use 8-bit PBLENDVB;
+    // for 32/64-bit lanes we can also use BLENDVPS and BLENDVPD
+    for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
+        let opcode = match ty.lane_bits() {
+            32 => &BLENDVPS,
+            64 => &BLENDVPD,
+            _ => &PBLENDVB,
+        };
+        let instruction = vselect.bind(vector(ty, sse_vector_size));
+        let template = rec_blend.opcodes(opcode);
+        e.enc_both_inferred_maybe_isap(instruction, template, Some(use_sse41_simd));
+    }
+
    // SIMD scalar_to_vector; this uses MOV to copy the scalar value to an XMM register; according
    // to the Intel manual: "When the destination operand is an XMM register, the source operand is
    // written to the low doubleword of the register and the register is zero-extended to 128 bits."