Improve bitselect codegen with knowledge of operand origin (#1783)

* Encode vselect using BLEND instructions on x86

* Legalize vselect to bitselect

* Optimize bitselect to vselect for some operands

* Add run tests for bitselect-vselect optimization

* Address review feedback
This commit is contained in:
teapotd
2020-05-30 04:53:11 +02:00
committed by GitHub
parent 16afca4451
commit e430984ac4
11 changed files with 341 additions and 1 deletions

View File

@@ -1634,6 +1634,7 @@ fn define_simd(
let ushr_imm = shared.by_name("ushr_imm");
let usub_sat = shared.by_name("usub_sat");
let vconst = shared.by_name("vconst");
let vselect = shared.by_name("vselect");
let x86_insertps = x86.by_name("x86_insertps");
let x86_movlhps = x86.by_name("x86_movlhps");
let x86_movsd = x86.by_name("x86_movsd");
@@ -1654,6 +1655,7 @@ fn define_simd(
let x86_punpckl = x86.by_name("x86_punpckl");
// Shorthands for recipes.
let rec_blend = r.template("blend");
let rec_evex_reg_vvvv_rm_128 = r.template("evex_reg_vvvv_rm_128");
let rec_f_ib = r.template("f_ib");
let rec_fa = r.template("fa");
@@ -1723,6 +1725,20 @@ fn define_simd(
e.enc_both_inferred(instruction, template);
}
// SIMD vselect; controlling value of vselect is a boolean vector, so each lane should be
// either all ones or all zeroes - it makes it possible to always use 8-bit PBLENDVB;
// for 32/64-bit lanes we can also use BLENDVPS and BLENDVPD
for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
let opcode = match ty.lane_bits() {
32 => &BLENDVPS,
64 => &BLENDVPD,
_ => &PBLENDVB,
};
let instruction = vselect.bind(vector(ty, sse_vector_size));
let template = rec_blend.opcodes(opcode);
e.enc_both_inferred_maybe_isap(instruction, template, Some(use_sse41_simd));
}
// SIMD scalar_to_vector; this uses MOV to copy the scalar value to an XMM register; according
// to the Intel manual: "When the destination operand is an XMM register, the source operand is
// written to the low doubleword of the register and the register is zero-extended to 128 bits."