Improve bitselect codegen with knowledge of operand origin (#1783)

* Encode vselect using BLEND instructions on x86

* Legalize vselect to bitselect

* Optimize bitselect to vselect for some operands

* Add run tests for bitselect-vselect optimization

* Address review feedback
Author: teapotd
Date: 2020-05-30 04:53:11 +02:00
Committed by: GitHub
Parent: 16afca4451
Commit: e430984ac4
11 changed files with 341 additions and 1 deletion
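For a concrete sense of the rewrite, here is a minimal standalone model (plain Rust, lane width and values illustrative - not Cranelift API): when every lane of the condition is all zeroes or all ones, bit-level selection (bitselect) and lane-level selection (vselect) agree, which is what lets a single BLEND instruction replace the bitwise sequence.

fn main() {
    let c: [u8; 4] = [0xFF, 0x00, 0xFF, 0x00]; // each lane all ones or all zeroes
    let x: [u8; 4] = [1, 2, 3, 4];
    let y: [u8; 4] = [5, 6, 7, 8];
    for i in 0..4 {
        let bitsel = (x[i] & c[i]) | (y[i] & !c[i]); // bit-level select
        let vsel = if c[i] != 0 { x[i] } else { y[i] }; // lane-level select
        assert_eq!(bitsel, vsel); // equal precisely because each mask lane is uniform
    }
}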


@@ -1634,6 +1634,7 @@ fn define_simd(
let ushr_imm = shared.by_name("ushr_imm");
let usub_sat = shared.by_name("usub_sat");
let vconst = shared.by_name("vconst");
let vselect = shared.by_name("vselect");
let x86_insertps = x86.by_name("x86_insertps");
let x86_movlhps = x86.by_name("x86_movlhps");
let x86_movsd = x86.by_name("x86_movsd");
@@ -1654,6 +1655,7 @@ fn define_simd(
let x86_punpckl = x86.by_name("x86_punpckl");
// Shorthands for recipes.
let rec_blend = r.template("blend");
let rec_evex_reg_vvvv_rm_128 = r.template("evex_reg_vvvv_rm_128");
let rec_f_ib = r.template("f_ib");
let rec_fa = r.template("fa");
@@ -1723,6 +1725,20 @@ fn define_simd(
e.enc_both_inferred(instruction, template);
}
// SIMD vselect; the controlling value of vselect is a boolean vector, so each lane should be
// either all ones or all zeroes - this makes it possible to always use the 8-bit PBLENDVB;
// for 32/64-bit lanes we can also use BLENDVPS and BLENDVPD
for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
let opcode = match ty.lane_bits() {
32 => &BLENDVPS,
64 => &BLENDVPD,
_ => &PBLENDVB,
};
let instruction = vselect.bind(vector(ty, sse_vector_size));
let template = rec_blend.opcodes(opcode);
e.enc_both_inferred_maybe_isap(instruction, template, Some(use_sse41_simd));
}
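To see why the byte-granular PBLENDVB is correct at every lane width, consider this minimal model (plain Rust sketch, not the emitted code; operand order illustrative): because each boolean-vector lane is all 0x00 or all 0xFF bytes, every byte's high bit agrees with its lane's truth value.

// Per the Intel manual, PBLENDVB picks each result byte by the high bit of the
// corresponding mask byte in XMM0. With uniform lanes this matches a lane-wise
// select regardless of whether lanes are 8, 16, 32, or 64 bits wide.
fn pblendvb(mask: [u8; 16], if_true: [u8; 16], if_false: [u8; 16]) -> [u8; 16] {
    let mut out = [0u8; 16];
    for i in 0..16 {
        out[i] = if mask[i] & 0x80 != 0 { if_true[i] } else { if_false[i] };
    }
    out
}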
// SIMD scalar_to_vector; this uses MOV to copy the scalar value to an XMM register; according
// to the Intel manual: "When the destination operand is an XMM register, the source operand is
// written to the low doubleword of the register and the register is zero-extended to 128 bits."


@@ -378,6 +378,7 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
let vconst = insts.by_name("vconst");
let vall_true = insts.by_name("vall_true");
let vany_true = insts.by_name("vany_true");
let vselect = insts.by_name("vselect");
let x86_packss = x86_instructions.by_name("x86_packss");
let x86_pmaxs = x86_instructions.by_name("x86_pmaxs");
@@ -589,6 +590,17 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
);
}
// SIMD vselect; replace with bitselect if BLEND* instructions are not available.
// This works because each lane of the boolean vector is filled with zeroes or ones.
for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
let vselect = vselect.bind(vector(ty, sse_vector_size));
let raw_bitcast = raw_bitcast.bind(vector(ty, sse_vector_size));
narrow.legalize(
def!(d = vselect(c, x, y)),
vec![def!(a = raw_bitcast(c)), def!(d = bitselect(a, x, y))],
);
}
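The fallback is more expensive at the machine level: a hedged sketch of what the bitselect expansion amounts to on x86 (roughly PAND/PANDN/POR, i.e. three instructions versus the single BLEND used when SSE4.1 is available; 64-bit chunks used here for brevity).

// PANDN computes `!a & b`; bitselect is then two more bitwise ops.
fn pandn(a: u64, b: u64) -> u64 {
    !a & b
}

fn bitselect(c: u64, x: u64, y: u64) -> u64 {
    (x & c) | pandn(c, y) // (x & c) | (y & !c)
}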
// SIMD vany_true
let ne = Literal::enumerator_for(&imm.intcc, "ne");
for ty in ValueType::all_lane_types().filter(allowed_simd_type) {


@@ -54,6 +54,14 @@ pub static BIT_SCAN_FORWARD: [u8; 2] = [0x0f, 0xbc];
/// Bit scan reverse (stores index of first encountered 1 from the back).
pub static BIT_SCAN_REVERSE: [u8; 2] = [0x0f, 0xbd];
/// Select packed single-precision floating-point values from xmm1 and xmm2/m128
/// from mask specified in XMM0 and store the values into xmm1 (SSE4.1).
pub static BLENDVPS: [u8; 4] = [0x66, 0x0f, 0x38, 0x14];
/// Select packed double-precision floating-point values from xmm1 and xmm2/m128
/// from mask specified in XMM0 and store the values into xmm1 (SSE4.1).
pub static BLENDVPD: [u8; 4] = [0x66, 0x0f, 0x38, 0x15];
/// Call near, relative, displacement relative to next instruction (sign-extended).
pub static CALL_RELATIVE: [u8; 1] = [0xe8];
@@ -335,6 +343,10 @@ pub static PAVGB: [u8; 3] = [0x66, 0x0f, 0xE0];
/// Average packed unsigned word integers from xmm2/m128 and xmm1 with rounding (SSE2).
pub static PAVGW: [u8; 3] = [0x66, 0x0f, 0xE3];
/// Select byte values from xmm1 and xmm2/m128 from mask specified in the high bit of each byte
/// in XMM0 and store the values into xmm1 (SSE4.1).
pub static PBLENDVB: [u8; 4] = [0x66, 0x0f, 0x38, 0x10];
/// Compare packed data for equal (SSE2).
pub static PCMPEQB: [u8; 3] = [0x66, 0x0f, 0x74];
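Putting the opcode bytes above together, a full register-register encoding looks like this (register choice illustrative):

// blendvps xmm1, xmm2 with the implicit mask in xmm0, using the
// 66 0F 38 14 /r form defined above; the trailing ModRM byte 0xCA is
// mod=11 (register form), reg=xmm1 (destination), rm=xmm2 (source).
const BLENDVPS_XMM1_XMM2: [u8; 5] = [0x66, 0x0f, 0x38, 0x14, 0xca];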


@@ -427,6 +427,7 @@ pub(crate) fn define<'shared>(
let reg_rcx = Register::new(gpr, regs.regunit_by_name(gpr, "rcx"));
let reg_rdx = Register::new(gpr, regs.regunit_by_name(gpr, "rdx"));
let reg_r15 = Register::new(gpr, regs.regunit_by_name(gpr, "r15"));
let reg_xmm0 = Register::new(fpr, regs.regunit_by_name(fpr, "xmm0"));
// Stack operand with a 32-bit signed displacement from either RBP or RSP.
let stack_gpr32 = Stack::new(gpr);
@@ -904,6 +905,24 @@ pub(crate) fn define<'shared>(
.inferred_rex_compute_size("size_with_inferred_rex_for_inreg1"),
);
// XX /r for BLEND* instructions
recipes.add_template_inferred(
EncodingRecipeBuilder::new("blend", &formats.ternary, 1)
.operands_in(vec![
OperandConstraint::FixedReg(reg_xmm0),
OperandConstraint::RegClass(fpr),
OperandConstraint::RegClass(fpr),
])
.operands_out(vec![2])
.emit(
r#"
{{PUT_OP}}(bits, rex2(in_reg1, in_reg2), sink);
modrm_rr(in_reg1, in_reg2, sink);
"#,
),
"size_with_inferred_rex_for_inreg1_inreg2",
);
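For reference, a small sketch of the ModRM byte that the recipe's modrm_rr(in_reg1, in_reg2, sink) call produces, assuming the usual convention that the first argument fills the r/m field and the second the reg field. This is also why the result is tied to input 2 via operands_out(vec![2]), and why xmm0 never appears in the encoding - the BLENDV* mask operand is implicit.

// mod = 0b11 selects the register-register form; reg = in_reg2 (destination),
// rm = in_reg1 (source). Only the low three bits of each register number fit
// here; the high bits go into a REX prefix when needed (see the next file).
fn modrm_rr(rm: u8, reg: u8) -> u8 {
    0b1100_0000 | ((reg & 0b111) << 3) | (rm & 0b111)
}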
// XX /n ib with 8-bit immediate sign-extended.
{
recipes.add_template_inferred(


@@ -246,6 +246,20 @@ fn size_with_inferred_rex_for_inreg0_inreg1(
sizing.base_size + if needs_rex { 1 } else { 0 }
}
/// Infers whether a dynamic REX prefix will be emitted, based on the second and third operands.
fn size_with_inferred_rex_for_inreg1_inreg2(
sizing: &RecipeSizing,
_enc: Encoding,
inst: Inst,
divert: &RegDiversions,
func: &Function,
) -> u8 {
// No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed.
let needs_rex = test_input(1, inst, divert, func, is_extended_reg)
|| test_input(2, inst, divert, func, is_extended_reg);
sizing.base_size + if needs_rex { 1 } else { 0 }
}
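A brief model of the size adjustment (illustrative, not the library's exact helper): hardware register encodings 8 through 15 - xmm8-xmm15 here - cannot be named in ModRM's three-bit fields alone, so a one-byte REX prefix carries the extra bit and the instruction grows by one byte.

// Mirrors the logic above over plain register numbers: one extra byte
// whenever either operand needs a REX extension bit.
fn encoded_size(base_size: u8, in_reg1: u8, in_reg2: u8) -> u8 {
    let needs_rex = in_reg1 >= 8 || in_reg2 >= 8;
    base_size + if needs_rex { 1 } else { 0 }
}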
/// Infers whether a dynamic REX prefix will be emitted, based on a single
/// input register and a single output register.
fn size_with_inferred_rex_for_inreg0_outreg0(


@@ -656,7 +656,7 @@ mod simplify {
dfg::ValueDef,
immediates,
instructions::{Opcode, ValueList},
-    types::{I16, I32, I8},
+    types::{B8, I16, I32, I8},
};
use std::marker::PhantomData;
@@ -935,6 +935,69 @@ mod simplify {
}
}
InstructionData::Ternary {
opcode: Opcode::Bitselect,
args,
} => {
let old_cond_type = pos.func.dfg.value_type(args[0]);
if !old_cond_type.is_vector() {
return;
}
// Replace bitselect with vselect if each lane of the controlling mask is either
// all ones or all zeroes; on x86, bitselect is encoded using 3 instructions,
// while vselect can be encoded using a single BLEND instruction.
if let ValueDef::Result(def_inst, _) = pos.func.dfg.value_def(args[0]) {
let (cond_val, cond_type) = match pos.func.dfg[def_inst] {
InstructionData::Unary {
opcode: Opcode::RawBitcast,
arg,
} => {
// If the controlling mask is a raw-bitcasted boolean vector, then
// we know each lane is either all zeroes or all ones,
// so we can use the vselect instruction instead.
let arg_type = pos.func.dfg.value_type(arg);
if !arg_type.is_vector() || !arg_type.lane_type().is_bool() {
return;
}
(arg, arg_type)
}
InstructionData::UnaryConst {
opcode: Opcode::Vconst,
constant_handle,
} => {
// If each byte of the controlling mask is 0x00 or 0xFF, then we can
// always bitcast our way to vselect(B8x16, I8x16, I8x16).
// Bitselect operates at the bit level, so the lane types don't matter.
let const_data = pos.func.dfg.constants.get(constant_handle);
if !const_data.iter().all(|&b| b == 0 || b == 0xFF) {
return;
}
let new_type = B8.by(old_cond_type.bytes() as u16).unwrap();
(pos.ins().raw_bitcast(new_type, args[0]), new_type)
}
_ => return,
};
let lane_type = Type::int(cond_type.lane_bits() as u16).unwrap();
let arg_type = lane_type.by(cond_type.lane_count()).unwrap();
let old_arg_type = pos.func.dfg.value_type(args[1]);
if arg_type != old_arg_type {
// Operand types must match; we need to add bitcasts.
let arg1 = pos.ins().raw_bitcast(arg_type, args[1]);
let arg2 = pos.ins().raw_bitcast(arg_type, args[2]);
let ret = pos.ins().vselect(cond_val, arg1, arg2);
pos.func.dfg.replace(inst).raw_bitcast(old_arg_type, ret);
} else {
pos.func
.dfg
.replace(inst)
.vselect(cond_val, args[1], args[2]);
}
}
}
_ => {}
}
}
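The vconst arm's byte test is easy to model in isolation (a standalone sketch, not the code above): a constant mask qualifies exactly when every byte is 0x00 or 0xFF, since only then can it be reinterpreted as a B8x16 boolean vector.

fn mask_qualifies(mask_bytes: &[u8]) -> bool {
    // Matches the `const_data.iter().all(...)` check above.
    mask_bytes.iter().all(|&b| b == 0x00 || b == 0xFF)
}

fn main() {
    assert!(mask_qualifies(&[0xFF, 0x00, 0xFF, 0xFF]));
    // A byte with mixed bits can't be a boolean lane; keep bitselect.
    assert!(!mask_qualifies(&[0x0F, 0x00]));
}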