Improve bitselect codegen with knowledge of operand origin (#1783)

* Encode vselect using BLEND instructions on x86

* Legalize vselect to bitselect

* Optimize bitselect to vselect for some operands

* Add run tests for bitselect-vselect optimization

* Address review feedback
This commit is contained in:
teapotd
2020-05-30 04:53:11 +02:00
committed by GitHub
parent 16afca4451
commit e430984ac4
11 changed files with 341 additions and 1 deletions

View File

@@ -246,6 +246,20 @@ fn size_with_inferred_rex_for_inreg0_inreg1(
sizing.base_size + if needs_rex { 1 } else { 0 }
}
/// Infers whether a dynamic REX prefix will be emitted, based on second and third operand.
fn size_with_inferred_rex_for_inreg1_inreg2(
sizing: &RecipeSizing,
_enc: Encoding,
inst: Inst,
divert: &RegDiversions,
func: &Function,
) -> u8 {
// No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed.
let needs_rex = test_input(1, inst, divert, func, is_extended_reg)
|| test_input(2, inst, divert, func, is_extended_reg);
sizing.base_size + if needs_rex { 1 } else { 0 }
}
/// Infers whether a dynamic REX prefix will be emitted, based on a single
/// input register and a single output register.
fn size_with_inferred_rex_for_inreg0_outreg0(

View File

@@ -656,7 +656,7 @@ mod simplify {
dfg::ValueDef,
immediates,
instructions::{Opcode, ValueList},
types::{I16, I32, I8},
types::{B8, I16, I32, I8},
};
use std::marker::PhantomData;
@@ -935,6 +935,69 @@ mod simplify {
}
}
InstructionData::Ternary {
opcode: Opcode::Bitselect,
args,
} => {
let old_cond_type = pos.func.dfg.value_type(args[0]);
if !old_cond_type.is_vector() {
return;
}
// Replace bitselect with vselect if each lane of controlling mask is either
// all ones or all zeroes; on x86 bitselect is encoded using 3 instructions,
// while vselect can be encoded using single BLEND instruction.
if let ValueDef::Result(def_inst, _) = pos.func.dfg.value_def(args[0]) {
let (cond_val, cond_type) = match pos.func.dfg[def_inst] {
InstructionData::Unary {
opcode: Opcode::RawBitcast,
arg,
} => {
// If controlling mask is raw-bitcasted boolean vector then
// we know each lane is either all zeroes or ones,
// so we can use vselect instruction instead.
let arg_type = pos.func.dfg.value_type(arg);
if !arg_type.is_vector() || !arg_type.lane_type().is_bool() {
return;
}
(arg, arg_type)
}
InstructionData::UnaryConst {
opcode: Opcode::Vconst,
constant_handle,
} => {
// If each byte of controlling mask is 0x00 or 0xFF then
// we will always bitcast our way to vselect(B8x16, I8x16, I8x16).
// Bitselect operates at bit level, so the lane types don't matter.
let const_data = pos.func.dfg.constants.get(constant_handle);
if !const_data.iter().all(|&b| b == 0 || b == 0xFF) {
return;
}
let new_type = B8.by(old_cond_type.bytes() as u16).unwrap();
(pos.ins().raw_bitcast(new_type, args[0]), new_type)
}
_ => return,
};
let lane_type = Type::int(cond_type.lane_bits() as u16).unwrap();
let arg_type = lane_type.by(cond_type.lane_count()).unwrap();
let old_arg_type = pos.func.dfg.value_type(args[1]);
if arg_type != old_arg_type {
// Operands types must match, we need to add bitcasts.
let arg1 = pos.ins().raw_bitcast(arg_type, args[1]);
let arg2 = pos.ins().raw_bitcast(arg_type, args[2]);
let ret = pos.ins().vselect(cond_val, arg1, arg2);
pos.func.dfg.replace(inst).raw_bitcast(old_arg_type, ret);
} else {
pos.func
.dfg
.replace(inst)
.vselect(cond_val, args[1], args[2]);
}
}
}
_ => {}
}
}