Improve bitselect codegen with knowledge of operand origin (#1783)
* Encode vselect using BLEND instructions on x86
* Legalize vselect to bitselect
* Optimize bitselect to vselect for some operands
* Add run tests for bitselect-vselect optimization
* Address review feedback
This commit is contained in:
@@ -1634,6 +1634,7 @@ fn define_simd(
|
||||
let ushr_imm = shared.by_name("ushr_imm");
|
||||
let usub_sat = shared.by_name("usub_sat");
|
||||
let vconst = shared.by_name("vconst");
|
||||
let vselect = shared.by_name("vselect");
|
||||
let x86_insertps = x86.by_name("x86_insertps");
|
||||
let x86_movlhps = x86.by_name("x86_movlhps");
|
||||
let x86_movsd = x86.by_name("x86_movsd");
|
||||
@@ -1654,6 +1655,7 @@ fn define_simd(
|
||||
let x86_punpckl = x86.by_name("x86_punpckl");
|
||||
|
||||
// Shorthands for recipes.
|
||||
let rec_blend = r.template("blend");
|
||||
let rec_evex_reg_vvvv_rm_128 = r.template("evex_reg_vvvv_rm_128");
|
||||
let rec_f_ib = r.template("f_ib");
|
||||
let rec_fa = r.template("fa");
|
||||
@@ -1723,6 +1725,20 @@ fn define_simd(
|
||||
e.enc_both_inferred(instruction, template);
|
||||
}
|
||||
|
||||
// SIMD vselect; controlling value of vselect is a boolean vector, so each lane should be
|
||||
// either all ones or all zeroes, which makes it possible to always use the 8-bit PBLENDVB;
|
||||
// for 32-bit and 64-bit lanes we use BLENDVPS and BLENDVPD, respectively, instead.
|
||||
for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
|
||||
let opcode = match ty.lane_bits() {
|
||||
32 => &BLENDVPS,
|
||||
64 => &BLENDVPD,
|
||||
_ => &PBLENDVB,
|
||||
};
|
||||
let instruction = vselect.bind(vector(ty, sse_vector_size));
|
||||
let template = rec_blend.opcodes(opcode);
|
||||
e.enc_both_inferred_maybe_isap(instruction, template, Some(use_sse41_simd));
|
||||
}
|
||||
|
||||
// SIMD scalar_to_vector; this uses MOV to copy the scalar value to an XMM register; according
|
||||
// to the Intel manual: "When the destination operand is an XMM register, the source operand is
|
||||
// written to the low doubleword of the register and the register is zero-extended to 128 bits."
|
||||
|
||||
@@ -378,6 +378,7 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
|
||||
let vconst = insts.by_name("vconst");
|
||||
let vall_true = insts.by_name("vall_true");
|
||||
let vany_true = insts.by_name("vany_true");
|
||||
let vselect = insts.by_name("vselect");
|
||||
|
||||
let x86_packss = x86_instructions.by_name("x86_packss");
|
||||
let x86_pmaxs = x86_instructions.by_name("x86_pmaxs");
|
||||
@@ -589,6 +590,17 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
|
||||
);
|
||||
}
|
||||
|
||||
// SIMD vselect; replace with bitselect if BLEND* instructions are not available.
|
||||
// This works because each lane of a boolean vector is filled with either all zeroes or all ones.
|
||||
for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
|
||||
let vselect = vselect.bind(vector(ty, sse_vector_size));
|
||||
let raw_bitcast = raw_bitcast.bind(vector(ty, sse_vector_size));
|
||||
narrow.legalize(
|
||||
def!(d = vselect(c, x, y)),
|
||||
vec![def!(a = raw_bitcast(c)), def!(d = bitselect(a, x, y))],
|
||||
);
|
||||
}
|
||||
|
||||
// SIMD vany_true
|
||||
let ne = Literal::enumerator_for(&imm.intcc, "ne");
|
||||
for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
|
||||
|
||||
@@ -54,6 +54,14 @@ pub static BIT_SCAN_FORWARD: [u8; 2] = [0x0f, 0xbc];
|
||||
/// Bit scan reverse (stores index of first encountered 1 from the back).
|
||||
pub static BIT_SCAN_REVERSE: [u8; 2] = [0x0f, 0xbd];
|
||||
|
||||
/// Select packed single-precision floating-point values from xmm1 and xmm2/m128
|
||||
/// from mask specified in XMM0 and store the values into xmm1 (SSE4.1).
|
||||
/// Selected as the `vselect` opcode for vectors with 32-bit lanes.
pub static BLENDVPS: [u8; 4] = [0x66, 0x0f, 0x38, 0x14];
|
||||
|
||||
/// Select packed double-precision floating-point values from xmm1 and xmm2/m128
|
||||
/// from mask specified in XMM0 and store the values into xmm1 (SSE4.1).
|
||||
/// Selected as the `vselect` opcode for vectors with 64-bit lanes.
pub static BLENDVPD: [u8; 4] = [0x66, 0x0f, 0x38, 0x15];
|
||||
|
||||
/// Call near, relative, displacement relative to next instruction (sign-extended).
|
||||
pub static CALL_RELATIVE: [u8; 1] = [0xe8];
|
||||
|
||||
@@ -335,6 +343,10 @@ pub static PAVGB: [u8; 3] = [0x66, 0x0f, 0xE0];
|
||||
/// Average packed unsigned word integers from xmm2/m128 and xmm1 with rounding (SSE2).
|
||||
pub static PAVGW: [u8; 3] = [0x66, 0x0f, 0xE3];
|
||||
|
||||
/// Select byte values from xmm1 and xmm2/m128 from mask specified in the high bit of each byte
|
||||
/// in XMM0 and store the values into xmm1 (SSE4.1).
|
||||
/// Selected as the `vselect` opcode for every lane width other than 32 and 64 bits; the
/// `vselect` encoding comment relies on each lane of the boolean mask being all ones or
/// all zeroes.
pub static PBLENDVB: [u8; 4] = [0x66, 0x0f, 0x38, 0x10];
|
||||
|
||||
/// Compare packed data for equal (SSE2).
|
||||
pub static PCMPEQB: [u8; 3] = [0x66, 0x0f, 0x74];
|
||||
|
||||
|
||||
@@ -427,6 +427,7 @@ pub(crate) fn define<'shared>(
|
||||
let reg_rcx = Register::new(gpr, regs.regunit_by_name(gpr, "rcx"));
|
||||
let reg_rdx = Register::new(gpr, regs.regunit_by_name(gpr, "rdx"));
|
||||
let reg_r15 = Register::new(gpr, regs.regunit_by_name(gpr, "r15"));
|
||||
let reg_xmm0 = Register::new(fpr, regs.regunit_by_name(fpr, "xmm0"));
|
||||
|
||||
// Stack operand with a 32-bit signed displacement from either RBP or RSP.
|
||||
let stack_gpr32 = Stack::new(gpr);
|
||||
@@ -904,6 +905,24 @@ pub(crate) fn define<'shared>(
|
||||
.inferred_rex_compute_size("size_with_inferred_rex_for_inreg1"),
|
||||
);
|
||||
|
||||
// XX /r for BLEND* instructions
|
||||
recipes.add_template_inferred(
|
||||
EncodingRecipeBuilder::new("blend", &formats.ternary, 1)
|
||||
.operands_in(vec![
|
||||
OperandConstraint::FixedReg(reg_xmm0),
|
||||
OperandConstraint::RegClass(fpr),
|
||||
OperandConstraint::RegClass(fpr),
|
||||
])
|
||||
.operands_out(vec![2])
|
||||
.emit(
|
||||
r#"
|
||||
{{PUT_OP}}(bits, rex2(in_reg1, in_reg2), sink);
|
||||
modrm_rr(in_reg1, in_reg2, sink);
|
||||
"#,
|
||||
),
|
||||
"size_with_inferred_rex_for_inreg1_inreg2",
|
||||
);
|
||||
|
||||
// XX /n ib with 8-bit immediate sign-extended.
|
||||
{
|
||||
recipes.add_template_inferred(
|
||||
|
||||
Reference in New Issue
Block a user