AArch64: handle csel with icmp/fcmp source without materializing the bool.

Previously, we simply compared the input bool to 0, which forced the
value into a register (usually via a cmp and cset), zero-extended it,
etc. This patch performs the same pattern-matching that branches do to
directly perform the cmp and use its flag results with the csel.

On the `bz2` benchmark, the runtime is affected as follows (measuring
with `perf stat`, using wasmtime with its cache enabled, and taking the
second run after the first compiles and populates the cache):

pre:

       1117.232000      task-clock (msec)         #    1.000 CPUs utilized
               133      context-switches          #    0.119 K/sec
                 1      cpu-migrations            #    0.001 K/sec
             5,041      page-faults               #    0.005 M/sec
     3,511,615,100      cycles                    #    3.143 GHz
     4,272,427,772      instructions              #    1.22  insn per cycle
   <not supported>      branches
        27,980,906      branch-misses

       1.117299838 seconds time elapsed

post:

       1003.738075      task-clock (msec)         #    1.000 CPUs utilized
               121      context-switches          #    0.121 K/sec
                 0      cpu-migrations            #    0.000 K/sec
             5,052      page-faults               #    0.005 M/sec
     3,224,875,393      cycles                    #    3.213 GHz
     4,000,838,686      instructions              #    1.24  insn per cycle
   <not supported>      branches
        27,928,232      branch-misses

       1.003440004 seconds time elapsed

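In other words, with this change, on `bz2`, we see a 6.4% reduction in
executed instructions (4.27 billion to 4.00 billion), along with roughly
an 8.2% reduction in cycles and a 10.2% reduction in elapsed time.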
This commit is contained in:
Chris Fallin
2020-07-17 16:30:45 -07:00
parent 8dd4ab2f1e
commit 21dac670f0
2 changed files with 53 additions and 11 deletions

View File

@@ -1078,8 +1078,24 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
// Nothing. // Nothing.
} }
Opcode::Select | Opcode::Selectif | Opcode::SelectifSpectreGuard => { Opcode::Select => {
let cond = if op == Opcode::Select { let flag_input = inputs[0];
let cond = if let Some(icmp_insn) =
maybe_input_insn_via_conv(ctx, flag_input, Opcode::Icmp, Opcode::Bint)
{
let condcode = inst_condcode(ctx.data(icmp_insn)).unwrap();
let cond = lower_condcode(condcode);
let is_signed = condcode_is_signed(condcode);
lower_icmp_or_ifcmp_to_flags(ctx, icmp_insn, is_signed);
cond
} else if let Some(fcmp_insn) =
maybe_input_insn_via_conv(ctx, flag_input, Opcode::Fcmp, Opcode::Bint)
{
let condcode = inst_fp_condcode(ctx.data(fcmp_insn)).unwrap();
let cond = lower_fp_condcode(condcode);
lower_fcmp_or_ffcmp_to_flags(ctx, fcmp_insn);
cond
} else {
let (cmp_op, narrow_mode) = if ty_bits(ctx.input_ty(insn, 0)) > 32 { let (cmp_op, narrow_mode) = if ty_bits(ctx.input_ty(insn, 0)) > 32 {
(ALUOp::SubS64, NarrowValueMode::ZeroExtend64) (ALUOp::SubS64, NarrowValueMode::ZeroExtend64)
} else { } else {
@@ -1095,17 +1111,32 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
rm: zero_reg(), rm: zero_reg(),
}); });
Cond::Ne Cond::Ne
} else {
let condcode = inst_condcode(ctx.data(insn)).unwrap();
let cond = lower_condcode(condcode);
let is_signed = condcode_is_signed(condcode);
// Verification ensures that the input is always a
// single-def ifcmp.
let ifcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ifcmp).unwrap();
lower_icmp_or_ifcmp_to_flags(ctx, ifcmp_insn, is_signed);
cond
}; };
// csel.cond rd, rn, rm
let rd = get_output_reg(ctx, outputs[0]);
let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
let rm = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None);
let ty = ctx.output_ty(insn, 0);
let bits = ty_bits(ty);
if ty_is_float(ty) && bits == 32 {
ctx.emit(Inst::FpuCSel32 { cond, rd, rn, rm });
} else if ty_is_float(ty) && bits == 64 {
ctx.emit(Inst::FpuCSel64 { cond, rd, rn, rm });
} else {
ctx.emit(Inst::CSel { cond, rd, rn, rm });
}
}
Opcode::Selectif | Opcode::SelectifSpectreGuard => {
let condcode = inst_condcode(ctx.data(insn)).unwrap();
let cond = lower_condcode(condcode);
let is_signed = condcode_is_signed(condcode);
// Verification ensures that the input is always a
// single-def ifcmp.
let ifcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ifcmp).unwrap();
lower_icmp_or_ifcmp_to_flags(ctx, ifcmp_insn, is_signed);
// csel.COND rd, rn, rm // csel.COND rd, rn, rm
let rd = get_output_reg(ctx, outputs[0]); let rd = get_output_reg(ctx, outputs[0]);
let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);

View File

@@ -41,3 +41,14 @@ block0(v0: b1, v1: i8, v2: i8):
; check: subs wzr ; check: subs wzr
; nextln: csel ; nextln: csel
; Select whose condition is produced directly by an icmp against a constant.
; The expected lowering below compares w0 with #42 using a flag-setting subs
; and feeds the eq condition into the csel on the very next output line.
function %i(i32, i8, i8) -> i8 {
block0(v0: i32, v1: i8, v2: i8):
v3 = iconst.i32 42
v4 = icmp.i32 eq v0, v3
v5 = select.i8 v4, v1, v2
return v5
}
; check: subs wzr, w0, #42
; nextln: csel x0, x1, x2, eq