Aarch64: handle csel with icmp/fcmp source without materializing the bool.
Previously, we simply compared the input bool to 0, which forced the
value into a register (usually via a cmp and cset), zero-extended it,
etc. This patch performs the same pattern-matching that branches do to
directly perform the cmp and use its flag results with the csel.
On the `bz2` benchmark, the runtime is affected as follows (measuring
with `perf stat`, using wasmtime with its cache enabled, and taking the
second run after the first compiles and populates the cache):
pre:
1117.232000 task-clock (msec) # 1.000 CPUs utilized
133 context-switches # 0.119 K/sec
1 cpu-migrations # 0.001 K/sec
5,041 page-faults # 0.005 M/sec
3,511,615,100 cycles # 3.143 GHz
4,272,427,772 instructions # 1.22 insn per cycle
<not supported> branches
27,980,906 branch-misses
1.117299838 seconds time elapsed
post:
1003.738075 task-clock (msec) # 1.000 CPUs utilized
121 context-switches # 0.121 K/sec
0 cpu-migrations # 0.000 K/sec
5,052 page-faults # 0.005 M/sec
3,224,875,393 cycles # 3.213 GHz
4,000,838,686 instructions # 1.24 insn per cycle
<not supported> branches
27,928,232 branch-misses
1.003440004 seconds time elapsed
In other words, with this change, on `bz2`, we see a 6.3% reduction in
executed instructions.
This commit is contained in:
@@ -1078,8 +1078,24 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
||||
// Nothing.
|
||||
}
|
||||
|
||||
Opcode::Select | Opcode::Selectif | Opcode::SelectifSpectreGuard => {
|
||||
let cond = if op == Opcode::Select {
|
||||
Opcode::Select => {
|
||||
let flag_input = inputs[0];
|
||||
let cond = if let Some(icmp_insn) =
|
||||
maybe_input_insn_via_conv(ctx, flag_input, Opcode::Icmp, Opcode::Bint)
|
||||
{
|
||||
let condcode = inst_condcode(ctx.data(icmp_insn)).unwrap();
|
||||
let cond = lower_condcode(condcode);
|
||||
let is_signed = condcode_is_signed(condcode);
|
||||
lower_icmp_or_ifcmp_to_flags(ctx, icmp_insn, is_signed);
|
||||
cond
|
||||
} else if let Some(fcmp_insn) =
|
||||
maybe_input_insn_via_conv(ctx, flag_input, Opcode::Fcmp, Opcode::Bint)
|
||||
{
|
||||
let condcode = inst_fp_condcode(ctx.data(fcmp_insn)).unwrap();
|
||||
let cond = lower_fp_condcode(condcode);
|
||||
lower_fcmp_or_ffcmp_to_flags(ctx, fcmp_insn);
|
||||
cond
|
||||
} else {
|
||||
let (cmp_op, narrow_mode) = if ty_bits(ctx.input_ty(insn, 0)) > 32 {
|
||||
(ALUOp::SubS64, NarrowValueMode::ZeroExtend64)
|
||||
} else {
|
||||
@@ -1095,17 +1111,32 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
||||
rm: zero_reg(),
|
||||
});
|
||||
Cond::Ne
|
||||
} else {
|
||||
let condcode = inst_condcode(ctx.data(insn)).unwrap();
|
||||
let cond = lower_condcode(condcode);
|
||||
let is_signed = condcode_is_signed(condcode);
|
||||
// Verification ensures that the input is always a
|
||||
// single-def ifcmp.
|
||||
let ifcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ifcmp).unwrap();
|
||||
lower_icmp_or_ifcmp_to_flags(ctx, ifcmp_insn, is_signed);
|
||||
cond
|
||||
};
|
||||
|
||||
// csel.cond rd, rn, rm
|
||||
let rd = get_output_reg(ctx, outputs[0]);
|
||||
let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
|
||||
let rm = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None);
|
||||
let ty = ctx.output_ty(insn, 0);
|
||||
let bits = ty_bits(ty);
|
||||
if ty_is_float(ty) && bits == 32 {
|
||||
ctx.emit(Inst::FpuCSel32 { cond, rd, rn, rm });
|
||||
} else if ty_is_float(ty) && bits == 64 {
|
||||
ctx.emit(Inst::FpuCSel64 { cond, rd, rn, rm });
|
||||
} else {
|
||||
ctx.emit(Inst::CSel { cond, rd, rn, rm });
|
||||
}
|
||||
}
|
||||
|
||||
Opcode::Selectif | Opcode::SelectifSpectreGuard => {
|
||||
let condcode = inst_condcode(ctx.data(insn)).unwrap();
|
||||
let cond = lower_condcode(condcode);
|
||||
let is_signed = condcode_is_signed(condcode);
|
||||
// Verification ensures that the input is always a
|
||||
// single-def ifcmp.
|
||||
let ifcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ifcmp).unwrap();
|
||||
lower_icmp_or_ifcmp_to_flags(ctx, ifcmp_insn, is_signed);
|
||||
|
||||
// csel.COND rd, rn, rm
|
||||
let rd = get_output_reg(ctx, outputs[0]);
|
||||
let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
|
||||
|
||||
Reference in New Issue
Block a user