AArch64: handle csel with icmp/fcmp source without materializing the bool.
Previously, we simply compared the input bool to 0, which forced the
value to be materialized in a register (usually via a cmp and cset) and
then zero-extended. This patch performs the same pattern-matching that
branches do, so that the cmp is emitted directly and its flag results
are consumed by the csel.
On the `bz2` benchmark, the runtime is affected as follows (measuring
with `perf stat`, using wasmtime with its cache enabled, and taking the
second run after the first compiles and populates the cache):
pre:
1117.232000 task-clock (msec) # 1.000 CPUs utilized
133 context-switches # 0.119 K/sec
1 cpu-migrations # 0.001 K/sec
5,041 page-faults # 0.005 M/sec
3,511,615,100 cycles # 3.143 GHz
4,272,427,772 instructions # 1.22 insn per cycle
<not supported> branches
27,980,906 branch-misses
1.117299838 seconds time elapsed
post:
1003.738075 task-clock (msec) # 1.000 CPUs utilized
121 context-switches # 0.121 K/sec
0 cpu-migrations # 0.000 K/sec
5,052 page-faults # 0.005 M/sec
3,224,875,393 cycles # 3.213 GHz
4,000,838,686 instructions # 1.24 insn per cycle
<not supported> branches
27,928,232 branch-misses
1.003440004 seconds time elapsed
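In other words, with this change, on `bz2`, we see a roughly 6.4%
reduction in executed instructions (4,272,427,772 -> 4,000,838,686) and
a roughly 10% reduction in elapsed time (1.117 s -> 1.003 s).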
This commit is contained in:
@@ -1078,8 +1078,24 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
// Nothing.
|
// Nothing.
|
||||||
}
|
}
|
||||||
|
|
||||||
Opcode::Select | Opcode::Selectif | Opcode::SelectifSpectreGuard => {
|
Opcode::Select => {
|
||||||
let cond = if op == Opcode::Select {
|
let flag_input = inputs[0];
|
||||||
|
let cond = if let Some(icmp_insn) =
|
||||||
|
maybe_input_insn_via_conv(ctx, flag_input, Opcode::Icmp, Opcode::Bint)
|
||||||
|
{
|
||||||
|
let condcode = inst_condcode(ctx.data(icmp_insn)).unwrap();
|
||||||
|
let cond = lower_condcode(condcode);
|
||||||
|
let is_signed = condcode_is_signed(condcode);
|
||||||
|
lower_icmp_or_ifcmp_to_flags(ctx, icmp_insn, is_signed);
|
||||||
|
cond
|
||||||
|
} else if let Some(fcmp_insn) =
|
||||||
|
maybe_input_insn_via_conv(ctx, flag_input, Opcode::Fcmp, Opcode::Bint)
|
||||||
|
{
|
||||||
|
let condcode = inst_fp_condcode(ctx.data(fcmp_insn)).unwrap();
|
||||||
|
let cond = lower_fp_condcode(condcode);
|
||||||
|
lower_fcmp_or_ffcmp_to_flags(ctx, fcmp_insn);
|
||||||
|
cond
|
||||||
|
} else {
|
||||||
let (cmp_op, narrow_mode) = if ty_bits(ctx.input_ty(insn, 0)) > 32 {
|
let (cmp_op, narrow_mode) = if ty_bits(ctx.input_ty(insn, 0)) > 32 {
|
||||||
(ALUOp::SubS64, NarrowValueMode::ZeroExtend64)
|
(ALUOp::SubS64, NarrowValueMode::ZeroExtend64)
|
||||||
} else {
|
} else {
|
||||||
@@ -1095,7 +1111,24 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
rm: zero_reg(),
|
rm: zero_reg(),
|
||||||
});
|
});
|
||||||
Cond::Ne
|
Cond::Ne
|
||||||
|
};
|
||||||
|
|
||||||
|
// csel.cond rd, rn, rm
|
||||||
|
let rd = get_output_reg(ctx, outputs[0]);
|
||||||
|
let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
|
||||||
|
let rm = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None);
|
||||||
|
let ty = ctx.output_ty(insn, 0);
|
||||||
|
let bits = ty_bits(ty);
|
||||||
|
if ty_is_float(ty) && bits == 32 {
|
||||||
|
ctx.emit(Inst::FpuCSel32 { cond, rd, rn, rm });
|
||||||
|
} else if ty_is_float(ty) && bits == 64 {
|
||||||
|
ctx.emit(Inst::FpuCSel64 { cond, rd, rn, rm });
|
||||||
} else {
|
} else {
|
||||||
|
ctx.emit(Inst::CSel { cond, rd, rn, rm });
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Opcode::Selectif | Opcode::SelectifSpectreGuard => {
|
||||||
let condcode = inst_condcode(ctx.data(insn)).unwrap();
|
let condcode = inst_condcode(ctx.data(insn)).unwrap();
|
||||||
let cond = lower_condcode(condcode);
|
let cond = lower_condcode(condcode);
|
||||||
let is_signed = condcode_is_signed(condcode);
|
let is_signed = condcode_is_signed(condcode);
|
||||||
@@ -1103,8 +1136,6 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
// single-def ifcmp.
|
// single-def ifcmp.
|
||||||
let ifcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ifcmp).unwrap();
|
let ifcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ifcmp).unwrap();
|
||||||
lower_icmp_or_ifcmp_to_flags(ctx, ifcmp_insn, is_signed);
|
lower_icmp_or_ifcmp_to_flags(ctx, ifcmp_insn, is_signed);
|
||||||
cond
|
|
||||||
};
|
|
||||||
|
|
||||||
// csel.COND rd, rn, rm
|
// csel.COND rd, rn, rm
|
||||||
let rd = get_output_reg(ctx, outputs[0]);
|
let rd = get_output_reg(ctx, outputs[0]);
|
||||||
|
|||||||
@@ -41,3 +41,14 @@ block0(v0: b1, v1: i8, v2: i8):
|
|||||||
|
|
||||||
; check: subs wzr
|
; check: subs wzr
|
||||||
; nextln: csel
|
; nextln: csel
|
||||||
|
|
||||||
|
function %i(i32, i8, i8) -> i8 {
|
||||||
|
block0(v0: i32, v1: i8, v2: i8):
|
||||||
|
v3 = iconst.i32 42
|
||||||
|
v4 = icmp.i32 eq v0, v3
|
||||||
|
v5 = select.i8 v4, v1, v2
|
||||||
|
return v5
|
||||||
|
}
|
||||||
|
|
||||||
|
; check: subs wzr, w0, #42
|
||||||
|
; nextln: csel x0, x1, x2, eq
|
||||||
|
|||||||
Reference in New Issue
Block a user