From 21dac670f054d0c707309c6aa671be613db64245 Mon Sep 17 00:00:00 2001 From: Chris Fallin Date: Fri, 17 Jul 2020 16:30:45 -0700 Subject: [PATCH] Aarch64: handle csel with icmp/fcmp source without materializing the bool. Previously, we simply compared the input bool to 0, which forced the value into a register (usually via a cmp and cset), zero-extended it, etc. This patch performs the same pattern-matching that branches do to directly perform the cmp and use its flag results with the csel. On the `bz2` benchmark, the runtime is affected as follows (measuring with `perf stat`, using wasmtime with its cache enabled, and taking the second run after the first compiles and populates the cache): pre: 1117.232000 task-clock (msec) # 1.000 CPUs utilized 133 context-switches # 0.119 K/sec 1 cpu-migrations # 0.001 K/sec 5,041 page-faults # 0.005 M/sec 3,511,615,100 cycles # 3.143 GHz 4,272,427,772 instructions # 1.22 insn per cycle <not supported> branches 27,980,906 branch-misses 1.117299838 seconds time elapsed post: 1003.738075 task-clock (msec) # 1.000 CPUs utilized 121 context-switches # 0.121 K/sec 0 cpu-migrations # 0.000 K/sec 5,052 page-faults # 0.005 M/sec 3,224,875,393 cycles # 3.213 GHz 4,000,838,686 instructions # 1.24 insn per cycle <not supported> branches 27,928,232 branch-misses 1.003440004 seconds time elapsed In other words, with this change, on `bz2`, we see a 6.3% reduction in executed instructions. --- .../codegen/src/isa/aarch64/lower_inst.rs | 53 +++++++++++++++---- .../filetests/vcode/aarch64/condops.clif | 11 ++++ 2 files changed, 53 insertions(+), 11 deletions(-) diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs index 1dfc4091fe..77fe58aed6 100644 --- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs +++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs @@ -1078,8 +1078,24 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>( // Nothing. 
} - Opcode::Select | Opcode::Selectif | Opcode::SelectifSpectreGuard => { - let cond = if op == Opcode::Select { + Opcode::Select => { + let flag_input = inputs[0]; + let cond = if let Some(icmp_insn) = + maybe_input_insn_via_conv(ctx, flag_input, Opcode::Icmp, Opcode::Bint) + { + let condcode = inst_condcode(ctx.data(icmp_insn)).unwrap(); + let cond = lower_condcode(condcode); + let is_signed = condcode_is_signed(condcode); + lower_icmp_or_ifcmp_to_flags(ctx, icmp_insn, is_signed); + cond + } else if let Some(fcmp_insn) = + maybe_input_insn_via_conv(ctx, flag_input, Opcode::Fcmp, Opcode::Bint) + { + let condcode = inst_fp_condcode(ctx.data(fcmp_insn)).unwrap(); + let cond = lower_fp_condcode(condcode); + lower_fcmp_or_ffcmp_to_flags(ctx, fcmp_insn); + cond + } else { let (cmp_op, narrow_mode) = if ty_bits(ctx.input_ty(insn, 0)) > 32 { (ALUOp::SubS64, NarrowValueMode::ZeroExtend64) } else { @@ -1095,17 +1111,32 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>( rm: zero_reg(), }); Cond::Ne - } else { - let condcode = inst_condcode(ctx.data(insn)).unwrap(); - let cond = lower_condcode(condcode); - let is_signed = condcode_is_signed(condcode); - // Verification ensures that the input is always a - // single-def ifcmp. 
- let ifcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ifcmp).unwrap(); - lower_icmp_or_ifcmp_to_flags(ctx, ifcmp_insn, is_signed); - cond }; + // csel.cond rd, rn, rm + let rd = get_output_reg(ctx, outputs[0]); + let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); + let rm = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None); + let ty = ctx.output_ty(insn, 0); + let bits = ty_bits(ty); + if ty_is_float(ty) && bits == 32 { + ctx.emit(Inst::FpuCSel32 { cond, rd, rn, rm }); + } else if ty_is_float(ty) && bits == 64 { + ctx.emit(Inst::FpuCSel64 { cond, rd, rn, rm }); + } else { + ctx.emit(Inst::CSel { cond, rd, rn, rm }); + } + } + + Opcode::Selectif | Opcode::SelectifSpectreGuard => { + let condcode = inst_condcode(ctx.data(insn)).unwrap(); + let cond = lower_condcode(condcode); + let is_signed = condcode_is_signed(condcode); + // Verification ensures that the input is always a + // single-def ifcmp. + let ifcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ifcmp).unwrap(); + lower_icmp_or_ifcmp_to_flags(ctx, ifcmp_insn, is_signed); + // csel.COND rd, rn, rm let rd = get_output_reg(ctx, outputs[0]); let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); diff --git a/cranelift/filetests/filetests/vcode/aarch64/condops.clif b/cranelift/filetests/filetests/vcode/aarch64/condops.clif index ebdff13850..94e86a7a12 100644 --- a/cranelift/filetests/filetests/vcode/aarch64/condops.clif +++ b/cranelift/filetests/filetests/vcode/aarch64/condops.clif @@ -41,3 +41,14 @@ block0(v0: b1, v1: i8, v2: i8): ; check: subs wzr ; nextln: csel + +function %i(i32, i8, i8) -> i8 { +block0(v0: i32, v1: i8, v2: i8): + v3 = iconst.i32 42 + v4 = icmp.i32 eq v0, v3 + v5 = select.i8 v4, v1, v2 + return v5 +} + +; check: subs wzr, w0, #42 +; nextln: csel x0, x1, x2, eq