AArch64 codegen quality: handle add-negative-imm as subtract.

We often see patterns like:

```
    mov w2, #0xffff_ffff   // uses ORR with logical immediate form
    add w0, w1, w2
```

which is just `w0 := w1 - 1`. It would be much better to recognize when an
immediate does not fit in the 12-bit immediate field but its negation does,
and to flip the add to a subtract (and vice versa), so that we instead
generate:

```
    sub w0, w1, #1
```
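
For concreteness, here is a minimal standalone sketch of the fit-and-flip test. The `fits_in_imm12` helper is illustrative only; Cranelift's actual check is `Imm12::maybe_from_u64` (see the diff below), which accepts the same two forms:

```
// Minimal sketch, not Cranelift's API: an AArch64 arithmetic immediate
// is a 12-bit value, optionally left-shifted by 12.
fn fits_in_imm12(v: u64) -> bool {
    v < 0x1000 || (v & 0xfff == 0 && (v >> 12) < 0x1000)
}

fn main() {
    let imm: i64 = -1;
    // As a zero-padded u64, -1 is 0xffff_ffff_ffff_ffff: far too wide.
    assert!(!fits_in_imm12(imm as u64));
    // Its negation, 1, fits, so `add w0, w1, #-1` can become `sub w0, w1, #1`.
    assert!(fits_in_imm12(imm.wrapping_neg() as u64));
}
```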

We see this pattern in e.g. `bz2`, where this commit makes the following
difference (counting instructions with `perf stat`; the wasmtime cache is
filled first and the benchmark run again, so only runtime is measured):

pre:

```
        992.762250      task-clock (msec)         #    0.998 CPUs utilized
               109      context-switches          #    0.110 K/sec
                 0      cpu-migrations            #    0.000 K/sec
             5,035      page-faults               #    0.005 M/sec
     3,224,119,134      cycles                    #    3.248 GHz
     4,000,521,171      instructions              #    1.24  insn per cycle
   <not supported>      branches
        27,573,755      branch-misses

       0.995072322 seconds time elapsed
```

post:

```
        993.853850      task-clock (msec)         #    0.998 CPUs utilized
               123      context-switches          #    0.124 K/sec
                 1      cpu-migrations            #    0.001 K/sec
             5,072      page-faults               #    0.005 M/sec
     3,201,278,337      cycles                    #    3.221 GHz
     3,917,061,340      instructions              #    1.22  insn per cycle
   <not supported>      branches
        28,410,633      branch-misses

       0.996008047 seconds time elapsed
```

In other words, a 2.1% reduction in instruction count on `bz2`
(4,000,521,171 → 3,917,061,340 retired instructions).
Author: Chris Fallin
Date:   2020-07-20 13:32:05 -07:00
Parent: 4c15a4daf2
Commit: 1b80860f1f

3 changed files with 93 additions and 4 deletions

```
@@ -424,6 +424,35 @@ pub(crate) fn put_input_in_rse_imm12<C: LowerCtx<I = Inst>>(
     ResultRSEImm12::from_rse(put_input_in_rse(ctx, input, narrow_mode))
 }
 
+/// Like `put_input_in_rse_imm12` above, except is allowed to negate the
+/// argument (assuming a two's-complement representation with the given bit
+/// width) if this allows use of 12-bit immediate. Used to flip `add`s with
+/// negative immediates to `sub`s (and vice-versa).
+pub(crate) fn put_input_in_rse_imm12_maybe_negated<C: LowerCtx<I = Inst>>(
+    ctx: &mut C,
+    input: InsnInput,
+    twos_complement_bits: usize,
+    narrow_mode: NarrowValueMode,
+) -> (ResultRSEImm12, bool) {
+    assert!(twos_complement_bits <= 64);
+    if let Some(imm_value) = input_to_const(ctx, input) {
+        if let Some(i) = Imm12::maybe_from_u64(imm_value) {
+            return (ResultRSEImm12::Imm12(i), false);
+        }
+        let sign_extended =
+            ((imm_value as i64) << (64 - twos_complement_bits)) >> (64 - twos_complement_bits);
+        let inverted = sign_extended.wrapping_neg();
+        if let Some(i) = Imm12::maybe_from_u64(inverted as u64) {
+            return (ResultRSEImm12::Imm12(i), true);
+        }
+    }
+    (
+        ResultRSEImm12::from_rse(put_input_in_rse(ctx, input, narrow_mode)),
+        false,
+    )
+}
+
 pub(crate) fn put_input_in_rs_immlogic<C: LowerCtx<I = Inst>>(
     ctx: &mut C,
     input: InsnInput,
```
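
The shift pair in the helper deserves a word: a 32-bit constant such as `0xffff_ffff` arrives zero-extended in a `u64`, so negating it directly would not yield `1`. Shifting left and then arithmetic-shifting right by `64 - twos_complement_bits` sign-extends it first. A standalone worked example of the same arithmetic:

```
fn main() {
    // A 32-bit -1 arrives as the zero-extended u64 0x0000_0000_ffff_ffff.
    let imm_value: u64 = 0xffff_ffff;
    let bits: usize = 32;
    // Left shift, then arithmetic right shift, replicates bit 31 upward.
    let sign_extended = ((imm_value as i64) << (64 - bits)) >> (64 - bits);
    assert_eq!(sign_extended, -1);
    // The negation fits in 12 bits, so the add can be flipped to a sub.
    assert_eq!(sign_extended.wrapping_neg(), 1);
}
```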

```
@@ -60,8 +60,17 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let ty = ty.unwrap();
             if ty_bits(ty) < 128 {
-                let rm = put_input_in_rse_imm12(ctx, inputs[1], NarrowValueMode::None);
-                let alu_op = choose_32_64(ty, ALUOp::Add32, ALUOp::Add64);
+                let (rm, negated) = put_input_in_rse_imm12_maybe_negated(
+                    ctx,
+                    inputs[1],
+                    ty_bits(ty),
+                    NarrowValueMode::None,
+                );
+                let alu_op = if !negated {
+                    choose_32_64(ty, ALUOp::Add32, ALUOp::Add64)
+                } else {
+                    choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64)
+                };
                 ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
             } else {
                 let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
@@ -79,8 +88,17 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let ty = ty.unwrap();
             if ty_bits(ty) < 128 {
-                let rm = put_input_in_rse_imm12(ctx, inputs[1], NarrowValueMode::None);
-                let alu_op = choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64);
+                let (rm, negated) = put_input_in_rse_imm12_maybe_negated(
+                    ctx,
+                    inputs[1],
+                    ty_bits(ty),
+                    NarrowValueMode::None,
+                );
+                let alu_op = if !negated {
+                    choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64)
+                } else {
+                    choose_32_64(ty, ALUOp::Add32, ALUOp::Add64)
+                };
                 ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
             } else {
                 let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
```
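
One subtlety: `wrapping_neg` is the right choice in the helper because the most negative value at a given width negates to itself. In that case the negated value still fails the `Imm12` check, so the lowering falls through to the plain register path instead of producing a wrong flip. A quick standalone check:

```
fn main() {
    // i64::MIN has no positive counterpart; wrapping negation returns it
    // unchanged, and a value this wide never fits a 12-bit immediate,
    // so the lowering keeps the register form rather than miscompiling.
    assert_eq!(i64::MIN.wrapping_neg(), i64::MIN);
}
```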

```
@@ -380,3 +380,45 @@ block0(v0: i32, v1: i32):
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
+
+function %f26(i32) -> i32 {
+block0(v0: i32):
+  v1 = iconst.i32 -1
+  v2 = iadd.i32 v0, v1
+  return v2
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: sub w0, w0, #1
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f27(i32) -> i32 {
+block0(v0: i32):
+  v1 = iconst.i32 -1
+  v2 = isub.i32 v0, v1
+  return v2
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: add w0, w0, #1
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f28(i64) -> i64 {
+block0(v0: i64):
+  v1 = iconst.i64 -1
+  v2 = isub.i64 v0, v1
+  return v2
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: add x0, x0, #1
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
```
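
These additions are CLIF filetests: each `function` is compiled for AArch64, and the `check`/`nextln` directives assert on the emitted code line by line. Assuming the standard Cranelift workflow, they run under `clif-util test` along with the rest of the suite; note that the isub-of-`-1` cases flip to `add ..., #1`, exercising the vice-versa direction.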