AArch64 codegen quality: handle add-negative-imm as subtract.

We often see patterns like:

```
    mov w2, #0xffff_ffff   // uses ORR with logical immediate form
    add w0, w1, w2
```

which is just `w0 := w1 - 1`. It would be much better to recognize when an
immediate does not fit in the 12-bit immediate field but its negation does,
and to flip the add to a subtract (and vice versa), so that we instead
generate:

```
    sub w0, w1, #1
```
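
For concreteness, here is a minimal standalone sketch of the fit-and-flip test. The `fits_in_imm12` helper is illustrative only; Cranelift's actual check is `Imm12::maybe_from_u64` (see the diff below), which accepts the same two forms:

```
// Minimal sketch, not Cranelift's API: an AArch64 arithmetic immediate
// is a 12-bit value, optionally left-shifted by 12.
fn fits_in_imm12(v: u64) -> bool {
    v < 0x1000 || (v & 0xfff == 0 && (v >> 12) < 0x1000)
}

fn main() {
    let imm: i64 = -1;
    // As a zero-padded u64, -1 is 0xffff_ffff_ffff_ffff: far too wide.
    assert!(!fits_in_imm12(imm as u64));
    // Its negation, 1, fits, so `add w0, w1, #-1` can become `sub w0, w1, #1`.
    assert!(fits_in_imm12(imm.wrapping_neg() as u64));
}
```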

We see this pattern in e.g. `bz2`, where this commit makes the following
difference (counting instructions with `perf stat`; the wasmtime cache is
filled first and the benchmark run again, so only runtime is measured):

pre:

```
        992.762250      task-clock (msec)         #    0.998 CPUs utilized
               109      context-switches          #    0.110 K/sec
                 0      cpu-migrations            #    0.000 K/sec
             5,035      page-faults               #    0.005 M/sec
     3,224,119,134      cycles                    #    3.248 GHz
     4,000,521,171      instructions              #    1.24  insn per cycle
   <not supported>      branches
        27,573,755      branch-misses

       0.995072322 seconds time elapsed
```

post:

```
        993.853850      task-clock (msec)         #    0.998 CPUs utilized
               123      context-switches          #    0.124 K/sec
                 1      cpu-migrations            #    0.001 K/sec
             5,072      page-faults               #    0.005 M/sec
     3,201,278,337      cycles                    #    3.221 GHz
     3,917,061,340      instructions              #    1.22  insn per cycle
   <not supported>      branches
        28,410,633      branch-misses

       0.996008047 seconds time elapsed
```

In other words, a 2.1% reduction in instruction count on `bz2`
(4,000,521,171 → 3,917,061,340 retired instructions).
Author: Chris Fallin
Date:   2020-07-20 13:32:05 -07:00
Parent: 4c15a4daf2
Commit: 1b80860f1f

3 changed files with 93 additions and 4 deletions

```
@@ -424,6 +424,35 @@ pub(crate) fn put_input_in_rse_imm12<C: LowerCtx<I = Inst>>(
     ResultRSEImm12::from_rse(put_input_in_rse(ctx, input, narrow_mode))
 }
 
+/// Like `put_input_in_rse_imm12` above, except is allowed to negate the
+/// argument (assuming a two's-complement representation with the given bit
+/// width) if this allows use of 12-bit immediate. Used to flip `add`s with
+/// negative immediates to `sub`s (and vice-versa).
+pub(crate) fn put_input_in_rse_imm12_maybe_negated<C: LowerCtx<I = Inst>>(
+    ctx: &mut C,
+    input: InsnInput,
+    twos_complement_bits: usize,
+    narrow_mode: NarrowValueMode,
+) -> (ResultRSEImm12, bool) {
+    assert!(twos_complement_bits <= 64);
+    if let Some(imm_value) = input_to_const(ctx, input) {
+        if let Some(i) = Imm12::maybe_from_u64(imm_value) {
+            return (ResultRSEImm12::Imm12(i), false);
+        }
+        let sign_extended =
+            ((imm_value as i64) << (64 - twos_complement_bits)) >> (64 - twos_complement_bits);
+        let inverted = sign_extended.wrapping_neg();
+        if let Some(i) = Imm12::maybe_from_u64(inverted as u64) {
+            return (ResultRSEImm12::Imm12(i), true);
+        }
+    }
+    (
+        ResultRSEImm12::from_rse(put_input_in_rse(ctx, input, narrow_mode)),
+        false,
+    )
+}
+
 pub(crate) fn put_input_in_rs_immlogic<C: LowerCtx<I = Inst>>(
     ctx: &mut C,
     input: InsnInput,
```
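
The shift pair in the helper deserves a word: a 32-bit constant such as `0xffff_ffff` arrives zero-extended in a `u64`, so negating it directly would not yield `1`. Shifting left and then arithmetic-shifting right by `64 - twos_complement_bits` sign-extends it first. A standalone worked example of the same arithmetic:

```
fn main() {
    // A 32-bit -1 arrives as the zero-extended u64 0x0000_0000_ffff_ffff.
    let imm_value: u64 = 0xffff_ffff;
    let bits: usize = 32;
    // Left shift, then arithmetic right shift, replicates bit 31 upward.
    let sign_extended = ((imm_value as i64) << (64 - bits)) >> (64 - bits);
    assert_eq!(sign_extended, -1);
    // The negation fits in 12 bits, so the add can be flipped to a sub.
    assert_eq!(sign_extended.wrapping_neg(), 1);
}
```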

```
@@ -60,8 +60,17 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let ty = ty.unwrap();
             if ty_bits(ty) < 128 {
-                let rm = put_input_in_rse_imm12(ctx, inputs[1], NarrowValueMode::None);
-                let alu_op = choose_32_64(ty, ALUOp::Add32, ALUOp::Add64);
+                let (rm, negated) = put_input_in_rse_imm12_maybe_negated(
+                    ctx,
+                    inputs[1],
+                    ty_bits(ty),
+                    NarrowValueMode::None,
+                );
+                let alu_op = if !negated {
+                    choose_32_64(ty, ALUOp::Add32, ALUOp::Add64)
+                } else {
+                    choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64)
+                };
                 ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
             } else {
                 let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
@@ -79,8 +88,17 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let ty = ty.unwrap();
             if ty_bits(ty) < 128 {
-                let rm = put_input_in_rse_imm12(ctx, inputs[1], NarrowValueMode::None);
-                let alu_op = choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64);
+                let (rm, negated) = put_input_in_rse_imm12_maybe_negated(
+                    ctx,
+                    inputs[1],
+                    ty_bits(ty),
+                    NarrowValueMode::None,
+                );
+                let alu_op = if !negated {
+                    choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64)
+                } else {
+                    choose_32_64(ty, ALUOp::Add32, ALUOp::Add64)
+                };
                 ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
             } else {
                 let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
```
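
One subtlety: `wrapping_neg` is the right choice in the helper because the most negative value at a given width negates to itself. In that case the negated value still fails the `Imm12` check, so the lowering falls through to the plain register path instead of producing a wrong flip. A quick standalone check:

```
fn main() {
    // i64::MIN has no positive counterpart; wrapping negation returns it
    // unchanged, and a value this wide never fits a 12-bit immediate,
    // so the lowering keeps the register form rather than miscompiling.
    assert_eq!(i64::MIN.wrapping_neg(), i64::MIN);
}
```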

```
@@ -380,3 +380,45 @@ block0(v0: i32, v1: i32):
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
+
+function %f26(i32) -> i32 {
+block0(v0: i32):
+  v1 = iconst.i32 -1
+  v2 = iadd.i32 v0, v1
+  return v2
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: sub w0, w0, #1
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f27(i32) -> i32 {
+block0(v0: i32):
+  v1 = iconst.i32 -1
+  v2 = isub.i32 v0, v1
+  return v2
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: add w0, w0, #1
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f28(i64) -> i64 {
+block0(v0: i64):
+  v1 = iconst.i64 -1
+  v2 = isub.i64 v0, v1
+  return v2
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: add x0, x0, #1
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
```
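
These additions are CLIF filetests: each `function` is compiled for AArch64, and the `check`/`nextln` directives assert on the emitted code line by line. Assuming the standard Cranelift workflow, they run under `clif-util test` along with the rest of the suite; note that the isub-of-`-1` cases flip to `add ..., #1`, exercising the vice-versa direction.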