Aarch64 codegen quality: support more general add+extend computations.

Previously, our pattern-matching for generating load/store addresses was somewhat limited. For example, it could not use a register-extend address mode to handle the following CLIF:

```
   v2760 = uextend.i64 v985
   v2761 = load.i64 notrap aligned readonly v1
   v1018 = iadd v2761, v2760
   store v1017, v1018
```

This PR adds more general support for address expressions made up of additions and extensions. In particular, it pattern-matches a tree of 64-bit `iadd`s, optionally with `uextend`/`sextend` from 32-bit values at the leaves, to collect the list of all addends that form the address. It also collects all offsets at leaves, combining them. It applies a series of heuristics to make the best use of the available addressing modes, filling the load/store itself with as many 64-bit registers, zero/sign-extended 32-bit registers, and/or an offset, then computing the rest with add instructions as necessary. It attempts to make use of immediate forms (add-immediate or subtract-immediate) whenever possible, and also uses the built-in extend operators on add instructions when possible. There are certainly cases where this is not optimal (i.e., does not generate the strictly shortest sequence of instructions), but it should be good enough for most code.

Using `perf stat` to measure instruction count (runtime only, on wasmtime, after populating the cache to avoid measuring compilation), this impacts `bz2` as follows:

```
pre:

       1006.410425      task-clock (msec)         #    1.000 CPUs utilized
               113      context-switches          #    0.112 K/sec
                 1      cpu-migrations            #    0.001 K/sec
             5,036      page-faults               #    0.005 M/sec
     3,221,547,476      cycles                    #    3.201 GHz
     4,000,670,104      instructions              #    1.24  insn per cycle
   <not supported>      branches
        27,958,613      branch-misses

       1.006071348 seconds time elapsed

post:

        963.499525      task-clock (msec)         #    0.997 CPUs utilized
               117      context-switches          #    0.121 K/sec
                 0      cpu-migrations            #    0.000 K/sec
             5,081      page-faults               #    0.005 M/sec
     3,039,687,673      cycles                    #    3.155 GHz
     3,837,761,690      instructions              #    1.26  insn per cycle
   <not supported>      branches
        28,254,585      branch-misses

       0.966072682 seconds time elapsed
```

In other words, this reduces instruction count by 4.1% on `bz2`.
2020-07-21 22:50:49 -07:00
parent 399ee0a54c
commit f9b98f0ddc
5 changed files with 442 additions and 83 deletions
--- a/cranelift/filetests/filetests/vcode/aarch64/amodes.clif
+++ b/cranelift/filetests/filetests/vcode/aarch64/amodes.clif
@@ -15,7 +15,7 @@ block0(v0: i64, v1: i32):
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret

-function %f1(i64, i32) -> i32 {
+function %f2(i64, i32) -> i32 {
 block0(v0: i64, v1: i32):
  v2 = uextend.i64 v1
  v3 = load_complex.i32 v2+v0
@@ -29,7 +29,7 @@ block0(v0: i64, v1: i32):
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret

-function %f1(i64, i32) -> i32 {
+function %f3(i64, i32) -> i32 {
 block0(v0: i64, v1: i32):
  v2 = sextend.i64 v1
  v3 = load_complex.i32 v0+v2
@@ -43,7 +43,7 @@ block0(v0: i64, v1: i32):
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret

-function %f1(i64, i32) -> i32 {
+function %f4(i64, i32) -> i32 {
 block0(v0: i64, v1: i32):
  v2 = sextend.i64 v1
  v3 = load_complex.i32 v2+v0
@@ -56,3 +56,216 @@ block0(v0: i64, v1: i32):
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
+
+function %f5(i64, i32) -> i32 {
+block0(v0: i64, v1: i32):
+  v2 = sextend.i64 v1
+  v3 = iadd.i64 v0, v2
+  v4 = load.i32 v3
+  return v4
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: ldr w0, [x0, w1, SXTW]
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f6(i64, i32) -> i32 {
+block0(v0: i64, v1: i32):
+  v2 = sextend.i64 v1
+  v3 = iadd.i64 v2, v0
+  v4 = load.i32 v3
+  return v4
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: ldr w0, [x0, w1, SXTW]
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f7(i32, i32) -> i32 {
+block0(v0: i32, v1: i32):
+  v2 = uextend.i64 v0
+  v3 = uextend.i64 v1
+  v4 = iadd.i64 v2, v3
+  v5 = load.i32 v4
+  return v5
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: mov w0, w0
+; nextln: ldr w0, [x0, w1, UXTW]
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f8(i64, i32) -> i32 {
+block0(v0: i64, v1: i32):
+  v2 = sextend.i64 v1
+  v3 = iconst.i64 32
+  v4 = iadd.i64 v2, v3
+  v5 = iadd.i64 v4, v0
+  v6 = iadd.i64 v5, v5
+  v7 = load.i32 v6+4
+  return v7
+}
+
+; v6+4 = 2*v5 + 4 = 2*v4 + 2*v0 + 4 = 2*v2 + 2*v3 + 2*v0 + 4
+;      = 2*sextend($x1) + 2*$x0 + 68
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: add x2, x0, #68
+; nextln: add x0, x2, x0
+; nextln: add x0, x0, x1, SXTW
+; nextln: ldr w0, [x0, w1, SXTW]
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f9(i64, i64, i64) -> i32 {
+block0(v0: i64, v1: i64, v2: i64):
+  v3 = iconst.i64 48
+  v4 = iadd.i64 v0, v1
+  v5 = iadd.i64 v4, v2
+  v6 = iadd.i64 v5, v3
+  v7 = load.i32 v6
+  return v7
+}
+
+; v6 = $x0 + $x1 + $x2 + 48
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: add x0, x0, x2
+; nextln: add x0, x0, x1
+; nextln: ldur w0, [x0, #48]
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f10(i64, i64, i64) -> i32 {
+block0(v0: i64, v1: i64, v2: i64):
+  v3 = iconst.i64 4100
+  v4 = iadd.i64 v0, v1
+  v5 = iadd.i64 v4, v2
+  v6 = iadd.i64 v5, v3
+  v7 = load.i32 v6
+  return v7
+}
+
+; v6 = $x0 + $x1 + $x2 + 4100
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: movz x3, #4100
+; nextln: add x1, x3, x1
+; nextln: add x1, x1, x2
+; nextln: ldr w0, [x1, x0]
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f10() -> i32 {
+block0:
+  v1 = iconst.i64 1234
+  v2 = load.i32 v1
+  return v2
+}
+
+; v2 = load.i32 from the constant address 1234 (no register addends)
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: movz x0, #1234
+; nextln: ldr w0, [x0]
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f11(i64) -> i32 {
+block0(v0: i64):
+  v1 = iconst.i64 8388608 ; Imm12: 0x800 << 12
+  v2 = iadd.i64 v0, v1
+  v3 = load.i32 v2
+  return v3
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: add x0, x0, #8388608
+; nextln: ldr w0, [x0]
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f12(i64) -> i32 {
+block0(v0: i64):
+  v1 = iconst.i64 -4
+  v2 = iadd.i64 v0, v1
+  v3 = load.i32 v2
+  return v3
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: sub x0, x0, #4
+; nextln: ldr w0, [x0]
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f13(i64) -> i32 {
+block0(v0: i64):
+  v1 = iconst.i64 1000000000
+  v2 = iadd.i64 v0, v1
+  v3 = load.i32 v2
+  return v3
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: movz x1, #51712
+; nextln: movk x1, #15258, LSL #16
+; nextln: add x0, x1, x0
+; nextln: ldr w0, [x0]
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f14(i32) -> i32 {
+block0(v0: i32):
+  v1 = sextend.i64 v0
+  v2 = load.i32 v1
+  return v2
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: sxtw x0, w0
+; nextln: ldr w0, [x0]
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f15(i32, i32) -> i32 {
+block0(v0: i32, v1: i32):
+  v2 = sextend.i64 v0
+  v3 = sextend.i64 v1
+  v4 = iadd.i64 v2, v3
+  v5 = load.i32 v4
+  return v5
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: sxtw x0, w0
+; nextln: ldr w0, [x0, w1, SXTW]
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
--- a/cranelift/filetests/filetests/vcode/aarch64/heap_addr.clif
+++ b/cranelift/filetests/filetests/vcode/aarch64/heap_addr.clif
@@ -15,7 +15,7 @@ block0(v0: i64, v1: i32):
 ; check: Block 0:
 ; check: stp fp, lr, [sp, #-16]!
 ; nextln: mov fp, sp
-; nextln: ldur w2, [x0]
+; nextln: ldr w2, [x0]
 ; nextln: add w2, w2, #0
 ; nextln: subs wzr, w1, w2
 ; nextln: b.ls label1 ; b label2
--- a/cranelift/filetests/filetests/vcode/aarch64/reftypes.clif
+++ b/cranelift/filetests/filetests/vcode/aarch64/reftypes.clif
@@ -92,7 +92,7 @@ block3(v7: r64, v8: r64):
 ; nextln: ldur x19, [sp, #32]
 ; nextln: ldur x20, [sp, #40]
 ; nextln: add x1, sp, #16
-; nextln: stur x19, [x1]
+; nextln: str x19, [x1]
 ; nextln: and w0, w0, #1
 ; nextln: cbz x0, label1 ; b label3
 ; check: Block 1:
@@ -108,7 +108,7 @@ block3(v7: r64, v8: r64):
 ; nextln: b label5
 ; check: Block 5:
 ; check: add x1, sp, #16
-; nextln: ldur x1, [x1]
+; nextln: ldr x1, [x1]
 ; nextln: mov x2, x1
 ; nextln: mov x1, x19
 ; nextln: ldp x19, x20, [sp], #16
--- a/cranelift/filetests/filetests/vcode/aarch64/stack.clif
+++ b/cranelift/filetests/filetests/vcode/aarch64/stack.clif
@@ -51,7 +51,7 @@ block0:
 ; nextln: mov fp, sp
 ; nextln: sub sp, sp, #16
 ; nextln: mov x0, sp
-; nextln: ldur x0, [x0]
+; nextln: ldr x0, [x0]
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
@@ -71,7 +71,7 @@ block0:
 ; nextln: ldr x16, 8 ; b 12 ; data 100016
 ; nextln: sub sp, sp, x16, UXTX
 ; nextln: mov x0, sp
-; nextln: ldur x0, [x0]
+; nextln: ldr x0, [x0]
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
@@ -89,7 +89,7 @@ block0(v0: i64):
 ; nextln: mov fp, sp
 ; nextln: sub sp, sp, #16
 ; nextln: mov x1, sp
-; nextln: stur x0, [x1]
+; nextln: str x0, [x1]
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
@@ -109,7 +109,7 @@ block0(v0: i64):
 ; nextln: ldr x16, 8 ; b 12 ; data 100016
 ; nextln: sub sp, sp, x16, UXTX
 ; nextln: mov x1, sp
-; nextln: stur x0, [x1]
+; nextln: str x0, [x1]
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret