Rework of MachInst isel, branch fixups and lowering, and block ordering.
This patch includes:
- A complete rework of the way that CLIF blocks and edge blocks are
lowered into VCode blocks. The new mechanism in `BlockLoweringOrder`
computes RPO over the CFG, but with a twist: it merges edge blocks into
heads or tails of original CLIF blocks wherever possible, and it does
this without ever actually materializing the full nodes-plus-edges
graph first. The backend driver lowers blocks in final order so
there's no need to reshuffle later.
- A new `MachBuffer` that replaces the `MachSection`. This is a special
version of a code-sink that is far more than a humble `Vec<u8>`. In
particular, it keeps a record of label definitions and label uses,
with a machine-pluggable `LabelUse` trait that defines various types
of fixups (basically internal relocations).
Importantly, it implements some simple peephole-style branch rewrites
*inline in the emission pass*, without any separate traversals over
the code to use fallthroughs, swap taken/not-taken arms, etc. It
tracks branches at the tail of the buffer and can (i) remove blocks
that are just unconditional branches (by redirecting the label), (ii)
understand a conditional/unconditional pair and swap the conditional
polarity when it's helpful; and (iii) remove branches that branch to
the fallthrough PC.
The `MachBuffer` also implements branch-island support. On
architectures like AArch64, this is needed to allow conditional
branches within plausibly-attainable ranges (+/- 1MB on AArch64
specifically). It also does this inline while streaming through the
emission, without any sort of fixpoint algorithm or later moving of
code, by simply tracking outstanding references and "deadlines" and
emitting an island just-in-time when we're in danger of going out of
range.
- A rework of the instruction selector driver. This is largely following
the same algorithm as before, but is cleaned up significantly, in
particular in the API: the machine backend can ask for an input arg
and get any of three forms (constant, register, producing
instruction), indicating it needs the register or can merge the
constant or producing instruction as appropriate. This new driver
takes special care to emit constants right at use-sites (and at phi
inputs), minimizing their live-ranges, and also special-cases the
"pinned register" to avoid superfluous moves.
Overall, on `bz2.wasm`, the results are:
wasmtime full run (compile + runtime) of bz2:
baseline: 9774M insns, 9742M cycles, 3.918s
w/ changes: 7012M insns, 6888M cycles, 2.958s (24.5% faster, 28.3% fewer insns)
clif-util wasm compile bz2:
baseline: 2633M insns, 3278M cycles, 1.034s
w/ changes: 2366M insns, 2920M cycles, 0.923s (10.7% faster, 10.1% fewer insns)
All numbers are averages of two runs on an Ampere eMAG.
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
test vcode
|
||||
target aarch64
|
||||
|
||||
function %f(i64, i64) -> i64 {
|
||||
function %f1(i64, i64) -> i64 {
|
||||
block0(v0: i64, v1: i64):
|
||||
v2 = iadd.i64 v0, v1
|
||||
return v2
|
||||
@@ -15,7 +15,7 @@ block0(v0: i64, v1: i64):
|
||||
; nextln: ret
|
||||
|
||||
|
||||
function %f(i64, i64) -> i64 {
|
||||
function %f2(i64, i64) -> i64 {
|
||||
block0(v0: i64, v1: i64):
|
||||
v2 = isub.i64 v0, v1
|
||||
return v2
|
||||
@@ -28,7 +28,7 @@ block0(v0: i64, v1: i64):
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
function %f(i64, i64) -> i64 {
|
||||
function %f3(i64, i64) -> i64 {
|
||||
block0(v0: i64, v1: i64):
|
||||
v2 = imul.i64 v0, v1
|
||||
return v2
|
||||
@@ -41,7 +41,7 @@ block0(v0: i64, v1: i64):
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
function %f(i64, i64) -> i64 {
|
||||
function %f4(i64, i64) -> i64 {
|
||||
block0(v0: i64, v1: i64):
|
||||
v2 = umulhi.i64 v0, v1
|
||||
return v2
|
||||
@@ -54,7 +54,7 @@ block0(v0: i64, v1: i64):
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
function %f(i64, i64) -> i64 {
|
||||
function %f5(i64, i64) -> i64 {
|
||||
block0(v0: i64, v1: i64):
|
||||
v2 = smulhi.i64 v0, v1
|
||||
return v2
|
||||
@@ -67,7 +67,7 @@ block0(v0: i64, v1: i64):
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
function %f(i64, i64) -> i64 {
|
||||
function %f6(i64, i64) -> i64 {
|
||||
block0(v0: i64, v1: i64):
|
||||
v2 = sdiv.i64 v0, v1
|
||||
return v2
|
||||
@@ -87,7 +87,7 @@ block0(v0: i64, v1: i64):
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
function %f(i64) -> i64 {
|
||||
function %f7(i64) -> i64 {
|
||||
block0(v0: i64):
|
||||
v1 = iconst.i64 2
|
||||
v2 = sdiv.i64 v0, v1
|
||||
@@ -109,7 +109,7 @@ block0(v0: i64):
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
function %f(i64, i64) -> i64 {
|
||||
function %f8(i64, i64) -> i64 {
|
||||
block0(v0: i64, v1: i64):
|
||||
v2 = udiv.i64 v0, v1
|
||||
return v2
|
||||
@@ -124,7 +124,7 @@ block0(v0: i64, v1: i64):
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
function %f(i64) -> i64 {
|
||||
function %f9(i64) -> i64 {
|
||||
block0(v0: i64):
|
||||
v1 = iconst.i64 2
|
||||
v2 = udiv.i64 v0, v1
|
||||
@@ -141,7 +141,7 @@ block0(v0: i64):
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
function %f(i64, i64) -> i64 {
|
||||
function %f10(i64, i64) -> i64 {
|
||||
block0(v0: i64, v1: i64):
|
||||
v2 = srem.i64 v0, v1
|
||||
return v2
|
||||
@@ -157,7 +157,7 @@ block0(v0: i64, v1: i64):
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
function %f(i64, i64) -> i64 {
|
||||
function %f11(i64, i64) -> i64 {
|
||||
block0(v0: i64, v1: i64):
|
||||
v2 = urem.i64 v0, v1
|
||||
return v2
|
||||
@@ -174,7 +174,7 @@ block0(v0: i64, v1: i64):
|
||||
; nextln: ret
|
||||
|
||||
|
||||
function %f(i32, i32) -> i32 {
|
||||
function %f12(i32, i32) -> i32 {
|
||||
block0(v0: i32, v1: i32):
|
||||
v2 = sdiv.i32 v0, v1
|
||||
return v2
|
||||
@@ -195,48 +195,48 @@ block0(v0: i32, v1: i32):
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
function %f(i32) -> i32 {
|
||||
function %f13(i32) -> i32 {
|
||||
block0(v0: i32):
|
||||
v1 = iconst.i32 2
|
||||
v2 = sdiv.i32 v0, v1
|
||||
return v2
|
||||
}
|
||||
|
||||
; check: stp fp, lr, [sp, #-16]!
|
||||
; nextln: mov fp, sp
|
||||
; nextln: mov x1, x0
|
||||
; nextln: movz x0, #2
|
||||
; nextln: sxtw x1, w1
|
||||
; nextln: sxtw x2, w0
|
||||
; nextln: sdiv x0, x1, x2
|
||||
; nextln: cbz x2, 20
|
||||
; nextln: adds wzr, w2, #1
|
||||
; nextln: ccmp w1, #1, #nzcv, eq
|
||||
; nextln: b.vc 12
|
||||
; nextln: udf
|
||||
; nextln: udf
|
||||
; nextln: mov sp, fp
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
; check: stp fp, lr, [sp, #-16]!
|
||||
; nextln: mov fp, sp
|
||||
; nextln: sxtw x1, w0
|
||||
; nextln: movz x0, #2
|
||||
; nextln: sxtw x2, w0
|
||||
; nextln: sdiv x0, x1, x2
|
||||
; nextln: cbz x2, 20
|
||||
; nextln: adds wzr, w2, #1
|
||||
; nextln: ccmp w1, #1, #nzcv, eq
|
||||
; nextln: b.vc 12
|
||||
; nextln: udf
|
||||
; nextln: udf
|
||||
; nextln: mov sp, fp
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
function %f(i32, i32) -> i32 {
|
||||
function %f14(i32, i32) -> i32 {
|
||||
block0(v0: i32, v1: i32):
|
||||
v2 = udiv.i32 v0, v1
|
||||
return v2
|
||||
}
|
||||
|
||||
; check: stp fp, lr, [sp, #-16]!
|
||||
; nextln: mov fp, sp
|
||||
; nextln: mov w0, w0
|
||||
; nextln: mov w1, w1
|
||||
; nextln: udiv x0, x0, x1
|
||||
; nextln: cbnz x1, 8
|
||||
; nextln: udf
|
||||
; nextln: mov sp, fp
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
; check: stp fp, lr, [sp, #-16]!
|
||||
; nextln: mov fp, sp
|
||||
; nextln: mov w0, w0
|
||||
; nextln: mov w1, w1
|
||||
; nextln: udiv x0, x0, x1
|
||||
; nextln: cbnz x1, 8
|
||||
; nextln: udf
|
||||
; nextln: mov sp, fp
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
function %f(i32) -> i32 {
|
||||
|
||||
function %f15(i32) -> i32 {
|
||||
block0(v0: i32):
|
||||
v1 = iconst.i32 2
|
||||
v2 = udiv.i32 v0, v1
|
||||
@@ -245,9 +245,8 @@ block0(v0: i32):
|
||||
|
||||
; check: stp fp, lr, [sp, #-16]!
|
||||
; nextln: mov fp, sp
|
||||
; nextln: movz x1, #2
|
||||
; nextln: mov w0, w0
|
||||
; nextln: mov w1, w1
|
||||
; nextln: movz x1, #2
|
||||
; nextln: udiv x0, x0, x1
|
||||
; nextln: cbnz x1, 8
|
||||
; nextln: udf
|
||||
@@ -255,7 +254,7 @@ block0(v0: i32):
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
function %f(i32, i32) -> i32 {
|
||||
function %f16(i32, i32) -> i32 {
|
||||
block0(v0: i32, v1: i32):
|
||||
v2 = srem.i32 v0, v1
|
||||
return v2
|
||||
@@ -273,7 +272,7 @@ block0(v0: i32, v1: i32):
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
function %f(i32, i32) -> i32 {
|
||||
function %f17(i32, i32) -> i32 {
|
||||
block0(v0: i32, v1: i32):
|
||||
v2 = urem.i32 v0, v1
|
||||
return v2
|
||||
@@ -291,7 +290,7 @@ block0(v0: i32, v1: i32):
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
function %f(i64, i64) -> i64 {
|
||||
function %f18(i64, i64) -> i64 {
|
||||
block0(v0: i64, v1: i64):
|
||||
v2 = band.i64 v0, v1
|
||||
return v2
|
||||
@@ -304,7 +303,7 @@ block0(v0: i64, v1: i64):
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
function %f(i64, i64) -> i64 {
|
||||
function %f19(i64, i64) -> i64 {
|
||||
block0(v0: i64, v1: i64):
|
||||
v2 = bor.i64 v0, v1
|
||||
return v2
|
||||
@@ -317,7 +316,7 @@ block0(v0: i64, v1: i64):
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
function %f(i64, i64) -> i64 {
|
||||
function %f20(i64, i64) -> i64 {
|
||||
block0(v0: i64, v1: i64):
|
||||
v2 = bxor.i64 v0, v1
|
||||
return v2
|
||||
@@ -330,7 +329,7 @@ block0(v0: i64, v1: i64):
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
function %f(i64, i64) -> i64 {
|
||||
function %f21(i64, i64) -> i64 {
|
||||
block0(v0: i64, v1: i64):
|
||||
v2 = band_not.i64 v0, v1
|
||||
return v2
|
||||
@@ -343,7 +342,7 @@ block0(v0: i64, v1: i64):
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
function %f(i64, i64) -> i64 {
|
||||
function %f22(i64, i64) -> i64 {
|
||||
block0(v0: i64, v1: i64):
|
||||
v2 = bor_not.i64 v0, v1
|
||||
return v2
|
||||
@@ -356,7 +355,7 @@ block0(v0: i64, v1: i64):
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
function %f(i64, i64) -> i64 {
|
||||
function %f23(i64, i64) -> i64 {
|
||||
block0(v0: i64, v1: i64):
|
||||
v2 = bxor_not.i64 v0, v1
|
||||
return v2
|
||||
@@ -369,7 +368,7 @@ block0(v0: i64, v1: i64):
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
function %f(i64, i64) -> i64 {
|
||||
function %f24(i64, i64) -> i64 {
|
||||
block0(v0: i64, v1: i64):
|
||||
v2 = bnot.i64 v0
|
||||
return v2
|
||||
|
||||
@@ -30,17 +30,18 @@ block2:
|
||||
return v5
|
||||
}
|
||||
|
||||
; check: Block 0:
|
||||
; check: stp fp, lr, [sp, #-16]!
|
||||
; nextln: mov fp, sp
|
||||
; nextln: subs xzr, x0, x1
|
||||
; nextln: b.eq 20
|
||||
; check: Block 2:
|
||||
; check: movz x0, #2
|
||||
; nextln: b.eq label1 ; b label2
|
||||
; check: Block 1:
|
||||
; check: movz x0, #1
|
||||
; nextln: mov sp, fp
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
; check: Block 1:
|
||||
; check: movz x0, #1
|
||||
; check: Block 2:
|
||||
; check: movz x0, #2
|
||||
; nextln: mov sp, fp
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
@@ -30,15 +30,15 @@ block5(v5: i64):
|
||||
|
||||
; check: subs wzr, w0, #3
|
||||
; nextln: b.hs
|
||||
; nextln: adr x2, pc+16 ; ldrsw x1, [x2, x0, LSL 2] ; add x2, x2, x1 ; br x2 ; jt_entries
|
||||
; nextln: adr x1, pc+16 ; ldrsw x2, [x1, x0, LSL 2] ; add x1, x1, x2 ; br x1 ; jt_entries
|
||||
|
||||
; check: movz x1, #3
|
||||
; check: movz x1, #1
|
||||
; nextln: b
|
||||
|
||||
; check: movz x1, #2
|
||||
; nextln: b
|
||||
|
||||
; check: movz x1, #1
|
||||
; check: movz x1, #3
|
||||
|
||||
; check: add x0, x0, x1
|
||||
|
||||
|
||||
@@ -25,10 +25,10 @@ block0(v0: i8, v1: i8):
|
||||
|
||||
; check: stp fp, lr, [sp, #-16]!
|
||||
; nextln: mov fp, sp
|
||||
; nextln: uxtb x0, w0
|
||||
; nextln: uxtb x1, w1
|
||||
; nextln: mov v0.d[0], x0
|
||||
; nextln: mov v1.d[0], x1
|
||||
; nextln: uxtb x2, w0
|
||||
; nextln: uxtb x0, w1
|
||||
; nextln: mov v0.d[0], x2
|
||||
; nextln: mov v1.d[0], x0
|
||||
; nextln: uqadd d0, d0, d1
|
||||
; nextln: mov x0, v0.d[0]
|
||||
; nextln: mov sp, fp
|
||||
|
||||
@@ -366,15 +366,15 @@ block0(v0: i16):
|
||||
return v2
|
||||
}
|
||||
|
||||
; check: stp fp, lr, [sp, #-16]!
|
||||
; nextln: mov fp, sp
|
||||
; nextln: uxth w0, w0
|
||||
; nextln: lsr w1, w0, #6
|
||||
; nextln: lsl w0, w0, #10
|
||||
; nextln: orr w0, w0, w1
|
||||
; nextln: mov sp, fp
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
; check: stp fp, lr, [sp, #-16]!
|
||||
; nextln: mov fp, sp
|
||||
; nextln: uxth w1, w0
|
||||
; nextln: lsr w0, w1, #6
|
||||
; nextln: lsl w1, w1, #10
|
||||
; nextln: orr w0, w1, w0
|
||||
; nextln: mov sp, fp
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
function %f24(i8) -> i8 {
|
||||
block0(v0: i8):
|
||||
@@ -385,10 +385,10 @@ block0(v0: i8):
|
||||
|
||||
; check: stp fp, lr, [sp, #-16]!
|
||||
; nextln: mov fp, sp
|
||||
; nextln: uxtb w0, w0
|
||||
; nextln: lsr w1, w0, #5
|
||||
; nextln: lsl w0, w0, #3
|
||||
; nextln: orr w0, w0, w1
|
||||
; nextln: uxtb w1, w0
|
||||
; nextln: lsr w0, w1, #5
|
||||
; nextln: lsl w1, w1, #3
|
||||
; nextln: orr w0, w1, w0
|
||||
; nextln: mov sp, fp
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
Reference in New Issue
Block a user