x64 regalloc register order: put caller-saves (volatiles) first.

The x64 backend currently builds the `RealRegUniverse` in a way that
generates somewhat suboptimal code. In many blocks, we see callee-save
(non-volatile) registers (r12, r13, r14, rbx) used first, even in very
short leaf functions where there are plenty of volatiles to use. This
leads to unnecessary spills/reloads.
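
As an illustration of the intended ordering, here is a minimal,
self-contained Rust sketch; the `Gpr` enum and `allocatable_gprs`
helper are hypothetical stand-ins for exposition, not the actual
`RealRegUniverse` construction in this backend or in regalloc.rs:

    // Hypothetical sketch: the allocator tends to prefer registers in the
    // order the universe lists them, so putting caller-saved (volatile)
    // GPRs first means short leaf functions never touch a callee-save and
    // need no prologue/epilogue clobber-saves.
    #[derive(Debug, Clone, Copy, PartialEq, Eq)]
    enum Gpr {
        // Caller-saved (volatile) in the SysV x64 ABI.
        Rax, Rcx, Rdx, Rsi, Rdi, R8, R9, R10, R11,
        // Callee-saved (non-volatile).
        Rbx, R12, R13, R14,
    }

    /// Hypothetical helper: build the allocatable-GPR ordering with
    /// volatiles first and callee-saves last.
    fn allocatable_gprs() -> Vec<Gpr> {
        use Gpr::*;
        let volatiles = [Rsi, Rdi, Rax, Rcx, Rdx, R8, R9, R10, R11];
        let callee_saves = [R12, R13, R14, Rbx];
        // Callee-saves are only reached under real register pressure.
        volatiles.iter().chain(callee_saves.iter()).copied().collect()
    }

    fn main() {
        let order = allocatable_gprs();
        // r12 only appears after every volatile has been listed, so a
        // short leaf function never leaves the volatile set.
        assert!(order.iter().position(|&r| r == Gpr::R12).unwrap() >= 9);
        println!("allocation order: {:?}", order);
    }

With an ordering along these lines, the small leaf functions in the
filetests below stay entirely within volatiles, which is what the
updated check lines reflect.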

On one (local) test program, a medium-sized C benchmark compiled to Wasm
and run on Wasmtime, I see a ~10% performance improvement with this
change. The benefit will be less pronounced in programs with high
register pressure (where we are likely to use all registers regardless,
so the prologue/epilogue will save/restore all callee-saves anyway) or
in programs with fewer calls, but this is a clear win for small
functions and in many cases removes prologue/epilogue clobber-saves
altogether.

Separately, I think the RA's coalescing is tripping up a bit in some
cases; see e.g. the filetest touched by this commit that loads a value
into %rsi and then moves it to %rax before returning immediately. That
is an orthogonal issue, though, and should be addressed (if worthwhile)
in regalloc.rs.

Author: Chris Fallin
Date:   2020-12-06 22:26:42 -08:00
Parent: fc752efa89
Commit: 1dddba649a
7 changed files with 74 additions and 68 deletions

@@ -7,7 +7,7 @@ block0(v0: i64, v1: i64):
v2 = iadd v0, v1
v3 = load.i64 v2
return v3
-; check: movq 0(%rdi,%rsi,1), %r12
+; check: movq 0(%rdi,%rsi,1), %rsi
}
function %amode_add_imm(i64) -> i64 {
@@ -16,7 +16,7 @@ block0(v0: i64):
v2 = iadd v0, v1
v3 = load.i64 v2
return v3
-; check: movq 42(%rdi), %r12
+; check: movq 42(%rdi), %rsi
}
;; Same as above, but add operands have been reversed.
@@ -26,7 +26,7 @@ block0(v0: i64):
v2 = iadd v1, v0
v3 = load.i64 v2
return v3
-; check: movq 42(%rdi), %r12
+; check: movq 42(%rdi), %rsi
}
;; Make sure that uextend(cst) are ignored when the cst will naturally sign-extend.
@@ -37,5 +37,5 @@ block0(v0: i64):
v3 = iadd v2, v0
v4 = load.i64 v3
return v4
-; check: movq 42(%rdi), %r12
+; check: movq 42(%rdi), %rsi
}

@@ -11,11 +11,11 @@ function %f(i32, i64 vmctx) -> i64 {
block0(v0: i32, v1: i64):
v2 = heap_addr.i64 heap0, v0, 0x8000
-; check: movl 8(%rsi), %r12d
-; nextln: movq %rdi, %r13
-; nextln: addl $$32768, %r13d
+; check: movl 8(%rsi), %ecx
+; nextln: movq %rdi, %rax
+; nextln: addl $$32768, %eax
; nextln: jnb ; ud2 heap_oob ;
-; nextln: cmpl %r12d, %r13d
+; nextln: cmpl %ecx, %eax
; nextln: jbe label1; j label2
; check: Block 1:

@@ -6,7 +6,7 @@ function %add_from_mem_u32_1(i64, i32) -> i32 {
block0(v0: i64, v1: i32):
v2 = load.i32 v0
v3 = iadd.i32 v2, v1
-; check: addl 0(%rdi), %r12d
+; check: addl 0(%rdi), %esi
return v3
}
@@ -14,7 +14,7 @@ function %add_from_mem_u32_2(i64, i32) -> i32 {
block0(v0: i64, v1: i32):
v2 = load.i32 v0
v3 = iadd.i32 v1, v2
-; check: addl 0(%rdi), %r12d
+; check: addl 0(%rdi), %esi
return v3
}
@@ -22,7 +22,7 @@ function %add_from_mem_u64_1(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = load.i64 v0
v3 = iadd.i64 v2, v1
-; check: addq 0(%rdi), %r12
+; check: addq 0(%rdi), %rsi
return v3
}
@@ -30,7 +30,7 @@ function %add_from_mem_u64_2(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = load.i64 v0
v3 = iadd.i64 v1, v2
-; check: addq 0(%rdi), %r12
+; check: addq 0(%rdi), %rsi
return v3
}
@@ -40,8 +40,8 @@ function %add_from_mem_not_narrow(i64, i8) -> i8 {
block0(v0: i64, v1: i8):
v2 = load.i8 v0
v3 = iadd.i8 v2, v1
-; check: movzbq 0(%rdi), %r12
-; nextln: addl %esi, %r12d
+; check: movzbq 0(%rdi), %rdi
+; nextln: addl %esi, %edi
return v3
}
@@ -52,10 +52,10 @@ block0(v0: i64, v1: i64):
store.i64 v3, v1
v4 = load.i64 v3
return v4
-; check: movq 0(%rdi), %r12
-; nextln: movq %r12, %r13
-; nextln: addq %rdi, %r13
-; nextln: movq %r13, 0(%rsi)
-; nextln: movq 0(%r12,%rdi,1), %r12
-; nextln: movq %r12, %rax
+; check: movq 0(%rdi), %rax
+; nextln: movq %rax, %rcx
+; nextln: addq %rdi, %rcx
+; nextln: movq %rcx, 0(%rsi)
+; nextln: movq 0(%rax,%rdi,1), %rsi
+; nextln: movq %rsi, %rax
}

@@ -28,9 +28,9 @@ block0(v0: i32):
}
; check: movd %edi, %xmm1
; nextln: psllw %xmm1, %xmm0
-; nextln: lea const(VCodeConstant(0)), %r12
+; nextln: lea const(VCodeConstant(0)), %rsi
; nextln: shlq $$4, %rdi
-; nextln: movdqu 0(%r12,%rdi,1), %xmm1
+; nextln: movdqu 0(%rsi,%rdi,1), %xmm1
; nextln: pand %xmm1, %xmm0
function %ushr_i8x16_imm() -> i8x16 {
@@ -81,12 +81,12 @@ block0(v0: i64x2, v1: i32):
v2 = sshr v0, v1
return v2
}
-; check: pextrd.w $$0, %xmm0, %r12
-; nextln: pextrd.w $$1, %xmm0, %r13
+; check: pextrd.w $$0, %xmm0, %rsi
+; nextln: pextrd.w $$1, %xmm0, %rax
; nextln: movq %rdi, %rcx
-; nextln: sarq %cl, %r12
+; nextln: sarq %cl, %rsi
; nextln: movq %rdi, %rcx
-; nextln: sarq %cl, %r13
-; nextln: pinsrd.w $$0, %r12, %xmm1
-; nextln: pinsrd.w $$1, %r13, %xmm1
+; nextln: sarq %cl, %rax
+; nextln: pinsrd.w $$0, %rsi, %xmm1
+; nextln: pinsrd.w $$1, %rax, %xmm1
; nextln: movdqa %xmm1, %xmm0

@@ -70,8 +70,8 @@ block0:
return v1
}
; check: uninit %xmm0
-; nextln: pinsrw $$0, %r12, %xmm0
-; nextln: pinsrw $$1, %r12, %xmm0
+; nextln: pinsrw $$0, %rsi, %xmm0
+; nextln: pinsrw $$1, %rsi, %xmm0
; nextln: pshufd $$0, %xmm0, %xmm0
function %splat_i32(i32) -> i32x4 {

@@ -17,7 +17,7 @@ block0(v0: b32x4):
return v1
}
; check: ptest %xmm0, %xmm0
-; nextln: setnz %r12b
+; nextln: setnz %sil
function %vall_true_i64x2(i64x2) -> b1 {
block0(v0: i64x2):
@@ -27,4 +27,4 @@ block0(v0: i64x2):
; check: pxor %xmm1, %xmm1
; nextln: pcmpeqq %xmm0, %xmm1
; nextln: ptest %xmm1, %xmm1
-; nextln: setz %r12b
+; nextln: setz %sil