The x64 backend currently builds the `RealRegUniverse` in a way that generates somewhat suboptimal code. In many blocks, we see uses of callee-save (non-volatile) registers (r12, r13, r14, rbx) first, even in very short leaf functions where there are plenty of volatiles to use, and this leads to unnecessary spills/reloads. This change orders the volatile (caller-save) registers ahead of the callee-saves in the universe, so the allocator prefers them.

On one (local) test program, a medium-sized C benchmark compiled to Wasm and run on Wasmtime, I am seeing a ~10% performance improvement with this change. The effect will be less pronounced in programs with high register pressure (there we are likely to use all registers regardless, so the prologue/epilogue will save/restore all callee-saves) or in programs with fewer calls, but this is a clear win for small functions, and in many cases it removes the prologue/epilogue clobber-saves altogether.

Separately, I think the RA's coalescing is tripping up a bit in some cases; see e.g. the filetest touched by this commit that loads a value into %rsi, then moves it to %rax and returns immediately. This is an orthogonal issue, though, and should be addressed (if worthwhile) in regalloc.rs.
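For illustration, here is a minimal, self-contained sketch of the "volatiles first" ordering idea. It is not the actual `RealRegUniverse` construction in regalloc.rs; the types and names (`SaveClass`, `GprDesc`, `allocation_order`) are hypothetical.

```rust
// A minimal sketch of "volatiles first" register ordering, using hypothetical
// types; the real backend builds a regalloc.rs `RealRegUniverse` instead.

/// Whether a register is preserved across calls by the callee.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum SaveClass {
    CallerSaved, // volatile: free to clobber, no prologue/epilogue cost
    CalleeSaved, // non-volatile: any use forces a clobber-save/restore pair
}

#[derive(Clone, Copy, Debug)]
struct GprDesc {
    name: &'static str,
    save: SaveClass,
}

/// Produce the allocation order: every caller-saved GPR before any
/// callee-saved GPR, so short functions never touch a callee-save.
fn allocation_order(gprs: &[GprDesc]) -> Vec<GprDesc> {
    let (volatile, non_volatile): (Vec<_>, Vec<_>) = gprs
        .iter()
        .copied()
        .partition(|r| r.save == SaveClass::CallerSaved);
    let mut order = volatile;
    order.extend(non_volatile);
    order
}

fn main() {
    use SaveClass::*;
    // System V x86-64 integer registers, minus %rsp/%rbp (illustrative subset).
    let gprs = [
        GprDesc { name: "rbx", save: CalleeSaved },
        GprDesc { name: "r12", save: CalleeSaved },
        GprDesc { name: "r13", save: CalleeSaved },
        GprDesc { name: "r14", save: CalleeSaved },
        GprDesc { name: "r15", save: CalleeSaved },
        GprDesc { name: "rax", save: CallerSaved },
        GprDesc { name: "rcx", save: CallerSaved },
        GprDesc { name: "rdx", save: CallerSaved },
        GprDesc { name: "rsi", save: CallerSaved },
        GprDesc { name: "rdi", save: CallerSaved },
        GprDesc { name: "r8", save: CallerSaved },
        GprDesc { name: "r9", save: CallerSaved },
        GprDesc { name: "r10", save: CallerSaved },
        GprDesc { name: "r11", save: CallerSaved },
    ];
    // Prints the caller-saved registers first; a short leaf function that
    // needs only a few registers therefore never allocates rbx/r12-r15, and
    // the prologue/epilogue clobber-saves disappear.
    for r in allocation_order(&gprs) {
        println!("{} ({:?})", r.name, r.save);
    }
}
```

In a function with high register pressure all of these registers get used anyway, so, as noted above, the benefit shows up mostly in small functions with few live values.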
test compile
target x86_64
feature "experimental_x64"

function %add_from_mem_u32_1(i64, i32) -> i32 {
block0(v0: i64, v1: i32):
    v2 = load.i32 v0
    v3 = iadd.i32 v2, v1
    ; check: addl 0(%rdi), %esi
    return v3
}

function %add_from_mem_u32_2(i64, i32) -> i32 {
block0(v0: i64, v1: i32):
    v2 = load.i32 v0
    v3 = iadd.i32 v1, v2
    ; check: addl 0(%rdi), %esi
    return v3
}

function %add_from_mem_u64_1(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
    v2 = load.i64 v0
    v3 = iadd.i64 v2, v1
    ; check: addq 0(%rdi), %rsi
    return v3
}

function %add_from_mem_u64_2(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
    v2 = load.i64 v0
    v3 = iadd.i64 v1, v2
    ; check: addq 0(%rdi), %rsi
    return v3
}

; test narrow loads: 8-bit load should not merge because the `addl` is 32 bits
; and would load 32 bits from memory, which may go beyond the end of the heap.
function %add_from_mem_not_narrow(i64, i8) -> i8 {
block0(v0: i64, v1: i8):
    v2 = load.i8 v0
    v3 = iadd.i8 v2, v1
    ; check: movzbq 0(%rdi), %rdi
    ; nextln: addl %esi, %edi
    return v3
}

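; Note: in the checks below, the result is loaded into %rsi and then
; immediately moved into %rax for the return. As mentioned in the description
; above, this extra move appears to be a regalloc.rs coalescing artifact and
; is orthogonal to this change.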
function %no_merge_if_lookback_use(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
    v2 = load.i64 v0
    v3 = iadd.i64 v2, v0
    store.i64 v3, v1
    v4 = load.i64 v3
    return v4
    ; check: movq 0(%rdi), %rax
    ; nextln: movq %rax, %rcx
    ; nextln: addq %rdi, %rcx
    ; nextln: movq %rcx, 0(%rsi)
    ; nextln: movq 0(%rax,%rdi,1), %rsi
    ; nextln: movq %rsi, %rax
}