x64 regalloc register order: put caller-saves (volatiles) first.
The x64 backend currently builds the `RealRegUniverse` in a way that generates somewhat suboptimal code: in many blocks, callee-save (non-volatile) registers (r12, r13, r14, rbx) are used first, even in very short leaf functions where there are plenty of volatiles available. This leads to unnecessary spills/reloads.

On one (local) test program, a medium-sized C benchmark compiled to Wasm and run on Wasmtime, I am seeing a ~10% performance improvement with this change. The benefit will be less pronounced in programs with high register pressure (there we are likely to use all registers regardless, so the prologue/epilogue will save/restore all callee-saves anyway) or in programs with fewer calls, but this is a clear win for small functions and in many cases removes the prologue/epilogue clobber-saves altogether.

Separately, I think the RA's coalescing is tripping up a bit in some cases; see e.g. the filetest touched by this commit that loads a value into %rsi, moves it to %rax, and returns immediately. That is an orthogonal issue, though, and should be addressed (if worthwhile) in regalloc.rs.
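To illustrate why the ordering matters, here is a toy sketch in plain Rust (not Cranelift's actual `RealRegUniverse` or regalloc.rs APIs; the register subset and the "take registers in universe order" policy are simplifications for illustration): under low pressure, an allocator that hands out the lowest-indexed free registers touches callee-saves under the old order even in a tiny leaf function, forcing prologue/epilogue clobber-saves, while the new order hands out volatiles first.

// Toy model only: not Cranelift's RealRegUniverse or regalloc.rs. It models an
// allocator that, under low register pressure, simply takes registers in the
// order the universe lists them.
#[derive(Clone, Copy, PartialEq, Eq)]
enum Kind {
    CallerSaved, // volatile: free to clobber, no prologue/epilogue cost
    CalleeSaved, // non-volatile: must be saved/restored if the function writes it
}

/// Number of callee-saves a leaf function touching `demand` registers would
/// have to save/restore, if registers are handed out in universe order.
fn clobber_saves_needed(universe: &[(&str, Kind)], demand: usize) -> usize {
    universe
        .iter()
        .take(demand)
        .filter(|&&(_, kind)| kind == Kind::CalleeSaved)
        .count()
}

fn main() {
    use Kind::*;
    // Illustrative subsets of the x64 GPRs under the SystemV ABI.
    let old_order = [
        ("%r12", CalleeSaved),
        ("%r13", CalleeSaved),
        ("%rsi", CallerSaved),
        ("%rdi", CallerSaved),
    ];
    let new_order = [
        ("%rsi", CallerSaved),
        ("%rdi", CallerSaved),
        ("%r12", CalleeSaved),
        ("%r13", CalleeSaved),
    ];

    for (label, universe) in [("old order", &old_order[..]), ("new order", &new_order[..])] {
        // A short leaf function that only needs two registers.
        let saves = clobber_saves_needed(universe, 2);
        println!("{}: {} callee-save(s) need prologue/epilogue save/restore", label, saves);
    }
}

Running the sketch prints 2 clobber-saves for the old order and 0 for the new order, which is the effect visible in the filetest diffs below (uses of %r12/%r13 become uses of %rsi/%rax/etc.).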
@@ -1,14 +1,20 @@
 //! Registers, the Universe thereof, and printing.
 //!
-//! These are ordered by sequence number, as required in the Universe. The strange ordering is
-//! intended to make callee-save registers available before caller-saved ones. This is a net win
-//! provided that each function makes at least one onward call. It'll be a net loss for leaf
-//! functions, and we should change the ordering in that case, so as to make caller-save regs
-//! available first.
+//! These are ordered by sequence number, as required in the Universe.
 //!
-//! TODO Maybe have two different universes, one for leaf functions and one for non-leaf functions?
-//! Also, they will have to be ABI dependent. Need to find a way to avoid constructing a universe
-//! for each function we compile.
+//! The caller-saved registers are placed first in order to prefer not to clobber (requiring
+//! saves/restores in prologue/epilogue code) when possible. Note that there is no other heuristic
+//! in the backend that will apply such pressure; the register allocator's cost heuristics are not
+//! aware of the cost of clobber-save/restore code.
+//!
+//! One might worry that this pessimizes code with many callsites, where using caller-saves causes
+//! us to have to save them (as we are the caller) frequently. However, the register allocator
+//! *should be* aware of *this* cost, because it sees that the call instruction modifies all of the
+//! caller-saved (i.e., callee-clobbered) registers.
+//!
+//! Hence, this ordering encodes pressure in one direction (prefer not to clobber registers that we
+//! ourselves have to save) and this is balanced against the RA's pressure in the other direction
+//! at callsites.

 use crate::settings;
 use alloc::vec::Vec;
@@ -31,44 +37,44 @@ fn gpr(enc: u8, index: u8) -> Reg {
     Reg::new_real(RegClass::I64, enc, index)
 }

-pub(crate) fn r12() -> Reg {
-    gpr(ENC_R12, 16)
-}
-pub(crate) fn r13() -> Reg {
-    gpr(ENC_R13, 17)
-}
-pub(crate) fn r14() -> Reg {
-    gpr(ENC_R14, 18)
-}
-pub(crate) fn rbx() -> Reg {
-    gpr(ENC_RBX, 19)
-}
 pub(crate) fn rsi() -> Reg {
-    gpr(6, 20)
+    gpr(6, 16)
 }
 pub(crate) fn rdi() -> Reg {
-    gpr(7, 21)
+    gpr(7, 17)
 }
 pub(crate) fn rax() -> Reg {
-    gpr(0, 22)
+    gpr(0, 18)
 }
 pub(crate) fn rcx() -> Reg {
-    gpr(1, 23)
+    gpr(1, 19)
 }
 pub(crate) fn rdx() -> Reg {
-    gpr(2, 24)
+    gpr(2, 20)
 }
 pub(crate) fn r8() -> Reg {
-    gpr(8, 25)
+    gpr(8, 21)
 }
 pub(crate) fn r9() -> Reg {
-    gpr(9, 26)
+    gpr(9, 22)
 }
 pub(crate) fn r10() -> Reg {
-    gpr(10, 27)
+    gpr(10, 23)
 }
 pub(crate) fn r11() -> Reg {
-    gpr(11, 28)
+    gpr(11, 24)
+}
+pub(crate) fn r12() -> Reg {
+    gpr(ENC_R12, 25)
+}
+pub(crate) fn r13() -> Reg {
+    gpr(ENC_R13, 26)
+}
+pub(crate) fn r14() -> Reg {
+    gpr(ENC_R14, 27)
+}
+pub(crate) fn rbx() -> Reg {
+    gpr(ENC_RBX, 28)
 }

 pub(crate) fn r15() -> Reg {
@@ -176,13 +182,6 @@ pub(crate) fn create_reg_universe_systemv(flags: &settings::Flags) -> RealRegUniverse {
     // Integer regs.
     let first_gpr = regs.len();

-    // Callee-saved, in the SystemV x86_64 ABI.
-    regs.push((r12().to_real_reg(), "%r12".into()));
-    regs.push((r13().to_real_reg(), "%r13".into()));
-    regs.push((r14().to_real_reg(), "%r14".into()));
-
-    regs.push((rbx().to_real_reg(), "%rbx".into()));
-
     // Caller-saved, in the SystemV x86_64 ABI.
     regs.push((rsi().to_real_reg(), "%rsi".into()));
     regs.push((rdi().to_real_reg(), "%rdi".into()));

@@ -194,6 +193,13 @@ pub(crate) fn create_reg_universe_systemv(flags: &settings::Flags) -> RealRegUniverse {
     regs.push((r10().to_real_reg(), "%r10".into()));
     regs.push((r11().to_real_reg(), "%r11".into()));

+    // Callee-saved, in the SystemV x86_64 ABI.
+    regs.push((r12().to_real_reg(), "%r12".into()));
+    regs.push((r13().to_real_reg(), "%r13".into()));
+    regs.push((r14().to_real_reg(), "%r14".into()));
+
+    regs.push((rbx().to_real_reg(), "%rbx".into()));
+
     // Other regs, not available to the allocator.
     debug_assert_eq!(r15(), pinned_reg());
     let allocable = if use_pinned_reg {
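For reference, the allocation order that the hunks above establish can be restated compactly as data. This is an illustrative summary only (not the actual `RealRegUniverse` construction code); the indices come from the `gpr(enc, index)` calls and the push order in `create_reg_universe_systemv` shown above.

// Illustrative restatement of the new GPR allocation order as
// (universe index, register name) pairs; not the backend's real code.
const GPR_ALLOCATION_ORDER: &[(u8, &str)] = &[
    // Caller-saved (volatile) registers come first, so low-pressure code prefers them.
    (16, "%rsi"),
    (17, "%rdi"),
    (18, "%rax"),
    (19, "%rcx"),
    (20, "%rdx"),
    (21, "%r8"),
    (22, "%r9"),
    (23, "%r10"),
    (24, "%r11"),
    // Callee-saved (non-volatile) registers follow; using one costs a clobber-save.
    (25, "%r12"),
    (26, "%r13"),
    (27, "%r14"),
    (28, "%rbx"),
    // %r15 is omitted here: it may be reserved as the pinned register (see the
    // `use_pinned_reg` check above), so it is handled separately from this order.
];

fn main() {
    for (index, name) in GPR_ALLOCATION_ORDER {
        println!("universe index {:2}: {}", index, name);
    }
}

The filetest updates below are the mechanical consequence of this ordering: generated code that previously grabbed %r12/%r13 now uses volatiles such as %rsi, %rax, and %rcx.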
@@ -7,7 +7,7 @@ block0(v0: i64, v1: i64):
     v2 = iadd v0, v1
     v3 = load.i64 v2
     return v3
-    ; check: movq 0(%rdi,%rsi,1), %r12
+    ; check: movq 0(%rdi,%rsi,1), %rsi
 }

 function %amode_add_imm(i64) -> i64 {

@@ -16,7 +16,7 @@ block0(v0: i64):
     v2 = iadd v0, v1
     v3 = load.i64 v2
     return v3
-    ; check: movq 42(%rdi), %r12
+    ; check: movq 42(%rdi), %rsi
 }

 ;; Same as above, but add operands have been reversed.

@@ -26,7 +26,7 @@ block0(v0: i64):
     v2 = iadd v1, v0
     v3 = load.i64 v2
     return v3
-    ; check: movq 42(%rdi), %r12
+    ; check: movq 42(%rdi), %rsi
 }

 ;; Make sure that uextend(cst) are ignored when the cst will naturally sign-extend.

@@ -37,5 +37,5 @@ block0(v0: i64):
     v3 = iadd v2, v0
     v4 = load.i64 v3
     return v4
-    ; check: movq 42(%rdi), %r12
+    ; check: movq 42(%rdi), %rsi
 }
@@ -11,11 +11,11 @@ function %f(i32, i64 vmctx) -> i64 {
 block0(v0: i32, v1: i64):

     v2 = heap_addr.i64 heap0, v0, 0x8000
-    ; check: movl 8(%rsi), %r12d
-    ; nextln: movq %rdi, %r13
-    ; nextln: addl $$32768, %r13d
+    ; check: movl 8(%rsi), %ecx
+    ; nextln: movq %rdi, %rax
+    ; nextln: addl $$32768, %eax
     ; nextln: jnb ; ud2 heap_oob ;
-    ; nextln: cmpl %r12d, %r13d
+    ; nextln: cmpl %ecx, %eax
     ; nextln: jbe label1; j label2
     ; check: Block 1:
@@ -6,7 +6,7 @@ function %add_from_mem_u32_1(i64, i32) -> i32 {
 block0(v0: i64, v1: i32):
     v2 = load.i32 v0
     v3 = iadd.i32 v2, v1
-    ; check: addl 0(%rdi), %r12d
+    ; check: addl 0(%rdi), %esi
     return v3
 }

@@ -14,7 +14,7 @@ function %add_from_mem_u32_2(i64, i32) -> i32 {
 block0(v0: i64, v1: i32):
     v2 = load.i32 v0
     v3 = iadd.i32 v1, v2
-    ; check: addl 0(%rdi), %r12d
+    ; check: addl 0(%rdi), %esi
     return v3
 }

@@ -22,7 +22,7 @@ function %add_from_mem_u64_1(i64, i64) -> i64 {
 block0(v0: i64, v1: i64):
     v2 = load.i64 v0
     v3 = iadd.i64 v2, v1
-    ; check: addq 0(%rdi), %r12
+    ; check: addq 0(%rdi), %rsi
     return v3
 }

@@ -30,7 +30,7 @@ function %add_from_mem_u64_2(i64, i64) -> i64 {
 block0(v0: i64, v1: i64):
     v2 = load.i64 v0
     v3 = iadd.i64 v1, v2
-    ; check: addq 0(%rdi), %r12
+    ; check: addq 0(%rdi), %rsi
     return v3
 }

@@ -40,8 +40,8 @@ function %add_from_mem_not_narrow(i64, i8) -> i8 {
 block0(v0: i64, v1: i8):
     v2 = load.i8 v0
     v3 = iadd.i8 v2, v1
-    ; check: movzbq 0(%rdi), %r12
-    ; nextln: addl %esi, %r12d
+    ; check: movzbq 0(%rdi), %rdi
+    ; nextln: addl %esi, %edi
     return v3
 }

@@ -52,10 +52,10 @@ block0(v0: i64, v1: i64):
     store.i64 v3, v1
     v4 = load.i64 v3
     return v4
-    ; check: movq 0(%rdi), %r12
-    ; nextln: movq %r12, %r13
-    ; nextln: addq %rdi, %r13
-    ; nextln: movq %r13, 0(%rsi)
-    ; nextln: movq 0(%r12,%rdi,1), %r12
-    ; nextln: movq %r12, %rax
+    ; check: movq 0(%rdi), %rax
+    ; nextln: movq %rax, %rcx
+    ; nextln: addq %rdi, %rcx
+    ; nextln: movq %rcx, 0(%rsi)
+    ; nextln: movq 0(%rax,%rdi,1), %rsi
+    ; nextln: movq %rsi, %rax
 }
@@ -28,9 +28,9 @@ block0(v0: i32):
 }
 ; check: movd %edi, %xmm1
 ; nextln: psllw %xmm1, %xmm0
-; nextln: lea const(VCodeConstant(0)), %r12
+; nextln: lea const(VCodeConstant(0)), %rsi
 ; nextln: shlq $$4, %rdi
-; nextln: movdqu 0(%r12,%rdi,1), %xmm1
+; nextln: movdqu 0(%rsi,%rdi,1), %xmm1
 ; nextln: pand %xmm1, %xmm0

 function %ushr_i8x16_imm() -> i8x16 {

@@ -81,12 +81,12 @@ block0(v0: i64x2, v1: i32):
     v2 = sshr v0, v1
     return v2
 }
-; check: pextrd.w $$0, %xmm0, %r12
-; nextln: pextrd.w $$1, %xmm0, %r13
+; check: pextrd.w $$0, %xmm0, %rsi
+; nextln: pextrd.w $$1, %xmm0, %rax
 ; nextln: movq %rdi, %rcx
-; nextln: sarq %cl, %r12
+; nextln: sarq %cl, %rsi
 ; nextln: movq %rdi, %rcx
-; nextln: sarq %cl, %r13
-; nextln: pinsrd.w $$0, %r12, %xmm1
-; nextln: pinsrd.w $$1, %r13, %xmm1
+; nextln: sarq %cl, %rax
+; nextln: pinsrd.w $$0, %rsi, %xmm1
+; nextln: pinsrd.w $$1, %rax, %xmm1
 ; nextln: movdqa %xmm1, %xmm0
@@ -70,8 +70,8 @@ block0:
     return v1
 }
 ; check: uninit %xmm0
-; nextln: pinsrw $$0, %r12, %xmm0
-; nextln: pinsrw $$1, %r12, %xmm0
+; nextln: pinsrw $$0, %rsi, %xmm0
+; nextln: pinsrw $$1, %rsi, %xmm0
 ; nextln: pshufd $$0, %xmm0, %xmm0

 function %splat_i32(i32) -> i32x4 {
@@ -17,7 +17,7 @@ block0(v0: b32x4):
     return v1
 }
 ; check: ptest %xmm0, %xmm0
-; nextln: setnz %r12b
+; nextln: setnz %sil

 function %vall_true_i64x2(i64x2) -> b1 {
 block0(v0: i64x2):

@@ -27,4 +27,4 @@ block0(v0: i64x2):
 ; check: pxor %xmm1, %xmm1
 ; nextln: pcmpeqq %xmm0, %xmm1
 ; nextln: ptest %xmm1, %xmm1
-; nextln: setz %r12b
+; nextln: setz %sil