The x64 backend currently builds the `RealRegUniverse` in a way that generates somewhat suboptimal code. In many blocks we see callee-saved (non-volatile) registers (r12, r13, r14, rbx) used first, even in very short leaf functions where there are plenty of volatiles to use. This leads to unnecessary spills/reloads.

On one (local) test program, a medium-sized C benchmark compiled to Wasm and run on Wasmtime, I am seeing a ~10% performance improvement with this change. The win will be less pronounced in programs with high register pressure (there we are likely to use all registers regardless, so the prologue/epilogue will save/restore all callee-saves) or in programs with fewer calls, but this is a clear win for small functions, and in many cases it removes the prologue/epilogue clobber-saves altogether.

Separately, I think the RA's coalescing is tripping up a bit in some cases; see e.g. the filetest touched by this commit that loads a value into %rsi, then moves it to %rax and returns immediately. That is an orthogonal issue, though, and should be addressed (if worthwhile) in regalloc.rs.
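To make the intended ordering concrete, here is a minimal, self-contained Rust sketch of the idea. The `Gpr` enum, `RegUniverse` struct, and `build_gpr_universe` function are hypothetical stand-ins, not the actual `RealRegUniverse` types from regalloc.rs; the point is only that caller-saved registers come before callee-saved ones in allocation-preference order, so short functions never touch a register that would force a clobber-save.

// Illustrative sketch only: `Gpr` and `RegUniverse` are hypothetical stand-ins,
// not the real regalloc.rs `RealRegUniverse` API.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum Gpr {
    // Caller-saved (volatile) GPRs under the SysV x64 ABI.
    Rsi, Rdi, Rax, Rcx, Rdx, R8, R9, R10, R11,
    // Callee-saved (non-volatile) GPRs: using any of these forces a
    // save/restore pair in the prologue/epilogue.
    R12, R13, R14, R15, Rbx,
}

// Hypothetical register universe: registers listed in allocation-preference order.
struct RegUniverse {
    allocable: Vec<Gpr>,
}

fn build_gpr_universe() -> RegUniverse {
    // Volatiles first, so short leaf functions get free-to-use registers...
    let mut allocable = vec![
        Gpr::Rsi, Gpr::Rdi, Gpr::Rax, Gpr::Rcx, Gpr::Rdx,
        Gpr::R8, Gpr::R9, Gpr::R10, Gpr::R11,
    ];
    // ...and callee-saves last, reached only under real register pressure.
    allocable.extend_from_slice(&[Gpr::R12, Gpr::R13, Gpr::R14, Gpr::R15, Gpr::Rbx]);
    RegUniverse { allocable }
}

fn main() {
    let uni = build_gpr_universe();
    // A small function needing three GPRs never reaches the callee-saves,
    // so no prologue/epilogue clobber-saves are required.
    assert_eq!(&uni.allocable[..3], &[Gpr::Rsi, Gpr::Rdi, Gpr::Rax][..]);
    println!("allocation order: {:?}", uni.allocable);
}

For reference, the filetest touched by this change is reproduced below.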
test compile
set enable_simd
target x86_64 has_ssse3 has_sse41
feature "experimental_x64"

;; shuffle

function %shuffle_different_ssa_values() -> i8x16 {
block0:
    v0 = vconst.i8x16 0x00
    v1 = vconst.i8x16 0x01
    v2 = shuffle v0, v1, 0x11000000000000000000000000000000 ; pick the second lane of v1, the rest use the first lane of v0
    return v2
}
; check: load_const VCodeConstant(3), %xmm1
; nextln: load_const VCodeConstant(2), %xmm0
; nextln: load_const VCodeConstant(0), %xmm2
; nextln: pshufb %xmm2, %xmm1
; nextln: load_const VCodeConstant(1), %xmm2
; nextln: pshufb %xmm2, %xmm0
; nextln: orps %xmm1, %xmm0

function %shuffle_same_ssa_value() -> i8x16 {
block0:
    v1 = vconst.i8x16 0x01
    v2 = shuffle v1, v1, 0x13000000000000000000000000000000 ; pick the fourth lane of v1 and the rest from the first lane of v1
    return v2
}
; check: load_const VCodeConstant(1), %xmm0
; nextln: load_const VCodeConstant(0), %xmm1
; nextln: pshufb %xmm1, %xmm0

;; swizzle

function %swizzle() -> i8x16 {
block0:
    v0 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
    v1 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
    v2 = swizzle.i8x16 v0, v1
    return v2
}
; check: load_const VCodeConstant(1), %xmm1
; nextln: load_const VCodeConstant(1), %xmm0
; nextln: load_const VCodeConstant(0), %xmm2
; nextln: paddusb %xmm2, %xmm0
; nextln: pshufb %xmm0, %xmm1
; nextln: movdqa %xmm1, %xmm0

;; splat

function %splat_i8(i8) -> i8x16 {
block0(v0: i8):
    v1 = splat.i8x16 v0
    return v1
}
; check: uninit %xmm0
; nextln: pinsrb $$0, %rdi, %xmm0
; nextln: pxor %xmm1, %xmm1
; nextln: pshufb %xmm1, %xmm0

function %splat_b16() -> b16x8 {
block0:
    v0 = bconst.b16 true
    v1 = splat.b16x8 v0
    return v1
}
; check: uninit %xmm0
; nextln: pinsrw $$0, %rsi, %xmm0
; nextln: pinsrw $$1, %rsi, %xmm0
; nextln: pshufd $$0, %xmm0, %xmm0

function %splat_i32(i32) -> i32x4 {
block0(v0: i32):
    v1 = splat.i32x4 v0
    return v1
}
; check: uninit %xmm0
; nextln: pinsrd $$0, %rdi, %xmm0
; nextln: pshufd $$0, %xmm0, %xmm0

function %splat_f64(f64) -> f64x2 {
block0(v0: f64):
    v1 = splat.f64x2 v0
    return v1
}
; check: uninit %xmm1
; nextln: movsd %xmm0, %xmm1
; nextln: movlhps %xmm0, %xmm1