Fix FPR saving and shadow space allocation for Windows x64.

This commit fixes both how FPR callee-saved registers are saved and how the
shadow space is allocated when laying out the stack for the Windows x64
calling convention.

Importantly, this commit removes the stack size limitation that was imposed
on Windows x64 because FPR saves previously could not always be represented
in the unwind information.

The FPR saves are now performed without using stack slots, much like the
callee-saved GPR saves. The total CSR space is given to `layout_stack` so
that it is included in the frame size and offsets the layout of spills and
explicit slots.
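
To illustrate (a minimal sketch with invented types, not Cranelift's actual
`layout_stack` interface), reserving the total CSR space up front both grows
the frame size and shifts the spill and explicit slots further down the frame:

```rust
// Hypothetical frame layout; all sizes in bytes, the stack grows downward.
// The names and numbers here are illustrative assumptions, not Cranelift's.
struct FrameLayout {
    /// Return address, pushed RBP, pushed callee-saved GPRs, and the FPR
    /// save area: reserved first, at the top of the local frame.
    csr_space: u32,
    /// Register-allocator spill slots.
    spill_space: u32,
    /// Explicit stack slots (including any shadow space).
    explicit_space: u32,
}

impl FrameLayout {
    /// The CSR space is counted as part of the frame size...
    fn frame_size(&self) -> u32 {
        self.csr_space + self.spill_space + self.explicit_space
    }

    /// ...and offsets the spills, which begin just below the CSR area.
    fn lowest_spill_offset(&self) -> i32 {
        -((self.csr_space + self.spill_space) as i32)
    }

    /// The prologue's explicit RSP adjustment excludes the bytes that the
    /// prologue's pushes already allocated.
    fn sp_adjustment(&self) -> u32 {
        self.frame_size() - self.csr_space
    }
}

fn main() {
    // Mirrors the %internal_stack_arg_function_call expectations below:
    // 48 bytes of CSR space (return address, RBP, four GPRs), ten 8-byte
    // spill slots, and a 32-byte explicit slot for the shadow space.
    let f = FrameLayout { csr_space: 48, spill_space: 80, explicit_space: 32 };
    assert_eq!(f.frame_size(), 160);
    assert_eq!(f.lowest_spill_offset(), -128); // ss9 = spill_slot 8, offset -128
    assert_eq!(f.sp_adjustment(), 112);        // adjust_sp_down_imm 112
}
```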

The FPR saves are now done via an RSP offset (after the stack pointer
adjustment), and they always follow the GPR saves on the stack. This allows a
simpler calculation of the FPR save offsets when representing the unwind
information.
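
Concretely (a hedged sketch; these helpers are invented for illustration),
with the FPR saves packed directly above the post-adjustment RSP, the byte
offset of each save, and the 16-byte-scaled value recorded by the Windows
`SaveXmm128` unwind code, fall out of simple arithmetic:

```rust
/// Byte offset from RSP (after the prologue's stack adjustment) of the i-th
/// callee-saved XMM register when `num_fprs` registers are saved in order,
/// first register at the highest address. Illustrative, not Cranelift's API.
fn fpr_save_offset(i: u32, num_fprs: u32) -> u32 {
    (num_fprs - 1 - i) * 16
}

/// The SaveXmm128 unwind code records the offset scaled down by 16.
fn save_xmm128_unwind_value(i: u32, num_fprs: u32) -> u16 {
    (fpr_save_offset(i, num_fprs) / 16) as u16
}

fn main() {
    // Three saved FPRs (xmm6, xmm7, xmm8), matching the unwind tests below:
    // xmm6 -> scaled offset 2, xmm7 -> 1, xmm8 -> 0.
    assert_eq!(save_xmm128_unwind_value(0, 3), 2);
    assert_eq!(save_xmm128_unwind_value(1, 3), 1);
    assert_eq!(save_xmm128_unwind_value(2, 3), 0);
}
```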

Additionally, the shadow space is no longer treated as an incoming argument,
but as an explicit stack slot that is laid out at the lowest possible address
in the local frame. This prevents `layout_stack` from placing a spill or
explicit slot in this reserved space. In the future, `layout_stack` should
take advantage of the *caller-provided* shadow space for spills, but this
commit does not attempt to address that.

The shadow space is now omitted from the local frame for leaf functions.
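
A sketch of that decision (names invented for illustration, not the actual
implementation): the slot exists only for non-leaf functions and is pinned to
the lowest address of the local frame, so nothing else can be laid out inside
it:

```rust
/// The Windows x64 ABI requires callers to reserve four 8-byte home slots.
const SHADOW_SPACE_SIZE: u32 = 32;

/// Returns the (size, offset) of the shadow-space stack slot relative to
/// the frame pointer, or `None` for a leaf function, which makes no calls
/// and therefore never needs to provide shadow space to a callee.
fn shadow_space_slot(is_leaf: bool, frame_size: u32) -> Option<(u32, i32)> {
    if is_leaf {
        None
    } else {
        // Lowest address in the local frame: adjacent to RSP after the
        // prologue, exactly where a callee expects its shadow space.
        Some((SHADOW_SPACE_SIZE, -(frame_size as i32)))
    }
}

fn main() {
    // Matches the tests below: a 160-byte frame places the shadow space at
    // offset -160 (ss11 = explicit_slot 32, offset -160); leaves omit it.
    assert_eq!(shadow_space_slot(false, 160), Some((32, -160)));
    assert_eq!(shadow_space_slot(true, 160), None);
}
```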

Fixes #1728.
Fixes #1587.
Fixes #1475.
commit 78c3091e84 (parent c9e3b71c39)
Author: Peter Huene
Date:   2020-05-20 00:42:44 -07:00

4 changed files with 380 additions and 263 deletions


@@ -8,29 +8,57 @@ function %one_arg(i64) windows_fastcall {
block0(v0: i64):
return
}
-; check: function %one_arg(i64 [%rcx], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
-; nextln: ss0 = incoming_arg 16, offset -48
+; check: function %one_arg(i64 [%rcx], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
+; nextln: ss0 = incoming_arg 16, offset -16
+; check: block0(v0: i64 [%rcx], v1: i64 [%rbp]):
+; nextln: x86_push v1
+; nextln: copy_special %rsp -> %rbp
+; nextln: v2 = x86_pop.i64
+; nextln: return v2
+; nextln: }
; check if we still use registers for 4 arguments
function %four_args(i64, i64, i64, i64) windows_fastcall {
block0(v0: i64, v1: i64, v2: i64, v3: i64):
return
}
-; check: function %four_args(i64 [%rcx], i64 [%rdx], i64 [%r8], i64 [%r9], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
+; check: function %four_args(i64 [%rcx], i64 [%rdx], i64 [%r8], i64 [%r9], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
+; nextln: ss0 = incoming_arg 16, offset -16
+; check: block0(v0: i64 [%rcx], v1: i64 [%rdx], v2: i64 [%r8], v3: i64 [%r9], v4: i64 [%rbp]):
+; nextln: x86_push v4
+; nextln: copy_special %rsp -> %rbp
+; nextln: v5 = x86_pop.i64
+; nextln: return v5
+; nextln: }
; check if float arguments are passed through XMM registers
function %four_float_args(f64, f64, f64, f64) windows_fastcall {
block0(v0: f64, v1: f64, v2: f64, v3: f64):
return
}
-; check: function %four_float_args(f64 [%xmm0], f64 [%xmm1], f64 [%xmm2], f64 [%xmm3], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
+; check: function %four_float_args(f64 [%xmm0], f64 [%xmm1], f64 [%xmm2], f64 [%xmm3], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
+; nextln: ss0 = incoming_arg 16, offset -16
+; check: block0(v0: f64 [%xmm0], v1: f64 [%xmm1], v2: f64 [%xmm2], v3: f64 [%xmm3], v4: i64 [%rbp]):
+; nextln: x86_push v4
+; nextln: copy_special %rsp -> %rbp
+; nextln: v5 = x86_pop.i64
+; nextln: return v5
+; nextln: }
; check if we use stack space for > 4 arguments
function %five_args(i64, i64, i64, i64, i64) windows_fastcall {
block0(v0: i64, v1: i64, v2: i64, v3: i64, v4: i64):
return
}
-; check: function %five_args(i64 [%rcx], i64 [%rdx], i64 [%r8], i64 [%r9], i64 [32], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
+; check: function %five_args(i64 [%rcx], i64 [%rdx], i64 [%r8], i64 [%r9], i64 [32], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
+; nextln: ss0 = incoming_arg 8, offset 32
+; nextln: ss1 = incoming_arg 16, offset -16
+; check: block0(v0: i64 [%rcx], v1: i64 [%rdx], v2: i64 [%r8], v3: i64 [%r9], v4: i64 [ss0], v5: i64 [%rbp]):
+; nextln: x86_push v5
+; nextln: copy_special %rsp -> %rbp
+; nextln: v6 = x86_pop.i64
+; nextln: return v6
+; nextln: }
; check that we preserve xmm6 and above if we're using them locally
function %float_callee_saves(f64, f64, f64, f64) windows_fastcall {
@@ -40,38 +68,51 @@ block0(v0: f64, v1: f64, v2: f64, v3: f64):
[-, %xmm7] v5 = fadd v0, v1
return
}
-; check: function %float_callee_sav(f64 [%xmm0], f64 [%xmm1], f64 [%xmm2], f64 [%xmm3], i64 fp [%rbp], f64x2 csr [%xmm6], f64x2 csr [%xmm7]) -> i64 fp [%rbp], f64x2 csr [%xmm6], f64x2 csr [%xmm7] windows_fastcall {
-; nextln: ss0 = explicit_slot 32, offset -80
-; nextln: ss1 = incoming_arg 16, offset -48
-; check: block0(v0: f64 [%xmm0], v1: f64 [%xmm1], v2: f64 [%xmm2], v3: f64 [%xmm3], v6: i64 [%rbp], v8: f64x2 [%xmm6], v9: f64x2 [%xmm7]):
-; nextln: x86_push v6
-; nextln: copy_special %rsp -> %rbp
-; nextln: adjust_sp_down_imm 64
-; nextln: v7 = stack_addr.i64 ss0
-; nextln: store notrap aligned v8, v7
-; nextln: store notrap aligned v9, v7+16
-; check: v10 = stack_addr.i64 ss0
-; nextln: v11 = load.f64x2 notrap aligned v10
-; nextln: v12 = load.f64x2 notrap aligned v10+16
-; nextln: adjust_sp_up_imm 64
-; nextln: v13 = x86_pop.i64
-; nextln: return v13, v11, v12
+; check: function %float_callee_sav(f64 [%xmm0], f64 [%xmm1], f64 [%xmm2], f64 [%xmm3], i64 csr [%rsp], i64 fp [%rbp], f64x2 csr [%xmm6], f64x2 csr [%xmm7]) -> i64 fp [%rbp], f64x2 csr [%xmm6], f64x2 csr [%xmm7] windows_fastcall {
+; nextln: ss0 = incoming_arg 48, offset -48
+; check: block0(v0: f64 [%xmm0], v1: f64 [%xmm1], v2: f64 [%xmm2], v3: f64 [%xmm3], v6: i64 [%rsp], v7: i64 [%rbp], v8: f64x2 [%xmm6], v9: f64x2 [%xmm7]):
+; nextln: x86_push v7
+; nextln: copy_special %rsp -> %rbp
+; nextln: adjust_sp_down_imm 32
+; nextln: store notrap aligned v8, v6+16
+; nextln: store notrap aligned v9, v6
+; nextln: v11 = load.f64x2 notrap aligned v6+16
+; nextln: v12 = load.f64x2 notrap aligned v6
+; nextln: adjust_sp_up_imm 32
+; nextln: v10 = x86_pop.i64
+; nextln: return v10, v11, v12
+; nextln: }
function %mixed_int_float(i64, f64, i64, f32) windows_fastcall {
block0(v0: i64, v1: f64, v2: i64, v3: f32):
return
}
-; check: function %mixed_int_float(i64 [%rcx], f64 [%xmm1], i64 [%r8], f32 [%xmm3], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
+; check: function %mixed_int_float(i64 [%rcx], f64 [%xmm1], i64 [%r8], f32 [%xmm3], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
+; nextln: ss0 = incoming_arg 16, offset -16
+; check: block0(v0: i64 [%rcx], v1: f64 [%xmm1], v2: i64 [%r8], v3: f32 [%xmm3], v4: i64 [%rbp]):
+; nextln: x86_push v4
+; nextln: copy_special %rsp -> %rbp
+; nextln: v5 = x86_pop.i64
+; nextln: return v5
+; nextln: }
function %ret_val_float(f32, f64, i64, i64) -> f64 windows_fastcall {
block0(v0: f32, v1: f64, v2: i64, v3: i64):
return v1
}
-; check: function %ret_val_float(f32 [%xmm0], f64 [%xmm1], i64 [%r8], i64 [%r9], i64 fp [%rbp]) -> f64 [%xmm0], i64 fp [%rbp] windows_fastcall {
+; check: function %ret_val_float(f32 [%xmm0], f64 [%xmm1], i64 [%r8], i64 [%r9], i64 fp [%rbp]) -> f64 [%xmm0], i64 fp [%rbp] windows_fastcall {
+; nextln: ss0 = incoming_arg 16, offset -16
+; check: block0(v0: f32 [%xmm0], v1: f64 [%xmm1], v2: i64 [%r8], v3: i64 [%r9], v4: i64 [%rbp]):
+; nextln: x86_push v4
+; nextln: copy_special %rsp -> %rbp
+; nextln: regmove v1, %xmm1 -> %xmm0
+; nextln: v5 = x86_pop.i64
+; nextln: return v1, v5
+; nextln: }
function %internal_stack_arg_function_call(i64) -> i64 windows_fastcall {
-fn0 = %foo(i64, i64, i64, i64) -> i64
-fn1 = %foo2(i64, i64, i64, i64) -> i64
+fn0 = %foo(i64, i64, i64, i64) -> i64 windows_fastcall
+fn1 = %foo2(i64, i64, i64, i64) -> i64 windows_fastcall
block0(v0: i64):
v1 = load.i64 v0+0
v2 = load.i64 v0+8
@@ -94,3 +135,100 @@ block0(v0: i64):
store.i64 v9, v0+72
return v10
}
+; check: function %internal_stack_a(i64 [%rcx], i64 fp [%rbp], i64 csr [%r12], i64 csr [%r13], i64 csr [%r14], i64 csr [%r15]) -> i64 [%rax], i64 fp [%rbp], i64 csr [%r12], i64 csr [%r13], i64 csr [%r14], i64 csr [%r15] windows_fastcall {
+; nextln: ss0 = spill_slot 8, offset -56
+; nextln: ss1 = spill_slot 8, offset -64
+; nextln: ss2 = spill_slot 8, offset -72
+; nextln: ss3 = spill_slot 8, offset -80
+; nextln: ss4 = spill_slot 8, offset -88
+; nextln: ss5 = spill_slot 8, offset -96
+; nextln: ss6 = spill_slot 8, offset -104
+; nextln: ss7 = spill_slot 8, offset -112
+; nextln: ss8 = spill_slot 8, offset -120
+; nextln: ss9 = spill_slot 8, offset -128
+; nextln: ss10 = incoming_arg 48, offset -48
+; nextln: ss11 = explicit_slot 32, offset -160
+; nextln: sig0 = (i64 [%rcx], i64 [%rdx], i64 [%r8], i64 [%r9]) -> i64 [%rax] windows_fastcall
+; nextln: sig1 = (i64 [%rcx], i64 [%rdx], i64 [%r8], i64 [%r9]) -> i64 [%rax] windows_fastcall
+; nextln: fn0 = %foo sig0
+; nextln: fn1 = %foo2 sig1
+; check: block0(v11: i64 [%rcx], v52: i64 [%rbp], v53: i64 [%r12], v54: i64 [%r13], v55: i64 [%r14], v56: i64 [%r15]):
+; nextln: x86_push v52
+; nextln: copy_special %rsp -> %rbp
+; nextln: x86_push v53
+; nextln: x86_push v54
+; nextln: x86_push v55
+; nextln: x86_push v56
+; nextln: adjust_sp_down_imm 112
+; nextln: v0 = spill v11
+; nextln: v12 = copy_to_ssa.i64 %rcx
+; nextln: v13 = load.i64 v12
+; nextln: v1 = spill v13
+; nextln: v14 = fill_nop v0
+; nextln: v15 = load.i64 v14+8
+; nextln: v2 = spill v15
+; nextln: v16 = fill_nop v0
+; nextln: v17 = load.i64 v16+16
+; nextln: v3 = spill v17
+; nextln: v18 = fill_nop v0
+; nextln: v19 = load.i64 v18+24
+; nextln: v4 = spill v19
+; nextln: v20 = fill_nop v0
+; nextln: v21 = load.i64 v20+32
+; nextln: v5 = spill v21
+; nextln: v22 = fill_nop v0
+; nextln: v23 = load.i64 v22+40
+; nextln: v6 = spill v23
+; nextln: v24 = fill_nop v0
+; nextln: v25 = load.i64 v24+48
+; nextln: v7 = spill v25
+; nextln: v26 = fill_nop v0
+; nextln: v27 = load.i64 v26+56
+; nextln: v8 = spill v27
+; nextln: v28 = fill_nop v0
+; nextln: v29 = load.i64 v28+64
+; nextln: v9 = spill v29
+; nextln: v30 = fill v1
+; nextln: v31 = fill v2
+; nextln: v32 = fill v3
+; nextln: v33 = fill v4
+; nextln: regmove v30, %r15 -> %rcx
+; nextln: regmove v31, %r14 -> %rdx
+; nextln: regmove v32, %r13 -> %r8
+; nextln: regmove v33, %r12 -> %r9
+; nextln: v10 = call fn0(v30, v31, v32, v33)
+; nextln: v34 = fill v1
+; nextln: v35 = fill v0
+; nextln: store v34, v35+8
+; nextln: v36 = fill v2
+; nextln: v37 = fill_nop v0
+; nextln: store v36, v37+16
+; nextln: v38 = fill v3
+; nextln: v39 = fill_nop v0
+; nextln: store v38, v39+24
+; nextln: v40 = fill v4
+; nextln: v41 = fill_nop v0
+; nextln: store v40, v41+32
+; nextln: v42 = fill v5
+; nextln: v43 = fill_nop v0
+; nextln: store v42, v43+40
+; nextln: v44 = fill v6
+; nextln: v45 = fill_nop v0
+; nextln: store v44, v45+48
+; nextln: v46 = fill v7
+; nextln: v47 = fill_nop v0
+; nextln: store v46, v47+56
+; nextln: v48 = fill v8
+; nextln: v49 = fill_nop v0
+; nextln: store v48, v49+64
+; nextln: v50 = fill v9
+; nextln: v51 = fill_nop v0
+; nextln: store v50, v51+72
+; nextln: adjust_sp_up_imm 112
+; nextln: v61 = x86_pop.i64
+; nextln: v60 = x86_pop.i64
+; nextln: v59 = x86_pop.i64
+; nextln: v58 = x86_pop.i64
+; nextln: v57 = x86_pop.i64
+; nextln: return v10, v57, v58, v59, v60, v61
+; nextln: }


@@ -3,13 +3,35 @@ set opt_level=speed_and_size
set is_pic
target x86_64 haswell
-; check the unwind information with a function with no args
-function %no_args() windows_fastcall {
+; check the unwind information with a leaf function with no args
+function %no_args_leaf() windows_fastcall {
block0:
return
}
+; sameln: version: 1
+; nextln: flags: 0
+; nextln: prologue size: 4
+; nextln: frame register: 5
+; nextln: frame register offset: 0
+; nextln: unwind codes: 2
+; nextln:
+; nextln: offset: 1
+; nextln: op: PushNonvolatileRegister
+; nextln: info: 5
+; nextln:
+; nextln: offset: 4
+; nextln: op: SetFramePointer
+; nextln: info: 0
+; check the unwind information with a non-leaf function with no args
+function %no_args() windows_fastcall {
+fn0 = %foo()
+block0:
+call fn0()
+return
+}
; sameln: version: 1
; nextln: flags: 0
; nextln: prologue size: 8
; nextln: frame register: 5
; nextln: frame register offset: 0
@@ -51,7 +73,7 @@ block0:
; nextln: offset: 17
; nextln: op: LargeStackAlloc
; nextln: info: 0
-; nextln: value: 12504 (u16)
+; nextln: value: 12500 (u16)
; check a function with large-sized stack alloc
function %large_stack() windows_fastcall {
@@ -77,7 +99,7 @@ block0:
; nextln: offset: 17
; nextln: op: LargeStackAlloc
; nextln: info: 1
-; nextln: value: 524320 (u32)
+; nextln: value: 524288 (u32)
function %fpr_with_function_call(i64, i64) windows_fastcall {
fn0 = %foo(f64, f64, i64, i64, i64) windows_fastcall;
@@ -113,9 +135,9 @@ block0(v0: i64, v1: i64):
;
; sameln: version: 1
; nextln: flags: 0
-; nextln: prologue size: 25
+; nextln: prologue size: 22
; nextln: frame register: 5
-; nextln: frame register offset: 12
+; nextln: frame register offset: 2
; nextln: unwind codes: 5
; nextln:
; nextln: offset: 1
@@ -135,10 +157,10 @@ block0(v0: i64, v1: i64):
; nextln: info: 0
; nextln: value: 23 (u16)
; nextln:
-; nextln: offset: 25
+; nextln: offset: 22
; nextln: op: SaveXmm128
; nextln: info: 15
-; nextln: value: 3 (u16)
+; nextln: value: 0 (u16)
; check a function that has CSRs
function %lots_of_registers(i64, i64) windows_fastcall {
@@ -191,9 +213,9 @@ block0(v0: i64, v1: i64):
}
; sameln: version: 1
; nextln: flags: 0
-; nextln: prologue size: 41
+; nextln: prologue size: 35
; nextln: frame register: 5
-; nextln: frame register offset: 10
+; nextln: frame register offset: 7
; nextln: unwind codes: 13
; nextln:
; nextln: offset: 1
@@ -234,19 +256,19 @@ block0(v0: i64, v1: i64):
; nextln:
; nextln: offset: 19
; nextln: op: SmallStackAlloc
-; nextln: info: 12
+; nextln: info: 8
; nextln:
-; nextln: offset: 31
+; nextln: offset: 24
; nextln: op: SaveXmm128
; nextln: info: 6
-; nextln: value: 0 (u16)
+; nextln: value: 2 (u16)
; nextln:
-; nextln: offset: 36
+; nextln: offset: 29
; nextln: op: SaveXmm128
; nextln: info: 7
; nextln: value: 1 (u16)
; nextln:
-; nextln: offset: 41
+; nextln: offset: 35
; nextln: op: SaveXmm128
; nextln: info: 8
-; nextln: value: 2 (u16)
+; nextln: value: 0 (u16)