Fix FPR saving and shadow space allocation for Windows x64.
This commit fixes both how FPR callee-saved registers are saved and how the shadow space is allocated when laying out the stack for the Windows x64 calling convention. Importantly, this commit removes the compiler's stack size limitation for Windows x64, which was imposed because FPR saves previously couldn't always be represented in the unwind information. The FPR saves are now performed without using stack slots, much like how the callee-saved GPRs are saved. The total CSR space is given to `layout_stack` so that it is included in the frame size and so that it offsets the layout of spills and explicit slots. The FPR saves are now done via an RSP offset (post adjustment), and they always follow the GPR saves on the stack. A simpler calculation can now be made to determine the proper offsets of the FPR saves for representing the unwind information. Additionally, the shadow space is no longer treated as an incoming argument, but as an explicit stack slot that gets laid out at the lowest address possible in the local frame. This prevents `layout_stack` from putting a spill or explicit slot in this reserved space. In the future, `layout_stack` should take advantage of the *caller-provided* shadow space for spills, but this commit does not attempt to address that. The shadow space is now omitted from the local frame for leaf functions. Fixes #1728. Fixes #1587. Fixes #1475.
This commit is contained in:
@@ -8,29 +8,57 @@ function %one_arg(i64) windows_fastcall {
|
||||
block0(v0: i64):
|
||||
return
|
||||
}
|
||||
; check: function %one_arg(i64 [%rcx], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
|
||||
; nextln: ss0 = incoming_arg 16, offset -48
|
||||
; check: function %one_arg(i64 [%rcx], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
|
||||
; nextln: ss0 = incoming_arg 16, offset -16
|
||||
; check: block0(v0: i64 [%rcx], v1: i64 [%rbp]):
|
||||
; nextln: x86_push v1
|
||||
; nextln: copy_special %rsp -> %rbp
|
||||
; nextln: v2 = x86_pop.i64
|
||||
; nextln: return v2
|
||||
; nextln: }
|
||||
|
||||
; check if we still use registers for 4 arguments
|
||||
function %four_args(i64, i64, i64, i64) windows_fastcall {
|
||||
block0(v0: i64, v1: i64, v2: i64, v3: i64):
|
||||
return
|
||||
}
|
||||
; check: function %four_args(i64 [%rcx], i64 [%rdx], i64 [%r8], i64 [%r9], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
|
||||
; check: function %four_args(i64 [%rcx], i64 [%rdx], i64 [%r8], i64 [%r9], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
|
||||
; nextln: ss0 = incoming_arg 16, offset -16
|
||||
; check: block0(v0: i64 [%rcx], v1: i64 [%rdx], v2: i64 [%r8], v3: i64 [%r9], v4: i64 [%rbp]):
|
||||
; nextln: x86_push v4
|
||||
; nextln: copy_special %rsp -> %rbp
|
||||
; nextln: v5 = x86_pop.i64
|
||||
; nextln: return v5
|
||||
; nextln: }
|
||||
|
||||
; check if float arguments are passed through XMM registers
|
||||
function %four_float_args(f64, f64, f64, f64) windows_fastcall {
|
||||
block0(v0: f64, v1: f64, v2: f64, v3: f64):
|
||||
return
|
||||
}
|
||||
; check: function %four_float_args(f64 [%xmm0], f64 [%xmm1], f64 [%xmm2], f64 [%xmm3], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
|
||||
; check: function %four_float_args(f64 [%xmm0], f64 [%xmm1], f64 [%xmm2], f64 [%xmm3], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
|
||||
; nextln: ss0 = incoming_arg 16, offset -16
|
||||
; check: block0(v0: f64 [%xmm0], v1: f64 [%xmm1], v2: f64 [%xmm2], v3: f64 [%xmm3], v4: i64 [%rbp]):
|
||||
; nextln: x86_push v4
|
||||
; nextln: copy_special %rsp -> %rbp
|
||||
; nextln: v5 = x86_pop.i64
|
||||
; nextln: return v5
|
||||
; nextln: }
|
||||
|
||||
; check if we use stack space for > 4 arguments
|
||||
function %five_args(i64, i64, i64, i64, i64) windows_fastcall {
|
||||
block0(v0: i64, v1: i64, v2: i64, v3: i64, v4: i64):
|
||||
return
|
||||
}
|
||||
; check: function %five_args(i64 [%rcx], i64 [%rdx], i64 [%r8], i64 [%r9], i64 [32], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
|
||||
; check: function %five_args(i64 [%rcx], i64 [%rdx], i64 [%r8], i64 [%r9], i64 [32], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
|
||||
; nextln: ss0 = incoming_arg 8, offset 32
|
||||
; nextln: ss1 = incoming_arg 16, offset -16
|
||||
; check: block0(v0: i64 [%rcx], v1: i64 [%rdx], v2: i64 [%r8], v3: i64 [%r9], v4: i64 [ss0], v5: i64 [%rbp]):
|
||||
; nextln: x86_push v5
|
||||
; nextln: copy_special %rsp -> %rbp
|
||||
; nextln: v6 = x86_pop.i64
|
||||
; nextln: return v6
|
||||
; nextln: }
|
||||
|
||||
; check that we preserve xmm6 and above if we're using them locally
|
||||
function %float_callee_saves(f64, f64, f64, f64) windows_fastcall {
|
||||
@@ -40,38 +68,51 @@ block0(v0: f64, v1: f64, v2: f64, v3: f64):
|
||||
[-, %xmm7] v5 = fadd v0, v1
|
||||
return
|
||||
}
|
||||
; check: function %float_callee_sav(f64 [%xmm0], f64 [%xmm1], f64 [%xmm2], f64 [%xmm3], i64 fp [%rbp], f64x2 csr [%xmm6], f64x2 csr [%xmm7]) -> i64 fp [%rbp], f64x2 csr [%xmm6], f64x2 csr [%xmm7] windows_fastcall {
|
||||
; nextln: ss0 = explicit_slot 32, offset -80
|
||||
; nextln: ss1 = incoming_arg 16, offset -48
|
||||
; check: block0(v0: f64 [%xmm0], v1: f64 [%xmm1], v2: f64 [%xmm2], v3: f64 [%xmm3], v6: i64 [%rbp], v8: f64x2 [%xmm6], v9: f64x2 [%xmm7]):
|
||||
; nextln: x86_push v6
|
||||
; nextln: copy_special %rsp -> %rbp
|
||||
; nextln: adjust_sp_down_imm 64
|
||||
; nextln: v7 = stack_addr.i64 ss0
|
||||
; nextln: store notrap aligned v8, v7
|
||||
; nextln: store notrap aligned v9, v7+16
|
||||
; check: v10 = stack_addr.i64 ss0
|
||||
; nextln: v11 = load.f64x2 notrap aligned v10
|
||||
; nextln: v12 = load.f64x2 notrap aligned v10+16
|
||||
; nextln: adjust_sp_up_imm 64
|
||||
; nextln: v13 = x86_pop.i64
|
||||
; nextln: v13, v11, v12
|
||||
; check: function %float_callee_sav(f64 [%xmm0], f64 [%xmm1], f64 [%xmm2], f64 [%xmm3], i64 csr [%rsp], i64 fp [%rbp], f64x2 csr [%xmm6], f64x2 csr [%xmm7]) -> i64 fp [%rbp], f64x2 csr [%xmm6], f64x2 csr [%xmm7] windows_fastcall {
|
||||
; nextln: ss0 = incoming_arg 48, offset -48
|
||||
; check: block0(v0: f64 [%xmm0], v1: f64 [%xmm1], v2: f64 [%xmm2], v3: f64 [%xmm3], v6: i64 [%rsp], v7: i64 [%rbp], v8: f64x2 [%xmm6], v9: f64x2 [%xmm7]):
|
||||
; nextln: x86_push v7
|
||||
; nextln: copy_special %rsp -> %rbp
|
||||
; nextln: adjust_sp_down_imm 32
|
||||
; nextln: store notrap aligned v8, v6+16
|
||||
; nextln: store notrap aligned v9, v6
|
||||
; nextln: v11 = load.f64x2 notrap aligned v6+16
|
||||
; nextln: v12 = load.f64x2 notrap aligned v6
|
||||
; nextln: adjust_sp_up_imm 32
|
||||
; nextln: v10 = x86_pop.i64
|
||||
; nextln: return v10, v11, v12
|
||||
; nextln: }
|
||||
|
||||
function %mixed_int_float(i64, f64, i64, f32) windows_fastcall {
|
||||
block0(v0: i64, v1: f64, v2: i64, v3: f32):
|
||||
return
|
||||
}
|
||||
; check: function %mixed_int_float(i64 [%rcx], f64 [%xmm1], i64 [%r8], f32 [%xmm3], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
|
||||
; check: function %mixed_int_float(i64 [%rcx], f64 [%xmm1], i64 [%r8], f32 [%xmm3], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
|
||||
; nextln: ss0 = incoming_arg 16, offset -16
|
||||
; check: block0(v0: i64 [%rcx], v1: f64 [%xmm1], v2: i64 [%r8], v3: f32 [%xmm3], v4: i64 [%rbp]):
|
||||
; nextln: x86_push v4
|
||||
; nextln: copy_special %rsp -> %rbp
|
||||
; nextln: v5 = x86_pop.i64
|
||||
; nextln: return v5
|
||||
; nextln: }
|
||||
|
||||
function %ret_val_float(f32, f64, i64, i64) -> f64 windows_fastcall {
|
||||
block0(v0: f32, v1: f64, v2: i64, v3: i64):
|
||||
return v1
|
||||
}
|
||||
; check: function %ret_val_float(f32 [%xmm0], f64 [%xmm1], i64 [%r8], i64 [%r9], i64 fp [%rbp]) -> f64 [%xmm0], i64 fp [%rbp] windows_fastcall {
|
||||
; check: function %ret_val_float(f32 [%xmm0], f64 [%xmm1], i64 [%r8], i64 [%r9], i64 fp [%rbp]) -> f64 [%xmm0], i64 fp [%rbp] windows_fastcall {
|
||||
; nextln: ss0 = incoming_arg 16, offset -16
|
||||
; check: block0(v0: f32 [%xmm0], v1: f64 [%xmm1], v2: i64 [%r8], v3: i64 [%r9], v4: i64 [%rbp]):
|
||||
; nextln: x86_push v4
|
||||
; nextln: copy_special %rsp -> %rbp
|
||||
; nextln: regmove v1, %xmm1 -> %xmm0
|
||||
; nextln: v5 = x86_pop.i64
|
||||
; nextln: return v1, v5
|
||||
; nextln: }
|
||||
|
||||
function %internal_stack_arg_function_call(i64) -> i64 windows_fastcall {
|
||||
fn0 = %foo(i64, i64, i64, i64) -> i64
|
||||
fn1 = %foo2(i64, i64, i64, i64) -> i64
|
||||
fn0 = %foo(i64, i64, i64, i64) -> i64 windows_fastcall
|
||||
fn1 = %foo2(i64, i64, i64, i64) -> i64 windows_fastcall
|
||||
block0(v0: i64):
|
||||
v1 = load.i64 v0+0
|
||||
v2 = load.i64 v0+8
|
||||
@@ -94,3 +135,100 @@ block0(v0: i64):
|
||||
store.i64 v9, v0+72
|
||||
return v10
|
||||
}
|
||||
; check: function %internal_stack_a(i64 [%rcx], i64 fp [%rbp], i64 csr [%r12], i64 csr [%r13], i64 csr [%r14], i64 csr [%r15]) -> i64 [%rax], i64 fp [%rbp], i64 csr [%r12], i64 csr [%r13], i64 csr [%r14], i64 csr [%r15] windows_fastcall {
|
||||
; nextln: ss0 = spill_slot 8, offset -56
|
||||
; nextln: ss1 = spill_slot 8, offset -64
|
||||
; nextln: ss2 = spill_slot 8, offset -72
|
||||
; nextln: ss3 = spill_slot 8, offset -80
|
||||
; nextln: ss4 = spill_slot 8, offset -88
|
||||
; nextln: ss5 = spill_slot 8, offset -96
|
||||
; nextln: ss6 = spill_slot 8, offset -104
|
||||
; nextln: ss7 = spill_slot 8, offset -112
|
||||
; nextln: ss8 = spill_slot 8, offset -120
|
||||
; nextln: ss9 = spill_slot 8, offset -128
|
||||
; nextln: ss10 = incoming_arg 48, offset -48
|
||||
; nextln: ss11 = explicit_slot 32, offset -160
|
||||
; nextln: sig0 = (i64 [%rcx], i64 [%rdx], i64 [%r8], i64 [%r9]) -> i64 [%rax] windows_fastcall
|
||||
; nextln: sig1 = (i64 [%rcx], i64 [%rdx], i64 [%r8], i64 [%r9]) -> i64 [%rax] windows_fastcall
|
||||
; nextln: fn0 = %foo sig0
|
||||
; nextln: fn1 = %foo2 sig1
|
||||
; check: block0(v11: i64 [%rcx], v52: i64 [%rbp], v53: i64 [%r12], v54: i64 [%r13], v55: i64 [%r14], v56: i64 [%r15]):
|
||||
; nextln: x86_push v52
|
||||
; nextln: copy_special %rsp -> %rbp
|
||||
; nextln: x86_push v53
|
||||
; nextln: x86_push v54
|
||||
; nextln: x86_push v55
|
||||
; nextln: x86_push v56
|
||||
; nextln: adjust_sp_down_imm 112
|
||||
; nextln: v0 = spill v11
|
||||
; nextln: v12 = copy_to_ssa.i64 %rcx
|
||||
; nextln: v13 = load.i64 v12
|
||||
; nextln: v1 = spill v13
|
||||
; nextln: v14 = fill_nop v0
|
||||
; nextln: v15 = load.i64 v14+8
|
||||
; nextln: v2 = spill v15
|
||||
; nextln: v16 = fill_nop v0
|
||||
; nextln: v17 = load.i64 v16+16
|
||||
; nextln: v3 = spill v17
|
||||
; nextln: v18 = fill_nop v0
|
||||
; nextln: v19 = load.i64 v18+24
|
||||
; nextln: v4 = spill v19
|
||||
; nextln: v20 = fill_nop v0
|
||||
; nextln: v21 = load.i64 v20+32
|
||||
; nextln: v5 = spill v21
|
||||
; nextln: v22 = fill_nop v0
|
||||
; nextln: v23 = load.i64 v22+40
|
||||
; nextln: v6 = spill v23
|
||||
; nextln: v24 = fill_nop v0
|
||||
; nextln: v25 = load.i64 v24+48
|
||||
; nextln: v7 = spill v25
|
||||
; nextln: v26 = fill_nop v0
|
||||
; nextln: v27 = load.i64 v26+56
|
||||
; nextln: v8 = spill v27
|
||||
; nextln: v28 = fill_nop v0
|
||||
; nextln: v29 = load.i64 v28+64
|
||||
; nextln: v9 = spill v29
|
||||
; nextln: v30 = fill v1
|
||||
; nextln: v31 = fill v2
|
||||
; nextln: v32 = fill v3
|
||||
; nextln: v33 = fill v4
|
||||
; nextln: regmove v30, %r15 -> %rcx
|
||||
; nextln: regmove v31, %r14 -> %rdx
|
||||
; nextln: regmove v32, %r13 -> %r8
|
||||
; nextln: regmove v33, %r12 -> %r9
|
||||
; nextln: v10 = call fn0(v30, v31, v32, v33)
|
||||
; nextln: v34 = fill v1
|
||||
; nextln: v35 = fill v0
|
||||
; nextln: store v34, v35+8
|
||||
; nextln: v36 = fill v2
|
||||
; nextln: v37 = fill_nop v0
|
||||
; nextln: store v36, v37+16
|
||||
; nextln: v38 = fill v3
|
||||
; nextln: v39 = fill_nop v0
|
||||
; nextln: store v38, v39+24
|
||||
; nextln: v40 = fill v4
|
||||
; nextln: v41 = fill_nop v0
|
||||
; nextln: store v40, v41+32
|
||||
; nextln: v42 = fill v5
|
||||
; nextln: v43 = fill_nop v0
|
||||
; nextln: store v42, v43+40
|
||||
; nextln: v44 = fill v6
|
||||
; nextln: v45 = fill_nop v0
|
||||
; nextln: store v44, v45+48
|
||||
; nextln: v46 = fill v7
|
||||
; nextln: v47 = fill_nop v0
|
||||
; nextln: store v46, v47+56
|
||||
; nextln: v48 = fill v8
|
||||
; nextln: v49 = fill_nop v0
|
||||
; nextln: store v48, v49+64
|
||||
; nextln: v50 = fill v9
|
||||
; nextln: v51 = fill_nop v0
|
||||
; nextln: store v50, v51+72
|
||||
; nextln: adjust_sp_up_imm 112
|
||||
; nextln: v61 = x86_pop.i64
|
||||
; nextln: v60 = x86_pop.i64
|
||||
; nextln: v59 = x86_pop.i64
|
||||
; nextln: v58 = x86_pop.i64
|
||||
; nextln: v57 = x86_pop.i64
|
||||
; nextln: return v10, v57, v58, v59, v60, v61
|
||||
; nextln: }
|
||||
@@ -3,13 +3,35 @@ set opt_level=speed_and_size
|
||||
set is_pic
|
||||
target x86_64 haswell
|
||||
|
||||
; check the unwind information with a function with no args
|
||||
function %no_args() windows_fastcall {
|
||||
; check the unwind information with a leaf function with no args
|
||||
function %no_args_leaf() windows_fastcall {
|
||||
block0:
|
||||
return
|
||||
}
|
||||
; sameln: version: 1
|
||||
; nextln: flags: 0
|
||||
; nextln: prologue size: 4
|
||||
; nextln: frame register: 5
|
||||
; nextln: frame register offset: 0
|
||||
; nextln: unwind codes: 2
|
||||
; nextln:
|
||||
; nextln: offset: 1
|
||||
; nextln: op: PushNonvolatileRegister
|
||||
; nextln: info: 5
|
||||
; nextln:
|
||||
; nextln: offset: 4
|
||||
; nextln: op: SetFramePointer
|
||||
; nextln: info: 0
|
||||
|
||||
; check the unwind information with a non-leaf function with no args
|
||||
function %no_args() windows_fastcall {
|
||||
fn0 = %foo()
|
||||
block0:
|
||||
call fn0()
|
||||
return
|
||||
}
|
||||
; sameln: version: 1
|
||||
; nextln: flags: 0
|
||||
; nextln: prologue size: 8
|
||||
; nextln: frame register: 5
|
||||
; nextln: frame register offset: 0
|
||||
@@ -51,7 +73,7 @@ block0:
|
||||
; nextln: offset: 17
|
||||
; nextln: op: LargeStackAlloc
|
||||
; nextln: info: 0
|
||||
; nextln: value: 12504 (u16)
|
||||
; nextln: value: 12500 (u16)
|
||||
|
||||
; check a function with large-sized stack alloc
|
||||
function %large_stack() windows_fastcall {
|
||||
@@ -77,7 +99,7 @@ block0:
|
||||
; nextln: offset: 17
|
||||
; nextln: op: LargeStackAlloc
|
||||
; nextln: info: 1
|
||||
; nextln: value: 524320 (u32)
|
||||
; nextln: value: 524288 (u32)
|
||||
|
||||
function %fpr_with_function_call(i64, i64) windows_fastcall {
|
||||
fn0 = %foo(f64, f64, i64, i64, i64) windows_fastcall;
|
||||
@@ -113,9 +135,9 @@ block0(v0: i64, v1: i64):
|
||||
;
|
||||
; sameln: version: 1
|
||||
; nextln: flags: 0
|
||||
; nextln: prologue size: 25
|
||||
; nextln: prologue size: 22
|
||||
; nextln: frame register: 5
|
||||
; nextln: frame register offset: 12
|
||||
; nextln: frame register offset: 2
|
||||
; nextln: unwind codes: 5
|
||||
; nextln:
|
||||
; nextln: offset: 1
|
||||
@@ -135,10 +157,10 @@ block0(v0: i64, v1: i64):
|
||||
; nextln: info: 0
|
||||
; nextln: value: 23 (u16)
|
||||
; nextln:
|
||||
; nextln: offset: 25
|
||||
; nextln: offset: 22
|
||||
; nextln: op: SaveXmm128
|
||||
; nextln: info: 15
|
||||
; nextln: value: 3 (u16)
|
||||
; nextln: value: 0 (u16)
|
||||
|
||||
; check a function that has CSRs
|
||||
function %lots_of_registers(i64, i64) windows_fastcall {
|
||||
@@ -191,9 +213,9 @@ block0(v0: i64, v1: i64):
|
||||
}
|
||||
; sameln: version: 1
|
||||
; nextln: flags: 0
|
||||
; nextln: prologue size: 41
|
||||
; nextln: prologue size: 35
|
||||
; nextln: frame register: 5
|
||||
; nextln: frame register offset: 10
|
||||
; nextln: frame register offset: 7
|
||||
; nextln: unwind codes: 13
|
||||
; nextln:
|
||||
; nextln: offset: 1
|
||||
@@ -234,19 +256,19 @@ block0(v0: i64, v1: i64):
|
||||
; nextln:
|
||||
; nextln: offset: 19
|
||||
; nextln: op: SmallStackAlloc
|
||||
; nextln: info: 12
|
||||
; nextln: info: 8
|
||||
; nextln:
|
||||
; nextln: offset: 31
|
||||
; nextln: offset: 24
|
||||
; nextln: op: SaveXmm128
|
||||
; nextln: info: 6
|
||||
; nextln: value: 0 (u16)
|
||||
; nextln: value: 2 (u16)
|
||||
; nextln:
|
||||
; nextln: offset: 36
|
||||
; nextln: offset: 29
|
||||
; nextln: op: SaveXmm128
|
||||
; nextln: info: 7
|
||||
; nextln: value: 1 (u16)
|
||||
; nextln:
|
||||
; nextln: offset: 41
|
||||
; nextln: offset: 35
|
||||
; nextln: op: SaveXmm128
|
||||
; nextln: info: 8
|
||||
; nextln: value: 2 (u16)
|
||||
; nextln: value: 0 (u16)
|
||||
|
||||
Reference in New Issue
Block a user