Our previous unwind infrastructure was somewhat complex and brittle: it parsed the generated instructions in order to reverse-engineer unwind info from prologues, and it relied on fragile linkage to communicate instruction-layout information that VCode was not designed to provide. A much simpler, more reliable, and easier-to-reason-about approach is to embed unwind directives as pseudo-instructions in the prologue as we generate it: we say what we mean and emit it directly.

The usual argument for the reverse-engineering approach is that metadata is hard to keep in sync across optimization passes. Here, though, (i) prologues are generated at the very end of the pipeline, and (ii) if we ever add a post-prologue-generation optimization, we can treat unwind directives as black boxes with unknown side effects, just as we do for some other pseudo-instructions today.

It turned out to be easier to build this for both x64 and aarch64 at once (since they share a factored-out ABI implementation) and to wire up the platform-specific unwind-info generation for both Windows and SystemV. We now have the simpler unwind scheme on all platforms, and we can delete the old unwind infrastructure as soon as the old backend is removed.

Supporting Fastcall unwind in particular had a few consequences that led to a refactor of the common ABI. Windows only supports naming clobbered-register save locations within 240 bytes of the frame-pointer register, whichever register one chooses for that role (RSP or RBP). We had previously saved clobbers below the fixed frame (and below nominal SP). The 240-byte range also has to include the old RBP, so we are forced to place clobbers at the top of the frame, just below the saved RBP/RIP. This is fine; we always keep a frame pointer anyway because we use it to refer to stack args. It does mean that offsets of fixed-frame slots (spillslots, stackslots) from RBP are no longer known before register allocation, so if we ever want to index these off of RBP rather than nominal SP (say, because we add support for `alloca` and dynamic frame growth), we will need a "nominal-BP" mode that is resolved after regalloc and clobber-save code generation. I added a comment to this effect in `abi_impl.rs`.

Because of the shared code, the above refactor touched both x64 and aarch64. That had a further effect: the old aarch64 prologue generation subtracted from `sp` once to allocate space, then used stores to `[sp, offset]` to save clobbers. Unfortunately that offset has only 7-bit (scaled) range, so with enough clobbered registers the stores/loads go out of range (and there can be enough: aarch64 has 384 bytes of registers; at least one unit test hits this). Rather than synthesize large-offset sequences here, it is better to go back to the simpler pre-index/post-index `stp r1, r2, [sp, #-16]!` form, which works just like a push. It is likely not much worse microarchitecturally (it adds a dependence chain on SP, but oh well), and it actually saves an instruction when there is no other frame to allocate. It is also much simpler to understand; simpler is usually better.

This PR also adds the new backend on Windows to CI.
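To make the "say what we mean" idea concrete, here is a minimal Rust sketch of the shape of the approach. The type and field names are illustrative only, not the actual Cranelift API: the prologue generator pushes an unwind directive next to each real instruction it emits, and a later pass pairs each directive with the code offset it lands at and hands those pairs to the SystemV- or Windows-specific unwind-info writer.

```rust
// Illustrative sketch only; these are not the real Cranelift types.

/// Unwind directives, emitted inline with the prologue steps they describe.
#[derive(Debug, Clone, Copy)]
enum UnwindDirective {
    /// FP/LR (or RBP/RIP) were just pushed; the caller's SP is this many
    /// bytes above the new SP.
    PushFrameRegs { offset_upward_to_caller_sp: u32 },
    /// The frame pointer was just established; the clobber-save area sits
    /// this many bytes below it.
    DefineNewFrame { offset_downward_to_clobbers: u32 },
    /// A callee-saved register was just stored at this offset within the
    /// clobber-save area.
    SaveReg { reg: u8, clobber_offset: u32 },
}

/// Either a real machine instruction (stand-in: its textual form) or an
/// unwind pseudo-instruction, which is metadata-only and emits no bytes.
enum Inst {
    Real(&'static str),
    Unwind(UnwindDirective),
}

/// Prologue generation "says what it means": each step is immediately
/// followed by the directive recording its effect, so the platform-specific
/// unwind-info pass (SystemV CFI or Windows unwind codes) only walks
/// (code offset, directive) pairs and never parses machine code.
fn gen_prologue(clobber_save_bytes: u32) -> Vec<Inst> {
    vec![
        Inst::Real("stp fp, lr, [sp, #-16]!"),
        Inst::Unwind(UnwindDirective::PushFrameRegs {
            offset_upward_to_caller_sp: 16,
        }),
        Inst::Real("mov fp, sp"),
        Inst::Unwind(UnwindDirective::DefineNewFrame {
            offset_downward_to_clobbers: clobber_save_bytes,
        }),
        // ...then one `stp reg1, reg2, [sp, #-16]!` per pair of clobbered
        // registers, each followed by a `SaveReg` directive per register.
    ]
}
```

The important property is that nothing ever has to parse machine code after the fact: the directives carry exactly the facts the unwind writer needs, at the points where they become true.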
test compile
set unwind_info=false
target aarch64

function %stack_addr_small() -> i64 {
    ss0 = explicit_slot 8

block0:
    v0 = stack_addr.i64 ss0
    return v0
}

; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: sub sp, sp, #16
; nextln: mov x0, sp
; nextln: add sp, sp, #16
; nextln: ldp fp, lr, [sp], #16
; nextln: ret

function %stack_addr_big() -> i64 {
    ss0 = explicit_slot 100000
    ss1 = explicit_slot 8

block0:
    v0 = stack_addr.i64 ss0
    return v0
}

; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: movz w16, #34480
; nextln: movk w16, #1, LSL #16
; nextln: sub sp, sp, x16, UXTX
; nextln: mov x0, sp
; nextln: movz w16, #34480
; nextln: movk w16, #1, LSL #16
; nextln: add sp, sp, x16, UXTX
; nextln: ldp fp, lr, [sp], #16
; nextln: ret

; FIXME: don't use stack_addr legalization for stack_load and stack_store

function %stack_load_small() -> i64 {
    ss0 = explicit_slot 8

block0:
    v0 = stack_load.i64 ss0
    return v0
}

; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: sub sp, sp, #16
; nextln: mov x0, sp
; nextln: ldr x0, [x0]
; nextln: add sp, sp, #16
; nextln: ldp fp, lr, [sp], #16
; nextln: ret

function %stack_load_big() -> i64 {
    ss0 = explicit_slot 100000
    ss1 = explicit_slot 8

block0:
    v0 = stack_load.i64 ss0
    return v0
}

; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: movz w16, #34480
; nextln: movk w16, #1, LSL #16
; nextln: sub sp, sp, x16, UXTX
; nextln: mov x0, sp
; nextln: ldr x0, [x0]
; nextln: movz w16, #34480
; nextln: movk w16, #1, LSL #16
; nextln: add sp, sp, x16, UXTX
; nextln: ldp fp, lr, [sp], #16
; nextln: ret

function %stack_store_small(i64) {
    ss0 = explicit_slot 8

block0(v0: i64):
    stack_store.i64 v0, ss0
    return
}

; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: sub sp, sp, #16
; nextln: mov x1, sp
; nextln: str x0, [x1]
; nextln: add sp, sp, #16
; nextln: ldp fp, lr, [sp], #16
; nextln: ret

function %stack_store_big(i64) {
    ss0 = explicit_slot 100000
    ss1 = explicit_slot 8

block0(v0: i64):
    stack_store.i64 v0, ss0
    return
}

; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: movz w16, #34480
; nextln: movk w16, #1, LSL #16
; nextln: sub sp, sp, x16, UXTX
; nextln: mov x1, sp
; nextln: str x0, [x1]
; nextln: movz w16, #34480
; nextln: movk w16, #1, LSL #16
; nextln: add sp, sp, x16, UXTX
; nextln: ldp fp, lr, [sp], #16
; nextln: ret

; Force a b1 to be spilled into a slot at an SP offset between 0x100 and
; 0x1fff, to exercise the scaled addressing mode.
function %b1_spill_slot(b1) -> b1, i64 {
    ss0 = explicit_slot 1000

block0(v0: b1):
    v1 = iconst.i64 1
    v2 = iconst.i64 2
    v3 = iconst.i64 3
    v4 = iconst.i64 4
    v5 = iconst.i64 5
    v6 = iconst.i64 6
    v7 = iconst.i64 7
    v8 = iconst.i64 8
    v9 = iconst.i64 9
    v10 = iconst.i64 10
    v11 = iconst.i64 11
    v12 = iconst.i64 12
    v13 = iconst.i64 13
    v14 = iconst.i64 14
    v15 = iconst.i64 15
    v16 = iconst.i64 16
    v17 = iconst.i64 17
    v18 = iconst.i64 18
    v19 = iconst.i64 19
    v20 = iconst.i64 20
    v21 = iconst.i64 21
    v22 = iconst.i64 22
    v23 = iconst.i64 23
    v24 = iconst.i64 24
    v25 = iconst.i64 25
    v26 = iconst.i64 26
    v27 = iconst.i64 27
    v28 = iconst.i64 28
    v29 = iconst.i64 29
    v30 = iconst.i64 30
    v31 = iconst.i64 31
    v32 = iconst.i64 32
    v33 = iconst.i64 33
    v34 = iconst.i64 34
    v35 = iconst.i64 35
    v36 = iconst.i64 36
    v37 = iconst.i64 37
    v38 = iconst.i64 38
    v39 = iconst.i64 39
    v40 = iconst.i64 30
    v41 = iconst.i64 31
    v42 = iconst.i64 32
    v43 = iconst.i64 33
    v44 = iconst.i64 34
    v45 = iconst.i64 35
    v46 = iconst.i64 36
    v47 = iconst.i64 37
    v48 = iconst.i64 38
    v49 = iconst.i64 39
    v50 = iconst.i64 30
    v51 = iconst.i64 31
    v52 = iconst.i64 32
    v53 = iconst.i64 33
    v54 = iconst.i64 34
    v55 = iconst.i64 35
    v56 = iconst.i64 36
    v57 = iconst.i64 37
    v58 = iconst.i64 38
    v59 = iconst.i64 39
    v60 = iconst.i64 30
    v61 = iconst.i64 31
    v62 = iconst.i64 32
    v63 = iconst.i64 33
    v64 = iconst.i64 34
    v65 = iconst.i64 35
    v66 = iconst.i64 36
    v67 = iconst.i64 37
    v68 = iconst.i64 38
    v69 = iconst.i64 39

    v70 = iadd.i64 v1, v2
    v71 = iadd.i64 v3, v4
    v72 = iadd.i64 v5, v6
    v73 = iadd.i64 v7, v8
    v74 = iadd.i64 v9, v10
    v75 = iadd.i64 v11, v12
    v76 = iadd.i64 v13, v14
    v77 = iadd.i64 v15, v16
    v78 = iadd.i64 v17, v18
    v79 = iadd.i64 v19, v20
    v80 = iadd.i64 v21, v22
    v81 = iadd.i64 v23, v24
    v82 = iadd.i64 v25, v26
    v83 = iadd.i64 v27, v28
    v84 = iadd.i64 v29, v30
    v85 = iadd.i64 v31, v32
    v86 = iadd.i64 v33, v34
    v87 = iadd.i64 v35, v36
    v88 = iadd.i64 v37, v38
    v89 = iadd.i64 v39, v40
    v90 = iadd.i64 v41, v42
    v91 = iadd.i64 v43, v44
    v92 = iadd.i64 v45, v46
    v93 = iadd.i64 v47, v48
    v94 = iadd.i64 v49, v50
    v95 = iadd.i64 v51, v52
    v96 = iadd.i64 v53, v54
    v97 = iadd.i64 v55, v56
    v98 = iadd.i64 v57, v58
    v99 = iadd.i64 v59, v60
    v100 = iadd.i64 v61, v62
    v101 = iadd.i64 v63, v64
    v102 = iadd.i64 v65, v66
    v103 = iadd.i64 v67, v68

    v104 = iadd.i64 v69, v70
    v105 = iadd.i64 v71, v72
    v106 = iadd.i64 v73, v74
    v107 = iadd.i64 v75, v76
    v108 = iadd.i64 v77, v78
    v109 = iadd.i64 v79, v80
    v110 = iadd.i64 v81, v82
    v111 = iadd.i64 v83, v84
    v112 = iadd.i64 v85, v86
    v113 = iadd.i64 v87, v88
    v114 = iadd.i64 v89, v90
    v115 = iadd.i64 v91, v92
    v116 = iadd.i64 v93, v94
    v117 = iadd.i64 v95, v96
    v118 = iadd.i64 v97, v98
    v119 = iadd.i64 v99, v100
    v120 = iadd.i64 v101, v102

    v121 = iadd.i64 v103, v104
    v122 = iadd.i64 v105, v106
    v123 = iadd.i64 v107, v108
    v124 = iadd.i64 v109, v110
    v125 = iadd.i64 v111, v112
    v126 = iadd.i64 v113, v114
    v127 = iadd.i64 v115, v116
    v128 = iadd.i64 v117, v118
    v129 = iadd.i64 v119, v120

    v130 = iadd.i64 v121, v122
    v131 = iadd.i64 v123, v124
    v132 = iadd.i64 v125, v126
    v133 = iadd.i64 v127, v128

    v134 = iadd.i64 v129, v130
    v135 = iadd.i64 v131, v132

    v136 = iadd.i64 v133, v134
    v137 = iadd.i64 v135, v136

    return v0, v137
}