x86-64 Windows fastcall ABI support.

This adds support for the "fastcall" ABI, the native C/C++ calling convention
on x86-64 Windows. It is similar to, but not identical to, System V: most
notably, its argument-register assignments differ, and callers must reserve a
32-byte shadow (home) space on the stack for the callee.
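
As a rough illustration of the convention (a hypothetical sketch, not part of
the diff): the first four parameter slots are assigned to rcx, rdx, r8, and r9
for integer/pointer arguments and xmm0-xmm3 for floating-point arguments (by
slot position, not per class), and any remaining arguments are passed on the
stack above the 32-byte shadow area:

function %shadow_sketch(i64, i64, f64, i64, i64) -> i64 windows_fastcall {
block0(v0: i64, v1: i64, v2: f64, v3: i64, v4: i64):
return v4
}
;; Hypothetical case in the style of the tests below: v0 -> rcx, v1 -> rdx,
;; v2 -> xmm2 (third slot), v3 -> r9; v4 is the first stack argument and,
;; after the standard prologue, lives at [rbp+48] (saved rbp at [rbp],
;; return address at [rbp+8], 32-byte shadow space at [rbp+16..47]).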

Note that this also adjusts the handling of multi-register values in the
shared ABI implementation; as a consequence, `i128` handling changes on *both*
Fastcall/x64 *and* SysV/x64 platforms. This was done to match the actual
behavior of the "rustc ABI" on both platforms, as mapped out experimentally
(Compiler Explorer link in the comments). This behavior is gated behind the
`enable_llvm_abi_extensions` flag.
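
Concretely (a hypothetical sketch mirroring the new `%f7` test below, with the
flag enabled): an `i128` occupies two consecutive 8-byte parameter slots, so it
may be split between a register and a stack slot, and an `i128` return value is
placed in rax (low half) and rdx (high half):

function %i128_sketch(i128, i64, i128, i128) -> i128 windows_fastcall {
block0(v0: i128, v1: i64, v2: i128, v3: i128):
return v3
}
;; Hypothetical case: v0 -> rcx:rdx, v1 -> r8, v2 -> r9 + [rbp+48] (split
;; between register and stack), v3 -> [rbp+56] + [rbp+64]; the result is
;; returned in rax:rdx.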

Note also that this does *not* add x64 unwind info on Windows; that is planned
for a future PR.
Chris Fallin
2021-02-22 20:28:49 -08:00
parent 98d3e6823f
commit 6c94eb82aa
13 changed files with 997 additions and 475 deletions

@@ -0,0 +1,299 @@
test compile
set enable_llvm_abi_extensions=true
target x86_64
feature "experimental_x64"
function %f0(i64, i64, i64, i64) -> i64 windows_fastcall {
block0(v0: i64, v1: i64, v2: i64, v3: i64):
return v0
}
; check: pushq %rbp
; nextln: movq %rsp, %rbp
; nextln: movq %rcx, %rax
; nextln: movq %rbp, %rsp
; nextln: popq %rbp
; nextln: ret
function %f1(i64, i64, i64, i64) -> i64 windows_fastcall {
block0(v0: i64, v1: i64, v2: i64, v3: i64):
return v1
}
; check: pushq %rbp
; nextln: movq %rsp, %rbp
; nextln: movq %rdx, %rax
; nextln: movq %rbp, %rsp
; nextln: popq %rbp
; nextln: ret
function %f2(i64, i64, i64, i64) -> i64 windows_fastcall {
block0(v0: i64, v1: i64, v2: i64, v3: i64):
return v2
}
; check: pushq %rbp
; nextln: movq %rsp, %rbp
; nextln: movq %r8, %rax
; nextln: movq %rbp, %rsp
; nextln: popq %rbp
; nextln: ret
function %f3(i64, i64, i64, i64) -> i64 windows_fastcall {
block0(v0: i64, v1: i64, v2: i64, v3: i64):
return v3
}
; check: pushq %rbp
; nextln: movq %rsp, %rbp
; nextln: movq %r9, %rax
; nextln: movq %rbp, %rsp
; nextln: popq %rbp
; nextln: ret
function %f4(i64, i64, f64, i64) -> f64 windows_fastcall {
block0(v0: i64, v1: i64, v2: f64, v3: i64):
return v2
}
; check: pushq %rbp
; nextln: movq %rsp, %rbp
; nextln: movaps %xmm2, %xmm0
; nextln: movq %rbp, %rsp
; nextln: popq %rbp
; nextln: ret
function %f5(i64, i64, f64, i64) -> i64 windows_fastcall {
block0(v0: i64, v1: i64, v2: f64, v3: i64):
return v3
}
; check: pushq %rbp
; nextln: movq %rsp, %rbp
; nextln: movq %r9, %rax
; nextln: movq %rbp, %rsp
; nextln: popq %rbp
; nextln: ret
function %f6(i64, i64, i64, i64, i64, i64) -> i64 windows_fastcall {
block0(v0: i64, v1: i64, v2: i64, v3: i64, v4: i64, v5: i64):
return v5
}
;; This is truly odd (because of the regalloc ordering), but it works. Note
;; that we're spilling and using rsi, which is a callee-save in fastcall, because
;; the regalloc order is optimized for SysV. Also note that because we copy args
;; out of their input locations to separate vregs, we have a spurious load
;; from [rbp+48]. Ordinarily these moves are coalesced because the dest vreg
;; is allocated as a caller-save (volatile), but here again we allocate rsi
;; first and so have to spill it (and consequently don't coalesce).
;;
;; TODO(#2704): fix regalloc's register priority ordering!
; check: pushq %rbp
; nextln: movq %rsp, %rbp
; nextln: subq $$16, %rsp
; nextln: movq %rsi, 0(%rsp)
; nextln: virtual_sp_offset_adjust 16
; nextln: movq 48(%rbp), %rsi
; nextln: movq 56(%rbp), %rsi
; nextln: movq %rsi, %rax
; nextln: movq 0(%rsp), %rsi
; nextln: addq $$16, %rsp
; nextln: movq %rbp, %rsp
; nextln: popq %rbp
; nextln: ret
function %f7(i128, i64, i128, i128) -> i128 windows_fastcall {
block0(v0: i128, v1: i64, v2: i128, v3: i128):
return v3
}
;; Again, terrible regalloc behavior. The important part is that `v3` comes
;; from [rbp+56] and [rbp+64], i.e., the second and third non-shadow
;; stack slots.
; check: pushq %rbp
; nextln: movq %rsp, %rbp
; nextln: subq $$16, %rsp
; nextln: movq %rsi, 0(%rsp)
; nextln: movq %rdi, 8(%rsp)
; nextln: virtual_sp_offset_adjust 16
; nextln: movq 48(%rbp), %rsi
; nextln: movq 56(%rbp), %rsi
; nextln: movq 64(%rbp), %rdi
; nextln: movq %rsi, %rax
; nextln: movq %rdi, %rdx
; nextln: movq 0(%rsp), %rsi
; nextln: movq 8(%rsp), %rdi
; nextln: addq $$16, %rsp
; nextln: movq %rbp, %rsp
; nextln: popq %rbp
; nextln: ret
function %f8(i64) -> i64 windows_fastcall {
sig0 = (i64, i64, f64, f64, i64, i64) -> i64 windows_fastcall
fn0 = %g sig0
block0(v0: i64):
v1 = fcvt_from_sint.f64 v0
v2 = call fn0(v0, v0, v1, v1, v0, v0)
return v2
}
; check: pushq %rbp
; nextln: movq %rsp, %rbp
; nextln: subq $$16, %rsp
; nextln: movq %rsi, 0(%rsp)
; nextln: virtual_sp_offset_adjust 16
; nextln: movq %rcx, %rsi
; nextln: cvtsi2sd %rsi, %xmm3
; nextln: subq $$48, %rsp
; nextln: virtual_sp_offset_adjust 48
; nextln: movq %rsi, %rcx
; nextln: movq %rsi, %rdx
; nextln: movaps %xmm3, %xmm2
; nextln: movq %rsi, 32(%rsp)
; nextln: movq %rsi, 40(%rsp)
; nextln: load_ext_name %g+0, %rsi
; nextln: call *%rsi
; nextln: addq $$48, %rsp
; nextln: virtual_sp_offset_adjust -48
; nextln: movq 0(%rsp), %rsi
; nextln: addq $$16, %rsp
; nextln: movq %rbp, %rsp
; nextln: popq %rbp
; nextln: ret
function %f9(i64) -> f64 windows_fastcall {
block0(v0: i64):
v1 = load.f64 v0+0
v2 = load.f64 v0+8
v3 = load.f64 v0+16
v4 = load.f64 v0+24
v5 = load.f64 v0+32
v6 = load.f64 v0+40
v7 = load.f64 v0+48
v8 = load.f64 v0+56
v9 = load.f64 v0+64
v10 = load.f64 v0+72
v11 = load.f64 v0+80
v12 = load.f64 v0+88
v13 = load.f64 v0+96
v14 = load.f64 v0+104
v15 = load.f64 v0+112
v16 = load.f64 v0+120
v17 = load.f64 v0+128
v18 = load.f64 v0+136
v19 = load.f64 v0+144
v20 = load.f64 v0+152
v21 = fadd.f64 v1, v2
v22 = fadd.f64 v3, v4
v23 = fadd.f64 v5, v6
v24 = fadd.f64 v7, v8
v25 = fadd.f64 v9, v10
v26 = fadd.f64 v11, v12
v27 = fadd.f64 v13, v14
v28 = fadd.f64 v15, v16
v29 = fadd.f64 v17, v18
v30 = fadd.f64 v19, v20
v31 = fadd.f64 v21, v22
v32 = fadd.f64 v23, v24
v33 = fadd.f64 v25, v26
v34 = fadd.f64 v27, v28
v35 = fadd.f64 v29, v30
v36 = fadd.f64 v31, v32
v37 = fadd.f64 v33, v34
v38 = fadd.f64 v36, v37
v39 = fadd.f64 v38, v35
return v39
}
; check: pushq %rbp
; nextln: movq %rsp, %rbp
; nextln: subq $$208, %rsp
; nextln: movdqu %xmm6, 0(%rsp)
; nextln: movdqu %xmm7, 16(%rsp)
; nextln: movdqu %xmm8, 32(%rsp)
; nextln: movdqu %xmm9, 48(%rsp)
; nextln: movdqu %xmm10, 64(%rsp)
; nextln: movdqu %xmm11, 80(%rsp)
; nextln: movdqu %xmm12, 96(%rsp)
; nextln: movdqu %xmm13, 112(%rsp)
; nextln: movdqu %xmm14, 128(%rsp)
; nextln: movdqu %xmm15, 144(%rsp)
; nextln: virtual_sp_offset_adjust 160
; nextln: movsd 0(%rcx), %xmm0
; nextln: movsd %xmm0, rsp(16 + virtual offset)
; nextln: movsd 8(%rcx), %xmm1
; nextln: movsd 16(%rcx), %xmm0
; nextln: movsd %xmm0, rsp(24 + virtual offset)
; nextln: movsd 24(%rcx), %xmm3
; nextln: movsd 32(%rcx), %xmm0
; nextln: movsd %xmm0, rsp(32 + virtual offset)
; nextln: movsd 40(%rcx), %xmm5
; nextln: movsd 48(%rcx), %xmm6
; nextln: movsd 56(%rcx), %xmm7
; nextln: movsd 64(%rcx), %xmm8
; nextln: movsd 72(%rcx), %xmm9
; nextln: movsd 80(%rcx), %xmm10
; nextln: movsd 88(%rcx), %xmm11
; nextln: movsd 96(%rcx), %xmm12
; nextln: movsd 104(%rcx), %xmm13
; nextln: movsd 112(%rcx), %xmm14
; nextln: movsd 120(%rcx), %xmm15
; nextln: movsd 128(%rcx), %xmm0
; nextln: movsd %xmm0, rsp(0 + virtual offset)
; nextln: movsd 136(%rcx), %xmm0
; nextln: movsd 144(%rcx), %xmm2
; nextln: movsd %xmm2, rsp(8 + virtual offset)
; nextln: movsd 152(%rcx), %xmm2
; nextln: nop len=0
; nextln: movsd rsp(16 + virtual offset), %xmm4
; nextln: addsd %xmm1, %xmm4
; nextln: movsd %xmm4, rsp(16 + virtual offset)
; nextln: movsd rsp(24 + virtual offset), %xmm1
; nextln: addsd %xmm3, %xmm1
; nextln: movsd rsp(32 + virtual offset), %xmm4
; nextln: addsd %xmm5, %xmm4
; nextln: addsd %xmm7, %xmm6
; nextln: addsd %xmm9, %xmm8
; nextln: addsd %xmm11, %xmm10
; nextln: addsd %xmm13, %xmm12
; nextln: addsd %xmm15, %xmm14
; nextln: movsd rsp(0 + virtual offset), %xmm3
; nextln: addsd %xmm0, %xmm3
; nextln: movsd rsp(8 + virtual offset), %xmm0
; nextln: addsd %xmm2, %xmm0
; nextln: movsd rsp(16 + virtual offset), %xmm2
; nextln: addsd %xmm1, %xmm2
; nextln: addsd %xmm6, %xmm4
; nextln: addsd %xmm10, %xmm8
; nextln: addsd %xmm14, %xmm12
; nextln: addsd %xmm0, %xmm3
; nextln: addsd %xmm4, %xmm2
; nextln: addsd %xmm12, %xmm8
; nextln: addsd %xmm8, %xmm2
; nextln: addsd %xmm3, %xmm2
; nextln: movaps %xmm2, %xmm0
; nextln: movdqu 0(%rsp), %xmm6
; nextln: movdqu 16(%rsp), %xmm7
; nextln: movdqu 32(%rsp), %xmm8
; nextln: movdqu 48(%rsp), %xmm9
; nextln: movdqu 64(%rsp), %xmm10
; nextln: movdqu 80(%rsp), %xmm11
; nextln: movdqu 96(%rsp), %xmm12
; nextln: movdqu 112(%rsp), %xmm13
; nextln: movdqu 128(%rsp), %xmm14
; nextln: movdqu 144(%rsp), %xmm15
; nextln: addq $$160, %rsp
; nextln: movq %rbp, %rsp
; nextln: popq %rbp
; nextln: ret

@@ -1,4 +1,5 @@
test compile
set enable_llvm_abi_extensions=true
target x86_64
feature "experimental_x64"
@@ -941,17 +942,17 @@ block0(v0: i128, v1: i128, v2: i64, v3: i128, v4: i128, v5: i128):
v11 = iadd.i128 v9, v10
return v11
; check: movq %rsp, %rbp
; check: pushq %rbp
; nextln: movq %rsp, %rbp
; nextln: subq $$16, %rsp
; nextln: movq %r12, 0(%rsp)
; nextln: movq %r13, 8(%rsp)
; nextln: virtual_sp_offset_adjust 16
; nextln: movq 16(%rbp), %r9
; nextln: movq 24(%rbp), %r10
; nextln: movq 32(%rbp), %r12
; nextln: movq 40(%rbp), %r11
; nextln: movq 48(%rbp), %rax
; nextln: movq 56(%rbp), %r13
; nextln: movq 16(%rbp), %r10
; nextln: movq 24(%rbp), %r12
; nextln: movq 32(%rbp), %r11
; nextln: movq 40(%rbp), %rax
; nextln: movq 48(%rbp), %r13
; nextln: addq %rdx, %rdi
; nextln: adcq %rcx, %rsi
; nextln: xorq %rcx, %rcx
@@ -989,10 +990,10 @@ block0(v0: i128):
; nextln: movq %r10, 16(%rsi)
; nextln: movq %r11, 24(%rsi)
; nextln: movq %r12, 32(%rsi)
; nextln: movq %r13, 48(%rsi)
; nextln: movq %r14, 56(%rsi)
; nextln: movq %rdi, 64(%rsi)
; nextln: movq %rbx, 72(%rsi)
; nextln: movq %r13, 40(%rsi)
; nextln: movq %r14, 48(%rsi)
; nextln: movq %rdi, 56(%rsi)
; nextln: movq %rbx, 64(%rsi)
}

@@ -1,4 +1,5 @@
test compile
set enable_llvm_abi_extensions=true
target x86_64
feature "experimental_x64"