x86-64 Windows fastcall ABI support.

This adds support for the "fastcall" ABI, the native C/C++ calling convention
on x86-64 Windows. It is similar to, but not identical to, System V: most
notably, its argument-register assignments differ, and callers must reserve a
32-byte shadow (home) space on the stack for the callee.
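
As a rough illustration of the convention (a hypothetical sketch, not part of
the diff): the first four parameter slots are assigned to rcx, rdx, r8, and r9
for integer/pointer arguments and xmm0-xmm3 for floating-point arguments (by
slot position, not per class), and any remaining arguments are passed on the
stack above the 32-byte shadow area:

function %shadow_sketch(i64, i64, f64, i64, i64) -> i64 windows_fastcall {
block0(v0: i64, v1: i64, v2: f64, v3: i64, v4: i64):
return v4
}
;; Hypothetical case in the style of the tests below: v0 -> rcx, v1 -> rdx,
;; v2 -> xmm2 (third slot), v3 -> r9; v4 is the first stack argument and,
;; after the standard prologue, lives at [rbp+48] (saved rbp at [rbp],
;; return address at [rbp+8], 32-byte shadow space at [rbp+16..47]).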

Note that this also adjusts the handling of multi-register values in the
shared ABI implementation; as a consequence, `i128` handling changes on *both*
Fastcall/x64 *and* SysV/x64 platforms. This was done to match the actual
behavior of the "rustc ABI" on both platforms, as mapped out experimentally
(Compiler Explorer link in the comments). This behavior is gated behind the
`enable_llvm_abi_extensions` flag.
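
Concretely (a hypothetical sketch mirroring the new `%f7` test below, with the
flag enabled): an `i128` occupies two consecutive 8-byte parameter slots, so it
may be split between a register and a stack slot, and an `i128` return value is
placed in rax (low half) and rdx (high half):

function %i128_sketch(i128, i64, i128, i128) -> i128 windows_fastcall {
block0(v0: i128, v1: i64, v2: i128, v3: i128):
return v3
}
;; Hypothetical case: v0 -> rcx:rdx, v1 -> r8, v2 -> r9 + [rbp+48] (split
;; between register and stack), v3 -> [rbp+56] + [rbp+64]; the result is
;; returned in rax:rdx.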

Note also that this does *not* add x64 unwind info on Windows; that is planned
for a future PR.
Chris Fallin
2021-02-22 20:28:49 -08:00
parent 98d3e6823f
commit 6c94eb82aa
13 changed files with 997 additions and 475 deletions

@@ -0,0 +1,299 @@
test compile
set enable_llvm_abi_extensions=true
target x86_64
feature "experimental_x64"
function %f0(i64, i64, i64, i64) -> i64 windows_fastcall {
block0(v0: i64, v1: i64, v2: i64, v3: i64):
return v0
}
; check: pushq %rbp
; nextln: movq %rsp, %rbp
; nextln: movq %rcx, %rax
; nextln: movq %rbp, %rsp
; nextln: popq %rbp
; nextln: ret
function %f1(i64, i64, i64, i64) -> i64 windows_fastcall {
block0(v0: i64, v1: i64, v2: i64, v3: i64):
return v1
}
; check: pushq %rbp
; nextln: movq %rsp, %rbp
; nextln: movq %rdx, %rax
; nextln: movq %rbp, %rsp
; nextln: popq %rbp
; nextln: ret
function %f2(i64, i64, i64, i64) -> i64 windows_fastcall {
block0(v0: i64, v1: i64, v2: i64, v3: i64):
return v2
}
; check: pushq %rbp
; nextln: movq %rsp, %rbp
; nextln: movq %r8, %rax
; nextln: movq %rbp, %rsp
; nextln: popq %rbp
; nextln: ret
function %f3(i64, i64, i64, i64) -> i64 windows_fastcall {
block0(v0: i64, v1: i64, v2: i64, v3: i64):
return v3
}
; check: pushq %rbp
; nextln: movq %rsp, %rbp
; nextln: movq %r9, %rax
; nextln: movq %rbp, %rsp
; nextln: popq %rbp
; nextln: ret
function %f4(i64, i64, f64, i64) -> f64 windows_fastcall {
block0(v0: i64, v1: i64, v2: f64, v3: i64):
return v2
}
; check: pushq %rbp
; nextln: movq %rsp, %rbp
; nextln: movaps %xmm2, %xmm0
; nextln: movq %rbp, %rsp
; nextln: popq %rbp
; nextln: ret
function %f5(i64, i64, f64, i64) -> i64 windows_fastcall {
block0(v0: i64, v1: i64, v2: f64, v3: i64):
return v3
}
; check: pushq %rbp
; nextln: movq %rsp, %rbp
; nextln: movq %r9, %rax
; nextln: movq %rbp, %rsp
; nextln: popq %rbp
; nextln: ret
function %f6(i64, i64, i64, i64, i64, i64) -> i64 windows_fastcall {
block0(v0: i64, v1: i64, v2: i64, v3: i64, v4: i64, v5: i64):
return v5
}
;; This is truly odd (because of the regalloc ordering), but it works. Note
;; that we're spilling and using rsi, which is a callee-save in fastcall, because
;; the regalloc order is optimized for SysV. Also note that because we copy args
;; out of their input locations to separate vregs, we have a spurious load
;; from [rbp+48]. Ordinarily these moves are coalesced because the dest vreg
;; is allocated as a caller-save (volatile), but here again we allocate rsi
;; first and so have to spill it (and consequently don't coalesce).
;;
;; TODO(#2704): fix regalloc's register priority ordering!
; check: pushq %rbp
; nextln: movq %rsp, %rbp
; nextln: subq $$16, %rsp
; nextln: movq %rsi, 0(%rsp)
; nextln: virtual_sp_offset_adjust 16
; nextln: movq 48(%rbp), %rsi
; nextln: movq 56(%rbp), %rsi
; nextln: movq %rsi, %rax
; nextln: movq 0(%rsp), %rsi
; nextln: addq $$16, %rsp
; nextln: movq %rbp, %rsp
; nextln: popq %rbp
; nextln: ret
function %f7(i128, i64, i128, i128) -> i128 windows_fastcall {
block0(v0: i128, v1: i64, v2: i128, v3: i128):
return v3
}
;; Again, terrible regalloc behavior. The important part is that `v3` comes
;; from [rbp+56] and [rbp+64], i.e., the second and third non-shadow
;; stack slots.
; check: pushq %rbp
; nextln: movq %rsp, %rbp
; nextln: subq $$16, %rsp
; nextln: movq %rsi, 0(%rsp)
; nextln: movq %rdi, 8(%rsp)
; nextln: virtual_sp_offset_adjust 16
; nextln: movq 48(%rbp), %rsi
; nextln: movq 56(%rbp), %rsi
; nextln: movq 64(%rbp), %rdi
; nextln: movq %rsi, %rax
; nextln: movq %rdi, %rdx
; nextln: movq 0(%rsp), %rsi
; nextln: movq 8(%rsp), %rdi
; nextln: addq $$16, %rsp
; nextln: movq %rbp, %rsp
; nextln: popq %rbp
; nextln: ret
function %f8(i64) -> i64 windows_fastcall {
sig0 = (i64, i64, f64, f64, i64, i64) -> i64 windows_fastcall
fn0 = %g sig0
block0(v0: i64):
v1 = fcvt_from_sint.f64 v0
v2 = call fn0(v0, v0, v1, v1, v0, v0)
return v2
}
; check: pushq %rbp
; nextln: movq %rsp, %rbp
; nextln: subq $$16, %rsp
; nextln: movq %rsi, 0(%rsp)
; nextln: virtual_sp_offset_adjust 16
; nextln: movq %rcx, %rsi
; nextln: cvtsi2sd %rsi, %xmm3
; nextln: subq $$48, %rsp
; nextln: virtual_sp_offset_adjust 48
; nextln: movq %rsi, %rcx
; nextln: movq %rsi, %rdx
; nextln: movaps %xmm3, %xmm2
; nextln: movq %rsi, 32(%rsp)
; nextln: movq %rsi, 40(%rsp)
; nextln: load_ext_name %g+0, %rsi
; nextln: call *%rsi
; nextln: addq $$48, %rsp
; nextln: virtual_sp_offset_adjust -48
; nextln: movq 0(%rsp), %rsi
; nextln: addq $$16, %rsp
; nextln: movq %rbp, %rsp
; nextln: popq %rbp
; nextln: ret
function %f9(i64) -> f64 windows_fastcall {
block0(v0: i64):
v1 = load.f64 v0+0
v2 = load.f64 v0+8
v3 = load.f64 v0+16
v4 = load.f64 v0+24
v5 = load.f64 v0+32
v6 = load.f64 v0+40
v7 = load.f64 v0+48
v8 = load.f64 v0+56
v9 = load.f64 v0+64
v10 = load.f64 v0+72
v11 = load.f64 v0+80
v12 = load.f64 v0+88
v13 = load.f64 v0+96
v14 = load.f64 v0+104
v15 = load.f64 v0+112
v16 = load.f64 v0+120
v17 = load.f64 v0+128
v18 = load.f64 v0+136
v19 = load.f64 v0+144
v20 = load.f64 v0+152
v21 = fadd.f64 v1, v2
v22 = fadd.f64 v3, v4
v23 = fadd.f64 v5, v6
v24 = fadd.f64 v7, v8
v25 = fadd.f64 v9, v10
v26 = fadd.f64 v11, v12
v27 = fadd.f64 v13, v14
v28 = fadd.f64 v15, v16
v29 = fadd.f64 v17, v18
v30 = fadd.f64 v19, v20
v31 = fadd.f64 v21, v22
v32 = fadd.f64 v23, v24
v33 = fadd.f64 v25, v26
v34 = fadd.f64 v27, v28
v35 = fadd.f64 v29, v30
v36 = fadd.f64 v31, v32
v37 = fadd.f64 v33, v34
v38 = fadd.f64 v36, v37
v39 = fadd.f64 v38, v35
return v39
}
; check: pushq %rbp
; nextln: movq %rsp, %rbp
; nextln: subq $$208, %rsp
; nextln: movdqu %xmm6, 0(%rsp)
; nextln: movdqu %xmm7, 16(%rsp)
; nextln: movdqu %xmm8, 32(%rsp)
; nextln: movdqu %xmm9, 48(%rsp)
; nextln: movdqu %xmm10, 64(%rsp)
; nextln: movdqu %xmm11, 80(%rsp)
; nextln: movdqu %xmm12, 96(%rsp)
; nextln: movdqu %xmm13, 112(%rsp)
; nextln: movdqu %xmm14, 128(%rsp)
; nextln: movdqu %xmm15, 144(%rsp)
; nextln: virtual_sp_offset_adjust 160
; nextln: movsd 0(%rcx), %xmm0
; nextln: movsd %xmm0, rsp(16 + virtual offset)
; nextln: movsd 8(%rcx), %xmm1
; nextln: movsd 16(%rcx), %xmm0
; nextln: movsd %xmm0, rsp(24 + virtual offset)
; nextln: movsd 24(%rcx), %xmm3
; nextln: movsd 32(%rcx), %xmm0
; nextln: movsd %xmm0, rsp(32 + virtual offset)
; nextln: movsd 40(%rcx), %xmm5
; nextln: movsd 48(%rcx), %xmm6
; nextln: movsd 56(%rcx), %xmm7
; nextln: movsd 64(%rcx), %xmm8
; nextln: movsd 72(%rcx), %xmm9
; nextln: movsd 80(%rcx), %xmm10
; nextln: movsd 88(%rcx), %xmm11
; nextln: movsd 96(%rcx), %xmm12
; nextln: movsd 104(%rcx), %xmm13
; nextln: movsd 112(%rcx), %xmm14
; nextln: movsd 120(%rcx), %xmm15
; nextln: movsd 128(%rcx), %xmm0
; nextln: movsd %xmm0, rsp(0 + virtual offset)
; nextln: movsd 136(%rcx), %xmm0
; nextln: movsd 144(%rcx), %xmm2
; nextln: movsd %xmm2, rsp(8 + virtual offset)
; nextln: movsd 152(%rcx), %xmm2
; nextln: nop len=0
; nextln: movsd rsp(16 + virtual offset), %xmm4
; nextln: addsd %xmm1, %xmm4
; nextln: movsd %xmm4, rsp(16 + virtual offset)
; nextln: movsd rsp(24 + virtual offset), %xmm1
; nextln: addsd %xmm3, %xmm1
; nextln: movsd rsp(32 + virtual offset), %xmm4
; nextln: addsd %xmm5, %xmm4
; nextln: addsd %xmm7, %xmm6
; nextln: addsd %xmm9, %xmm8
; nextln: addsd %xmm11, %xmm10
; nextln: addsd %xmm13, %xmm12
; nextln: addsd %xmm15, %xmm14
; nextln: movsd rsp(0 + virtual offset), %xmm3
; nextln: addsd %xmm0, %xmm3
; nextln: movsd rsp(8 + virtual offset), %xmm0
; nextln: addsd %xmm2, %xmm0
; nextln: movsd rsp(16 + virtual offset), %xmm2
; nextln: addsd %xmm1, %xmm2
; nextln: addsd %xmm6, %xmm4
; nextln: addsd %xmm10, %xmm8
; nextln: addsd %xmm14, %xmm12
; nextln: addsd %xmm0, %xmm3
; nextln: addsd %xmm4, %xmm2
; nextln: addsd %xmm12, %xmm8
; nextln: addsd %xmm8, %xmm2
; nextln: addsd %xmm3, %xmm2
; nextln: movaps %xmm2, %xmm0
; nextln: movdqu 0(%rsp), %xmm6
; nextln: movdqu 16(%rsp), %xmm7
; nextln: movdqu 32(%rsp), %xmm8
; nextln: movdqu 48(%rsp), %xmm9
; nextln: movdqu 64(%rsp), %xmm10
; nextln: movdqu 80(%rsp), %xmm11
; nextln: movdqu 96(%rsp), %xmm12
; nextln: movdqu 112(%rsp), %xmm13
; nextln: movdqu 128(%rsp), %xmm14
; nextln: movdqu 144(%rsp), %xmm15
; nextln: addq $$160, %rsp
; nextln: movq %rbp, %rsp
; nextln: popq %rbp
; nextln: ret

@@ -1,4 +1,5 @@
test compile
set enable_llvm_abi_extensions=true
target x86_64
feature "experimental_x64"
@@ -941,17 +942,17 @@ block0(v0: i128, v1: i128, v2: i64, v3: i128, v4: i128, v5: i128):
v11 = iadd.i128 v9, v10
return v11
; check: movq %rsp, %rbp
; check: pushq %rbp
; nextln: movq %rsp, %rbp
; nextln: subq $$16, %rsp
; nextln: movq %r12, 0(%rsp)
; nextln: movq %r13, 8(%rsp)
; nextln: virtual_sp_offset_adjust 16
; nextln: movq 16(%rbp), %r9
; nextln: movq 24(%rbp), %r10
; nextln: movq 32(%rbp), %r12
; nextln: movq 40(%rbp), %r11
; nextln: movq 48(%rbp), %rax
; nextln: movq 56(%rbp), %r13
; nextln: movq 16(%rbp), %r10
; nextln: movq 24(%rbp), %r12
; nextln: movq 32(%rbp), %r11
; nextln: movq 40(%rbp), %rax
; nextln: movq 48(%rbp), %r13
; nextln: addq %rdx, %rdi
; nextln: adcq %rcx, %rsi
; nextln: xorq %rcx, %rcx
@@ -989,10 +990,10 @@ block0(v0: i128):
; nextln: movq %r10, 16(%rsi)
; nextln: movq %r11, 24(%rsi)
; nextln: movq %r12, 32(%rsi)
; nextln: movq %r13, 48(%rsi)
; nextln: movq %r14, 56(%rsi)
; nextln: movq %rdi, 64(%rsi)
; nextln: movq %rbx, 72(%rsi)
; nextln: movq %r13, 40(%rsi)
; nextln: movq %r14, 48(%rsi)
; nextln: movq %rdi, 56(%rsi)
; nextln: movq %rbx, 64(%rsi)
}

@@ -1,4 +1,5 @@
test compile
set enable_llvm_abi_extensions=true
target x86_64
feature "experimental_x64"