Merge pull request #2678 from cfallin/x64-fastcall

x86-64 Windows fastcall ABI support.
Authored by Chris Fallin on 2021-03-05 10:46:47 -08:00; committed by GitHub.
13 changed files with 997 additions and 475 deletions
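
For context, the Windows x64 fastcall convention passes the first four arguments in RCX, RDX, R8, and R9 for integer/pointer types and in XMM0 through XMM3 for floating-point types (the register is chosen by parameter position, not per class), with further arguments on the stack above a caller-allocated 32-byte shadow space; RBX, RBP, RDI, RSI, R12 through R15, and XMM6 through XMM15 are callee-saved. The sketch below is written in the same filetest style as the new tests but is not part of this commit; it only illustrates the expected assignment for a hypothetical five-argument function:

function %example(i64, f64, i64, i64, i64) -> i64 windows_fastcall {
block0(v0: i64, v1: f64, v2: i64, v3: i64, v4: i64):
    ;; v0 arrives in %rcx, v1 in %xmm1 (second position), v2 in %r8, v3 in %r9;
    ;; v4 is the fifth argument, so it lives on the stack at [rbp+48] once the
    ;; return address ([rbp+8]) and the 32-byte shadow space ([rbp+16..47]) are skipped.
    return v4
}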

@@ -0,0 +1,299 @@
test compile
set enable_llvm_abi_extensions=true
target x86_64
feature "experimental_x64"
function %f0(i64, i64, i64, i64) -> i64 windows_fastcall {
block0(v0: i64, v1: i64, v2: i64, v3: i64):
return v0
}
; check: pushq %rbp
; nextln: movq %rsp, %rbp
; nextln: movq %rcx, %rax
; nextln: movq %rbp, %rsp
; nextln: popq %rbp
; nextln: ret
function %f1(i64, i64, i64, i64) -> i64 windows_fastcall {
block0(v0: i64, v1: i64, v2: i64, v3: i64):
return v1
}
; check: pushq %rbp
; nextln: movq %rsp, %rbp
; nextln: movq %rdx, %rax
; nextln: movq %rbp, %rsp
; nextln: popq %rbp
; nextln: ret
function %f2(i64, i64, i64, i64) -> i64 windows_fastcall {
block0(v0: i64, v1: i64, v2: i64, v3: i64):
return v2
}
; check: pushq %rbp
; nextln: movq %rsp, %rbp
; nextln: movq %r8, %rax
; nextln: movq %rbp, %rsp
; nextln: popq %rbp
; nextln: ret
function %f3(i64, i64, i64, i64) -> i64 windows_fastcall {
block0(v0: i64, v1: i64, v2: i64, v3: i64):
return v3
}
; check: pushq %rbp
; nextln: movq %rsp, %rbp
; nextln: movq %r9, %rax
; nextln: movq %rbp, %rsp
; nextln: popq %rbp
; nextln: ret
function %f4(i64, i64, f64, i64) -> f64 windows_fastcall {
block0(v0: i64, v1: i64, v2: f64, v3: i64):
return v2
}
; check: pushq %rbp
; nextln: movq %rsp, %rbp
; nextln: movaps %xmm2, %xmm0
; nextln: movq %rbp, %rsp
; nextln: popq %rbp
; nextln: ret
function %f5(i64, i64, f64, i64) -> i64 windows_fastcall {
block0(v0: i64, v1: i64, v2: f64, v3: i64):
return v3
}
; check: pushq %rbp
; nextln: movq %rsp, %rbp
; nextln: movq %r9, %rax
; nextln: movq %rbp, %rsp
; nextln: popq %rbp
; nextln: ret
function %f6(i64, i64, i64, i64, i64, i64) -> i64 windows_fastcall {
block0(v0: i64, v1: i64, v2: i64, v3: i64, v4: i64, v5: i64):
return v5
}
;; This is truly odd (because of the regalloc ordering), but it works. Note
;; that we're spilling and using rsi, which is a callee-save in fastcall, because
;; the regalloc order is optimized for SysV. Also note that because we copy args
;; out of their input locations to separate vregs, we have a spurious load
;; from [rbp+48]. Ordinarily these moves are coalesced because the dest vreg
;; is allocated as a caller-save (volatile), but here again we allocate rsi
;; first and so have to spill it (and consequently don't coalesce).
;;
;; TODO(#2704): fix regalloc's register priority ordering!
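;;
;; (For reference: with %rbp pushed, [rbp+8] holds the return address and
;; [rbp+16..47] the caller's 32-byte shadow space, so the fifth and sixth
;; arguments land at [rbp+48] and [rbp+56] below.)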
; check: pushq %rbp
; nextln: movq %rsp, %rbp
; nextln: subq $$16, %rsp
; nextln: movq %rsi, 0(%rsp)
; nextln: virtual_sp_offset_adjust 16
; nextln: movq 48(%rbp), %rsi
; nextln: movq 56(%rbp), %rsi
; nextln: movq %rsi, %rax
; nextln: movq 0(%rsp), %rsi
; nextln: addq $$16, %rsp
; nextln: movq %rbp, %rsp
; nextln: popq %rbp
; nextln: ret
function %f7(i128, i64, i128, i128) -> i128 windows_fastcall {
block0(v0: i128, v1: i64, v2: i128, v3: i128):
return v3
}
;; Again, terrible regalloc behavior. The important part is that `v3` comes
;; from [rbp+56] and [rbp+64], i.e., the second and third non-shadow
;; stack slots.
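;; ([rbp+48] is the first stack slot past the 32-byte shadow space, so
;; [rbp+56] and [rbp+64] are the second and third.)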
; check: pushq %rbp
; nextln: movq %rsp, %rbp
; nextln: subq $$16, %rsp
; nextln: movq %rsi, 0(%rsp)
; nextln: movq %rdi, 8(%rsp)
; nextln: virtual_sp_offset_adjust 16
; nextln: movq 48(%rbp), %rsi
; nextln: movq 56(%rbp), %rsi
; nextln: movq 64(%rbp), %rdi
; nextln: movq %rsi, %rax
; nextln: movq %rdi, %rdx
; nextln: movq 0(%rsp), %rsi
; nextln: movq 8(%rsp), %rdi
; nextln: addq $$16, %rsp
; nextln: movq %rbp, %rsp
; nextln: popq %rbp
; nextln: ret
function %f8(i64) -> i64 windows_fastcall {
sig0 = (i64, i64, f64, f64, i64, i64) -> i64 windows_fastcall
fn0 = %g sig0
block0(v0: i64):
v1 = fcvt_from_sint.f64 v0
v2 = call fn0(v0, v0, v1, v1, v0, v0)
return v2
}
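;; On the caller side, fastcall requires a 32-byte shadow area in front of any
;; stack arguments, so the call site below reserves 48 bytes (32 bytes of shadow
;; space plus two 8-byte slots for the fifth and sixth arguments at 32(%rsp)
;; and 40(%rsp)), and the two f64 arguments go in %xmm2 and %xmm3 because XMM
;; registers are chosen by parameter position.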
; check: pushq %rbp
; nextln: movq %rsp, %rbp
; nextln: subq $$16, %rsp
; nextln: movq %rsi, 0(%rsp)
; nextln: virtual_sp_offset_adjust 16
; nextln: movq %rcx, %rsi
; nextln: cvtsi2sd %rsi, %xmm3
; nextln: subq $$48, %rsp
; nextln: virtual_sp_offset_adjust 48
; nextln: movq %rsi, %rcx
; nextln: movq %rsi, %rdx
; nextln: movaps %xmm3, %xmm2
; nextln: movq %rsi, 32(%rsp)
; nextln: movq %rsi, 40(%rsp)
; nextln: load_ext_name %g+0, %rsi
; nextln: call *%rsi
; nextln: addq $$48, %rsp
; nextln: virtual_sp_offset_adjust -48
; nextln: movq 0(%rsp), %rsi
; nextln: addq $$16, %rsp
; nextln: movq %rbp, %rsp
; nextln: popq %rbp
; nextln: ret
function %f9(i64) -> f64 windows_fastcall {
block0(v0: i64):
v1 = load.f64 v0+0
v2 = load.f64 v0+8
v3 = load.f64 v0+16
v4 = load.f64 v0+24
v5 = load.f64 v0+32
v6 = load.f64 v0+40
v7 = load.f64 v0+48
v8 = load.f64 v0+56
v9 = load.f64 v0+64
v10 = load.f64 v0+72
v11 = load.f64 v0+80
v12 = load.f64 v0+88
v13 = load.f64 v0+96
v14 = load.f64 v0+104
v15 = load.f64 v0+112
v16 = load.f64 v0+120
v17 = load.f64 v0+128
v18 = load.f64 v0+136
v19 = load.f64 v0+144
v20 = load.f64 v0+152
v21 = fadd.f64 v1, v2
v22 = fadd.f64 v3, v4
v23 = fadd.f64 v5, v6
v24 = fadd.f64 v7, v8
v25 = fadd.f64 v9, v10
v26 = fadd.f64 v11, v12
v27 = fadd.f64 v13, v14
v28 = fadd.f64 v15, v16
v29 = fadd.f64 v17, v18
v30 = fadd.f64 v19, v20
v31 = fadd.f64 v21, v22
v32 = fadd.f64 v23, v24
v33 = fadd.f64 v25, v26
v34 = fadd.f64 v27, v28
v35 = fadd.f64 v29, v30
v36 = fadd.f64 v31, v32
v37 = fadd.f64 v33, v34
v38 = fadd.f64 v36, v37
v39 = fadd.f64 v38, v35
return v39
}
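;; %xmm6 through %xmm15 are callee-saved under fastcall, so the prologue below
;; spills all ten (10 x 16 = 160 bytes of the frame) with movdqu and the
;; epilogue restores them before returning.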
; check: pushq %rbp
; nextln: movq %rsp, %rbp
; nextln: subq $$208, %rsp
; nextln: movdqu %xmm6, 0(%rsp)
; nextln: movdqu %xmm7, 16(%rsp)
; nextln: movdqu %xmm8, 32(%rsp)
; nextln: movdqu %xmm9, 48(%rsp)
; nextln: movdqu %xmm10, 64(%rsp)
; nextln: movdqu %xmm11, 80(%rsp)
; nextln: movdqu %xmm12, 96(%rsp)
; nextln: movdqu %xmm13, 112(%rsp)
; nextln: movdqu %xmm14, 128(%rsp)
; nextln: movdqu %xmm15, 144(%rsp)
; nextln: virtual_sp_offset_adjust 160
; nextln: movsd 0(%rcx), %xmm0
; nextln: movsd %xmm0, rsp(16 + virtual offset)
; nextln: movsd 8(%rcx), %xmm1
; nextln: movsd 16(%rcx), %xmm0
; nextln: movsd %xmm0, rsp(24 + virtual offset)
; nextln: movsd 24(%rcx), %xmm3
; nextln: movsd 32(%rcx), %xmm0
; nextln: movsd %xmm0, rsp(32 + virtual offset)
; nextln: movsd 40(%rcx), %xmm5
; nextln: movsd 48(%rcx), %xmm6
; nextln: movsd 56(%rcx), %xmm7
; nextln: movsd 64(%rcx), %xmm8
; nextln: movsd 72(%rcx), %xmm9
; nextln: movsd 80(%rcx), %xmm10
; nextln: movsd 88(%rcx), %xmm11
; nextln: movsd 96(%rcx), %xmm12
; nextln: movsd 104(%rcx), %xmm13
; nextln: movsd 112(%rcx), %xmm14
; nextln: movsd 120(%rcx), %xmm15
; nextln: movsd 128(%rcx), %xmm0
; nextln: movsd %xmm0, rsp(0 + virtual offset)
; nextln: movsd 136(%rcx), %xmm0
; nextln: movsd 144(%rcx), %xmm2
; nextln: movsd %xmm2, rsp(8 + virtual offset)
; nextln: movsd 152(%rcx), %xmm2
; nextln: nop len=0
; nextln: movsd rsp(16 + virtual offset), %xmm4
; nextln: addsd %xmm1, %xmm4
; nextln: movsd %xmm4, rsp(16 + virtual offset)
; nextln: movsd rsp(24 + virtual offset), %xmm1
; nextln: addsd %xmm3, %xmm1
; nextln: movsd rsp(32 + virtual offset), %xmm4
; nextln: addsd %xmm5, %xmm4
; nextln: addsd %xmm7, %xmm6
; nextln: addsd %xmm9, %xmm8
; nextln: addsd %xmm11, %xmm10
; nextln: addsd %xmm13, %xmm12
; nextln: addsd %xmm15, %xmm14
; nextln: movsd rsp(0 + virtual offset), %xmm3
; nextln: addsd %xmm0, %xmm3
; nextln: movsd rsp(8 + virtual offset), %xmm0
; nextln: addsd %xmm2, %xmm0
; nextln: movsd rsp(16 + virtual offset), %xmm2
; nextln: addsd %xmm1, %xmm2
; nextln: addsd %xmm6, %xmm4
; nextln: addsd %xmm10, %xmm8
; nextln: addsd %xmm14, %xmm12
; nextln: addsd %xmm0, %xmm3
; nextln: addsd %xmm4, %xmm2
; nextln: addsd %xmm12, %xmm8
; nextln: addsd %xmm8, %xmm2
; nextln: addsd %xmm3, %xmm2
; nextln: movaps %xmm2, %xmm0
; nextln: movdqu 0(%rsp), %xmm6
; nextln: movdqu 16(%rsp), %xmm7
; nextln: movdqu 32(%rsp), %xmm8
; nextln: movdqu 48(%rsp), %xmm9
; nextln: movdqu 64(%rsp), %xmm10
; nextln: movdqu 80(%rsp), %xmm11
; nextln: movdqu 96(%rsp), %xmm12
; nextln: movdqu 112(%rsp), %xmm13
; nextln: movdqu 128(%rsp), %xmm14
; nextln: movdqu 144(%rsp), %xmm15
; nextln: addq $$160, %rsp
; nextln: movq %rbp, %rsp
; nextln: popq %rbp
; nextln: ret

@@ -1,4 +1,5 @@
test compile
set enable_llvm_abi_extensions=true
target x86_64
feature "experimental_x64"
@@ -738,17 +739,17 @@ block0(v0: i128, v1: i128, v2: i64, v3: i128, v4: i128, v5: i128):
v11 = iadd.i128 v9, v10
return v11
; check: movq %rsp, %rbp
; check: pushq %rbp
; nextln: movq %rsp, %rbp
; nextln: subq $$16, %rsp
; nextln: movq %r12, 0(%rsp)
; nextln: movq %r13, 8(%rsp)
; nextln: virtual_sp_offset_adjust 16
; nextln: movq 16(%rbp), %r9
; nextln: movq 24(%rbp), %r10
; nextln: movq 32(%rbp), %r12
; nextln: movq 40(%rbp), %r11
; nextln: movq 48(%rbp), %rax
; nextln: movq 56(%rbp), %r13
; nextln: movq 16(%rbp), %r10
; nextln: movq 24(%rbp), %r12
; nextln: movq 32(%rbp), %r11
; nextln: movq 40(%rbp), %rax
; nextln: movq 48(%rbp), %r13
; nextln: addq %rdx, %rdi
; nextln: adcq %rcx, %rsi
; nextln: xorq %rcx, %rcx
@@ -786,10 +787,10 @@ block0(v0: i128):
; nextln: movq %r10, 16(%rsi)
; nextln: movq %r11, 24(%rsi)
; nextln: movq %r12, 32(%rsi)
; nextln: movq %r13, 48(%rsi)
; nextln: movq %r14, 56(%rsi)
; nextln: movq %rdi, 64(%rsi)
; nextln: movq %rbx, 72(%rsi)
; nextln: movq %r13, 40(%rsi)
; nextln: movq %r14, 48(%rsi)
; nextln: movq %rdi, 56(%rsi)
; nextln: movq %rbx, 64(%rsi)
}

@@ -1,4 +1,5 @@
test compile
set enable_llvm_abi_extensions=true
target x86_64
feature "experimental_x64"