Fully support multiple returns in Wasmtime (#2806)

* Fully support multiple returns in Wasmtime For quite some time now Wasmtime has "supported" multiple return values, but only in the most bare-bones ways. Up until recently you couldn't get a typed version of functions with multiple return values, and never have you been able to use `Func::wrap` with functions that return multiple values. Even now that `Func::typed` can call functions that return multiple values, it uses a double-indirection by calling a trampoline which calls the real function. The underlying reason for this lack of support is that Cranelift's ABI for returning multiple values is not possible to write in Rust. For example, if a wasm function returns two `i32` values there is no Rust (or C!) function you can write to correspond to that. This commit, however, fixes that. This commit adds two new ABIs to Cranelift: `WasmtimeSystemV` and `WasmtimeFastcall`. The intention is that these Wasmtime-specific ABIs match their corresponding ABI (e.g. `SystemV` or `WindowsFastcall`) for everything *except* how multiple values are returned. For multiple return values we simply define our own version of the ABI which Wasmtime implements, which is that for N return values the first is returned as if the function only returned that and the latter N-1 return values are returned via an out-ptr that's the last parameter to the function. These custom ABIs provide the ability for Wasmtime to bind these in Rust, meaning that `Func::wrap` can now wrap functions that return multiple values and `Func::typed` no longer uses trampolines when calling functions that return multiple values. Although there are lots of internal changes, there are no actual changes in the API surface area of Wasmtime, just a few more impls of more public traits, which means that more types are supported in more places! Another change made with this PR is a consolidation of how the ABI of each function in a wasm module is selected. 
The native `SystemV` ABI, for example, is more efficient at returning multiple values than the wasmtime version of the ABI (since more things are in more registers). To continue to take advantage of this Wasmtime will now classify some functions in a wasm module with the "fast" ABI. Only functions that are not reachable externally from the module are classified with the fast ABI (e.g. those not exported, used in tables, or used with `ref.func`). This should enable purely internal functions of modules to have a faster calling convention than those which might be exposed to Wasmtime itself. Closes #1178 * Tweak some names and add docs * "fix" lightbeam compile * Fix TODO with dummy environ * Unwind info is a property of the target, not the ABI * Remove lightbeam unused imports * Attempt to fix arm64 * Document new ABIs aren't stable * Fix filetests to use the right target * Don't always do 64-bit stores with cranelift This was overwriting upper bits when 32-bit registers were being stored into return values, so fix the code inline to do a sized store instead of one-size-fits-all store. * At least get tests passing on the old backend * Fix a typo * Add some filetests with mixed abi calls * Get `multi` example working * Fix doctests on old x86 backend * Add a mixture of wasmtime/system_v tests
2021-04-07 12:34:26 -05:00
parent 7588565078
commit 195bf0e29a
37 changed files with 1116 additions and 459 deletions
--- a/cranelift/filetests/filetests/isa/x64/call-conv.clif
+++ b/cranelift/filetests/filetests/isa/x64/call-conv.clif
@@ -0,0 +1,344 @@
+test compile
+target x86_64 machinst
+
+;; system_v has first param in %rdi, fastcall in %rcx
+function %one_arg(i32) system_v {
+    sig0 = (i32) windows_fastcall
+block0(v0: i32):
+    ; check:  movq    %rdi, %rcx
+    ; nextln: call    *%rdi
+    call_indirect sig0, v0(v0)
+    return
+}
+
+;; system_v has params in %rdi, %xmm0, fastcall in %rcx, %xmm1
+function %two_args(i32, f32) system_v {
+    sig0 = (i32, f32) windows_fastcall
+    sig1 = (i32, f32) system_v
+block0(v0: i32, v1: f32):
+    ; check:  movq    %rdi, %rsi
+    ; check:  movaps  %xmm0, %xmm6
+    ; check:  movq    %rsi, %rcx
+    ; nextln: movaps  %xmm6, %xmm1
+    ; nextln: call    *%rsi
+    call_indirect sig0, v0(v0, v1)
+
+    ; check:  movq    %rsi, %rdi
+    ; nextln: movaps  %xmm6, %xmm0
+    ; nextln: call    *%rsi
+    call_indirect sig1, v0(v0, v1)
+    return
+}
+
+;; fastcall preserves xmm6+, rbx, rbp, rdi, rsi, r12-r15
+;; system_v preserves no xmm registers, rbx, rbp, r12-r15
+function %fastcall_to_systemv(i32) windows_fastcall {
+    sig0 = () system_v
+block0(v0: i32):
+    ; check:  pushq   %rbp
+    ; nextln: movq    %rsp, %rbp
+    ; nextln: subq    $$176, %rsp
+    ; nextln: movdqu  %xmm6, 0(%rsp)
+    ; nextln: movdqu  %xmm7, 16(%rsp)
+    ; nextln: movdqu  %xmm8, 32(%rsp)
+    ; nextln: movdqu  %xmm9, 48(%rsp)
+    ; nextln: movdqu  %xmm10, 64(%rsp)
+    ; nextln: movdqu  %xmm11, 80(%rsp)
+    ; nextln: movdqu  %xmm12, 96(%rsp)
+    ; nextln: movdqu  %xmm13, 112(%rsp)
+    ; nextln: movdqu  %xmm14, 128(%rsp)
+    ; nextln: movdqu  %xmm15, 144(%rsp)
+    ; nextln: movq    %rsi, 160(%rsp)
+    ; nextln: movq    %rdi, 168(%rsp)
+    ; nextln: call    *%rcx
+    ; nextln: movdqu  0(%rsp), %xmm6
+    ; nextln: movdqu  16(%rsp), %xmm7
+    ; nextln: movdqu  32(%rsp), %xmm8
+    ; nextln: movdqu  48(%rsp), %xmm9
+    ; nextln: movdqu  64(%rsp), %xmm10
+    ; nextln: movdqu  80(%rsp), %xmm11
+    ; nextln: movdqu  96(%rsp), %xmm12
+    ; nextln: movdqu  112(%rsp), %xmm13
+    ; nextln: movdqu  128(%rsp), %xmm14
+    ; nextln: movdqu  144(%rsp), %xmm15
+    ; nextln: movq    160(%rsp), %rsi
+    ; nextln: movq    168(%rsp), %rdi
+    ; nextln: addq    $$176, %rsp
+    ; nextln: movq    %rbp, %rsp
+    ; nextln: popq    %rbp
+    ; nextln: ret
+    call_indirect sig0, v0()
+    return
+}
+
+function %many_args(
+    ;; rdi, rsi, rdx, rcx, r8, r9,
+    i64, i64, i64, i64, i64, i64,
+
+    ;; xmm0-7
+    f64, f64, f64, f64, f64, f64, f64, f64,
+
+    ;; stack args
+    i64, i32, f32, f64
+) system_v {
+    sig0 = (
+      i64, i64, i64, i64, i64, i64, f64, f64, f64, f64, f64, f64, f64, f64, i64,
+      i32, f32, f64
+    ) windows_fastcall
+block0(
+      v0: i64, v1:i64, v2:i64, v3:i64,
+      v4:i64, v5:i64,
+      v6: f64, v7: f64, v8:f64, v9:f64, v10:f64, v11:f64, v12:f64, v13:f64,
+      v14:i64, v15:i32, v16:f32, v17:f64
+):
+    ; check:  pushq   %rbp
+    ; nextln: movq    %rsp, %rbp
+    ; nextln: subq    $$32, %rsp
+    ; nextln: movq    %r12, 0(%rsp)
+    ; nextln: movq    %r13, 8(%rsp)
+    ; nextln: movq    %r14, 16(%rsp)
+    ; nextln: movq    %rdx, %rax
+    ; nextln: movq    %rcx, %r10
+    ; nextln: movq    %r8, %r11
+    ; nextln: movq    %r9, %r12
+    ; nextln: movq    16(%rbp), %r13
+    ; nextln: movslq  24(%rbp), %r14
+    ; nextln: movss   32(%rbp), %xmm8
+    ; nextln: movsd   40(%rbp), %xmm9
+    ; nextln: subq    $$144, %rsp
+    ; nextln: virtual_sp_offset_adjust 144
+    ; nextln: movq    %rdi, %rcx
+    ; nextln: movq    %rsi, %rdx
+    ; nextln: movq    %rax, %r8
+    ; nextln: movq    %r10, %r9
+    ; nextln: movq    %r11, 32(%rsp)
+    ; nextln: movq    %r12, 40(%rsp)
+    ; nextln: movsd   %xmm0, 48(%rsp)
+    ; nextln: movsd   %xmm1, 56(%rsp)
+    ; nextln: movsd   %xmm2, 64(%rsp)
+    ; nextln: movsd   %xmm3, 72(%rsp)
+    ; nextln: movsd   %xmm4, 80(%rsp)
+    ; nextln: movsd   %xmm5, 88(%rsp)
+    ; nextln: movsd   %xmm6, 96(%rsp)
+    ; nextln: movsd   %xmm7, 104(%rsp)
+    ; nextln: movq    %r13, 112(%rsp)
+    ; nextln: movl    %r14d, 120(%rsp)
+    ; nextln: movss   %xmm8, 128(%rsp)
+    ; nextln: movsd   %xmm9, 136(%rsp)
+    ; nextln: call    *%rdi
+    ; nextln: addq    $$144, %rsp
+    ; nextln: virtual_sp_offset_adjust -144
+    ; nextln: movq    0(%rsp), %r12
+    ; nextln: movq    8(%rsp), %r13
+    ; nextln: movq    16(%rsp), %r14
+    ; nextln: addq    $$32, %rsp
+    ; nextln: movq    %rbp, %rsp
+    ; nextln: popq    %rbp
+    ; nextln: ret
+    call_indirect sig0, v0(
+      v0, v1, v2, v3,
+      v4, v5, v6, v7,
+      v8, v9, v10, v11,
+      v12, v13, v14, v15,
+      v16, v17
+    )
+    return
+}
+
+; rdi => rcx
+; rsi => rdx
+; rdx => r8
+; rcx => r9
+; r8 => stack
+function %many_ints(i64, i64, i64, i64, i64) system_v {
+    sig0 = (i64, i64, i64, i64, i64) windows_fastcall
+block0(v0: i64, v1:i64, v2:i64, v3:i64, v4:i64):
+    ; check:  pushq   %rbp
+    ; nextln: movq    %rsp, %rbp
+    ; nextln: movq    %rdx, %rax
+    ; nextln: movq    %rcx, %r9
+    ; nextln: movq    %r8, %r10
+    ; nextln: subq    $$48, %rsp
+    ; nextln: virtual_sp_offset_adjust 48
+    ; nextln: movq    %rdi, %rcx
+    ; nextln: movq    %rsi, %rdx
+    ; nextln: movq    %rax, %r8
+    ; nextln: movq    %r10, 32(%rsp)
+    ; nextln: call    *%rdi
+    ; nextln: addq    $$48, %rsp
+    ; nextln: virtual_sp_offset_adjust -48
+    ; nextln: movq    %rbp, %rsp
+    ; nextln: popq    %rbp
+    ; nextln: ret
+    call_indirect sig0, v0(v0, v1, v2, v3, v4)
+    return
+}
+
+function %many_args2(i32, f32, i64, f64, i32, i32, i32, f32, f64, f32, f64) system_v {
+    sig0 = (i32, f32, i64, f64, i32, i32, i32, f32, f64, f32, f64) windows_fastcall
+block0(v0: i32, v1: f32, v2: i64, v3: f64, v4: i32, v5: i32, v6: i32, v7: f32, v8: f64, v9: f32, v10: f64):
+    ; check:   pushq   %rbp
+    ; nextln:  movq    %rsp, %rbp
+    ; nextln:  movaps  %xmm1, %xmm6
+    ; nextln:  movq    %rcx, %rax
+    ; nextln:  movq    %r8, %r9
+    ; nextln:  movaps  %xmm3, %xmm7
+    ; nextln:  subq    $$96, %rsp
+    ; nextln:  virtual_sp_offset_adjust 96
+    ; nextln:  movq    %rdi, %rcx
+    ; nextln:  movaps  %xmm0, %xmm1
+    ; nextln:  movq    %rsi, %r8
+    ; nextln:  movaps  %xmm6, %xmm3
+    ; nextln:  movl    %edx, 32(%rsp)
+    ; nextln:  movl    %eax, 40(%rsp)
+    ; nextln:  movl    %r9d, 48(%rsp)
+    ; nextln:  movss   %xmm2, 56(%rsp)
+    ; nextln:  movsd   %xmm7, 64(%rsp)
+    ; nextln:  movss   %xmm4, 72(%rsp)
+    ; nextln:  movsd   %xmm5, 80(%rsp)
+    ; nextln:  call    *%rdi
+    ; nextln:  addq    $$96, %rsp
+    ; nextln:  virtual_sp_offset_adjust -96
+    ; nextln:  movq    %rbp, %rsp
+    ; nextln:  popq    %rbp
+    ; nextln:  ret
+    call_indirect sig0, v0(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10)
+    return
+}
+
+function %wasmtime_mix1(i32) wasmtime_system_v {
+    sig0 = (i32) system_v
+block0(v0: i32):
+    ; check:  movq    %rdi, %rsi
+    ; nextln: movq    %rsi, %rdi
+    ; nextln: call    *%rsi
+    call_indirect sig0, v0(v0)
+    return
+}
+
+function %wasmtime_mix2(i32) system_v {
+    sig0 = (i32) wasmtime_system_v
+block0(v0: i32):
+    ; check:  movq    %rdi, %rsi
+    ; nextln: movq    %rsi, %rdi
+    ; nextln: call    *%rsi
+    call_indirect sig0, v0(v0)
+    return
+}
+
+function %wasmtime_mix2() -> i32, i32 system_v {
+    sig0 = () -> i32, i32 wasmtime_system_v
+block0:
+    ; check:  pushq   %rbp
+    ; nextln: movq    %rsp, %rbp
+    ; nextln: movl    $$1, %esi
+    ; nextln: subq    $$16, %rsp
+    ; nextln: virtual_sp_offset_adjust 16
+    ; nextln: lea     0(%rsp), %rdi
+    ; nextln: call    *%rsi
+    ; nextln: movslq  0(%rsp), %rsi
+    ; nextln: addq    $$16, %rsp
+    ; nextln: virtual_sp_offset_adjust -16
+    ; nextln: movq    %rsi, %rdx
+    ; nextln: movq    %rbp, %rsp
+    ; nextln: popq    %rbp
+    ; nextln: ret
+    v2 = iconst.i32 1
+    v0, v1 = call_indirect sig0, v2()
+    return v0, v1
+}
+
+function %wasmtime_mix3() -> i32, i32 wasmtime_system_v {
+    sig0 = () -> i32, i32 system_v
+block0:
+    ; check:  pushq   %rbp
+    ; nextln: movq    %rsp, %rbp
+    ; nextln: subq    $$16, %rsp
+    ; nextln: movq    %r12, 0(%rsp)
+    ; nextln: movq    %rdi, %r12
+    ; nextln: movl    $$1, %esi
+    ; nextln: call    *%rsi
+    ; nextln: movl    %edx, 0(%r12)
+    ; nextln: movq    0(%rsp), %r12
+    ; nextln: addq    $$16, %rsp
+    ; nextln: movq    %rbp, %rsp
+    ; nextln: popq    %rbp
+    ; nextln: ret
+    v2 = iconst.i32 1
+    v0, v1 = call_indirect sig0, v2()
+    return v0, v1
+}
+
+function %wasmtime_mix4() -> i32, i64, i32 wasmtime_system_v {
+    sig0 = () -> i32, i64, i32 system_v
+block0:
+    ; check:  pushq   %rbp
+    ; nextln: movq    %rsp, %rbp
+    ; nextln: subq    $$16, %rsp
+    ; nextln: movq    %r12, 0(%rsp)
+    ; nextln: movq    %rdi, %r12
+    ; nextln: movl    $$1, %esi
+    ; nextln: subq    $$16, %rsp
+    ; nextln: virtual_sp_offset_adjust 16
+    ; nextln: lea     0(%rsp), %rdi
+    ; nextln: call    *%rsi
+    ; nextln: movslq  0(%rsp), %rsi
+    ; nextln: addq    $$16, %rsp
+    ; nextln: virtual_sp_offset_adjust -16
+    ; nextln: movq    %rdx, 0(%r12)
+    ; nextln: movl    %esi, 8(%r12)
+    ; nextln: movq    0(%rsp), %r12
+    ; nextln: addq    $$16, %rsp
+    ; nextln: movq    %rbp, %rsp
+    ; nextln: popq    %rbp
+    ; nextln: ret
+    v3 = iconst.i32 1
+    v0, v1, v2 = call_indirect sig0, v3()
+    return v0, v1, v2
+}
+
+function %wasmtime_mix5() -> f32, i64, i32, f32 wasmtime_system_v {
+    sig0 = () -> f32, i64, i32, f32 system_v
+block0:
+    ; check:  pushq   %rbp
+    ; nextln: movq    %rsp, %rbp
+    ; nextln: subq    $$16, %rsp
+    ; nextln: movq    %r12, 0(%rsp)
+    ; nextln: movq    %rdi, %r12
+    ; nextln: movl    $$1, %esi
+    ; nextln: call    *%rsi
+    ; nextln: movq    %rax, 0(%r12)
+    ; nextln: movl    %edx, 8(%r12)
+    ; nextln: movss   %xmm1, 12(%r12)
+    ; nextln: movq    0(%rsp), %r12
+    ; nextln: addq    $$16, %rsp
+    ; nextln: movq    %rbp, %rsp
+    ; nextln: popq    %rbp
+    ; nextln: ret
+    v5 = iconst.i32 1
+    v0, v1, v2, v3 = call_indirect sig0, v5()
+    return v0, v1, v2, v3
+}
+
+function %wasmtime_mix6(f32, i64, i32, f32) -> f32, i64, i32, f32 wasmtime_system_v {
+    sig0 = (f32, i64, i32, f32) -> f32, i64, i32, f32 system_v
+block0(v0: f32, v1: i64, v2: i32, v3: f32):
+    ; check:  pushq   %rbp
+    ; nextln: movq    %rsp, %rbp
+    ; nextln: subq    $$16, %rsp
+    ; nextln: movq    %r12, 0(%rsp)
+    ; nextln: movq    %rdx, %r12
+    ; nextln: movl    $$1, %eax
+    ; nextln: call    *%rax
+    ; nextln: movq    %rax, 0(%r12)
+    ; nextln: movl    %edx, 8(%r12)
+    ; nextln: movss   %xmm1, 12(%r12)
+    ; nextln: movq    0(%rsp), %r12
+    ; nextln: addq    $$16, %rsp
+    ; nextln: movq    %rbp, %rsp
+    ; nextln: popq    %rbp
+    ; nextln: ret
+    v4 = iconst.i32 1
+    v5, v6, v7, v8 = call_indirect sig0, v4(v0, v1, v2, v3)
+    return v5, v6, v7, v8
+}
--- a/cranelift/filetests/filetests/isa/x86/systemv_x64_unwind.clif
+++ b/cranelift/filetests/filetests/isa/x86/systemv_x64_unwind.clif
@@ -1,7 +1,7 @@
 test unwind
 set opt_level=speed_and_size
 set is_pic
-target x86_64 legacy haswell
+target x86_64-linux legacy haswell

 ; check the unwind information with a function with no args
 function %no_args() system_v {
--- a/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64_unwind.clif
+++ b/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64_unwind.clif
@@ -1,7 +1,7 @@
 test unwind
 set opt_level=speed_and_size
 set is_pic
-target x86_64 legacy haswell
+target x86_64-windows legacy haswell

 ; check the unwind information with a leaf function with no args
 function %no_args_leaf() windows_fastcall {