Merge pull request #2892 from afonso360/aarch64-multireg-args

Handle i128 arguments in the aarch64 ABI
Chris Fallin
2021-05-21 16:57:42 -07:00
committed by GitHub
4 changed files with 505 additions and 98 deletions


@@ -250,3 +250,232 @@ block0:
; nextln: add sp, sp, #32
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
; i128 tests
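; In %f11 below, the i128 occupies the (already even-aligned) pair x0:x1,
; so the i64 lands in x2; `return v3` returns the high half of the i128,
; which is why the body compiles to a single `mov x0, x1`.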
function %f11(i128, i64) -> i64 {
block0(v0: i128, v1: i64):
v2, v3 = isplit v0
return v3
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: mov x0, x1
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
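; At the call site below, `iconcat v1, v0` makes v1 = 42 the low half and
; v0 the high half, so the caller materializes 42 into x0, moves v0 into
; x1, and passes the separate i64 argument (also 42) in x2.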
function %f11_call(i64) -> i64 {
fn0 = %f11(i128, i64) -> i64
block0(v0: i64):
v1 = iconst.i64 42
v2 = iconcat v1, v0
v3 = call fn0(v2, v1)
return v3
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: mov x1, x0
; nextln: movz x0, #42
; nextln: movz x2, #42
; nextln: ldr x3, 8 ; b 12 ; data
; nextln: blr x3
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
; The standard AArch64 ABI (AAPCS64) requires that an i128 argument start
; at an even-numbered register, so it is passed in x2 and x3.
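; The resulting layout for %f12 is: x0 = the i64, x1 = unused (alignment
; padding), x2 = i128 low half, x3 = i128 high half; `return v2` (the low
; half) therefore compiles to `mov x0, x2`.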
function %f12(i64, i128) -> i64 {
block0(v0: i64, v1: i128):
v2, v3 = isplit v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: mov x0, x2
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f12_call(i64) -> i64 {
fn0 = %f12(i64, i128) -> i64
block0(v0: i64):
v1 = iconst.i64 42
v2 = iconcat v0, v1
v3 = call fn0(v1, v2)
return v3
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: movz x3, #42
; nextln: mov x2, x0
; nextln: movz x0, #42
; nextln: ldr x1, 8 ; b 12 ; data
; nextln: blr x1
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
; The Apple AArch64 ABI does not require even-register alignment, so the
; i128 argument is passed in x1 and x2.
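; The resulting layout for %f13 is: x0 = the i64, x1 = i128 low half,
; x2 = i128 high half, with no padding register; `return v2` (the low
; half) therefore compiles to `mov x0, x1`.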
function %f13(i64, i128) -> i64 apple_aarch64 {
block0(v0: i64, v1: i128):
v2, v3 = isplit v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: mov x0, x1
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f13_call(i64) -> i64 apple_aarch64 {
fn0 = %f13(i64, i128) -> i64 apple_aarch64
block0(v0: i64):
v1 = iconst.i64 42
v2 = iconcat v0, v1
v3 = call fn0(v1, v2)
return v3
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: movz x2, #42
; nextln: mov x1, x0
; nextln: movz x0, #42
; nextln: ldr x3, 8 ; b 12 ; data
; nextln: blr x3
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
; Only eight integer registers (x0-x7) are available for passing arguments.
; Make sure the last i128 is spilled to the stack even though one register
; slot (x7) is still free.
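; The resulting layout for %f14 is: v0 = x0:x1, v1 = x2:x3, v2 = x4:x5,
; v3 = x6. Only x7 remains, which cannot hold a whole i128, so v4 is passed
; entirely on the stack and loaded from [fp, #16] and [fp, #24].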
function %f14(i128, i128, i128, i64, i128) -> i128 {
block0(v0: i128, v1: i128, v2: i128, v3: i64, v4: i128):
return v4
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: ldur x0, [fp, #16]
; nextln: ldur x1, [fp, #24]
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
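; The call site below must correspondingly allocate a 16-byte stack area
; for the final i128: the caller saves v0 into x7:x8, fans it out to the
; register pairs, and stores the same two halves at [sp] and [sp, #8].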
function %f14_call(i128, i64) -> i128 {
fn0 = %f14(i128, i128, i128, i64, i128) -> i128
block0(v0: i128, v1: i64):
v2 = call fn0(v0, v0, v0, v1, v0)
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; TODO: Codegen could be improved here: x0 and x1 are moved to x7 and x8,
; then moved back into argument registers.
; nextln: mov x7, x0
; nextln: mov x8, x1
; nextln: mov x6, x2
; nextln: sub sp, sp, #16
; nextln: virtual_sp_offset_adjust 16
; nextln: mov x0, x7
; nextln: mov x1, x8
; nextln: mov x2, x7
; nextln: mov x3, x8
; nextln: mov x4, x7
; nextln: mov x5, x8
; nextln: stur x7, [sp]
; nextln: stur x8, [sp, #8]
; nextln: ldr x7, 8 ; b 12 ; data
; nextln: blr x7
; nextln: add sp, sp, #16
; nextln: virtual_sp_offset_adjust -16
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
; We have one register slot available (similar to %f14), but the Apple ABI
; allows an i128 to start at an odd-numbered register (x7 in this case).
;
; It is unspecified whether an i128 may be split between x7 and the stack.
; In practice LLVM does not split it, so we follow suit and pass the whole
; value on the stack.
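; The resulting layout for %f15 therefore matches %f14: v4 still goes
; entirely on the stack at [fp, #16] and [fp, #24] instead of being split
; across x7 and memory.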
function %f15(i128, i128, i128, i64, i128) -> i128 apple_aarch64 {
block0(v0: i128, v1: i128, v2: i128, v3: i64, v4: i128):
return v4
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: ldur x0, [fp, #16]
; nextln: ldur x1, [fp, #24]
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f15_call(i128, i64) -> i128 apple_aarch64 {
fn0 = %f15(i128, i128, i128, i64, i128) -> i128 apple_aarch64
block0(v0: i128, v1: i64):
v2 = call fn0(v0, v0, v0, v1, v0)
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: mov x7, x0
; nextln: mov x8, x1
; nextln: mov x6, x2
; nextln: sub sp, sp, #16
; nextln: virtual_sp_offset_adjust 16
; nextln: mov x0, x7
; nextln: mov x1, x8
; nextln: mov x2, x7
; nextln: mov x3, x8
; nextln: mov x4, x7
; nextln: mov x5, x8
; nextln: stur x7, [sp]
; nextln: stur x8, [sp, #8]
; nextln: ldr x7, 8 ; b 12 ; data
; nextln: blr x7
; nextln: add sp, sp, #16
; nextln: virtual_sp_offset_adjust -16
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
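; Under wasmtime_system_v, only the first return value travels in a
; register; additional return values are written through a return-area
; pointer (here it arrives in x0 and is saved into x1 before x0 is
; overwritten with the first return value).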
function %f16() -> i32, i32 wasmtime_system_v {
block0:
v0 = iconst.i32 0
v1 = iconst.i32 1
return v0, v1
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: mov x1, x0
; nextln: movz x0, #0
; nextln: movz x2, #1
; nextln: stur w2, [x1]
; nextln: ldp fp, lr, [sp], #16
; nextln: ret