From fbcfffdeab81274fea944e866713872f0d72b2b5 Mon Sep 17 00:00:00 2001
From: Afonso Bordado
Date: Wed, 12 May 2021 12:51:45 +0100
Subject: [PATCH] Handle spilling i128 arguments into the stack in aarch64

---
 cranelift/codegen/src/isa/aarch64/abi.rs      |  46 +++++--
 cranelift/codegen/src/isa/aarch64/lower.rs    |  57 ++++----
 .../filetests/filetests/isa/aarch64/call.clif | 123 +++++++++++++++++-
 3 files changed, 184 insertions(+), 42 deletions(-)

diff --git a/cranelift/codegen/src/isa/aarch64/abi.rs b/cranelift/codegen/src/isa/aarch64/abi.rs
index f9e0de1498..f521087565 100644
--- a/cranelift/codegen/src/isa/aarch64/abi.rs
+++ b/cranelift/codegen/src/isa/aarch64/abi.rs
@@ -187,7 +187,7 @@ impl ABIMachineSpec for AArch64MachineDeps {
         let is_baldrdash = call_conv.extends_baldrdash();
         let has_baldrdash_tls = call_conv == isa::CallConv::Baldrdash2020;
 
-        // See AArch64 ABI (https://c9x.me/compile/bib/abi-arm64.pdf), sections 5.4.
+        // See AArch64 ABI (https://github.com/ARM-software/abi-aa/blob/2021Q1/aapcs64/aapcs64.rst#64parameter-passing), sections 6.4.
         //
         // MacOS aarch64 is slightly different, see also
         // https://developer.apple.com/documentation/xcode/writing_arm64_code_for_apple_platforms.
@@ -265,7 +265,7 @@ impl ABIMachineSpec for AArch64MachineDeps {
                 param.value_type
             );
 
-            let (rcs, _) = Inst::rc_for_type(param.value_type)?;
+            let (rcs, reg_types) = Inst::rc_for_type(param.value_type)?;
 
             if let Some(param) = try_fill_baldrdash_reg(call_conv, param) {
                 assert!(rcs[0] == RegClass::I64);
@@ -288,7 +288,7 @@ impl ABIMachineSpec for AArch64MachineDeps {
             // Handle multi register params
             //
-            // See AArch64 ABI (https://c9x.me/compile/bib/abi-arm64.pdf), (Section 5.4 Stage C).
+            // See AArch64 ABI (https://github.com/ARM-software/abi-aa/blob/2021Q1/aapcs64/aapcs64.rst#642parameter-passing-rules), (Section 6.4.2 Stage C).
             //
             // For arguments with alignment of 16 we round up the the register number
            // to the next even value.
            // So we can never allocate for example an i128
@@ -301,7 +301,11 @@ impl ABIMachineSpec for AArch64MachineDeps {
             // restriction of passing the lower half in Xn and the upper half in Xn+1
             // (Stage C.9)
             //
-            // For examples of how llvm handles this: https://godbolt.org/z/bhd3vvEfh
+            // For examples of how LLVM handles this: https://godbolt.org/z/bhd3vvEfh
+            //
+            // On the Apple ABI it is unspecified whether we can spill half the value onto the stack,
+            // i.e. load the lower half into x7 and the upper half onto the stack.
+            // LLVM does not seem to do this, so we replicate that behaviour.
             let is_multi_reg = rcs.len() >= 2;
             if is_multi_reg {
                 assert!(
@@ -348,10 +352,8 @@ impl ABIMachineSpec for AArch64MachineDeps {
                     remaining_reg_vals -= 2;
                     continue;
                 }
-            }
-
-            // Single Register parameters
-            if !is_multi_reg {
+            } else {
+                // Single Register parameters
                 let rc = rcs[0];
                 let next_reg = match rc {
                     RegClass::I64 => &mut next_xreg,
@@ -400,12 +402,28 @@ impl ABIMachineSpec for AArch64MachineDeps {
             debug_assert!(size.is_power_of_two());
             next_stack = align_to(next_stack, size);
 
-            ret.push(ABIArg::stack(
-                next_stack as i64,
-                param.value_type,
-                param.extension,
-                param.purpose,
-            ));
+            let slots = reg_types
+                .iter()
+                .copied()
+                // Build the stack locations from each slot
+                .scan(next_stack, |next_stack, ty| {
+                    let slot_offset = *next_stack as i64;
+                    *next_stack += (ty_bits(ty) / 8) as u64;
+
+                    Some((ty, slot_offset))
+                })
+                .map(|(ty, offset)| ABIArgSlot::Stack {
+                    offset,
+                    ty,
+                    extension: param.extension,
+                })
+                .collect();
+
+            ret.push(ABIArg::Slots {
+                slots,
+                purpose: param.purpose,
+            });
+
             next_stack += size;
         }
diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs
index 7c1cd5f055..d07311159e 100644
--- a/cranelift/codegen/src/isa/aarch64/lower.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower.rs
@@ -158,18 +158,18 @@ impl NarrowValueMode {
     }
 }
 
-/// Emits instruction(s) to generate the given 64-bit constant value into a newly-allocated
-/// temporary register, returning that register.
-fn generate_constant<C: LowerCtx<I = Inst>>(ctx: &mut C, ty: Type, c: u64) -> ValueRegs<Reg> {
+/// Emits instruction(s) to generate the given constant value into newly-allocated
+/// temporary registers, returning those registers.
+fn generate_constant<C: LowerCtx<I = Inst>>(ctx: &mut C, ty: Type, c: u128) -> ValueRegs<Reg> {
     let from_bits = ty_bits(ty);
-    let masked = if from_bits < 64 {
-        c & ((1u64 << from_bits) - 1)
+    let masked = if from_bits < 128 {
+        c & ((1u128 << from_bits) - 1)
     } else {
         c
     };
 
     let cst_copy = ctx.alloc_tmp(ty);
-    for inst in Inst::gen_constant(cst_copy, masked as u128, ty, |ty| {
+    for inst in Inst::gen_constant(cst_copy, masked, ty, |ty| {
         ctx.alloc_tmp(ty).only_reg().unwrap()
     })
     .into_iter()
@@ -181,7 +181,7 @@ fn generate_constant<C: LowerCtx<I = Inst>>(ctx: &mut C, ty: Type, c: u64) -> Va
 
 /// Extends a register according to `narrow_mode`.
 /// If extended, the value is always extended to 64 bits, for simplicity.
-fn narrow_reg<C: LowerCtx<I = Inst>>(
+fn extend_reg<C: LowerCtx<I = Inst>>(
     ctx: &mut C,
     ty: Type,
     in_reg: Reg,
@@ -252,6 +252,26 @@ fn narrow_reg<C: LowerCtx<I = Inst>>(
     }
 }
 
+/// Lowers an instruction input to multiple regs
+fn lower_input_to_regs<C: LowerCtx<I = Inst>>(
+    ctx: &mut C,
+    input: InsnInput,
+) -> (ValueRegs<Reg>, Type, bool) {
+    debug!("lower_input_to_regs: input {:?}", input);
+    let ty = ctx.input_ty(input.insn, input.input);
+    let inputs = ctx.get_input_as_source_or_const(input.insn, input.input);
+    let is_const = inputs.constant.is_some();
+
+    let in_regs = if let Some(c) = inputs.constant {
+        // Generate constants fresh at each use to minimize long-range register pressure.
+        generate_constant(ctx, ty, c as u128)
+    } else {
+        ctx.put_input_in_regs(input.insn, input.input)
+    };
+
+    (in_regs, ty, is_const)
+}
+
 /// Lower an instruction input to a register
 ///
 /// The given register will be extended appropriately, according to
@@ -262,17 +282,12 @@ pub(crate) fn put_input_in_reg<C: LowerCtx<I = Inst>>(
     input: InsnInput,
     narrow_mode: NarrowValueMode,
 ) -> Reg {
-    let reg = put_input_in_regs(ctx, input)
+    let (in_regs, ty, is_const) = lower_input_to_regs(ctx, input);
+    let reg = in_regs
         .only_reg()
         .expect("Multi-register value not expected");
 
-    let is_const = ctx
-        .get_input_as_source_or_const(input.insn, input.input)
-        .constant
-        .is_some();
-
-    let ty = ctx.input_ty(input.insn, input.input);
-    narrow_reg(ctx, ty, reg, is_const, narrow_mode)
+    extend_reg(ctx, ty, reg, is_const, narrow_mode)
 }
 
 /// Lower an instruction input to multiple regs
@@ -280,17 +295,7 @@ pub(crate) fn put_input_in_regs<C: LowerCtx<I = Inst>>(
     ctx: &mut C,
     input: InsnInput,
 ) -> ValueRegs<Reg> {
-    debug!("put_input_in_reg: input {:?}", input);
-    let ty = ctx.input_ty(input.insn, input.input);
-    let inputs = ctx.get_input_as_source_or_const(input.insn, input.input);
-
-    let in_regs = if let Some(c) = inputs.constant {
-        // Generate constants fresh at each use to minimize long-range register pressure.
-        generate_constant(ctx, ty, c)
-    } else {
-        ctx.put_input_in_regs(input.insn, input.input)
-    };
-
+    let (in_regs, _, _) = lower_input_to_regs(ctx, input);
     in_regs
 }
 
diff --git a/cranelift/filetests/filetests/isa/aarch64/call.clif b/cranelift/filetests/filetests/isa/aarch64/call.clif
index 090c220167..9491c73b75 100644
--- a/cranelift/filetests/filetests/isa/aarch64/call.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/call.clif
@@ -287,7 +287,7 @@ block0(v0: i64):
 ; nextln: ret
 
 
-; The aarch64 abi requires that the i128 argument be aligned
+; The AArch64 ABI requires that the i128 argument be aligned
 ; and to be passed in x2 and x3
 function %f12(i64, i128) -> i64 {
 block0(v0: i64, v1: i128):
@@ -325,7 +325,7 @@ block0(v0: i64):
 
 
 
-; aarch64 allows the i128 argument to not be aligned
+; The Apple AArch64 ABI allows the i128 argument to not be aligned
 ; and to be passed in x1 and x2
 function %f13(i64, i128) -> i64 apple_aarch64 {
 block0(v0: i64, v1: i128):
@@ -360,3 +360,122 @@ block0(v0: i64):
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
 
+
+
+; We only have 8 registers to pass data in
+; make sure we spill the last argument even though there is one slot available
+function %f14(i128, i128, i128, i64, i128) -> i128 {
+block0(v0: i128, v1: i128, v2: i128, v3: i64, v4: i128):
+    return v4
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: ldur x0, [fp, #16]
+; nextln: ldur x1, [fp, #24]
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f14_call(i128, i64) -> i128 {
+    fn0 = %f14(i128, i128, i128, i64, i128) -> i128
+
+block0(v0: i128, v1: i64):
+    v2 = call fn0(v0, v0, v0, v1, v0)
+    return v2
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+
+; TODO: Some codegen optimization possible here with x0,x1 moving to x7,x8 and then moving back
+; nextln: mov x7, x0
+; nextln: mov x8, x1
+; nextln: mov x6, x2
+; nextln: sub sp, sp, #16
+; nextln: virtual_sp_offset_adjust 16
+; nextln: mov x0, x7
+; nextln: mov x1, x8
+; nextln: mov x2, x7
+; nextln: mov x3, x8
+; nextln: mov x4, x7
+; nextln: mov x5, x8
+; nextln: stur x7, [sp]
+; nextln: stur x8, [sp, #8]
+
+; nextln: ldr x7, 8 ; b 12 ; data
+; nextln: blr x7
+; nextln: add sp, sp, #16
+; nextln: virtual_sp_offset_adjust -16
+
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+
+
+; We have one register slot available (similar to %f14), however Apple
+; allows us to start an i128 in a non-even-numbered register (x7 in this case).
+;
+; It is unspecified if we can split the i128 into x7 + the stack.
+; In practice LLVM does not do this, so we do the same.
+function %f15(i128, i128, i128, i64, i128) -> i128 apple_aarch64 {
+block0(v0: i128, v1: i128, v2: i128, v3: i64, v4: i128):
+    return v4
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: ldur x0, [fp, #16]
+; nextln: ldur x1, [fp, #24]
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f15_call(i128, i64) -> i128 apple_aarch64 {
+    fn0 = %f15(i128, i128, i128, i64, i128) -> i128 apple_aarch64
+
+block0(v0: i128, v1: i64):
+    v2 = call fn0(v0, v0, v0, v1, v0)
+    return v2
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+
+; nextln: mov x7, x0
+; nextln: mov x8, x1
+; nextln: mov x6, x2
+; nextln: sub sp, sp, #16
+; nextln: virtual_sp_offset_adjust 16
+; nextln: mov x0, x7
+; nextln: mov x1, x8
+; nextln: mov x2, x7
+; nextln: mov x3, x8
+; nextln: mov x4, x7
+; nextln: mov x5, x8
+; nextln: stur x7, [sp]
+; nextln: stur x8, [sp, #8]
+
+; nextln: ldr x7, 8 ; b 12 ; data
+; nextln: blr x7
+; nextln: add sp, sp, #16
+; nextln: virtual_sp_offset_adjust -16
+
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+
+function %f16() -> i32, i32 wasmtime_system_v {
+block0:
+    v0 = iconst.i32 0
+    v1 = iconst.i32 1
+    return v0, v1
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: mov x1, x0
+; nextln: movz x0, #0
+; nextln: movz x2, #1
+; nextln: stur w2, [x1]
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
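Below is a small, self-contained Rust sketch (not part of the patch itself) of the stack-slot layout the new compute_arg_locs code builds: each register-class slot of a multi-register value, such as the two 64-bit halves of an i128, gets its own stack offset computed by walking a running offset forward. SlotTy and StackSlot are hypothetical stand-ins for Cranelift's Type and ABIArgSlot::Stack.

// Sketch only; assumes a simplified 64-bit-slot model instead of Cranelift's types.
#[derive(Clone, Copy, Debug)]
enum SlotTy {
    I64,
}

fn slot_bytes(ty: SlotTy) -> u64 {
    match ty {
        SlotTy::I64 => 8,
    }
}

#[derive(Debug)]
struct StackSlot {
    offset: i64,
    ty: SlotTy,
}

fn stack_slots_for(reg_types: &[SlotTy], next_stack: u64) -> Vec<StackSlot> {
    reg_types
        .iter()
        .copied()
        // Build the stack locations from each slot, like the `.scan` in the
        // patch: record the current offset, then advance it by the slot size.
        .scan(next_stack, |next_stack, ty| {
            let slot_offset = *next_stack as i64;
            *next_stack += slot_bytes(ty);
            Some((ty, slot_offset))
        })
        .map(|(ty, offset)| StackSlot { offset, ty })
        .collect()
}

fn main() {
    // An i128 argument is two I64 register-class slots; starting at stack
    // offset 0 they land at offsets 0 and 8, which is why the spilled fifth
    // argument of %f14 is reloaded with `ldur x0, [fp, #16]` and
    // `ldur x1, [fp, #24]` once the 16-byte fp/lr pair has been pushed.
    let slots = stack_slots_for(&[SlotTy::I64, SlotTy::I64], 0);
    println!("{:?}", slots);
}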