From ac624da8d97c98eb1fcb0c705eb8cd98b2a620e5 Mon Sep 17 00:00:00 2001 From: Afonso Bordado Date: Tue, 11 May 2021 10:25:33 +0100 Subject: [PATCH 1/2] Handle i128 arguments in the aarch64 ABI When dealing with params that need to be split, we follow the arch64 ABI and split the value in two, and make sure that start that argument in an even numbered xN register. The apple ABI does not require this, so on those platforms, we start params anywhere. --- cranelift/codegen/src/isa/aarch64/abi.rs | 185 ++++++++++++------ cranelift/codegen/src/isa/aarch64/lower.rs | 105 ++++++---- .../codegen/src/isa/aarch64/lower_inst.rs | 59 +++++- .../filetests/filetests/isa/aarch64/call.clif | 110 +++++++++++ 4 files changed, 362 insertions(+), 97 deletions(-) diff --git a/cranelift/codegen/src/isa/aarch64/abi.rs b/cranelift/codegen/src/isa/aarch64/abi.rs index 6a264f1604..f9e0de1498 100644 --- a/cranelift/codegen/src/isa/aarch64/abi.rs +++ b/cranelift/codegen/src/isa/aarch64/abi.rs @@ -183,6 +183,7 @@ impl ABIMachineSpec for AArch64MachineDeps { args_or_rets: ArgsOrRets, add_ret_area_ptr: bool, ) -> CodegenResult<(Vec, i64, Option)> { + let is_apple_cc = call_conv == isa::CallConv::AppleAarch64; let is_baldrdash = call_conv.extends_baldrdash(); let has_baldrdash_tls = call_conv == isa::CallConv::Baldrdash2020; @@ -194,8 +195,6 @@ impl ABIMachineSpec for AArch64MachineDeps { // following ways: // - sign- and zero- extensions of data types less than 32 bits are not // implemented yet. - // - i128 arguments passing isn't implemented yet in the standard (non - // MacOS) aarch64 ABI. // - we align the arguments stack space to a 16-bytes boundary, while // the MacOS allows aligning only on 8 bytes. In practice it means we're // slightly overallocating when calling, which is fine, and doesn't @@ -265,20 +264,16 @@ impl ABIMachineSpec for AArch64MachineDeps { "Invalid type for AArch64: {:?}", param.value_type ); - let (rcs, _) = Inst::rc_for_type(param.value_type).unwrap(); - assert!(rcs.len() == 1, "Multi-reg values not supported yet"); - let rc = rcs[0]; - let next_reg = match rc { - RegClass::I64 => &mut next_xreg, - RegClass::V128 => &mut next_vreg, - _ => panic!("Invalid register class: {:?}", rc), - }; + let (rcs, _) = Inst::rc_for_type(param.value_type)?; if let Some(param) = try_fill_baldrdash_reg(call_conv, param) { - assert!(rc == RegClass::I64); + assert!(rcs[0] == RegClass::I64); ret.push(param); - } else if let ir::ArgumentPurpose::StructArgument(size) = param.purpose { + continue; + } + + if let ir::ArgumentPurpose::StructArgument(size) = param.purpose { let offset = next_stack as i64; let size = size as u64; assert!(size % 8 == 0, "StructArgument size is not properly aligned"); @@ -288,50 +283,130 @@ impl ABIMachineSpec for AArch64MachineDeps { size, purpose: param.purpose, }); - } else if *next_reg < max_per_class_reg_vals && remaining_reg_vals > 0 { - let reg = match rc { - RegClass::I64 => xreg(*next_reg), - RegClass::V128 => vreg(*next_reg), - _ => unreachable!(), - }; - ret.push(ABIArg::reg( - reg.to_real_reg(), - param.value_type, - param.extension, - param.purpose, - )); - *next_reg += 1; - remaining_reg_vals -= 1; - } else { - // Compute the stack slot's size. - let size = (ty_bits(param.value_type) / 8) as u64; - - let size = if call_conv == isa::CallConv::AppleAarch64 - || (call_conv.extends_wasmtime() && args_or_rets == ArgsOrRets::Rets) - { - // MacOS aarch64 and Wasmtime allow stack slots with - // sizes less than 8 bytes. 
They still need to be - // properly aligned on their natural data alignment, - // though. - size - } else { - // Every arg takes a minimum slot of 8 bytes. (16-byte stack - // alignment happens separately after all args.) - std::cmp::max(size, 8) - }; - - // Align the stack slot. - debug_assert!(size.is_power_of_two()); - next_stack = align_to(next_stack, size); - - ret.push(ABIArg::stack( - next_stack as i64, - param.value_type, - param.extension, - param.purpose, - )); - next_stack += size; + continue; } + + // Handle multi register params + // + // See AArch64 ABI (https://c9x.me/compile/bib/abi-arm64.pdf), (Section 5.4 Stage C). + // + // For arguments with alignment of 16 we round up the the register number + // to the next even value. So we can never allocate for example an i128 + // to X1 and X2, we have to skip one register and do X2, X3 + // (Stage C.8) + // Note: The Apple ABI deviates a bit here. They don't respect Stage C.8 + // and will happily allocate a i128 to X1 and X2 + // + // For integer types with alignment of 16 we also have the additional + // restriction of passing the lower half in Xn and the upper half in Xn+1 + // (Stage C.9) + // + // For examples of how llvm handles this: https://godbolt.org/z/bhd3vvEfh + let is_multi_reg = rcs.len() >= 2; + if is_multi_reg { + assert!( + rcs.len() == 2, + "Unable to handle multi reg params with more than 2 regs" + ); + assert!( + rcs == &[RegClass::I64, RegClass::I64], + "Unable to handle non i64 regs" + ); + + let reg_class_space = max_per_class_reg_vals - next_xreg; + let reg_space = remaining_reg_vals; + + if reg_space >= 2 && reg_class_space >= 2 { + // The aarch64 ABI does not allow us to start a split argument + // at an odd numbered register. So we need to skip one register + // + // TODO: The Fast ABI should probably not skip the register + if !is_apple_cc && next_xreg % 2 != 0 { + next_xreg += 1; + } + + let lower_reg = xreg(next_xreg); + let upper_reg = xreg(next_xreg + 1); + + ret.push(ABIArg::Slots { + slots: vec![ + ABIArgSlot::Reg { + reg: lower_reg.to_real_reg(), + ty: param.value_type, + extension: param.extension, + }, + ABIArgSlot::Reg { + reg: upper_reg.to_real_reg(), + ty: param.value_type, + extension: param.extension, + }, + ], + purpose: param.purpose, + }); + + next_xreg += 2; + remaining_reg_vals -= 2; + continue; + } + } + + // Single Register parameters + if !is_multi_reg { + let rc = rcs[0]; + let next_reg = match rc { + RegClass::I64 => &mut next_xreg, + RegClass::V128 => &mut next_vreg, + _ => panic!("Invalid register class: {:?}", rc), + }; + + if *next_reg < max_per_class_reg_vals && remaining_reg_vals > 0 { + let reg = match rc { + RegClass::I64 => xreg(*next_reg), + RegClass::V128 => vreg(*next_reg), + _ => unreachable!(), + }; + ret.push(ABIArg::reg( + reg.to_real_reg(), + param.value_type, + param.extension, + param.purpose, + )); + *next_reg += 1; + remaining_reg_vals -= 1; + continue; + } + } + + // Spill to the stack + + // Compute the stack slot's size. + let size = (ty_bits(param.value_type) / 8) as u64; + + let size = if is_apple_cc + || (call_conv.extends_wasmtime() && args_or_rets == ArgsOrRets::Rets) + { + // MacOS aarch64 and Wasmtime allow stack slots with + // sizes less than 8 bytes. They still need to be + // properly aligned on their natural data alignment, + // though. + size + } else { + // Every arg takes a minimum slot of 8 bytes. (16-byte stack + // alignment happens separately after all args.) + std::cmp::max(size, 8) + }; + + // Align the stack slot. 
+ debug_assert!(size.is_power_of_two()); + next_stack = align_to(next_stack, size); + + ret.push(ABIArg::stack( + next_stack as i64, + param.value_type, + param.extension, + param.purpose, + )); + next_stack += size; } if args_or_rets == ArgsOrRets::Rets && is_baldrdash { diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs index 4f5893f54b..7c1cd5f055 100644 --- a/cranelift/codegen/src/isa/aarch64/lower.rs +++ b/cranelift/codegen/src/isa/aarch64/lower.rs @@ -158,42 +158,37 @@ impl NarrowValueMode { } } -/// Lower an instruction input to a reg. -/// -/// The given register will be extended appropriately, according to -/// `narrow_mode` and the input's type. If extended, the value is -/// always extended to 64 bits, for simplicity. -pub(crate) fn put_input_in_reg>( - ctx: &mut C, - input: InsnInput, - narrow_mode: NarrowValueMode, -) -> Reg { - debug!("put_input_in_reg: input {:?}", input); - let ty = ctx.input_ty(input.insn, input.input); - let from_bits = ty_bits(ty) as u8; - let inputs = ctx.get_input_as_source_or_const(input.insn, input.input); - let in_reg = if let Some(c) = inputs.constant { - // Generate constants fresh at each use to minimize long-range register pressure. - let masked = if from_bits < 64 { - c & ((1u64 << from_bits) - 1) - } else { - c - }; - let to_reg = ctx.alloc_tmp(ty).only_reg().unwrap(); - for inst in Inst::gen_constant(ValueRegs::one(to_reg), masked as u128, ty, |ty| { - ctx.alloc_tmp(ty).only_reg().unwrap() - }) - .into_iter() - { - ctx.emit(inst); - } - to_reg.to_reg() +/// Emits instruction(s) to generate the given 64-bit constant value into a newly-allocated +/// temporary register, returning that register. +fn generate_constant>(ctx: &mut C, ty: Type, c: u64) -> ValueRegs { + let from_bits = ty_bits(ty); + let masked = if from_bits < 64 { + c & ((1u64 << from_bits) - 1) } else { - ctx.put_input_in_regs(input.insn, input.input) - .only_reg() - .unwrap() + c }; + let cst_copy = ctx.alloc_tmp(ty); + for inst in Inst::gen_constant(cst_copy, masked as u128, ty, |ty| { + ctx.alloc_tmp(ty).only_reg().unwrap() + }) + .into_iter() + { + ctx.emit(inst); + } + non_writable_value_regs(cst_copy) +} + +/// Extends a register according to `narrow_mode`. +/// If extended, the value is always extended to 64 bits, for simplicity. +fn narrow_reg>( + ctx: &mut C, + ty: Type, + in_reg: Reg, + is_const: bool, + narrow_mode: NarrowValueMode, +) -> Reg { + let from_bits = ty_bits(ty) as u8; match (narrow_mode, from_bits) { (NarrowValueMode::None, _) => in_reg, (NarrowValueMode::ZeroExtend32, n) if n < 32 => { @@ -221,7 +216,7 @@ pub(crate) fn put_input_in_reg>( (NarrowValueMode::ZeroExtend32, 32) | (NarrowValueMode::SignExtend32, 32) => in_reg, (NarrowValueMode::ZeroExtend64, n) if n < 64 => { - if inputs.constant.is_some() { + if is_const { // Constants are zero-extended to full 64-bit width on load already. in_reg } else { @@ -257,6 +252,48 @@ pub(crate) fn put_input_in_reg>( } } +/// Lower an instruction input to a register +/// +/// The given register will be extended appropriately, according to +/// `narrow_mode` and the input's type. If extended, the value is +/// always extended to 64 bits, for simplicity. 
+pub(crate) fn put_input_in_reg>( + ctx: &mut C, + input: InsnInput, + narrow_mode: NarrowValueMode, +) -> Reg { + let reg = put_input_in_regs(ctx, input) + .only_reg() + .expect("Multi-register value not expected"); + + let is_const = ctx + .get_input_as_source_or_const(input.insn, input.input) + .constant + .is_some(); + + let ty = ctx.input_ty(input.insn, input.input); + narrow_reg(ctx, ty, reg, is_const, narrow_mode) +} + +/// Lower an instruction input to multiple regs +pub(crate) fn put_input_in_regs>( + ctx: &mut C, + input: InsnInput, +) -> ValueRegs { + debug!("put_input_in_reg: input {:?}", input); + let ty = ctx.input_ty(input.insn, input.input); + let inputs = ctx.get_input_as_source_or_const(input.insn, input.input); + + let in_regs = if let Some(c) = inputs.constant { + // Generate constants fresh at each use to minimize long-range register pressure. + generate_constant(ctx, ty, c) + } else { + ctx.put_input_in_regs(input.insn, input.input) + }; + + in_regs +} + /// Lower an instruction input to a reg or reg/shift, or reg/extend operand. /// /// The `narrow_mode` flag indicates whether the consumer of this value needs diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs index ede66295e9..5e33c54034 100644 --- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs +++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs @@ -1572,10 +1572,21 @@ pub(crate) fn lower_insn_to_regs>( // N.B.: according to the AArch64 ABI, the top bits of a register // (above the bits for the value's type) are undefined, so we // need not extend the return values. - let reg = put_input_in_reg(ctx, *input, NarrowValueMode::None); - let retval_reg = ctx.retval(i).only_reg().unwrap(); + let src_regs = put_input_in_regs(ctx, *input); + let retval_regs = ctx.retval(i); + + assert_eq!(src_regs.len(), retval_regs.len()); let ty = ctx.input_ty(insn, i); - ctx.emit(Inst::gen_move(retval_reg, reg, ty)); + let (_, tys) = Inst::rc_for_type(ty)?; + + src_regs + .regs() + .iter() + .zip(retval_regs.regs().iter()) + .zip(tys.iter()) + .for_each(|((&src, &dst), &ty)| { + ctx.emit(Inst::gen_move(dst, src, ty)); + }); } // N.B.: the Ret itself is generated by the ABI. 
} @@ -1753,13 +1764,13 @@ pub(crate) fn lower_insn_to_regs>( assert!(inputs.len() == abi.num_args()); for i in abi.get_copy_to_arg_order() { let input = inputs[i]; - let arg_reg = put_input_in_reg(ctx, input, NarrowValueMode::None); - abi.emit_copy_regs_to_arg(ctx, i, ValueRegs::one(arg_reg)); + let arg_regs = put_input_in_regs(ctx, input); + abi.emit_copy_regs_to_arg(ctx, i, arg_regs); } abi.emit_call(ctx); for (i, output) in outputs.iter().enumerate() { - let retval_reg = get_output_reg(ctx, *output).only_reg().unwrap(); - abi.emit_copy_retval_to_regs(ctx, i, ValueRegs::one(retval_reg)); + let retval_regs = get_output_reg(ctx, *output); + abi.emit_copy_retval_to_regs(ctx, i, retval_regs); } abi.emit_stack_post_adjust(ctx); } @@ -2306,7 +2317,39 @@ pub(crate) fn lower_insn_to_regs>( panic!("Vector ops not implemented."); } - Opcode::Isplit | Opcode::Iconcat => panic!("Vector ops not supported."), + Opcode::Isplit => { + assert_eq!( + ctx.input_ty(insn, 0), + I128, + "Isplit only implemented for i128's" + ); + assert_eq!(ctx.output_ty(insn, 0), I64); + assert_eq!(ctx.output_ty(insn, 1), I64); + + let src_regs = put_input_in_regs(ctx, inputs[0]); + let dst_lo = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + let dst_hi = get_output_reg(ctx, outputs[1]).only_reg().unwrap(); + + ctx.emit(Inst::gen_move(dst_lo, src_regs.regs()[0], I64)); + ctx.emit(Inst::gen_move(dst_hi, src_regs.regs()[1], I64)); + } + + Opcode::Iconcat => { + assert_eq!( + ctx.output_ty(insn, 0), + I128, + "Iconcat only implemented for i128's" + ); + assert_eq!(ctx.input_ty(insn, 0), I64); + assert_eq!(ctx.input_ty(insn, 1), I64); + + let src_lo = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let src_hi = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); + let dst = get_output_reg(ctx, outputs[0]); + + ctx.emit(Inst::gen_move(dst.regs()[0], src_lo, I64)); + ctx.emit(Inst::gen_move(dst.regs()[1], src_hi, I64)); + } Opcode::Imax | Opcode::Umax | Opcode::Umin | Opcode::Imin => { let alu_op = match op { diff --git a/cranelift/filetests/filetests/isa/aarch64/call.clif b/cranelift/filetests/filetests/isa/aarch64/call.clif index a59c59a5b2..090c220167 100644 --- a/cranelift/filetests/filetests/isa/aarch64/call.clif +++ b/cranelift/filetests/filetests/isa/aarch64/call.clif @@ -250,3 +250,113 @@ block0: ; nextln: add sp, sp, #32 ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret + + +; i128 tests +function %f11(i128, i64) -> i64 { +block0(v0: i128, v1: i64): + v2, v3 = isplit v0 + return v3 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: mov x0, x1 +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + + +function %f11_call(i64) -> i64 { + fn0 = %f11(i128, i64) -> i64 + +block0(v0: i64): + v1 = iconst.i64 42 + v2 = iconcat v1, v0 + v3 = call fn0(v2, v1) + return v3 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: mov x1, x0 +; nextln: movz x0, #42 +; nextln: movz x2, #42 +; nextln: ldr x3, 8 ; b 12 ; data +; nextln: blr x3 +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + + +; The aarch64 abi requires that the i128 argument be aligned +; and to be passed in x2 and x3 +function %f12(i64, i128) -> i64 { +block0(v0: i64, v1: i128): + v2, v3 = isplit v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! 
+; nextln: mov fp, sp +; nextln: mov x0, x2 +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + + + +function %f12_call(i64) -> i64 { + fn0 = %f12(i64, i128) -> i64 + +block0(v0: i64): + v1 = iconst.i64 42 + v2 = iconcat v0, v1 + v3 = call fn0(v1, v2) + return v3 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: movz x3, #42 +; nextln: mov x2, x0 +; nextln: movz x0, #42 +; nextln: ldr x1, 8 ; b 12 ; data +; nextln: blr x1 +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + + + +; aarch64 allows the i128 argument to not be aligned +; and to be passed in x1 and x2 +function %f13(i64, i128) -> i64 apple_aarch64 { +block0(v0: i64, v1: i128): + v2, v3 = isplit v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: mov x0, x1 +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + + +function %f13_call(i64) -> i64 apple_aarch64 { + fn0 = %f13(i64, i128) -> i64 apple_aarch64 + +block0(v0: i64): + v1 = iconst.i64 42 + v2 = iconcat v0, v1 + v3 = call fn0(v1, v2) + return v3 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: movz x2, #42 +; nextln: mov x1, x0 +; nextln: movz x0, #42 +; nextln: ldr x3, 8 ; b 12 ; data +; nextln: blr x3 +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + From fbcfffdeab81274fea944e866713872f0d72b2b5 Mon Sep 17 00:00:00 2001 From: Afonso Bordado Date: Wed, 12 May 2021 12:51:45 +0100 Subject: [PATCH 2/2] Handle spilling i128 arguments into the stack in aarch64 --- cranelift/codegen/src/isa/aarch64/abi.rs | 46 +++++-- cranelift/codegen/src/isa/aarch64/lower.rs | 57 ++++---- .../filetests/filetests/isa/aarch64/call.clif | 123 +++++++++++++++++- 3 files changed, 184 insertions(+), 42 deletions(-) diff --git a/cranelift/codegen/src/isa/aarch64/abi.rs b/cranelift/codegen/src/isa/aarch64/abi.rs index f9e0de1498..f521087565 100644 --- a/cranelift/codegen/src/isa/aarch64/abi.rs +++ b/cranelift/codegen/src/isa/aarch64/abi.rs @@ -187,7 +187,7 @@ impl ABIMachineSpec for AArch64MachineDeps { let is_baldrdash = call_conv.extends_baldrdash(); let has_baldrdash_tls = call_conv == isa::CallConv::Baldrdash2020; - // See AArch64 ABI (https://c9x.me/compile/bib/abi-arm64.pdf), sections 5.4. + // See AArch64 ABI (https://github.com/ARM-software/abi-aa/blob/2021Q1/aapcs64/aapcs64.rst#64parameter-passing), sections 6.4. // // MacOS aarch64 is slightly different, see also // https://developer.apple.com/documentation/xcode/writing_arm64_code_for_apple_platforms. @@ -265,7 +265,7 @@ impl ABIMachineSpec for AArch64MachineDeps { param.value_type ); - let (rcs, _) = Inst::rc_for_type(param.value_type)?; + let (rcs, reg_types) = Inst::rc_for_type(param.value_type)?; if let Some(param) = try_fill_baldrdash_reg(call_conv, param) { assert!(rcs[0] == RegClass::I64); @@ -288,7 +288,7 @@ impl ABIMachineSpec for AArch64MachineDeps { // Handle multi register params // - // See AArch64 ABI (https://c9x.me/compile/bib/abi-arm64.pdf), (Section 5.4 Stage C). + // See AArch64 ABI (https://github.com/ARM-software/abi-aa/blob/2021Q1/aapcs64/aapcs64.rst#642parameter-passing-rules), (Section 6.4.2 Stage C). // // For arguments with alignment of 16 we round up the the register number // to the next even value. 
So we can never allocate for example an i128 @@ -301,7 +301,11 @@ impl ABIMachineSpec for AArch64MachineDeps { // restriction of passing the lower half in Xn and the upper half in Xn+1 // (Stage C.9) // - // For examples of how llvm handles this: https://godbolt.org/z/bhd3vvEfh + // For examples of how LLVM handles this: https://godbolt.org/z/bhd3vvEfh + // + // On the Apple ABI it is unspecified if we can spill half the value into the stack + // i.e load the lower half into x7 and the upper half into the stack + // LLVM does not seem to do this, so we are going to replicate that behaviour let is_multi_reg = rcs.len() >= 2; if is_multi_reg { assert!( @@ -348,10 +352,8 @@ impl ABIMachineSpec for AArch64MachineDeps { remaining_reg_vals -= 2; continue; } - } - - // Single Register parameters - if !is_multi_reg { + } else { + // Single Register parameters let rc = rcs[0]; let next_reg = match rc { RegClass::I64 => &mut next_xreg, @@ -400,12 +402,28 @@ impl ABIMachineSpec for AArch64MachineDeps { debug_assert!(size.is_power_of_two()); next_stack = align_to(next_stack, size); - ret.push(ABIArg::stack( - next_stack as i64, - param.value_type, - param.extension, - param.purpose, - )); + let slots = reg_types + .iter() + .copied() + // Build the stack locations from each slot + .scan(next_stack, |next_stack, ty| { + let slot_offset = *next_stack as i64; + *next_stack += (ty_bits(ty) / 8) as u64; + + Some((ty, slot_offset)) + }) + .map(|(ty, offset)| ABIArgSlot::Stack { + offset, + ty, + extension: param.extension, + }) + .collect(); + + ret.push(ABIArg::Slots { + slots, + purpose: param.purpose, + }); + next_stack += size; } diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs index 7c1cd5f055..d07311159e 100644 --- a/cranelift/codegen/src/isa/aarch64/lower.rs +++ b/cranelift/codegen/src/isa/aarch64/lower.rs @@ -158,18 +158,18 @@ impl NarrowValueMode { } } -/// Emits instruction(s) to generate the given 64-bit constant value into a newly-allocated -/// temporary register, returning that register. -fn generate_constant>(ctx: &mut C, ty: Type, c: u64) -> ValueRegs { +/// Emits instruction(s) to generate the given constant value into newly-allocated +/// temporary registers, returning these registers. +fn generate_constant>(ctx: &mut C, ty: Type, c: u128) -> ValueRegs { let from_bits = ty_bits(ty); - let masked = if from_bits < 64 { - c & ((1u64 << from_bits) - 1) + let masked = if from_bits < 128 { + c & ((1u128 << from_bits) - 1) } else { c }; let cst_copy = ctx.alloc_tmp(ty); - for inst in Inst::gen_constant(cst_copy, masked as u128, ty, |ty| { + for inst in Inst::gen_constant(cst_copy, masked, ty, |ty| { ctx.alloc_tmp(ty).only_reg().unwrap() }) .into_iter() @@ -181,7 +181,7 @@ fn generate_constant>(ctx: &mut C, ty: Type, c: u64) -> Va /// Extends a register according to `narrow_mode`. /// If extended, the value is always extended to 64 bits, for simplicity. 
-fn narrow_reg>( +fn extend_reg>( ctx: &mut C, ty: Type, in_reg: Reg, @@ -252,6 +252,26 @@ fn narrow_reg>( } } +/// Lowers an instruction input to multiple regs +fn lower_input_to_regs>( + ctx: &mut C, + input: InsnInput, +) -> (ValueRegs, Type, bool) { + debug!("lower_input_to_regs: input {:?}", input); + let ty = ctx.input_ty(input.insn, input.input); + let inputs = ctx.get_input_as_source_or_const(input.insn, input.input); + let is_const = inputs.constant.is_some(); + + let in_regs = if let Some(c) = inputs.constant { + // Generate constants fresh at each use to minimize long-range register pressure. + generate_constant(ctx, ty, c as u128) + } else { + ctx.put_input_in_regs(input.insn, input.input) + }; + + (in_regs, ty, is_const) +} + /// Lower an instruction input to a register /// /// The given register will be extended appropriately, according to @@ -262,17 +282,12 @@ pub(crate) fn put_input_in_reg>( input: InsnInput, narrow_mode: NarrowValueMode, ) -> Reg { - let reg = put_input_in_regs(ctx, input) + let (in_regs, ty, is_const) = lower_input_to_regs(ctx, input); + let reg = in_regs .only_reg() .expect("Multi-register value not expected"); - let is_const = ctx - .get_input_as_source_or_const(input.insn, input.input) - .constant - .is_some(); - - let ty = ctx.input_ty(input.insn, input.input); - narrow_reg(ctx, ty, reg, is_const, narrow_mode) + extend_reg(ctx, ty, reg, is_const, narrow_mode) } /// Lower an instruction input to multiple regs @@ -280,17 +295,7 @@ pub(crate) fn put_input_in_regs>( ctx: &mut C, input: InsnInput, ) -> ValueRegs { - debug!("put_input_in_reg: input {:?}", input); - let ty = ctx.input_ty(input.insn, input.input); - let inputs = ctx.get_input_as_source_or_const(input.insn, input.input); - - let in_regs = if let Some(c) = inputs.constant { - // Generate constants fresh at each use to minimize long-range register pressure. - generate_constant(ctx, ty, c) - } else { - ctx.put_input_in_regs(input.insn, input.input) - }; - + let (in_regs, _, _) = lower_input_to_regs(ctx, input); in_regs } diff --git a/cranelift/filetests/filetests/isa/aarch64/call.clif b/cranelift/filetests/filetests/isa/aarch64/call.clif index 090c220167..9491c73b75 100644 --- a/cranelift/filetests/filetests/isa/aarch64/call.clif +++ b/cranelift/filetests/filetests/isa/aarch64/call.clif @@ -287,7 +287,7 @@ block0(v0: i64): ; nextln: ret -; The aarch64 abi requires that the i128 argument be aligned +; The AArch64 ABI requires that the i128 argument be aligned ; and to be passed in x2 and x3 function %f12(i64, i128) -> i64 { block0(v0: i64, v1: i128): @@ -325,7 +325,7 @@ block0(v0: i64): -; aarch64 allows the i128 argument to not be aligned +; The Apple AArch64 ABI allows the i128 argument to not be aligned ; and to be passed in x1 and x2 function %f13(i64, i128) -> i64 apple_aarch64 { block0(v0: i64, v1: i128): @@ -360,3 +360,122 @@ block0(v0: i64): ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret + + +; We only have 8 registers to pass data in +; make sure we spill the last argument even though there is one slot available +function %f14(i128, i128, i128, i64, i128) -> i128 { +block0(v0: i128, v1: i128, v2: i128, v3: i64, v4: i128): + return v4 +} + +; check: stp fp, lr, [sp, #-16]! 
+; nextln: mov fp, sp +; nextln: ldur x0, [fp, #16] +; nextln: ldur x1, [fp, #24] +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f14_call(i128, i64) -> i128 { + fn0 = %f14(i128, i128, i128, i64, i128) -> i128 + +block0(v0: i128, v1: i64): + v2 = call fn0(v0, v0, v0, v1, v0) + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp + +; TODO: Some codegen optimization possible here with x0,x1 moving to x7,x8 and then moving back +; nextln: mov x7, x0 +; nextln: mov x8, x1 +; nextln: mov x6, x2 +; nextln: sub sp, sp, #16 +; nextln: virtual_sp_offset_adjust 16 +; nextln: mov x0, x7 +; nextln: mov x1, x8 +; nextln: mov x2, x7 +; nextln: mov x3, x8 +; nextln: mov x4, x7 +; nextln: mov x5, x8 +; nextln: stur x7, [sp] +; nextln: stur x8, [sp, #8] + +; nextln: ldr x7, 8 ; b 12 ; data +; nextln: blr x7 +; nextln: add sp, sp, #16 +; nextln: virtual_sp_offset_adjust -16 + +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + + + +; We have one register slot available (Similar to %f14), however apple +; allows us to start i128 on non even numbered registers (x7 in this case). +; +; It is unspecified if we can split the i128 into x7 + the stack. +; In practice LLVM does not do this, so we are going to go with that. +function %f15(i128, i128, i128, i64, i128) -> i128 apple_aarch64{ +block0(v0: i128, v1: i128, v2: i128, v3: i64, v4: i128): + return v4 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: ldur x0, [fp, #16] +; nextln: ldur x1, [fp, #24] +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f15_call(i128, i64) -> i128 apple_aarch64 { + fn0 = %f15(i128, i128, i128, i64, i128) -> i128 apple_aarch64 + +block0(v0: i128, v1: i64): + v2 = call fn0(v0, v0, v0, v1, v0) + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp + +; nextln: mov x7, x0 +; nextln: mov x8, x1 +; nextln: mov x6, x2 +; nextln: sub sp, sp, #16 +; nextln: virtual_sp_offset_adjust 16 +; nextln: mov x0, x7 +; nextln: mov x1, x8 +; nextln: mov x2, x7 +; nextln: mov x3, x8 +; nextln: mov x4, x7 +; nextln: mov x5, x8 +; nextln: stur x7, [sp] +; nextln: stur x8, [sp, #8] + +; nextln: ldr x7, 8 ; b 12 ; data +; nextln: blr x7 +; nextln: add sp, sp, #16 +; nextln: virtual_sp_offset_adjust -16 + +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + + +function %f16() -> i32, i32 wasmtime_system_v { +block0: + v0 = iconst.i32 0 + v1 = iconst.i32 1 + return v0, v1 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: mov x1, x0 +; nextln: movz x0, #0 +; nextln: movz x2, #1 +; nextln: stur w2, [x1] +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret +
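
Note on the parameter-passing rule exercised by the filetests above (an editor's illustration, not part of either patch): the tests check AAPCS64 stage C.8/C.9 behaviour for i128 values — the lower half is passed in Xn and the upper half in Xn+1, Xn is rounded up to an even register number except under the Apple calling convention, and the whole value spills to the 16-byte-aligned outgoing-argument area once fewer than two of x0-x7 remain (following the patch, and LLVM, the value is never split between x7 and memory). The sketch below is a simplified standalone model of just that decision for a single i128 argument; the names assign_i128, Loc, and MAX_XREGS are hypothetical and are not Cranelift APIs, and the caller-side stack offset 0 corresponds to the [sp] store checked in %f14_call.

// Simplified model of i128 argument assignment under AAPCS64, with the
// Apple deviation. Illustrative only; names do not mirror Cranelift types.
#[derive(Debug, PartialEq)]
enum Loc {
    // Lower and upper 64-bit halves in a consecutive x-register pair.
    RegPair(u8, u8),
    // Byte offset of a 16-byte slot in the outgoing argument area.
    Stack(u64),
}

/// Assigns one i128 argument given the next free x-register index and the
/// current outgoing-stack offset. `apple_cc` models the Apple calling
/// convention, which skips the stage C.8 even-register rounding.
fn assign_i128(next_xreg: &mut u8, next_stack: &mut u64, apple_cc: bool) -> Loc {
    const MAX_XREGS: u8 = 8; // x0..x7 are the integer parameter registers

    // Stage C.8: round the register number up to even for 16-byte-aligned
    // types (not done by the Apple calling convention).
    if !apple_cc && *next_xreg % 2 != 0 {
        *next_xreg += 1;
    }

    if *next_xreg + 2 <= MAX_XREGS {
        // Stage C.9: lower half in Xn, upper half in Xn+1.
        let lo = *next_xreg;
        *next_xreg += 2;
        Loc::RegPair(lo, lo + 1)
    } else {
        // Fewer than two registers left: spill the whole value to a
        // 16-byte-aligned stack slot instead of splitting it across x7
        // and memory (matching the patch and observed LLVM behaviour).
        *next_stack = (*next_stack + 15) & !15;
        let off = *next_stack;
        *next_stack += 16;
        Loc::Stack(off)
    }
}

fn main() {
    // %f12(i64, i128): the i64 takes x0, so the i128 is rounded up to x2/x3.
    let (mut xreg, mut stack) = (1u8, 0u64);
    assert_eq!(assign_i128(&mut xreg, &mut stack, false), Loc::RegPair(2, 3));

    // %f13(i64, i128) apple_aarch64: no rounding, so the i128 lands in x1/x2.
    let (mut xreg, mut stack) = (1u8, 0u64);
    assert_eq!(assign_i128(&mut xreg, &mut stack, true), Loc::RegPair(1, 2));

    // %f14: after three i128s (x0..x5) and an i64 (x6), only x7 is free,
    // so the final i128 spills to [sp] in the caller.
    let (mut xreg, mut stack) = (7u8, 0u64);
    assert_eq!(assign_i128(&mut xreg, &mut stack, false), Loc::Stack(0));
}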