From ac624da8d97c98eb1fcb0c705eb8cd98b2a620e5 Mon Sep 17 00:00:00 2001
From: Afonso Bordado
Date: Tue, 11 May 2021 10:25:33 +0100
Subject: [PATCH] Handle i128 arguments in the aarch64 ABI

When dealing with params that need to be split, we follow the aarch64
ABI: we split the value in two and make sure that the argument starts
in an even-numbered xN register. The Apple ABI does not require this,
so on those platforms we can start params at any register.
---
 cranelift/codegen/src/isa/aarch64/abi.rs      | 185 ++++++++++++------
 cranelift/codegen/src/isa/aarch64/lower.rs    | 105 ++++++----
 .../codegen/src/isa/aarch64/lower_inst.rs     |  59 +++++-
 .../filetests/filetests/isa/aarch64/call.clif | 110 +++++++++++
 4 files changed, 362 insertions(+), 97 deletions(-)

diff --git a/cranelift/codegen/src/isa/aarch64/abi.rs b/cranelift/codegen/src/isa/aarch64/abi.rs
index 6a264f1604..f9e0de1498 100644
--- a/cranelift/codegen/src/isa/aarch64/abi.rs
+++ b/cranelift/codegen/src/isa/aarch64/abi.rs
@@ -183,6 +183,7 @@ impl ABIMachineSpec for AArch64MachineDeps {
         args_or_rets: ArgsOrRets,
         add_ret_area_ptr: bool,
     ) -> CodegenResult<(Vec<ABIArg>, i64, Option<usize>)> {
+        let is_apple_cc = call_conv == isa::CallConv::AppleAarch64;
         let is_baldrdash = call_conv.extends_baldrdash();
         let has_baldrdash_tls = call_conv == isa::CallConv::Baldrdash2020;
 
@@ -194,8 +195,6 @@ impl ABIMachineSpec for AArch64MachineDeps {
         // following ways:
         // - sign- and zero- extensions of data types less than 32 bits are not
         //   implemented yet.
-        // - i128 arguments passing isn't implemented yet in the standard (non
-        //   MacOS) aarch64 ABI.
         // - we align the arguments stack space to a 16-bytes boundary, while
         //   the MacOS allows aligning only on 8 bytes. In practice it means we're
         //   slightly overallocating when calling, which is fine, and doesn't
@@ -265,20 +264,16 @@ impl ABIMachineSpec for AArch64MachineDeps {
                 "Invalid type for AArch64: {:?}",
                 param.value_type
             );
-            let (rcs, _) = Inst::rc_for_type(param.value_type).unwrap();
-            assert!(rcs.len() == 1, "Multi-reg values not supported yet");
-            let rc = rcs[0];
-            let next_reg = match rc {
-                RegClass::I64 => &mut next_xreg,
-                RegClass::V128 => &mut next_vreg,
-                _ => panic!("Invalid register class: {:?}", rc),
-            };
+            let (rcs, _) = Inst::rc_for_type(param.value_type)?;
             if let Some(param) = try_fill_baldrdash_reg(call_conv, param) {
-                assert!(rc == RegClass::I64);
+                assert!(rcs[0] == RegClass::I64);
                 ret.push(param);
-            } else if let ir::ArgumentPurpose::StructArgument(size) = param.purpose {
+                continue;
+            }
+
+            if let ir::ArgumentPurpose::StructArgument(size) = param.purpose {
                 let offset = next_stack as i64;
                 let size = size as u64;
                 assert!(size % 8 == 0, "StructArgument size is not properly aligned");
@@ -288,50 +283,130 @@ impl ABIMachineSpec for AArch64MachineDeps {
                     size,
                     purpose: param.purpose,
                 });
-            } else if *next_reg < max_per_class_reg_vals && remaining_reg_vals > 0 {
-                let reg = match rc {
-                    RegClass::I64 => xreg(*next_reg),
-                    RegClass::V128 => vreg(*next_reg),
-                    _ => unreachable!(),
-                };
-                ret.push(ABIArg::reg(
-                    reg.to_real_reg(),
-                    param.value_type,
-                    param.extension,
-                    param.purpose,
-                ));
-                *next_reg += 1;
-                remaining_reg_vals -= 1;
-            } else {
-                // Compute the stack slot's size.
-                let size = (ty_bits(param.value_type) / 8) as u64;
-
-                let size = if call_conv == isa::CallConv::AppleAarch64
-                    || (call_conv.extends_wasmtime() && args_or_rets == ArgsOrRets::Rets)
-                {
-                    // MacOS aarch64 and Wasmtime allow stack slots with
-                    // sizes less than 8 bytes. They still need to be
-                    // properly aligned on their natural data alignment,
-                    // though.
-                    size
-                } else {
-                    // Every arg takes a minimum slot of 8 bytes. (16-byte stack
-                    // alignment happens separately after all args.)
-                    std::cmp::max(size, 8)
-                };
-
-                // Align the stack slot.
-                debug_assert!(size.is_power_of_two());
-                next_stack = align_to(next_stack, size);
-
-                ret.push(ABIArg::stack(
-                    next_stack as i64,
-                    param.value_type,
-                    param.extension,
-                    param.purpose,
-                ));
-                next_stack += size;
+                continue;
             }
+
+            // Handle multi-register params
+            //
+            // See the AArch64 ABI (https://c9x.me/compile/bib/abi-arm64.pdf), Section 5.4 Stage C.
+            //
+            // For arguments with an alignment of 16 we round up the register number
+            // to the next even value, so we can never allocate, for example, an i128
+            // to X1 and X2; we have to skip one register and use X2, X3.
+            // (Stage C.8)
+            // Note: The Apple ABI deviates a bit here. It doesn't respect Stage C.8
+            // and will happily allocate an i128 to X1 and X2.
+            //
+            // For integer types with an alignment of 16 we also have the additional
+            // restriction of passing the lower half in Xn and the upper half in Xn+1.
+            // (Stage C.9)
+            //
+            // For examples of how LLVM handles this, see: https://godbolt.org/z/bhd3vvEfh
+            let is_multi_reg = rcs.len() >= 2;
+            if is_multi_reg {
+                assert!(
+                    rcs.len() == 2,
+                    "Unable to handle multi reg params with more than 2 regs"
+                );
+                assert!(
+                    rcs == &[RegClass::I64, RegClass::I64],
+                    "Unable to handle non i64 regs"
+                );
+
+                let reg_class_space = max_per_class_reg_vals - next_xreg;
+                let reg_space = remaining_reg_vals;
+
+                if reg_space >= 2 && reg_class_space >= 2 {
+                    // The aarch64 ABI does not allow us to start a split argument
+                    // at an odd-numbered register, so we need to skip one register.
+                    //
+                    // TODO: The Fast ABI should probably not skip the register
+                    if !is_apple_cc && next_xreg % 2 != 0 {
+                        next_xreg += 1;
+                    }
+
+                    let lower_reg = xreg(next_xreg);
+                    let upper_reg = xreg(next_xreg + 1);
+
+                    ret.push(ABIArg::Slots {
+                        slots: vec![
+                            ABIArgSlot::Reg {
+                                reg: lower_reg.to_real_reg(),
+                                ty: param.value_type,
+                                extension: param.extension,
+                            },
+                            ABIArgSlot::Reg {
+                                reg: upper_reg.to_real_reg(),
+                                ty: param.value_type,
+                                extension: param.extension,
+                            },
+                        ],
+                        purpose: param.purpose,
+                    });
+
+                    next_xreg += 2;
+                    remaining_reg_vals -= 2;
+                    continue;
+                }
+            }
+
+            // Single-register parameters
+            if !is_multi_reg {
+                let rc = rcs[0];
+                let next_reg = match rc {
+                    RegClass::I64 => &mut next_xreg,
+                    RegClass::V128 => &mut next_vreg,
+                    _ => panic!("Invalid register class: {:?}", rc),
+                };
+
+                if *next_reg < max_per_class_reg_vals && remaining_reg_vals > 0 {
+                    let reg = match rc {
+                        RegClass::I64 => xreg(*next_reg),
+                        RegClass::V128 => vreg(*next_reg),
+                        _ => unreachable!(),
+                    };
+                    ret.push(ABIArg::reg(
+                        reg.to_real_reg(),
+                        param.value_type,
+                        param.extension,
+                        param.purpose,
+                    ));
+                    *next_reg += 1;
+                    remaining_reg_vals -= 1;
+                    continue;
+                }
+            }
+
+            // Spill to the stack
+
+            // Compute the stack slot's size.
+            let size = (ty_bits(param.value_type) / 8) as u64;
+
+            let size = if is_apple_cc
+                || (call_conv.extends_wasmtime() && args_or_rets == ArgsOrRets::Rets)
+            {
+                // MacOS aarch64 and Wasmtime allow stack slots with
+                // sizes less than 8 bytes. They still need to be
+                // properly aligned on their natural data alignment,
+                // though.
+                size
+            } else {
+                // Every arg takes a minimum slot of 8 bytes. (16-byte stack
+                // alignment happens separately after all args.)
+                std::cmp::max(size, 8)
+            };
+
+            // Align the stack slot.
+            debug_assert!(size.is_power_of_two());
+            next_stack = align_to(next_stack, size);
+
+            ret.push(ABIArg::stack(
+                next_stack as i64,
+                param.value_type,
+                param.extension,
+                param.purpose,
+            ));
+            next_stack += size;
         }
 
         if args_or_rets == ArgsOrRets::Rets && is_baldrdash {
diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs
index 4f5893f54b..7c1cd5f055 100644
--- a/cranelift/codegen/src/isa/aarch64/lower.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower.rs
@@ -158,42 +158,37 @@ impl NarrowValueMode {
     }
 }
 
-/// Lower an instruction input to a reg.
-///
-/// The given register will be extended appropriately, according to
-/// `narrow_mode` and the input's type. If extended, the value is
-/// always extended to 64 bits, for simplicity.
-pub(crate) fn put_input_in_reg<C: LowerCtx<I = Inst>>(
-    ctx: &mut C,
-    input: InsnInput,
-    narrow_mode: NarrowValueMode,
-) -> Reg {
-    debug!("put_input_in_reg: input {:?}", input);
-    let ty = ctx.input_ty(input.insn, input.input);
-    let from_bits = ty_bits(ty) as u8;
-    let inputs = ctx.get_input_as_source_or_const(input.insn, input.input);
-    let in_reg = if let Some(c) = inputs.constant {
-        // Generate constants fresh at each use to minimize long-range register pressure.
-        let masked = if from_bits < 64 {
-            c & ((1u64 << from_bits) - 1)
-        } else {
-            c
-        };
-        let to_reg = ctx.alloc_tmp(ty).only_reg().unwrap();
-        for inst in Inst::gen_constant(ValueRegs::one(to_reg), masked as u128, ty, |ty| {
-            ctx.alloc_tmp(ty).only_reg().unwrap()
-        })
-        .into_iter()
-        {
-            ctx.emit(inst);
-        }
-        to_reg.to_reg()
+/// Emits instruction(s) to generate the given 64-bit constant value into a newly-allocated
+/// temporary register, returning that register.
+fn generate_constant<C: LowerCtx<I = Inst>>(ctx: &mut C, ty: Type, c: u64) -> ValueRegs<Reg> {
+    let from_bits = ty_bits(ty);
+    let masked = if from_bits < 64 {
+        c & ((1u64 << from_bits) - 1)
     } else {
-        ctx.put_input_in_regs(input.insn, input.input)
-            .only_reg()
-            .unwrap()
+        c
     };
+    let cst_copy = ctx.alloc_tmp(ty);
+    for inst in Inst::gen_constant(cst_copy, masked as u128, ty, |ty| {
+        ctx.alloc_tmp(ty).only_reg().unwrap()
+    })
+    .into_iter()
+    {
+        ctx.emit(inst);
+    }
+    non_writable_value_regs(cst_copy)
+}
+
+/// Extends a register according to `narrow_mode`.
+/// If extended, the value is always extended to 64 bits, for simplicity.
+fn narrow_reg<C: LowerCtx<I = Inst>>(
+    ctx: &mut C,
+    ty: Type,
+    in_reg: Reg,
+    is_const: bool,
+    narrow_mode: NarrowValueMode,
+) -> Reg {
+    let from_bits = ty_bits(ty) as u8;
     match (narrow_mode, from_bits) {
         (NarrowValueMode::None, _) => in_reg,
         (NarrowValueMode::ZeroExtend32, n) if n < 32 => {
@@ -221,7 +216,7 @@ pub(crate) fn put_input_in_reg<C: LowerCtx<I = Inst>>(
         (NarrowValueMode::ZeroExtend32, 32) | (NarrowValueMode::SignExtend32, 32) => in_reg,
 
         (NarrowValueMode::ZeroExtend64, n) if n < 64 => {
-            if inputs.constant.is_some() {
+            if is_const {
                 // Constants are zero-extended to full 64-bit width on load already.
                 in_reg
             } else {
@@ -257,6 +252,48 @@ pub(crate) fn put_input_in_reg<C: LowerCtx<I = Inst>>(
     }
 }
 
+/// Lower an instruction input to a register.
+///
+/// The given register will be extended appropriately, according to
+/// `narrow_mode` and the input's type. If extended, the value is
+/// always extended to 64 bits, for simplicity.
+pub(crate) fn put_input_in_reg<C: LowerCtx<I = Inst>>(
+    ctx: &mut C,
+    input: InsnInput,
+    narrow_mode: NarrowValueMode,
+) -> Reg {
+    let reg = put_input_in_regs(ctx, input)
+        .only_reg()
+        .expect("Multi-register value not expected");
+
+    let is_const = ctx
+        .get_input_as_source_or_const(input.insn, input.input)
+        .constant
+        .is_some();
+
+    let ty = ctx.input_ty(input.insn, input.input);
+    narrow_reg(ctx, ty, reg, is_const, narrow_mode)
+}
+
+/// Lower an instruction input to multiple regs.
+pub(crate) fn put_input_in_regs<C: LowerCtx<I = Inst>>(
+    ctx: &mut C,
+    input: InsnInput,
+) -> ValueRegs<Reg> {
+    debug!("put_input_in_regs: input {:?}", input);
+    let ty = ctx.input_ty(input.insn, input.input);
+    let inputs = ctx.get_input_as_source_or_const(input.insn, input.input);
+
+    let in_regs = if let Some(c) = inputs.constant {
+        // Generate constants fresh at each use to minimize long-range register pressure.
+        generate_constant(ctx, ty, c)
+    } else {
+        ctx.put_input_in_regs(input.insn, input.input)
+    };
+
+    in_regs
+}
+
 /// Lower an instruction input to a reg or reg/shift, or reg/extend operand.
 ///
 /// The `narrow_mode` flag indicates whether the consumer of this value needs
diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
index ede66295e9..5e33c54034 100644
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -1572,10 +1572,21 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 // N.B.: according to the AArch64 ABI, the top bits of a register
                 // (above the bits for the value's type) are undefined, so we
                 // need not extend the return values.
-                let reg = put_input_in_reg(ctx, *input, NarrowValueMode::None);
-                let retval_reg = ctx.retval(i).only_reg().unwrap();
+                let src_regs = put_input_in_regs(ctx, *input);
+                let retval_regs = ctx.retval(i);
+
+                assert_eq!(src_regs.len(), retval_regs.len());
                 let ty = ctx.input_ty(insn, i);
-                ctx.emit(Inst::gen_move(retval_reg, reg, ty));
+                let (_, tys) = Inst::rc_for_type(ty)?;
+
+                src_regs
+                    .regs()
+                    .iter()
+                    .zip(retval_regs.regs().iter())
+                    .zip(tys.iter())
+                    .for_each(|((&src, &dst), &ty)| {
+                        ctx.emit(Inst::gen_move(dst, src, ty));
+                    });
             }
             // N.B.: the Ret itself is generated by the ABI.
         }
@@ -1753,13 +1764,13 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             assert!(inputs.len() == abi.num_args());
             for i in abi.get_copy_to_arg_order() {
                 let input = inputs[i];
-                let arg_reg = put_input_in_reg(ctx, input, NarrowValueMode::None);
-                abi.emit_copy_regs_to_arg(ctx, i, ValueRegs::one(arg_reg));
+                let arg_regs = put_input_in_regs(ctx, input);
+                abi.emit_copy_regs_to_arg(ctx, i, arg_regs);
             }
             abi.emit_call(ctx);
             for (i, output) in outputs.iter().enumerate() {
-                let retval_reg = get_output_reg(ctx, *output).only_reg().unwrap();
-                abi.emit_copy_retval_to_regs(ctx, i, ValueRegs::one(retval_reg));
+                let retval_regs = get_output_reg(ctx, *output);
+                abi.emit_copy_retval_to_regs(ctx, i, retval_regs);
             }
             abi.emit_stack_post_adjust(ctx);
         }
@@ -2306,7 +2317,39 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             panic!("Vector ops not implemented.");
         }
 
-        Opcode::Isplit | Opcode::Iconcat => panic!("Vector ops not supported."),
+        Opcode::Isplit => {
+            assert_eq!(
+                ctx.input_ty(insn, 0),
+                I128,
+                "Isplit only implemented for i128's"
+            );
+            assert_eq!(ctx.output_ty(insn, 0), I64);
+            assert_eq!(ctx.output_ty(insn, 1), I64);
+
+            let src_regs = put_input_in_regs(ctx, inputs[0]);
+            let dst_lo = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
+            let dst_hi = get_output_reg(ctx, outputs[1]).only_reg().unwrap();
+
+            ctx.emit(Inst::gen_move(dst_lo, src_regs.regs()[0], I64));
+            ctx.emit(Inst::gen_move(dst_hi, src_regs.regs()[1], I64));
+        }
+
+        Opcode::Iconcat => {
+            assert_eq!(
+                ctx.output_ty(insn, 0),
+                I128,
+                "Iconcat only implemented for i128's"
+            );
+            assert_eq!(ctx.input_ty(insn, 0), I64);
+            assert_eq!(ctx.input_ty(insn, 1), I64);
+
+            let src_lo = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+            let src_hi = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
+            let dst = get_output_reg(ctx, outputs[0]);
+
+            ctx.emit(Inst::gen_move(dst.regs()[0], src_lo, I64));
+            ctx.emit(Inst::gen_move(dst.regs()[1], src_hi, I64));
+        }
 
         Opcode::Imax | Opcode::Umax | Opcode::Umin | Opcode::Imin => {
             let alu_op = match op {
diff --git a/cranelift/filetests/filetests/isa/aarch64/call.clif b/cranelift/filetests/filetests/isa/aarch64/call.clif
index a59c59a5b2..090c220167 100644
--- a/cranelift/filetests/filetests/isa/aarch64/call.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/call.clif
@@ -250,3 +250,113 @@ block0:
 ; nextln: add sp, sp, #32
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
+
+
+; i128 tests
+function %f11(i128, i64) -> i64 {
+block0(v0: i128, v1: i64):
+    v2, v3 = isplit v0
+    return v3
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: mov x0, x1
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+
+function %f11_call(i64) -> i64 {
+    fn0 = %f11(i128, i64) -> i64
+
+block0(v0: i64):
+    v1 = iconst.i64 42
+    v2 = iconcat v1, v0
+    v3 = call fn0(v2, v1)
+    return v3
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: mov x1, x0
+; nextln: movz x0, #42
+; nextln: movz x2, #42
+; nextln: ldr x3, 8 ; b 12 ; data
+; nextln: blr x3
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+
+; The standard AArch64 ABI requires the i128 argument to start at an even-numbered
+; register, so it is passed in x2 and x3
+function %f12(i64, i128) -> i64 {
+block0(v0: i64, v1: i128):
+    v2, v3 = isplit v1
+    return v2
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: mov x0, x2
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+
+function %f12_call(i64) -> i64 {
+    fn0 = %f12(i64, i128) -> i64
+
+block0(v0: i64):
+    v1 = iconst.i64 42
+    v2 = iconcat v0, v1
+    v3 = call fn0(v1, v2)
+    return v3
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: movz x3, #42
+; nextln: mov x2, x0
+; nextln: movz x0, #42
+; nextln: ldr x1, 8 ; b 12 ; data
+; nextln: blr x1
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+
+; The Apple ABI allows the i128 argument to start at an odd-numbered register,
+; so it is passed in x1 and x2
+function %f13(i64, i128) -> i64 apple_aarch64 {
+block0(v0: i64, v1: i128):
+    v2, v3 = isplit v1
+    return v2
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: mov x0, x1
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+
+function %f13_call(i64) -> i64 apple_aarch64 {
+    fn0 = %f13(i64, i128) -> i64 apple_aarch64
+
+block0(v0: i64):
+    v1 = iconst.i64 42
+    v2 = iconcat v0, v1
+    v3 = call fn0(v1, v2)
+    return v3
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: movz x2, #42
+; nextln: mov x1, x0
+; nextln: movz x0, #42
+; nextln: ldr x3, 8 ; b 12 ; data
+; nextln: blr x3
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
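
Note for reviewers: below is a minimal, standalone sketch of the Stage C.8/C.9 register-assignment rule that the commit message and the new compute_arg_locs code describe for i128 params. The function name assign_i128_regs, the pure-function shape, and the MAX_XREGS constant are illustrative assumptions only; they are not part of the patch or of Cranelift's API.

    /// Simplified model of AAPCS64 Stage C.8/C.9 for a 16-byte-aligned integer
    /// argument such as i128. Register numbers are the xN indices 0..=7.
    fn assign_i128_regs(next_xreg: usize, is_apple_cc: bool) -> Option<(usize, usize)> {
        const MAX_XREGS: usize = 8; // x0..x7 carry integer arguments

        // Stage C.8: the standard ABI rounds the next register number up to an
        // even value for 16-byte-aligned arguments; Apple's ABI does not.
        let start = if !is_apple_cc && next_xreg % 2 != 0 {
            next_xreg + 1
        } else {
            next_xreg
        };

        // Stage C.9: lower half in xN, upper half in xN+1. If two consecutive
        // registers are not available, the value is passed on the stack instead.
        if start + 1 < MAX_XREGS {
            Some((start, start + 1))
        } else {
            None
        }
    }

    fn main() {
        // %f12(i64, i128): x0 holds the i64, so the i128 starts at x2/x3 (x1 is skipped).
        assert_eq!(assign_i128_regs(1, false), Some((2, 3)));
        // Same signature under apple_aarch64: the i128 is packed into x1/x2.
        assert_eq!(assign_i128_regs(1, true), Some((1, 2)));
    }

This matches the call.clif expectations above: %f12 reads its i128 from x2/x3 under the standard ABI, while %f13 reads it from x1/x2 under apple_aarch64. The real implementation additionally advances next_xreg past both registers and decrements remaining_reg_vals by two.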