diff --git a/cranelift/codegen/src/isa/aarch64/abi.rs b/cranelift/codegen/src/isa/aarch64/abi.rs
index 6a264f1604..f9e0de1498 100644
--- a/cranelift/codegen/src/isa/aarch64/abi.rs
+++ b/cranelift/codegen/src/isa/aarch64/abi.rs
@@ -183,6 +183,7 @@ impl ABIMachineSpec for AArch64MachineDeps {
         args_or_rets: ArgsOrRets,
         add_ret_area_ptr: bool,
     ) -> CodegenResult<(Vec<ABIArg>, i64, Option<usize>)> {
+        let is_apple_cc = call_conv == isa::CallConv::AppleAarch64;
         let is_baldrdash = call_conv.extends_baldrdash();
         let has_baldrdash_tls = call_conv == isa::CallConv::Baldrdash2020;
 
@@ -194,8 +195,6 @@ impl ABIMachineSpec for AArch64MachineDeps {
         // following ways:
         // - sign- and zero- extensions of data types less than 32 bits are not
         //   implemented yet.
-        // - i128 arguments passing isn't implemented yet in the standard (non
-        //   MacOS) aarch64 ABI.
         // - we align the arguments stack space to a 16-bytes boundary, while
         //   the MacOS allows aligning only on 8 bytes. In practice it means we're
         //   slightly overallocating when calling, which is fine, and doesn't
@@ -265,20 +264,16 @@ impl ABIMachineSpec for AArch64MachineDeps {
                 "Invalid type for AArch64: {:?}",
                 param.value_type
             );
 
-            let (rcs, _) = Inst::rc_for_type(param.value_type).unwrap();
-            assert!(rcs.len() == 1, "Multi-reg values not supported yet");
-            let rc = rcs[0];
-            let next_reg = match rc {
-                RegClass::I64 => &mut next_xreg,
-                RegClass::V128 => &mut next_vreg,
-                _ => panic!("Invalid register class: {:?}", rc),
-            };
+            let (rcs, _) = Inst::rc_for_type(param.value_type)?;
 
             if let Some(param) = try_fill_baldrdash_reg(call_conv, param) {
-                assert!(rc == RegClass::I64);
+                assert!(rcs[0] == RegClass::I64);
                 ret.push(param);
-            } else if let ir::ArgumentPurpose::StructArgument(size) = param.purpose {
+                continue;
+            }
+
+            if let ir::ArgumentPurpose::StructArgument(size) = param.purpose {
                 let offset = next_stack as i64;
                 let size = size as u64;
                 assert!(size % 8 == 0, "StructArgument size is not properly aligned");
@@ -288,50 +283,130 @@ impl ABIMachineSpec for AArch64MachineDeps {
                     size,
                     purpose: param.purpose,
                 });
-            } else if *next_reg < max_per_class_reg_vals && remaining_reg_vals > 0 {
-                let reg = match rc {
-                    RegClass::I64 => xreg(*next_reg),
-                    RegClass::V128 => vreg(*next_reg),
-                    _ => unreachable!(),
-                };
-                ret.push(ABIArg::reg(
-                    reg.to_real_reg(),
-                    param.value_type,
-                    param.extension,
-                    param.purpose,
-                ));
-                *next_reg += 1;
-                remaining_reg_vals -= 1;
-            } else {
-                // Compute the stack slot's size.
-                let size = (ty_bits(param.value_type) / 8) as u64;
-
-                let size = if call_conv == isa::CallConv::AppleAarch64
-                    || (call_conv.extends_wasmtime() && args_or_rets == ArgsOrRets::Rets)
-                {
-                    // MacOS aarch64 and Wasmtime allow stack slots with
-                    // sizes less than 8 bytes. They still need to be
-                    // properly aligned on their natural data alignment,
-                    // though.
-                    size
-                } else {
-                    // Every arg takes a minimum slot of 8 bytes. (16-byte stack
-                    // alignment happens separately after all args.)
-                    std::cmp::max(size, 8)
-                };
-
-                // Align the stack slot.
-                debug_assert!(size.is_power_of_two());
-                next_stack = align_to(next_stack, size);
-
-                ret.push(ABIArg::stack(
-                    next_stack as i64,
-                    param.value_type,
-                    param.extension,
-                    param.purpose,
-                ));
-                next_stack += size;
+                continue;
             }
+
+            // Handle multi-register params.
+            //
+            // See the AArch64 ABI (https://c9x.me/compile/bib/abi-arm64.pdf), Section 5.4, Stage C.
+            //
+            // For arguments with an alignment of 16 we round the register number up
+            // to the next even value. So we can never allocate, for example, an i128
+            // to X1 and X2; we have to skip one register and use X2 and X3 instead
+            // (Stage C.8).
+            // Note: the Apple ABI deviates a bit here: it doesn't respect Stage C.8
+            // and will happily allocate an i128 to X1 and X2.
+            //
+            // For integer types with an alignment of 16 we also have the additional
+            // restriction of passing the lower half in Xn and the upper half in Xn+1
+            // (Stage C.9).
+            //
+            // For an example of how LLVM handles this, see: https://godbolt.org/z/bhd3vvEfh
+            let is_multi_reg = rcs.len() >= 2;
+            if is_multi_reg {
+                assert!(
+                    rcs.len() == 2,
+                    "Unable to handle multi reg params with more than 2 regs"
+                );
+                assert!(
+                    rcs == &[RegClass::I64, RegClass::I64],
+                    "Unable to handle non i64 regs"
+                );
+
+                let reg_class_space = max_per_class_reg_vals - next_xreg;
+                let reg_space = remaining_reg_vals;
+
+                if reg_space >= 2 && reg_class_space >= 2 {
+                    // The AArch64 ABI does not allow us to start a split argument
+                    // at an odd-numbered register, so we need to skip one register.
+                    //
+                    // TODO: the Fast ABI should probably not skip the register.
+                    if !is_apple_cc && next_xreg % 2 != 0 {
+                        next_xreg += 1;
+                    }
+
+                    let lower_reg = xreg(next_xreg);
+                    let upper_reg = xreg(next_xreg + 1);
+
+                    ret.push(ABIArg::Slots {
+                        slots: vec![
+                            ABIArgSlot::Reg {
+                                reg: lower_reg.to_real_reg(),
+                                ty: param.value_type,
+                                extension: param.extension,
+                            },
+                            ABIArgSlot::Reg {
+                                reg: upper_reg.to_real_reg(),
+                                ty: param.value_type,
+                                extension: param.extension,
+                            },
+                        ],
+                        purpose: param.purpose,
+                    });
+
+                    next_xreg += 2;
+                    remaining_reg_vals -= 2;
+                    continue;
+                }
+            }
+
+            // Single-register parameters
+            if !is_multi_reg {
+                let rc = rcs[0];
+                let next_reg = match rc {
+                    RegClass::I64 => &mut next_xreg,
+                    RegClass::V128 => &mut next_vreg,
+                    _ => panic!("Invalid register class: {:?}", rc),
+                };
+
+                if *next_reg < max_per_class_reg_vals && remaining_reg_vals > 0 {
+                    let reg = match rc {
+                        RegClass::I64 => xreg(*next_reg),
+                        RegClass::V128 => vreg(*next_reg),
+                        _ => unreachable!(),
+                    };
+                    ret.push(ABIArg::reg(
+                        reg.to_real_reg(),
+                        param.value_type,
+                        param.extension,
+                        param.purpose,
+                    ));
+                    *next_reg += 1;
+                    remaining_reg_vals -= 1;
+                    continue;
+                }
+            }
+
+            // Spill to the stack.
+
+            // Compute the stack slot's size.
+            let size = (ty_bits(param.value_type) / 8) as u64;
+
+            let size = if is_apple_cc
+                || (call_conv.extends_wasmtime() && args_or_rets == ArgsOrRets::Rets)
+            {
+                // MacOS aarch64 and Wasmtime allow stack slots with
+                // sizes less than 8 bytes. They still need to be
+                // properly aligned on their natural data alignment,
+                // though.
+                size
+            } else {
+                // Every arg takes a minimum slot of 8 bytes. (16-byte stack
+                // alignment happens separately after all args.)
+                std::cmp::max(size, 8)
+            };
+
+            // Align the stack slot.
+            debug_assert!(size.is_power_of_two());
+            next_stack = align_to(next_stack, size);
+
+            ret.push(ABIArg::stack(
+                next_stack as i64,
+                param.value_type,
+                param.extension,
+                param.purpose,
+            ));
+            next_stack += size;
         }
 
         if args_or_rets == ArgsOrRets::Rets && is_baldrdash {
diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs
index 4f5893f54b..7c1cd5f055 100644
--- a/cranelift/codegen/src/isa/aarch64/lower.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower.rs
@@ -158,42 +158,37 @@ impl NarrowValueMode {
     }
 }
 
-/// Lower an instruction input to a reg.
-///
-/// The given register will be extended appropriately, according to
-/// `narrow_mode` and the input's type. If extended, the value is
-/// always extended to 64 bits, for simplicity.
-pub(crate) fn put_input_in_reg<C: LowerCtx<I = Inst>>(
-    ctx: &mut C,
-    input: InsnInput,
-    narrow_mode: NarrowValueMode,
-) -> Reg {
-    debug!("put_input_in_reg: input {:?}", input);
-    let ty = ctx.input_ty(input.insn, input.input);
-    let from_bits = ty_bits(ty) as u8;
-    let inputs = ctx.get_input_as_source_or_const(input.insn, input.input);
-    let in_reg = if let Some(c) = inputs.constant {
-        // Generate constants fresh at each use to minimize long-range register pressure.
-        let masked = if from_bits < 64 {
-            c & ((1u64 << from_bits) - 1)
-        } else {
-            c
-        };
-        let to_reg = ctx.alloc_tmp(ty).only_reg().unwrap();
-        for inst in Inst::gen_constant(ValueRegs::one(to_reg), masked as u128, ty, |ty| {
-            ctx.alloc_tmp(ty).only_reg().unwrap()
-        })
-        .into_iter()
-        {
-            ctx.emit(inst);
-        }
-        to_reg.to_reg()
+/// Emits instruction(s) to generate the given 64-bit constant value into newly-allocated
+/// temporary register(s), returning them.
+fn generate_constant<C: LowerCtx<I = Inst>>(ctx: &mut C, ty: Type, c: u64) -> ValueRegs<Reg> {
+    let from_bits = ty_bits(ty);
+    let masked = if from_bits < 64 {
+        c & ((1u64 << from_bits) - 1)
     } else {
-        ctx.put_input_in_regs(input.insn, input.input)
-            .only_reg()
-            .unwrap()
+        c
     };
+    let cst_copy = ctx.alloc_tmp(ty);
+    for inst in Inst::gen_constant(cst_copy, masked as u128, ty, |ty| {
+        ctx.alloc_tmp(ty).only_reg().unwrap()
+    })
+    .into_iter()
+    {
+        ctx.emit(inst);
+    }
+    non_writable_value_regs(cst_copy)
+}
+
+/// Extends a register according to `narrow_mode`.
+/// If extended, the value is always extended to 64 bits, for simplicity.
+fn narrow_reg<C: LowerCtx<I = Inst>>(
+    ctx: &mut C,
+    ty: Type,
+    in_reg: Reg,
+    is_const: bool,
+    narrow_mode: NarrowValueMode,
+) -> Reg {
+    let from_bits = ty_bits(ty) as u8;
     match (narrow_mode, from_bits) {
         (NarrowValueMode::None, _) => in_reg,
         (NarrowValueMode::ZeroExtend32, n) if n < 32 => {
@@ -221,7 +216,7 @@ pub(crate) fn put_input_in_reg<C: LowerCtx<I = Inst>>(
         (NarrowValueMode::ZeroExtend32, 32) | (NarrowValueMode::SignExtend32, 32) => in_reg,
 
         (NarrowValueMode::ZeroExtend64, n) if n < 64 => {
-            if inputs.constant.is_some() {
+            if is_const {
                 // Constants are zero-extended to full 64-bit width on load already.
                 in_reg
             } else {
@@ -257,6 +252,48 @@ pub(crate) fn put_input_in_reg<C: LowerCtx<I = Inst>>(
     }
 }
 
+/// Lower an instruction input to a register.
+///
+/// The given register will be extended appropriately, according to
+/// `narrow_mode` and the input's type. If extended, the value is
+/// always extended to 64 bits, for simplicity.
+pub(crate) fn put_input_in_reg<C: LowerCtx<I = Inst>>(
+    ctx: &mut C,
+    input: InsnInput,
+    narrow_mode: NarrowValueMode,
+) -> Reg {
+    let reg = put_input_in_regs(ctx, input)
+        .only_reg()
+        .expect("Multi-register value not expected");
+
+    let is_const = ctx
+        .get_input_as_source_or_const(input.insn, input.input)
+        .constant
+        .is_some();
+
+    let ty = ctx.input_ty(input.insn, input.input);
+    narrow_reg(ctx, ty, reg, is_const, narrow_mode)
+}
+
+/// Lower an instruction input to one or more registers.
+pub(crate) fn put_input_in_regs<C: LowerCtx<I = Inst>>(
+    ctx: &mut C,
+    input: InsnInput,
+) -> ValueRegs<Reg> {
+    debug!("put_input_in_regs: input {:?}", input);
+    let ty = ctx.input_ty(input.insn, input.input);
+    let inputs = ctx.get_input_as_source_or_const(input.insn, input.input);
+
+    let in_regs = if let Some(c) = inputs.constant {
+        // Generate constants fresh at each use to minimize long-range register pressure.
+        generate_constant(ctx, ty, c)
+    } else {
+        ctx.put_input_in_regs(input.insn, input.input)
+    };
+
+    in_regs
+}
+
 /// Lower an instruction input to a reg or reg/shift, or reg/extend operand.
 ///
 /// The `narrow_mode` flag indicates whether the consumer of this value needs
diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
index ede66295e9..5e33c54034 100644
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -1572,10 +1572,21 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     // N.B.: according to the AArch64 ABI, the top bits of a register
                     // (above the bits for the value's type) are undefined, so we
                     // need not extend the return values.
-                    let reg = put_input_in_reg(ctx, *input, NarrowValueMode::None);
-                    let retval_reg = ctx.retval(i).only_reg().unwrap();
+                    let src_regs = put_input_in_regs(ctx, *input);
+                    let retval_regs = ctx.retval(i);
+
+                    assert_eq!(src_regs.len(), retval_regs.len());
                     let ty = ctx.input_ty(insn, i);
-                    ctx.emit(Inst::gen_move(retval_reg, reg, ty));
+                    let (_, tys) = Inst::rc_for_type(ty)?;
+
+                    src_regs
+                        .regs()
+                        .iter()
+                        .zip(retval_regs.regs().iter())
+                        .zip(tys.iter())
+                        .for_each(|((&src, &dst), &ty)| {
+                            ctx.emit(Inst::gen_move(dst, src, ty));
+                        });
                 }
                 // N.B.: the Ret itself is generated by the ABI.
             }
@@ -1753,13 +1764,13 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             assert!(inputs.len() == abi.num_args());
             for i in abi.get_copy_to_arg_order() {
                 let input = inputs[i];
-                let arg_reg = put_input_in_reg(ctx, input, NarrowValueMode::None);
-                abi.emit_copy_regs_to_arg(ctx, i, ValueRegs::one(arg_reg));
+                let arg_regs = put_input_in_regs(ctx, input);
+                abi.emit_copy_regs_to_arg(ctx, i, arg_regs);
             }
             abi.emit_call(ctx);
             for (i, output) in outputs.iter().enumerate() {
-                let retval_reg = get_output_reg(ctx, *output).only_reg().unwrap();
-                abi.emit_copy_retval_to_regs(ctx, i, ValueRegs::one(retval_reg));
+                let retval_regs = get_output_reg(ctx, *output);
+                abi.emit_copy_retval_to_regs(ctx, i, retval_regs);
             }
             abi.emit_stack_post_adjust(ctx);
         }
@@ -2306,7 +2317,39 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             panic!("Vector ops not implemented.");
         }
 
-        Opcode::Isplit | Opcode::Iconcat => panic!("Vector ops not supported."),
+        Opcode::Isplit => {
+            assert_eq!(
+                ctx.input_ty(insn, 0),
+                I128,
+                "Isplit only implemented for i128's"
+            );
+            assert_eq!(ctx.output_ty(insn, 0), I64);
+            assert_eq!(ctx.output_ty(insn, 1), I64);
+
+            let src_regs = put_input_in_regs(ctx, inputs[0]);
+            let dst_lo = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
+            let dst_hi = get_output_reg(ctx, outputs[1]).only_reg().unwrap();
+
+            ctx.emit(Inst::gen_move(dst_lo, src_regs.regs()[0], I64));
+            ctx.emit(Inst::gen_move(dst_hi, src_regs.regs()[1], I64));
+        }
+
+        Opcode::Iconcat => {
+            assert_eq!(
+                ctx.output_ty(insn, 0),
+                I128,
+                "Iconcat only implemented for i128's"
+            );
+            assert_eq!(ctx.input_ty(insn, 0), I64);
+            assert_eq!(ctx.input_ty(insn, 1), I64);
+
+            let src_lo = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+            let src_hi = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
+            let dst = get_output_reg(ctx, outputs[0]);
+
+            ctx.emit(Inst::gen_move(dst.regs()[0], src_lo, I64));
+            ctx.emit(Inst::gen_move(dst.regs()[1], src_hi, I64));
+        }
 
         Opcode::Imax | Opcode::Umax | Opcode::Umin | Opcode::Imin => {
             let alu_op = match op {
diff --git a/cranelift/filetests/filetests/isa/aarch64/call.clif b/cranelift/filetests/filetests/isa/aarch64/call.clif
index a59c59a5b2..090c220167 100644
--- a/cranelift/filetests/filetests/isa/aarch64/call.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/call.clif
@@ -250,3 +250,113 @@ block0:
 ; nextln: add sp, sp, #32
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
+
+
+; i128 tests
+function %f11(i128, i64) -> i64 {
+block0(v0: i128, v1: i64):
+    v2, v3 = isplit v0
+    return v3
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: mov x0, x1
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+
+function %f11_call(i64) -> i64 {
+    fn0 = %f11(i128, i64) -> i64
+
+block0(v0: i64):
+    v1 = iconst.i64 42
+    v2 = iconcat v1, v0
+    v3 = call fn0(v2, v1)
+    return v3
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: mov x1, x0
+; nextln: movz x0, #42
+; nextln: movz x2, #42
+; nextln: ldr x3, 8 ; b 12 ; data
+; nextln: blr x3
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+
+; The AArch64 ABI requires the i128 argument to start at an even-numbered register,
+; so it is passed in x2 and x3
+function %f12(i64, i128) -> i64 {
+block0(v0: i64, v1: i128):
+    v2, v3 = isplit v1
+    return v2
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: mov x0, x2
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+
+
+function %f12_call(i64) -> i64 {
+    fn0 = %f12(i64, i128) -> i64
+
+block0(v0: i64):
+    v1 = iconst.i64 42
+    v2 = iconcat v0, v1
+    v3 = call fn0(v1, v2)
+    return v3
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: movz x3, #42
+; nextln: mov x2, x0
+; nextln: movz x0, #42
+; nextln: ldr x1, 8 ; b 12 ; data
+; nextln: blr x1
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+
+
+; The Apple AArch64 ABI does not require the i128 argument to start at an even register,
+; so it is passed in x1 and x2
+function %f13(i64, i128) -> i64 apple_aarch64 {
+block0(v0: i64, v1: i128):
+    v2, v3 = isplit v1
+    return v2
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: mov x0, x1
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+
+function %f13_call(i64) -> i64 apple_aarch64 {
+    fn0 = %f13(i64, i128) -> i64 apple_aarch64
+
+block0(v0: i64):
+    v1 = iconst.i64 42
+    v2 = iconcat v0, v1
+    v3 = call fn0(v1, v2)
+    return v3
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: movz x2, #42
+; nextln: mov x1, x0
+; nextln: movz x0, #42
+; nextln: ldr x3, 8 ; b 12 ; data
+; nextln: blr x3
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
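As a quick cross-check of the Stage C.8 rule implemented in abi.rs above, here is a minimal, standalone Rust sketch (not part of the patch; `i128_reg_pair` and its signature are illustrative only, not Cranelift APIs) that reproduces the register pairs the %f12/%f13 tests expect.

// Standalone sketch: Stage C.8 register-pair selection for an i128 argument.

/// Returns the X-register pair used for an i128 argument, given the index of
/// the next free X register. The standard AAPCS64 rounds up to an even
/// register first (Stage C.8); the Apple ABI does not.
fn i128_reg_pair(mut next_xreg: usize, is_apple_cc: bool) -> (usize, usize) {
    if !is_apple_cc && next_xreg % 2 != 0 {
        next_xreg += 1; // skip the odd register, e.g. x1
    }
    (next_xreg, next_xreg + 1)
}

fn main() {
    // %f12(i64, i128): x0 holds the i64, so the i128 lands in (x2, x3).
    assert_eq!(i128_reg_pair(1, false), (2, 3));
    // %f13(i64, i128) apple_aarch64: no rounding, so the i128 lands in (x1, x2).
    assert_eq!(i128_reg_pair(1, true), (1, 2));
}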