diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
index 2a3ee88fcc..d99bf620c2 100644
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -244,21 +244,70 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         }
 
         Opcode::Imul => {
-            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-            let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
+            let lhs = put_input_in_regs(ctx, inputs[0]);
+            let rhs = put_input_in_regs(ctx, inputs[1]);
+            let dst = get_output_reg(ctx, outputs[0]);
+
+            let rd = dst.regs()[0];
+            let rn = lhs.regs()[0];
+            let rm = rhs.regs()[0];
+
             let ty = ty.unwrap();
-            if !ty.is_vector() {
-                let alu_op = choose_32_64(ty, ALUOp3::MAdd32, ALUOp3::MAdd64);
-                ctx.emit(Inst::AluRRRR {
-                    alu_op,
-                    rd,
-                    rn,
-                    rm,
-                    ra: zero_reg(),
-                });
-            } else {
-                if ty == I64X2 {
+            match ty {
+                I128 => {
+                    assert_eq!(lhs.len(), 2);
+                    assert_eq!(rhs.len(), 2);
+                    assert_eq!(dst.len(), 2);
+
+                    // 128-bit multiply, truncated to the low 128 bits of the product:
+                    //   dst_lo = lhs_lo * rhs_lo
+                    //   dst_hi = umulh(lhs_lo, rhs_lo) + (lhs_lo * rhs_hi) + (lhs_hi * rhs_lo)
+                    // (lhs_hi * rhs_hi only affects bits above 128, so it is never computed.)
+                    // The formula maps onto the following instruction sequence:
+                    //   umulh   dst_hi, lhs_lo, rhs_lo
+                    //   madd    dst_hi, lhs_lo, rhs_hi, dst_hi
+                    //   madd    dst_hi, lhs_hi, rhs_lo, dst_hi
+                    //   mul     dst_lo, lhs_lo, rhs_lo  (emitted as madd with xzr as addend)
+
+                    ctx.emit(Inst::AluRRR {
+                        alu_op: ALUOp::UMulH,
+                        rd: dst.regs()[1],
+                        rn: lhs.regs()[0],
+                        rm: rhs.regs()[0],
+                    });
+                    ctx.emit(Inst::AluRRRR {
+                        alu_op: ALUOp3::MAdd64,
+                        rd: dst.regs()[1],
+                        rn: lhs.regs()[0],
+                        rm: rhs.regs()[1],
+                        ra: dst.regs()[1].to_reg(),
+                    });
+                    ctx.emit(Inst::AluRRRR {
+                        alu_op: ALUOp3::MAdd64,
+                        rd: dst.regs()[1],
+                        rn: lhs.regs()[1],
+                        rm: rhs.regs()[0],
+                        ra: dst.regs()[1].to_reg(),
+                    });
+                    ctx.emit(Inst::AluRRRR {
+                        alu_op: ALUOp3::MAdd64,
+                        rd: dst.regs()[0],
+                        rn: lhs.regs()[0],
+                        rm: rhs.regs()[0],
+                        ra: zero_reg(),
+                    });
+                }
+                ty if !ty.is_vector() => {
+                    let alu_op = choose_32_64(ty, ALUOp3::MAdd32, ALUOp3::MAdd64);
+                    ctx.emit(Inst::AluRRRR {
+                        alu_op,
+                        rd,
+                        rn,
+                        rm,
+                        ra: zero_reg(),
+                    });
+                }
+                I64X2 => {
                     let tmp1 = ctx.alloc_tmp(I64X2).only_reg().unwrap();
                     let tmp2 = ctx.alloc_tmp(I64X2).only_reg().unwrap();
 
@@ -363,7 +412,8 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                         rm: tmp1.to_reg(),
                         size: VectorSize::Size32x2,
                     });
-                } else {
+                }
+                ty if ty.is_vector() => {
                     ctx.emit(Inst::VecRRR {
                         alu_op: VecALUOp::Mul,
                         rd,
@@ -372,6 +422,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                         size: VectorSize::from_ty(ty),
                     });
                 }
+                _ => panic!("Unable to emit mul for {}", ty),
             }
         }
 
diff --git a/cranelift/filetests/filetests/isa/aarch64/arithmetic-run.clif b/cranelift/filetests/filetests/isa/aarch64/arithmetic-run.clif
index 90fdd43e41..c847ae171e 100644
--- a/cranelift/filetests/filetests/isa/aarch64/arithmetic-run.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/arithmetic-run.clif
@@ -11,16 +11,6 @@ block0:
 }
 ; run: %i128_const_0() == [0, 0]
 
-; TODO: Blocked by https://github.com/bytecodealliance/wasmtime/issues/2906
-;function %i128_const_neg_1() -> i64, i64 {
-;block0:
-;    v1 = iconst.i128 -1
-;    v2, v3 = isplit v1
-;    return v2, v3
-;}
-; r-un: %i128_const_neg_1() == [0xffffffff_ffffffff, 0xffffffff_ffffffff]
-
-
 function %add_i128(i64, i64, i64, i64) -> i64, i64 {
 block0(v0: i64,v1: i64,v2: i64,v3: i64):
     v4 = iconcat v0, v1
@@ -38,6 +28,9 @@ block0(v0: i64,v1: i64,v2: i64,v3: i64):
 ; run: %add_i128(1, 0, -1, -1) == [0, 0]
 ; run: %add_i128(-1, 0, 1, 0) == [0, 1]
 
+; run: %add_i128(0x01234567_89ABCDEF, 0x01234567_89ABCDEF, 0xFEDCBA98_76543210, 0xFEDCBA98_76543210) == [-1, -1]
+; run: %add_i128(0x06060606_06060606, 0xA00A00A0_0A00A00A, 0x30303030_30303030, 0x0BB0BB0B_B0BB0BB0) == [0x36363636_36363636, 0xABBABBAB_BABBABBA]
+; run: %add_i128(0xC0FFEEEE_C0FFEEEE, 0xC0FFEEEE_C0FFEEEE, 0x1DCB1111_1DCB1111, 0x1DCB1111_1DCB1111) == [0xDECAFFFF_DECAFFFF, 0xDECAFFFF_DECAFFFF]
 
 function %sub_i128(i64, i64, i64, i64) -> i64, i64 {
 block0(v0: i64,v1: i64,v2: i64,v3: i64):
@@ -53,4 +46,33 @@ block0(v0: i64,v1: i64,v2: i64,v3: i64):
 ; run: %sub_i128(1, 0, 1, 0) == [0, 0]
 ; run: %sub_i128(1, 0, 0, 0) == [1, 0]
 ; run: %sub_i128(0, 0, 1, 0) == [-1, -1]
-; run: %sub_i128(0, 0, -1, -1) == [1, 0]
\ No newline at end of file
+; run: %sub_i128(0, 0, -1, -1) == [1, 0]
+
+; run: %sub_i128(-1, -1, 0xFEDCBA98_76543210, 0xFEDCBA98_76543210) == [0x01234567_89ABCDEF, 0x01234567_89ABCDEF]
+; run: %sub_i128(0x36363636_36363636, 0xABBABBAB_BABBABBA, 0x30303030_30303030, 0x0BB0BB0B_B0BB0BB0) == [0x06060606_06060606, 0xA00A00A0_0A00A00A]
+; run: %sub_i128(0xDECAFFFF_DECAFFFF, 0xDECAFFFF_DECAFFFF, 0x1DCB1111_1DCB1111, 0x1DCB1111_1DCB1111) == [0xC0FFEEEE_C0FFEEEE, 0xC0FFEEEE_C0FFEEEE]
+
+
+function %mul_i128(i64, i64, i64, i64) -> i64, i64 {
+block0(v0: i64,v1: i64,v2: i64,v3: i64):
+    v4 = iconcat v0, v1
+    v5 = iconcat v2, v3
+    ; Each i128 operand is built from (low, high) i64 halves; the result is split the same way.
+    v6 = imul v4, v5
+
+    v7, v8 = isplit v6
+    return v7, v8
+}
+; run: %mul_i128(0, 0, 0, 0) == [0, 0]
+; run: %mul_i128(1, 0, 1, 0) == [1, 0]
+; run: %mul_i128(1, 0, 0, 0) == [0, 0]
+; run: %mul_i128(0, 0, 1, 0) == [0, 0]
+; run: %mul_i128(2, 0, 1, 0) == [2, 0]
+; run: %mul_i128(2, 0, 2, 0) == [4, 0]
+; run: %mul_i128(1, 0, -1, -1) == [-1, -1]
+; run: %mul_i128(2, 0, -1, -1) == [-2, -1]
+
+; run: %mul_i128(0x01010101_01010101, 0x01010101_01010101, 13, 0) == [0x0D0D0D0D_0D0D0D0D, 0x0D0D0D0D_0D0D0D0D]
+; run: %mul_i128(13, 0, 0x01010101_01010101, 0x01010101_01010101) == [0x0D0D0D0D_0D0D0D0D, 0x0D0D0D0D_0D0D0D0D]
+; run: %mul_i128(0x00000000_01234567, 0x89ABCDEF_00000000, 0x00000000_FEDCBA98, 0x76543210_00000000) == [0x0121FA00_23E20B28, 0xE2946058_00000000]
+; run: %mul_i128(0xC0FFEEEE_C0FFEEEE, 0xC0FFEEEE_C0FFEEEE, 0xDECAFFFF_DECAFFFF, 0xDECAFFFF_DECAFFFF) == [0xDB6B1E48_19BA1112, 0x5ECD38B5_9D1C2B7E]
diff --git a/cranelift/filetests/filetests/isa/aarch64/arithmetic.clif b/cranelift/filetests/filetests/isa/aarch64/arithmetic.clif
index edd25c0023..41d65ee293 100644
--- a/cranelift/filetests/filetests/isa/aarch64/arithmetic.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/arithmetic.clif
@@ -453,3 +453,18 @@ block0(v0: i128, v1: i128):
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
 
+function %mul_i128(i128, i128) -> i128 {
+block0(v0: i128, v1: i128):
+    v2 = imul v0, v1
+    return v2
+}
+
+; check:  stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: umulh x4, x0, x2
+; nextln: madd x3, x0, x3, x4
+; nextln: madd x1, x1, x2, x3
+; nextln: madd x0, x0, x2, xzr
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+