Merge pull request #2928 from afonso360/aarch64-i128-ops

Implement iadd,isub,imul for i128 in AArch64
2021-05-24 13:27:36 -07:00
parent 76c6b83f6a 4ddbfe50ba
commit 37ca06ad3a
6 changed files with 383 additions and 101 deletions
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -597,8 +597,12 @@ impl MachInstEmit for Inst {
                let top11 = match alu_op {
                    ALUOp::Add32 => 0b00001011_000,
                    ALUOp::Add64 => 0b10001011_000,
                    ALUOp::Adc32 => 0b00011010_000,
                    ALUOp::Adc64 => 0b10011010_000,
                    ALUOp::Sub32 => 0b01001011_000,
                    ALUOp::Sub64 => 0b11001011_000,
                    ALUOp::Sbc32 => 0b01011010_000,
                    ALUOp::Sbc64 => 0b11011010_000,
                    ALUOp::Orr32 => 0b00101010_000,
                    ALUOp::Orr64 => 0b10101010_000,
                    ALUOp::And32 => 0b00001010_000,
--- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
@@ -50,6 +50,26 @@ fn test_aarch64_binemit() {
        "A400068B",
        "add x4, x5, x6",
    ));
    insns.push((
        Inst::AluRRR {
            alu_op: ALUOp::Adc32,
            rd: writable_xreg(1),
            rn: xreg(2),
            rm: xreg(3),
        },
        "4100031A",
        "adc w1, w2, w3",
    ));
    insns.push((
        Inst::AluRRR {
            alu_op: ALUOp::Adc64,
            rd: writable_xreg(4),
            rn: xreg(5),
            rm: xreg(6),
        },
        "A400069A",
        "adc x4, x5, x6",
    ));
    insns.push((
        Inst::AluRRR {
            alu_op: ALUOp::Sub32,
@@ -70,6 +90,27 @@ fn test_aarch64_binemit() {
        "A40006CB",
        "sub x4, x5, x6",
    ));
    insns.push((
        Inst::AluRRR {
            alu_op: ALUOp::Sbc32,
            rd: writable_xreg(1),
            rn: xreg(2),
            rm: xreg(3),
        },
        "4100035A",
        "sbc w1, w2, w3",
    ));
    insns.push((
        Inst::AluRRR {
            alu_op: ALUOp::Sbc64,
            rd: writable_xreg(4),
            rn: xreg(5),
            rm: xreg(6),
        },
        "A40006DA",
        "sbc x4, x5, x6",
    ));
    insns.push((
        Inst::AluRRR {
            alu_op: ALUOp::Orr32,
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -84,6 +84,12 @@ pub enum ALUOp {
    Asr64,
    Lsl32,
    Lsl64,
    /// Add with carry
    Adc32,
    Adc64,
    /// Subtract with carry
    Sbc32,
    Sbc64,
 }
 /// An ALU operation with three arguments.
@@ -1365,6 +1371,23 @@ impl Inst {
        }
    }
    /// Build the instruction sequence that materializes a 128-bit constant.
    ///
    /// `value` is split into two 64-bit halves. The low half is loaded into
    /// `to_regs.regs()[0]` and the high half into `to_regs.regs()[1]`, each via
    /// [`Inst::load_constant`]. The instructions for the low half come first in
    /// the returned sequence.
    ///
    /// # Panics
    ///
    /// Panics if `to_regs` does not hold exactly two registers.
    pub fn load_constant128(to_regs: ValueRegs<Writable<Reg>>, value: u128) -> SmallVec<[Inst; 4]> {
        assert_eq!(to_regs.len(), 2, "Expected to load i128 into two registers");

        let dst = to_regs.regs();
        let (lo_reg, hi_reg) = (dst[0], dst[1]);

        // The truncating casts are intentional: each one selects a 64-bit half.
        let lo_bits = value as u64;
        let hi_bits = (value >> 64) as u64;

        let mut insts = Inst::load_constant(lo_reg, lo_bits);
        insts.extend(Inst::load_constant(hi_reg, hi_bits));
        insts
    }
    /// Create instructions that load a 32-bit floating-point constant.
    pub fn load_fp_constant32<F: FnMut(Type) -> Writable<Reg>>(
        rd: Writable<Reg>,
@@ -3033,30 +3056,15 @@ impl MachInst for Inst {
        ty: Type,
        alloc_tmp: F,
    ) -> SmallVec<[Inst; 4]> {
-        let to_reg = to_regs
+        let to_reg = to_regs.only_reg();
-            .only_reg()
+        match ty {
-            .expect("multi-reg values not supported yet");
+            F64 => Inst::load_fp_constant64(to_reg.unwrap(), value as u64, alloc_tmp),
-        let value = value as u64;
+            F32 => Inst::load_fp_constant32(to_reg.unwrap(), value as u32, alloc_tmp),
-        if ty == F64 {
+            B1 | B8 | B16 | B32 | B64 | I8 | I16 | I32 | I64 | R32 | R64 => {
-            Inst::load_fp_constant64(to_reg, value, alloc_tmp)
+                Inst::load_constant(to_reg.unwrap(), value as u64)
-        } else if ty == F32 {
+            }
-            Inst::load_fp_constant32(to_reg, value as u32, alloc_tmp)
+            I128 => Inst::load_constant128(to_regs, value),
-        } else {
+            _ => panic!("Cannot generate constant for type: {}", ty),
            // Must be an integer type.
            debug_assert!(
                ty == B1
                    || ty == I8
                    || ty == B8
                    || ty == I16
                    || ty == B16
                    || ty == I32
                    || ty == B32
                    || ty == I64
                    || ty == B64
                    || ty == R32
                    || ty == R64
            );
            Inst::load_constant(to_reg, value)
        }
    }
@@ -3202,6 +3210,10 @@ impl Inst {
                ALUOp::Asr64 => ("asr", OperandSize::Size64),
                ALUOp::Lsl32 => ("lsl", OperandSize::Size32),
                ALUOp::Lsl64 => ("lsl", OperandSize::Size64),
                ALUOp::Adc32 => ("adc", OperandSize::Size32),
                ALUOp::Adc64 => ("adc", OperandSize::Size64),
                ALUOp::Sbc32 => ("sbc", OperandSize::Size32),
                ALUOp::Sbc64 => ("sbc", OperandSize::Size64),
            }
        }
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -64,36 +64,118 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
            lower_constant_f64(ctx, rd, value);
        }
        Opcode::Iadd => {
-            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
+            match ty.unwrap() {
-            let ty = ty.unwrap();
+                ty if ty.is_vector() => {
-            if !ty.is_vector() {
+                    let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-                let mul_insn =
+                    let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
-                    if let Some(mul_insn) = maybe_input_insn(ctx, inputs[1], Opcode::Imul) {
+                    let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                    ctx.emit(Inst::VecRRR {
                        rd,
                        rn,
                        rm,
                        alu_op: VecALUOp::Add,
                        size: VectorSize::from_ty(ty),
                    });
                }
                I128 => {
                    let lhs = put_input_in_regs(ctx, inputs[0]);
                    let rhs = put_input_in_regs(ctx, inputs[1]);
                    let dst = get_output_reg(ctx, outputs[0]);
                    assert_eq!(lhs.len(), 2);
                    assert_eq!(rhs.len(), 2);
                    assert_eq!(dst.len(), 2);
                    // adds    x0, x0, x2
                    // adc     x1, x1, x3
                    ctx.emit(Inst::AluRRR {
                        alu_op: ALUOp::AddS64,
                        rd: dst.regs()[0],
                        rn: lhs.regs()[0],
                        rm: rhs.regs()[0],
                    });
                    ctx.emit(Inst::AluRRR {
                        alu_op: ALUOp::Adc64,
                        rd: dst.regs()[1],
                        rn: lhs.regs()[1],
                        rm: rhs.regs()[1],
                    });
                }
                ty => {
                    let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                    let mul_insn = if let Some(mul_insn) =
                        maybe_input_insn(ctx, inputs[1], Opcode::Imul)
                    {
                        Some((mul_insn, 0))
                    } else if let Some(mul_insn) = maybe_input_insn(ctx, inputs[0], Opcode::Imul) {
                        Some((mul_insn, 1))
                    } else {
                        None
                    };
-                // If possible combine mul + add into madd.
+                    // If possible combine mul + add into madd.
-                if let Some((insn, addend_idx)) = mul_insn {
+                    if let Some((insn, addend_idx)) = mul_insn {
-                    let alu_op = choose_32_64(ty, ALUOp3::MAdd32, ALUOp3::MAdd64);
+                        let alu_op = choose_32_64(ty, ALUOp3::MAdd32, ALUOp3::MAdd64);
-                    let rn_input = InsnInput { insn, input: 0 };
+                        let rn_input = InsnInput { insn, input: 0 };
-                    let rm_input = InsnInput { insn, input: 1 };
+                        let rm_input = InsnInput { insn, input: 1 };
-                    let rn = put_input_in_reg(ctx, rn_input, NarrowValueMode::None);
+                        let rn = put_input_in_reg(ctx, rn_input, NarrowValueMode::None);
-                    let rm = put_input_in_reg(ctx, rm_input, NarrowValueMode::None);
+                        let rm = put_input_in_reg(ctx, rm_input, NarrowValueMode::None);
-                    let ra = put_input_in_reg(ctx, inputs[addend_idx], NarrowValueMode::None);
+                        let ra = put_input_in_reg(ctx, inputs[addend_idx], NarrowValueMode::None);
-                    ctx.emit(Inst::AluRRRR {
+                        ctx.emit(Inst::AluRRRR {
-                        alu_op,
+                            alu_op,
-                        rd,
+                            rd,
-                        rn,
+                            rn,
-                        rm,
+                            rm,
-                        ra,
+                            ra,
-                    });
+                        });
-                } else {
+                    } else {
-                    let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+                        let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                        let (rm, negated) = put_input_in_rse_imm12_maybe_negated(
                            ctx,
                            inputs[1],
                            ty_bits(ty),
                            NarrowValueMode::None,
                        );
                        let alu_op = if !negated {
                            choose_32_64(ty, ALUOp::Add32, ALUOp::Add64)
                        } else {
                            choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64)
                        };
                        ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
                    }
                }
            }
        }
        Opcode::Isub => {
            let ty = ty.unwrap();
            if ty == I128 {
                let lhs = put_input_in_regs(ctx, inputs[0]);
                let rhs = put_input_in_regs(ctx, inputs[1]);
                let dst = get_output_reg(ctx, outputs[0]);
                assert_eq!(lhs.len(), 2);
                assert_eq!(rhs.len(), 2);
                assert_eq!(dst.len(), 2);
                // subs    x0, x0, x2
                // sbc     x1, x1, x3
                ctx.emit(Inst::AluRRR {
                    alu_op: ALUOp::SubS64,
                    rd: dst.regs()[0],
                    rn: lhs.regs()[0],
                    rm: rhs.regs()[0],
                });
                ctx.emit(Inst::AluRRR {
                    alu_op: ALUOp::Sbc64,
                    rd: dst.regs()[1],
                    rn: lhs.regs()[1],
                    rm: rhs.regs()[1],
                });
            } else {
                let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                if !ty.is_vector() {
                    let (rm, negated) = put_input_in_rse_imm12_maybe_negated(
                        ctx,
                        inputs[1],
@@ -101,50 +183,21 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                        NarrowValueMode::None,
                    );
                    let alu_op = if !negated {
                        choose_32_64(ty, ALUOp::Add32, ALUOp::Add64)
                    } else {
                        choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64)
                    } else {
                        choose_32_64(ty, ALUOp::Add32, ALUOp::Add64)
                    };
                    ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
                }
            } else {
                let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                ctx.emit(Inst::VecRRR {
                    rd,
                    rn,
                    rm,
                    alu_op: VecALUOp::Add,
                    size: VectorSize::from_ty(ty),
                });
            }
        }
        Opcode::Isub => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let ty = ty.unwrap();
            if !ty.is_vector() {
                let (rm, negated) = put_input_in_rse_imm12_maybe_negated(
                    ctx,
                    inputs[1],
                    ty_bits(ty),
                    NarrowValueMode::None,
                );
                let alu_op = if !negated {
                    choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64)
                } else {
-                    choose_32_64(ty, ALUOp::Add32, ALUOp::Add64)
+                    let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
-                };
+                    ctx.emit(Inst::VecRRR {
-                ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
+                        rd,
-            } else {
+                        rn,
-                let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
+                        rm,
-                ctx.emit(Inst::VecRRR {
+                        alu_op: VecALUOp::Sub,
-                    rd,
+                        size: VectorSize::from_ty(ty),
-                    rn,
+                    });
-                    rm,
+                }
                    alu_op: VecALUOp::Sub,
                    size: VectorSize::from_ty(ty),
                });
            }
        }
        Opcode::UaddSat | Opcode::SaddSat | Opcode::UsubSat | Opcode::SsubSat => {
@@ -191,21 +244,70 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
        }
        Opcode::Imul => {
-            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
+            let lhs = put_input_in_regs(ctx, inputs[0]);
-            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+            let rhs = put_input_in_regs(ctx, inputs[1]);
-            let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
+            let dst = get_output_reg(ctx, outputs[0]);
            let rd = dst.regs()[0];
            let rn = lhs.regs()[0];
            let rm = rhs.regs()[0];
            let ty = ty.unwrap();
-            if !ty.is_vector() {
+            match ty {
-                let alu_op = choose_32_64(ty, ALUOp3::MAdd32, ALUOp3::MAdd64);
+                I128 => {
-                ctx.emit(Inst::AluRRRR {
+                    assert_eq!(lhs.len(), 2);
-                    alu_op,
+                    assert_eq!(rhs.len(), 2);
-                    rd,
+                    assert_eq!(dst.len(), 2);
-                    rn,
+
-                    rm,
+                    // 128bit mul formula:
-                    ra: zero_reg(),
+                    //   dst_lo = lhs_lo * rhs_lo
-                });
+                    //   dst_hi = umulhi(lhs_lo, rhs_lo) + (lhs_lo * rhs_hi) + (lhs_hi * rhs_lo)
-            } else {
+                    //
-                if ty == I64X2 {
+                    // We can convert the above formula into the following
                    // umulh   dst_hi, lhs_lo, rhs_lo
                    // madd    dst_hi, lhs_lo, rhs_hi, dst_hi
                    // madd    dst_hi, lhs_hi, rhs_lo, dst_hi
                    // mul     dst_lo, lhs_lo, rhs_lo
                    ctx.emit(Inst::AluRRR {
                        alu_op: ALUOp::UMulH,
                        rd: dst.regs()[1],
                        rn: lhs.regs()[0],
                        rm: rhs.regs()[0],
                    });
                    ctx.emit(Inst::AluRRRR {
                        alu_op: ALUOp3::MAdd64,
                        rd: dst.regs()[1],
                        rn: lhs.regs()[0],
                        rm: rhs.regs()[1],
                        ra: dst.regs()[1].to_reg(),
                    });
                    ctx.emit(Inst::AluRRRR {
                        alu_op: ALUOp3::MAdd64,
                        rd: dst.regs()[1],
                        rn: lhs.regs()[1],
                        rm: rhs.regs()[0],
                        ra: dst.regs()[1].to_reg(),
                    });
                    ctx.emit(Inst::AluRRRR {
                        alu_op: ALUOp3::MAdd64,
                        rd: dst.regs()[0],
                        rn: lhs.regs()[0],
                        rm: rhs.regs()[0],
                        ra: zero_reg(),
                    });
                }
                ty if !ty.is_vector() => {
                    let alu_op = choose_32_64(ty, ALUOp3::MAdd32, ALUOp3::MAdd64);
                    ctx.emit(Inst::AluRRRR {
                        alu_op,
                        rd,
                        rn,
                        rm,
                        ra: zero_reg(),
                    });
                }
                I64X2 => {
                    let tmp1 = ctx.alloc_tmp(I64X2).only_reg().unwrap();
                    let tmp2 = ctx.alloc_tmp(I64X2).only_reg().unwrap();
@@ -310,7 +412,8 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                        rm: tmp1.to_reg(),
                        size: VectorSize::Size32x2,
                    });
-                } else {
+                }
                ty if ty.is_vector() => {
                    ctx.emit(Inst::VecRRR {
                        alu_op: VecALUOp::Mul,
                        rd,
@@ -319,6 +422,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                        size: VectorSize::from_ty(ty),
                    });
                }
                _ => panic!("Unable to emit mul for {}", ty),
            }
        }
--- a/cranelift/filetests/filetests/isa/aarch64/arithmetic-run.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/arithmetic-run.clif
@@ -0,0 +1,78 @@
 test run
 target aarch64
 ; i128 tests
 ; TODO: Clean up these tests when we have native support for i128 immediates in CLIF's parser
 ; Materializes the i128 constant 0, splits it with isplit, and returns the two
 ; i64 halves so the result can be compared against a pair of i64 values.
 function %i128_const_0() -> i64, i64 {
 block0:
    v1 = iconst.i128 0
    v2, v3 = isplit v1
    return v2, v3
 }
 ; run: %i128_const_0() == [0, 0]
 ; Builds two i128 operands from the i64 pairs (v0, v1) and (v2, v3) with
 ; iconcat, adds them, and returns the two i64 halves of the sum via isplit.
 ; The expectations that follow (for example (-1, 0) + (1, 0) giving [0, 1])
 ; show that the first element of each pair is the low half, with the carry
 ; propagating into the second element.
 function %add_i128(i64, i64, i64, i64) -> i64, i64 {
 block0(v0: i64,v1: i64,v2: i64,v3: i64):
    v4 = iconcat v0, v1
    v5 = iconcat v2, v3
    v6 = iadd v4, v5
    v7, v8 = isplit v6
    return v7, v8
 }
 ; run: %add_i128(0, 0, 0, 0) == [0, 0]
 ; run: %add_i128(0, -1, -1, 0) == [-1, -1]
 ; run: %add_i128(1, 0, 0, 0) == [1, 0]
 ; run: %add_i128(1, 0, 1, 0) == [2, 0]
 ; run: %add_i128(1, 0, -1, -1) == [0, 0]
 ; run: %add_i128(-1, 0, 1, 0) == [0, 1]
 ; run: %add_i128(0x01234567_89ABCDEF, 0x01234567_89ABCDEF, 0xFEDCBA98_76543210, 0xFEDCBA98_76543210) == [-1, -1]
 ; run: %add_i128(0x06060606_06060606, 0xA00A00A0_0A00A00A, 0x30303030_30303030, 0x0BB0BB0B_B0BB0BB0) == [0x36363636_36363636, 0xABBABBAB_BABBABBA]
 ; run: %add_i128(0xC0FFEEEE_C0FFEEEE, 0xC0FFEEEE_C0FFEEEE, 0x1DCB1111_1DCB1111, 0x1DCB1111_1DCB1111) == [0xDECAFFFF_DECAFFFF, 0xDECAFFFF_DECAFFFF]
 ; Builds two i128 operands from the i64 pairs (v0, v1) and (v2, v3) with
 ; iconcat, subtracts the second from the first, and returns the two i64 halves
 ; of the difference via isplit.
 ; The expectations that follow (for example (0, 0) - (1, 0) giving [-1, -1])
 ; exercise the borrow from the first element of the pair into the second.
 function %sub_i128(i64, i64, i64, i64) -> i64, i64 {
 block0(v0: i64,v1: i64,v2: i64,v3: i64):
    v4 = iconcat v0, v1
    v5 = iconcat v2, v3
    v6 = isub v4, v5
    v7, v8 = isplit v6
    return v7, v8
 }
 ; run: %sub_i128(0, 0, 0, 0) == [0, 0]
 ; run: %sub_i128(1, 0, 1, 0) == [0, 0]
 ; run: %sub_i128(1, 0, 0, 0) == [1, 0]
 ; run: %sub_i128(0, 0, 1, 0) == [-1, -1]
 ; run: %sub_i128(0, 0, -1, -1) == [1, 0]
 ; run: %sub_i128(-1, -1, 0xFEDCBA98_76543210, 0xFEDCBA98_76543210) == [0x01234567_89ABCDEF, 0x01234567_89ABCDEF]
 ; run: %sub_i128(0x36363636_36363636, 0xABBABBAB_BABBABBA, 0x30303030_30303030, 0x0BB0BB0B_B0BB0BB0) == [0x06060606_06060606, 0xA00A00A0_0A00A00A]
 ; run: %sub_i128(0xDECAFFFF_DECAFFFF, 0xDECAFFFF_DECAFFFF, 0x1DCB1111_1DCB1111, 0x1DCB1111_1DCB1111) == [0xC0FFEEEE_C0FFEEEE, 0xC0FFEEEE_C0FFEEEE]
 ; Builds two i128 operands from the i64 pairs (v0, v1) and (v2, v3) with
 ; iconcat, multiplies them, and returns the two i64 halves of the product via
 ; isplit. Only 128 bits of the product are returned, as two i64 values.
 function %mul_i128(i64, i64, i64, i64) -> i64, i64 {
 block0(v0: i64,v1: i64,v2: i64,v3: i64):
    v4 = iconcat v0, v1
    v5 = iconcat v2, v3
    v6 = imul v4, v5
    v7, v8 = isplit v6
    return v7, v8
 }
 ; run: %mul_i128(0, 0, 0, 0) == [0, 0]
 ; run: %mul_i128(1, 0, 1, 0) == [1, 0]
 ; run: %mul_i128(1, 0, 0, 0) == [0, 0]
 ; run: %mul_i128(0, 0, 1, 0) == [0, 0]
 ; run: %mul_i128(2, 0, 1, 0) == [2, 0]
 ; run: %mul_i128(2, 0, 2, 0) == [4, 0]
 ; run: %mul_i128(1, 0, -1, -1) == [-1, -1]
 ; run: %mul_i128(2, 0, -1, -1) == [-2, -1]
 ; run: %mul_i128(0x01010101_01010101, 0x01010101_01010101, 13, 0) == [0x0D0D0D0D_0D0D0D0D, 0x0D0D0D0D_0D0D0D0D]
 ; run: %mul_i128(13, 0, 0x01010101_01010101, 0x01010101_01010101) == [0x0D0D0D0D_0D0D0D0D, 0x0D0D0D0D_0D0D0D0D]
 ; run: %mul_i128(0x00000000_01234567, 0x89ABCDEF_00000000, 0x00000000_FEDCBA98, 0x76543210_00000000) == [0x0121FA00_23E20B28, 0xE2946058_00000000]
 ; run: %mul_i128(0xC0FFEEEE_C0FFEEEE, 0xC0FFEEEE_C0FFEEEE, 0xDECAFFFF_DECAFFFF, 0xDECAFFFF_DECAFFFF) == [0xDB6B1E48_19BA1112, 0x5ECD38B5_9D1C2B7E]
--- a/cranelift/filetests/filetests/isa/aarch64/arithmetic.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/arithmetic.clif
@@ -425,3 +425,46 @@ block0(v0: i8x16):
 ; nextln: ushl v0.16b, v0.16b, v1.16b
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
 ; Adds two i128 arguments directly. The expected output after this function is
 ; an adds on the first register pair followed by an adc on the second.
 function %add_i128(i128, i128) -> i128 {
 block0(v0: i128, v1: i128):
    v2 = iadd v0, v1
    return v2
 }
 ; check:  stp fp, lr, [sp, #-16]!
 ; nextln: mov fp, sp
 ; nextln: adds x0, x0, x2
 ; nextln: adc x1, x1, x3
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
 ; Subtracts the second i128 argument from the first. The expected output after
 ; this function is a subs on the first register pair followed by an sbc on the
 ; second.
 function %sub_i128(i128, i128) -> i128 {
 block0(v0: i128, v1: i128):
    v2 = isub v0, v1
    return v2
 }
 ; check:  stp fp, lr, [sp, #-16]!
 ; nextln: mov fp, sp
 ; nextln: subs x0, x0, x2
 ; nextln: sbc x1, x1, x3
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
 ; Multiplies two i128 arguments. The expected output after this function is a
 ; umulh followed by three madd instructions, the last of which uses xzr as its
 ; addend.
 function %mul_i128(i128, i128) -> i128 {
 block0(v0: i128, v1: i128):
    v2 = imul v0, v1
    return v2
 }
 ; check:  stp fp, lr, [sp, #-16]!
 ; nextln: mov fp, sp
 ; nextln: umulh x4, x0, x2
 ; nextln: madd x3, x0, x3, x4
 ; nextln: madd x1, x1, x2, x3
 ; nextln: madd x0, x0, x2, xzr
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret