From a2e74b2c4564d4d156fcbe3715e022025495e7c7 Mon Sep 17 00:00:00 2001
From: Afonso Bordado <afonsobordado@az8.co>
Date: Sat, 22 May 2021 21:22:05 +0100
Subject: [PATCH] aarch64: Implement isub for i128 operands

---
 .../codegen/src/isa/aarch64/inst/emit.rs      |  2 +
 .../src/isa/aarch64/inst/emit_tests.rs        | 21 ++++++
 cranelift/codegen/src/isa/aarch64/inst/mod.rs |  5 ++
 .../codegen/src/isa/aarch64/lower_inst.rs     | 74 ++++++++++++-------
 .../filetests/isa/aarch64/arithmetic-run.clif | 21 +++++-
 .../filetests/isa/aarch64/arithmetic.clif     | 13 ++++
 6 files changed, 109 insertions(+), 27 deletions(-)
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
index 89c4f88b4b..60fedcd0d3 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -601,6 +601,8 @@ impl MachInstEmit for Inst {
                     ALUOp::Adc64 => 0b10011010_000,
                     ALUOp::Sub32 => 0b01001011_000,
                     ALUOp::Sub64 => 0b11001011_000,
+                    ALUOp::Sbc32 => 0b01011010_000,
+                    ALUOp::Sbc64 => 0b11011010_000,
                     ALUOp::Orr32 => 0b00101010_000,
                     ALUOp::Orr64 => 0b10101010_000,
                     ALUOp::And32 => 0b00001010_000,
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
index e568829955..530269b201 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
@@ -90,6 +90,27 @@ fn test_aarch64_binemit() {
         "A40006CB",
         "sub x4, x5, x6",
     ));
+    insns.push((
+        Inst::AluRRR {
+            alu_op: ALUOp::Sbc32,
+            rd: writable_xreg(1),
+            rn: xreg(2),
+            rm: xreg(3),
+        },
+        "4100035A",
+        "sbc w1, w2, w3",
+    ));
+    insns.push((
+        Inst::AluRRR {
+            alu_op: ALUOp::Sbc64,
+            rd: writable_xreg(4),
+            rn: xreg(5),
+            rm: xreg(6),
+        },
+        "A40006DA",
+        "sbc x4, x5, x6",
+    ));
+
     insns.push((
         Inst::AluRRR {
             alu_op: ALUOp::Orr32,
diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
index 15073a8247..ecdf43c6ff 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -87,6 +87,9 @@ pub enum ALUOp {
     /// Add with carry
     Adc32,
     Adc64,
+    /// Subtract with carry
+    Sbc32,
+    Sbc64,
 }
 
 /// An ALU operation with three arguments.
@@ -3209,6 +3212,8 @@ impl Inst {
                 ALUOp::Lsl64 => ("lsl", OperandSize::Size64),
                 ALUOp::Adc32 => ("adc", OperandSize::Size32),
                 ALUOp::Adc64 => ("adc", OperandSize::Size64),
+                ALUOp::Sbc32 => ("sbc", OperandSize::Size32),
+                ALUOp::Sbc64 => ("sbc", OperandSize::Size64),
             }
         }
 
diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
index d6b8933fc4..2a3ee88fcc 100644
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -85,10 +85,9 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     assert_eq!(rhs.len(), 2);
                     assert_eq!(dst.len(), 2);
 
-                    // adds    x0, x0, x1
+                    // adds    x0, x0, x2
                     // adc     x1, x1, x3
 
-                    // Add lower
                     ctx.emit(Inst::AluRRR {
                         alu_op: ALUOp::AddS64,
                         rd: dst.regs()[0],
@@ -149,31 +148,56 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             }
         }
         Opcode::Isub => {
-            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let ty = ty.unwrap();
-            if !ty.is_vector() {
-                let (rm, negated) = put_input_in_rse_imm12_maybe_negated(
-                    ctx,
-                    inputs[1],
-                    ty_bits(ty),
-                    NarrowValueMode::None,
-                );
-                let alu_op = if !negated {
-                    choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64)
-                } else {
-                    choose_32_64(ty, ALUOp::Add32, ALUOp::Add64)
-                };
-                ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
-            } else {
-                let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
-                ctx.emit(Inst::VecRRR {
-                    rd,
-                    rn,
-                    rm,
-                    alu_op: VecALUOp::Sub,
-                    size: VectorSize::from_ty(ty),
+            if ty == I128 {
+                let lhs = put_input_in_regs(ctx, inputs[0]);
+                let rhs = put_input_in_regs(ctx, inputs[1]);
+                let dst = get_output_reg(ctx, outputs[0]);
+                assert_eq!(lhs.len(), 2);
+                assert_eq!(rhs.len(), 2);
+                assert_eq!(dst.len(), 2);
+
+                // subs    x0, x0, x2
+                // sbc     x1, x1, x3
+
+                ctx.emit(Inst::AluRRR {
+                    alu_op: ALUOp::SubS64,
+                    rd: dst.regs()[0],
+                    rn: lhs.regs()[0],
+                    rm: rhs.regs()[0],
                 });
+                ctx.emit(Inst::AluRRR {
+                    alu_op: ALUOp::Sbc64,
+                    rd: dst.regs()[1],
+                    rn: lhs.regs()[1],
+                    rm: rhs.regs()[1],
+                });
+            } else {
+                let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
+                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+                if !ty.is_vector() {
+                    let (rm, negated) = put_input_in_rse_imm12_maybe_negated(
+                        ctx,
+                        inputs[1],
+                        ty_bits(ty),
+                        NarrowValueMode::None,
+                    );
+                    let alu_op = if !negated {
+                        choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64)
+                    } else {
+                        choose_32_64(ty, ALUOp::Add32, ALUOp::Add64)
+                    };
+                    ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
+                } else {
+                    let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
+                    ctx.emit(Inst::VecRRR {
+                        rd,
+                        rn,
+                        rm,
+                        alu_op: VecALUOp::Sub,
+                        size: VectorSize::from_ty(ty),
+                    });
+                }
             }
         }
         Opcode::UaddSat | Opcode::SaddSat | Opcode::UsubSat | Opcode::SsubSat => {
diff --git a/cranelift/filetests/filetests/isa/aarch64/arithmetic-run.clif b/cranelift/filetests/filetests/isa/aarch64/arithmetic-run.clif
index 01bfdb115a..90fdd43e41 100644
--- a/cranelift/filetests/filetests/isa/aarch64/arithmetic-run.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/arithmetic-run.clif
@@ -2,7 +2,7 @@ test run
 target aarch64
 
 ; i128 tests
-; TODO: It would be nice if we had native support for i128 immediates in CLIF's parser
+; TODO: Clean up these tests when we have native support for i128 immediates in CLIF's parser
 function %i128_const_0() -> i64, i64 {
 block0:
     v1 = iconst.i128 0
@@ -36,4 +36,21 @@ block0(v0: i64,v1: i64,v2: i64,v3: i64):
 ; run: %add_i128(1, 0, 0, 0) == [1, 0]
 ; run: %add_i128(1, 0, 1, 0) == [2, 0]
 ; run: %add_i128(1, 0, -1, -1) == [0, 0]
-; run: %add_i128(-1, 0, 1, 0) == [0, 1]
\ No newline at end of file
+; run: %add_i128(-1, 0, 1, 0) == [0, 1]
+
+
+function %sub_i128(i64, i64, i64, i64) -> i64, i64 {
+block0(v0: i64,v1: i64,v2: i64,v3: i64):
+    v4 = iconcat v0, v1
+    v5 = iconcat v2, v3
+
+    v6 = isub v4, v5
+
+    v7, v8 = isplit v6
+    return v7, v8
+}
+; run: %sub_i128(0, 0, 0, 0) == [0, 0]
+; run: %sub_i128(1, 0, 1, 0) == [0, 0]
+; run: %sub_i128(1, 0, 0, 0) == [1, 0]
+; run: %sub_i128(0, 0, 1, 0) == [-1, -1]
+; run: %sub_i128(0, 0, -1, -1) == [1, 0]
\ No newline at end of file
diff --git a/cranelift/filetests/filetests/isa/aarch64/arithmetic.clif b/cranelift/filetests/filetests/isa/aarch64/arithmetic.clif
index 91d4987d89..edd25c0023 100644
--- a/cranelift/filetests/filetests/isa/aarch64/arithmetic.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/arithmetic.clif
@@ -440,3 +440,16 @@ block0(v0: i128, v1: i128):
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
 
+function %sub_i128(i128, i128) -> i128 {
+block0(v0: i128, v1: i128):
+    v2 = isub v0, v1
+    return v2
+}
+
+; check:  stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: subs x0, x0, x2
+; nextln: sbc x1, x1, x3
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+