From f020f0812e4b3dbab75915cdf723eaf212577fee Mon Sep 17 00:00:00 2001
From: Joey Gouly
Date: Fri, 24 Apr 2020 11:39:39 +0100
Subject: [PATCH] arm64: Implement checks in division / remainder

This implements the divide by 0 and signed overflow checks that Wasm
specifies.

Copyright (c) 2020, Arm Limited.
---
 cranelift/codegen/src/isa/aarch64/lower.rs    | 101 ++++++++++--
 .../filetests/vcode/aarch64/arithmetic.clif   | 146 +++++++++++++++++-
 2 files changed, 229 insertions(+), 18 deletions(-)

diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs
index 56962d9110..e9e72dd6d5 100644
--- a/cranelift/codegen/src/isa/aarch64/lower.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower.rs
@@ -917,6 +917,8 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRInst) {
             } else {
                 NarrowValueMode::ZeroExtend64
             };
+            // TODO: Add SDiv32 to implement 32-bit directly, rather
+            // than extending the input.
             let div_op = if is_signed {
                 ALUOp::SDiv64
             } else {
@@ -925,16 +927,19 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRInst) {
 
             let rd = output_to_reg(ctx, outputs[0]);
             let rn = input_to_reg(ctx, inputs[0], narrow_mode);
-            if !is_rem {
-                let rm = input_to_reg(ctx, inputs[1], narrow_mode);
-                ctx.emit(Inst::AluRRR {
-                    alu_op: div_op,
-                    rd,
-                    rn,
-                    rm,
-                });
-            } else {
-                let rm = input_to_reg(ctx, inputs[1], narrow_mode);
+            let rm = input_to_reg(ctx, inputs[1], narrow_mode);
+            // The div instruction does not trap on divide by zero or signed overflow
+            // so checks are inserted below.
+            //
+            //   div rd, rn, rm
+            ctx.emit(Inst::AluRRR {
+                alu_op: div_op,
+                rd,
+                rn,
+                rm,
+            });
+
+            if is_rem {
                 // Remainder (rn % rm) is implemented as:
                 //
                 //   tmp = rn / rm
@@ -943,13 +948,20 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRInst) {
                 // use 'rd' for tmp and you have:
                 //
                 //   div rd, rn, rm       ; rd = rn / rm
+                //   cbnz rm, #8          ; branch over trap
+                //   udf                  ; divide by zero
                 //   msub rd, rd, rm, rn  ; rd = rn - rd * rm
-                ctx.emit(Inst::AluRRR {
-                    alu_op: div_op,
-                    rd,
-                    rn,
-                    rm,
+
+                // Check for divide by 0.
+                let branch_size = 8;
+                ctx.emit(Inst::CondBrLowered {
+                    target: BranchTarget::ResolvedOffset(branch_size),
+                    kind: CondBrKind::NotZero(rm),
                 });
+
+                let trap_info = (ctx.srcloc(insn), TrapCode::IntegerDivisionByZero);
+                ctx.emit(Inst::Udf { trap_info });
+
                 ctx.emit(Inst::AluRRRR {
                     alu_op: ALUOp::MSub64,
                     rd: rd,
@@ -957,6 +969,65 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRInst) {
                     rm: rm,
                     ra: rn,
                 });
+            } else {
+                if div_op == ALUOp::SDiv64 {
+                    //   cbz rm, #20
+                    //   cmn rm, 1
+                    //   ccmp rn, 1, #nzcv, eq
+                    //   b.vc 12
+                    //   udf ; signed overflow
+                    //   udf ; divide by zero
+
+                    // Check for divide by 0.
+                    let branch_size = 20;
+                    ctx.emit(Inst::CondBrLowered {
+                        target: BranchTarget::ResolvedOffset(branch_size),
+                        kind: CondBrKind::Zero(rm),
+                    });
+
+                    // Check for signed overflow. The only case is min_value / -1.
+                    let ty = ty.unwrap();
+                    // The following checks must be done in 32-bit or 64-bit, depending
+                    // on the input type. Even though the initial div instruction is
+                    // always done in 64-bit currently.
+                    let size = InstSize::from_ty(ty);
+                    // Check RHS is -1.
+                    ctx.emit(Inst::AluRRImm12 {
+                        alu_op: choose_32_64(ty, ALUOp::AddS32, ALUOp::AddS64),
+                        rd: writable_zero_reg(),
+                        rn: rm,
+                        imm12: Imm12::maybe_from_u64(1).unwrap(),
+                    });
+                    // Check LHS is min_value, by subtracting 1 and branching if
+                    // there is overflow.
+                    ctx.emit(Inst::CCmpImm {
+                        size,
+                        rn,
+                        imm: UImm5::maybe_from_u8(1).unwrap(),
+                        nzcv: NZCV::new(false, false, false, false),
+                        cond: Cond::Eq,
+                    });
+                    ctx.emit(Inst::CondBrLowered {
+                        target: BranchTarget::ResolvedOffset(12),
+                        kind: CondBrKind::Cond(Cond::Vc),
+                    });
+
+                    let trap_info = (ctx.srcloc(insn), TrapCode::IntegerOverflow);
+                    ctx.emit(Inst::Udf { trap_info });
+                } else {
+                    //   cbnz rm, #8
+                    //   udf ; divide by zero
+
+                    // Check for divide by 0.
+                    let branch_size = 8;
+                    ctx.emit(Inst::CondBrLowered {
+                        target: BranchTarget::ResolvedOffset(branch_size),
+                        kind: CondBrKind::NotZero(rm),
+                    });
+                }
+
+                let trap_info = (ctx.srcloc(insn), TrapCode::IntegerDivisionByZero);
+                ctx.emit(Inst::Udf { trap_info });
             }
         }
 
diff --git a/cranelift/filetests/filetests/vcode/aarch64/arithmetic.clif b/cranelift/filetests/filetests/vcode/aarch64/arithmetic.clif
index 1f6dcf6b82..08ecb31d35 100644
--- a/cranelift/filetests/filetests/vcode/aarch64/arithmetic.clif
+++ b/cranelift/filetests/filetests/vcode/aarch64/arithmetic.clif
@@ -75,7 +75,14 @@ block0(v0: i64, v1: i64):
 
 ; check: stp fp, lr, [sp, #-16]!
 ; nextln: mov fp, sp
-; nextln: sdiv x0, x0, x1
+; nextln: sdiv x2, x0, x1
+; nextln: cbz x1, 20
+; nextln: adds xzr, x1, #1
+; nextln: ccmp x0, #1, #nzcv, eq
+; nextln: b.vc 12
+; nextln: udf
+; nextln: udf
+; nextln: mov x0, x2
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
@@ -89,8 +96,15 @@ block0(v0: i64):
 
 ; check: stp fp, lr, [sp, #-16]!
 ; nextln: mov fp, sp
-; nextln: movz x1, #2
-; nextln: sdiv x0, x0, x1
+; nextln: movz x2, #2
+; nextln: sdiv x1, x0, x2
+; nextln: cbz x2, 20
+; nextln: adds xzr, x2, #1
+; nextln: ccmp x0, #1, #nzcv, eq
+; nextln: b.vc 12
+; nextln: udf
+; nextln: udf
+; nextln: mov x0, x1
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
@@ -104,6 +118,8 @@ block0(v0: i64, v1: i64):
 ; check: stp fp, lr, [sp, #-16]!
 ; nextln: mov fp, sp
 ; nextln: udiv x0, x0, x1
+; nextln: cbnz x1, 8
+; nextln: udf
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
@@ -119,6 +135,8 @@ block0(v0: i64):
 ; nextln: mov fp, sp
 ; nextln: movz x1, #2
 ; nextln: udiv x0, x0, x1
+; nextln: cbnz x1, 8
+; nextln: udf
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
@@ -132,6 +150,8 @@ block0(v0: i64, v1: i64):
 ; check: stp fp, lr, [sp, #-16]!
 ; nextln: mov fp, sp
 ; nextln: sdiv x2, x0, x1
+; nextln: cbnz x1, 8
+; nextln: udf
 ; nextln: msub x0, x2, x1, x0
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
@@ -146,6 +166,126 @@ block0(v0: i64, v1: i64):
 ; check: stp fp, lr, [sp, #-16]!
 ; nextln: mov fp, sp
 ; nextln: udiv x2, x0, x1
+; nextln: cbnz x1, 8
+; nextln: udf
+; nextln: msub x0, x2, x1, x0
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+
+function %f(i32, i32) -> i32 {
+block0(v0: i32, v1: i32):
+  v2 = sdiv.i32 v0, v1
+  return v2
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: sxtw x3, w0
+; nextln: sxtw x2, w1
+; nextln: sdiv x0, x3, x2
+; nextln: cbz x2, 20
+; nextln: adds wzr, w2, #1
+; nextln: ccmp w3, #1, #nzcv, eq
+; nextln: b.vc 12
+; nextln: udf
+; nextln: udf
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f(i32) -> i32 {
+block0(v0: i32):
+  v1 = iconst.i32 2
+  v2 = sdiv.i32 v0, v1
+  return v2
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: mov x1, x0
+; nextln: movz x0, #2
+; nextln: sxtw x1, w1
+; nextln: sxtw x2, w0
+; nextln: sdiv x0, x1, x2
+; nextln: cbz x2, 20
+; nextln: adds wzr, w2, #1
+; nextln: ccmp w1, #1, #nzcv, eq
+; nextln: b.vc 12
+; nextln: udf
+; nextln: udf
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f(i32, i32) -> i32 {
+block0(v0: i32, v1: i32):
+  v2 = udiv.i32 v0, v1
+  return v2
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: mov w0, w0
+; nextln: mov w1, w1
+; nextln: udiv x0, x0, x1
+; nextln: cbnz x1, 8
+; nextln: udf
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f(i32) -> i32 {
+block0(v0: i32):
+  v1 = iconst.i32 2
+  v2 = udiv.i32 v0, v1
+  return v2
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: movz x1, #2
+; nextln: mov w0, w0
+; nextln: mov w1, w1
+; nextln: udiv x0, x0, x1
+; nextln: cbnz x1, 8
+; nextln: udf
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f(i32, i32) -> i32 {
+block0(v0: i32, v1: i32):
+  v2 = srem.i32 v0, v1
+  return v2
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: sxtw x0, w0
+; nextln: sxtw x1, w1
+; nextln: sdiv x2, x0, x1
+; nextln: cbnz x1, 8
+; nextln: udf
+; nextln: msub x0, x2, x1, x0
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f(i32, i32) -> i32 {
+block0(v0: i32, v1: i32):
+  v2 = urem.i32 v0, v1
+  return v2
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: mov w0, w0
+; nextln: mov w1, w1
+; nextln: udiv x2, x0, x1
+; nextln: cbnz x1, 8
+; nextln: udf
 ; nextln: msub x0, x2, x1, x0
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
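
The sketch below is not part of the patch; it is a minimal, hypothetical Rust illustration of the trap semantics the new lowering encodes for signed division: a trap on division by zero (the cbz/cbnz over udf sequence) and a trap on signed overflow, whose only case is min_value / -1 (the cmn, ccmp, b.vc over udf sequence). The helper name sdiv64_checked is invented for this example.

    // Illustrative only: mirrors the two guard conditions emitted before the
    // hardware sdiv result is used (the sdiv instruction itself never traps).
    fn sdiv64_checked(rn: i64, rm: i64) -> Result<i64, &'static str> {
        if rm == 0 {
            // Lowered as a cbz/cbnz over a udf (TrapCode::IntegerDivisionByZero).
            return Err("integer division by zero");
        }
        if rn == i64::MIN && rm == -1 {
            // The only overflowing case; lowered as cmn rm, 1 / ccmp rn, 1 /
            // b.vc over a udf (TrapCode::IntegerOverflow).
            return Err("integer overflow");
        }
        Ok(rn / rm)
    }

    fn main() {
        assert_eq!(sdiv64_checked(7, -2), Ok(-3));
        assert!(sdiv64_checked(1, 0).is_err());
        assert!(sdiv64_checked(i64::MIN, -1).is_err());
    }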