From 62e7b7f8382505d4b1d0a000c2ad89b3e7d76a02 Mon Sep 17 00:00:00 2001
From: Joey Gouly
Date: Thu, 2 Jul 2020 13:17:33 +0100
Subject: [PATCH] arm64: Implement basic SIMD arithmetic

Copyright (c) 2020, Arm Limited.
---
 build.rs                                             |   3 +
 .../codegen/src/isa/aarch64/inst/emit.rs             |  29 ++-
 .../src/isa/aarch64/inst/emit_tests.rs               | 176 ++++++++++++++++++
 cranelift/codegen/src/isa/aarch64/inst/mod.rs        |  21 ++-
 .../codegen/src/isa/aarch64/lower_inst.rs            |  78 ++++++--
 5 files changed, 277 insertions(+), 30 deletions(-)

diff --git a/build.rs b/build.rs
index fa89812ed9..6e3a93502f 100644
--- a/build.rs
+++ b/build.rs
@@ -186,8 +186,11 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
         ("simd", "simd_boolean") => return false,
         ("simd", "simd_f32x4_cmp") => return false,
         ("simd", "simd_f64x2_cmp") => return false,
+        ("simd", "simd_i8x16_arith") => return false,
         ("simd", "simd_i8x16_cmp") => return false,
+        ("simd", "simd_i16x8_arith") => return false,
         ("simd", "simd_i16x8_cmp") => return false,
+        ("simd", "simd_i32x4_arith") => return false,
         ("simd", "simd_i32x4_cmp") => return false,
         ("simd", "simd_load_extend") => return false,
         ("simd", "simd_load_splat") => return false,
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
index abb9aa0045..a075401555 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -355,10 +355,11 @@ fn enc_fround(top22: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
     (top22 << 10) | (machreg_to_vec(rn) << 5) | machreg_to_vec(rd.to_reg())
 }
 
-fn enc_vec_rr_misc(bits_12_16: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
+fn enc_vec_rr_misc(size: u32, bits_12_16: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
+    debug_assert_eq!(size & 0b11, size);
     debug_assert_eq!(bits_12_16 & 0b11111, bits_12_16);
     let bits = 0b0_1_1_01110_00_10000_00000_10_00000_00000;
-    bits | bits_12_16 << 12 | machreg_to_vec(rn) << 5 | machreg_to_vec(rd.to_reg())
+    bits | size << 22 | bits_12_16 << 12 | machreg_to_vec(rn) << 5 | machreg_to_vec(rd.to_reg())
 }
 
 fn enc_vec_lanes(q: u32, u: u32, size: u32, opcode: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
@@ -1067,13 +1068,24 @@ impl MachInstEmit for Inst {
                 sink.put4(enc_fpurrrr(top17, rd, rn, rm, ra));
             }
            &Inst::VecMisc { op, rd, rn, ty } => {
-                let bits_12_16 = match op {
+                let enc_size = match ty {
+                    I8X16 => 0b00,
+                    I16X8 => 0b01,
+                    I32X4 => 0b10,
+                    I64X2 => 0b11,
+                    _ => 0,
+                };
+                let (bits_12_16, size) = match op {
                     VecMisc2::Not => {
                         debug_assert_eq!(128, ty_bits(ty));
-                        0b00101
+                        (0b00101, 0b00)
+                    }
+                    VecMisc2::Neg => {
+                        debug_assert_eq!(128, ty_bits(ty));
+                        (0b01011, enc_size)
                     }
                 };
-                sink.put4(enc_vec_rr_misc(bits_12_16, rd, rn));
+                sink.put4(enc_vec_rr_misc(size, bits_12_16, rd, rn));
             }
             &Inst::VecLanes { op, rd, rn, ty } => {
                 let (q, size) = match ty {
@@ -1277,6 +1289,7 @@ impl MachInstEmit for Inst {
                     I8X16 => 0b00,
                     I16X8 => 0b01,
                     I32X4 => 0b10,
+                    I64X2 => 0b11,
                     _ => 0,
                 };
                 let enc_size_for_fcmp = match ty {
@@ -1333,6 +1346,12 @@ impl MachInstEmit for Inst {
                         (0b011_01110_01_1, 0b000111)
                     }
                     VecALUOp::Umaxp => (0b011_01110_00_1 | enc_size << 1, 0b101001),
+                    VecALUOp::Add => (0b010_01110_00_1 | enc_size << 1, 0b100001),
+                    VecALUOp::Sub => (0b011_01110_00_1 | enc_size << 1, 0b100001),
+                    VecALUOp::Mul => {
+                        debug_assert_ne!(I64X2, ty);
+                        (0b010_01110_00_1 | enc_size << 1, 0b100111)
+                    }
                 };
                 sink.put4(enc_vec_rrr(top11, rm, bit15_10, rn, rd));
             }
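Note: the hunks above reuse the three-register vector encoding (enc_vec_rrr, not shown in this diff) for the new ALU ops. As a sanity check, that layout can be re-derived standalone. The helper below is an illustrative sketch of the assumed bit layout (top11 | Rm | bits 15..10 | Rn | Rd), not Cranelift's actual enc_vec_rrr:

    // Illustrative sketch only: re-derive the three-register vector encoding
    // with the layout assumed above: top11 | Rm | bits 15..10 | Rn | Rd.
    fn enc_vec_rrr_sketch(top11: u32, rm: u32, bits_15_10: u32, rn: u32, rd: u32) -> u32 {
        (top11 << 21) | (rm << 16) | (bits_15_10 << 10) | (rn << 5) | rd
    }

    fn main() {
        // ADD Vd.16B, Vn.16B, Vm.16B: Q=1, U=0, size=00 -> top11 = 0b010_01110_00_1,
        // bits 15..10 = 0b100001. With Rd=5, Rn=1, Rm=1 this is the instruction word
        // behind the "2584214E" test bytes added in emit_tests.rs below.
        assert_eq!(0x4E21_8425, enc_vec_rrr_sketch(0b010_01110_00_1, 1, 0b100001, 1, 5));
    }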
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
index aaf4cfbae3..01786f13af 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
@@ -2341,6 +2341,138 @@ fn test_aarch64_binemit() {
         "umaxp v1.4s, v20.4s, v16.4s",
     ));
 
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Add,
+            rd: writable_vreg(5),
+            rn: vreg(1),
+            rm: vreg(1),
+            ty: I8X16,
+        },
+        "2584214E",
+        "add v5.16b, v1.16b, v1.16b",
+    ));
+
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Add,
+            rd: writable_vreg(7),
+            rn: vreg(13),
+            rm: vreg(2),
+            ty: I16X8,
+        },
+        "A785624E",
+        "add v7.8h, v13.8h, v2.8h",
+    ));
+
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Add,
+            rd: writable_vreg(18),
+            rn: vreg(9),
+            rm: vreg(6),
+            ty: I32X4,
+        },
+        "3285A64E",
+        "add v18.4s, v9.4s, v6.4s",
+    ));
+
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Add,
+            rd: writable_vreg(1),
+            rn: vreg(3),
+            rm: vreg(2),
+            ty: I64X2,
+        },
+        "6184E24E",
+        "add v1.2d, v3.2d, v2.2d",
+    ));
+
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Sub,
+            rd: writable_vreg(5),
+            rn: vreg(1),
+            rm: vreg(1),
+            ty: I8X16,
+        },
+        "2584216E",
+        "sub v5.16b, v1.16b, v1.16b",
+    ));
+
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Sub,
+            rd: writable_vreg(7),
+            rn: vreg(13),
+            rm: vreg(2),
+            ty: I16X8,
+        },
+        "A785626E",
+        "sub v7.8h, v13.8h, v2.8h",
+    ));
+
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Sub,
+            rd: writable_vreg(18),
+            rn: vreg(9),
+            rm: vreg(6),
+            ty: I32X4,
+        },
+        "3285A66E",
+        "sub v18.4s, v9.4s, v6.4s",
+    ));
+
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Sub,
+            rd: writable_vreg(18),
+            rn: vreg(0),
+            rm: vreg(8),
+            ty: I64X2,
+        },
+        "1284E86E",
+        "sub v18.2d, v0.2d, v8.2d",
+    ));
+
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Mul,
+            rd: writable_vreg(25),
+            rn: vreg(9),
+            rm: vreg(8),
+            ty: I8X16,
+        },
+        "399D284E",
+        "mul v25.16b, v9.16b, v8.16b",
+    ));
+
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Mul,
+            rd: writable_vreg(30),
+            rn: vreg(30),
+            rm: vreg(12),
+            ty: I16X8,
+        },
+        "DE9F6C4E",
+        "mul v30.8h, v30.8h, v12.8h",
+    ));
+
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Mul,
+            rd: writable_vreg(18),
+            rn: vreg(18),
+            rm: vreg(18),
+            ty: I32X4,
+        },
+        "529EB24E",
+        "mul v18.4s, v18.4s, v18.4s",
+    ));
+
     insns.push((
         Inst::VecMisc {
             op: VecMisc2::Not,
@@ -2352,6 +2484,50 @@ fn test_aarch64_binemit() {
         "mvn v2.16b, v1.16b",
     ));
 
+    insns.push((
+        Inst::VecMisc {
+            op: VecMisc2::Neg,
+            rd: writable_vreg(8),
+            rn: vreg(12),
+            ty: I8X16,
+        },
+        "88B9206E",
+        "neg v8.16b, v12.16b",
+    ));
+
+    insns.push((
+        Inst::VecMisc {
+            op: VecMisc2::Neg,
+            rd: writable_vreg(0),
+            rn: vreg(31),
+            ty: I16X8,
+        },
+        "E0BB606E",
+        "neg v0.8h, v31.8h",
+    ));
+
+    insns.push((
+        Inst::VecMisc {
+            op: VecMisc2::Neg,
+            rd: writable_vreg(2),
+            rn: vreg(3),
+            ty: I32X4,
+        },
+        "62B8A06E",
+        "neg v2.4s, v3.4s",
+    ));
+
+    insns.push((
+        Inst::VecMisc {
+            op: VecMisc2::Neg,
+            rd: writable_vreg(10),
+            rn: vreg(8),
+            ty: I64X2,
+        },
+        "0AB9E06E",
+        "neg v10.2d, v8.2d",
+    ));
+
     insns.push((
         Inst::VecLanes {
             op: VecLanesOp::Uminv,
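Note: each test triple above pairs an Inst with the exact bytes it must emit and the expected disassembly. The hex strings are the emitted bytes in memory order, and AArch64 stores instructions little-endian, so they read byte-reversed relative to the 32-bit word. A small sketch of that relationship, with a hypothetical helper name not found in the patch:

    // Illustrative helper: turn the byte-order hex strings used by these emit
    // tests back into the 32-bit words the CPU actually decodes.
    fn word_from_test_hex(hex: &str) -> u32 {
        assert_eq!(8, hex.len());
        let b: Vec<u8> = (0..8)
            .step_by(2)
            .map(|i| u8::from_str_radix(&hex[i..i + 2], 16).unwrap())
            .collect();
        u32::from_le_bytes([b[0], b[1], b[2], b[3]])
    }

    fn main() {
        // The "add v5.16b, v1.16b, v1.16b" test case above.
        assert_eq!(0x4E21_8425, word_from_test_hex("2584214E"));
    }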
diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
index 1a5563d62a..9d229b8df8 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -243,13 +243,21 @@ pub enum VecALUOp {
     Bsl,
     /// Unsigned maximum pairwise
     Umaxp,
+    /// Add
+    Add,
+    /// Subtract
+    Sub,
+    /// Multiply
+    Mul,
 }
 
 /// A Vector miscellaneous operation with two registers.
 #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
 pub enum VecMisc2 {
-    /// Bitwise NOT.
+    /// Bitwise NOT
     Not,
+    /// Negate
+    Neg,
 }
 
 /// An operation across the lanes of vectors.
@@ -2737,6 +2745,9 @@ impl ShowWithRRU for Inst {
                     VecALUOp::Eor => ("eor", true, I8X16),
                     VecALUOp::Bsl => ("bsl", true, I8X16),
                     VecALUOp::Umaxp => ("umaxp", true, ty),
+                    VecALUOp::Add => ("add", true, ty),
+                    VecALUOp::Sub => ("sub", true, ty),
+                    VecALUOp::Mul => ("mul", true, ty),
                 };
 
                 let show_vreg_fn: fn(Reg, Option<&RealRegUniverse>, Type) -> String = if vector {
@@ -2750,14 +2761,10 @@ impl ShowWithRRU for Inst {
                 let rm = show_vreg_fn(rm, mb_rru, ty);
                 format!("{} {}, {}, {}", op, rd, rn, rm)
             }
-            &Inst::VecMisc {
-                op,
-                rd,
-                rn,
-                ty: _ty,
-            } => {
+            &Inst::VecMisc { op, rd, rn, ty } => {
                 let (op, ty) = match op {
                     VecMisc2::Not => ("mvn", I8X16),
+                    VecMisc2::Neg => ("neg", ty),
                 };
 
                 let rd = show_vreg_vector(rd.to_reg(), mb_rru, ty);
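Note: the new VecALUOp::Add/Sub/Mul and VecMisc2::Neg variants carry the wasm SIMD spec's lane-wise, wrapping integer arithmetic. A plain-Rust model of those semantics, illustrative only, with arrays standing in for vector registers:

    // Model of lane-wise wrapping addition, as wasm's i32x4.add requires.
    fn i32x4_add(a: [i32; 4], b: [i32; 4]) -> [i32; 4] {
        let mut out = [0i32; 4];
        for i in 0..4 {
            // Each lane wraps independently on overflow; lanes never interact.
            out[i] = a[i].wrapping_add(b[i]);
        }
        out
    }

    fn main() {
        // i32::MAX + 1 wraps to i32::MIN in lane 0; the other lanes are unaffected.
        assert_eq!(
            [i32::MIN, 4, 6, 8],
            i32x4_add([i32::MAX, 2, 3, 4], [1, 2, 3, 4])
        );
    }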
diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
index 82eb35f13f..5c77a6a52d 100644
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -58,18 +58,40 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         Opcode::Iadd => {
             let rd = get_output_reg(ctx, outputs[0]);
             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-            let rm = put_input_in_rse_imm12(ctx, inputs[1], NarrowValueMode::None);
             let ty = ty.unwrap();
-            let alu_op = choose_32_64(ty, ALUOp::Add32, ALUOp::Add64);
-            ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
+            if ty_bits(ty) < 128 {
+                let rm = put_input_in_rse_imm12(ctx, inputs[1], NarrowValueMode::None);
+                let alu_op = choose_32_64(ty, ALUOp::Add32, ALUOp::Add64);
+                ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
+            } else {
+                let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
+                ctx.emit(Inst::VecRRR {
+                    rd,
+                    rn,
+                    rm,
+                    alu_op: VecALUOp::Add,
+                    ty,
+                });
+            }
         }
         Opcode::Isub => {
             let rd = get_output_reg(ctx, outputs[0]);
             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-            let rm = put_input_in_rse_imm12(ctx, inputs[1], NarrowValueMode::None);
             let ty = ty.unwrap();
-            let alu_op = choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64);
-            ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
+            if ty_bits(ty) < 128 {
+                let rm = put_input_in_rse_imm12(ctx, inputs[1], NarrowValueMode::None);
+                let alu_op = choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64);
+                ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
+            } else {
+                let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
+                ctx.emit(Inst::VecRRR {
+                    rd,
+                    rn,
+                    rm,
+                    alu_op: VecALUOp::Sub,
+                    ty,
+                });
+            }
         }
         Opcode::UaddSat | Opcode::SaddSat => {
             // We use the vector instruction set's saturating adds (UQADD /
@@ -143,11 +165,21 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
 
         Opcode::Ineg => {
             let rd = get_output_reg(ctx, outputs[0]);
-            let rn = zero_reg();
-            let rm = put_input_in_rse_imm12(ctx, inputs[0], NarrowValueMode::None);
             let ty = ty.unwrap();
-            let alu_op = choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64);
-            ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
+            if ty_bits(ty) < 128 {
+                let rn = zero_reg();
+                let rm = put_input_in_rse_imm12(ctx, inputs[0], NarrowValueMode::None);
+                let alu_op = choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64);
+                ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
+            } else {
+                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+                ctx.emit(Inst::VecMisc {
+                    op: VecMisc2::Neg,
+                    rd,
+                    rn,
+                    ty,
+                });
+            }
         }
 
         Opcode::Imul => {
@@ -155,14 +187,24 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
             let ty = ty.unwrap();
-            let alu_op = choose_32_64(ty, ALUOp::MAdd32, ALUOp::MAdd64);
-            ctx.emit(Inst::AluRRRR {
-                alu_op,
-                rd,
-                rn,
-                rm,
-                ra: zero_reg(),
-            });
+            if ty_bits(ty) < 128 {
+                let alu_op = choose_32_64(ty, ALUOp::MAdd32, ALUOp::MAdd64);
+                ctx.emit(Inst::AluRRRR {
+                    alu_op,
+                    rd,
+                    rn,
+                    rm,
+                    ra: zero_reg(),
+                });
+            } else {
+                ctx.emit(Inst::VecRRR {
+                    alu_op: VecALUOp::Mul,
+                    rd,
+                    rn,
+                    rm,
+                    ty,
+                });
+            }
         }
 
         Opcode::Umulhi | Opcode::Smulhi => {
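Note: all four lowerings share one dispatch shape. Types narrower than 128 bits keep the existing scalar ALU route, where the second operand may still fold into an imm12 or shifted form; 128-bit vector types force both operands into registers and emit the new VecRRR/VecMisc instructions. Mul additionally asserts ty != I64X2 because base AArch64 NEON has no MUL with 64-bit lanes, which is also why build.rs enables simd_i8x16_arith, simd_i16x8_arith, and simd_i32x4_arith but no i64x2 equivalent. Reduced to plain Rust, with stand-in types rather than Cranelift's real API:

    // Sketch of the dispatch this patch introduces; names here are stand-ins.
    #[derive(Debug, PartialEq)]
    enum Path {
        ScalarAlu,  // existing ALUOp/AluRRRR route, imm12 operands still allowed
        VectorSimd, // new VecRRR/VecMisc route, both operands in registers
    }

    fn pick_path(ty_bits: u32) -> Path {
        if ty_bits < 128 {
            Path::ScalarAlu
        } else {
            Path::VectorSimd
        }
    }

    fn main() {
        assert_eq!(Path::ScalarAlu, pick_path(64));   // e.g. scalar i64 mul -> madd
        assert_eq!(Path::VectorSimd, pick_path(128)); // e.g. i32x4 mul -> mul .4s
    }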