From 3bc2f0c7019752f9ee561df1ae3e4936778a9d6e Mon Sep 17 00:00:00 2001
From: Sam Parker
Date: Mon, 2 Aug 2021 10:03:54 +0100
Subject: [PATCH] Enable simd_X_extadd_pairwise_X for AArch64

Lower to [u|s]addlp for AArch64.

Copyright (c) 2021, Arm Limited.
---
 build.rs                                      |  11 --
 .../codegen/src/isa/aarch64/inst/emit.rs      |  21 +++
 .../src/isa/aarch64/inst/emit_tests.rs        |  40 ++++++
 cranelift/codegen/src/isa/aarch64/inst/mod.rs |  53 ++++++++
 .../codegen/src/isa/aarch64/lower_inst.rs     |  54 +++++++-
 .../isa/aarch64/simd-pairwise-add.clif        | 124 ++++++++++++++++++
 6 files changed, 291 insertions(+), 12 deletions(-)
 create mode 100644 cranelift/filetests/filetests/isa/aarch64/simd-pairwise-add.clif

diff --git a/build.rs b/build.rs
index edf1d3e290..26e01eda4e 100644
--- a/build.rs
+++ b/build.rs
@@ -202,13 +202,6 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
             ("simd", _) if cfg!(feature = "old-x86-backend") => return true,
             // No simd support yet for s390x.
             ("simd", _) if platform_is_s390x() => return true,
-            // These are new instructions that are only known to be supported for x64.
-            ("simd", "simd_i16x8_extadd_pairwise_i8x16")
-            | ("simd", "simd_i32x4_extadd_pairwise_i16x8")
-                if !platform_is_x64() =>
-            {
-                return true
-            }
             _ => {}
         },
         _ => panic!("unrecognized strategy"),
@@ -217,10 +210,6 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
     false
 }
 
-fn platform_is_x64() -> bool {
-    env::var("CARGO_CFG_TARGET_ARCH").unwrap() == "x86_64"
-}
-
 fn platform_is_s390x() -> bool {
     env::var("CARGO_CFG_TARGET_ARCH").unwrap() == "s390x"
 }
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
index 54886b010e..5374de6bf8 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -459,6 +459,17 @@ fn enc_vec_rr_pair(bits_12_16: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
         | machreg_to_vec(rd.to_reg())
 }
 
+fn enc_vec_rr_pair_long(u: u32, enc_size: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
+    debug_assert_eq!(u & 0b1, u);
+    debug_assert_eq!(enc_size & 0b1, enc_size);
+
+    0b0_1_0_01110_00_10000_00_0_10_10_00000_00000
+        | u << 29
+        | enc_size << 22
+        | machreg_to_vec(rn) << 5
+        | machreg_to_vec(rd.to_reg())
+}
+
 fn enc_vec_lanes(q: u32, u: u32, size: u32, opcode: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
     debug_assert_eq!(q & 0b1, q);
     debug_assert_eq!(u & 0b1, u);
@@ -2225,6 +2236,16 @@ impl MachInstEmit for Inst {
                     rd,
                 ));
             }
+            &Inst::VecRRPairLong { op, rd, rn } => {
+                let (u, size) = match op {
+                    VecRRPairLongOp::Saddlp8 => (0b0, 0b0),
+                    VecRRPairLongOp::Uaddlp8 => (0b1, 0b0),
+                    VecRRPairLongOp::Saddlp16 => (0b0, 0b1),
+                    VecRRPairLongOp::Uaddlp16 => (0b1, 0b1),
+                };
+
+                sink.put4(enc_vec_rr_pair_long(u, size, rd, rn));
+            }
             &Inst::VecRRR {
                 rd,
                 rn,
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
index d3afca2a77..b27d183a94 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
@@ -2643,6 +2643,46 @@ fn test_aarch64_binemit() {
         "addp d0, v30.2d",
     ));
 
+    insns.push((
+        Inst::VecRRPairLong {
+            op: VecRRPairLongOp::Uaddlp8,
+            rd: writable_vreg(0),
+            rn: vreg(1),
+        },
+        "2028206E",
+        "uaddlp v0.8h, v1.16b",
+    ));
+
+    insns.push((
+        Inst::VecRRPairLong {
+            op: VecRRPairLongOp::Saddlp8,
+            rd: writable_vreg(3),
+            rn: vreg(11),
+        },
+        "6329204E",
+        "saddlp v3.8h, v11.16b",
+    ));
+
+    insns.push((
+        Inst::VecRRPairLong {
+            op: VecRRPairLongOp::Uaddlp16,
+            rd: writable_vreg(14),
+            rn: vreg(23),
+        },
+        "EE2A606E",
+        "uaddlp v14.4s, v23.8h",
+    ));
+
+    insns.push((
+        Inst::VecRRPairLong {
+            op: VecRRPairLongOp::Saddlp16,
+            rd: writable_vreg(29),
+            rn: vreg(0),
+        },
+        "1D28604E",
+        "saddlp v29.4s, v0.8h",
+    ));
+
     insns.push((
         Inst::VecRRR {
             alu_op: VecALUOp::Sqadd,
diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
index 8c993492bd..d498bc9b85 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -419,6 +419,18 @@ pub enum VecPairOp {
     Addp,
 }
 
+/// 1-operand vector instruction that extends elements of the input register
+/// and operates on a pair of elements.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
+pub enum VecRRPairLongOp {
+    /// Sign extend and add pair of elements
+    Saddlp8,
+    Saddlp16,
+    /// Unsigned extend and add pair of elements
+    Uaddlp8,
+    Uaddlp16,
+}
+
 /// An operation across the lanes of vectors.
 #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
 pub enum VecLanesOp {
@@ -1107,6 +1119,15 @@ pub enum Inst {
         high_half: bool,
     },
 
+    /// 1-operand vector instruction that extends elements of the input
+    /// register and operates on a pair of elements. The output lane width
+    /// is double that of the input.
+    VecRRPairLong {
+        op: VecRRPairLongOp,
+        rd: Writable<Reg>,
+        rn: Reg,
+    },
+
     /// A vector ALU op.
     VecRRR {
         alu_op: VecALUOp,
@@ -2166,6 +2187,10 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
             collector.add_use(rn);
             collector.add_use(rm);
         }
+        &Inst::VecRRPairLong { rd, rn, .. } => {
+            collector.add_def(rd);
+            collector.add_use(rn);
+        }
         &Inst::VecRRR {
             alu_op, rd, rn, rm, ..
         } => {
@@ -2992,6 +3017,14 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
             map_use(mapper, rn);
             map_use(mapper, rm);
         }
+        &mut Inst::VecRRPairLong {
+            ref mut rd,
+            ref mut rn,
+            ..
+        } => {
+            map_def(mapper, rd);
+            map_use(mapper, rn);
+        }
         &mut Inst::VecRRR {
             alu_op,
             ref mut rd,
@@ -4152,6 +4185,26 @@ impl Inst {
 
                 format!("{} {}, {}", op, rd, rn)
             }
+            &Inst::VecRRPairLong { op, rd, rn } => {
+                let (op, dest, src) = match op {
+                    VecRRPairLongOp::Saddlp8 => {
+                        ("saddlp", VectorSize::Size16x8, VectorSize::Size8x16)
+                    }
+                    VecRRPairLongOp::Saddlp16 => {
+                        ("saddlp", VectorSize::Size32x4, VectorSize::Size16x8)
+                    }
+                    VecRRPairLongOp::Uaddlp8 => {
+                        ("uaddlp", VectorSize::Size16x8, VectorSize::Size8x16)
+                    }
+                    VecRRPairLongOp::Uaddlp16 => {
+                        ("uaddlp", VectorSize::Size32x4, VectorSize::Size16x8)
+                    }
+                };
+                let rd = show_vreg_vector(rd.to_reg(), mb_rru, dest);
+                let rn = show_vreg_vector(rn, mb_rru, src);
+
+                format!("{} {}, {}", op, rd, rn)
+            }
             &Inst::VecRRR {
                 rd,
                 rn,
diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
index c07fb92596..f9440dbbb1 100644
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -2644,6 +2644,58 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             });
         }
 
+        Opcode::IaddPairwise => {
+            let ty = ty.unwrap();
+            let lane_type = ty.lane_type();
+            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
+
+            let mut match_long_pair =
+                |ext_low_op, ext_high_op| -> Option<(VecRRPairLongOp, regalloc::Reg)> {
+                    if let Some(lhs) = maybe_input_insn(ctx, inputs[0], ext_low_op) {
+                        if let Some(rhs) = maybe_input_insn(ctx, inputs[1], ext_high_op) {
+                            let lhs_inputs = insn_inputs(ctx, lhs);
+                            let rhs_inputs = insn_inputs(ctx, rhs);
+                            let low = put_input_in_reg(ctx, lhs_inputs[0], NarrowValueMode::None);
+                            let high = put_input_in_reg(ctx, rhs_inputs[0], NarrowValueMode::None);
+                            if low == high {
+                                match (lane_type, ext_low_op) {
+                                    (I16, Opcode::SwidenLow) => {
+                                        return Some((VecRRPairLongOp::Saddlp8, low))
+                                    }
+                                    (I32, Opcode::SwidenLow) => {
+                                        return Some((VecRRPairLongOp::Saddlp16, low))
+                                    }
+                                    (I16, Opcode::UwidenLow) => {
+                                        return Some((VecRRPairLongOp::Uaddlp8, low))
+                                    }
+                                    (I32, Opcode::UwidenLow) => {
+                                        return Some((VecRRPairLongOp::Uaddlp16, low))
+                                    }
+                                    _ => (),
+                                };
+                            }
+                        }
+                    }
+                    None
+                };
+
+            if let Some((op, rn)) = match_long_pair(Opcode::SwidenLow, Opcode::SwidenHigh) {
+                ctx.emit(Inst::VecRRPairLong { op, rd, rn });
+            } else if let Some((op, rn)) = match_long_pair(Opcode::UwidenLow, Opcode::UwidenHigh) {
+                ctx.emit(Inst::VecRRPairLong { op, rd, rn });
+            } else {
+                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+                let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
+                ctx.emit(Inst::VecRRR {
+                    alu_op: VecALUOp::Addp,
+                    rd,
+                    rn,
+                    rm,
+                    size: VectorSize::from_ty(ty),
+                });
+            }
+        }
+
         Opcode::WideningPairwiseDotProductS => {
             let r_y = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let r_a = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
@@ -3519,7 +3571,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             });
         }
 
-        Opcode::IaddPairwise | Opcode::ConstAddr | Opcode::Vconcat | Opcode::Vsplit => {
+        Opcode::ConstAddr | Opcode::Vconcat | Opcode::Vsplit => {
             unimplemented!("lowering {}", op)
         }
     }
diff --git a/cranelift/filetests/filetests/isa/aarch64/simd-pairwise-add.clif b/cranelift/filetests/filetests/isa/aarch64/simd-pairwise-add.clif
new file mode 100644
index 0000000000..42190619c6
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/aarch64/simd-pairwise-add.clif
@@ -0,0 +1,124 @@
+test compile
+set unwind_info=false
+target aarch64
+
+
+function %fn1(i8x16) -> i16x8 {
+block0(v0: i8x16):
+    v1 = swiden_low v0
+ v2 = swiden_high v0 + v3 = iadd_pairwise v1, v2 + return v3 +} + +; check: stp fp +; nextln: mov fp, sp +; nextln: saddlp v0.8h, v0.16b +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %fn2(i8x16) -> i16x8 { +block0(v0: i8x16): + v1 = uwiden_low v0 + v2 = uwiden_high v0 + v3 = iadd_pairwise v1, v2 + return v3 +} + +; check: stp fp +; nextln: mov fp, sp +; nextln: uaddlp v0.8h, v0.16b +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %fn3(i16x8) -> i32x4 { +block0(v0: i16x8): + v1 = swiden_low v0 + v2 = swiden_high v0 + v3 = iadd_pairwise v1, v2 + return v3 +} + +; check: stp fp +; nextln: mov fp, sp +; nextln: saddlp v0.4s, v0.8h +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %fn4(i16x8) -> i32x4 { +block0(v0: i16x8): + v1 = uwiden_low v0 + v2 = uwiden_high v0 + v3 = iadd_pairwise v1, v2 + return v3 +} + +; check: stp fp +; nextln: mov fp, sp +; nextln: uaddlp v0.4s, v0.8h +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %fn5(i8x16, i8x16) -> i16x8 { +block0(v0: i8x16, v1: i8x16): + v2 = swiden_low v0 + v3 = swiden_high v1 + v4 = iadd_pairwise v2, v3 + return v4 +} + +; check: stp fp +; nextln: mov fp, sp +; nextln: sxtl v0.8h, v0.8b +; nextln: sxtl2 v1.8h, v1.16b +; nextln: addp v0.8h, v0.8h, v1.8h +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %fn6(i8x16, i8x16) -> i16x8 { +block0(v0: i8x16, v1: i8x16): + v2 = uwiden_low v0 + v3 = uwiden_high v1 + v4 = iadd_pairwise v2, v3 + return v4 +} + +; check: stp fp +; nextln: mov fp, sp +; nextln: uxtl v0.8h, v0.8b +; nextln: uxtl2 v1.8h, v1.16b +; nextln: addp v0.8h, v0.8h, v1.8h +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %fn7(i8x16) -> i16x8 { +block0(v0: i8x16): + v1 = uwiden_low v0 + v2 = swiden_high v0 + v3 = iadd_pairwise v1, v2 + return v3 +} + +; check: stp fp +; nextln: mov fp, sp +; nextln: uxtl v1.8h, v0.8b +; nextln: sxtl2 v0.8h, v0.16b +; nextln: addp v0.8h, v1.8h, v0.8h +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %fn8(i8x16) -> i16x8 { +block0(v0: i8x16): + v1 = swiden_low v0 + v2 = uwiden_high v0 + v3 = iadd_pairwise v1, v2 + return v3 +} + +; check: stp fp +; nextln: mov fp, sp +; nextln: sxtl v1.8h, v0.8b +; nextln: uxtl2 v0.8h, v0.16b +; nextln: addp v0.8h, v1.8h, v0.8h +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret
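
As a standalone sanity check on the encoding above, here is a minimal sketch.
It is not part of the patch: the function name encode_addlp and the use of raw
u32 register numbers in place of machreg_to_vec()/Writable<Reg> are
illustrative assumptions. It reuses the base constant from enc_vec_rr_pair_long
and asserts the little-endian byte sequences from the emit tests:

// Illustrative sketch, not patch code: raw u32 register numbers stand in
// for machreg_to_vec(rn) and machreg_to_vec(rd.to_reg()).
fn encode_addlp(u: u32, enc_size: u32, rd: u32, rn: u32) -> u32 {
    debug_assert_eq!(u & 0b1, u);
    debug_assert_eq!(enc_size & 0b1, enc_size);

    // Base word: Q=1 (128-bit operation), opcode 00010 selects xADDLP.
    0b0_1_0_01110_00_10000_00_0_10_10_00000_00000
        | u << 29        // U bit: 0 = saddlp (signed), 1 = uaddlp (unsigned)
        | enc_size << 22 // size: 0 = .16b -> .8h, 1 = .8h -> .4s
        | rn << 5
        | rd
}

fn main() {
    // Expected bytes are the hex strings from the emit_tests.rs hunk above.
    // uaddlp v0.8h, v1.16b
    assert_eq!(encode_addlp(1, 0, 0, 1).to_le_bytes(), [0x20, 0x28, 0x20, 0x6E]);
    // saddlp v3.8h, v11.16b
    assert_eq!(encode_addlp(0, 0, 3, 11).to_le_bytes(), [0x63, 0x29, 0x20, 0x4E]);
    // uaddlp v14.4s, v23.8h
    assert_eq!(encode_addlp(1, 1, 14, 23).to_le_bytes(), [0xEE, 0x2A, 0x60, 0x6E]);
    // saddlp v29.4s, v0.8h
    assert_eq!(encode_addlp(0, 1, 29, 0).to_le_bytes(), [0x1D, 0x28, 0x60, 0x4E]);
}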