From 0f462330e0562b78c7565fbc1df9650ff1342a15 Mon Sep 17 00:00:00 2001
From: Joey Gouly
Date: Wed, 17 Jun 2020 15:40:51 +0100
Subject: [PATCH] arm64: Implement AllTrue and AnyTrue

This enables the simd_boolean WASM SIMD spec test.

Copyright (c) 2020, Arm Limited.
---
 build.rs                                       |  1 +
 .../codegen/src/isa/aarch64/inst/emit.rs       | 41 +++++++++--
 .../src/isa/aarch64/inst/emit_tests.rs         | 69 +++++++++++++++++++
 .../codegen/src/isa/aarch64/inst/imms.rs       |  8 +++
 cranelift/codegen/src/isa/aarch64/inst/mod.rs  | 44 +++++++++++-
 .../codegen/src/isa/aarch64/inst/regs.rs       | 11 ++-
 .../codegen/src/isa/aarch64/lower_inst.rs      | 50 +++++++++++++-
 7 files changed, 211 insertions(+), 13 deletions(-)

diff --git a/build.rs b/build.rs
index 40b4385b5e..fdf21f0e12 100644
--- a/build.rs
+++ b/build.rs
@@ -182,6 +182,7 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
         "Cranelift" => match (testsuite, testname) {
             ("simd", "simd_address") => return false,
             ("simd", "simd_bitwise") => return false,
+            ("simd", "simd_boolean") => return false,
             ("simd", "simd_i8x16_cmp") => return false,
             ("simd", "simd_i16x8_cmp") => return false,
             ("simd", "simd_i32x4_cmp") => return false,
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
index 263241835f..7668465d62 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -361,6 +361,20 @@ fn enc_vec_rr_misc(bits_12_16: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
     bits | bits_12_16 << 12 | machreg_to_vec(rn) << 5 | machreg_to_vec(rd.to_reg())
 }
 
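+/// Encode an instruction in the "Advanced SIMD across lanes" class (e.g.
+/// uminv); the fields are laid out as 0|Q|U|01110|size|11000|opcode|10|Rn|Rd.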
+fn enc_vec_lanes(q: u32, u: u32, size: u32, opcode: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
+    debug_assert_eq!(q & 0b1, q);
+    debug_assert_eq!(u & 0b1, u);
+    debug_assert_eq!(size & 0b11, size);
+    debug_assert_eq!(opcode & 0b11111, opcode);
+    0b0_0_0_01110_00_11000_0_0000_10_00000_00000
+        | q << 30
+        | u << 29
+        | size << 22
+        | opcode << 12
+        | machreg_to_vec(rn) << 5
+        | machreg_to_vec(rd.to_reg())
+}
+
 /// State carried between emissions of a sequence of instructions.
 #[derive(Default, Clone, Debug)]
 pub struct EmitState {
@@ -1061,6 +1075,18 @@ impl MachInstEmit for Inst {
                 };
                 sink.put4(enc_vec_rr_misc(bits_12_16, rd, rn));
             }
+            &Inst::VecLanes { op, rd, rn, ty } => {
+                let (q, size) = match ty {
+                    I8X16 => (0b1, 0b00),
+                    I16X8 => (0b1, 0b01),
+                    I32X4 => (0b1, 0b10),
+                    _ => unreachable!(),
+                };
+                let (u, opcode) = match op {
+                    VecLanesOp::Uminv => (0b1, 0b11010),
+                };
+                sink.put4(enc_vec_lanes(q, u, size, opcode, rd, rn));
+            }
             &Inst::FpuCmp32 { rn, rm } => {
                 sink.put4(enc_fcmp(InstSize::Size32, rn, rm));
             }
@@ -1247,7 +1273,7 @@ impl MachInstEmit for Inst {
                 alu_op,
                 ty,
             } => {
-                let enc_size_for_cmp = match ty {
+                let enc_size = match ty {
                     I8X16 => 0b00,
                     I16X8 => 0b01,
                     I32X4 => 0b10,
@@ -1271,12 +1297,12 @@ impl MachInstEmit for Inst {
                         debug_assert_eq!(I64, ty);
                         (0b011_11110_11_1, 0b001011)
                     }
-                    VecALUOp::Cmeq => (0b011_01110_00_1 | enc_size_for_cmp << 1, 0b100011),
-                    VecALUOp::Cmge => (0b010_01110_00_1 | enc_size_for_cmp << 1, 0b001111),
-                    VecALUOp::Cmgt => (0b010_01110_00_1 | enc_size_for_cmp << 1, 0b001101),
-                    VecALUOp::Cmhi => (0b011_01110_00_1 | enc_size_for_cmp << 1, 0b001101),
-                    VecALUOp::Cmhs => (0b011_01110_00_1 | enc_size_for_cmp << 1, 0b001111),
-                    // The following instructions operate on bytes, so are not encoded differently
+                    VecALUOp::Cmeq => (0b011_01110_00_1 | enc_size << 1, 0b100011),
+                    VecALUOp::Cmge => (0b010_01110_00_1 | enc_size << 1, 0b001111),
+                    VecALUOp::Cmgt => (0b010_01110_00_1 | enc_size << 1, 0b001101),
+                    VecALUOp::Cmhi => (0b011_01110_00_1 | enc_size << 1, 0b001101),
+                    VecALUOp::Cmhs => (0b011_01110_00_1 | enc_size << 1, 0b001111),
+                    // The following logical instructions operate on bytes, so are not encoded differently
                     // for the different vector types.
                     VecALUOp::And => {
                         debug_assert_eq!(128, ty_bits(ty));
@@ -1298,6 +1324,7 @@ impl MachInstEmit for Inst {
                         debug_assert_eq!(128, ty_bits(ty));
                         (0b011_01110_01_1, 0b000111)
                     }
+                    VecALUOp::Umaxp => (0b011_01110_00_1 | enc_size << 1, 0b101001),
                 };
                 sink.put4(enc_vec_rrr(top11, rm, bit15_10, rn, rd));
             }
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
index 7b2c095035..05dce50151 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
@@ -2269,6 +2269,42 @@ fn test_aarch64_binemit() {
         "bsl v8.16b, v9.16b, v1.16b",
     ));
 
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Umaxp,
+            rd: writable_vreg(8),
+            rn: vreg(12),
+            rm: vreg(1),
+            ty: I8X16,
+        },
+        "88A5216E",
+        "umaxp v8.16b, v12.16b, v1.16b",
+    ));
+
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Umaxp,
+            rd: writable_vreg(1),
+            rn: vreg(6),
+            rm: vreg(1),
+            ty: I16X8,
+        },
+        "C1A4616E",
+        "umaxp v1.8h, v6.8h, v1.8h",
+    ));
+
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Umaxp,
+            rd: writable_vreg(1),
+            rn: vreg(20),
+            rm: vreg(16),
+            ty: I32X4,
+        },
+        "81A6B06E",
+        "umaxp v1.4s, v20.4s, v16.4s",
+    ));
+
     insns.push((
         Inst::VecMisc {
             op: VecMisc2::Not,
@@ -2280,6 +2316,39 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(2),
             rn: vreg(1),
             ty: I8X16,
         },
         "6258606E",
         "mvn v2.16b, v1.16b",
     ));
 
+    insns.push((
+        Inst::VecLanes {
+            op: VecLanesOp::Uminv,
+            rd: writable_vreg(2),
+            rn: vreg(1),
+            ty: I8X16,
+        },
+        "22A8316E",
+        "uminv b2, v1.16b",
+    ));
+
+    insns.push((
+        Inst::VecLanes {
+            op: VecLanesOp::Uminv,
+            rd: writable_vreg(3),
+            rn: vreg(11),
+            ty: I16X8,
+        },
+        "63A9716E",
+        "uminv h3, v11.8h",
+    ));
+
+    insns.push((
+        Inst::VecLanes {
+            op: VecLanesOp::Uminv,
+            rd: writable_vreg(18),
+            rn: vreg(4),
+            ty: I32X4,
+        },
+        "92A8B16E",
+        "uminv s18, v4.4s",
+    ));
+
     insns.push((
         Inst::Extend {
             rd: writable_xreg(1),
diff --git a/cranelift/codegen/src/isa/aarch64/inst/imms.rs b/cranelift/codegen/src/isa/aarch64/inst/imms.rs
index 6fea5efb5c..961559cc9f 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/imms.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/imms.rs
@@ -304,6 +304,14 @@ impl Imm12 {
         }
     }
 
+    /// Create a zero immediate of this format.
+    pub fn zero() -> Self {
+        Imm12 {
+            bits: 0,
+            shift12: false,
+        }
+    }
+
     /// Bits for 2-bit "shift" field in e.g. AddI.
     pub fn shift_bits(&self) -> u32 {
         if self.shift12 {
diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
index 6c5eb4d995..6d14d53448 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -235,6 +235,8 @@ pub enum VecALUOp {
     Eor,
     /// Bitwise select
     Bsl,
+    /// Unsigned maximum pairwise
+    Umaxp,
 }
 
 /// A Vector miscellaneous operation with two registers.
@@ -244,6 +246,13 @@
 #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
 pub enum VecMisc2 {
     Not,
 }
 
+/// An operation across the lanes of vectors.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
+pub enum VecLanesOp {
+    /// Unsigned minimum across a vector
+    Uminv,
+}
+
 /// An operation on the bits of a register. This can be paired with several instruction formats
 /// below (see `Inst`) in any combination.
 #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
@@ -743,6 +752,14 @@ pub enum Inst {
         ty: Type,
     },
 
+    /// Vector instruction across lanes.
+    VecLanes {
+        op: VecLanesOp,
+        rd: Writable<Reg>,
+        rn: Reg,
+        ty: Type,
+    },
+
     /// Move to the NZCV flags (actually a `MSR NZCV, Xn` insn).
     MovToNZCV {
         rn: Reg,
@@ -1214,6 +1231,11 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
             collector.add_def(rd);
             collector.add_use(rn);
         }
+
+        &Inst::VecLanes { rd, rn, .. } => {
+            collector.add_def(rd);
+            collector.add_use(rn);
+        }
         &Inst::FpuCmp32 { rn, rm } | &Inst::FpuCmp64 { rn, rm } => {
             collector.add_use(rn);
             collector.add_use(rm);
         }
@@ -1708,6 +1730,14 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
             map_def(mapper, rd);
             map_use(mapper, rn);
         }
+        &mut Inst::VecLanes {
+            ref mut rd,
+            ref mut rn,
+            ..
+        } => {
+            map_def(mapper, rd);
+            map_use(mapper, rn);
+        }
         &mut Inst::FpuCmp32 {
             ref mut rn,
             ref mut rm,
@@ -2482,7 +2512,7 @@ impl ShowWithRRU for Inst {
                 let show_vreg_fn: fn(Reg, Option<&RealRegUniverse>) -> String = if vector {
                     |reg, mb_rru| show_vreg_vector(reg, mb_rru, F32X2)
                 } else {
-                    show_vreg_scalar
+                    |reg, mb_rru| show_vreg_scalar(reg, mb_rru, F64)
                 };
                 let rd = show_vreg_fn(rd.to_reg(), mb_rru);
                 let rn = show_vreg_fn(rn, mb_rru);
@@ -2695,12 +2725,13 @@ impl ShowWithRRU for Inst {
                     VecALUOp::Orr => ("orr", true, I8X16),
                     VecALUOp::Eor => ("eor", true, I8X16),
                     VecALUOp::Bsl => ("bsl", true, I8X16),
+                    VecALUOp::Umaxp => ("umaxp", true, ty),
                 };
 
                 let show_vreg_fn: fn(Reg, Option<&RealRegUniverse>, Type) -> String = if vector {
                     |reg, mb_rru, ty| show_vreg_vector(reg, mb_rru, ty)
                 } else {
-                    |reg, mb_rru, _ty| show_vreg_scalar(reg, mb_rru)
+                    |reg, mb_rru, _ty| show_vreg_scalar(reg, mb_rru, I64)
                 };
 
                 let rd = show_vreg_fn(rd.to_reg(), mb_rru, ty);
@@ -2722,6 +2753,15 @@ impl ShowWithRRU for Inst {
                 let rn = show_vreg_vector(rn, mb_rru, ty);
                 format!("{} {}, {}", op, rd, rn)
             }
+            &Inst::VecLanes { op, rd, rn, ty } => {
+                let op = match op {
+                    VecLanesOp::Uminv => "uminv",
+                };
+
+                let rd = show_vreg_scalar(rd.to_reg(), mb_rru, ty);
+                let rn = show_vreg_vector(rn, mb_rru, ty);
+                format!("{} {}, {}", op, rd, rn)
+            }
             &Inst::MovToNZCV { rn } => {
                 let rn = rn.show_rru(mb_rru);
                 format!("msr nzcv, {}", rn)
diff --git a/cranelift/codegen/src/isa/aarch64/inst/regs.rs b/cranelift/codegen/src/isa/aarch64/inst/regs.rs
index 9d74661256..b92b0b70c9 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/regs.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/regs.rs
@@ -292,7 +292,7 @@ pub fn show_freg_sized(reg: Reg, mb_rru: Option<&RealRegUniverse>, size: InstSiz
 }
 
 /// Show a vector register used in a scalar context.
-pub fn show_vreg_scalar(reg: Reg, mb_rru: Option<&RealRegUniverse>) -> String {
+pub fn show_vreg_scalar(reg: Reg, mb_rru: Option<&RealRegUniverse>, ty: Type) -> String {
     let mut s = reg.show_rru(mb_rru);
     if reg.get_class() != RegClass::V128 {
         // We can't do any better.
@@ -302,7 +302,14 @@ pub fn show_vreg_scalar(reg: Reg, mb_rru: Option<&RealRegUniverse>) -> String {
     if reg.is_real() {
         // Change (eg) "v0" into "d0".
         if reg.get_class() == RegClass::V128 && s.starts_with("v") {
-            s.replace_range(0..1, "d");
+            let replacement = match ty {
+                I64 | F64 => "d",
+                I8X16 => "b",
+                I16X8 => "h",
+                I32X4 => "s",
+                _ => unimplemented!(),
+            };
+            s.replace_range(0..1, replacement);
         }
     } else {
         // Add a "d" suffix to RegClass::V128 vregs.
diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
index a97eab76e7..e77c641630 100644
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -1540,12 +1540,58 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             ctx.emit(inst);
         }
 
+        Opcode::VanyTrue | Opcode::VallTrue => {
+            let rd = output_to_reg(ctx, outputs[0]);
+            let rm = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
+            let tmp = ctx.alloc_tmp(RegClass::V128, ty.unwrap());
+
+            // This operation is implemented by using umaxp (for AnyTrue) or uminv
+            // (for AllTrue) to create a scalar value, which is then compared against zero.
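+            //
+            // umaxp packs the pairwise maxima of all input lanes into the low
+            // 64 bits of its result, and uminv leaves the across-lanes minimum
+            // in lane 0, so in both cases d[0] is zero exactly when the
+            // reduction is zero.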
+            //
+            // umaxp vn.16b, vm.16b, vm.16b / uminv bn, vm.16b
+            // mov xm, vn.d[0]
+            // cmp xm, #0
+            // cset xm, ne
+
+            let input_ty = ctx.input_ty(insn, 0);
+            if op == Opcode::VanyTrue {
+                ctx.emit(Inst::VecRRR {
+                    alu_op: VecALUOp::Umaxp,
+                    rd: tmp,
+                    rn: rm,
+                    rm,
+                    ty: input_ty,
+                });
+            } else {
+                ctx.emit(Inst::VecLanes {
+                    op: VecLanesOp::Uminv,
+                    rd: tmp,
+                    rn: rm,
+                    ty: input_ty,
+                });
+            }
+
+            ctx.emit(Inst::MovFromVec {
+                rd,
+                rn: tmp.to_reg(),
+                idx: 0,
+                ty: I64,
+            });
+
+            ctx.emit(Inst::AluRRImm12 {
+                alu_op: ALUOp::SubS64,
+                rd: writable_zero_reg(),
+                rn: rd.to_reg(),
+                imm12: Imm12::zero(),
+            });
+
+            ctx.emit(Inst::CSet { rd, cond: Cond::Ne });
+        }
+
         Opcode::Shuffle
         | Opcode::Vsplit
         | Opcode::Vconcat
         | Opcode::Vselect
-        | Opcode::VanyTrue
-        | Opcode::VallTrue
         | Opcode::Insertlane
         | Opcode::ScalarToVector
         | Opcode::Swizzle