Enable simd_extmul_* for AArch64

Lower simd_extmul_[low|high]_[signed|unsigned] to [s|u]widen ops on the
inputs feeding an imul node.

Copyright (c) 2021, Arm Limited.
Author: Sam Parker
Date:   2021-07-08 16:39:27 +01:00
parent 65378422bf
commit 541a4ee428
8 changed files with 745 additions and 269 deletions
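
As a rough scalar model of what one lane of the lowered pattern computes
(illustrative only; the function below is not a Cranelift API), extmul widens
the chosen half of each input to double the lane width and multiplies in the
wider type, so the product cannot overflow. On AArch64 that pattern maps
directly onto smull/umull (and smull2/umull2 for the high halves), which the
new VecRRRLong instruction below emits:

    // Scalar sketch of extmul_low (signed, i8x16 -> i16x8) semantics.
    fn extmul_low_signed_i8x16(a: [i8; 16], b: [i8; 16]) -> [i16; 8] {
        let mut out = [0i16; 8];
        for i in 0..8 {
            // widen the low-half lanes first, then multiply at i16
            out[i] = (a[i] as i16) * (b[i] as i16);
        }
        out
    }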


@@ -287,6 +287,22 @@ fn enc_vec_rrr(top11: u32, rm: Reg, bit15_10: u32, rn: Reg, rd: Writable<Reg>) -
| machreg_to_vec(rd.to_reg())
}
+fn enc_vec_rrr_long(q: u32, u: u32, size: u32, bit14: u32, rm: Reg, rn: Reg, rd: Writable<Reg>) -> u32 {
+    debug_assert_eq!(q & 0b1, q);
+    debug_assert_eq!(u & 0b1, u);
+    debug_assert_eq!(size & 0b11, size);
+    debug_assert_eq!(bit14 & 0b1, bit14);
+    0b0_0_0_01110_00_1_00000_100000_00000_00000
+        | q << 30
+        | u << 29
+        | size << 22
+        | bit14 << 14
+        | (machreg_to_vec(rm) << 16)
+        | (machreg_to_vec(rn) << 5)
+        | machreg_to_vec(rd.to_reg())
+}
fn enc_bit_rr(size: u32, opcode2: u32, opcode1: u32, rn: Reg, rd: Writable<Reg>) -> u32 {
(0b01011010110 << 21)
| size << 31
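
As a sanity check on enc_vec_rrr_long, the first smull test vector from the
binemit tests in this commit works out as follows (a worked example added for
this write-up, not code from the commit):

    // smull v16.8h, v12.8b, v1.8b:
    // q=0 (low half), u=0 (signed), size=0b00 (8-bit lanes), bit14=1 (mull).
    let word: u32 = 0x0E20_8000 // value of the 0b0_0_0_01110_00_... literal
        | 0 << 30               // q
        | 0 << 29               // u
        | 0b00 << 22            // size
        | 1 << 14               // bit14
        | (1 << 16)             // rm = v1
        | (12 << 5)             // rn = v12
        | 16;                   // rd = v16
    assert_eq!(word, 0x0E21_C190); // little-endian bytes: "90C1210E"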
@@ -2173,6 +2189,26 @@ impl MachInstEmit for Inst {
sink.put4(enc_vec_rr_pair(bits_12_16, rd, rn));
}
+            &Inst::VecRRRLong {
+                rd,
+                rn,
+                rm,
+                alu_op,
+                high_half,
+            } => {
+                let (u, size, bit14) = match alu_op {
+                    VecRRRLongOp::Smull8 => (0b0, 0b00, 0b1),
+                    VecRRRLongOp::Smull16 => (0b0, 0b01, 0b1),
+                    VecRRRLongOp::Smull32 => (0b0, 0b10, 0b1),
+                    VecRRRLongOp::Umull8 => (0b1, 0b00, 0b1),
+                    VecRRRLongOp::Umull16 => (0b1, 0b01, 0b1),
+                    VecRRRLongOp::Umull32 => (0b1, 0b10, 0b1),
+                    VecRRRLongOp::Umlal8 => (0b1, 0b00, 0b0),
+                    VecRRRLongOp::Umlal16 => (0b1, 0b01, 0b0),
+                    VecRRRLongOp::Umlal32 => (0b1, 0b10, 0b0),
+                };
+                sink.put4(enc_vec_rrr_long(high_half as u32, u, size, bit14, rm, rn, rd));
+            }
&Inst::VecRRR {
rd,
rn,
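
The same helper also produces the accumulating forms: u=1 with bit14=0
selects umlal/umlal2 rather than umull/umull2. Checking it against the umlal2
test vector from this commit (again a worked example, not code from the
commit):

    // umlal2 v1.8h, v5.16b, v15.16b:
    // high_half=true so q=1, u=1 (unsigned), size=0b00, bit14=0 (accumulate).
    let word: u32 = 0x0E20_8000
        | 1 << 30        // q (high half)
        | 1 << 29        // u
        | 0b00 << 22     // size
        | 0 << 14        // bit14: mlal, not mull
        | (15 << 16)     // rm = v15
        | (5 << 5)       // rn = v5
        | 1;             // rd = v1
    assert_eq!(word, 0x6E2F_80A1); // little-endian bytes: "A1802F6E"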
@@ -2242,13 +2278,7 @@ impl MachInstEmit for Inst {
VecALUOp::Fmin => (0b000_01110_10_1, 0b111101),
VecALUOp::Fmul => (0b001_01110_00_1, 0b110111),
VecALUOp::Addp => (0b000_01110_00_1 | enc_size << 1, 0b101111),
-                    VecALUOp::Umlal => {
-                        debug_assert!(!size.is_128bits());
-                        (0b001_01110_00_1 | enc_size << 1, 0b100000)
-                    }
VecALUOp::Zip1 => (0b01001110_00_0 | enc_size << 1, 0b001110),
-                    VecALUOp::Smull => (0b000_01110_00_1 | enc_size << 1, 0b110000),
-                    VecALUOp::Smull2 => (0b010_01110_00_1 | enc_size << 1, 0b110000),
VecALUOp::Sqrdmulh => {
debug_assert!(
size.lane_size() == ScalarSize::Size16
@@ -2258,12 +2288,12 @@ impl MachInstEmit for Inst {
(0b001_01110_00_1 | enc_size << 1, 0b101101)
}
};
-                let top11 = match alu_op {
-                    VecALUOp::Smull | VecALUOp::Smull2 => top11,
-                    _ if is_float => top11 | (q << 9) | enc_float_size << 1,
-                    _ => top11 | (q << 9),
+                let top11 = if is_float {
+                    top11 | enc_float_size << 1
+                } else {
+                    top11
                 };
-                sink.put4(enc_vec_rrr(top11, rm, bit15_10, rn, rd));
+                sink.put4(enc_vec_rrr(top11 | q << 9, rm, bit15_10, rn, rd));
}
&Inst::VecLoadReplicate { rd, rn, size } => {
let (q, size) = size.enc_size();
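
Note how high_half feeds straight into q (bit 30) in the emission arm above:
the low- and high-half forms of one operation differ only in that bit, which
the test vectors below confirm (worked example, not code from the commit):

    // smull v16.8h, v12.8b, v1.8b encodes as 0x0E21C190 ("90C1210E");
    // setting q gives smull2 v16.8h, v12.16b, v1.16b ("90C1214E").
    assert_eq!(0x0E21_C190u32 | 1 << 30, 0x4E21_C190);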


@@ -3651,18 +3651,6 @@ fn test_aarch64_binemit() {
"addp v8.4s, v12.4s, v14.4s",
));
-    insns.push((
-        Inst::VecRRR {
-            alu_op: VecALUOp::Umlal,
-            rd: writable_vreg(9),
-            rn: vreg(20),
-            rm: vreg(17),
-            size: VectorSize::Size32x2,
-        },
-        "8982B12E",
-        "umlal v9.2d, v20.2s, v17.2s",
-    ));
insns.push((
Inst::VecRRR {
alu_op: VecALUOp::Zip1,
@@ -3712,77 +3700,221 @@ fn test_aarch64_binemit() {
));
insns.push((
-        Inst::VecRRR {
-            alu_op: VecALUOp::Smull,
+        Inst::VecRRRLong {
+            alu_op: VecRRRLongOp::Smull8,
rd: writable_vreg(16),
rn: vreg(12),
rm: vreg(1),
-            size: VectorSize::Size8x16,
+            high_half: false
},
"90C1210E",
"smull v16.8h, v12.8b, v1.8b",
));
insns.push((
-        Inst::VecRRR {
-            alu_op: VecALUOp::Smull,
+        Inst::VecRRRLong {
+            alu_op: VecRRRLongOp::Umull8,
+            rd: writable_vreg(15),
+            rn: vreg(11),
+            rm: vreg(2),
+            high_half: false
+        },
+        "6FC1222E",
+        "umull v15.8h, v11.8b, v2.8b",
+    ));
+    insns.push((
+        Inst::VecRRRLong {
+            alu_op: VecRRRLongOp::Umlal8,
+            rd: writable_vreg(4),
+            rn: vreg(8),
+            rm: vreg(16),
+            high_half: false
+        },
+        "0481302E",
+        "umlal v4.8h, v8.8b, v16.8b",
+    ));
+    insns.push((
+        Inst::VecRRRLong {
+            alu_op: VecRRRLongOp::Smull16,
rd: writable_vreg(2),
rn: vreg(13),
rm: vreg(6),
-            size: VectorSize::Size16x8,
+            high_half: false,
},
"A2C1660E",
"smull v2.4s, v13.4h, v6.4h",
));
insns.push((
-        Inst::VecRRR {
-            alu_op: VecALUOp::Smull,
+        Inst::VecRRRLong {
+            alu_op: VecRRRLongOp::Umull16,
+            rd: writable_vreg(3),
+            rn: vreg(14),
+            rm: vreg(7),
+            high_half: false,
+        },
+        "C3C1672E",
+        "umull v3.4s, v14.4h, v7.4h",
+    ));
+    insns.push((
+        Inst::VecRRRLong {
+            alu_op: VecRRRLongOp::Umlal16,
+            rd: writable_vreg(7),
+            rn: vreg(14),
+            rm: vreg(21),
+            high_half: false,
+        },
+        "C781752E",
+        "umlal v7.4s, v14.4h, v21.4h",
+    ));
+    insns.push((
+        Inst::VecRRRLong {
+            alu_op: VecRRRLongOp::Smull32,
rd: writable_vreg(8),
rn: vreg(12),
rm: vreg(14),
-            size: VectorSize::Size32x4,
+            high_half: false,
},
"88C1AE0E",
"smull v8.2d, v12.2s, v14.2s",
));
insns.push((
-        Inst::VecRRR {
-            alu_op: VecALUOp::Smull2,
+        Inst::VecRRRLong {
+            alu_op: VecRRRLongOp::Umull32,
+            rd: writable_vreg(9),
+            rn: vreg(5),
+            rm: vreg(6),
+            high_half: false,
+        },
+        "A9C0A62E",
+        "umull v9.2d, v5.2s, v6.2s",
+    ));
+    insns.push((
+        Inst::VecRRRLong {
+            alu_op: VecRRRLongOp::Umlal32,
+            rd: writable_vreg(9),
+            rn: vreg(20),
+            rm: vreg(17),
+            high_half: false,
+        },
+        "8982B12E",
+        "umlal v9.2d, v20.2s, v17.2s",
+    ));
+    insns.push((
+        Inst::VecRRRLong {
+            alu_op: VecRRRLongOp::Smull8,
rd: writable_vreg(16),
rn: vreg(12),
rm: vreg(1),
-            size: VectorSize::Size8x16,
+            high_half: true,
},
"90C1214E",
"smull2 v16.8h, v12.16b, v1.16b",
));
insns.push((
-        Inst::VecRRR {
-            alu_op: VecALUOp::Smull2,
+        Inst::VecRRRLong {
+            alu_op: VecRRRLongOp::Umull8,
+            rd: writable_vreg(29),
+            rn: vreg(22),
+            rm: vreg(10),
+            high_half: true,
+        },
+        "DDC22A6E",
+        "umull2 v29.8h, v22.16b, v10.16b",
+    ));
+    insns.push((
+        Inst::VecRRRLong {
+            alu_op: VecRRRLongOp::Umlal8,
+            rd: writable_vreg(1),
+            rn: vreg(5),
+            rm: vreg(15),
+            high_half: true,
+        },
+        "A1802F6E",
+        "umlal2 v1.8h, v5.16b, v15.16b",
+    ));
+    insns.push((
+        Inst::VecRRRLong {
+            alu_op: VecRRRLongOp::Smull16,
rd: writable_vreg(2),
rn: vreg(13),
rm: vreg(6),
-            size: VectorSize::Size16x8,
+            high_half: true,
},
"A2C1664E",
"smull2 v2.4s, v13.8h, v6.8h",
));
insns.push((
-        Inst::VecRRR {
-            alu_op: VecALUOp::Smull2,
+        Inst::VecRRRLong {
+            alu_op: VecRRRLongOp::Umull16,
+            rd: writable_vreg(19),
+            rn: vreg(18),
+            rm: vreg(17),
+            high_half: true,
+        },
+        "53C2716E",
+        "umull2 v19.4s, v18.8h, v17.8h",
+    ));
+    insns.push((
+        Inst::VecRRRLong {
+            alu_op: VecRRRLongOp::Umlal16,
+            rd: writable_vreg(11),
+            rn: vreg(10),
+            rm: vreg(12),
+            high_half: true,
+        },
+        "4B816C6E",
+        "umlal2 v11.4s, v10.8h, v12.8h",
+    ));
+    insns.push((
+        Inst::VecRRRLong {
+            alu_op: VecRRRLongOp::Smull32,
rd: writable_vreg(8),
rn: vreg(12),
rm: vreg(14),
-            size: VectorSize::Size32x4,
+            high_half: true,
},
"88C1AE4E",
"smull2 v8.2d, v12.4s, v14.4s",
));
+    insns.push((
+        Inst::VecRRRLong {
+            alu_op: VecRRRLongOp::Umull32,
+            rd: writable_vreg(4),
+            rn: vreg(12),
+            rm: vreg(16),
+            high_half: true,
+        },
+        "84C1B06E",
+        "umull2 v4.2d, v12.4s, v16.4s",
+    ));
+    insns.push((
+        Inst::VecRRRLong {
+            alu_op: VecRRRLongOp::Umlal32,
+            rd: writable_vreg(10),
+            rn: vreg(29),
+            rm: vreg(2),
+            high_half: true,
+        },
+        "AA83A26E",
+        "umlal2 v10.2d, v29.4s, v2.4s",
+    ));
insns.push((
Inst::VecRRR {
alu_op: VecALUOp::Sqrdmulh,
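
For illustration, the low-half widening accumulate from the tests above can
be constructed directly (a usage sketch, not code from the commit). Its
expected bytes, "8982B12E", are identical to those of the deleted VecRRR
Umlal test, so the new VecRRRLong path preserves the old umlal encoding
exactly:

    let inst = Inst::VecRRRLong {
        alu_op: VecRRRLongOp::Umlal32,
        rd: writable_vreg(9),
        rn: vreg(20),
        rm: vreg(17),
        high_half: false,
    };
    // pretty-prints as: umlal v9.2d, v20.2s, v17.2s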


@@ -303,14 +303,8 @@ pub enum VecALUOp {
Fmul,
/// Add pairwise
Addp,
-    /// Unsigned multiply add long
-    Umlal,
/// Zip vectors (primary) [meaning, high halves]
Zip1,
-    /// Signed multiply long (low halves)
-    Smull,
-    /// Signed multiply long (high halves)
-    Smull2,
/// Signed saturating rounding doubling multiply returning high half
Sqrdmulh,
}
@@ -402,6 +396,23 @@ pub enum VecRRNarrowOp {
Fcvtn64,
}
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
+pub enum VecRRRLongOp {
+    /// Signed multiply long.
+    Smull8,
+    Smull16,
+    Smull32,
+    /// Unsigned multiply long.
+    Umull8,
+    Umull16,
+    Umull32,
+    /// Unsigned multiply add long.
+    Umlal8,
+    Umlal16,
+    Umlal32,
+}
/// A vector operation on a pair of elements with one register.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
pub enum VecPairOp {
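
The VecRRRLongOp variants above encode the input lane width; the output lanes
are twice as wide. A hypothetical helper (the lowering changes live in files
not shown in this excerpt) might pick a variant like this:

    // Hypothetical sketch only; not code from this commit.
    fn smull_op_for_input_lane_bits(bits: u32) -> VecRRRLongOp {
        match bits {
            8 => VecRRRLongOp::Smull8,
            16 => VecRRRLongOp::Smull16,
            32 => VecRRRLongOp::Smull32,
            _ => unreachable!("smull takes 8-, 16- or 32-bit input lanes"),
        }
    }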
@@ -1087,6 +1098,16 @@ pub enum Inst {
rn: Reg,
},
+    /// 2-operand vector instruction that produces a result with twice the
+    /// lane width and half the number of lanes.
+    VecRRRLong {
+        alu_op: VecRRRLongOp,
+        rd: Writable<Reg>,
+        rn: Reg,
+        rm: Reg,
+        high_half: bool,
+    },
/// A vector ALU op.
VecRRR {
alu_op: VecALUOp,
@@ -2134,10 +2155,22 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
collector.add_def(rd);
collector.add_use(rn);
}
+        &Inst::VecRRRLong {
+            alu_op, rd, rn, rm, ..
+        } => {
+            match alu_op {
+                VecRRRLongOp::Umlal8
+                | VecRRRLongOp::Umlal16
+                | VecRRRLongOp::Umlal32 => collector.add_mod(rd),
+                _ => collector.add_def(rd),
+            };
+            collector.add_use(rn);
+            collector.add_use(rm);
+        }
&Inst::VecRRR {
alu_op, rd, rn, rm, ..
} => {
-            if alu_op == VecALUOp::Bsl || alu_op == VecALUOp::Umlal {
+            if alu_op == VecALUOp::Bsl {
collector.add_mod(rd);
} else {
collector.add_def(rd);
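
The add_mod (rather than add_def) treatment of rd for the Umlal variants
above reflects that UMLAL accumulates: the destination is read as well as
written, so the register allocator must keep its previous value alive. A
scalar model of one lane (illustrative only, not a Cranelift API):

    // umlal: acc' = acc + widen(a) * widen(b)
    fn umlal_lane_u16(acc: u32, a: u16, b: u16) -> u32 {
        acc.wrapping_add(a as u32 * b as u32)
    }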
@@ -2944,6 +2977,22 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
map_def(mapper, rd);
map_use(mapper, rn);
}
+        &mut Inst::VecRRRLong {
+            alu_op,
+            ref mut rd,
+            ref mut rn,
+            ref mut rm,
+            ..
+        } => {
+            match alu_op {
+                VecRRRLongOp::Umlal8
+                | VecRRRLongOp::Umlal16
+                | VecRRRLongOp::Umlal32 => map_mod(mapper, rd),
+                _ => map_def(mapper, rd),
+            };
+            map_use(mapper, rn);
+            map_use(mapper, rm);
+        }
&mut Inst::VecRRR {
alu_op,
ref mut rd,
@@ -2951,7 +3000,7 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
ref mut rm,
..
} => {
-            if alu_op == VecALUOp::Bsl || alu_op == VecALUOp::Umlal {
+            if alu_op == VecALUOp::Bsl {
map_mod(mapper, rd);
} else {
map_def(mapper, rd);
@@ -4147,24 +4196,62 @@ impl Inst {
VecALUOp::Fmin => ("fmin", size),
VecALUOp::Fmul => ("fmul", size),
VecALUOp::Addp => ("addp", size),
-                    VecALUOp::Umlal => ("umlal", size),
VecALUOp::Zip1 => ("zip1", size),
-                    VecALUOp::Smull => ("smull", size),
-                    VecALUOp::Smull2 => ("smull2", size),
VecALUOp::Sqrdmulh => ("sqrdmulh", size),
};
-                let rd_size = match alu_op {
-                    VecALUOp::Umlal | VecALUOp::Smull | VecALUOp::Smull2 => size.widen(),
-                    _ => size,
+                let rd = show_vreg_vector(rd.to_reg(), mb_rru, size);
+                let rn = show_vreg_vector(rn, mb_rru, size);
+                let rm = show_vreg_vector(rm, mb_rru, size);
format!("{} {}, {}, {}", op, rd, rn, rm)
}
+            &Inst::VecRRRLong {
+                rd,
+                rn,
+                rm,
+                alu_op,
+                high_half,
+            } => {
+                let (op, dest_size, src_size) = match (alu_op, high_half) {
+                    (VecRRRLongOp::Smull8, false) =>
+                        ("smull", VectorSize::Size16x8, VectorSize::Size8x8),
+                    (VecRRRLongOp::Smull8, true) =>
+                        ("smull2", VectorSize::Size16x8, VectorSize::Size8x16),
+                    (VecRRRLongOp::Smull16, false) =>
+                        ("smull", VectorSize::Size32x4, VectorSize::Size16x4),
+                    (VecRRRLongOp::Smull16, true) =>
+                        ("smull2", VectorSize::Size32x4, VectorSize::Size16x8),
+                    (VecRRRLongOp::Smull32, false) =>
+                        ("smull", VectorSize::Size64x2, VectorSize::Size32x2),
+                    (VecRRRLongOp::Smull32, true) =>
+                        ("smull2", VectorSize::Size64x2, VectorSize::Size32x4),
+                    (VecRRRLongOp::Umull8, false) =>
+                        ("umull", VectorSize::Size16x8, VectorSize::Size8x8),
+                    (VecRRRLongOp::Umull8, true) =>
+                        ("umull2", VectorSize::Size16x8, VectorSize::Size8x16),
+                    (VecRRRLongOp::Umull16, false) =>
+                        ("umull", VectorSize::Size32x4, VectorSize::Size16x4),
+                    (VecRRRLongOp::Umull16, true) =>
+                        ("umull2", VectorSize::Size32x4, VectorSize::Size16x8),
+                    (VecRRRLongOp::Umull32, false) =>
+                        ("umull", VectorSize::Size64x2, VectorSize::Size32x2),
+                    (VecRRRLongOp::Umull32, true) =>
+                        ("umull2", VectorSize::Size64x2, VectorSize::Size32x4),
+                    (VecRRRLongOp::Umlal8, false) =>
+                        ("umlal", VectorSize::Size16x8, VectorSize::Size8x8),
+                    (VecRRRLongOp::Umlal8, true) =>
+                        ("umlal2", VectorSize::Size16x8, VectorSize::Size8x16),
+                    (VecRRRLongOp::Umlal16, false) =>
+                        ("umlal", VectorSize::Size32x4, VectorSize::Size16x4),
+                    (VecRRRLongOp::Umlal16, true) =>
+                        ("umlal2", VectorSize::Size32x4, VectorSize::Size16x8),
+                    (VecRRRLongOp::Umlal32, false) =>
+                        ("umlal", VectorSize::Size64x2, VectorSize::Size32x2),
+                    (VecRRRLongOp::Umlal32, true) =>
+                        ("umlal2", VectorSize::Size64x2, VectorSize::Size32x4),
};
-                let rn_size = match alu_op {
-                    VecALUOp::Smull => size.halve(),
-                    _ => size,
-                };
-                let rm_size = rn_size;
-                let rd = show_vreg_vector(rd.to_reg(), mb_rru, rd_size);
-                let rn = show_vreg_vector(rn, mb_rru, rn_size);
-                let rm = show_vreg_vector(rm, mb_rru, rm_size);
+                let rd = show_vreg_vector(rd.to_reg(), mb_rru, dest_size);
+                let rn = show_vreg_vector(rn, mb_rru, src_size);
+                let rm = show_vreg_vector(rm, mb_rru, src_size);
+                format!("{} {}, {}, {}", op, rd, rn, rm)
+            }
&Inst::VecMisc { op, rd, rn, size } => {