arm64: Implement SIMD i64x2 multiply

Copyright (c) 2020, Arm Limited.
Joey Gouly
2020-08-20 13:26:03 +01:00
parent 693c6ea771
commit a518c10141
6 changed files with 380 additions and 30 deletions


@@ -211,6 +211,7 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
("simd", "simd_bitwise") => return false,
("simd", "simd_bit_shift") => return false,
("simd", "simd_boolean") => return false,
("simd", "simd_const") => return false,
("simd", "simd_f32x4") => return false,
("simd", "simd_f32x4_arith") => return false,
("simd", "simd_f32x4_cmp") => return false,
@@ -228,6 +229,7 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
("simd", "simd_i32x4_arith") => return false,
("simd", "simd_i32x4_arith2") => return false,
("simd", "simd_i32x4_cmp") => return false,
("simd", "simd_i64x2_arith") => return false,
("simd", "simd_lane") => return false,
("simd", "simd_load_extend") => return false,
("simd", "simd_load_splat") => return false,


@@ -647,6 +647,30 @@ impl VectorSize {
VectorSize::Size64x2 => ScalarSize::Size64,
}
}
pub fn is_128bits(&self) -> bool {
match self {
VectorSize::Size8x8 => false,
VectorSize::Size8x16 => true,
VectorSize::Size16x4 => false,
VectorSize::Size16x8 => true,
VectorSize::Size32x2 => false,
VectorSize::Size32x4 => true,
VectorSize::Size64x2 => true,
}
}
pub fn widen(&self) -> VectorSize {
match self {
VectorSize::Size8x8 => VectorSize::Size16x8,
VectorSize::Size8x16 => VectorSize::Size16x8,
VectorSize::Size16x4 => VectorSize::Size32x4,
VectorSize::Size16x8 => VectorSize::Size32x4,
VectorSize::Size32x2 => VectorSize::Size64x2,
VectorSize::Size32x4 => VectorSize::Size64x2,
VectorSize::Size64x2 => unreachable!(),
}
}
}
//=============================================================================
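As a quick illustration (not part of the commit) of how the two new helpers behave — `widen` is what the narrowing and widening instructions below use when printing their wider operand arrangement. This sketch assumes `VectorSize` derives `PartialEq` and `Debug`, as its use with `==` and `debug_assert_ne!` elsewhere in this commit suggests:

assert!(VectorSize::Size32x4.is_128bits());
assert!(!VectorSize::Size32x2.is_128bits());
assert_eq!(VectorSize::Size32x2.widen(), VectorSize::Size64x2);
assert_eq!(VectorSize::Size8x8.widen(), VectorSize::Size16x8);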


@@ -352,12 +352,12 @@ fn enc_fround(top22: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
(top22 << 10) | (machreg_to_vec(rn) << 5) | machreg_to_vec(rd.to_reg())
}
-fn enc_vec_rr_misc(u: u32, size: u32, bits_12_16: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
-debug_assert_eq!(u & 0b1, u);
+fn enc_vec_rr_misc(qu: u32, size: u32, bits_12_16: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
+debug_assert_eq!(qu & 0b11, qu);
debug_assert_eq!(size & 0b11, size);
debug_assert_eq!(bits_12_16 & 0b11111, bits_12_16);
-let bits = 0b0_1_0_01110_00_10000_00000_10_00000_00000;
-bits | u << 29
+let bits = 0b0_00_01110_00_10000_00000_10_00000_00000;
+bits | qu << 29
| size << 22
| bits_12_16 << 12
| machreg_to_vec(rn) << 5
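To make the new bit layout concrete, here is a standalone sketch (not part of the commit) of the helper with plain register numbers in place of `Reg`/`Writable<Reg>`, checked against the `xtn v22.2s, v8.2d` test case added further down — "1629A10E" in the tests is the little-endian byte order of 0x0EA12916:

fn enc_vec_rr_misc(qu: u32, size: u32, bits_12_16: u32, rd: u32, rn: u32) -> u32 {
    let bits = 0b0_00_01110_00_10000_00000_10_00000_00000;
    bits | qu << 29 | size << 22 | bits_12_16 << 12 | rn << 5 | rd
}

#[test]
fn xtn_encoding() {
    // xtn: Q=0 and U=0, size=0b10 (Size32x2.widen() is Size64x2),
    // bits 12..16 = 0b10010.
    assert_eq!(enc_vec_rr_misc(0b00, 0b10, 0b10010, 22, 8), 0x0EA12916);
}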
@@ -1367,13 +1367,14 @@ impl MachInstEmit for Inst {
sink.put4(enc_fpurrrr(top17, rd, rn, rm, ra));
}
&Inst::VecMisc { op, rd, rn, size } => {
-let enc_size = match size {
-VectorSize::Size8x16 => 0b00,
-VectorSize::Size16x8 => 0b01,
-VectorSize::Size32x4 => 0b10,
-VectorSize::Size64x2 => 0b11,
-_ => unimplemented!(),
+let enc_size = match size.lane_size() {
+ScalarSize::Size8 => 0b00,
+ScalarSize::Size16 => 0b01,
+ScalarSize::Size32 => 0b10,
+ScalarSize::Size64 => 0b11,
+_ => unreachable!(),
};
let q = if size.is_128bits() { 1 } else { 0 };
let (u, bits_12_16, size) = match op {
VecMisc2::Not => (0b1, 0b00101, 0b00),
VecMisc2::Neg => (0b1, 0b01011, enc_size),
@@ -1390,8 +1391,17 @@ impl MachInstEmit for Inst {
debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2);
(0b1, 0b11111, enc_size)
}
VecMisc2::Rev64 => {
debug_assert_ne!(VectorSize::Size64x2, size);
(0b0, 0b00000, enc_size)
}
VecMisc2::Shll => {
debug_assert_ne!(VectorSize::Size64x2, size);
debug_assert!(!size.is_128bits());
(0b1, 0b10011, enc_size)
}
};
-sink.put4(enc_vec_rr_misc(u, size, bits_12_16, rd, rn));
+sink.put4(enc_vec_rr_misc((q << 1) | u, size, bits_12_16, rd, rn));
}
&Inst::VecLanes { op, rd, rn, size } => {
let (q, size) = match size {
@@ -1651,6 +1661,17 @@ impl MachInstEmit for Inst {
| machreg_to_vec(rd.to_reg()),
);
}
&Inst::VecMiscNarrow { op, rd, rn, size } => {
debug_assert!(!size.is_128bits());
let size = match size.widen() {
VectorSize::Size64x2 => 0b10,
_ => unimplemented!(),
};
let (u, bits_12_16) = match op {
VecMiscNarrowOp::Xtn => (0b0, 0b10010),
};
sink.put4(enc_vec_rr_misc(u, size, bits_12_16, rd, rn));
}
&Inst::VecMovElement {
rd,
rn,
@@ -1685,12 +1706,12 @@ impl MachInstEmit for Inst {
alu_op,
size,
} => {
-let enc_size = match size {
-VectorSize::Size8x16 => 0b00,
-VectorSize::Size16x8 => 0b01,
-VectorSize::Size32x4 => 0b10,
-VectorSize::Size64x2 => 0b11,
-_ => 0,
+let enc_size = match size.lane_size() {
+ScalarSize::Size8 => 0b00,
+ScalarSize::Size16 => 0b01,
+ScalarSize::Size32 => 0b10,
+ScalarSize::Size64 => 0b11,
+_ => unreachable!(),
};
let is_float = match alu_op {
VecALUOp::Fcmeq
@@ -1751,6 +1772,11 @@ impl MachInstEmit for Inst {
VecALUOp::Fmax => (0b010_01110_00_1, 0b111101),
VecALUOp::Fmin => (0b010_01110_10_1, 0b111101),
VecALUOp::Fmul => (0b011_01110_00_1, 0b110111),
VecALUOp::Addp => (0b010_01110_00_1 | enc_size << 1, 0b101111),
VecALUOp::Umlal => {
debug_assert!(!size.is_128bits());
(0b001_01110_00_1 | enc_size << 1, 0b100000)
}
};
let top11 = if is_float {
top11 | enc_float_size << 1


@@ -2082,6 +2082,17 @@ fn test_aarch64_binemit() {
"mov v31.s[1], v16.s[0]",
));
insns.push((
Inst::VecMiscNarrow {
op: VecMiscNarrowOp::Xtn,
rd: writable_vreg(22),
rn: vreg(8),
size: VectorSize::Size32x2,
},
"1629A10E",
"xtn v22.2s, v8.2d",
));
insns.push((
Inst::VecRRR {
alu_op: VecALUOp::Sqadd,
@@ -3066,6 +3077,53 @@ fn test_aarch64_binemit() {
"fmul v2.2d, v0.2d, v5.2d",
));
insns.push((
Inst::VecRRR {
alu_op: VecALUOp::Addp,
rd: writable_vreg(16),
rn: vreg(12),
rm: vreg(1),
size: VectorSize::Size8x16,
},
"90BD214E",
"addp v16.16b, v12.16b, v1.16b",
));
insns.push((
Inst::VecRRR {
alu_op: VecALUOp::Addp,
rd: writable_vreg(8),
rn: vreg(12),
rm: vreg(14),
size: VectorSize::Size32x4,
},
"88BDAE4E",
"addp v8.4s, v12.4s, v14.4s",
));
insns.push((
Inst::VecRRR {
alu_op: VecALUOp::Umlal,
rd: writable_vreg(9),
rn: vreg(20),
rm: vreg(17),
size: VectorSize::Size32x2,
},
"8982B12E",
"umlal v9.2d, v20.2s, v17.2s",
));
insns.push((
Inst::VecMisc {
op: VecMisc2::Not,
rd: writable_vreg(20),
rn: vreg(17),
size: VectorSize::Size8x8,
},
"345A202E",
"mvn v20.8b, v17.8b",
));
insns.push((
Inst::VecMisc {
op: VecMisc2::Not,
@@ -3077,6 +3135,17 @@ fn test_aarch64_binemit() {
"mvn v2.16b, v1.16b",
));
insns.push((
Inst::VecMisc {
op: VecMisc2::Neg,
rd: writable_vreg(3),
rn: vreg(7),
size: VectorSize::Size8x8,
},
"E3B8202E",
"neg v3.8b, v7.8b",
));
insns.push((
Inst::VecMisc {
op: VecMisc2::Neg,
@@ -3121,6 +3190,17 @@ fn test_aarch64_binemit() {
"neg v10.2d, v8.2d",
));
insns.push((
Inst::VecMisc {
op: VecMisc2::Abs,
rd: writable_vreg(3),
rn: vreg(1),
size: VectorSize::Size8x8,
},
"23B8200E",
"abs v3.8b, v1.8b",
));
insns.push((
Inst::VecMisc {
op: VecMisc2::Abs,
@@ -3198,6 +3278,50 @@ fn test_aarch64_binemit() {
"fsqrt v7.2d, v18.2d",
));
insns.push((
Inst::VecMisc {
op: VecMisc2::Rev64,
rd: writable_vreg(1),
rn: vreg(10),
size: VectorSize::Size32x4,
},
"4109A04E",
"rev64 v1.4s, v10.4s",
));
insns.push((
Inst::VecMisc {
op: VecMisc2::Shll,
rd: writable_vreg(12),
rn: vreg(5),
size: VectorSize::Size8x8,
},
"AC38212E",
"shll v12.8h, v5.8b, #8",
));
insns.push((
Inst::VecMisc {
op: VecMisc2::Shll,
rd: writable_vreg(9),
rn: vreg(1),
size: VectorSize::Size16x4,
},
"2938612E",
"shll v9.4s, v1.4h, #16",
));
insns.push((
Inst::VecMisc {
op: VecMisc2::Shll,
rd: writable_vreg(1),
rn: vreg(10),
size: VectorSize::Size32x2,
},
"4139A12E",
"shll v1.2d, v10.2s, #32",
));
insns.push((
Inst::VecLanes {
op: VecLanesOp::Uminv,


@@ -283,6 +283,10 @@ pub enum VecALUOp {
Fmin,
/// Floating-point multiply
Fmul,
/// Add pairwise
Addp,
/// Unsigned multiply-add long
Umlal,
}
/// A Vector miscellaneous operation with two registers.
@@ -300,6 +304,17 @@ pub enum VecMisc2 {
Fneg,
/// Floating-point square root
Fsqrt,
/// Reverse elements in 64-bit doublewords
Rev64,
/// Shift left long (by element size)
Shll,
}
/// A Vector narrowing operation with two registers.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
pub enum VecMiscNarrowOp {
/// Extract Narrow
Xtn,
}
/// An operation across the lanes of vectors.
@@ -880,6 +895,14 @@ pub enum Inst {
size: VectorSize,
},
/// Vector narrowing operation.
VecMiscNarrow {
op: VecMiscNarrowOp,
rd: Writable<Reg>,
rn: Reg,
size: VectorSize,
},
/// A vector ALU op.
VecRRR {
alu_op: VecALUOp,
@@ -1605,10 +1628,14 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
collector.add_mod(rd);
collector.add_use(rn);
}
&Inst::VecMiscNarrow { rd, rn, .. } => {
collector.add_def(rd);
collector.add_use(rn);
}
&Inst::VecRRR {
alu_op, rd, rn, rm, ..
} => {
-if alu_op == VecALUOp::Bsl {
+if alu_op == VecALUOp::Bsl || alu_op == VecALUOp::Umlal {
collector.add_mod(rd);
} else {
collector.add_def(rd);
@@ -2270,6 +2297,14 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
map_mod(mapper, rd);
map_use(mapper, rn);
}
&mut Inst::VecMiscNarrow {
ref mut rd,
ref mut rn,
..
} => {
map_def(mapper, rd);
map_use(mapper, rn);
}
&mut Inst::VecRRR {
alu_op,
ref mut rd,
@@ -2277,7 +2312,7 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
ref mut rm,
..
} => {
-if alu_op == VecALUOp::Bsl {
+if alu_op == VecALUOp::Bsl || alu_op == VecALUOp::Umlal {
map_mod(mapper, rd);
} else {
map_def(mapper, rd);
@@ -3144,6 +3179,14 @@ impl Inst {
let rn = show_vreg_element(rn, mb_rru, idx2, size);
format!("mov {}, {}", rd, rn)
}
&Inst::VecMiscNarrow { op, rd, rn, size } => {
let rd = show_vreg_vector(rd.to_reg(), mb_rru, size);
let rn = show_vreg_vector(rn, mb_rru, size.widen());
let op = match op {
VecMiscNarrowOp::Xtn => "xtn",
};
format!("{} {}, {}", op, rd, rn)
}
&Inst::VecRRR {
rd,
rn,
@@ -3186,25 +3229,51 @@ impl Inst {
VecALUOp::Fmax => ("fmax", size),
VecALUOp::Fmin => ("fmin", size),
VecALUOp::Fmul => ("fmul", size),
VecALUOp::Addp => ("addp", size),
VecALUOp::Umlal => ("umlal", size),
};
-let rd = show_vreg_vector(rd.to_reg(), mb_rru, size);
+let rd_size = if alu_op == VecALUOp::Umlal {
+size.widen()
+} else {
+size
+};
+let rd = show_vreg_vector(rd.to_reg(), mb_rru, rd_size);
let rn = show_vreg_vector(rn, mb_rru, size);
let rm = show_vreg_vector(rm, mb_rru, size);
format!("{} {}, {}, {}", op, rd, rn, rm)
}
&Inst::VecMisc { op, rd, rn, size } => {
let is_shll = op == VecMisc2::Shll;
let suffix = match (is_shll, size) {
(true, VectorSize::Size8x8) => ", #8",
(true, VectorSize::Size16x4) => ", #16",
(true, VectorSize::Size32x2) => ", #32",
_ => "",
};
let (op, size) = match op {
VecMisc2::Not => ("mvn", VectorSize::Size8x16),
VecMisc2::Not => (
"mvn",
if size.is_128bits() {
VectorSize::Size8x16
} else {
VectorSize::Size8x8
},
),
VecMisc2::Neg => ("neg", size),
VecMisc2::Abs => ("abs", size),
VecMisc2::Fabs => ("fabs", size),
VecMisc2::Fneg => ("fneg", size),
VecMisc2::Fsqrt => ("fsqrt", size),
VecMisc2::Rev64 => ("rev64", size),
VecMisc2::Shll => ("shll", size),
};
-let rd = show_vreg_vector(rd.to_reg(), mb_rru, size);
+let rd_size = if is_shll { size.widen() } else { size };
+let rd = show_vreg_vector(rd.to_reg(), mb_rru, rd_size);
let rn = show_vreg_vector(rn, mb_rru, size);
format!("{} {}, {}", op, rd, rn)
format!("{} {}, {}{}", op, rd, rn, suffix)
}
&Inst::VecLanes { op, rd, rn, size } => {
let op = match op {


@@ -210,6 +210,110 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
rm,
ra: zero_reg(),
});
} else {
if ty == I64X2 {
let tmp1 = ctx.alloc_tmp(RegClass::V128, I64X2);
let tmp2 = ctx.alloc_tmp(RegClass::V128, I64X2);
// This I64X2 multiplication is performed with several 32-bit
// operations.
// 64-bit numbers x and y can be represented as:
// x = a + 2^32(b)
// y = c + 2^32(d)
// A 64-bit multiplication is:
// x * y = ac + 2^32(ad + bc) + 2^64(bd)
// note: the `2^64(bd)` term can be ignored, as it is
// too large to fit in 64 bits.
// This sequence implements an I64X2 multiply, where the registers
// `rn` and `rm` are split up into 32-bit components:
// rn = |d|c|b|a|
// rm = |h|g|f|e|
//
// rn * rm = |cg + 2^32(ch + dg)|ae + 2^32(af + be)|
//
// The sequence is:
// rev64 rd.4s, rm.4s
// mul rd.4s, rd.4s, rn.4s
// xtn tmp1.2s, rn.2d
// addp rd.4s, rd.4s, rd.4s
// xtn tmp2.2s, rm.2d
// shll rd.2d, rd.2s, #32
// umlal rd.2d, tmp2.2s, tmp1.2s
// Reverse the 32-bit elements in the 64-bit words.
// rd = |g|h|e|f|
ctx.emit(Inst::VecMisc {
op: VecMisc2::Rev64,
rd,
rn: rm,
size: VectorSize::Size32x4,
});
// Calculate the high half components.
// rd = |dg|ch|be|af|
//
// Note that this 32-bit multiply of the high-half
// components discards the bits that would overflow,
// just as 64-bit operations would. The Shll below
// would shift out the overflow bits anyway.
ctx.emit(Inst::VecRRR {
alu_op: VecALUOp::Mul,
rd,
rn: rd.to_reg(),
rm: rn,
size: VectorSize::Size32x4,
});
// Extract the low half components of rn.
// tmp1 = |c|a|
ctx.emit(Inst::VecMiscNarrow {
op: VecMiscNarrowOp::Xtn,
rd: tmp1,
rn,
size: VectorSize::Size32x2,
});
// Sum the respective high half components.
// rd = |dg+ch|be+af|dg+ch|be+af|
ctx.emit(Inst::VecRRR {
alu_op: VecALUOp::Addp,
rd,
rn: rd.to_reg(),
rm: rd.to_reg(),
size: VectorSize::Size32x4,
});
// Extract the low half components of rm.
// tmp2 = |g|e|
ctx.emit(Inst::VecMiscNarrow {
op: VecMiscNarrowOp::Xtn,
rd: tmp2,
rn: rm,
size: VectorSize::Size32x2,
});
// Shift the high-half components into the high half.
// rd = |dg+ch << 32|be+af << 32|
ctx.emit(Inst::VecMisc {
op: VecMisc2::Shll,
rd,
rn: rd.to_reg(),
size: VectorSize::Size32x2,
});
// Multiply the low components together and accumulate with the high
// half.
// rd = |rd[1] + cg|rd[0] + ae|
ctx.emit(Inst::VecRRR {
alu_op: VecALUOp::Umlal,
rd,
rn: tmp2.to_reg(),
rm: tmp1.to_reg(),
size: VectorSize::Size32x2,
});
} else {
ctx.emit(Inst::VecRRR {
alu_op: VecALUOp::Mul,
@@ -220,6 +324,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
});
}
}
}
Opcode::Umulhi | Opcode::Smulhi => {
let rd = get_output_reg(ctx, outputs[0]);
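Since the comment sequence in the I64X2 lowering above tracks seven instructions' worth of lane shuffling, a scalar model can help verify the arithmetic. The following sketch (not part of the commit) emulates each step on plain integers, holding each register as 32-bit words with the names a..h used in the comments, and checks the result against `wrapping_mul`:

fn i64x2_mul_model(rn: [u64; 2], rm: [u64; 2]) -> [u64; 2] {
    // Split each 64-bit lane into 32-bit words; index 0 is least significant,
    // so n = [a, b, c, d] and m = [e, f, g, h] as in the comments above.
    let split = |x: [u64; 2]| [x[0] as u32, (x[0] >> 32) as u32, x[1] as u32, (x[1] >> 32) as u32];
    let (n, m) = (split(rn), split(rm));
    // rev64 rd.4s, rm.4s: swap the 32-bit halves of each 64-bit lane.
    let rd = [m[1], m[0], m[3], m[2]]; // |g|h|e|f|
    // mul rd.4s, rd.4s, rn.4s: lane-wise 32-bit multiply, overflow discarded.
    let rd = [
        rd[0].wrapping_mul(n[0]), // af
        rd[1].wrapping_mul(n[1]), // be
        rd[2].wrapping_mul(n[2]), // ch
        rd[3].wrapping_mul(n[3]), // dg
    ];
    // xtn tmp1.2s, rn.2d: keep the low 32 bits of each 64-bit lane.
    let tmp1 = [n[0], n[2]]; // |c|a|
    // addp rd.4s, rd.4s, rd.4s: pairwise add; both halves of the result hold
    // the same sums, and only the low half feeds the shll below.
    let (s0, s1) = (rd[0].wrapping_add(rd[1]), rd[2].wrapping_add(rd[3]));
    // xtn tmp2.2s, rm.2d
    let tmp2 = [m[0], m[2]]; // |g|e|
    // shll rd.2d, rd.2s, #32: widen the two low lanes and shift left by 32.
    let rd = [(s0 as u64) << 32, (s1 as u64) << 32];
    // umlal rd.2d, tmp2.2s, tmp1.2s: rd += zext(tmp2) * zext(tmp1).
    [
        rd[0].wrapping_add(tmp2[0] as u64 * tmp1[0] as u64), // + ae
        rd[1].wrapping_add(tmp2[1] as u64 * tmp1[1] as u64), // + cg
    ]
}

#[test]
fn model_matches_wrapping_mul() {
    let (x, y) = ([3u64, u64::MAX], [5u64, 7]);
    assert_eq!(
        i64x2_mul_model(x, y),
        [x[0].wrapping_mul(y[0]), x[1].wrapping_mul(y[1])]
    );
}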