From a518c101416cd62202d5a5baab4a6d3f160f0e97 Mon Sep 17 00:00:00 2001
From: Joey Gouly
Date: Thu, 20 Aug 2020 13:26:03 +0100
Subject: [PATCH] arm64: Implement SIMD i64x2 multiply

Copyright (c) 2020, Arm Limited.
---
 build.rs                                      |   2 +
 .../codegen/src/isa/aarch64/inst/args.rs      |  24 ++++
 .../codegen/src/isa/aarch64/inst/emit.rs      |  60 ++++++---
 .../src/isa/aarch64/inst/emit_tests.rs        | 124 ++++++++++++++++++
 cranelift/codegen/src/isa/aarch64/inst/mod.rs |  81 +++++++++++-
 .../codegen/src/isa/aarch64/lower_inst.rs     | 119 ++++++++++++++++-
 6 files changed, 380 insertions(+), 30 deletions(-)

diff --git a/build.rs b/build.rs
index 04dd042853..cd9ebdc610 100644
--- a/build.rs
+++ b/build.rs
@@ -211,6 +211,7 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
         ("simd", "simd_bitwise") => return false,
         ("simd", "simd_bit_shift") => return false,
         ("simd", "simd_boolean") => return false,
+        ("simd", "simd_const") => return false,
         ("simd", "simd_f32x4") => return false,
         ("simd", "simd_f32x4_arith") => return false,
         ("simd", "simd_f32x4_cmp") => return false,
@@ -228,6 +229,7 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
         ("simd", "simd_i32x4_arith") => return false,
         ("simd", "simd_i32x4_arith2") => return false,
         ("simd", "simd_i32x4_cmp") => return false,
+        ("simd", "simd_i64x2_arith") => return false,
         ("simd", "simd_lane") => return false,
         ("simd", "simd_load_extend") => return false,
         ("simd", "simd_load_splat") => return false,
diff --git a/cranelift/codegen/src/isa/aarch64/inst/args.rs b/cranelift/codegen/src/isa/aarch64/inst/args.rs
index 060660fbd9..0045e5b088 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/args.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/args.rs
@@ -647,6 +647,30 @@ impl VectorSize {
             VectorSize::Size64x2 => ScalarSize::Size64,
         }
     }
+
+    pub fn is_128bits(&self) -> bool {
+        match self {
+            VectorSize::Size8x8 => false,
+            VectorSize::Size8x16 => true,
+            VectorSize::Size16x4 => false,
+            VectorSize::Size16x8 => true,
+            VectorSize::Size32x2 => false,
+            VectorSize::Size32x4 => true,
+            VectorSize::Size64x2 => true,
+        }
+    }
+
+    pub fn widen(&self) -> VectorSize {
+        match self {
+            VectorSize::Size8x8 => VectorSize::Size16x8,
+            VectorSize::Size8x16 => VectorSize::Size16x8,
+            VectorSize::Size16x4 => VectorSize::Size32x4,
+            VectorSize::Size16x8 => VectorSize::Size32x4,
+            VectorSize::Size32x2 => VectorSize::Size64x2,
+            VectorSize::Size32x4 => VectorSize::Size64x2,
+            VectorSize::Size64x2 => unreachable!(),
+        }
+    }
 }
 
 //=============================================================================
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
index 6dcfb56249..fb69790981 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -352,12 +352,12 @@ fn enc_fround(top22: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
     (top22 << 10) | (machreg_to_vec(rn) << 5) | machreg_to_vec(rd.to_reg())
 }
 
-fn enc_vec_rr_misc(u: u32, size: u32, bits_12_16: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
-    debug_assert_eq!(u & 0b1, u);
+fn enc_vec_rr_misc(qu: u32, size: u32, bits_12_16: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
+    debug_assert_eq!(qu & 0b11, qu);
     debug_assert_eq!(size & 0b11, size);
     debug_assert_eq!(bits_12_16 & 0b11111, bits_12_16);
-    let bits = 0b0_1_0_01110_00_10000_00000_10_00000_00000;
-    bits | u << 29
+    let bits = 0b0_00_01110_00_10000_00000_10_00000_00000;
+    bits | qu << 29
         | size << 22
         | bits_12_16 << 12
         | machreg_to_vec(rn) << 5
@@ -1367,13 +1367,14 @@ impl MachInstEmit for Inst {
             sink.put4(enc_fpurrrr(top17, rd, rn, rm, ra));
         }
         &Inst::VecMisc { op, rd, rn, size } => {
-            let enc_size = match size {
-                VectorSize::Size8x16 => 0b00,
-                VectorSize::Size16x8 => 0b01,
-                VectorSize::Size32x4 => 0b10,
-                VectorSize::Size64x2 => 0b11,
-                _ => unimplemented!(),
+            let enc_size = match size.lane_size() {
+                ScalarSize::Size8 => 0b00,
+                ScalarSize::Size16 => 0b01,
+                ScalarSize::Size32 => 0b10,
+                ScalarSize::Size64 => 0b11,
+                _ => unreachable!(),
             };
+            let q = if size.is_128bits() { 1 } else { 0 };
             let (u, bits_12_16, size) = match op {
                 VecMisc2::Not => (0b1, 0b00101, 0b00),
                 VecMisc2::Neg => (0b1, 0b01011, enc_size),
@@ -1390,8 +1391,17 @@ impl MachInstEmit for Inst {
                     debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2);
                     (0b1, 0b11111, enc_size)
                 }
+                VecMisc2::Rev64 => {
+                    debug_assert_ne!(VectorSize::Size64x2, size);
+                    (0b0, 0b00000, enc_size)
+                }
+                VecMisc2::Shll => {
+                    debug_assert_ne!(VectorSize::Size64x2, size);
+                    debug_assert!(!size.is_128bits());
+                    (0b1, 0b10011, enc_size)
+                }
             };
-            sink.put4(enc_vec_rr_misc(u, size, bits_12_16, rd, rn));
+            sink.put4(enc_vec_rr_misc((q << 1) | u, size, bits_12_16, rd, rn));
         }
         &Inst::VecLanes { op, rd, rn, size } => {
             let (q, size) = match size {
@@ -1651,6 +1661,17 @@ impl MachInstEmit for Inst {
                 | machreg_to_vec(rd.to_reg()),
             );
         }
+        &Inst::VecMiscNarrow { op, rd, rn, size } => {
+            debug_assert!(!size.is_128bits());
+            let size = match size.widen() {
+                VectorSize::Size64x2 => 0b10,
+                _ => unimplemented!(),
+            };
+            let (u, bits_12_16) = match op {
+                VecMiscNarrowOp::Xtn => (0b0, 0b10010),
+            };
+            sink.put4(enc_vec_rr_misc(u, size, bits_12_16, rd, rn));
+        }
         &Inst::VecMovElement {
             rd,
             rn,
@@ -1685,12 +1706,12 @@ impl MachInstEmit for Inst {
             alu_op,
             size,
         } => {
-            let enc_size = match size {
-                VectorSize::Size8x16 => 0b00,
-                VectorSize::Size16x8 => 0b01,
-                VectorSize::Size32x4 => 0b10,
-                VectorSize::Size64x2 => 0b11,
-                _ => 0,
+            let enc_size = match size.lane_size() {
+                ScalarSize::Size8 => 0b00,
+                ScalarSize::Size16 => 0b01,
+                ScalarSize::Size32 => 0b10,
+                ScalarSize::Size64 => 0b11,
+                _ => unreachable!(),
             };
             let is_float = match alu_op {
                 VecALUOp::Fcmeq
@@ -1751,6 +1772,11 @@ impl MachInstEmit for Inst {
                 VecALUOp::Fmax => (0b010_01110_00_1, 0b111101),
                 VecALUOp::Fmin => (0b010_01110_10_1, 0b111101),
                 VecALUOp::Fmul => (0b011_01110_00_1, 0b110111),
+                VecALUOp::Addp => (0b010_01110_00_1 | enc_size << 1, 0b101111),
+                VecALUOp::Umlal => {
+                    debug_assert!(!size.is_128bits());
+                    (0b001_01110_00_1 | enc_size << 1, 0b100000)
+                }
             };
             let top11 = if is_float {
                 top11 | enc_float_size << 1
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
index 2b2f48f802..7fba35f2bc 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
@@ -2082,6 +2082,17 @@ fn test_aarch64_binemit() {
         "mov v31.s[1], v16.s[0]",
     ));
 
+    insns.push((
+        Inst::VecMiscNarrow {
+            op: VecMiscNarrowOp::Xtn,
+            rd: writable_vreg(22),
+            rn: vreg(8),
+            size: VectorSize::Size32x2,
+        },
+        "1629A10E",
+        "xtn v22.2s, v8.2d",
+    ));
+
     insns.push((
         Inst::VecRRR {
             alu_op: VecALUOp::Sqadd,
@@ -3066,6 +3077,53 @@ fn test_aarch64_binemit() {
         "fmul v2.2d, v0.2d, v5.2d",
     ));
 
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Addp,
+            rd: writable_vreg(16),
+            rn: vreg(12),
+            rm: vreg(1),
+            size: VectorSize::Size8x16,
+        },
+        "90BD214E",
+        "addp v16.16b, v12.16b, v1.16b",
+    ));
+
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Addp,
+            rd: writable_vreg(8),
+            rn: vreg(12),
+            rm: vreg(14),
+            size: VectorSize::Size32x4,
+        },
+        "88BDAE4E",
+        "addp v8.4s, v12.4s, v14.4s",
+    ));
+
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Umlal,
+            rd: writable_vreg(9),
+            rn: vreg(20),
+            rm: vreg(17),
+            size: VectorSize::Size32x2,
+        },
+        "8982B12E",
+        "umlal v9.2d, v20.2s, v17.2s",
+    ));
+
+    insns.push((
+        Inst::VecMisc {
+            op: VecMisc2::Not,
+            rd: writable_vreg(20),
+            rn: vreg(17),
+            size: VectorSize::Size8x8,
+        },
+        "345A202E",
+        "mvn v20.8b, v17.8b",
+    ));
+
     insns.push((
         Inst::VecMisc {
             op: VecMisc2::Not,
@@ -3077,6 +3135,17 @@ fn test_aarch64_binemit() {
         "mvn v2.16b, v1.16b",
     ));
 
+    insns.push((
+        Inst::VecMisc {
+            op: VecMisc2::Neg,
+            rd: writable_vreg(3),
+            rn: vreg(7),
+            size: VectorSize::Size8x8,
+        },
+        "E3B8202E",
+        "neg v3.8b, v7.8b",
+    ));
+
     insns.push((
         Inst::VecMisc {
             op: VecMisc2::Neg,
@@ -3121,6 +3190,17 @@ fn test_aarch64_binemit() {
         "neg v10.2d, v8.2d",
     ));
 
+    insns.push((
+        Inst::VecMisc {
+            op: VecMisc2::Abs,
+            rd: writable_vreg(3),
+            rn: vreg(1),
+            size: VectorSize::Size8x8,
+        },
+        "23B8200E",
+        "abs v3.8b, v1.8b",
+    ));
+
     insns.push((
         Inst::VecMisc {
             op: VecMisc2::Abs,
@@ -3198,6 +3278,50 @@ fn test_aarch64_binemit() {
         "fsqrt v7.2d, v18.2d",
     ));
 
+    insns.push((
+        Inst::VecMisc {
+            op: VecMisc2::Rev64,
+            rd: writable_vreg(1),
+            rn: vreg(10),
+            size: VectorSize::Size32x4,
+        },
+        "4109A04E",
+        "rev64 v1.4s, v10.4s",
+    ));
+
+    insns.push((
+        Inst::VecMisc {
+            op: VecMisc2::Shll,
+            rd: writable_vreg(12),
+            rn: vreg(5),
+            size: VectorSize::Size8x8,
+        },
+        "AC38212E",
+        "shll v12.8h, v5.8b, #8",
+    ));
+
+    insns.push((
+        Inst::VecMisc {
+            op: VecMisc2::Shll,
+            rd: writable_vreg(9),
+            rn: vreg(1),
+            size: VectorSize::Size16x4,
+        },
+        "2938612E",
+        "shll v9.4s, v1.4h, #16",
+    ));
+
+    insns.push((
+        Inst::VecMisc {
+            op: VecMisc2::Shll,
+            rd: writable_vreg(1),
+            rn: vreg(10),
+            size: VectorSize::Size32x2,
+        },
+        "4139A12E",
+        "shll v1.2d, v10.2s, #32",
+    ));
+
     insns.push((
         Inst::VecLanes {
             op: VecLanesOp::Uminv,
diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
index 27868f96dc..46f6edc2e8 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -283,6 +283,10 @@ pub enum VecALUOp {
     Fmin,
     /// Floating-point multiply
     Fmul,
+    /// Add pairwise
+    Addp,
+    /// Unsigned multiply-add long
+    Umlal,
 }
 
 /// A Vector miscellaneous operation with two registers.
@@ -300,6 +304,17 @@ pub enum VecMisc2 {
     Fneg,
     /// Floating-point square root
     Fsqrt,
+    /// Reverse elements in 64-bit doublewords
+    Rev64,
+    /// Shift left long (by element size)
+    Shll,
+}
+
+/// A Vector narrowing operation with two registers.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
+pub enum VecMiscNarrowOp {
+    /// Extract Narrow
+    Xtn,
 }
 
 /// An operation across the lanes of vectors.
@@ -880,6 +895,14 @@ pub enum Inst {
         size: VectorSize,
     },
 
+    /// Vector narrowing operation.
+    VecMiscNarrow {
+        op: VecMiscNarrowOp,
+        rd: Writable<Reg>,
+        rn: Reg,
+        size: VectorSize,
+    },
+
     /// A vector ALU op.
     VecRRR {
         alu_op: VecALUOp,
@@ -1605,10 +1628,14 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
             collector.add_mod(rd);
             collector.add_use(rn);
         }
+        &Inst::VecMiscNarrow { rd, rn, .. } => {
+            collector.add_def(rd);
+            collector.add_use(rn);
+        }
         &Inst::VecRRR {
             alu_op, rd, rn, rm, ..
         } => {
-            if alu_op == VecALUOp::Bsl {
+            if alu_op == VecALUOp::Bsl || alu_op == VecALUOp::Umlal {
                 collector.add_mod(rd);
             } else {
                 collector.add_def(rd);
@@ -2270,6 +2297,14 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
             map_mod(mapper, rd);
             map_use(mapper, rn);
         }
+        &mut Inst::VecMiscNarrow {
+            ref mut rd,
+            ref mut rn,
+            ..
+        } => {
+            map_def(mapper, rd);
+            map_use(mapper, rn);
+        }
         &mut Inst::VecRRR {
             alu_op,
             ref mut rd,
@@ -2277,7 +2312,7 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
             ref mut rm,
             ..
         } => {
-            if alu_op == VecALUOp::Bsl {
+            if alu_op == VecALUOp::Bsl || alu_op == VecALUOp::Umlal {
                 map_mod(mapper, rd);
             } else {
                 map_def(mapper, rd);
@@ -3144,6 +3179,14 @@ impl Inst {
                 let rn = show_vreg_element(rn, mb_rru, idx2, size);
                 format!("mov {}, {}", rd, rn)
            }
+            &Inst::VecMiscNarrow { op, rd, rn, size } => {
+                let rd = show_vreg_vector(rd.to_reg(), mb_rru, size);
+                let rn = show_vreg_vector(rn, mb_rru, size.widen());
+                let op = match op {
+                    VecMiscNarrowOp::Xtn => "xtn",
+                };
+                format!("{} {}, {}", op, rd, rn)
+            }
             &Inst::VecRRR {
                 rd,
                 rn,
@@ -3186,25 +3229,51 @@ impl Inst {
                     VecALUOp::Fmax => ("fmax", size),
                     VecALUOp::Fmin => ("fmin", size),
                     VecALUOp::Fmul => ("fmul", size),
+                    VecALUOp::Addp => ("addp", size),
+                    VecALUOp::Umlal => ("umlal", size),
                 };
-                let rd = show_vreg_vector(rd.to_reg(), mb_rru, size);
+                let rd_size = if alu_op == VecALUOp::Umlal {
+                    size.widen()
+                } else {
+                    size
+                };
+                let rd = show_vreg_vector(rd.to_reg(), mb_rru, rd_size);
                 let rn = show_vreg_vector(rn, mb_rru, size);
                 let rm = show_vreg_vector(rm, mb_rru, size);
                 format!("{} {}, {}, {}", op, rd, rn, rm)
             }
             &Inst::VecMisc { op, rd, rn, size } => {
+                let is_shll = op == VecMisc2::Shll;
+                let suffix = match (is_shll, size) {
+                    (true, VectorSize::Size8x8) => ", #8",
+                    (true, VectorSize::Size16x4) => ", #16",
+                    (true, VectorSize::Size32x2) => ", #32",
+                    _ => "",
+                };
+
                 let (op, size) = match op {
-                    VecMisc2::Not => ("mvn", VectorSize::Size8x16),
+                    VecMisc2::Not => (
+                        "mvn",
+                        if size.is_128bits() {
+                            VectorSize::Size8x16
+                        } else {
+                            VectorSize::Size8x8
+                        },
+                    ),
                     VecMisc2::Neg => ("neg", size),
                     VecMisc2::Abs => ("abs", size),
                     VecMisc2::Fabs => ("fabs", size),
                     VecMisc2::Fneg => ("fneg", size),
                     VecMisc2::Fsqrt => ("fsqrt", size),
+                    VecMisc2::Rev64 => ("rev64", size),
+                    VecMisc2::Shll => ("shll", size),
                 };
-                let rd = show_vreg_vector(rd.to_reg(), mb_rru, size);
+                let rd_size = if is_shll { size.widen() } else { size };
+
+                let rd = show_vreg_vector(rd.to_reg(), mb_rru, rd_size);
                 let rn = show_vreg_vector(rn, mb_rru, size);
-                format!("{} {}, {}", op, rd, rn)
+                format!("{} {}, {}{}", op, rd, rn, suffix)
             }
             &Inst::VecLanes { op, rd, rn, size } => {
                 let op = match op {
diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
index 5fe62da697..5f8823a3d5 100644
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -211,13 +211,118 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 ctx.emit(Inst::FpuRRRR {
                     ra: zero_reg(),
                 });
             } else {
-                ctx.emit(Inst::VecRRR {
-                    alu_op: VecALUOp::Mul,
-                    rd,
-                    rn,
-                    rm,
-                    size: VectorSize::from_ty(ty),
-                });
+                if ty == I64X2 {
+                    let tmp1 = ctx.alloc_tmp(RegClass::V128, I64X2);
+                    let tmp2 = ctx.alloc_tmp(RegClass::V128, I64X2);
+
+                    // This I64X2 multiplication is performed with several 32-bit
+                    // operations.
+
+                    // 64-bit numbers x and y can be represented as:
+                    // x = a + 2^32(b)
+                    // y = c + 2^32(d)
+
+                    // A 64-bit multiplication is:
+                    // x * y = ac + 2^32(ad + bc) + 2^64(bd)
+                    // note: the `2^64(bd)` term can be ignored, as it is too
+                    // large to fit in 64 bits.
+
+                    // This sequence implements an I64X2 multiply, where the registers
+                    // `rn` and `rm` are split up into 32-bit components:
+                    // rn = |d|c|b|a|
+                    // rm = |h|g|f|e|
+                    //
+                    // rn * rm = |cg + 2^32(ch + dg)|ae + 2^32(af + be)|
+                    //
+                    // The sequence is:
+                    // rev64 rd.4s, rm.4s
+                    // mul rd.4s, rd.4s, rn.4s
+                    // xtn tmp1.2s, rn.2d
+                    // addp rd.4s, rd.4s, rd.4s
+                    // xtn tmp2.2s, rm.2d
+                    // shll rd.2d, rd.2s, #32
+                    // umlal rd.2d, tmp2.2s, tmp1.2s
+
+                    // Reverse the 32-bit elements in the 64-bit words.
+                    // rd = |g|h|e|f|
+                    ctx.emit(Inst::VecMisc {
+                        op: VecMisc2::Rev64,
+                        rd,
+                        rn: rm,
+                        size: VectorSize::Size32x4,
+                    });
+
+                    // Calculate the high half components.
+                    // rd = |dg|ch|be|af|
+                    //
+                    // Note that this 32-bit multiply of the high half components
+                    // discards the bits that would overflow, just as 64-bit
+                    // operations would. The Shll below would also shift out
+                    // those overflow bits anyway.
+                    ctx.emit(Inst::VecRRR {
+                        alu_op: VecALUOp::Mul,
+                        rd,
+                        rn: rd.to_reg(),
+                        rm: rn,
+                        size: VectorSize::Size32x4,
+                    });
+
+                    // Extract the low half components of rn.
+                    // tmp1 = |c|a|
+                    ctx.emit(Inst::VecMiscNarrow {
+                        op: VecMiscNarrowOp::Xtn,
+                        rd: tmp1,
+                        rn,
+                        size: VectorSize::Size32x2,
+                    });
+
+                    // Sum the respective high half components.
+                    // rd = |dg+ch|be+af||dg+ch|be+af|
+                    ctx.emit(Inst::VecRRR {
+                        alu_op: VecALUOp::Addp,
+                        rd,
+                        rn: rd.to_reg(),
+                        rm: rd.to_reg(),
+                        size: VectorSize::Size32x4,
+                    });
+
+                    // Extract the low half components of rm.
+                    // tmp2 = |g|e|
+                    ctx.emit(Inst::VecMiscNarrow {
+                        op: VecMiscNarrowOp::Xtn,
+                        rd: tmp2,
+                        rn: rm,
+                        size: VectorSize::Size32x2,
+                    });
+
+                    // Shift the summed high half components into the high half.
+                    // rd = |(dg+ch) << 32|(be+af) << 32|
+                    ctx.emit(Inst::VecMisc {
+                        op: VecMisc2::Shll,
+                        rd,
+                        rn: rd.to_reg(),
+                        size: VectorSize::Size32x2,
+                    });
+
+                    // Multiply the low half components together and accumulate
+                    // the result with the high half.
+                    // rd = |rd[1] + cg|rd[0] + ae|
+                    ctx.emit(Inst::VecRRR {
+                        alu_op: VecALUOp::Umlal,
+                        rd,
+                        rn: tmp2.to_reg(),
+                        rm: tmp1.to_reg(),
+                        size: VectorSize::Size32x2,
+                    });
+                } else {
+                    ctx.emit(Inst::VecRRR {
+                        alu_op: VecALUOp::Mul,
+                        rd,
+                        rn,
+                        rm,
+                        size: VectorSize::from_ty(ty),
+                    });
+                }
             }
         }
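
A quick way to convince yourself of the decomposition used in the lowering above is a scalar model of a single 64-bit lane. The sketch below is not part of the patch: mul64_via_32 is a hypothetical helper, written only for illustration, that mirrors the mul (truncated 32-bit cross products), addp (summing the high halves), shll (shift into the upper word) and umlal (accumulate the low product) steps, and checks the result against Rust's wrapping_mul:

// Scalar model of the i64x2 lane decomposition; illustration only,
// not code from this patch.
fn mul64_via_32(x: u64, y: u64) -> u64 {
    // Split each lane as in the comment above: x = a + 2^32*b, y = e + 2^32*f.
    let (a, b) = (x as u32 as u64, x >> 32);
    let (e, f) = (y as u32 as u64, y >> 32);
    // "mul": the cross products a*f and b*e, truncated to 32 bits like the
    // 32-bit lane multiply; "addp": sum them modulo 2^32. The discarded
    // overflow bits would be shifted out by the shll step anyway.
    let hi = (a.wrapping_mul(f) as u32).wrapping_add(b.wrapping_mul(e) as u32) as u64;
    // "shll" + "umlal": move the sum into the upper 32 bits, then accumulate
    // the full 32x32->64 low product a*e; the 2^64*(b*f) term overflows
    // 64 bits and is dropped, matching wrapping semantics.
    (hi << 32).wrapping_add(a.wrapping_mul(e))
}

fn main() {
    let cases = [
        (3u64, 5u64),
        (u64::MAX, 2),
        (0x1234_5678_9abc_def0, 0xfedc_ba98_7654_3210),
    ];
    for &(x, y) in &cases {
        assert_eq!(mul64_via_32(x, y), x.wrapping_mul(y));
    }
}

The identity holds because, modulo 2^64, x*y = ae + 2^32*((af + be) mod 2^32), which is exactly what the seven-instruction sequence computes per lane.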