From 3bc2f0c7019752f9ee561df1ae3e4936778a9d6e Mon Sep 17 00:00:00 2001
From: Sam Parker
Date: Mon, 2 Aug 2021 10:03:54 +0100
Subject: [PATCH] Enable simd_X_extadd_pairwise_X for AArch64

Lower to [u|s]addlp for AArch64.

Copyright (c) 2021, Arm Limited.
---
 build.rs                                      |  11 --
 .../codegen/src/isa/aarch64/inst/emit.rs      |  21 +++
 .../src/isa/aarch64/inst/emit_tests.rs        |  40 ++++++
 cranelift/codegen/src/isa/aarch64/inst/mod.rs |  53 ++++++++
 .../codegen/src/isa/aarch64/lower_inst.rs     |  54 +++++++-
 .../isa/aarch64/simd-pairwise-add.clif        | 124 ++++++++++++++++++
 6 files changed, 291 insertions(+), 12 deletions(-)
 create mode 100644 cranelift/filetests/filetests/isa/aarch64/simd-pairwise-add.clif

diff --git a/build.rs b/build.rs
index edf1d3e290..26e01eda4e 100644
--- a/build.rs
+++ b/build.rs
@@ -202,13 +202,6 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
             ("simd", _) if cfg!(feature = "old-x86-backend") => return true,
             // No simd support yet for s390x.
             ("simd", _) if platform_is_s390x() => return true,
-            // These are new instructions that are only known to be supported for x64.
-            ("simd", "simd_i16x8_extadd_pairwise_i8x16")
-            | ("simd", "simd_i32x4_extadd_pairwise_i16x8")
-                if !platform_is_x64() =>
-            {
-                return true
-            }
             _ => {}
         },
         _ => panic!("unrecognized strategy"),
@@ -217,10 +210,6 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
     false
 }
 
-fn platform_is_x64() -> bool {
-    env::var("CARGO_CFG_TARGET_ARCH").unwrap() == "x86_64"
-}
-
 fn platform_is_s390x() -> bool {
     env::var("CARGO_CFG_TARGET_ARCH").unwrap() == "s390x"
 }
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
index 54886b010e..5374de6bf8 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -459,6 +459,17 @@ fn enc_vec_rr_pair(bits_12_16: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
         | machreg_to_vec(rd.to_reg())
 }
 
+fn enc_vec_rr_pair_long(u: u32, enc_size: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
+    debug_assert_eq!(u & 0b1, u);
+    debug_assert_eq!(enc_size & 0b1, enc_size);
+
+    0b0_1_0_01110_00_10000_00_0_10_10_00000_00000
+        | u << 29
+        | enc_size << 22
+        | machreg_to_vec(rn) << 5
+        | machreg_to_vec(rd.to_reg())
+}
+
 fn enc_vec_lanes(q: u32, u: u32, size: u32, opcode: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
     debug_assert_eq!(q & 0b1, q);
     debug_assert_eq!(u & 0b1, u);
@@ -2225,6 +2236,16 @@ impl MachInstEmit for Inst {
                     rd,
                 ));
             }
+            &Inst::VecRRPairLong { op, rd, rn } => {
+                let (u, size) = match op {
+                    VecRRPairLongOp::Saddlp8 => (0b0, 0b0),
+                    VecRRPairLongOp::Uaddlp8 => (0b1, 0b0),
+                    VecRRPairLongOp::Saddlp16 => (0b0, 0b1),
+                    VecRRPairLongOp::Uaddlp16 => (0b1, 0b1),
+                };
+
+                sink.put4(enc_vec_rr_pair_long(u, size, rd, rn));
+            }
             &Inst::VecRRR {
                 rd,
                 rn,
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
index d3afca2a77..b27d183a94 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
@@ -2643,6 +2643,46 @@ fn test_aarch64_binemit() {
         "addp d0, v30.2d",
     ));
 
+    insns.push((
+        Inst::VecRRPairLong {
+            op: VecRRPairLongOp::Uaddlp8,
+            rd: writable_vreg(0),
+            rn: vreg(1),
+        },
+        "2028206E",
+        "uaddlp v0.8h, v1.16b",
+    ));
+
+    insns.push((
+        Inst::VecRRPairLong {
+            op: VecRRPairLongOp::Saddlp8,
+            rd: writable_vreg(3),
+            rn: vreg(11),
+        },
+        "6329204E",
+        "saddlp v3.8h, v11.16b",
+    ));
+
+    insns.push((
+        Inst::VecRRPairLong {
+            op: VecRRPairLongOp::Uaddlp16,
+            rd: writable_vreg(14),
+            rn: vreg(23),
+        },
+        "EE2A606E",
+        "uaddlp v14.4s, v23.8h",
+    ));
+
+    insns.push((
+        Inst::VecRRPairLong {
+            op: VecRRPairLongOp::Saddlp16,
+            rd: writable_vreg(29),
+            rn: vreg(0),
+        },
+        "1D28604E",
+        "saddlp v29.4s, v0.8h",
+    ));
+
     insns.push((
         Inst::VecRRR {
             alu_op: VecALUOp::Sqadd,
diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
index 8c993492bd..d498bc9b85 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -419,6 +419,18 @@ pub enum VecPairOp {
     Addp,
 }
 
+/// 1-operand vector instruction that extends elements of the input register
+/// and operates on a pair of elements.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
+pub enum VecRRPairLongOp {
+    /// Sign extend and add pair of elements
+    Saddlp8,
+    Saddlp16,
+    /// Unsigned extend and add pair of elements
+    Uaddlp8,
+    Uaddlp16,
+}
+
 /// An operation across the lanes of vectors.
 #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
 pub enum VecLanesOp {
@@ -1107,6 +1119,15 @@ pub enum Inst {
         high_half: bool,
     },
 
+    /// 1-operand vector instruction that extends elements of the input
+    /// register and operates on a pair of elements. The output lane width
+    /// is double that of the input.
+    VecRRPairLong {
+        op: VecRRPairLongOp,
+        rd: Writable<Reg>,
+        rn: Reg,
+    },
+
     /// A vector ALU op.
     VecRRR {
         alu_op: VecALUOp,
@@ -2166,6 +2187,10 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
             collector.add_use(rn);
             collector.add_use(rm);
         }
+        &Inst::VecRRPairLong { rd, rn, .. } => {
+            collector.add_def(rd);
+            collector.add_use(rn);
+        }
         &Inst::VecRRR {
             alu_op, rd, rn, rm, ..
         } => {
@@ -2992,6 +3017,14 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
             map_use(mapper, rn);
             map_use(mapper, rm);
         }
+        &mut Inst::VecRRPairLong {
+            ref mut rd,
+            ref mut rn,
+            ..
+        } => {
+            map_def(mapper, rd);
+            map_use(mapper, rn);
+        }
         &mut Inst::VecRRR {
             alu_op,
             ref mut rd,
@@ -4152,6 +4185,26 @@ impl Inst {
 
                 format!("{} {}, {}", op, rd, rn)
             }
+            &Inst::VecRRPairLong { op, rd, rn } => {
+                let (op, dest, src) = match op {
+                    VecRRPairLongOp::Saddlp8 => {
+                        ("saddlp", VectorSize::Size16x8, VectorSize::Size8x16)
+                    }
+                    VecRRPairLongOp::Saddlp16 => {
+                        ("saddlp", VectorSize::Size32x4, VectorSize::Size16x8)
+                    }
+                    VecRRPairLongOp::Uaddlp8 => {
+                        ("uaddlp", VectorSize::Size16x8, VectorSize::Size8x16)
+                    }
+                    VecRRPairLongOp::Uaddlp16 => {
+                        ("uaddlp", VectorSize::Size32x4, VectorSize::Size16x8)
+                    }
+                };
+                let rd = show_vreg_vector(rd.to_reg(), mb_rru, dest);
+                let rn = show_vreg_vector(rn, mb_rru, src);
+
+                format!("{} {}, {}", op, rd, rn)
+            }
             &Inst::VecRRR {
                 rd,
                 rn,
diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
index c07fb92596..f9440dbbb1 100644
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -2644,6 +2644,58 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             });
         }
 
+        Opcode::IaddPairwise => {
+            let ty = ty.unwrap();
+            let lane_type = ty.lane_type();
+            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
+
+            let mut match_long_pair =
+                |ext_low_op, ext_high_op| -> Option<(VecRRPairLongOp, regalloc::Reg)> {
+                    if let Some(lhs) = maybe_input_insn(ctx, inputs[0], ext_low_op) {
+                        if let Some(rhs) = maybe_input_insn(ctx, inputs[1], ext_high_op) {
+                            let lhs_inputs = insn_inputs(ctx, lhs);
+                            let rhs_inputs = insn_inputs(ctx, rhs);
+                            let low = put_input_in_reg(ctx, lhs_inputs[0], NarrowValueMode::None);
+                            let high = put_input_in_reg(ctx, rhs_inputs[0], NarrowValueMode::None);
+                            if low == high {
+                                match (lane_type, ext_low_op) {
+                                    (I16, Opcode::SwidenLow) => {
+                                        return Some((VecRRPairLongOp::Saddlp8, low))
+                                    }
+                                    (I32, Opcode::SwidenLow) => {
+                                        return Some((VecRRPairLongOp::Saddlp16, low))
+                                    }
+                                    (I16, Opcode::UwidenLow) => {
+                                        return Some((VecRRPairLongOp::Uaddlp8, low))
+                                    }
+                                    (I32, Opcode::UwidenLow) => {
+                                        return Some((VecRRPairLongOp::Uaddlp16, low))
+                                    }
+                                    _ => (),
+                                };
+                            }
+                        }
+                    }
+                    None
+                };
+
+            if let Some((op, rn)) = match_long_pair(Opcode::SwidenLow, Opcode::SwidenHigh) {
+                ctx.emit(Inst::VecRRPairLong { op, rd, rn });
+            } else if let Some((op, rn)) = match_long_pair(Opcode::UwidenLow, Opcode::UwidenHigh) {
+                ctx.emit(Inst::VecRRPairLong { op, rd, rn });
+            } else {
+                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+                let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
+                ctx.emit(Inst::VecRRR {
+                    alu_op: VecALUOp::Addp,
+                    rd,
+                    rn,
+                    rm,
+                    size: VectorSize::from_ty(ty),
+                });
+            }
+        }
+
         Opcode::WideningPairwiseDotProductS => {
             let r_y = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let r_a = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
@@ -3519,7 +3571,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             });
         }
 
-        Opcode::IaddPairwise | Opcode::ConstAddr | Opcode::Vconcat | Opcode::Vsplit => {
+        Opcode::ConstAddr | Opcode::Vconcat | Opcode::Vsplit => {
             unimplemented!("lowering {}", op)
         }
     }
diff --git a/cranelift/filetests/filetests/isa/aarch64/simd-pairwise-add.clif b/cranelift/filetests/filetests/isa/aarch64/simd-pairwise-add.clif
new file mode 100644
index 0000000000..42190619c6
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/aarch64/simd-pairwise-add.clif
@@ -0,0 +1,124 @@
+test compile
+set unwind_info=false
+target aarch64
+
+
+function %fn1(i8x16) -> i16x8 {
+block0(v0: i8x16):
+    v1 = swiden_low v0
+ v2 = swiden_high v0 + v3 = iadd_pairwise v1, v2 + return v3 +} + +; check: stp fp +; nextln: mov fp, sp +; nextln: saddlp v0.8h, v0.16b +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %fn2(i8x16) -> i16x8 { +block0(v0: i8x16): + v1 = uwiden_low v0 + v2 = uwiden_high v0 + v3 = iadd_pairwise v1, v2 + return v3 +} + +; check: stp fp +; nextln: mov fp, sp +; nextln: uaddlp v0.8h, v0.16b +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %fn3(i16x8) -> i32x4 { +block0(v0: i16x8): + v1 = swiden_low v0 + v2 = swiden_high v0 + v3 = iadd_pairwise v1, v2 + return v3 +} + +; check: stp fp +; nextln: mov fp, sp +; nextln: saddlp v0.4s, v0.8h +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %fn4(i16x8) -> i32x4 { +block0(v0: i16x8): + v1 = uwiden_low v0 + v2 = uwiden_high v0 + v3 = iadd_pairwise v1, v2 + return v3 +} + +; check: stp fp +; nextln: mov fp, sp +; nextln: uaddlp v0.4s, v0.8h +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %fn5(i8x16, i8x16) -> i16x8 { +block0(v0: i8x16, v1: i8x16): + v2 = swiden_low v0 + v3 = swiden_high v1 + v4 = iadd_pairwise v2, v3 + return v4 +} + +; check: stp fp +; nextln: mov fp, sp +; nextln: sxtl v0.8h, v0.8b +; nextln: sxtl2 v1.8h, v1.16b +; nextln: addp v0.8h, v0.8h, v1.8h +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %fn6(i8x16, i8x16) -> i16x8 { +block0(v0: i8x16, v1: i8x16): + v2 = uwiden_low v0 + v3 = uwiden_high v1 + v4 = iadd_pairwise v2, v3 + return v4 +} + +; check: stp fp +; nextln: mov fp, sp +; nextln: uxtl v0.8h, v0.8b +; nextln: uxtl2 v1.8h, v1.16b +; nextln: addp v0.8h, v0.8h, v1.8h +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %fn7(i8x16) -> i16x8 { +block0(v0: i8x16): + v1 = uwiden_low v0 + v2 = swiden_high v0 + v3 = iadd_pairwise v1, v2 + return v3 +} + +; check: stp fp +; nextln: mov fp, sp +; nextln: uxtl v1.8h, v0.8b +; nextln: sxtl2 v0.8h, v0.16b +; nextln: addp v0.8h, v1.8h, v0.8h +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %fn8(i8x16) -> i16x8 { +block0(v0: i8x16): + v1 = swiden_low v0 + v2 = uwiden_high v0 + v3 = iadd_pairwise v1, v2 + return v3 +} + +; check: stp fp +; nextln: mov fp, sp +; nextln: sxtl v1.8h, v0.8b +; nextln: uxtl2 v0.8h, v0.16b +; nextln: addp v0.8h, v1.8h, v0.8h +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret
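
As a standalone sanity check on the encoding above, here is a minimal sketch.
It is not part of the patch: the function name encode_addlp and the use of raw
u32 register numbers in place of machreg_to_vec()/Writable<Reg> are
illustrative assumptions. It reuses the base constant from enc_vec_rr_pair_long
and asserts the little-endian byte sequences from the emit tests:

// Illustrative sketch, not patch code: raw u32 register numbers stand in
// for machreg_to_vec(rn) and machreg_to_vec(rd.to_reg()).
fn encode_addlp(u: u32, enc_size: u32, rd: u32, rn: u32) -> u32 {
    debug_assert_eq!(u & 0b1, u);
    debug_assert_eq!(enc_size & 0b1, enc_size);

    // Base word: Q=1 (128-bit operation), opcode 00010 selects xADDLP.
    0b0_1_0_01110_00_10000_00_0_10_10_00000_00000
        | u << 29        // U bit: 0 = saddlp (signed), 1 = uaddlp (unsigned)
        | enc_size << 22 // size: 0 = .16b -> .8h, 1 = .8h -> .4s
        | rn << 5
        | rd
}

fn main() {
    // Expected bytes are the hex strings from the emit_tests.rs hunk above.
    // uaddlp v0.8h, v1.16b
    assert_eq!(encode_addlp(1, 0, 0, 1).to_le_bytes(), [0x20, 0x28, 0x20, 0x6E]);
    // saddlp v3.8h, v11.16b
    assert_eq!(encode_addlp(0, 0, 3, 11).to_le_bytes(), [0x63, 0x29, 0x20, 0x4E]);
    // uaddlp v14.4s, v23.8h
    assert_eq!(encode_addlp(1, 1, 14, 23).to_le_bytes(), [0xEE, 0x2A, 0x60, 0x6E]);
    // saddlp v29.4s, v0.8h
    assert_eq!(encode_addlp(0, 1, 29, 0).to_le_bytes(), [0x1D, 0x28, 0x60, 0x4E]);
}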