From 043a8434d2e79e4b0ba2a179bf30e4965d26186e Mon Sep 17 00:00:00 2001
From: Anton Kirilov
Date: Mon, 11 Jan 2021 18:23:03 +0000
Subject: [PATCH] Cranelift AArch64: Improve the Popcnt implementation

Now the backend uses the CNT instruction, which results in a major
simplification: the lowering becomes a short vector sequence (FMOV,
CNT, then ADDV or ADDP for the horizontal add, and UMOV) instead of
the previous long chain of shifts, masks, and adds.

Copyright (c) 2021, Arm Limited.
---
 .../codegen/src/isa/aarch64/inst/args.rs      |   8 +
 .../codegen/src/isa/aarch64/inst/emit.rs      |   6 +
 .../src/isa/aarch64/inst/emit_tests.rs        |  33 ++++
 cranelift/codegen/src/isa/aarch64/inst/mod.rs |   3 +
 .../codegen/src/isa/aarch64/lower_inst.rs     | 168 +++++-------------
 .../filetests/isa/aarch64/bitops.clif         |  70 ++------
 6 files changed, 106 insertions(+), 182 deletions(-)

diff --git a/cranelift/codegen/src/isa/aarch64/inst/args.rs b/cranelift/codegen/src/isa/aarch64/inst/args.rs
index 738495714a..1a55029b32 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/args.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/args.rs
@@ -601,6 +601,14 @@ impl ScalarSize {
         }
     }
 
+    /// Convert from an integer operand size.
+    pub fn from_operand_size(size: OperandSize) -> ScalarSize {
+        match size {
+            OperandSize::Size32 => ScalarSize::Size32,
+            OperandSize::Size64 => ScalarSize::Size64,
+        }
+    }
+
     /// Convert from a type into the smallest size that fits.
     pub fn from_ty(ty: Type) -> ScalarSize {
         Self::from_bits(ty_bits(ty))
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
index 432bbc19dd..1195e879bb 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -1463,12 +1463,18 @@ impl MachInstEmit for Inst {
                         debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2);
                         (0b0, 0b11000, enc_size | 0b10)
                     }
+                    VecMisc2::Cnt => {
+                        debug_assert!(size == VectorSize::Size8x8 || size == VectorSize::Size8x16);
+                        (0b0, 0b00101, enc_size)
+                    }
                 };
                 sink.put4(enc_vec_rr_misc((q << 1) | u, size, bits_12_16, rd, rn));
             }
             &Inst::VecLanes { op, rd, rn, size } => {
                 let (q, size) = match size {
+                    VectorSize::Size8x8 => (0b0, 0b00),
                     VectorSize::Size8x16 => (0b1, 0b00),
+                    VectorSize::Size16x4 => (0b0, 0b01),
                     VectorSize::Size16x8 => (0b1, 0b01),
                     VectorSize::Size32x4 => (0b1, 0b10),
                     _ => unreachable!(),
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
index f01fbf43f0..63232d58a4 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
@@ -3792,6 +3792,28 @@ fn test_aarch64_binemit() {
         "frintp v12.2d, v17.2d",
     ));
 
+    insns.push((
+        Inst::VecMisc {
+            op: VecMisc2::Cnt,
+            rd: writable_vreg(23),
+            rn: vreg(5),
+            size: VectorSize::Size8x8,
+        },
+        "B758200E",
+        "cnt v23.8b, v5.8b",
+    ));
+
+    insns.push((
+        Inst::VecLanes {
+            op: VecLanesOp::Uminv,
+            rd: writable_vreg(0),
+            rn: vreg(31),
+            size: VectorSize::Size8x8,
+        },
+        "E0AB312E",
+        "uminv b0, v31.8b",
+    ));
+
     insns.push((
         Inst::VecLanes {
             op: VecLanesOp::Uminv,
@@ -3836,6 +3858,17 @@ fn test_aarch64_binemit() {
         "addv b2, v29.16b",
     ));
 
+    insns.push((
+        Inst::VecLanes {
+            op: VecLanesOp::Addv,
+            rd: writable_vreg(15),
+            rn: vreg(7),
+            size: VectorSize::Size16x4,
+        },
+        "EFB8710E",
+        "addv h15, v7.4h",
+    ));
+
     insns.push((
         Inst::VecLanes {
             op: VecLanesOp::Addv,
diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
index 38b6e29ce4..5e88a783f5 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -331,6 +331,8 @@ pub enum VecMisc2 {
     Frintm,
     /// Floating point round to integral, rounding towards plus infinity
     Frintp,
+    /// Population count per byte
+    Cnt,
 }
 
 /// A Vector narrowing operation with two registers.
@@ -3752,6 +3754,7 @@ impl Inst {
                     VecMisc2::Frintz => ("frintz", size),
                     VecMisc2::Frintm => ("frintm", size),
                     VecMisc2::Frintp => ("frintp", size),
+                    VecMisc2::Cnt => ("cnt", size),
                 };
                 let rd_size = if is_shll { size.widen() } else { size };
 
diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
index 73c11008c4..93c2385098 100644
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -962,143 +962,57 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         }
 
         Opcode::Popcnt => {
-            // Lower popcount using the following algorithm:
-            //
-            // x -= (x >> 1) & 0x5555555555555555
-            // x = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333)
-            // x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0f
-            // x += x << 8
-            // x += x << 16
-            // x += x << 32
-            // x >> 56
-            let ty = ty.unwrap();
             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            // FIXME(#1537): zero-extend 8/16/32-bit operands only to 32 bits,
-            // and fix the sequence below to work properly for this.
-            let narrow_mode = NarrowValueMode::ZeroExtend64;
-            let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
-            let tmp = ctx.alloc_tmp(I64).only_reg().unwrap();
+            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+            let ty = ty.unwrap();
+            let size = ScalarSize::from_operand_size(OperandSize::from_ty(ty));
+            let tmp = ctx.alloc_tmp(I8X16).only_reg().unwrap();
 
-            // If this is a 32-bit Popcnt, use Lsr32 to clear the top 32 bits of the register, then
-            // the rest of the code is identical to the 64-bit version.
-            // lsr [wx]d, [wx]n, #1
-            ctx.emit(Inst::AluRRImmShift {
-                alu_op: choose_32_64(ty, ALUOp::Lsr32, ALUOp::Lsr64),
-                rd: rd,
+            // fmov tmp, rn
+            // cnt tmp.8b, tmp.8b
+            // addp tmp.8b, tmp.8b, tmp.8b / addv tmp, tmp.8b / (no instruction for 8-bit inputs)
+            // umov rd, tmp.b[0]
+
+            ctx.emit(Inst::MovToFpu {
+                rd: tmp,
                 rn: rn,
-                immshift: ImmShift::maybe_from_u64(1).unwrap(),
+                size,
             });
-
-            // and xd, xd, #0x5555555555555555
-            ctx.emit(Inst::AluRRImmLogic {
-                alu_op: ALUOp::And64,
-                rd: rd,
-                rn: rd.to_reg(),
-                imml: ImmLogic::maybe_from_u64(0x5555555555555555, I64).unwrap(),
-            });
-
-            // sub xd, xn, xd
-            ctx.emit(Inst::AluRRR {
-                alu_op: ALUOp::Sub64,
-                rd: rd,
-                rn: rn,
-                rm: rd.to_reg(),
-            });
-
-            // and xt, xd, #0x3333333333333333
-            ctx.emit(Inst::AluRRImmLogic {
-                alu_op: ALUOp::And64,
-                rd: tmp,
-                rn: rd.to_reg(),
-                imml: ImmLogic::maybe_from_u64(0x3333333333333333, I64).unwrap(),
-            });
-
-            // lsr xd, xd, #2
-            ctx.emit(Inst::AluRRImmShift {
-                alu_op: ALUOp::Lsr64,
-                rd: rd,
-                rn: rd.to_reg(),
-                immshift: ImmShift::maybe_from_u64(2).unwrap(),
-            });
-
-            // and xd, xd, #0x3333333333333333
-            ctx.emit(Inst::AluRRImmLogic {
-                alu_op: ALUOp::And64,
-                rd: rd,
-                rn: rd.to_reg(),
-                imml: ImmLogic::maybe_from_u64(0x3333333333333333, I64).unwrap(),
-            });
-
-            // add xt, xd, xt
-            ctx.emit(Inst::AluRRR {
-                alu_op: ALUOp::Add64,
-                rd: tmp,
-                rn: rd.to_reg(),
-                rm: tmp.to_reg(),
-            });
-
-            // add xt, xt, xt LSR #4
-            ctx.emit(Inst::AluRRRShift {
-                alu_op: ALUOp::Add64,
+            ctx.emit(Inst::VecMisc {
+                op: VecMisc2::Cnt,
                 rd: tmp,
                 rn: tmp.to_reg(),
-                rm: tmp.to_reg(),
-                shiftop: ShiftOpAndAmt::new(
-                    ShiftOp::LSR,
-                    ShiftOpShiftImm::maybe_from_shift(4).unwrap(),
-                ),
+                size: VectorSize::Size8x8,
             });
 
-            // and xt, xt, #0x0f0f0f0f0f0f0f0f
-            ctx.emit(Inst::AluRRImmLogic {
-                alu_op: ALUOp::And64,
-                rd: tmp,
-                rn: tmp.to_reg(),
-                imml: ImmLogic::maybe_from_u64(0x0f0f0f0f0f0f0f0f, I64).unwrap(),
-            });
+            match ScalarSize::from_ty(ty) {
+                ScalarSize::Size8 => {}
+                ScalarSize::Size16 => {
+                    // ADDP is usually cheaper than ADDV.
+                    ctx.emit(Inst::VecRRR {
+                        alu_op: VecALUOp::Addp,
+                        rd: tmp,
+                        rn: tmp.to_reg(),
+                        rm: tmp.to_reg(),
+                        size: VectorSize::Size8x8,
+                    });
+                }
+                ScalarSize::Size32 | ScalarSize::Size64 => {
+                    ctx.emit(Inst::VecLanes {
+                        op: VecLanesOp::Addv,
+                        rd: tmp,
+                        rn: tmp.to_reg(),
+                        size: VectorSize::Size8x8,
+                    });
+                }
+                sz => panic!("Unexpected scalar FP operand size: {:?}", sz),
+            }
 
-            // add xt, xt, xt, LSL #8
-            ctx.emit(Inst::AluRRRShift {
-                alu_op: ALUOp::Add64,
-                rd: tmp,
+            ctx.emit(Inst::MovFromVec {
+                rd,
                 rn: tmp.to_reg(),
-                rm: tmp.to_reg(),
-                shiftop: ShiftOpAndAmt::new(
-                    ShiftOp::LSL,
-                    ShiftOpShiftImm::maybe_from_shift(8).unwrap(),
-                ),
-            });
-
-            // add xt, xt, xt, LSL #16
-            ctx.emit(Inst::AluRRRShift {
-                alu_op: ALUOp::Add64,
-                rd: tmp,
-                rn: tmp.to_reg(),
-                rm: tmp.to_reg(),
-                shiftop: ShiftOpAndAmt::new(
-                    ShiftOp::LSL,
-                    ShiftOpShiftImm::maybe_from_shift(16).unwrap(),
-                ),
-            });
-
-            // add xt, xt, xt, LSL #32
-            ctx.emit(Inst::AluRRRShift {
-                alu_op: ALUOp::Add64,
-                rd: tmp,
-                rn: tmp.to_reg(),
-                rm: tmp.to_reg(),
-                shiftop: ShiftOpAndAmt::new(
-                    ShiftOp::LSL,
-                    ShiftOpShiftImm::maybe_from_shift(32).unwrap(),
-                ),
-            });
-
-            // lsr xd, xt, #56
-            ctx.emit(Inst::AluRRImmShift {
-                alu_op: ALUOp::Lsr64,
-                rd: rd,
-                rn: tmp.to_reg(),
-                immshift: ImmShift::maybe_from_u64(56).unwrap(),
+                idx: 0,
+                size: VectorSize::Size8x16,
             });
         }
 
diff --git a/cranelift/filetests/filetests/isa/aarch64/bitops.clif b/cranelift/filetests/filetests/isa/aarch64/bitops.clif
index ab1c113104..8730128cc5 100644
--- a/cranelift/filetests/filetests/isa/aarch64/bitops.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/bitops.clif
@@ -230,19 +230,10 @@ block0(v0: i64):
 
 ; check: stp fp, lr, [sp, #-16]!
 ; nextln: mov fp, sp
-; nextln: lsr x1, x0, #1
-; nextln: and x1, x1, #6148914691236517205
-; nextln: sub x1, x0, x1
-; nextln: and x0, x1, #3689348814741910323
-; nextln: lsr x1, x1, #2
-; nextln: and x1, x1, #3689348814741910323
-; nextln: add x0, x1, x0
-; nextln: add x0, x0, x0, LSR 4
-; nextln: and x0, x0, #1085102592571150095
-; nextln: add x0, x0, x0, LSL 8
-; nextln: add x0, x0, x0, LSL 16
-; nextln: add x0, x0, x0, LSL 32
-; nextln: lsr x0, x0, #56
+; nextln: fmov d0, x0
+; nextln: cnt v0.8b, v0.8b
+; nextln: addv b0, v0.8b
+; nextln: umov w0, v0.b[0]
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
@@ -255,20 +246,10 @@ block0(v0: i32):
 
 ; check: stp fp, lr, [sp, #-16]!
 ; nextln: mov fp, sp
-; nextln: mov w0, w0
-; nextln: lsr w1, w0, #1
-; nextln: and x1, x1, #6148914691236517205
-; nextln: sub x1, x0, x1
-; nextln: and x0, x1, #3689348814741910323
-; nextln: lsr x1, x1, #2
-; nextln: and x1, x1, #3689348814741910323
-; nextln: add x0, x1, x0
-; nextln: add x0, x0, x0, LSR 4
-; nextln: and x0, x0, #1085102592571150095
-; nextln: add x0, x0, x0, LSL 8
-; nextln: add x0, x0, x0, LSL 16
-; nextln: add x0, x0, x0, LSL 32
-; nextln: lsr x0, x0, #56
+; nextln: fmov s0, w0
+; nextln: cnt v0.8b, v0.8b
+; nextln: addv b0, v0.8b
+; nextln: umov w0, v0.b[0]
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
@@ -281,20 +262,10 @@ block0(v0: i16):
 
 ; check: stp fp, lr, [sp, #-16]!
 ; nextln: mov fp, sp
-; nextln: uxth w0, w0
-; nextln: lsr w1, w0, #1
-; nextln: and x1, x1, #6148914691236517205
-; nextln: sub x1, x0, x1
-; nextln: and x0, x1, #3689348814741910323
-; nextln: lsr x1, x1, #2
-; nextln: and x1, x1, #3689348814741910323
-; nextln: add x0, x1, x0
-; nextln: add x0, x0, x0, LSR 4
-; nextln: and x0, x0, #1085102592571150095
-; nextln: add x0, x0, x0, LSL 8
-; nextln: add x0, x0, x0, LSL 16
-; nextln: add x0, x0, x0, LSL 32
-; nextln: lsr x0, x0, #56
+; nextln: fmov s0, w0
+; nextln: cnt v0.8b, v0.8b
+; nextln: addp v0.8b, v0.8b, v0.8b
+; nextln: umov w0, v0.b[0]
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
@@ -307,20 +278,9 @@ block0(v0: i8):
 
 ; check: stp fp, lr, [sp, #-16]!
 ; nextln: mov fp, sp
-; nextln: uxtb w0, w0
-; nextln: lsr w1, w0, #1
-; nextln: and x1, x1, #6148914691236517205
-; nextln: sub x1, x0, x1
-; nextln: and x0, x1, #3689348814741910323
-; nextln: lsr x1, x1, #2
-; nextln: and x1, x1, #3689348814741910323
-; nextln: add x0, x1, x0
-; nextln: add x0, x0, x0, LSR 4
-; nextln: and x0, x0, #1085102592571150095
-; nextln: add x0, x0, x0, LSL 8
-; nextln: add x0, x0, x0, LSL 16
-; nextln: add x0, x0, x0, LSL 32
-; nextln: lsr x0, x0, #56
+; nextln: fmov s0, w0
+; nextln: cnt v0.8b, v0.8b
+; nextln: umov w0, v0.b[0]
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
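
For reference, a minimal Rust sketch of what the FMOV/CNT/ADDV/UMOV sequence above
computes for a 64-bit input, intended only as a mental model for the new lowering.
The function name popcnt_via_bytes and the standalone-program framing are
illustrative assumptions, not code from this patch or the repository.

    // Model of the CNT-based lowering: CNT counts the set bits in each byte of
    // the vector register, and ADDV then sums those per-byte counts into a
    // single lane (ADDP, or no reduction at all, covers the narrower types).
    fn popcnt_via_bytes(x: u64) -> u32 {
        x.to_le_bytes().iter().map(|b| b.count_ones()).sum()
    }

    fn main() {
        for &x in &[0u64, 0x8000_0000_0000_0001, u64::MAX, 0x1234_5678_9ABC_DEF0] {
            // The byte-wise model agrees with the scalar popcount.
            assert_eq!(popcnt_via_bytes(x), x.count_ones());
        }
        println!("byte-wise model matches u64::count_ones");
    }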