From 09fec151eba4cde38967eb6044dd668d31c6e661 Mon Sep 17 00:00:00 2001 From: Afonso Bordado Date: Wed, 2 Jun 2021 17:35:02 +0100 Subject: [PATCH] aarch64: Add popcnt for i128 values --- .../codegen/src/isa/aarch64/lower_inst.rs | 55 ++++++++++++++----- .../filetests/isa/aarch64/bitops.clif | 18 ++++++ ...itops-misc.clif => i128-bitops-count.clif} | 17 ++++++ 3 files changed, 77 insertions(+), 13 deletions(-) rename cranelift/filetests/filetests/runtests/{i128-bitops-misc.clif => i128-bitops-count.clif} (65%) diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs index 690eacc298..ae9fd9b4d7 100644 --- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs +++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs @@ -1087,27 +1087,54 @@ pub(crate) fn lower_insn_to_regs>( } Opcode::Popcnt => { - let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let out_regs = get_output_reg(ctx, outputs[0]); + let in_regs = put_input_in_regs(ctx, inputs[0]); let ty = ty.unwrap(); - let size = ScalarSize::from_operand_size(OperandSize::from_ty(ty)); + let size = if ty == I128 { + ScalarSize::Size64 + } else { + ScalarSize::from_operand_size(OperandSize::from_ty(ty)) + }; + + let vec_size = if ty == I128 { + VectorSize::Size8x16 + } else { + VectorSize::Size8x8 + }; + let tmp = ctx.alloc_tmp(I8X16).only_reg().unwrap(); - // fmov tmp, rn - // cnt tmp.8b, tmp.8b - // addp tmp.8b, tmp.8b, tmp.8b / addv tmp, tmp.8b / (no instruction for 8-bit inputs) - // umov rd, tmp.b[0] + // fmov tmp, in_lo + // if ty == i128: + // mov tmp.d[1], in_hi + // + // cnt tmp.16b, tmp.16b / cnt tmp.8b, tmp.8b + // addv tmp, tmp.16b / addv tmp, tmp.8b / addp tmp.8b, tmp.8b, tmp.8b / (no instruction for 8-bit inputs) + // + // umov out_lo, tmp.b[0] + // if ty == i128: + // mov out_hi, 0 ctx.emit(Inst::MovToFpu { rd: tmp, - rn: rn, + rn: in_regs.regs()[0], size, }); + + if ty == I128 { + ctx.emit(Inst::MovToVec { + rd: tmp, + rn: in_regs.regs()[1], + idx: 1, + size: VectorSize::Size64x2, + }); + } + ctx.emit(Inst::VecMisc { op: VecMisc2::Cnt, rd: tmp, rn: tmp.to_reg(), - size: VectorSize::Size8x8, + size: vec_size, }); match ScalarSize::from_ty(ty) { @@ -1122,23 +1149,25 @@ pub(crate) fn lower_insn_to_regs>( size: VectorSize::Size8x8, }); } - ScalarSize::Size32 | ScalarSize::Size64 => { + ScalarSize::Size32 | ScalarSize::Size64 | ScalarSize::Size128 => { ctx.emit(Inst::VecLanes { op: VecLanesOp::Addv, rd: tmp, rn: tmp.to_reg(), - size: VectorSize::Size8x8, + size: vec_size, }); } - sz => panic!("Unexpected scalar FP operand size: {:?}", sz), } ctx.emit(Inst::MovFromVec { - rd, + rd: out_regs.regs()[0], rn: tmp.to_reg(), idx: 0, size: VectorSize::Size8x16, }); + if ty == I128 { + lower_constant_u64(ctx, out_regs.regs()[1], 0); + } } Opcode::Load diff --git a/cranelift/filetests/filetests/isa/aarch64/bitops.clif b/cranelift/filetests/filetests/isa/aarch64/bitops.clif index 215a207dda..eaddadb37c 100644 --- a/cranelift/filetests/filetests/isa/aarch64/bitops.clif +++ b/cranelift/filetests/filetests/isa/aarch64/bitops.clif @@ -207,6 +207,24 @@ block0(v0: i64): ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret +function %d(i128) -> i128 { +block0(v0: i128): + v1 = popcnt v0 + return v1 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: fmov d0, x0 +; nextln: mov v0.d[1], x1 +; nextln: cnt v0.16b, v0.16b +; nextln: addv b0, v0.16b +; nextln: umov w0, v0.b[0] +; nextln: movz x1, #0 +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + + function %d(i64) -> i64 { block0(v0: i64): v1 = popcnt v0 diff --git a/cranelift/filetests/filetests/runtests/i128-bitops-misc.clif b/cranelift/filetests/filetests/runtests/i128-bitops-count.clif similarity index 65% rename from cranelift/filetests/filetests/runtests/i128-bitops-misc.clif rename to cranelift/filetests/filetests/runtests/i128-bitops-count.clif index ec55510e5d..c701b0911f 100644 --- a/cranelift/filetests/filetests/runtests/i128-bitops-misc.clif +++ b/cranelift/filetests/filetests/runtests/i128-bitops-count.clif @@ -25,3 +25,20 @@ block0(v0: i64, v1: i64): ; run: %clz(0x00000000_00010000, 0x00000001_00000000) == 31 ; run: %clz(0x00000000_00010000, 0x00000000_00000000) == 111 ; run: %clz(0x00000000_00000000, 0x00000000_00000000) == 128 + +function %popcnt_i128(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = iconcat v0, v1 + + v3 = popcnt v2 + + v4, v5 = isplit v3 + v6 = iadd v4, v5 + return v6 +} +; run: %popcnt_i128(0, 0) == 0 +; run: %popcnt_i128(-1, 0) == 64 +; run: %popcnt_i128(0, -1) == 64 +; run: %popcnt_i128(-1, -1) == 128 +; run: %popcnt_i128(0x55555555_55555555, 0x55555555_55555555) == 64 +; run: %popcnt_i128(0xC0FFEEEE_DECAFFFF, 0xDECAFFFF_C0FFEEEE) == 96