From 043a8434d2e79e4b0ba2a179bf30e4965d26186e Mon Sep 17 00:00:00 2001
From: Anton Kirilov
Date: Mon, 11 Jan 2021 18:23:03 +0000
Subject: [PATCH] Cranelift AArch64: Improve the Popcnt implementation

Now the backend uses the CNT instruction, which results in a major
simplification: the lowering becomes a short vector sequence (FMOV,
CNT, then ADDV or ADDP for the horizontal add, and UMOV) instead of
the previous long chain of shifts, masks, and adds.

Copyright (c) 2021, Arm Limited.
---
 .../codegen/src/isa/aarch64/inst/args.rs      |   8 +
 .../codegen/src/isa/aarch64/inst/emit.rs      |   6 +
 .../src/isa/aarch64/inst/emit_tests.rs        |  33 ++++
 cranelift/codegen/src/isa/aarch64/inst/mod.rs |   3 +
 .../codegen/src/isa/aarch64/lower_inst.rs     | 168 +++++-------------
 .../filetests/isa/aarch64/bitops.clif         |  70 ++------
 6 files changed, 106 insertions(+), 182 deletions(-)

diff --git a/cranelift/codegen/src/isa/aarch64/inst/args.rs b/cranelift/codegen/src/isa/aarch64/inst/args.rs
index 738495714a..1a55029b32 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/args.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/args.rs
@@ -601,6 +601,14 @@ impl ScalarSize {
         }
     }
 
+    /// Convert from an integer operand size.
+    pub fn from_operand_size(size: OperandSize) -> ScalarSize {
+        match size {
+            OperandSize::Size32 => ScalarSize::Size32,
+            OperandSize::Size64 => ScalarSize::Size64,
+        }
+    }
+
     /// Convert from a type into the smallest size that fits.
     pub fn from_ty(ty: Type) -> ScalarSize {
         Self::from_bits(ty_bits(ty))
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
index 432bbc19dd..1195e879bb 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -1463,12 +1463,18 @@ impl MachInstEmit for Inst {
                         debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2);
                         (0b0, 0b11000, enc_size | 0b10)
                     }
+                    VecMisc2::Cnt => {
+                        debug_assert!(size == VectorSize::Size8x8 || size == VectorSize::Size8x16);
+                        (0b0, 0b00101, enc_size)
+                    }
                 };
                 sink.put4(enc_vec_rr_misc((q << 1) | u, size, bits_12_16, rd, rn));
             }
             &Inst::VecLanes { op, rd, rn, size } => {
                 let (q, size) = match size {
+                    VectorSize::Size8x8 => (0b0, 0b00),
                     VectorSize::Size8x16 => (0b1, 0b00),
+                    VectorSize::Size16x4 => (0b0, 0b01),
                     VectorSize::Size16x8 => (0b1, 0b01),
                     VectorSize::Size32x4 => (0b1, 0b10),
                     _ => unreachable!(),
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
index f01fbf43f0..63232d58a4 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
@@ -3792,6 +3792,28 @@ fn test_aarch64_binemit() {
         "frintp v12.2d, v17.2d",
     ));
 
+    insns.push((
+        Inst::VecMisc {
+            op: VecMisc2::Cnt,
+            rd: writable_vreg(23),
+            rn: vreg(5),
+            size: VectorSize::Size8x8,
+        },
+        "B758200E",
+        "cnt v23.8b, v5.8b",
+    ));
+
+    insns.push((
+        Inst::VecLanes {
+            op: VecLanesOp::Uminv,
+            rd: writable_vreg(0),
+            rn: vreg(31),
+            size: VectorSize::Size8x8,
+        },
+        "E0AB312E",
+        "uminv b0, v31.8b",
+    ));
+
     insns.push((
         Inst::VecLanes {
             op: VecLanesOp::Uminv,
@@ -3836,6 +3858,17 @@ fn test_aarch64_binemit() {
         "addv b2, v29.16b",
     ));
 
+    insns.push((
+        Inst::VecLanes {
+            op: VecLanesOp::Addv,
+            rd: writable_vreg(15),
+            rn: vreg(7),
+            size: VectorSize::Size16x4,
+        },
+        "EFB8710E",
+        "addv h15, v7.4h",
+    ));
+
     insns.push((
         Inst::VecLanes {
             op: VecLanesOp::Addv,
diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
index 38b6e29ce4..5e88a783f5 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -331,6 +331,8 @@ pub enum VecMisc2 {
     Frintm,
     /// Floating point round to integral, rounding towards plus infinity
     Frintp,
+    /// Population count per byte
+    Cnt,
 }
 
 /// A Vector narrowing operation with two registers.
@@ -3752,6 +3754,7 @@ impl Inst {
                     VecMisc2::Frintz => ("frintz", size),
                     VecMisc2::Frintm => ("frintm", size),
                     VecMisc2::Frintp => ("frintp", size),
+                    VecMisc2::Cnt => ("cnt", size),
                 };
                 let rd_size = if is_shll { size.widen() } else { size };
 
diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
index 73c11008c4..93c2385098 100644
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -962,143 +962,57 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         }
 
         Opcode::Popcnt => {
-            // Lower popcount using the following algorithm:
-            //
-            // x -= (x >> 1) & 0x5555555555555555
-            // x = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333)
-            // x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0f
-            // x += x << 8
-            // x += x << 16
-            // x += x << 32
-            // x >> 56
-            let ty = ty.unwrap();
             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            // FIXME(#1537): zero-extend 8/16/32-bit operands only to 32 bits,
-            // and fix the sequence below to work properly for this.
-            let narrow_mode = NarrowValueMode::ZeroExtend64;
-            let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
-            let tmp = ctx.alloc_tmp(I64).only_reg().unwrap();
+            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+            let ty = ty.unwrap();
+            let size = ScalarSize::from_operand_size(OperandSize::from_ty(ty));
+            let tmp = ctx.alloc_tmp(I8X16).only_reg().unwrap();
 
-            // If this is a 32-bit Popcnt, use Lsr32 to clear the top 32 bits of the register, then
-            // the rest of the code is identical to the 64-bit version.
-            // lsr [wx]d, [wx]n, #1
-            ctx.emit(Inst::AluRRImmShift {
-                alu_op: choose_32_64(ty, ALUOp::Lsr32, ALUOp::Lsr64),
-                rd: rd,
+            // fmov tmp, rn
+            // cnt tmp.8b, tmp.8b
+            // addp tmp.8b, tmp.8b, tmp.8b / addv tmp, tmp.8b / (no instruction for 8-bit inputs)
+            // umov rd, tmp.b[0]
+
+            ctx.emit(Inst::MovToFpu {
+                rd: tmp,
                 rn: rn,
-                immshift: ImmShift::maybe_from_u64(1).unwrap(),
+                size,
             });
-
-            // and xd, xd, #0x5555555555555555
-            ctx.emit(Inst::AluRRImmLogic {
-                alu_op: ALUOp::And64,
-                rd: rd,
-                rn: rd.to_reg(),
-                imml: ImmLogic::maybe_from_u64(0x5555555555555555, I64).unwrap(),
-            });
-
-            // sub xd, xn, xd
-            ctx.emit(Inst::AluRRR {
-                alu_op: ALUOp::Sub64,
-                rd: rd,
-                rn: rn,
-                rm: rd.to_reg(),
-            });
-
-            // and xt, xd, #0x3333333333333333
-            ctx.emit(Inst::AluRRImmLogic {
-                alu_op: ALUOp::And64,
-                rd: tmp,
-                rn: rd.to_reg(),
-                imml: ImmLogic::maybe_from_u64(0x3333333333333333, I64).unwrap(),
-            });
-
-            // lsr xd, xd, #2
-            ctx.emit(Inst::AluRRImmShift {
-                alu_op: ALUOp::Lsr64,
-                rd: rd,
-                rn: rd.to_reg(),
-                immshift: ImmShift::maybe_from_u64(2).unwrap(),
-            });
-
-            // and xd, xd, #0x3333333333333333
-            ctx.emit(Inst::AluRRImmLogic {
-                alu_op: ALUOp::And64,
-                rd: rd,
-                rn: rd.to_reg(),
-                imml: ImmLogic::maybe_from_u64(0x3333333333333333, I64).unwrap(),
-            });
-
-            // add xt, xd, xt
-            ctx.emit(Inst::AluRRR {
-                alu_op: ALUOp::Add64,
-                rd: tmp,
-                rn: rd.to_reg(),
-                rm: tmp.to_reg(),
-            });
-
-            // add xt, xt, xt LSR #4
-            ctx.emit(Inst::AluRRRShift {
-                alu_op: ALUOp::Add64,
+            ctx.emit(Inst::VecMisc {
+                op: VecMisc2::Cnt,
                 rd: tmp,
                 rn: tmp.to_reg(),
-                rm: tmp.to_reg(),
-                shiftop: ShiftOpAndAmt::new(
-                    ShiftOp::LSR,
-                    ShiftOpShiftImm::maybe_from_shift(4).unwrap(),
-                ),
+                size: VectorSize::Size8x8,
             });
 
-            // and xt, xt, #0x0f0f0f0f0f0f0f0f
-            ctx.emit(Inst::AluRRImmLogic {
-                alu_op: ALUOp::And64,
-                rd: tmp,
-                rn: tmp.to_reg(),
-                imml: ImmLogic::maybe_from_u64(0x0f0f0f0f0f0f0f0f, I64).unwrap(),
-            });
+            match ScalarSize::from_ty(ty) {
+                ScalarSize::Size8 => {}
+                ScalarSize::Size16 => {
+                    // ADDP is usually cheaper than ADDV.
+                    ctx.emit(Inst::VecRRR {
+                        alu_op: VecALUOp::Addp,
+                        rd: tmp,
+                        rn: tmp.to_reg(),
+                        rm: tmp.to_reg(),
+                        size: VectorSize::Size8x8,
+                    });
+                }
+                ScalarSize::Size32 | ScalarSize::Size64 => {
+                    ctx.emit(Inst::VecLanes {
+                        op: VecLanesOp::Addv,
+                        rd: tmp,
+                        rn: tmp.to_reg(),
+                        size: VectorSize::Size8x8,
+                    });
+                }
+                sz => panic!("Unexpected scalar FP operand size: {:?}", sz),
+            }
 
-            // add xt, xt, xt, LSL #8
-            ctx.emit(Inst::AluRRRShift {
-                alu_op: ALUOp::Add64,
-                rd: tmp,
+            ctx.emit(Inst::MovFromVec {
+                rd,
                 rn: tmp.to_reg(),
-                rm: tmp.to_reg(),
-                shiftop: ShiftOpAndAmt::new(
-                    ShiftOp::LSL,
-                    ShiftOpShiftImm::maybe_from_shift(8).unwrap(),
-                ),
-            });
-
-            // add xt, xt, xt, LSL #16
-            ctx.emit(Inst::AluRRRShift {
-                alu_op: ALUOp::Add64,
-                rd: tmp,
-                rn: tmp.to_reg(),
-                rm: tmp.to_reg(),
-                shiftop: ShiftOpAndAmt::new(
-                    ShiftOp::LSL,
-                    ShiftOpShiftImm::maybe_from_shift(16).unwrap(),
-                ),
-            });
-
-            // add xt, xt, xt, LSL #32
-            ctx.emit(Inst::AluRRRShift {
-                alu_op: ALUOp::Add64,
-                rd: tmp,
-                rn: tmp.to_reg(),
-                rm: tmp.to_reg(),
-                shiftop: ShiftOpAndAmt::new(
-                    ShiftOp::LSL,
-                    ShiftOpShiftImm::maybe_from_shift(32).unwrap(),
-                ),
-            });
-
-            // lsr xd, xt, #56
-            ctx.emit(Inst::AluRRImmShift {
-                alu_op: ALUOp::Lsr64,
-                rd: rd,
-                rn: tmp.to_reg(),
-                immshift: ImmShift::maybe_from_u64(56).unwrap(),
+                idx: 0,
+                size: VectorSize::Size8x16,
             });
         }
 
diff --git a/cranelift/filetests/filetests/isa/aarch64/bitops.clif b/cranelift/filetests/filetests/isa/aarch64/bitops.clif
index ab1c113104..8730128cc5 100644
--- a/cranelift/filetests/filetests/isa/aarch64/bitops.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/bitops.clif
@@ -230,19 +230,10 @@ block0(v0: i64):
 
 ; check: stp fp, lr, [sp, #-16]!
 ; nextln: mov fp, sp
-; nextln: lsr x1, x0, #1
-; nextln: and x1, x1, #6148914691236517205
-; nextln: sub x1, x0, x1
-; nextln: and x0, x1, #3689348814741910323
-; nextln: lsr x1, x1, #2
-; nextln: and x1, x1, #3689348814741910323
-; nextln: add x0, x1, x0
-; nextln: add x0, x0, x0, LSR 4
-; nextln: and x0, x0, #1085102592571150095
-; nextln: add x0, x0, x0, LSL 8
-; nextln: add x0, x0, x0, LSL 16
-; nextln: add x0, x0, x0, LSL 32
-; nextln: lsr x0, x0, #56
+; nextln: fmov d0, x0
+; nextln: cnt v0.8b, v0.8b
+; nextln: addv b0, v0.8b
+; nextln: umov w0, v0.b[0]
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
@@ -255,20 +246,10 @@ block0(v0: i32):
 
 ; check: stp fp, lr, [sp, #-16]!
 ; nextln: mov fp, sp
-; nextln: mov w0, w0
-; nextln: lsr w1, w0, #1
-; nextln: and x1, x1, #6148914691236517205
-; nextln: sub x1, x0, x1
-; nextln: and x0, x1, #3689348814741910323
-; nextln: lsr x1, x1, #2
-; nextln: and x1, x1, #3689348814741910323
-; nextln: add x0, x1, x0
-; nextln: add x0, x0, x0, LSR 4
-; nextln: and x0, x0, #1085102592571150095
-; nextln: add x0, x0, x0, LSL 8
-; nextln: add x0, x0, x0, LSL 16
-; nextln: add x0, x0, x0, LSL 32
-; nextln: lsr x0, x0, #56
+; nextln: fmov s0, w0
+; nextln: cnt v0.8b, v0.8b
+; nextln: addv b0, v0.8b
+; nextln: umov w0, v0.b[0]
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
@@ -281,20 +262,10 @@ block0(v0: i16):
 
 ; check: stp fp, lr, [sp, #-16]!
 ; nextln: mov fp, sp
-; nextln: uxth w0, w0
-; nextln: lsr w1, w0, #1
-; nextln: and x1, x1, #6148914691236517205
-; nextln: sub x1, x0, x1
-; nextln: and x0, x1, #3689348814741910323
-; nextln: lsr x1, x1, #2
-; nextln: and x1, x1, #3689348814741910323
-; nextln: add x0, x1, x0
-; nextln: add x0, x0, x0, LSR 4
-; nextln: and x0, x0, #1085102592571150095
-; nextln: add x0, x0, x0, LSL 8
-; nextln: add x0, x0, x0, LSL 16
-; nextln: add x0, x0, x0, LSL 32
-; nextln: lsr x0, x0, #56
+; nextln: fmov s0, w0
+; nextln: cnt v0.8b, v0.8b
+; nextln: addp v0.8b, v0.8b, v0.8b
+; nextln: umov w0, v0.b[0]
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
@@ -307,20 +278,9 @@ block0(v0: i8):
 
 ; check: stp fp, lr, [sp, #-16]!
 ; nextln: mov fp, sp
-; nextln: uxtb w0, w0
-; nextln: lsr w1, w0, #1
-; nextln: and x1, x1, #6148914691236517205
-; nextln: sub x1, x0, x1
-; nextln: and x0, x1, #3689348814741910323
-; nextln: lsr x1, x1, #2
-; nextln: and x1, x1, #3689348814741910323
-; nextln: add x0, x1, x0
-; nextln: add x0, x0, x0, LSR 4
-; nextln: and x0, x0, #1085102592571150095
-; nextln: add x0, x0, x0, LSL 8
-; nextln: add x0, x0, x0, LSL 16
-; nextln: add x0, x0, x0, LSL 32
-; nextln: lsr x0, x0, #56
+; nextln: fmov s0, w0
+; nextln: cnt v0.8b, v0.8b
+; nextln: umov w0, v0.b[0]
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
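
For reference, a minimal Rust sketch of what the FMOV/CNT/ADDV/UMOV sequence above
computes for a 64-bit input, intended only as a mental model for the new lowering.
The function name popcnt_via_bytes and the standalone-program framing are
illustrative assumptions, not code from this patch or the repository.

    // Model of the CNT-based lowering: CNT counts the set bits in each byte of
    // the vector register, and ADDV then sums those per-byte counts into a
    // single lane (ADDP, or no reduction at all, covers the narrower types).
    fn popcnt_via_bytes(x: u64) -> u32 {
        x.to_le_bytes().iter().map(|b| b.count_ones()).sum()
    }

    fn main() {
        for &x in &[0u64, 0x8000_0000_0000_0001, u64::MAX, 0x1234_5678_9ABC_DEF0] {
            // The byte-wise model agrees with the scalar popcount.
            assert_eq!(popcnt_via_bytes(x), x.count_ones());
        }
        println!("byte-wise model matches u64::count_ones");
    }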