aarch64: Add popcnt for i128 values

2021-06-02 17:35:02 +01:00
parent 5140fd251a
commit 09fec151eb
3 changed files with 77 additions and 13 deletions
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -1087,27 +1087,54 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
        }

        Opcode::Popcnt => {
-            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+            let out_regs = get_output_reg(ctx, outputs[0]);
+            let in_regs = put_input_in_regs(ctx, inputs[0]);
            let ty = ty.unwrap();
-            let size = ScalarSize::from_operand_size(OperandSize::from_ty(ty));
+            let size = if ty == I128 {
+                ScalarSize::Size64
+            } else {
+                ScalarSize::from_operand_size(OperandSize::from_ty(ty))
+            };
+
+            let vec_size = if ty == I128 {
+                VectorSize::Size8x16
+            } else {
+                VectorSize::Size8x8
+            };
+
            let tmp = ctx.alloc_tmp(I8X16).only_reg().unwrap();

-            // fmov tmp, rn
-            // cnt tmp.8b, tmp.8b
-            // addp tmp.8b, tmp.8b, tmp.8b / addv tmp, tmp.8b / (no instruction for 8-bit inputs)
-            // umov rd, tmp.b[0]
+            // fmov tmp, in_lo
+            // if ty == i128:
+            //     mov tmp.d[1], in_hi
+            //
+            // cnt tmp.16b, tmp.16b / cnt tmp.8b, tmp.8b
+            // addv tmp, tmp.16b / addv tmp, tmp.8b / addp tmp.8b, tmp.8b, tmp.8b / (no instruction for 8-bit inputs)
+            //
+            // umov out_lo, tmp.b[0]
+            // if ty == i128:
+            //     mov out_hi, 0

            ctx.emit(Inst::MovToFpu {
                rd: tmp,
-                rn: rn,
+                rn: in_regs.regs()[0],
                size,
            });
+
+            if ty == I128 {
+                ctx.emit(Inst::MovToVec {
+                    rd: tmp,
+                    rn: in_regs.regs()[1],
+                    idx: 1,
+                    size: VectorSize::Size64x2,
+                });
+            }
+
            ctx.emit(Inst::VecMisc {
                op: VecMisc2::Cnt,
                rd: tmp,
                rn: tmp.to_reg(),
-                size: VectorSize::Size8x8,
+                size: vec_size,
            });

            match ScalarSize::from_ty(ty) {
@@ -1122,23 +1149,25 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                        size: VectorSize::Size8x8,
                    });
                }
-                ScalarSize::Size32 | ScalarSize::Size64 => {
+                ScalarSize::Size32 | ScalarSize::Size64 | ScalarSize::Size128 => {
                    ctx.emit(Inst::VecLanes {
                        op: VecLanesOp::Addv,
                        rd: tmp,
                        rn: tmp.to_reg(),
-                        size: VectorSize::Size8x8,
+                        size: vec_size,
                    });
                }
-                sz => panic!("Unexpected scalar FP operand size: {:?}", sz),
            }

            ctx.emit(Inst::MovFromVec {
-                rd,
+                rd: out_regs.regs()[0],
                rn: tmp.to_reg(),
                idx: 0,
                size: VectorSize::Size8x16,
            });
+            if ty == I128 {
+                lower_constant_u64(ctx, out_regs.regs()[1], 0);
+            }
        }

        Opcode::Load