diff --git a/build.rs b/build.rs
index 69dcfff932..ecc27d218d 100644
--- a/build.rs
+++ b/build.rs
@@ -230,8 +230,7 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
             ("simd", _) if platform_is_s390x() => return true,
 
             // These are new instructions that are not really implemented in any backend.
-            ("simd", "simd_i8x16_arith2")
-            | ("simd", "simd_conversions")
+            ("simd", "simd_conversions")
             | ("simd", "simd_i16x8_extadd_pairwise_i8x16")
             | ("simd", "simd_i16x8_extmul_i8x16")
             | ("simd", "simd_i16x8_q15mulr_sat_s")
diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
index 25c83eede6..88ca58d400 100644
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -1181,86 +1181,108 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         }
 
         Opcode::Popcnt => {
-            let out_regs = get_output_reg(ctx, outputs[0]);
-            let in_regs = put_input_in_regs(ctx, inputs[0]);
             let ty = ty.unwrap();
-            let size = if ty == I128 {
-                ScalarSize::Size64
-            } else {
-                ScalarSize::from_operand_size(OperandSize::from_ty(ty))
-            };
-            let vec_size = if ty == I128 {
-                VectorSize::Size8x16
-            } else {
-                VectorSize::Size8x8
-            };
+            if ty.is_vector() {
+                let lane_type = ty.lane_type();
+                let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
+                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
 
-            let tmp = ctx.alloc_tmp(I8X16).only_reg().unwrap();
+                if lane_type != I8 {
+                    return Err(CodegenError::Unsupported(format!(
+                        "Unsupported SIMD vector lane type: {:?}",
+                        lane_type
+                    )));
+                }
 
-            // fmov tmp, in_lo
-            // if ty == i128:
-            //     mov tmp.d[1], in_hi
-            //
-            // cnt tmp.16b, tmp.16b / cnt tmp.8b, tmp.8b
-            // addv tmp, tmp.16b / addv tmp, tmp.8b / addp tmp.8b, tmp.8b, tmp.8b / (no instruction for 8-bit inputs)
-            //
-            // umov out_lo, tmp.b[0]
-            // if ty == i128:
-            //     mov out_hi, 0
-
-            ctx.emit(Inst::MovToFpu {
-                rd: tmp,
-                rn: in_regs.regs()[0],
-                size,
-            });
-
-            if ty == I128 {
-                ctx.emit(Inst::MovToVec {
-                    rd: tmp,
-                    rn: in_regs.regs()[1],
-                    idx: 1,
-                    size: VectorSize::Size64x2,
+                ctx.emit(Inst::VecMisc {
+                    op: VecMisc2::Cnt,
+                    rd,
+                    rn,
+                    size: VectorSize::from_ty(ty),
                 });
-            }
+            } else {
+                let out_regs = get_output_reg(ctx, outputs[0]);
+                let in_regs = put_input_in_regs(ctx, inputs[0]);
+                let size = if ty == I128 {
+                    ScalarSize::Size64
+                } else {
+                    ScalarSize::from_operand_size(OperandSize::from_ty(ty))
+                };
 
-            ctx.emit(Inst::VecMisc {
-                op: VecMisc2::Cnt,
-                rd: tmp,
-                rn: tmp.to_reg(),
-                size: vec_size,
-            });
+                let vec_size = if ty == I128 {
+                    VectorSize::Size8x16
+                } else {
+                    VectorSize::Size8x8
+                };
 
-            match ScalarSize::from_ty(ty) {
-                ScalarSize::Size8 => {}
-                ScalarSize::Size16 => {
-                    // ADDP is usually cheaper than ADDV.
-                    ctx.emit(Inst::VecRRR {
-                        alu_op: VecALUOp::Addp,
+                let tmp = ctx.alloc_tmp(I8X16).only_reg().unwrap();
+
+                // fmov tmp, in_lo
+                // if ty == i128:
+                //     mov tmp.d[1], in_hi
+                //
+                // cnt tmp.16b, tmp.16b / cnt tmp.8b, tmp.8b
+                // addv tmp, tmp.16b / addv tmp, tmp.8b / addp tmp.8b, tmp.8b, tmp.8b / (no instruction for 8-bit inputs)
+                //
+                // umov out_lo, tmp.b[0]
+                // if ty == i128:
+                //     mov out_hi, 0
+
+                ctx.emit(Inst::MovToFpu {
+                    rd: tmp,
+                    rn: in_regs.regs()[0],
+                    size,
+                });
+
+                if ty == I128 {
+                    ctx.emit(Inst::MovToVec {
                         rd: tmp,
-                        rn: tmp.to_reg(),
-                        rm: tmp.to_reg(),
-                        size: VectorSize::Size8x8,
+                        rn: in_regs.regs()[1],
+                        idx: 1,
+                        size: VectorSize::Size64x2,
                     });
                 }
-                ScalarSize::Size32 | ScalarSize::Size64 | ScalarSize::Size128 => {
-                    ctx.emit(Inst::VecLanes {
-                        op: VecLanesOp::Addv,
-                        rd: tmp,
-                        rn: tmp.to_reg(),
-                        size: vec_size,
-                    });
-                }
-            }
 
-            ctx.emit(Inst::MovFromVec {
-                rd: out_regs.regs()[0],
-                rn: tmp.to_reg(),
-                idx: 0,
-                size: VectorSize::Size8x16,
-            });
-            if ty == I128 {
-                lower_constant_u64(ctx, out_regs.regs()[1], 0);
+                ctx.emit(Inst::VecMisc {
+                    op: VecMisc2::Cnt,
+                    rd: tmp,
+                    rn: tmp.to_reg(),
+                    size: vec_size,
+                });
+
+                match ScalarSize::from_ty(ty) {
+                    ScalarSize::Size8 => {}
+                    ScalarSize::Size16 => {
+                        // ADDP is usually cheaper than ADDV.
+                        ctx.emit(Inst::VecRRR {
+                            alu_op: VecALUOp::Addp,
+                            rd: tmp,
+                            rn: tmp.to_reg(),
+                            rm: tmp.to_reg(),
+                            size: VectorSize::Size8x8,
+                        });
+                    }
+                    ScalarSize::Size32 | ScalarSize::Size64 | ScalarSize::Size128 => {
+                        ctx.emit(Inst::VecLanes {
+                            op: VecLanesOp::Addv,
+                            rd: tmp,
+                            rn: tmp.to_reg(),
+                            size: vec_size,
+                        });
+                    }
+                }
+
+                ctx.emit(Inst::MovFromVec {
+                    rd: out_regs.regs()[0],
+                    rn: tmp.to_reg(),
+                    idx: 0,
+                    size: VectorSize::Size8x16,
+                });
+
+                if ty == I128 {
+                    lower_constant_u64(ctx, out_regs.regs()[1], 0);
+                }
             }
         }
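Note on the lowering above (a reviewer aside, not part of the patch): AArch64's `CNT` instruction is only defined for 8-bit lanes, which is why the new vector path accepts exactly `I8` lane types and returns `CodegenError::Unsupported` otherwise; `i8x16.popcnt` lowers to a single `cnt v.16b, v.16b`. The pre-existing scalar path uses the same trick: move the integer into a vector register, `cnt` the bytes, then sum the byte counts horizontally (`addv`, or the cheaper `addp` for 16-bit inputs). Below is a minimal stand-alone model of both paths, in plain Rust with made-up helper names (`cnt_8x16`, `popcnt_via_bytes`), purely to sanity-check the intended semantics:

```rust
/// Models `cnt v.16b, v.16b`: each byte lane is replaced by its
/// population count (0..=8). The vector path emits exactly this.
fn cnt_8x16(v: [u8; 16]) -> [u8; 16] {
    v.map(|b| b.count_ones() as u8)
}

/// Models the scalar path: per-byte `cnt`, then a horizontal sum of
/// the byte counts (what `addv`/`addp` do in the emitted code).
fn popcnt_via_bytes(x: u64) -> u32 {
    x.to_le_bytes().iter().map(|b| b.count_ones()).sum()
}

fn main() {
    // Vector path: one CNT, no lane reduction needed.
    assert_eq!(cnt_8x16([0b1010_0001; 16]), [3; 16]);
    assert_eq!(cnt_8x16([0xFF; 16]), [8; 16]);

    // Scalar path: CNT + ADDV agrees with the native popcount.
    for x in [0u64, 1, 0x8000_0000_0000_0001, u64::MAX] {
        assert_eq!(popcnt_via_bytes(x), x.count_ones());
    }
}
```

The `I128` case follows the same model over both halves: the low and high 64 bits land in one vector register (`fmov` plus `mov v.d[1]`), a single `cnt`/`addv` pair sums all 16 byte counts, and the high half of the 128-bit result is simply zeroed.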