diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
index 44da584b44..ee55f3394f 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -221,7 +221,7 @@ impl From<(Opcode, Type)> for BitOp {
             (Opcode::Clz, I64) => BitOp::Clz64,
             (Opcode::Cls, I32) => BitOp::Cls32,
             (Opcode::Cls, I64) => BitOp::Cls64,
-            _ => unreachable!("Called with non-bit op!"),
+            _ => unreachable!("Called with non-bit op!: {:?}", op_ty),
         }
     }
 }
diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs
index 4bb3c547e3..92760a81bb 100644
--- a/cranelift/codegen/src/isa/aarch64/lower.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower.rs
@@ -1240,24 +1240,64 @@ fn lower_insn_to_regs<C: LowerCtx<Inst>>(ctx: &mut C, insn: IRInst) {
             }
         }
 
-        Opcode::Bitrev | Opcode::Clz | Opcode::Cls => {
+        Opcode::Bitrev | Opcode::Clz | Opcode::Cls | Opcode::Ctz => {
             let rd = output_to_reg(ctx, outputs[0]);
-            let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
-            let op = BitOp::from((op, ty.unwrap()));
-            ctx.emit(Inst::BitRR { rd, rn, op });
-        }
+            let needs_zext = match op {
+                Opcode::Bitrev | Opcode::Ctz => false,
+                Opcode::Clz | Opcode::Cls => true,
+                _ => unreachable!(),
+            };
+            let ty = ty.unwrap();
+            let narrow_mode = if needs_zext && ty_bits(ty) == 64 {
+                NarrowValueMode::ZeroExtend64
+            } else if needs_zext {
+                NarrowValueMode::ZeroExtend32
+            } else {
+                NarrowValueMode::None
+            };
+            let rn = input_to_reg(ctx, inputs[0], narrow_mode);
+            let op_ty = match ty {
+                I8 | I16 | I32 => I32,
+                I64 => I64,
+                _ => panic!("Unsupported type for Bitrev/Clz/Cls"),
+            };
+            let bitop = match op {
+                Opcode::Clz | Opcode::Cls | Opcode::Bitrev => BitOp::from((op, op_ty)),
+                Opcode::Ctz => BitOp::from((Opcode::Bitrev, op_ty)),
+                _ => unreachable!(),
+            };
+            ctx.emit(Inst::BitRR { rd, rn, op: bitop });
 
-        Opcode::Ctz => {
-            let rd = output_to_reg(ctx, outputs[0]);
-            let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
-            let op = BitOp::from((Opcode::Bitrev, ty.unwrap()));
-            ctx.emit(Inst::BitRR { rd, rn, op });
-            let op = BitOp::from((Opcode::Clz, ty.unwrap()));
-            ctx.emit(Inst::BitRR {
-                rd,
-                rn: rd.to_reg(),
-                op,
-            });
+            // Both bitrev and ctz use a bit-reverse (rbit) instruction: ctz uses it to reduce
+            // the problem to a clz, and bitrev uses it as the main operation.
+            if op == Opcode::Bitrev || op == Opcode::Ctz {
+                // Reversing an n-bit value (n < 32) with a 32-bit bitrev instruction will place
+                // the reversed result in the highest n bits, so we need to shift them down into
+                // place.
+                let right_shift = match ty {
+                    I8 => Some(24),
+                    I16 => Some(16),
+                    I32 => None,
+                    I64 => None,
+                    _ => panic!("Unsupported type for Bitrev"),
+                };
+                if let Some(s) = right_shift {
+                    ctx.emit(Inst::AluRRImmShift {
+                        alu_op: ALUOp::Lsr32,
+                        rd,
+                        rn: rd.to_reg(),
+                        immshift: ImmShift::maybe_from_u64(s).unwrap(),
+                    });
+                }
+            }
+
+            if op == Opcode::Ctz {
+                ctx.emit(Inst::BitRR {
+                    op: BitOp::from((Opcode::Clz, op_ty)),
+                    rd,
+                    rn: rd.to_reg(),
+                });
+            }
         }
 
         Opcode::Popcnt => {
@@ -1272,7 +1312,10 @@ fn lower_insn_to_regs<C: LowerCtx<Inst>>(ctx: &mut C, insn: IRInst) {
             // x >> 56
             let ty = ty.unwrap();
             let rd = output_to_reg(ctx, outputs[0]);
-            let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
+            // FIXME(#1537): zero-extend 8/16/32-bit operands only to 32 bits,
+            // and fix the sequence below to work properly for this.
+            let narrow_mode = NarrowValueMode::ZeroExtend64;
+            let rn = input_to_reg(ctx, inputs[0], narrow_mode);
             let tmp = ctx.tmp(RegClass::I64, I64);
 
             // If this is a 32-bit Popcnt, use Lsr32 to clear the top 32 bits of the register, then
diff --git a/cranelift/filetests/filetests/vcode/aarch64/bitops.clif b/cranelift/filetests/filetests/vcode/aarch64/bitops.clif
index 8f5e81d322..f1f1a7dba3 100644
--- a/cranelift/filetests/filetests/vcode/aarch64/bitops.clif
+++ b/cranelift/filetests/filetests/vcode/aarch64/bitops.clif
@@ -1,6 +1,34 @@
 test vcode
 target aarch64
 
+function %a(i8) -> i8 {
+block0(v0: i8):
+    v1 = bitrev v0
+    return v1
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: rbit w0, w0
+; nextln: lsr w0, w0, #24
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %a(i16) -> i16 {
+block0(v0: i16):
+    v1 = bitrev v0
+    return v1
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: rbit w0, w0
+; nextln: lsr w0, w0, #16
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
 function %a(i32) -> i32 {
 block0(v0: i32):
     v1 = bitrev v0
@@ -27,6 +55,35 @@ block0(v0: i64):
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
 
+
+function %b(i8) -> i8 {
+block0(v0: i8):
+    v1 = clz v0
+    return v1
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: uxtb w0, w0
+; nextln: clz w0, w0
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %b(i16) -> i16 {
+block0(v0: i16):
+    v1 = clz v0
+    return v1
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: uxth w0, w0
+; nextln: clz w0, w0
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
 function %b(i32) -> i32 {
 block0(v0: i32):
     v1 = clz v0
@@ -53,6 +110,34 @@ block0(v0: i64):
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
 
+function %c(i8) -> i8 {
+block0(v0: i8):
+    v1 = cls v0
+    return v1
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: uxtb w0, w0
+; nextln: cls w0, w0
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %c(i16) -> i16 {
+block0(v0: i16):
+    v1 = cls v0
+    return v1
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: uxth w0, w0
+; nextln: cls w0, w0
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
 function %c(i32) -> i32 {
 block0(v0: i32):
     v1 = cls v0
@@ -79,6 +164,36 @@ block0(v0: i64):
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
 
+function %d(i8) -> i8 {
+block0(v0: i8):
+    v1 = ctz v0
+    return v1
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: rbit w0, w0
+; nextln: lsr w0, w0, #24
+; nextln: clz w0, w0
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %d(i16) -> i16 {
+block0(v0: i16):
+    v1 = ctz v0
+    return v1
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: rbit w0, w0
+; nextln: lsr w0, w0, #16
+; nextln: clz w0, w0
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
 function %d(i32) -> i32 {
 block0(v0: i32):
     v1 = ctz v0
@@ -140,6 +255,59 @@ block0(v0: i32):
 
 ; check: stp fp, lr, [sp, #-16]!
 ; nextln: mov fp, sp
+; nextln: mov w0, w0
+; nextln: lsr w1, w0, #1
+; nextln: and x1, x1, #6148914691236517205
+; nextln: sub x1, x0, x1
+; nextln: and x0, x1, #3689348814741910323
+; nextln: lsr x1, x1, #2
+; nextln: and x1, x1, #3689348814741910323
+; nextln: add x0, x1, x0
+; nextln: add x0, x0, x0, LSR 4
+; nextln: and x0, x0, #1085102592571150095
+; nextln: add x0, x0, x0, LSL 8
+; nextln: add x0, x0, x0, LSL 16
+; nextln: add x0, x0, x0, LSL 32
+; nextln: lsr x0, x0, #56
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %d(i16) -> i16 {
+block0(v0: i16):
+    v1 = popcnt v0
+    return v1
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: uxth x0, w0
+; nextln: lsr w1, w0, #1
+; nextln: and x1, x1, #6148914691236517205
+; nextln: sub x1, x0, x1
+; nextln: and x0, x1, #3689348814741910323
+; nextln: lsr x1, x1, #2
+; nextln: and x1, x1, #3689348814741910323
+; nextln: add x0, x1, x0
+; nextln: add x0, x0, x0, LSR 4
+; nextln: and x0, x0, #1085102592571150095
+; nextln: add x0, x0, x0, LSL 8
+; nextln: add x0, x0, x0, LSL 16
+; nextln: add x0, x0, x0, LSL 32
+; nextln: lsr x0, x0, #56
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %d(i8) -> i8 {
+block0(v0: i8):
+    v1 = popcnt v0
+    return v1
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: uxtb x0, w0
 ; nextln: lsr w1, w0, #1
 ; nextln: and x1, x1, #6148914691236517205
 ; nextln: sub x1, x0, x1
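
A note on the rbit-based lowerings above: AArch64's rbit always reverses the full 32- or 64-bit register, so reversing an 8- or 16-bit value lands the result in the top n bits and the extra lsr moves it back into place; for full-width ctz (the I32/I64 case, where no shift is emitted), rbit followed by clz does the whole job. The following is a minimal scalar sketch of those two identities in plain Rust, not Cranelift code; the function names are made up for illustration.

// i8 bitrev via a 32-bit reverse: rbit w0, w0 ; lsr w0, w0, #24.
// Whatever sits in bits 8..31 of the register is reversed into bits 0..23 and
// then discarded by the shift, so only the low byte affects the result.
fn bitrev_i8_via_rbit32(x: u32) -> u8 {
    let reversed = x.reverse_bits(); // rbit w0, w0
    (reversed >> 24) as u8           // lsr w0, w0, #24
}

// Full-width ctz via rbit + clz: ctz(x) == clz(bit-reverse(x)), including
// x == 0 (both sides give 32).
fn ctz_i32_via_rbit_clz(x: u32) -> u32 {
    x.reverse_bits().leading_zeros() // rbit w0, w0 ; clz w0, w0
}

fn main() {
    assert_eq!(bitrev_i8_via_rbit32(0b0001_0010), 0b0100_1000);
    assert_eq!(bitrev_i8_via_rbit32(0xABCD_0012), 0b0100_1000); // upper bits ignored
    assert_eq!(ctz_i32_via_rbit_clz(0b1000), 3);
    assert_eq!(ctz_i32_via_rbit_clz(0), 32);
}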
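The popcnt expectations are the usual SWAR popcount: fold the value into 2-bit, 4-bit and then 8-bit partial sums, multiply by 0x0101_0101_0101_0101 via three shift-adds, and read the total out of the top byte; the decimal immediates in the checks are the 0x5555..., 0x3333... and 0x0f0f... masks. A scalar sketch of that sequence in plain Rust (assuming, as the FIXME notes, an operand already zero-extended to 64 bits; an illustration, not the backend code):

// SWAR popcount matching the emitted sequence. The three shift-adds expand a
// multiply by 0x0101_0101_0101_0101, accumulating every byte's count into the
// top byte, which the final shift extracts.
fn popcnt_swar(x: u64) -> u64 {
    let x = x - ((x >> 1) & 0x5555_5555_5555_5555);                           // 2-bit sums
    let x = (x & 0x3333_3333_3333_3333) + ((x >> 2) & 0x3333_3333_3333_3333); // 4-bit sums
    let x = (x + (x >> 4)) & 0x0f0f_0f0f_0f0f_0f0f;                           // 8-bit sums
    let x = x.wrapping_add(x << 8);  // add x0, x0, x0, LSL 8
    let x = x.wrapping_add(x << 16); // add x0, x0, x0, LSL 16
    let x = x.wrapping_add(x << 32); // add x0, x0, x0, LSL 32
    x >> 56                          // lsr x0, x0, #56
}

fn main() {
    for &v in &[0u64, 1, 0xFF, 0x8000_0000_0000_0000, 0x1234_5678_9ABC_DEF0, u64::MAX] {
        assert_eq!(popcnt_swar(v), u64::from(v.count_ones()));
    }
}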