diff --git a/cranelift/filetests/isa/intel/binary32.cton b/cranelift/filetests/isa/intel/binary32.cton
index 4cc14b6905..77e53ceeec 100644
--- a/cranelift/filetests/isa/intel/binary32.cton
+++ b/cranelift/filetests/isa/intel/binary32.cton
@@ -217,6 +217,13 @@ ebb0:
     ; asm: movsbl -50000(%esi), %edx
     [-,%rdx]            v129 = sload8.i32 v2-50000         ; bin: 0f be 96 ffff3cb0
 
+    ; Bit-counting instructions.
+
+    ; asm: popcntl %esi, %ecx
+    [-,%rcx]            v200 = popcnt v2        ; bin: f3 0f b8 ce
+    ; asm: popcntl %ecx, %esi
+    [-,%rsi]            v201 = popcnt v1        ; bin: f3 0f b8 f1
+
     ; asm: call foo
     call fn0()                                  ; bin: e8 PCRel4(fn0) 00000000
 
diff --git a/cranelift/filetests/isa/intel/binary64.cton b/cranelift/filetests/isa/intel/binary64.cton
index e9ee5d3941..da350d1e45 100644
--- a/cranelift/filetests/isa/intel/binary64.cton
+++ b/cranelift/filetests/isa/intel/binary64.cton
@@ -145,6 +145,15 @@ ebb0:
     ; asm: movq %rcx, %r10
     [-,%r10]             v112 = copy v1          ; bin: 49 89 ca
 
+    ; Bit-counting instructions.
+
+    ; asm: popcntq %rsi, %rcx
+    [-,%rcx]            v200 = popcnt v2        ; bin: f3 48 0f b8 ce
+    ; asm: popcntq %r10, %rsi
+    [-,%rsi]            v201 = popcnt v3        ; bin: f3 49 0f b8 f2
+    ; asm: popcntq %rcx, %r10
+    [-,%r10]            v202 = popcnt v1        ; bin: f3 4c 0f b8 d1
+
     return                                       ; bin: c3
 }
 
@@ -290,5 +299,14 @@ ebb0:
     ; asm: movl %ecx, %r10d
     [-,%r10]             v112 = copy v1          ; bin: 41 89 ca
 
+    ; Bit-counting instructions.
+
+    ; asm: popcntl %esi, %ecx
+    [-,%rcx]            v200 = popcnt v2        ; bin: f3 40 0f b8 ce
+    ; asm: popcntl %r10d, %esi
+    [-,%rsi]            v201 = popcnt v3        ; bin: f3 41 0f b8 f2
+    ; asm: popcntl %ecx, %r10d
+    [-,%r10]            v202 = popcnt v1        ; bin: f3 44 0f b8 d1
+
     return                                       ; bin: c3
 }
diff --git a/cranelift/filetests/wasm/i32-arith.cton b/cranelift/filetests/wasm/i32-arith.cton
index f2fafffee3..fc730cbe23 100644
--- a/cranelift/filetests/wasm/i32-arith.cton
+++ b/cranelift/filetests/wasm/i32-arith.cton
@@ -19,7 +19,12 @@ ebb0:
 
 ; function %i32_clz(i32) -> i32
 ; function %i32_ctz(i32) -> i32
-; function %i32_popcnt(i32) -> i32
+
+function %i32_popcnt(i32) -> i32 {
+ebb0(v0: i32):
+    v1 = popcnt v0
+    return v1
+}
 
 ; Binary operations.
 
diff --git a/lib/cretonne/meta/base/instructions.py b/lib/cretonne/meta/base/instructions.py
index e1b38086e4..2aa9027a45 100644
--- a/lib/cretonne/meta/base/instructions.py
+++ b/lib/cretonne/meta/base/instructions.py
@@ -8,7 +8,7 @@ from __future__ import absolute_import
 from cdsl.operands import Operand, VARIABLE_ARGS
 from cdsl.typevar import TypeVar
 from cdsl.instructions import Instruction, InstructionGroup
-from base.types import i8, f32, f64, b1
+from base.types import f32, f64, b1
 from base.immediates import imm64, uimm8, ieee32, ieee64, offset32, uoffset32
 from base.immediates import intcc, floatcc, memflags, regunit
 from base import entities
@@ -1050,7 +1050,7 @@ sshr_imm = Instruction(
 #
 
 x = Operand('x', iB)
-a = Operand('a', i8)
+a = Operand('a', iB)
 
 clz = Instruction(
         'clz', r"""
diff --git a/lib/cretonne/meta/isa/intel/encodings.py b/lib/cretonne/meta/isa/intel/encodings.py
index df6631fe4c..d8dfe53140 100644
--- a/lib/cretonne/meta/isa/intel/encodings.py
+++ b/lib/cretonne/meta/isa/intel/encodings.py
@@ -22,10 +22,10 @@ for inst,           opc in [
     # default. Otherwise reg-alloc would never use r8 and up.
     I64.enc(inst.i32, *r.rr(opc))
 
-I32.enc(base.copy.i32, *r.ur(0x89))
-I64.enc(base.copy.i64, *r.ur.rex(0x89, w=1))
-I64.enc(base.copy.i32, *r.ur.rex(0x89))
-I64.enc(base.copy.i32, *r.ur(0x89))
+I32.enc(base.copy.i32, *r.umr(0x89))
+I64.enc(base.copy.i64, *r.umr.rex(0x89, w=1))
+I64.enc(base.copy.i32, *r.umr.rex(0x89))
+I64.enc(base.copy.i32, *r.umr(0x89))
 
 I32.enc(base.regmove.i32, *r.rmov(0x89))
 I64.enc(base.regmove.i64, *r.rmov.rex(0x89, w=1))
@@ -80,6 +80,12 @@ for inst,           rrr in [
     I64.enc(inst.i32.i32, *r.rc.rex(0xd3, rrr=rrr))
     I64.enc(inst.i32.i32, *r.rc(0xd3, rrr=rrr))
 
+# Population count.
+I32.enc(base.popcnt.i32, *r.urm(0xf3, 0x0f, 0xb8))
+I64.enc(base.popcnt.i64, *r.urm.rex(0xf3, 0x0f, 0xb8, w=1))
+I64.enc(base.popcnt.i32, *r.urm.rex(0xf3, 0x0f, 0xb8))
+I64.enc(base.popcnt.i32, *r.urm(0xf3, 0x0f, 0xb8))
+
 # Loads and stores.
 I32.enc(base.store.i32.i32, *r.st(0x89))
 I32.enc(base.store.i32.i32, *r.stDisp8(0x89))
diff --git a/lib/cretonne/meta/isa/intel/recipes.py b/lib/cretonne/meta/isa/intel/recipes.py
index c6c7b3f2af..b7e6aa5575 100644
--- a/lib/cretonne/meta/isa/intel/recipes.py
+++ b/lib/cretonne/meta/isa/intel/recipes.py
@@ -198,14 +198,23 @@ rr = TailRecipe(
         ''')
 
 # XX /r, but for a unary operator with separate input/output register, like
-# copies.
-ur = TailRecipe(
-        'ur', Unary, size=1, ins=GPR, outs=GPR,
+# copies. MR form.
+umr = TailRecipe(
+        'umr', Unary, size=1, ins=GPR, outs=GPR,
         emit='''
         PUT_OP(bits, rex2(out_reg0, in_reg0), sink);
         modrm_rr(out_reg0, in_reg0, sink);
         ''')
 
+# XX /r, but for a unary operator with separate input/output register.
+# RM form: operands are passed as (in_reg0, out_reg0), the reverse of `umr`.
+urm = TailRecipe(
+        'urm', Unary, size=1, ins=GPR, outs=GPR,
+        emit='''
+        PUT_OP(bits, rex2(in_reg0, out_reg0), sink);
+        modrm_rr(in_reg0, out_reg0, sink);
+        ''')
+
 # XX /r, for regmove instructions.
 rmov = TailRecipe(
         'ur', RegMove, size=1, ins=GPR, outs=(),
diff --git a/lib/cretonne/src/isa/intel/binemit.rs b/lib/cretonne/src/isa/intel/binemit.rs
index 9db0ee1234..56f0c2f7f0 100644
--- a/lib/cretonne/src/isa/intel/binemit.rs
+++ b/lib/cretonne/src/isa/intel/binemit.rs
@@ -57,7 +57,7 @@ fn rex_prefix<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) {
 // Emit a single-byte opcode with no REX prefix.
 fn put_op1<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) {
     debug_assert_eq!(bits & 0x8f00, 0, "Invalid encoding bits for Op1*");
-    debug_assert_eq!(rex, BASE_REX, "Invalid registers for REX-less encoding");
+    debug_assert_eq!(rex, BASE_REX, "Invalid registers for REX-less Op1 encoding");
     sink.put1(bits as u8);
 }
 
@@ -71,17 +71,37 @@ fn put_rexop1<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) {
 // Emit two-byte opcode: 0F XX
 fn put_op2<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) {
     debug_assert_eq!(bits & 0x8f00, 0x0400, "Invalid encoding bits for Op2*");
-    debug_assert_eq!(rex, BASE_REX, "Invalid registers for REX-less encoding");
+    debug_assert_eq!(rex, BASE_REX, "Invalid registers for REX-less Op2 encoding");
     sink.put1(0x0f);
     sink.put1(bits as u8);
 }
 
 // Emit single-byte opcode with mandatory prefix.
 fn put_mp1<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) {
-    debug_assert_eq!(bits & 0x0c00, 0, "Invalid encoding bits for Mp1*");
+    debug_assert_eq!(bits & 0x8c00, 0, "Invalid encoding bits for Mp1*");
     let pp = (bits >> 8) & 3;
     sink.put1(PREFIX[(pp - 1) as usize]);
-    debug_assert_eq!(rex, BASE_REX, "Invalid registers for REX-less encoding");
+    debug_assert_eq!(rex, BASE_REX, "Invalid registers for REX-less Mp1 encoding");
+    sink.put1(bits as u8);
+}
+
+// Emit a mandatory prefix byte (picked by bits 8-9) followed by a two-byte opcode: 0F XX.
+fn put_mp2<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) {
+    debug_assert_eq!(bits & 0x8c00, 0x0400, "Invalid encoding bits for Mp2*");
+    let pp = (bits >> 8) & 3;
+    sink.put1(PREFIX[(pp - 1) as usize]);
+    debug_assert_eq!(rex, BASE_REX, "Invalid registers for REX-less Mp2 encoding");
+    sink.put1(0x0f);
+    sink.put1(bits as u8);
+}
+
+// Emit a mandatory prefix byte (picked by bits 8-9), a REX prefix, then a two-byte opcode: 0F XX.
+fn put_rexmp2<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) {
+    debug_assert_eq!(bits & 0x0c00, 0x0400, "Invalid encoding bits for RexMp2*");
+    let pp = (bits >> 8) & 3;
+    sink.put1(PREFIX[(pp - 1) as usize]);
+    rex_prefix(bits, rex, sink);
+    sink.put1(0x0f);
     sink.put1(bits as u8);
 }