From 435a15b88dae7ce4f2a4634a50bff46a3558f41e Mon Sep 17 00:00:00 2001
From: Jakob Stoklund Olesen <jolesen@mozilla.com>
Date: Wed, 12 Jul 2017 14:14:08 -0700
Subject: [PATCH] Add Intel encodings for popcnt.

Change the result type for the bit-counting instructions from a fixed i8
to the iB type variable, which is the type of the input. This matches the
convention in WebAssembly, and at least Intel's instructions will set a
full register's worth of count result, even if it is always <= 64.

Duplicate the Intel 'ur' encoding recipe into 'umr' and 'urm' variants
corresponding to the MR and RM encoding variants, respectively. The
difference is which register is encoded as 'reg' and which is 'r/m' in
the ModR/M byte. A 'mov' register copy uses the MR variant, a unary
popcnt uses the RM variant.
---
 cranelift/filetests/isa/intel/binary32.cton |  7 ++++++
 cranelift/filetests/isa/intel/binary64.cton | 18 +++++++++++++
 cranelift/filetests/wasm/i32-arith.cton     |  7 +++++-
 lib/cretonne/meta/base/instructions.py      |  4 +--
 lib/cretonne/meta/isa/intel/encodings.py    | 14 ++++++++---
 lib/cretonne/meta/isa/intel/recipes.py      | 15 ++++++++---
 lib/cretonne/src/isa/intel/binemit.rs       | 28 ++++++++++++++++++---
 7 files changed, 79 insertions(+), 14 deletions(-)

diff --git a/cranelift/filetests/isa/intel/binary32.cton b/cranelift/filetests/isa/intel/binary32.cton
index 4cc14b6905..77e53ceeec 100644
--- a/cranelift/filetests/isa/intel/binary32.cton
+++ b/cranelift/filetests/isa/intel/binary32.cton
@@ -217,6 +217,13 @@ ebb0:
     ; asm: movsbl -50000(%esi), %edx
     [-,%rdx]            v129 = sload8.i32 v2-50000         ; bin: 0f be 96 ffff3cb0
 
+    ; Bit-counting instructions.
+
+    ; asm: popcntl %esi, %ecx
+    [-,%rcx]            v200 = popcnt v2        ; bin: f3 0f b8 ce
+    ; asm: popcntl %ecx, %esi
+    [-,%rsi]            v201 = popcnt v1        ; bin: f3 0f b8 f1
+
     ; asm: call foo
     call fn0()                                  ; bin: e8 PCRel4(fn0) 00000000
 
diff --git a/cranelift/filetests/isa/intel/binary64.cton b/cranelift/filetests/isa/intel/binary64.cton
index e9ee5d3941..da350d1e45 100644
--- a/cranelift/filetests/isa/intel/binary64.cton
+++ b/cranelift/filetests/isa/intel/binary64.cton
@@ -145,6 +145,15 @@ ebb0:
     ; asm: movq %rcx, %r10
     [-,%r10]             v112 = copy v1          ; bin: 49 89 ca
 
+    ; Bit-counting instructions.
+
+    ; asm: popcntq %rsi, %rcx
+    [-,%rcx]            v200 = popcnt v2        ; bin: f3 48 0f b8 ce
+    ; asm: popcntq %r10, %rsi
+    [-,%rsi]            v201 = popcnt v3        ; bin: f3 49 0f b8 f2
+    ; asm: popcntq %rcx, %r10
+    [-,%r10]            v202 = popcnt v1        ; bin: f3 4c 0f b8 d1
+
     return                                       ; bin: c3
 }
 
@@ -290,5 +299,14 @@ ebb0:
     ; asm: movl %ecx, %r10d
     [-,%r10]             v112 = copy v1          ; bin: 41 89 ca
 
+    ; Bit-counting instructions.
+
+    ; asm: popcntl %esi, %ecx
+    [-,%rcx]            v200 = popcnt v2        ; bin: f3 40 0f b8 ce
+    ; asm: popcntl %r10d, %esi
+    [-,%rsi]            v201 = popcnt v3        ; bin: f3 41 0f b8 f2
+    ; asm: popcntl %ecx, %r10d
+    [-,%r10]            v202 = popcnt v1        ; bin: f3 44 0f b8 d1
+
     return                                       ; bin: c3
 }
diff --git a/cranelift/filetests/wasm/i32-arith.cton b/cranelift/filetests/wasm/i32-arith.cton
index f2fafffee3..fc730cbe23 100644
--- a/cranelift/filetests/wasm/i32-arith.cton
+++ b/cranelift/filetests/wasm/i32-arith.cton
@@ -19,7 +19,12 @@ ebb0:
 
 ; function %i32_clz(i32) -> i32
 ; function %i32_ctz(i32) -> i32
-; function %i32_popcnt(i32) -> i32
+
+function %i32_popcnt(i32) -> i32 {
+ebb0(v0: i32):
+    v1 = popcnt v0
+    return v1
+}
 
 ; Binary operations.
 
diff --git a/lib/cretonne/meta/base/instructions.py b/lib/cretonne/meta/base/instructions.py
index e1b38086e4..2aa9027a45 100644
--- a/lib/cretonne/meta/base/instructions.py
+++ b/lib/cretonne/meta/base/instructions.py
@@ -8,7 +8,7 @@ from __future__ import absolute_import
 from cdsl.operands import Operand, VARIABLE_ARGS
 from cdsl.typevar import TypeVar
 from cdsl.instructions import Instruction, InstructionGroup
-from base.types import i8, f32, f64, b1
+from base.types import f32, f64, b1
 from base.immediates import imm64, uimm8, ieee32, ieee64, offset32, uoffset32
 from base.immediates import intcc, floatcc, memflags, regunit
 from base import entities
@@ -1050,7 +1050,7 @@ sshr_imm = Instruction(
 #
 
 x = Operand('x', iB)
-a = Operand('a', i8)
+a = Operand('a', iB)
 
 clz = Instruction(
         'clz', r"""
diff --git a/lib/cretonne/meta/isa/intel/encodings.py b/lib/cretonne/meta/isa/intel/encodings.py
index df6631fe4c..d8dfe53140 100644
--- a/lib/cretonne/meta/isa/intel/encodings.py
+++ b/lib/cretonne/meta/isa/intel/encodings.py
@@ -22,10 +22,10 @@ for inst,           opc in [
     # default. Otherwise reg-alloc would never use r8 and up.
     I64.enc(inst.i32, *r.rr(opc))
 
-I32.enc(base.copy.i32, *r.ur(0x89))
-I64.enc(base.copy.i64, *r.ur.rex(0x89, w=1))
-I64.enc(base.copy.i32, *r.ur.rex(0x89))
-I64.enc(base.copy.i32, *r.ur(0x89))
+I32.enc(base.copy.i32, *r.umr(0x89))
+I64.enc(base.copy.i64, *r.umr.rex(0x89, w=1))
+I64.enc(base.copy.i32, *r.umr.rex(0x89))
+I64.enc(base.copy.i32, *r.umr(0x89))
 
 I32.enc(base.regmove.i32, *r.rmov(0x89))
 I64.enc(base.regmove.i64, *r.rmov.rex(0x89, w=1))
@@ -80,6 +80,12 @@ for inst,           rrr in [
     I64.enc(inst.i32.i32, *r.rc.rex(0xd3, rrr=rrr))
     I64.enc(inst.i32.i32, *r.rc(0xd3, rrr=rrr))
 
+# Population count.
+I32.enc(base.popcnt.i32, *r.urm(0xf3, 0x0f, 0xb8))
+I64.enc(base.popcnt.i64, *r.urm.rex(0xf3, 0x0f, 0xb8, w=1))
+I64.enc(base.popcnt.i32, *r.urm.rex(0xf3, 0x0f, 0xb8))
+I64.enc(base.popcnt.i32, *r.urm(0xf3, 0x0f, 0xb8))
+
 # Loads and stores.
 I32.enc(base.store.i32.i32, *r.st(0x89))
 I32.enc(base.store.i32.i32, *r.stDisp8(0x89))
diff --git a/lib/cretonne/meta/isa/intel/recipes.py b/lib/cretonne/meta/isa/intel/recipes.py
index c6c7b3f2af..b7e6aa5575 100644
--- a/lib/cretonne/meta/isa/intel/recipes.py
+++ b/lib/cretonne/meta/isa/intel/recipes.py
@@ -198,14 +198,23 @@ rr = TailRecipe(
         ''')
 
 # XX /r, but for a unary operator with separate input/output register, like
-# copies.
-ur = TailRecipe(
-        'ur', Unary, size=1, ins=GPR, outs=GPR,
+# copies. MR form.
+umr = TailRecipe(
+        'umr', Unary, size=1, ins=GPR, outs=GPR,
         emit='''
         PUT_OP(bits, rex2(out_reg0, in_reg0), sink);
         modrm_rr(out_reg0, in_reg0, sink);
         ''')
 
+# XX /r, but for a unary operator with separate input/output register.
+# RM form.
+urm = TailRecipe(
+        'urm', Unary, size=1, ins=GPR, outs=GPR,
+        emit='''
+        PUT_OP(bits, rex2(in_reg0, out_reg0), sink);
+        modrm_rr(in_reg0, out_reg0, sink);
+        ''')
+
 # XX /r, for regmove instructions.
 rmov = TailRecipe(
         'ur', RegMove, size=1, ins=GPR, outs=(),
diff --git a/lib/cretonne/src/isa/intel/binemit.rs b/lib/cretonne/src/isa/intel/binemit.rs
index 9db0ee1234..56f0c2f7f0 100644
--- a/lib/cretonne/src/isa/intel/binemit.rs
+++ b/lib/cretonne/src/isa/intel/binemit.rs
@@ -57,7 +57,7 @@ fn rex_prefix<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) {
 // Emit a single-byte opcode with no REX prefix.
 fn put_op1<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) {
     debug_assert_eq!(bits & 0x8f00, 0, "Invalid encoding bits for Op1*");
-    debug_assert_eq!(rex, BASE_REX, "Invalid registers for REX-less encoding");
+    debug_assert_eq!(rex, BASE_REX, "Invalid registers for REX-less Op1 encoding");
     sink.put1(bits as u8);
 }
 
@@ -71,17 +71,37 @@ fn put_rexop1<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) {
 // Emit two-byte opcode: 0F XX
 fn put_op2<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) {
     debug_assert_eq!(bits & 0x8f00, 0x0400, "Invalid encoding bits for Op2*");
-    debug_assert_eq!(rex, BASE_REX, "Invalid registers for REX-less encoding");
+    debug_assert_eq!(rex, BASE_REX, "Invalid registers for REX-less Op2 encoding");
     sink.put1(0x0f);
     sink.put1(bits as u8);
 }
 
 // Emit single-byte opcode with mandatory prefix.
 fn put_mp1<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) {
-    debug_assert_eq!(bits & 0x0c00, 0, "Invalid encoding bits for Mp1*");
+    debug_assert_eq!(bits & 0x8c00, 0, "Invalid encoding bits for Mp1*");
     let pp = (bits >> 8) & 3;
     sink.put1(PREFIX[(pp - 1) as usize]);
-    debug_assert_eq!(rex, BASE_REX, "Invalid registers for REX-less encoding");
+    debug_assert_eq!(rex, BASE_REX, "Invalid registers for REX-less Mp1 encoding");
+    sink.put1(bits as u8);
+}
+
+// Emit two-byte opcode (0F XX) with mandatory prefix.
+fn put_mp2<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) {
+    debug_assert_eq!(bits & 0x8c00, 0x0400, "Invalid encoding bits for Mp2*");
+    let pp = (bits >> 8) & 3;
+    sink.put1(PREFIX[(pp - 1) as usize]);
+    debug_assert_eq!(rex, BASE_REX, "Invalid registers for REX-less Mp2 encoding");
+    sink.put1(0x0f);
+    sink.put1(bits as u8);
+}
+
+// Emit two-byte opcode (0F XX) with mandatory prefix and REX.
+fn put_rexmp2<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) {
+    debug_assert_eq!(bits & 0x0c00, 0x0400, "Invalid encoding bits for Mp2*");
+    let pp = (bits >> 8) & 3;
+    sink.put1(PREFIX[(pp - 1) as usize]);
+    rex_prefix(bits, rex, sink);
+    sink.put1(0x0f);
     sink.put1(bits as u8);
 }