From 6632c45c01fec9d30b62031e75e7c3e29b3fac04 Mon Sep 17 00:00:00 2001
From: Chris Fallin
Date: Tue, 8 Dec 2020 11:46:15 -0800
Subject: [PATCH] x64 lowering fix: i32.popcnt should not merge a load and
 make it 64-bit.

As a subtle consequence of the recent load-op fusion, a popcnt of a value
that came from a load.i32 was compiling into a 64-bit load. This stems from
the way x86 determines the width of a load: it is a property of the
instruction containing the memory reference, not of the memory reference
itself. So the `input_to_reg_mem()` helper (which converts an instruction
input into a register or memory reference) provided the appropriate memory
reference for the result of a load.i32, but nothing encoded the assumption
that this reference would only be used by a 32-bit instruction. It turns out
that popcnt.i32 consumes this RM operand with a 64-bit instruction, widening
the 32-bit load to a 64-bit one; this is problematic when the offset is
(memory_length - 4), because the extra four bytes fall outside the heap.

Separately, popcnt used the RM operand twice, resulting in two loads
whenever a load was merged. In practice this is not a correctness bug,
because only a racy execution (a store interleaving between the two loads)
could produce an incorrect result, but we decided earlier to treat loads as
effectful for now, neither reordering nor duplicating them, to deliberately
reduce complexity.

Because of the second issue, the fix is simply to always force the operand
into a register, so a source load is never merged.

Discovered via fuzzing with oss-fuzz.
---
 cranelift/codegen/src/isa/x64/lower.rs        |   5 +-
 .../filetests/filetests/isa/x64/popcnt.clif   | 113 ++++++++++++++++++
 2 files changed, 117 insertions(+), 1 deletion(-)
 create mode 100644 cranelift/filetests/filetests/isa/x64/popcnt.clif

diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs
index 16e7490a09..a01e35bc0d 100644
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -1530,7 +1530,10 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             let src = if let Some(ext_spec) = ext_spec {
                 RegMem::reg(extend_input_to_reg(ctx, inputs[0], ext_spec))
             } else {
-                input_to_reg_mem(ctx, inputs[0])
+                // N.B.: explicitly put input in a reg here because the width of the instruction
+                // into which this RM op goes may not match the width of the input type (in fact,
+                // it won't for i32.popcnt), and we don't want a larger-than-necessary load.
+                RegMem::reg(put_input_in_reg(ctx, inputs[0]))
             };
             let dst = get_output_reg(ctx, outputs[0]);
 
diff --git a/cranelift/filetests/filetests/isa/x64/popcnt.clif b/cranelift/filetests/filetests/isa/x64/popcnt.clif
new file mode 100644
index 0000000000..a06f5a27ce
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/x64/popcnt.clif
@@ -0,0 +1,113 @@
+test compile
+target x86_64
+feature "experimental_x64"
+
+; TODO: test with popcnt feature available too, once the new backend supports that.
+
+function %popcnt64(i64) -> i64 {
+block0(v0: i64):
+    v1 = popcnt v0
+; check: movq %rdi, %rsi
+; nextln: shrq $$1, %rsi
+; nextln: movabsq $$8608480567731124087, %rax
+; nextln: andq %rax, %rsi
+; nextln: subq %rsi, %rdi
+; nextln: shrq $$1, %rsi
+; nextln: andq %rax, %rsi
+; nextln: subq %rsi, %rdi
+; nextln: shrq $$1, %rsi
+; nextln: andq %rax, %rsi
+; nextln: subq %rsi, %rdi
+; nextln: movq %rdi, %rsi
+; nextln: shrq $$4, %rsi
+; nextln: addq %rdi, %rsi
+; nextln: movabsq $$1085102592571150095, %rdi
+; nextln: andq %rdi, %rsi
+; nextln: movabsq $$72340172838076673, %rdi
+; nextln: imulq %rdi, %rsi
+; nextln: shrq $$56, %rsi
+; nextln: movq %rsi, %rax
+    return v1
+}
+
+function %popcnt64load(i64) -> i64 {
+block0(v0: i64):
+    v1 = load.i64 v0
+    v2 = popcnt v1
+    return v2
+; check: movq 0(%rdi), %rdi
+; nextln: movq %rdi, %rsi
+; nextln: shrq $$1, %rsi
+; nextln: movabsq $$8608480567731124087, %rax
+; nextln: andq %rax, %rsi
+; nextln: subq %rsi, %rdi
+; nextln: shrq $$1, %rsi
+; nextln: andq %rax, %rsi
+; nextln: subq %rsi, %rdi
+; nextln: shrq $$1, %rsi
+; nextln: andq %rax, %rsi
+; nextln: subq %rsi, %rdi
+; nextln: movq %rdi, %rsi
+; nextln: shrq $$4, %rsi
+; nextln: addq %rdi, %rsi
+; nextln: movabsq $$1085102592571150095, %rdi
+; nextln: andq %rdi, %rsi
+; nextln: movabsq $$72340172838076673, %rdi
+; nextln: imulq %rdi, %rsi
+; nextln: shrq $$56, %rsi
+; nextln: movq %rsi, %rax
+}
+
+function %popcnt32(i32) -> i32 {
+block0(v0: i32):
+    v1 = popcnt v0
+    return v1
+; check: movq %rdi, %rsi
+; nextln: shrl $$1, %esi
+; nextln: andl $$2004318071, %esi
+; nextln: subl %esi, %edi
+; nextln: shrl $$1, %esi
+; nextln: andl $$2004318071, %esi
+; nextln: subl %esi, %edi
+; nextln: shrl $$1, %esi
+; nextln: andl $$2004318071, %esi
+; nextln: subl %esi, %edi
+; nextln: movq %rdi, %rsi
+; nextln: shrl $$4, %esi
+; nextln: addl %edi, %esi
+; nextln: andl $$252645135, %esi
+; nextln: imull $$16843009, %esi
+; nextln: shrl $$24, %esi
+; nextln: movq %rsi, %rax
+; nextln: movq %rbp, %rsp
+; nextln: popq %rbp
+; nextln: ret
+}
+
+function %popcnt32load(i64) -> i32 {
+block0(v0: i64):
+    v1 = load.i32 v0
+    v2 = popcnt v1
+    return v2
+; check: movl 0(%rdi), %edi
+; nextln: movq %rdi, %rsi
+; nextln: shrl $$1, %esi
+; nextln: andl $$2004318071, %esi
+; nextln: subl %esi, %edi
+; nextln: shrl $$1, %esi
+; nextln: andl $$2004318071, %esi
+; nextln: subl %esi, %edi
+; nextln: shrl $$1, %esi
+; nextln: andl $$2004318071, %esi
+; nextln: subl %esi, %edi
+; nextln: movq %rdi, %rsi
+; nextln: shrl $$4, %esi
+; nextln: addl %edi, %esi
+; nextln: andl $$252645135, %esi
+; nextln: imull $$16843009, %esi
+; nextln: shrl $$24, %esi
+; nextln: movq %rsi, %rax
+; nextln: movq %rbp, %rsp
+; nextln: popq %rbp
+; nextln: ret
+}
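
Note for readers outside the x64 backend (an illustration, not part of the
patch): the hazard is easiest to see as a bounds question. Below is a minimal
Rust sketch, using hypothetical names rather than Cranelift's actual API, of
why the merged-and-widened load is unsafe while a width-correct load at the
same address is fine:

    /// Does a load of `width` bytes at `addr` stay inside a heap of
    /// `heap_len` bytes? (Overflow-safe, as a real bounds check must be.)
    fn load_in_bounds(heap_len: usize, addr: usize, width: usize) -> bool {
        addr.checked_add(width).map_or(false, |end| end <= heap_len)
    }

    fn main() {
        let heap_len = 64 * 1024; // stand-in for a Wasm linear memory size
        let addr = heap_len - 4;  // the (memory_length - 4) offset called out above

        // A 4-byte load.i32 at memory_length - 4 is legal...
        assert!(load_in_bounds(heap_len, addr, 4));

        // ...but the same memory operand consumed by a 64-bit instruction
        // (the pre-fix i32.popcnt lowering) reads 8 bytes and goes out of
        // bounds, because x86 takes the load width from the consuming
        // instruction, not from the original CLIF load.
        assert!(!load_in_bounds(heap_len, addr, 8));
    }

This is also why forcing the value through put_input_in_reg() suffices: the
standalone mov that materializes the register carries the correct width (movl
for i32, movq for i64), which is exactly what the popcnt32load and
popcnt64load tests above check.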