From 4638de673cf43028d22fc6ca58c7867f887704d7 Mon Sep 17 00:00:00 2001
From: Chris Fallin <chris@cfallin.org>
Date: Tue, 12 Jan 2021 15:37:53 -0800
Subject: [PATCH] x64 bugfix: prevent load-op fusion of cmp because it could be
 emitted multiple times.

On x64, the new backend generates `cmp` instructions at their use-sites
when possible (when the icmp that generates a boolean is known) so that
the condition flows directly through flags rather than a materialized
boolean. E.g., both `bint` (boolean to int) and `select` (conditional
select) instruction lowerings invoke `emit_cmp()` to do so.

Load-op fusion in `emit_cmp()` nominally allowed `cmp` to use its `cmp
reg, mem` form.

However, the mergeable-load condition (load has only single use) was not
adequately checked. Consider the sequence:

```
    v2 = load.i64 v1
    v3 = icmp eq v0, v2
    v4 = bint.i64 v3
    v5 = select.i64 v3, v0, v1
```

The load `v2` is only used in the `icmp` at `v3`. However, the cmp will
be separately codegen'd twice, once for the `bint` and once for the
`select`.

Prior to this fix, the above example would result in the load at `v2`
sinking to the `cmp` just above the `select`; we then emit another `cmp`
for the `bint`, but the load has already been used once so we do not
allow merging. We thus (i) expect the register for `v2` to contain the
loaded value, but (ii) skip the codegen for the load because it has been
sunk. This results in a regalloc error (unexpected livein) as the
unfilled register is upward-exposed to the entry point.

Because of this, we need to accept only the reg, reg form in
`emit_cmp()` (and the FP equivalent). We could get marginally better
code by tracking whether the `cmp` we are emitting comes from an
`icmp`/`fcmp` with only one use; but IMHO simplicity is a better rule
here when subtle interactions occur.
---
 cranelift/codegen/src/isa/x64/lower.rs        | 17 +++++--
 .../filetests/isa/x64/cmp-mem-bug.clif        | 49 +++++++++++++++++++
 2 files changed, 62 insertions(+), 4 deletions(-)
 create mode 100644 cranelift/filetests/filetests/isa/x64/cmp-mem-bug.clif
diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs
index 0b9784d59d..1712d5d172 100644
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -380,11 +380,18 @@ fn emit_cmp<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRInst) {
 
     // TODO Try to commute the operands (and invert the condition) if one is an immediate.
     let lhs = put_input_in_reg(ctx, inputs[0]);
-    let rhs = input_to_reg_mem_imm(ctx, inputs[1]);
+    // We force the RHS into a register, and disallow load-op fusion, because we
+    // do not have a transitive guarantee that this cmp-site will be the sole
+    // user of the value. Consider: the icmp might be the only user of a load,
+    // but there may be multiple users of the icmp (e.g.  select or bint
+    // instructions) that each invoke `emit_cmp()`. If we were to allow a load
+    // to sink to the *latest* one, but other sites did not permit sinking, then
+    // we would be missing the load for other cmp-sites.
+    let rhs = put_input_in_reg(ctx, inputs[1]);
 
     // Cranelift's icmp semantics want to compare lhs - rhs, while Intel gives
     // us dst - src at the machine instruction level, so invert operands.
-    ctx.emit(Inst::cmp_rmi_r(ty.bytes() as u8, rhs, lhs));
+    ctx.emit(Inst::cmp_rmi_r(ty.bytes() as u8, RegMemImm::reg(rhs), lhs));
 }
 
 /// A specification for a fcmp emission.
@@ -465,8 +472,10 @@ fn emit_fcmp<C: LowerCtx<I = Inst>>(
         (inputs[0], inputs[1])
     };
     let lhs = put_input_in_reg(ctx, lhs_input);
-    let rhs = input_to_reg_mem(ctx, rhs_input);
-    ctx.emit(Inst::xmm_cmp_rm_r(op, rhs, lhs));
+    // See above in `emit_cmp()`. We must only use the reg/reg form of the
+    // comparison in order to avoid issues with merged loads.
+    let rhs = put_input_in_reg(ctx, rhs_input);
+    ctx.emit(Inst::xmm_cmp_rm_r(op, RegMem::reg(rhs), lhs));
 
     let cond_result = match cond_code {
         FloatCC::Equal => FcmpCondResult::AndConditions(CC::NP, CC::Z),
diff --git a/cranelift/filetests/filetests/isa/x64/cmp-mem-bug.clif b/cranelift/filetests/filetests/isa/x64/cmp-mem-bug.clif
new file mode 100644
index 0000000000..9d05e04b04
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/x64/cmp-mem-bug.clif
@@ -0,0 +1,49 @@
+test compile
+target x86_64
+feature "experimental_x64"
+
+function %f0(i64, i64) -> i64, i64 {
+block0(v0: i64, v1: i64):
+    v2 = load.i64 v1
+; check:  movq    0(%rsi), %rax
+
+    v3 = icmp eq v0, v2
+
+    v4 = bint.i64 v3
+; nextln: cmpq    %rax, %rdi
+; nextln: setz    %cl
+; nextln: movzbq  %cl, %rcx
+
+    v5 = select.i64 v3, v0, v1
+; nextln: cmpq    %rax, %rdi
+; nextln: cmovzq  %rdi, %rsi
+
+    return v4, v5
+; nextln: movq    %rcx, %rax
+; nextln: movq    %rsi, %rdx
+}
+
+function %f1(f64, i64) -> i64, f64 {
+block0(v0: f64, v1: i64):
+    v2 = load.f64 v1
+; check:  movsd   0(%rdi), %xmm1
+
+    v3 = fcmp eq v0, v2
+
+    v4 = bint.i64 v3
+; nextln: ucomisd %xmm1, %xmm0
+; nextln: setnp   %dil
+; nextln: setz    %sil
+; nextln: andl    %edi, %esi
+; nextln: movzbq  %sil, %rsi
+
+    v5 = select.f64 v3, v0, v0
+; nextln: ucomisd %xmm1, %xmm0
+; nextln: movaps  %xmm0, %xmm1
+; nextln: jnp $$next; movsd %xmm0, %xmm1; $$next:
+; nextln: jz $$next; movsd %xmm0, %xmm1; $$next:
+
+    return v4, v5
+; nextln: movq    %rsi, %rax
+; nextln: movaps  %xmm1, %xmm0
+}