From 4638de673cf43028d22fc6ca58c7867f887704d7 Mon Sep 17 00:00:00 2001 From: Chris Fallin Date: Tue, 12 Jan 2021 15:37:53 -0800 Subject: [PATCH] x64 bugfix: prevent load-op fusion of cmp because it could be emitted multiple times. On x64, the new backend generates `cmp` instructions at their use-sites when possible (when the icmp that generates a boolean is known) so that the condition flows directly through flags rather than a materialized boolean. E.g., both `bint` (boolean to int) and `select` (conditional select) instruction lowerings invoke `emit_cmp()` to do so. Load-op fusion in `emit_cmp()` nominally allowed `cmp` to use its `cmp reg, mem` form. However, the mergeable-load condition (load has only single use) was not adequately checked. Consider the sequence: ``` v2 = load.i64 v1 v3 = icmp eq v0, v2 v4 = bint.i64 v3 v5 = select.i64 v3, v0, v1 ``` The load `v2` is only used in the `icmp` at `v3`. However, the cmp will be separately codegen'd twice, once for the `bint` and once for the `select`. Prior to this fix, the above example would result in the load at `v2` sinking to the `cmp` just above the `select`; we then emit another `cmp` for the `bint`, but the load has already been used once so we do not allow merging. We thus (i) expect the register for `v2` to contain the loaded value, but (ii) skip the codegen for the load because it has been sunk. This results in a regalloc error (unexpected livein) as the unfilled register is upward-exposed to the entry point. Because of this, we need to accept only the reg, reg form in `emit_cmp()` (and the FP equivalent). We could get marginally better code by tracking whether the `cmp` we are emitting comes from an `icmp`/`fcmp` with only one use; but IMHO simplicity is a better rule here when subtle interactions occur. --- cranelift/codegen/src/isa/x64/lower.rs | 17 +++++-- .../filetests/isa/x64/cmp-mem-bug.clif | 49 +++++++++++++++++++ 2 files changed, 62 insertions(+), 4 deletions(-) create mode 100644 cranelift/filetests/filetests/isa/x64/cmp-mem-bug.clif diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index 0b9784d59d..1712d5d172 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -380,11 +380,18 @@ fn emit_cmp>(ctx: &mut C, insn: IRInst) { // TODO Try to commute the operands (and invert the condition) if one is an immediate. let lhs = put_input_in_reg(ctx, inputs[0]); - let rhs = input_to_reg_mem_imm(ctx, inputs[1]); + // We force the RHS into a register, and disallow load-op fusion, because we + // do not have a transitive guarantee that this cmp-site will be the sole + // user of the value. Consider: the icmp might be the only user of a load, + // but there may be multiple users of the icmp (e.g. select or bint + // instructions) that each invoke `emit_cmp()`. If we were to allow a load + // to sink to the *latest* one, but other sites did not permit sinking, then + // we would be missing the load for other cmp-sites. + let rhs = put_input_in_reg(ctx, inputs[1]); // Cranelift's icmp semantics want to compare lhs - rhs, while Intel gives // us dst - src at the machine instruction level, so invert operands. - ctx.emit(Inst::cmp_rmi_r(ty.bytes() as u8, rhs, lhs)); + ctx.emit(Inst::cmp_rmi_r(ty.bytes() as u8, RegMemImm::reg(rhs), lhs)); } /// A specification for a fcmp emission. @@ -465,8 +472,10 @@ fn emit_fcmp>( (inputs[0], inputs[1]) }; let lhs = put_input_in_reg(ctx, lhs_input); - let rhs = input_to_reg_mem(ctx, rhs_input); - ctx.emit(Inst::xmm_cmp_rm_r(op, rhs, lhs)); + // See above in `emit_cmp()`. We must only use the reg/reg form of the + // comparison in order to avoid issues with merged loads. + let rhs = put_input_in_reg(ctx, rhs_input); + ctx.emit(Inst::xmm_cmp_rm_r(op, RegMem::reg(rhs), lhs)); let cond_result = match cond_code { FloatCC::Equal => FcmpCondResult::AndConditions(CC::NP, CC::Z), diff --git a/cranelift/filetests/filetests/isa/x64/cmp-mem-bug.clif b/cranelift/filetests/filetests/isa/x64/cmp-mem-bug.clif new file mode 100644 index 0000000000..9d05e04b04 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/cmp-mem-bug.clif @@ -0,0 +1,49 @@ +test compile +target x86_64 +feature "experimental_x64" + +function %f0(i64, i64) -> i64, i64 { +block0(v0: i64, v1: i64): + v2 = load.i64 v1 +; check: movq 0(%rsi), %rax + + v3 = icmp eq v0, v2 + + v4 = bint.i64 v3 +; nextln: cmpq %rax, %rdi +; nextln: setz %cl +; nextln: movzbq %cl, %rcx + + v5 = select.i64 v3, v0, v1 +; nextln: cmpq %rax, %rdi +; nextln: cmovzq %rdi, %rsi + + return v4, v5 +; nextln: movq %rcx, %rax +; nextln: movq %rsi, %rdx +} + +function %f1(f64, i64) -> i64, f64 { +block0(v0: f64, v1: i64): + v2 = load.f64 v1 +; check: movsd 0(%rdi), %xmm1 + + v3 = fcmp eq v0, v2 + + v4 = bint.i64 v3 +; nextln: ucomisd %xmm1, %xmm0 +; nextln: setnp %dil +; nextln: setz %sil +; nextln: andl %edi, %esi +; nextln: movzbq %sil, %rsi + + v5 = select.f64 v3, v0, v0 +; nextln: ucomisd %xmm1, %xmm0 +; nextln: movaps %xmm0, %xmm1 +; nextln: jnp $$next; movsd %xmm0, %xmm1; $$next: +; nextln: jz $$next; movsd %xmm0, %xmm1; $$next: + + return v4, v5 +; nextln: movq %rsi, %rax +; nextln: movaps %xmm1, %xmm0 +}