x64 lowering fix: i32.popcnt should not merge load and make it 64-bit.

As a subtle consequence of the recent load-op fusion, popcnt of a
value that came from a load.i32 was compiling into a 64-bit load. This
is a result of the way in which x86 infers the width of loads: it is a
consequence of the instruction containing the memory reference, not the
memory reference itself. So the `input_to_reg_mem()` helper (convert an
instruction input into a register or memory reference) was providing the
appropriate memory reference for the result of a load.i32, but never
encoded the assumption that it would only be used in a 32-bit
instruction. It turns out that popcnt.i32 uses a 64-bit instruction to
load this RM op, hence widening a 32-bit to 64-bit load (which is
problematic when the offset is (memory_length - 4)).

Separately, popcnt was using the RM operand twice, resulting in two
loads if we merged a load. This isn't a correctness bug in practice
because only a racy sequence (store interleaving between the loads)
would produce incorrect results, but we decided earlier to treat loads
as effectful for now, neither reordering nor duplicating them, to
deliberately reduce complexity.

Because of the second issue, the fix is just to force the operand into a
register always, so any source load will not be merged.

Discovered via fuzzing with oss-fuzz.
This commit is contained in:
Chris Fallin
2020-12-08 11:46:15 -08:00
parent 2cec20aa57
commit 6632c45c01
2 changed files with 117 additions and 1 deletions

View File

@@ -1530,7 +1530,10 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
        let src = if let Some(ext_spec) = ext_spec {
            RegMem::reg(extend_input_to_reg(ctx, inputs[0], ext_spec))
        } else {
-           input_to_reg_mem(ctx, inputs[0])
+           // N.B.: explicitly put input in a reg here because the width of the instruction
+           // into which this RM op goes may not match the width of the input type (in fact,
+           // it won't for i32.popcnt), and we don't want a larger than necessary load.
+           RegMem::reg(put_input_in_reg(ctx, inputs[0]))
        };
        let dst = get_output_reg(ctx, outputs[0]);

View File

@@ -0,0 +1,113 @@
test compile
target x86_64
feature "experimental_x64"
; TODO: test with popcnt feature available too, once new backend supports that.
function %popcnt64(i64) -> i64 {
block0(v0: i64):
v1 = popcnt v0
; check: movq %rdi, %rsi
; nextln: shrq $$1, %rsi
; nextln: movabsq $$8608480567731124087, %rax
; nextln: andq %rax, %rsi
; nextln: subq %rsi, %rdi
; nextln: shrq $$1, %rsi
; nextln: andq %rax, %rsi
; nextln: subq %rsi, %rdi
; nextln: shrq $$1, %rsi
; nextln: andq %rax, %rsi
; nextln: subq %rsi, %rdi
; nextln: movq %rdi, %rsi
; nextln: shrq $$4, %rsi
; nextln: addq %rdi, %rsi
; nextln: movabsq $$1085102592571150095, %rdi
; nextln: andq %rdi, %rsi
; nextln: movabsq $$72340172838076673, %rdi
; nextln: imulq %rdi, %rsi
; nextln: shrq $$56, %rsi
; nextln: movq %rsi, %rax
return v1
}
function %popcnt64load(i64) -> i64 {
block0(v0: i64):
v1 = load.i64 v0
v2 = popcnt v1
return v2
; check: movq 0(%rdi), %rdi
; nextln: movq %rdi, %rsi
; nextln: shrq $$1, %rsi
; nextln: movabsq $$8608480567731124087, %rax
; nextln: andq %rax, %rsi
; nextln: subq %rsi, %rdi
; nextln: shrq $$1, %rsi
; nextln: andq %rax, %rsi
; nextln: subq %rsi, %rdi
; nextln: shrq $$1, %rsi
; nextln: andq %rax, %rsi
; nextln: subq %rsi, %rdi
; nextln: movq %rdi, %rsi
; nextln: shrq $$4, %rsi
; nextln: addq %rdi, %rsi
; nextln: movabsq $$1085102592571150095, %rdi
; nextln: andq %rdi, %rsi
; nextln: movabsq $$72340172838076673, %rdi
; nextln: imulq %rdi, %rsi
; nextln: shrq $$56, %rsi
; nextln: movq %rsi, %rax
}
function %popcnt32(i32) -> i32 {
block0(v0: i32):
v1 = popcnt v0
return v1
; check: movq %rdi, %rsi
; nextln: shrl $$1, %esi
; nextln: andl $$2004318071, %esi
; nextln: subl %esi, %edi
; nextln: shrl $$1, %esi
; nextln: andl $$2004318071, %esi
; nextln: subl %esi, %edi
; nextln: shrl $$1, %esi
; nextln: andl $$2004318071, %esi
; nextln: subl %esi, %edi
; nextln: movq %rdi, %rsi
; nextln: shrl $$4, %esi
; nextln: addl %edi, %esi
; nextln: andl $$252645135, %esi
; nextln: imull $$16843009, %esi
; nextln: shrl $$24, %esi
; nextln: movq %rsi, %rax
; nextln: movq %rbp, %rsp
; nextln: popq %rbp
; nextln: ret
}
function %popcnt32load(i64) -> i32 {
block0(v0: i64):
v1 = load.i32 v0
v2 = popcnt v1
return v2
; check: movl 0(%rdi), %edi
; nextln: movq %rdi, %rsi
; nextln: shrl $$1, %esi
; nextln: andl $$2004318071, %esi
; nextln: subl %esi, %edi
; nextln: shrl $$1, %esi
; nextln: andl $$2004318071, %esi
; nextln: subl %esi, %edi
; nextln: shrl $$1, %esi
; nextln: andl $$2004318071, %esi
; nextln: subl %esi, %edi
; nextln: movq %rdi, %rsi
; nextln: shrl $$4, %esi
; nextln: addl %edi, %esi
; nextln: andl $$252645135, %esi
; nextln: imull $$16843009, %esi
; nextln: shrl $$24, %esi
; nextln: movq %rsi, %rax
; nextln: movq %rbp, %rsp
; nextln: popq %rbp
; nextln: ret
}