From 6632c45c01fec9d30b62031e75e7c3e29b3fac04 Mon Sep 17 00:00:00 2001
From: Chris Fallin
Date: Tue, 8 Dec 2020 11:46:15 -0800
Subject: [PATCH] x64 lowering fix: i32.popcnt should not merge a load and
 make it 64-bit.

As a subtle consequence of the recent load-op fusion, a popcnt of a value
that came from a load.i32 was compiling into a 64-bit load. This stems from
the way x86 determines the width of a load: it is a property of the
instruction containing the memory reference, not of the memory reference
itself. So the `input_to_reg_mem()` helper (which converts an instruction
input into a register or memory reference) provided the appropriate memory
reference for the result of a load.i32, but nothing encoded the assumption
that this reference would only be used by a 32-bit instruction. It turns out
that popcnt.i32 consumes this RM operand with a 64-bit instruction, widening
the 32-bit load to a 64-bit one; this is problematic when the offset is
(memory_length - 4), because the extra four bytes fall outside the heap.

Separately, popcnt used the RM operand twice, resulting in two loads
whenever a load was merged. In practice this is not a correctness bug,
because only a racy execution (a store interleaving between the two loads)
could produce an incorrect result, but we decided earlier to treat loads as
effectful for now, neither reordering nor duplicating them, to deliberately
reduce complexity.

Because of the second issue, the fix is simply to always force the operand
into a register, so a source load is never merged.

Discovered via fuzzing with oss-fuzz.
---
 cranelift/codegen/src/isa/x64/lower.rs        |   5 +-
 .../filetests/filetests/isa/x64/popcnt.clif   | 113 ++++++++++++++++++
 2 files changed, 117 insertions(+), 1 deletion(-)
 create mode 100644 cranelift/filetests/filetests/isa/x64/popcnt.clif

diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs
index 16e7490a09..a01e35bc0d 100644
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -1530,7 +1530,10 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             let src = if let Some(ext_spec) = ext_spec {
                 RegMem::reg(extend_input_to_reg(ctx, inputs[0], ext_spec))
             } else {
-                input_to_reg_mem(ctx, inputs[0])
+                // N.B.: explicitly put input in a reg here because the width of the instruction
+                // into which this RM op goes may not match the width of the input type (in fact,
+                // it won't for i32.popcnt), and we don't want a larger-than-necessary load.
+                RegMem::reg(put_input_in_reg(ctx, inputs[0]))
             };
             let dst = get_output_reg(ctx, outputs[0]);
 
diff --git a/cranelift/filetests/filetests/isa/x64/popcnt.clif b/cranelift/filetests/filetests/isa/x64/popcnt.clif
new file mode 100644
index 0000000000..a06f5a27ce
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/x64/popcnt.clif
@@ -0,0 +1,113 @@
+test compile
+target x86_64
+feature "experimental_x64"
+
+; TODO: test with popcnt feature available too, once the new backend supports that.
+
+function %popcnt64(i64) -> i64 {
+block0(v0: i64):
+    v1 = popcnt v0
+; check: movq %rdi, %rsi
+; nextln: shrq $$1, %rsi
+; nextln: movabsq $$8608480567731124087, %rax
+; nextln: andq %rax, %rsi
+; nextln: subq %rsi, %rdi
+; nextln: shrq $$1, %rsi
+; nextln: andq %rax, %rsi
+; nextln: subq %rsi, %rdi
+; nextln: shrq $$1, %rsi
+; nextln: andq %rax, %rsi
+; nextln: subq %rsi, %rdi
+; nextln: movq %rdi, %rsi
+; nextln: shrq $$4, %rsi
+; nextln: addq %rdi, %rsi
+; nextln: movabsq $$1085102592571150095, %rdi
+; nextln: andq %rdi, %rsi
+; nextln: movabsq $$72340172838076673, %rdi
+; nextln: imulq %rdi, %rsi
+; nextln: shrq $$56, %rsi
+; nextln: movq %rsi, %rax
+    return v1
+}
+
+function %popcnt64load(i64) -> i64 {
+block0(v0: i64):
+    v1 = load.i64 v0
+    v2 = popcnt v1
+    return v2
+; check: movq 0(%rdi), %rdi
+; nextln: movq %rdi, %rsi
+; nextln: shrq $$1, %rsi
+; nextln: movabsq $$8608480567731124087, %rax
+; nextln: andq %rax, %rsi
+; nextln: subq %rsi, %rdi
+; nextln: shrq $$1, %rsi
+; nextln: andq %rax, %rsi
+; nextln: subq %rsi, %rdi
+; nextln: shrq $$1, %rsi
+; nextln: andq %rax, %rsi
+; nextln: subq %rsi, %rdi
+; nextln: movq %rdi, %rsi
+; nextln: shrq $$4, %rsi
+; nextln: addq %rdi, %rsi
+; nextln: movabsq $$1085102592571150095, %rdi
+; nextln: andq %rdi, %rsi
+; nextln: movabsq $$72340172838076673, %rdi
+; nextln: imulq %rdi, %rsi
+; nextln: shrq $$56, %rsi
+; nextln: movq %rsi, %rax
+}
+
+function %popcnt32(i32) -> i32 {
+block0(v0: i32):
+    v1 = popcnt v0
+    return v1
+; check: movq %rdi, %rsi
+; nextln: shrl $$1, %esi
+; nextln: andl $$2004318071, %esi
+; nextln: subl %esi, %edi
+; nextln: shrl $$1, %esi
+; nextln: andl $$2004318071, %esi
+; nextln: subl %esi, %edi
+; nextln: shrl $$1, %esi
+; nextln: andl $$2004318071, %esi
+; nextln: subl %esi, %edi
+; nextln: movq %rdi, %rsi
+; nextln: shrl $$4, %esi
+; nextln: addl %edi, %esi
+; nextln: andl $$252645135, %esi
+; nextln: imull $$16843009, %esi
+; nextln: shrl $$24, %esi
+; nextln: movq %rsi, %rax
+; nextln: movq %rbp, %rsp
+; nextln: popq %rbp
+; nextln: ret
+}
+
+function %popcnt32load(i64) -> i32 {
+block0(v0: i64):
+    v1 = load.i32 v0
+    v2 = popcnt v1
+    return v2
+; check: movl 0(%rdi), %edi
+; nextln: movq %rdi, %rsi
+; nextln: shrl $$1, %esi
+; nextln: andl $$2004318071, %esi
+; nextln: subl %esi, %edi
+; nextln: shrl $$1, %esi
+; nextln: andl $$2004318071, %esi
+; nextln: subl %esi, %edi
+; nextln: shrl $$1, %esi
+; nextln: andl $$2004318071, %esi
+; nextln: subl %esi, %edi
+; nextln: movq %rdi, %rsi
+; nextln: shrl $$4, %esi
+; nextln: addl %edi, %esi
+; nextln: andl $$252645135, %esi
+; nextln: imull $$16843009, %esi
+; nextln: shrl $$24, %esi
+; nextln: movq %rsi, %rax
+; nextln: movq %rbp, %rsp
+; nextln: popq %rbp
+; nextln: ret
+}
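
Note for readers outside the x64 backend (an illustration, not part of the
patch): the hazard is easiest to see as a bounds question. Below is a minimal
Rust sketch, using hypothetical names rather than Cranelift's actual API, of
why the merged-and-widened load is unsafe while a width-correct load at the
same address is fine:

    /// Does a load of `width` bytes at `addr` stay inside a heap of
    /// `heap_len` bytes? (Overflow-safe, as a real bounds check must be.)
    fn load_in_bounds(heap_len: usize, addr: usize, width: usize) -> bool {
        addr.checked_add(width).map_or(false, |end| end <= heap_len)
    }

    fn main() {
        let heap_len = 64 * 1024; // stand-in for a Wasm linear memory size
        let addr = heap_len - 4;  // the (memory_length - 4) offset called out above

        // A 4-byte load.i32 at memory_length - 4 is legal...
        assert!(load_in_bounds(heap_len, addr, 4));

        // ...but the same memory operand consumed by a 64-bit instruction
        // (the pre-fix i32.popcnt lowering) reads 8 bytes and goes out of
        // bounds, because x86 takes the load width from the consuming
        // instruction, not from the original CLIF load.
        assert!(!load_in_bounds(heap_len, addr, 8));
    }

This is also why forcing the value through put_input_in_reg() suffices: the
standalone mov that materializes the register carries the correct width (movl
for i32, movq for i64), which is exactly what the popcnt32load and
popcnt64load tests above check.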