diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs
index 16e7490a09..a01e35bc0d 100644
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -1530,7 +1530,10 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             let src = if let Some(ext_spec) = ext_spec {
                 RegMem::reg(extend_input_to_reg(ctx, inputs[0], ext_spec))
             } else {
-                input_to_reg_mem(ctx, inputs[0])
+                // N.B.: explicitly put input in a reg here because the width of the instruction
+                // into which this RM op goes may not match the width of the input type (in fact,
+                // it won't for i32.popcnt), and we don't want a larger than necessary load.
+                RegMem::reg(put_input_in_reg(ctx, inputs[0]))
             };
 
             let dst = get_output_reg(ctx, outputs[0]);
diff --git a/cranelift/filetests/filetests/isa/x64/popcnt.clif b/cranelift/filetests/filetests/isa/x64/popcnt.clif
new file mode 100644
index 0000000000..a06f5a27ce
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/x64/popcnt.clif
@@ -0,0 +1,113 @@
+test compile
+target x86_64
+feature "experimental_x64"
+
+; TODO: test with popcnt feature available too, once new backend supports that.
+
+function %popcnt64(i64) -> i64 {
+block0(v0: i64):
+    v1 = popcnt v0
+; check: movq %rdi, %rsi
+; nextln: shrq $$1, %rsi
+; nextln: movabsq $$8608480567731124087, %rax
+; nextln: andq %rax, %rsi
+; nextln: subq %rsi, %rdi
+; nextln: shrq $$1, %rsi
+; nextln: andq %rax, %rsi
+; nextln: subq %rsi, %rdi
+; nextln: shrq $$1, %rsi
+; nextln: andq %rax, %rsi
+; nextln: subq %rsi, %rdi
+; nextln: movq %rdi, %rsi
+; nextln: shrq $$4, %rsi
+; nextln: addq %rdi, %rsi
+; nextln: movabsq $$1085102592571150095, %rdi
+; nextln: andq %rdi, %rsi
+; nextln: movabsq $$72340172838076673, %rdi
+; nextln: imulq %rdi, %rsi
+; nextln: shrq $$56, %rsi
+; nextln: movq %rsi, %rax
+    return v1
+}
+
+function %popcnt64load(i64) -> i64 {
+block0(v0: i64):
+    v1 = load.i64 v0
+    v2 = popcnt v1
+    return v2
+; check: movq 0(%rdi), %rdi
+; nextln: movq %rdi, %rsi
+; nextln: shrq $$1, %rsi
+; nextln: movabsq $$8608480567731124087, %rax
+; nextln: andq %rax, %rsi
+; nextln: subq %rsi, %rdi
+; nextln: shrq $$1, %rsi
+; nextln: andq %rax, %rsi
+; nextln: subq %rsi, %rdi
+; nextln: shrq $$1, %rsi
+; nextln: andq %rax, %rsi
+; nextln: subq %rsi, %rdi
+; nextln: movq %rdi, %rsi
+; nextln: shrq $$4, %rsi
+; nextln: addq %rdi, %rsi
+; nextln: movabsq $$1085102592571150095, %rdi
+; nextln: andq %rdi, %rsi
+; nextln: movabsq $$72340172838076673, %rdi
+; nextln: imulq %rdi, %rsi
+; nextln: shrq $$56, %rsi
+; nextln: movq %rsi, %rax
+}
+
+function %popcnt32(i32) -> i32 {
+block0(v0: i32):
+    v1 = popcnt v0
+    return v1
+; check: movq %rdi, %rsi
+; nextln: shrl $$1, %esi
+; nextln: andl $$2004318071, %esi
+; nextln: subl %esi, %edi
+; nextln: shrl $$1, %esi
+; nextln: andl $$2004318071, %esi
+; nextln: subl %esi, %edi
+; nextln: shrl $$1, %esi
+; nextln: andl $$2004318071, %esi
+; nextln: subl %esi, %edi
+; nextln: movq %rdi, %rsi
+; nextln: shrl $$4, %esi
+; nextln: addl %edi, %esi
+; nextln: andl $$252645135, %esi
+; nextln: imull $$16843009, %esi
+; nextln: shrl $$24, %esi
+; nextln: movq %rsi, %rax
+; nextln: movq %rbp, %rsp
+; nextln: popq %rbp
+; nextln: ret
+}
+
+function %popcnt32load(i64) -> i32 {
+block0(v0: i64):
+    v1 = load.i32 v0
+    v2 = popcnt v1
+    return v2
+; check: movl 0(%rdi), %edi
+; nextln: movq %rdi, %rsi
+; nextln: shrl $$1, %esi
+; nextln: andl $$2004318071, %esi
+; nextln: subl %esi, %edi
+; nextln: shrl $$1, %esi
+; nextln: andl $$2004318071, %esi
+; nextln: subl %esi, %edi
+; nextln: shrl $$1, %esi
+; nextln: andl $$2004318071, %esi
+; nextln: subl %esi, %edi
+; nextln: movq %rdi, %rsi
+; nextln: shrl $$4, %esi
+; nextln: addl %edi, %esi
+; nextln: andl $$252645135, %esi
+; nextln: imull $$16843009, %esi
+; nextln: shrl $$24, %esi
+; nextln: movq %rsi, %rax
+; nextln: movq %rbp, %rsp
+; nextln: popq %rbp
+; nextln: ret
+}