diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs index a6938ca64d..0e99ff94fa 100644 --- a/cranelift/codegen/src/isa/x64/inst/args.rs +++ b/cranelift/codegen/src/isa/x64/inst/args.rs @@ -406,6 +406,8 @@ pub enum UnaryRmROpcode { Lzcnt, /// Counts trailing zeroes (Trailing Zero CouNT). Tzcnt, + /// Counts the number of ones (POPulation CouNT). + Popcnt, } impl fmt::Debug for UnaryRmROpcode { @@ -415,6 +417,7 @@ impl fmt::Debug for UnaryRmROpcode { UnaryRmROpcode::Bsf => write!(fmt, "bsf"), UnaryRmROpcode::Lzcnt => write!(fmt, "lzcnt"), UnaryRmROpcode::Tzcnt => write!(fmt, "tzcnt"), + UnaryRmROpcode::Popcnt => write!(fmt, "popcnt"), } } } diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index 74559272a7..4d6ae596eb 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -677,23 +677,25 @@ pub(crate) fn emit( _ => unreachable!(), }; + use UnaryRmROpcode::*; let prefix = match size { 2 => match op { - UnaryRmROpcode::Bsr | UnaryRmROpcode::Bsf => LegacyPrefixes::_66, - UnaryRmROpcode::Lzcnt | UnaryRmROpcode::Tzcnt => LegacyPrefixes::_66F3, + Bsr | Bsf => LegacyPrefixes::_66, + Lzcnt | Tzcnt | Popcnt => LegacyPrefixes::_66F3, }, 4 | 8 => match op { - UnaryRmROpcode::Bsr | UnaryRmROpcode::Bsf => LegacyPrefixes::None, - UnaryRmROpcode::Lzcnt | UnaryRmROpcode::Tzcnt => LegacyPrefixes::_F3, + Bsr | Bsf => LegacyPrefixes::None, + Lzcnt | Tzcnt | Popcnt => LegacyPrefixes::_F3, }, _ => unreachable!(), }; let (opcode, num_opcodes) = match op { - UnaryRmROpcode::Bsr => (0x0fbd, 2), - UnaryRmROpcode::Bsf => (0x0fbc, 2), - UnaryRmROpcode::Lzcnt => (0x0fbd, 2), - UnaryRmROpcode::Tzcnt => (0x0fbc, 2), + Bsr => (0x0fbd, 2), + Bsf => (0x0fbc, 2), + Lzcnt => (0x0fbd, 2), + Tzcnt => (0x0fbc, 2), + Popcnt => (0x0fb8, 2), }; match src { diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index 
ccf887bb87..1319ac2f94 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -2410,15 +2410,75 @@ fn lower_insn_to_regs>( } Opcode::Popcnt => { - // TODO when the x86 flags have use_popcnt, we can use the popcnt instruction. - let (ext_spec, ty) = match ctx.input_ty(insn, 0) { types::I8 | types::I16 => (Some(ExtSpec::ZeroExtendTo32), types::I32), - a if a == types::I32 || a == types::I64 => (None, a), - types::I128 => (None, types::I128), + a if a == types::I32 || a == types::I64 || a == types::I128 => (None, a), _ => unreachable!(), }; + if isa_flags.use_popcnt() { + match ty { + types::I32 | types::I64 => { + // I8/I16 inputs were widened to I32 above and carry an `ext_spec`; apply + // that zero-extension here, as the fallback path does, so that stray upper + // bits are never counted. + let src = match ext_spec { + Some(spec) => RegMem::reg(extend_input_to_reg(ctx, inputs[0], spec)), + None => input_to_reg_mem(ctx, inputs[0]), + }; + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + ctx.emit(Inst::unary_rm_r( + ty.bytes() as u8, + UnaryRmROpcode::Popcnt, + src, + dst, + )); + return Ok(()); + } + + types::I128 => { + // The number of ones in a 128-bit value is the plain sum of the number of + // ones in its low and high parts. No risk of overflow here. + let dsts = get_output_reg(ctx, outputs[0]); + let dst = dsts.regs()[0]; + let tmp = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let srcs = put_input_in_regs(ctx, inputs[0]); + let src_lo = srcs.regs()[0]; + let src_hi = srcs.regs()[1]; + + ctx.emit(Inst::unary_rm_r( + 8, + UnaryRmROpcode::Popcnt, + RegMem::reg(src_lo), + dst, + )); + ctx.emit(Inst::unary_rm_r( + 8, + UnaryRmROpcode::Popcnt, + RegMem::reg(src_hi), + tmp, + )); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Add, + RegMemImm::reg(tmp.to_reg()), + dst, + )); + + // Zero the result's high component.
+ ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Xor, + RegMemImm::reg(dsts.regs()[1].to_reg()), + dsts.regs()[1], + )); + + return Ok(()); + } + _ => {} + } + } + let (srcs, ty): (SmallVec<[RegMem; 2]>, Type) = if let Some(ext_spec) = ext_spec { ( smallvec![RegMem::reg(extend_input_to_reg(ctx, inputs[0], ext_spec))], diff --git a/cranelift/filetests/filetests/isa/x64/popcnt-use-popcnt.clif b/cranelift/filetests/filetests/isa/x64/popcnt-use-popcnt.clif new file mode 100644 index 0000000000..4e49cd6d4f --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/popcnt-use-popcnt.clif @@ -0,0 +1,31 @@ +test compile +target x86_64 has_popcnt has_sse42 +feature "experimental_x64" + +function %popcnt(i64) -> i64 { +block0(v0: i64): + v1 = popcnt v0 + return v1 +} + +; check: pushq %rbp +; check: movq %rsp, %rbp +; check: popcntq %rdi, %rsi +; check: movq %rsi, %rax +; check: movq %rbp, %rsp +; check: popq %rbp +; check: ret + +function %popcnt(i32) -> i32 { +block0(v0: i32): + v1 = popcnt v0 + return v1 +} + +; check: pushq %rbp +; check: movq %rsp, %rbp +; check: popcntl %edi, %esi +; check: movq %rsi, %rax +; check: movq %rbp, %rsp +; check: popq %rbp +; check: ret