diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs index d81e1bbf8a..a6938ca64d 100644 --- a/cranelift/codegen/src/isa/x64/inst/args.rs +++ b/cranelift/codegen/src/isa/x64/inst/args.rs @@ -404,6 +404,8 @@ pub enum UnaryRmROpcode { Bsf, /// Counts leading zeroes (Leading Zero CouNT). Lzcnt, + /// Counts trailing zeroes (Trailing Zero CouNT). + Tzcnt, } impl fmt::Debug for UnaryRmROpcode { @@ -412,6 +414,7 @@ impl fmt::Debug for UnaryRmROpcode { UnaryRmROpcode::Bsr => write!(fmt, "bsr"), UnaryRmROpcode::Bsf => write!(fmt, "bsf"), UnaryRmROpcode::Lzcnt => write!(fmt, "lzcnt"), + UnaryRmROpcode::Tzcnt => write!(fmt, "tzcnt"), } } } diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index 6ae0dc012e..74559272a7 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -680,11 +680,11 @@ pub(crate) fn emit( let prefix = match size { 2 => match op { UnaryRmROpcode::Bsr | UnaryRmROpcode::Bsf => LegacyPrefixes::_66, - UnaryRmROpcode::Lzcnt => LegacyPrefixes::_66F3, + UnaryRmROpcode::Lzcnt | UnaryRmROpcode::Tzcnt => LegacyPrefixes::_66F3, }, 4 | 8 => match op { UnaryRmROpcode::Bsr | UnaryRmROpcode::Bsf => LegacyPrefixes::None, - UnaryRmROpcode::Lzcnt => LegacyPrefixes::_F3, + UnaryRmROpcode::Lzcnt | UnaryRmROpcode::Tzcnt => LegacyPrefixes::_F3, }, _ => unreachable!(), }; @@ -693,6 +693,7 @@ pub(crate) fn emit( UnaryRmROpcode::Bsr => (0x0fbd, 2), UnaryRmROpcode::Bsf => (0x0fbc, 2), UnaryRmROpcode::Lzcnt => (0x0fbd, 2), + UnaryRmROpcode::Tzcnt => (0x0fbc, 2), }; match src { diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index 837d4d1266..ccf887bb87 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -2346,13 +2346,27 @@ fn lower_insn_to_regs>( } Opcode::Ctz => { - // TODO when the x86 flags have use_bmi1, we can use TZCNT. 
+ let orig_ty = ctx.input_ty(insn, 0); + + if isa_flags.use_bmi1() && (orig_ty == types::I32 || orig_ty == types::I64) { + // We can use a plain tzcnt instruction here. Note no special handling is required + // for zero inputs, because the machine instruction does what the CLIF expects for + // zero, i.e. it returns the operand width in bits. + let src = input_to_reg_mem(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + ctx.emit(Inst::unary_rm_r( + orig_ty.bytes() as u8, + UnaryRmROpcode::Tzcnt, + src, + dst, + )); + return Ok(()); + } // General formula using bit-scan forward (BSF): // bsf %src, %dst // mov $(size_bits), %tmp // cmovz %tmp, %dst - let orig_ty = ctx.input_ty(insn, 0); if orig_ty == types::I128 { // ctz src_lo, dst // ctz src_hi, tmp1 diff --git a/cranelift/filetests/filetests/isa/x64/ctz-bmi1.clif b/cranelift/filetests/filetests/isa/x64/ctz-bmi1.clif new file mode 100644 index 0000000000..b50b10107a --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/ctz-bmi1.clif @@ -0,0 +1,31 @@ +test compile +target x86_64 has_bmi1 +feature "experimental_x64" + +function %ctz(i64) -> i64 { +block0(v0: i64): + v1 = ctz v0 + return v1 +} + +; check: pushq %rbp +; check: movq %rsp, %rbp +; check: tzcntq %rdi, %rsi +; check: movq %rsi, %rax +; check: movq %rbp, %rsp +; check: popq %rbp +; check: ret + +function %ctz(i32) -> i32 { +block0(v0: i32): + v1 = ctz v0 + return v1 +} + +; check: pushq %rbp +; check: movq %rsp, %rbp +; check: tzcntl %edi, %esi +; check: movq %rsi, %rax +; check: movq %rbp, %rsp +; check: popq %rbp +; check: ret