From d3acd9a2831436e7296d002fd1cd283333d144cb Mon Sep 17 00:00:00 2001 From: Benjamin Bouvier Date: Wed, 27 Jan 2021 18:44:14 +0100 Subject: [PATCH] cranelift x64: use the LZCNT instruction for Clz when it's available; --- cranelift/codegen/src/isa/x64/inst/args.rs | 3 ++ cranelift/codegen/src/isa/x64/inst/emit.rs | 38 ++++++++++++++----- cranelift/codegen/src/isa/x64/lower.rs | 19 +++++++++- .../filetests/isa/x64/clz-lzcnt.clif | 31 +++++++++++++++ 4 files changed, 79 insertions(+), 12 deletions(-) create mode 100644 cranelift/filetests/filetests/isa/x64/clz-lzcnt.clif diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs index 39ca25d060..d81e1bbf8a 100644 --- a/cranelift/codegen/src/isa/x64/inst/args.rs +++ b/cranelift/codegen/src/isa/x64/inst/args.rs @@ -402,6 +402,8 @@ pub enum UnaryRmROpcode { Bsr, /// Bit-scan forward. Bsf, + /// Counts leading zeroes (Leading Zero CouNT). + Lzcnt, } impl fmt::Debug for UnaryRmROpcode { @@ -409,6 +411,7 @@ impl fmt::Debug for UnaryRmROpcode { match self { UnaryRmROpcode::Bsr => write!(fmt, "bsr"), UnaryRmROpcode::Bsf => write!(fmt, "bsf"), + UnaryRmROpcode::Lzcnt => write!(fmt, "lzcnt"), } } } diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index 0a029301a6..6ae0dc012e 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -129,18 +129,20 @@ impl RexFlags { /// We may need to include one or more legacy prefix bytes before the REX prefix. This enum /// covers only the small set of possibilities that we actually need. enum LegacyPrefixes { - /// No prefix bytes + /// No prefix bytes. None, - /// Operand Size Override -- here, denoting "16-bit operation" + /// Operand Size Override -- here, denoting "16-bit operation". _66, - /// The Lock prefix + /// The Lock prefix. _F0, - /// Operand size override and Lock + /// Operand size override and Lock. 
_66F0, - /// REPNE, but no specific meaning here -- is just an opcode extension + /// REPNE, but no specific meaning here -- is just an opcode extension. _F2, - /// REP/REPE, but no specific meaning here -- is just an opcode extension + /// REP/REPE, but no specific meaning here -- is just an opcode extension. _F3, + /// Operand size override and same effect as F3. + _66F3, } impl LegacyPrefixes { @@ -157,6 +159,10 @@ impl LegacyPrefixes { } LegacyPrefixes::_F2 => sink.put1(0xF2), LegacyPrefixes::_F3 => sink.put1(0xF3), + LegacyPrefixes::_66F3 => { + sink.put1(0x66); + sink.put1(0xF3); + } LegacyPrefixes::None => (), } } @@ -665,16 +671,28 @@ pub(crate) fn emit( } Inst::UnaryRmR { size, op, src, dst } => { - let (prefix, rex_flags) = match size { - 2 => (LegacyPrefixes::_66, RexFlags::clear_w()), - 4 => (LegacyPrefixes::None, RexFlags::clear_w()), - 8 => (LegacyPrefixes::None, RexFlags::set_w()), + let rex_flags = match size { + 2 | 4 => RexFlags::clear_w(), + 8 => RexFlags::set_w(), + _ => unreachable!(), + }; + + let prefix = match size { + 2 => match op { + UnaryRmROpcode::Bsr | UnaryRmROpcode::Bsf => LegacyPrefixes::_66, + UnaryRmROpcode::Lzcnt => LegacyPrefixes::_66F3, + }, + 4 | 8 => match op { + UnaryRmROpcode::Bsr | UnaryRmROpcode::Bsf => LegacyPrefixes::None, + UnaryRmROpcode::Lzcnt => LegacyPrefixes::_F3, + }, _ => unreachable!(), }; let (opcode, num_opcodes) = match op { UnaryRmROpcode::Bsr => (0x0fbd, 2), UnaryRmROpcode::Bsf => (0x0fbc, 2), + UnaryRmROpcode::Lzcnt => (0x0fbd, 2), }; match src { diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index 6c002dd6cf..837d4d1266 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -2276,7 +2276,22 @@ fn lower_insn_to_regs>( } Opcode::Clz => { - // TODO when the x86 flags have use_lzcnt, we can use LZCNT. 
+ let orig_ty = ty.unwrap(); + + if isa_flags.use_lzcnt() && (orig_ty == types::I32 || orig_ty == types::I64) { + // We can use a plain lzcnt instruction here. Note no special handling is required + // for zero inputs, because the machine instruction does what the CLIF expects for + // zero, i.e. it returns the operand size in bits. + let src = input_to_reg_mem(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + ctx.emit(Inst::unary_rm_r( + orig_ty.bytes() as u8, + UnaryRmROpcode::Lzcnt, + src, + dst, + )); + return Ok(()); + } // General formula using bit-scan reverse (BSR): // mov -1, %dst @@ -2285,7 +2300,6 @@ fn lower_insn_to_regs>( // mov $(size_bits - 1), %dst // sub %tmp, %dst - let orig_ty = ty.unwrap(); if orig_ty == types::I128 { // clz upper, tmp1 // clz lower, dst @@ -4427,6 +4441,7 @@ fn lower_insn_to_regs>( } } } + Opcode::Store | Opcode::Istore8 | Opcode::Istore16 diff --git a/cranelift/filetests/filetests/isa/x64/clz-lzcnt.clif b/cranelift/filetests/filetests/isa/x64/clz-lzcnt.clif new file mode 100644 index 0000000000..ac0df03384 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/clz-lzcnt.clif @@ -0,0 +1,31 @@ +test compile +target x86_64 has_lzcnt +feature "experimental_x64" + +function %clz(i64) -> i64 { +block0(v0: i64): + v1 = clz v0 + return v1 +} + +; check: pushq %rbp +; check: movq %rsp, %rbp +; check: lzcntq %rdi, %rsi +; check: movq %rsi, %rax +; check: movq %rbp, %rsp +; check: popq %rbp +; check: ret + +function %clz(i32) -> i32 { +block0(v0: i32): + v1 = clz v0 + return v1 +} + +; check: pushq %rbp +; check: movq %rsp, %rbp +; check: lzcntl %edi, %esi +; check: movq %rsi, %rax +; check: movq %rbp, %rsp +; check: popq %rbp +; check: ret