cranelift x64: use the LZCNT instruction for Clz when it's available

This commit is contained in:
Benjamin Bouvier
2021-01-27 18:44:14 +01:00
parent b1b078e2bc
commit d3acd9a283
4 changed files with 79 additions and 12 deletions

View File

@@ -402,6 +402,8 @@ pub enum UnaryRmROpcode {
Bsr, Bsr,
/// Bit-scan forward. /// Bit-scan forward.
Bsf, Bsf,
/// Counts leading zeroes (Leading Zero CouNT).
Lzcnt,
} }
impl fmt::Debug for UnaryRmROpcode { impl fmt::Debug for UnaryRmROpcode {
@@ -409,6 +411,7 @@ impl fmt::Debug for UnaryRmROpcode {
match self { match self {
UnaryRmROpcode::Bsr => write!(fmt, "bsr"), UnaryRmROpcode::Bsr => write!(fmt, "bsr"),
UnaryRmROpcode::Bsf => write!(fmt, "bsf"), UnaryRmROpcode::Bsf => write!(fmt, "bsf"),
UnaryRmROpcode::Lzcnt => write!(fmt, "lzcnt"),
} }
} }
} }

View File

@@ -129,18 +129,20 @@ impl RexFlags {
/// We may need to include one or more legacy prefix bytes before the REX prefix. This enum /// We may need to include one or more legacy prefix bytes before the REX prefix. This enum
/// covers only the small set of possibilities that we actually need. /// covers only the small set of possibilities that we actually need.
enum LegacyPrefixes { enum LegacyPrefixes {
/// No prefix bytes /// No prefix bytes.
None, None,
/// Operand Size Override -- here, denoting "16-bit operation" /// Operand Size Override -- here, denoting "16-bit operation".
_66, _66,
/// The Lock prefix /// The Lock prefix.
_F0, _F0,
/// Operand size override and Lock /// Operand size override and Lock.
_66F0, _66F0,
/// REPNE, but no specific meaning here -- is just an opcode extension /// REPNE, but no specific meaning here -- is just an opcode extension.
_F2, _F2,
/// REP/REPE, but no specific meaning here -- is just an opcode extension /// REP/REPE, but no specific meaning here -- is just an opcode extension.
_F3, _F3,
/// Operand size override combined with F3 (REP/REPE, used here only as an opcode extension).
_66F3,
} }
impl LegacyPrefixes { impl LegacyPrefixes {
@@ -157,6 +159,10 @@ impl LegacyPrefixes {
} }
LegacyPrefixes::_F2 => sink.put1(0xF2), LegacyPrefixes::_F2 => sink.put1(0xF2),
LegacyPrefixes::_F3 => sink.put1(0xF3), LegacyPrefixes::_F3 => sink.put1(0xF3),
LegacyPrefixes::_66F3 => {
sink.put1(0x66);
sink.put1(0xF3);
}
LegacyPrefixes::None => (), LegacyPrefixes::None => (),
} }
} }
@@ -665,16 +671,28 @@ pub(crate) fn emit(
} }
Inst::UnaryRmR { size, op, src, dst } => { Inst::UnaryRmR { size, op, src, dst } => {
let (prefix, rex_flags) = match size { let rex_flags = match size {
2 => (LegacyPrefixes::_66, RexFlags::clear_w()), 2 | 4 => RexFlags::clear_w(),
4 => (LegacyPrefixes::None, RexFlags::clear_w()), 8 => RexFlags::set_w(),
8 => (LegacyPrefixes::None, RexFlags::set_w()), _ => unreachable!(),
};
let prefix = match size {
2 => match op {
UnaryRmROpcode::Bsr | UnaryRmROpcode::Bsf => LegacyPrefixes::_66,
UnaryRmROpcode::Lzcnt => LegacyPrefixes::_66F3,
},
4 | 8 => match op {
UnaryRmROpcode::Bsr | UnaryRmROpcode::Bsf => LegacyPrefixes::None,
UnaryRmROpcode::Lzcnt => LegacyPrefixes::_F3,
},
_ => unreachable!(), _ => unreachable!(),
}; };
let (opcode, num_opcodes) = match op { let (opcode, num_opcodes) = match op {
UnaryRmROpcode::Bsr => (0x0fbd, 2), UnaryRmROpcode::Bsr => (0x0fbd, 2),
UnaryRmROpcode::Bsf => (0x0fbc, 2), UnaryRmROpcode::Bsf => (0x0fbc, 2),
UnaryRmROpcode::Lzcnt => (0x0fbd, 2),
}; };
match src { match src {

View File

@@ -2276,7 +2276,22 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
} }
Opcode::Clz => { Opcode::Clz => {
// TODO when the x86 flags have use_lzcnt, we can use LZCNT. let orig_ty = ty.unwrap();
if isa_flags.use_lzcnt() && (orig_ty == types::I32 || orig_ty == types::I64) {
// We can use a plain lzcnt instruction here. Note no special handling is required
// for zero inputs, because the machine instruction does what the CLIF expects for
// zero, i.e. it returns the operand size in bits (32 or 64).
let src = input_to_reg_mem(ctx, inputs[0]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
ctx.emit(Inst::unary_rm_r(
orig_ty.bytes() as u8,
UnaryRmROpcode::Lzcnt,
src,
dst,
));
return Ok(());
}
// General formula using bit-scan reverse (BSR): // General formula using bit-scan reverse (BSR):
// mov -1, %dst // mov -1, %dst
@@ -2285,7 +2300,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
// mov $(size_bits - 1), %dst // mov $(size_bits - 1), %dst
// sub %tmp, %dst // sub %tmp, %dst
let orig_ty = ty.unwrap();
if orig_ty == types::I128 { if orig_ty == types::I128 {
// clz upper, tmp1 // clz upper, tmp1
// clz lower, dst // clz lower, dst
@@ -4427,6 +4441,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
} }
} }
} }
Opcode::Store Opcode::Store
| Opcode::Istore8 | Opcode::Istore8
| Opcode::Istore16 | Opcode::Istore16

View File

@@ -0,0 +1,31 @@
test compile
target x86_64 has_lzcnt
feature "experimental_x64"
function %clz(i64) -> i64 {
block0(v0: i64):
v1 = clz v0
return v1
}
; check: pushq %rbp
; check: movq %rsp, %rbp
; check: lzcntq %rdi, %rsi
; check: movq %rsi, %rax
; check: movq %rbp, %rsp
; check: popq %rbp
; check: ret
function %clz(i32) -> i32 {
block0(v0: i32):
v1 = clz v0
return v1
}
; check: pushq %rbp
; check: movq %rsp, %rbp
; check: lzcntl %edi, %esi
; check: movq %rsi, %rax
; check: movq %rbp, %rsp
; check: popq %rbp
; check: ret