cranelift x64: use the LZCNT instruction for Clz when it's available
This commit is contained in:
@@ -402,6 +402,8 @@ pub enum UnaryRmROpcode {
|
||||
Bsr,
|
||||
/// Bit-scan forward.
|
||||
Bsf,
|
||||
/// Counts leading zeroes (Leading Zero CouNT).
|
||||
Lzcnt,
|
||||
}
|
||||
|
||||
impl fmt::Debug for UnaryRmROpcode {
|
||||
@@ -409,6 +411,7 @@ impl fmt::Debug for UnaryRmROpcode {
|
||||
match self {
|
||||
UnaryRmROpcode::Bsr => write!(fmt, "bsr"),
|
||||
UnaryRmROpcode::Bsf => write!(fmt, "bsf"),
|
||||
UnaryRmROpcode::Lzcnt => write!(fmt, "lzcnt"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -129,18 +129,20 @@ impl RexFlags {
|
||||
/// We may need to include one or more legacy prefix bytes before the REX prefix. This enum
|
||||
/// covers only the small set of possibilities that we actually need.
|
||||
enum LegacyPrefixes {
|
||||
/// No prefix bytes
|
||||
/// No prefix bytes.
|
||||
None,
|
||||
/// Operand Size Override -- here, denoting "16-bit operation"
|
||||
/// Operand Size Override -- here, denoting "16-bit operation".
|
||||
_66,
|
||||
/// The Lock prefix
|
||||
/// The Lock prefix.
|
||||
_F0,
|
||||
/// Operand size override and Lock
|
||||
/// Operand size override and Lock.
|
||||
_66F0,
|
||||
/// REPNE, but no specific meaning here -- is just an opcode extension
|
||||
/// REPNE, but no specific meaning here -- is just an opcode extension.
|
||||
_F2,
|
||||
/// REP/REPE, but no specific meaning here -- is just an opcode extension
|
||||
/// REP/REPE, but no specific meaning here -- is just an opcode extension.
|
||||
_F3,
|
||||
/// Operand size override and same effect as F3.
|
||||
_66F3,
|
||||
}
|
||||
|
||||
impl LegacyPrefixes {
|
||||
@@ -157,6 +159,10 @@ impl LegacyPrefixes {
|
||||
}
|
||||
LegacyPrefixes::_F2 => sink.put1(0xF2),
|
||||
LegacyPrefixes::_F3 => sink.put1(0xF3),
|
||||
LegacyPrefixes::_66F3 => {
|
||||
sink.put1(0x66);
|
||||
sink.put1(0xF3);
|
||||
}
|
||||
LegacyPrefixes::None => (),
|
||||
}
|
||||
}
|
||||
@@ -665,16 +671,28 @@ pub(crate) fn emit(
|
||||
}
|
||||
|
||||
Inst::UnaryRmR { size, op, src, dst } => {
|
||||
let (prefix, rex_flags) = match size {
|
||||
2 => (LegacyPrefixes::_66, RexFlags::clear_w()),
|
||||
4 => (LegacyPrefixes::None, RexFlags::clear_w()),
|
||||
8 => (LegacyPrefixes::None, RexFlags::set_w()),
|
||||
let rex_flags = match size {
|
||||
2 | 4 => RexFlags::clear_w(),
|
||||
8 => RexFlags::set_w(),
|
||||
_ => unreachable!(),
|
||||
};
|
||||
|
||||
let prefix = match size {
|
||||
2 => match op {
|
||||
UnaryRmROpcode::Bsr | UnaryRmROpcode::Bsf => LegacyPrefixes::_66,
|
||||
UnaryRmROpcode::Lzcnt => LegacyPrefixes::_66F3,
|
||||
},
|
||||
4 | 8 => match op {
|
||||
UnaryRmROpcode::Bsr | UnaryRmROpcode::Bsf => LegacyPrefixes::None,
|
||||
UnaryRmROpcode::Lzcnt => LegacyPrefixes::_F3,
|
||||
},
|
||||
_ => unreachable!(),
|
||||
};
|
||||
|
||||
let (opcode, num_opcodes) = match op {
|
||||
UnaryRmROpcode::Bsr => (0x0fbd, 2),
|
||||
UnaryRmROpcode::Bsf => (0x0fbc, 2),
|
||||
UnaryRmROpcode::Lzcnt => (0x0fbd, 2),
|
||||
};
|
||||
|
||||
match src {
|
||||
|
||||
@@ -2276,7 +2276,22 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
||||
}
|
||||
|
||||
Opcode::Clz => {
|
||||
// TODO when the x86 flags have use_lzcnt, we can use LZCNT.
|
||||
let orig_ty = ty.unwrap();
|
||||
|
||||
if isa_flags.use_lzcnt() && (orig_ty == types::I32 || orig_ty == types::I64) {
|
||||
// We can use a plain lzcnt instruction here. Note no special handling is required
|
||||
// for zero inputs, because the machine instruction does what the CLIF expects for
|
||||
// zero, i.e. it returns the operand width in bits (32 or 64).
|
||||
let src = input_to_reg_mem(ctx, inputs[0]);
|
||||
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
|
||||
ctx.emit(Inst::unary_rm_r(
|
||||
orig_ty.bytes() as u8,
|
||||
UnaryRmROpcode::Lzcnt,
|
||||
src,
|
||||
dst,
|
||||
));
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// General formula using bit-scan reverse (BSR):
|
||||
// mov -1, %dst
|
||||
@@ -2285,7 +2300,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
||||
// mov $(size_bits - 1), %dst
|
||||
// sub %tmp, %dst
|
||||
|
||||
let orig_ty = ty.unwrap();
|
||||
if orig_ty == types::I128 {
|
||||
// clz upper, tmp1
|
||||
// clz lower, dst
|
||||
@@ -4427,6 +4441,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Opcode::Store
|
||||
| Opcode::Istore8
|
||||
| Opcode::Istore16
|
||||
|
||||
31
cranelift/filetests/filetests/isa/x64/clz-lzcnt.clif
Normal file
31
cranelift/filetests/filetests/isa/x64/clz-lzcnt.clif
Normal file
@@ -0,0 +1,31 @@
|
||||
test compile
|
||||
target x86_64 has_lzcnt
|
||||
feature "experimental_x64"
|
||||
|
||||
function %clz(i64) -> i64 {
|
||||
block0(v0: i64):
|
||||
v1 = clz v0
|
||||
return v1
|
||||
}
|
||||
|
||||
; check: pushq %rbp
|
||||
; check: movq %rsp, %rbp
|
||||
; check: lzcntq %rdi, %rsi
|
||||
; check: movq %rsi, %rax
|
||||
; check: movq %rbp, %rsp
|
||||
; check: popq %rbp
|
||||
; check: ret
|
||||
|
||||
function %clz(i32) -> i32 {
|
||||
block0(v0: i32):
|
||||
v1 = clz v0
|
||||
return v1
|
||||
}
|
||||
|
||||
; check: pushq %rbp
|
||||
; check: movq %rsp, %rbp
|
||||
; check: lzcntl %edi, %esi
|
||||
; check: movq %rsi, %rax
|
||||
; check: movq %rbp, %rsp
|
||||
; check: popq %rbp
|
||||
; check: ret
|
||||
Reference in New Issue
Block a user