Adds support for legalizing CLZ, CTZ and POPCOUNT on baseline x86_64 targets.

Changes:

* Adds a new generic instruction, SELECTIF, that does value selection (a la
  conditional move) similarly to the existing SELECT, except that it is
  controlled by a condition-code input and a flags-register input.

* Adds a new Intel x86_64 variant, 'baseline', that supports SSE2 and
  nothing else.

* Adds new Intel x86_64 instructions BSR and BSF.

* Implements generic CLZ, CTZ and POPCOUNT on x86_64 'baseline' targets
  using the new BSR, BSF and SELECTIF instructions.

* Implements SELECTIF on x86_64 targets using conditional-moves.

* Adds a new test, filetests/isa/intel/baseline_clz_ctz_popcount.cton
  (for legalization).

* Adds a new test, filetests/isa/intel/baseline_clz_ctz_popcount_encoding.cton
  (for encoding).

* Allows lib/cretonne/meta/gen_legalizer.py to generate Rust code containing
  non-snake-case identifiers without rustc complaining.

Fixes #238.
This commit is contained in:
Julian Seward
2018-01-17 06:23:30 +01:00
committed by Jakob Stoklund Olesen
parent e3714ddd10
commit 6f8a54b6a5
16 changed files with 440 additions and 3 deletions

View File

@@ -700,6 +700,7 @@ Operations
==========
.. autoinst:: select
.. autoinst:: selectif
Constant materialization
------------------------
@@ -979,6 +980,10 @@ Instructions that can only be used by the Intel target ISA.
.. autoinst:: isa.intel.instructions.cvtt2si
.. autoinst:: isa.intel.instructions.fmin
.. autoinst:: isa.intel.instructions.fmax
.. autoinst:: isa.intel.instructions.bsf
.. autoinst:: isa.intel.instructions.bsr
.. autoinst:: isa.intel.instructions.push
.. autoinst:: isa.intel.instructions.pop
Instruction groups
==================

View File

@@ -0,0 +1,104 @@
test compile
set is_64bit
isa intel baseline
; clz/ctz on 64 bit operands
function %i64_clz(i64) -> i64 {
ebb0(v10: i64):
v11 = clz v10
; check: x86_bsr
; check: selectif.i64
return v11
}
function %i64_ctz(i64) -> i64 {
ebb1(v20: i64):
v21 = ctz v20
; check: x86_bsf
; check: selectif.i64
return v21
}
; clz/ctz on 32 bit operands
function %i32_clz(i32) -> i32 {
ebb0(v10: i32):
v11 = clz v10
; check: x86_bsr
; check: selectif.i32
return v11
}
function %i32_ctz(i32) -> i32 {
ebb1(v20: i32):
v21 = ctz v20
; check: x86_bsf
; check: selectif.i32
return v21
}
; popcount on 64 bit operands
function %i64_popcount(i64) -> i64 {
ebb0(v30: i64):
v31 = popcnt v30;
; check: iconst.i32
; check: ushr
; check: iconst.i64
; check: band
; check: isub
; check: iconst.i32
; check: ushr
; check: band
; check: isub
; check: iconst.i32
; check: ushr
; check: band
; check: isub
; check: iconst.i32
; check: ushr
; check: iadd
; check: iconst.i64
; check: band
; check: iconst.i64
; check: imul
; check: iconst.i32
; check: ushr
return v31;
}
; popcount on 32 bit operands
function %i32_popcount(i32) -> i32 {
ebb0(v40: i32):
v41 = popcnt v40;
; check: iconst.i32
; check: ushr
; check: iconst.i32
; check: band
; check: isub
; check: iconst.i32
; check: ushr
; check: band
; check: isub
; check: iconst.i32
; check: ushr
; check: band
; check: isub
; check: iconst.i32
; check: ushr
; check: iadd
; check: iconst.i32
; check: band
; check: iconst.i32
; check: imul
; check: iconst.i32
; check: ushr
return v41;
}

View File

@@ -0,0 +1,89 @@
test binemit
set is_64bit
set is_compressed
isa intel baseline
; The binary encodings can be verified with the command:
;
; sed -ne 's/^ *; asm: *//p' filetests/isa/intel/baseline_clz_ctz_popcount_encoding.cton | llvm-mc -show-encoding -triple=x86_64
;
function %Foo() {
ebb0:
; 64-bit wide bsf
[-,%r11] v10 = iconst.i64 0x1234
; asm: bsfq %r11, %rcx
[-,%rcx,%eflags] v11, v12 = x86_bsf v10 ; bin: 49 0f bc cb
[-,%rdx] v14 = iconst.i64 0x5678
; asm: bsfq %rdx, %r12
[-,%r12,%eflags] v15, v16 = x86_bsf v14 ; bin: 4c 0f bc e2
; asm: bsfq %rdx, %rdi
[-,%rdi,%eflags] v17, v18 = x86_bsf v14 ; bin: 48 0f bc fa
; 32-bit wide bsf
[-,%r11] v20 = iconst.i32 0x1234
; asm: bsfl %r11d, %ecx
[-,%rcx,%eflags] v21, v22 = x86_bsf v20 ; bin: 41 0f bc cb
[-,%rdx] v24 = iconst.i32 0x5678
; asm: bsfl %edx, %r12d
[-,%r12,%eflags] v25, v26 = x86_bsf v24 ; bin: 44 0f bc e2
; asm: bsfl %edx, %esi
[-,%rsi,%eflags] v27, v28 = x86_bsf v24 ; bin: 0f bc f2
; 64-bit wide bsr
[-,%r11] v30 = iconst.i64 0x1234
; asm: bsrq %r11, %rcx
[-,%rcx,%eflags] v31, v32 = x86_bsr v30 ; bin: 49 0f bd cb
[-,%rdx] v34 = iconst.i64 0x5678
; asm: bsrq %rdx, %r12
[-,%r12,%eflags] v35, v36 = x86_bsr v34 ; bin: 4c 0f bd e2
; asm: bsrq %rdx, %rdi
[-,%rdi,%eflags] v37, v38 = x86_bsr v34 ; bin: 48 0f bd fa
; 32-bit wide bsr
[-,%r11] v40 = iconst.i32 0x1234
; asm: bsrl %r11d, %ecx
[-,%rcx,%eflags] v41, v42 = x86_bsr v40 ; bin: 41 0f bd cb
[-,%rdx] v44 = iconst.i32 0x5678
; asm: bsrl %edx, %r12d
[-,%r12,%eflags] v45, v46 = x86_bsr v44 ; bin: 44 0f bd e2
; asm: bsrl %edx, %esi
[-,%rsi,%eflags] v47, v48 = x86_bsr v44 ; bin: 0f bd f2
; 64-bit wide cmov
; asm: cmoveq %r11, %rdx
[-,%rdx] v51 = selectif.i64 eq v48, v30, v34 ; bin: 49 0f 44 d3
; asm: cmoveq %rdi, %rdx
[-,%rdx] v52 = selectif.i64 eq v48, v37, v34 ; bin: 48 0f 44 d7
; 32-bit wide cmov
; asm: cmovnel %r11d, %edx
[-,%rdx] v60 = selectif.i32 ne v48, v40, v44 ; bin: 41 0f 45 d3
; asm: cmovlel %esi, %edx
[-,%rdx] v61 = selectif.i32 sle v48, v27, v44 ; bin: 0f 4e d6
trap user0
}

View File

@@ -42,7 +42,7 @@ ebb0:
; nextln: $v3 = bxor v0, v2
; nextln: }
; Polymorphic istruction controlled by second operand.
; Polymorphic instruction controlled by second operand.
function %select() {
ebb0(v90: i32, v91: i32, v92: b1):
v0 = select v92, v90, v91
@@ -52,6 +52,16 @@ ebb0(v90: i32, v91: i32, v92: b1):
; nextln: $v0 = select $v92, $v90, $v91
; nextln: }
; Polymorphic instruction controlled by third operand.
function %selectif() native {
ebb0(v95: i32, v96: i32, v97: b1):
v98 = selectif.i32 eq v97, v95, v96
}
; sameln: function %selectif() native {
; nextln: ebb0(v0: i32, v1: i32, v2: b1):
; nextln: v3 = selectif.i32 eq v2, v0, v1
; nextln: }
; Lane indexes.
function %lanes() {
ebb0:

View File

@@ -43,6 +43,8 @@ IntCond = InstructionFormat(intcc, VALUE)
FloatCompare = InstructionFormat(floatcc, VALUE, VALUE)
FloatCond = InstructionFormat(floatcc, VALUE)
IntSelect = InstructionFormat(intcc, VALUE, VALUE, VALUE)
Jump = InstructionFormat(ebb, VARIABLE_ARGS)
Branch = InstructionFormat(VALUE, ebb, VARIABLE_ARGS)
BranchInt = InstructionFormat(intcc, VALUE, ebb, VARIABLE_ARGS)

View File

@@ -485,6 +485,15 @@ select = Instruction(
""",
ins=(c, x, y), outs=a)
# Operands specific to `selectif`: the integer condition code to test and the
# flags value it is tested against.
cc = Operand('cc', intcc, doc='Controlling condition code')
flags = Operand('flags', iflags, doc='The machine\'s flag register')
# `selectif` takes (cc, flags, x, y) and produces `a`; compare with `select`,
# which takes a single controlling value in place of the (cc, flags) pair.
# NOTE(review): presumably yields `x` when `cc` holds for `flags` and `y`
# otherwise -- the instruction text below does not spell this out; confirm.
selectif = Instruction(
'selectif', r"""
Conditional select, dependent on integer condition codes.
""",
ins=(cc, flags, x, y), outs=a)
x = Operand('x', Any)
copy = Instruction(

View File

@@ -355,7 +355,7 @@ def gen_xform(xform, fmt, type_sets):
def gen_xform_group(xgrp, fmt, type_sets):
# type: (XFormGroup, Formatter, UniqueTable) -> None
fmt.doc_comment("Legalize the instruction pointed to by `pos`.")
fmt.line('#[allow(unused_variables,unused_assignments)]')
fmt.line('#[allow(unused_variables,unused_assignments,non_snake_case)]')
with fmt.indented('pub fn {}('.format(xgrp.name)):
fmt.line('inst: ir::Inst,')
fmt.line('func: &mut ir::Function,')

View File

@@ -367,6 +367,17 @@ enc_i32_i64(base.ifcmp, r.rcmp, 0x39)
enc_both(base.trueif, r.seti_abcd, 0x0f, 0x90)
enc_both(base.trueff, r.setf_abcd, 0x0f, 0x90)
#
# Conditional move (a.k.a. integer select)
#
# 0x0F 0x40 is the base opcode; the `cmov` recipe ORs the instruction's
# condition code into it when emitting.
enc_i32_i64(base.selectif, r.cmov, 0x0F, 0x40)
#
# Bit scan forwards and reverse
#
# Both instructions share one recipe and differ only in the second opcode
# byte (0xBC for bsf, 0xBD for bsr).
enc_i32_i64(x86.bsf, r.bsf_and_bsr, 0x0F, 0xBC)
enc_i32_i64(x86.bsr, r.bsf_and_bsr, 0x0F, 0xBD)
#
# Convert bool to int.
#

View File

@@ -5,6 +5,7 @@ This module defines additional instructions that are useful only to the Intel
target ISA.
"""
from base.types import iflags
from cdsl.operands import Operand
from cdsl.typevar import TypeVar
from cdsl.instructions import Instruction, InstructionGroup
@@ -125,4 +126,26 @@ pop = Instruction(
""",
outs=x, can_load=True, other_side_effects=True)
# Result operands for the bit-scan instructions: `y` is the integer result
# and `rflags` is the flags value produced together with it.
y = Operand('y', iWord)
rflags = Operand('rflags', iflags)
# Both instructions take `x` and return the pair (y, rflags), so a consumer
# can inspect the flags to detect the zero-argument case described below.
bsr = Instruction(
'x86_bsr', r"""
Bit Scan Reverse -- returns the bit-index of the most significant 1
in the word. Result is undefined if the argument is zero. However, it
sets the Z flag depending on the argument, so it is at least easy to
detect and handle that case.
This is polymorphic in i32 and i64. It is implemented for both i64 and
i32 in 64-bit mode, and only for i32 in 32-bit mode.
""",
ins=x, outs=(y, rflags))
# Same signature as `bsr`; scans from the least significant end instead.
bsf = Instruction(
'x86_bsf', r"""
Bit Scan Forwards -- returns the bit-index of the least significant 1
in the word. Is otherwise identical to 'bsr', just above.
""",
ins=x, outs=(y, rflags))
GROUP.close()

View File

@@ -4,7 +4,7 @@ Custom legalization patterns for Intel.
from __future__ import absolute_import
from cdsl.ast import Var
from cdsl.xform import Rtl, XFormGroup
from base.immediates import imm64, floatcc
from base.immediates import imm64, intcc, floatcc
from base.types import i32, i64
from base import legalize as shared
from base import instructions as insts
@@ -100,3 +100,131 @@ intel_expand.custom_legalize(insts.fcvt_from_uint, 'expand_fcvt_from_uint')
# Conversions from float to int can trap.
intel_expand.custom_legalize(insts.fcvt_to_sint, 'expand_fcvt_to_sint')
intel_expand.custom_legalize(insts.fcvt_to_uint, 'expand_fcvt_to_uint')
# Count leading and trailing zeroes, for baseline x86_64
c_minus_one = Var('c_minus_one')
c_thirty_one = Var('c_thirty_one')
c_thirty_two = Var('c_thirty_two')
c_sixty_three = Var('c_sixty_three')
c_sixty_four = Var('c_sixty_four')
index1 = Var('index1')
r2flags = Var('r2flags')
index2 = Var('index2')
# clz.i64(x) is expanded to 63 - index, where index is the bsr result, or -1
# when the flags produced by bsr compare `eq`; in that case the result is
# 63 - (-1) = 64.
# NOTE(review): assumes `eq` on bsr's flags means x == 0 (Z set), the case
# in which bsr's index is documented as undefined -- confirm.
intel_expand.legalize(
a << insts.clz.i64(x),
Rtl(
c_minus_one << insts.iconst(imm64(-1)),
c_sixty_three << insts.iconst(imm64(63)),
(index1, r2flags) << x86.bsr(x),
# Replace the bsr index with -1 when the flags compare `eq`.
index2 << insts.selectif(intcc.eq, r2flags, c_minus_one, index1),
a << insts.isub(c_sixty_three, index2),
))
# clz.i32(x) is expanded to 31 - index, where index is the bsr result, or -1
# when the flags produced by bsr compare `eq`; in that case the result is
# 31 - (-1) = 32.
# NOTE(review): assumes `eq` on bsr's flags means x == 0 (Z set), the case
# in which bsr's index is documented as undefined -- confirm.
intel_expand.legalize(
a << insts.clz.i32(x),
Rtl(
c_minus_one << insts.iconst(imm64(-1)),
c_thirty_one << insts.iconst(imm64(31)),
(index1, r2flags) << x86.bsr(x),
# Replace the bsr index with -1 when the flags compare `eq`.
index2 << insts.selectif(intcc.eq, r2flags, c_minus_one, index1),
a << insts.isub(c_thirty_one, index2),
))
# ctz.i64(x) is expanded to the bsf result, or to the constant 64 when the
# flags produced by bsf compare `eq`.
# NOTE(review): assumes `eq` on bsf's flags means x == 0 (Z set), the case
# in which the bit-scan index is documented as undefined -- confirm.
intel_expand.legalize(
a << insts.ctz.i64(x),
Rtl(
c_sixty_four << insts.iconst(imm64(64)),
(index1, r2flags) << x86.bsf(x),
a << insts.selectif(intcc.eq, r2flags, c_sixty_four, index1),
))
# ctz.i32(x) is expanded to the bsf result, or to the constant 32 when the
# flags produced by bsf compare `eq`.
# NOTE(review): assumes `eq` on bsf's flags means x == 0 (Z set), the case
# in which the bit-scan index is documented as undefined -- confirm.
intel_expand.legalize(
a << insts.ctz.i32(x),
Rtl(
c_thirty_two << insts.iconst(imm64(32)),
(index1, r2flags) << x86.bsf(x),
a << insts.selectif(intcc.eq, r2flags, c_thirty_two, index1),
))
# Population count for baseline x86_64
qv1 = Var('qv1')
qv3 = Var('qv3')
qv4 = Var('qv4')
qv5 = Var('qv5')
qv6 = Var('qv6')
qv7 = Var('qv7')
qv8 = Var('qv8')
qv9 = Var('qv9')
qv10 = Var('qv10')
qv11 = Var('qv11')
qv12 = Var('qv12')
qv13 = Var('qv13')
qv14 = Var('qv14')
qv15 = Var('qv15')
qv16 = Var('qv16')
qc77 = Var('qc77')
qc0F = Var('qc0F')
qc01 = Var('qc01')
# popcnt.i64 expanded into shifts, masks, subtracts and one multiply:
#   - qv4, qv7 and qv10 hold, per 4-bit nibble n of the input, n/2, n/4 and
#     n/8 (each shift by one is followed by a 0x7-per-nibble mask that drops
#     the bit shifted in from the neighbouring nibble);
#   - qv11 = x - qv4 - qv7 - qv10 therefore holds, in every nibble, the
#     number of set bits of that nibble (n - n/2 - n/4 - n/8);
#   - qv14 adds each pair of adjacent nibbles and keeps the low nibble of
#     every byte, giving one count per byte;
#   - multiplying by 0x0101010101010101 accumulates all byte counts into the
#     most significant byte, which the final shift by 56 moves down.
intel_expand.legalize(
qv16 << insts.popcnt.i64(qv1),
Rtl(
qv3 << insts.ushr_imm(qv1, imm64(1)),
qc77 << insts.iconst(imm64(0x7777777777777777)),
qv4 << insts.band(qv3, qc77),
qv5 << insts.isub(qv1, qv4),
qv6 << insts.ushr_imm(qv4, imm64(1)),
qv7 << insts.band(qv6, qc77),
qv8 << insts.isub(qv5, qv7),
qv9 << insts.ushr_imm(qv7, imm64(1)),
qv10 << insts.band(qv9, qc77),
qv11 << insts.isub(qv8, qv10),
qv12 << insts.ushr_imm(qv11, imm64(4)),
qv13 << insts.iadd(qv11, qv12),
qc0F << insts.iconst(imm64(0x0F0F0F0F0F0F0F0F)),
qv14 << insts.band(qv13, qc0F),
qc01 << insts.iconst(imm64(0x0101010101010101)),
qv15 << insts.imul(qv14, qc01),
qv16 << insts.ushr_imm(qv15, imm64(56))
))
lv1 = Var('lv1')
lv3 = Var('lv3')
lv4 = Var('lv4')
lv5 = Var('lv5')
lv6 = Var('lv6')
lv7 = Var('lv7')
lv8 = Var('lv8')
lv9 = Var('lv9')
lv10 = Var('lv10')
lv11 = Var('lv11')
lv12 = Var('lv12')
lv13 = Var('lv13')
lv14 = Var('lv14')
lv15 = Var('lv15')
lv16 = Var('lv16')
lc77 = Var('lc77')
lc0F = Var('lc0F')
lc01 = Var('lc01')
# popcnt.i32 expanded into shifts, masks, subtracts and one multiply:
#   - lv4, lv7 and lv10 hold, per 4-bit nibble n of the input, n/2, n/4 and
#     n/8 (each shift by one is followed by a 0x7-per-nibble mask that drops
#     the bit shifted in from the neighbouring nibble);
#   - lv11 = x - lv4 - lv7 - lv10 therefore holds, in every nibble, the
#     number of set bits of that nibble (n - n/2 - n/4 - n/8);
#   - lv14 adds each pair of adjacent nibbles and keeps the low nibble of
#     every byte, giving one count per byte;
#   - multiplying by 0x01010101 accumulates all byte counts into the most
#     significant byte, which the final shift by 24 moves down.
intel_expand.legalize(
lv16 << insts.popcnt.i32(lv1),
Rtl(
lv3 << insts.ushr_imm(lv1, imm64(1)),
lc77 << insts.iconst(imm64(0x77777777)),
lv4 << insts.band(lv3, lc77),
lv5 << insts.isub(lv1, lv4),
lv6 << insts.ushr_imm(lv4, imm64(1)),
lv7 << insts.band(lv6, lc77),
lv8 << insts.isub(lv5, lv7),
lv9 << insts.ushr_imm(lv7, imm64(1)),
lv10 << insts.band(lv9, lc77),
lv11 << insts.isub(lv8, lv10),
lv12 << insts.ushr_imm(lv11, imm64(4)),
lv13 << insts.iadd(lv11, lv12),
lc0F << insts.iconst(imm64(0x0F0F0F0F)),
lv14 << insts.band(lv13, lc0F),
lc01 << insts.iconst(imm64(0x01010101)),
lv15 << insts.imul(lv14, lc01),
lv16 << insts.ushr_imm(lv15, imm64(24))
))

View File

@@ -8,6 +8,7 @@ from cdsl.registers import RegClass
from base.formats import Unary, UnaryImm, Binary, BinaryImm, MultiAry, NullAry
from base.formats import Trap, Call, IndirectCall, Store, Load
from base.formats import IntCompare, FloatCompare, IntCond, FloatCond
from base.formats import IntSelect
from base.formats import Jump, Branch, BranchInt, BranchFloat
from base.formats import Ternary, FuncAddr, UnaryGlobalVar
from base.formats import RegMove, RegSpill, RegFill, CopySpecial
@@ -1021,6 +1022,32 @@ setf_abcd = TailRecipe(
modrm_r_bits(out_reg0, bits, sink);
''')
#
# Conditional move (a.k.a. integer select)
# (maybe-REX.W) 0F 4x modrm(r,r)
# 1 byte, modrm(r,r), is after the opcode
#
# The `x` in `4x` comes from the instruction's condition code: the emit code
# ORs `icc2opc(cond)` into the opcode bits.
# Inputs are (eflags, in_reg1, in_reg2); the flags are read, not clobbered.
# NOTE(review): `outs=2` presumably ties the result register to input 2, so
# in_reg2 is both the fall-through value and the destination -- confirm.
cmov = TailRecipe(
'cmov', IntSelect, size=1, ins=(FLAG.eflags, GPR, GPR), outs=2,
requires_prefix=False,
clobbers_flags=False,
emit='''
PUT_OP(bits | icc2opc(cond), rex2(in_reg1, in_reg2), sink);
modrm_rr(in_reg1, in_reg2, sink);
''')
#
# Bit scan forwards and reverse
#
# One recipe serves both instructions; the opcode is supplied by the encoding
# through `bits`. It takes one GPR and produces a GPR plus eflags, which is
# why it is marked as clobbering the flags.
bsf_and_bsr = TailRecipe(
'bsf_and_bsr', Unary, size=1, ins=GPR, outs=(GPR, FLAG.eflags),
requires_prefix=False,
clobbers_flags=True,
emit='''
PUT_OP(bits, rex2(in_reg0, out_reg0), sink);
modrm_rr(in_reg0, out_reg0, sink);
''')
#
# Compare and set flags.
#

View File

@@ -40,6 +40,7 @@ use_lzcnt = And(has_lzcnt)
# Presets corresponding to Intel CPUs.
baseline = Preset(has_sse2)
nehalem = Preset(
has_sse2, has_sse3, has_ssse3, has_sse41, has_sse42, has_popcnt)
haswell = Preset(nehalem, has_bmi1, has_lzcnt)

View File

@@ -157,6 +157,11 @@ pub enum InstructionData {
cond: FloatCC,
arg: Value,
},
IntSelect {
opcode: Opcode,
cond: IntCC,
args: [Value; 3],
},
Jump {
opcode: Opcode,
destination: Ebb,

View File

@@ -358,6 +358,7 @@ impl<'a> Verifier<'a> {
IntCond { .. } |
FloatCompare { .. } |
FloatCond { .. } |
IntSelect { .. } |
Load { .. } |
Store { .. } |
RegMove { .. } |

View File

@@ -303,6 +303,9 @@ pub fn write_operands(
IntCond { cond, arg, .. } => write!(w, " {} {}", cond, arg),
FloatCompare { cond, args, .. } => write!(w, " {} {}, {}", cond, args[0], args[1]),
FloatCond { cond, arg, .. } => write!(w, " {} {}", cond, arg),
IntSelect { cond, args, .. } => {
write!(w, " {} {}, {}, {}", cond, args[0], args[1], args[2])
}
Jump {
destination,
ref args,

View File

@@ -2119,6 +2119,25 @@ impl<'a> Parser<'a> {
let arg = self.match_value("expected SSA value")?;
InstructionData::FloatCond { opcode, cond, arg }
}
InstructionFormat::IntSelect => {
let cond = self.match_enum("expected intcc condition code")?;
let guard = self.match_value("expected SSA value first operand")?;
self.match_token(
Token::Comma,
"expected ',' between operands",
)?;
let v_true = self.match_value("expected SSA value second operand")?;
self.match_token(
Token::Comma,
"expected ',' between operands",
)?;
let v_false = self.match_value("expected SSA value third operand")?;
InstructionData::IntSelect {
opcode,
cond,
args: [guard, v_true, v_false],
}
}
InstructionFormat::Call => {
let func_ref = self.match_fn("expected function reference").and_then(
|num| {