aarch64: Migrate bitrev/clz/cls/ctz to ISLE (#3658)

This commit migrates these existing instructions to ISLE from the manual
lowerings implemented today. This was mostly straightforward but while I
was at it I fixed what appeared to be broken translations for I{8,16}
for `clz`, `cls`, and `ctz`. Previously the lowerings would produce
results as-if the input was 32-bits, but now I believe they all
correctly account for the bit-width.
This commit is contained in:
Alex Crichton
2022-01-06 15:18:32 -06:00
committed by GitHub
parent 7fd78da23f
commit 72e2b7fe80
9 changed files with 1040 additions and 608 deletions

View File

@@ -1464,6 +1464,13 @@
(_ Unit (emit (MInst.AluRRRR op dst src1 src2 src3))))
(writable_reg_to_reg dst)))
;; Helper for emitting `MInst.BitRR` instructions.
(decl bit_rr (BitOp Reg) Reg)
(rule (bit_rr op src)
(let ((dst WritableReg (temp_writable_reg $I64))
(_ Unit (emit (MInst.BitRR op dst src))))
(writable_reg_to_reg dst)))
;; Helper for emitting `adds` instructions.
(decl add64_with_flags (Reg Reg) ProducesFlags)
(rule (add64_with_flags src1 src2)
@@ -1485,6 +1492,11 @@
(ProducesFlags.ProducesFlags (MInst.AluRRR (ALUOp.SubS64) dst src1 src2)
(writable_reg_to_reg dst))))
(decl cmp64_imm (Reg Imm12) ProducesFlags)
(rule (cmp64_imm src1 src2)
(ProducesFlags.ProducesFlags (MInst.AluRRImm12 (ALUOp.SubS64) (writable_zero_reg) src1 src2)
(zero_reg)))
;; Helper for emitting `sbc` instructions.
(decl sbc64 (Reg Reg) ConsumesFlags)
(rule (sbc64 src1 src2)
@@ -1837,6 +1849,9 @@
(decl orr32 (Reg Reg) Reg)
(rule (orr32 x y) (alu_rrr (ALUOp.Orr32) x y))
(decl orr32_imm (Reg ImmLogic) Reg)
(rule (orr32_imm x y) (alu_rr_imm_logic (ALUOp.Orr32) x y))
(decl orr64 (Reg Reg) Reg)
(rule (orr64 x y) (alu_rrr (ALUOp.Orr64) x y))
@@ -1884,6 +1899,38 @@
(decl rotr64_imm (Reg ImmShift) Reg)
(rule (rotr64_imm x y) (alu_rr_imm_shift (ALUOp.RotR64) x y))
;; Helpers for generating `rbit` instructions.
(decl rbit32 (Reg) Reg)
(rule (rbit32 x) (bit_rr (BitOp.RBit32) x))
(decl rbit64 (Reg) Reg)
(rule (rbit64 x) (bit_rr (BitOp.RBit64) x))
;; Helpers for generating `clz` instructions.
(decl clz32 (Reg) Reg)
(rule (clz32 x) (bit_rr (BitOp.Clz32) x))
(decl clz64 (Reg) Reg)
(rule (clz64 x) (bit_rr (BitOp.Clz64) x))
;; Helpers for generating `cls` instructions.
(decl cls32 (Reg) Reg)
(rule (cls32 x) (bit_rr (BitOp.Cls32) x))
(decl cls64 (Reg) Reg)
(rule (cls64 x) (bit_rr (BitOp.Cls64) x))
;; Helpers for generating `eon` instructions.
(decl eon32 (Reg Reg) Reg)
(rule (eon32 x y) (alu_rrr (ALUOp.EorNot32) x y))
(decl eon64 (Reg Reg) Reg)
(rule (eon64 x y) (alu_rrr (ALUOp.EorNot64) x y))
;; Immediate value helpers ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(decl imm (Type u64) Reg)

View File

@@ -1004,3 +1004,128 @@
(lo Reg (orr64 (value_regs_get rshift 0) (value_regs_get lshift 0)))
)
(value_regs lo hi)))
;;;; Rules for `bitrev` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Reversing an 8-bit value with a 32-bit bitrev instruction will place
;; the reversed result in the highest 8 bits, so we need to shift them down into
;; place.
(rule (lower (has_type $I8 (bitrev x)))
(value_reg (lsr32_imm (rbit32 (put_in_reg x)) (imm_shift_from_u8 24))))
;; Reversing a 16-bit value with a 32-bit bitrev instruction will place
;; the reversed result in the highest 16 bits, so we need to shift them down into
;; place.
(rule (lower (has_type $I16 (bitrev x)))
(value_reg (lsr32_imm (rbit32 (put_in_reg x)) (imm_shift_from_u8 16))))
(rule (lower (has_type $I32 (bitrev x)))
(value_reg (rbit32 (put_in_reg x))))
(rule (lower (has_type $I64 (bitrev x)))
(value_reg (rbit64 (put_in_reg x))))
(rule (lower (has_type $I128 (bitrev x)))
(let (
(val ValueRegs (put_in_regs x))
(lo_rev Reg (rbit64 (value_regs_get val 0)))
(hi_rev Reg (rbit64 (value_regs_get val 1)))
)
(value_regs hi_rev lo_rev)))
;;;; Rules for `clz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type $I8 (clz x)))
(value_reg (sub32_imm (clz32 (put_in_reg_zext32 x)) (u8_into_imm12 24))))
(rule (lower (has_type $I16 (clz x)))
(value_reg (sub32_imm (clz32 (put_in_reg_zext32 x)) (u8_into_imm12 16))))
(rule (lower (has_type $I32 (clz x)))
(value_reg (clz32 (put_in_reg x))))
(rule (lower (has_type $I64 (clz x)))
(value_reg (clz64 (put_in_reg x))))
(rule (lower (has_type $I128 (clz x)))
(lower_clz128 (put_in_regs x)))
;; clz hi_clz, hi
;; clz lo_clz, lo
;; lsr tmp, hi_clz, #6
;; madd dst_lo, lo_clz, tmp, hi_clz
;; mov dst_hi, 0
(decl lower_clz128 (ValueRegs) ValueRegs)
(rule (lower_clz128 val)
(let (
(hi_clz Reg (clz64 (value_regs_get val 1)))
(lo_clz Reg (clz64 (value_regs_get val 0)))
(tmp Reg (lsr64_imm hi_clz (imm_shift_from_u8 6)))
)
(value_regs (madd64 lo_clz tmp hi_clz) (imm $I64 0))))
;;;; Rules for `ctz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Note that all `ctz` instructions are implemented by reversing the bits and
;; then using a `clz` instruction since the tail zeros are the same as the
;; leading zeros of the reversed value.
(rule (lower (has_type $I8 (ctz x)))
(value_reg (clz32 (orr32_imm (rbit32 (put_in_reg x)) (u64_into_imm_logic $I32 0x800000)))))
(rule (lower (has_type $I16 (ctz x)))
(value_reg (clz32 (orr32_imm (rbit32 (put_in_reg x)) (u64_into_imm_logic $I32 0x8000)))))
(rule (lower (has_type $I32 (ctz x)))
(value_reg (clz32 (rbit32 (put_in_reg x)))))
(rule (lower (has_type $I64 (ctz x)))
(value_reg (clz64 (rbit64 (put_in_reg x)))))
(rule (lower (has_type $I128 (ctz x)))
(let (
(val ValueRegs (put_in_regs x))
(lo Reg (rbit64 (value_regs_get val 0)))
(hi Reg (rbit64 (value_regs_get val 1)))
)
(lower_clz128 (value_regs hi lo))))
;;;; Rules for `cls` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type $I8 (cls x)))
(value_reg (sub32_imm (cls32 (put_in_reg_zext32 x)) (u8_into_imm12 24))))
(rule (lower (has_type $I16 (cls x)))
(value_reg (sub32_imm (cls32 (put_in_reg_zext32 x)) (u8_into_imm12 16))))
(rule (lower (has_type $I32 (cls x)))
(value_reg (cls32 (put_in_reg x))))
(rule (lower (has_type $I64 (cls x)))
(value_reg (cls64 (put_in_reg x))))
;; cls lo_cls, lo
;; cls hi_cls, hi
;; eon sign_eq_eon, hi, lo
;; lsr sign_eq, sign_eq_eon, #63
;; madd lo_sign_bits, lo_cls, sign_eq, sign_eq
;; cmp hi_cls, #63
;; csel maybe_lo, lo_sign_bits, xzr, eq
;; add out_lo, maybe_lo, hi_cls
;; mov out_hi, 0
(rule (lower (has_type $I128 (cls x)))
(let (
(val ValueRegs (put_in_regs x))
(lo Reg (value_regs_get val 0))
(hi Reg (value_regs_get val 1))
(lo_cls Reg (cls64 lo))
(hi_cls Reg (cls64 hi))
(sign_eq_eon Reg (eon64 hi lo))
(sign_eq Reg (lsr64_imm sign_eq_eon (imm_shift_from_u8 63)))
(lo_sign_bits Reg (madd64 lo_cls sign_eq sign_eq))
(maybe_lo Reg (with_flags_1
(cmp64_imm hi_cls (u8_into_imm12 63))
(csel (Cond.Eq) lo_sign_bits (zero_reg))
))
)
(value_regs (add64 maybe_lo hi_cls) (imm $I64 0))))

View File

@@ -1533,50 +1533,6 @@ pub(crate) fn lower_load<
f(ctx, rd, elem_ty, mem)
}
pub(crate) fn emit_clz_i128<C: LowerCtx<I = Inst>>(
ctx: &mut C,
src: ValueRegs<Reg>,
dst: ValueRegs<Writable<Reg>>,
) {
let src_lo = src.regs()[0];
let src_hi = src.regs()[1];
let dst_lo = dst.regs()[0];
let dst_hi = dst.regs()[1];
// clz dst_hi, src_hi
// clz dst_lo, src_lo
// lsr tmp, dst_hi, #6
// madd dst_lo, dst_lo, tmp, dst_hi
// mov dst_hi, 0
let tmp = ctx.alloc_tmp(I64).only_reg().unwrap();
ctx.emit(Inst::BitRR {
rd: dst_hi,
rn: src_hi,
op: BitOp::Clz64,
});
ctx.emit(Inst::BitRR {
rd: dst_lo,
rn: src_lo,
op: BitOp::Clz64,
});
ctx.emit(Inst::AluRRImmShift {
alu_op: ALUOp::Lsr64,
rd: tmp,
rn: dst_hi.to_reg(),
immshift: ImmShift::maybe_from_u64(6).unwrap(),
});
ctx.emit(Inst::AluRRRR {
alu_op: ALUOp3::MAdd64,
rd: dst_lo,
rn: dst_lo.to_reg(),
rm: tmp.to_reg(),
ra: dst_hi.to_reg(),
});
lower_constant_u64(ctx, dst_hi, 0);
}
//=============================================================================
// Lowering-backend trait implementation.

View File

@@ -1,4 +1,4 @@
src/clif.isle f176ef3bba99365
src/prelude.isle babc931e5dc5b4cf
src/isa/aarch64/inst.isle 36d057f98a944e4
src/isa/aarch64/lower.isle 43467df9d06b00ac
src/isa/aarch64/inst.isle 3ae25d431916bb81
src/isa/aarch64/lower.isle 5715ecb7c7a41164

File diff suppressed because it is too large Load Diff

View File

@@ -92,164 +92,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
Opcode::Rotr | Opcode::Rotl => implemented_in_isle(ctx),
Opcode::Bitrev | Opcode::Clz | Opcode::Cls | Opcode::Ctz => {
let ty = ty.unwrap();
let op_ty = match ty {
I8 | I16 | I32 => I32,
I64 | I128 => I64,
_ => {
return Err(CodegenError::Unsupported(format!(
"{}: Unsupported type: {:?}",
op, ty
)))
}
};
let bitop = match op {
Opcode::Clz | Opcode::Cls | Opcode::Bitrev => BitOp::from((op, op_ty)),
Opcode::Ctz => BitOp::from((Opcode::Bitrev, op_ty)),
_ => unreachable!(),
};
if ty == I128 {
let out_regs = get_output_reg(ctx, outputs[0]);
let in_regs = put_input_in_regs(ctx, inputs[0]);
let in_lo = in_regs.regs()[0];
let in_hi = in_regs.regs()[1];
let out_lo = out_regs.regs()[0];
let out_hi = out_regs.regs()[1];
if op == Opcode::Bitrev || op == Opcode::Ctz {
ctx.emit(Inst::BitRR {
rd: out_hi,
rn: in_lo,
op: bitop,
});
ctx.emit(Inst::BitRR {
rd: out_lo,
rn: in_hi,
op: bitop,
});
}
if op == Opcode::Ctz {
// We have reduced the problem to a clz by reversing the inputs previously
emit_clz_i128(ctx, out_regs.map(|r| r.to_reg()), out_regs);
} else if op == Opcode::Clz {
emit_clz_i128(ctx, in_regs, out_regs);
} else if op == Opcode::Cls {
// cls out_hi, in_hi
// cls out_lo, in_lo
// eon sign_eq, in_hi, in_lo
// lsr sign_eq, sign_eq, #63
// madd out_lo, out_lo, sign_eq, sign_eq
// cmp out_hi, #63
// csel out_lo, out_lo, xzr, eq
// add out_lo, out_lo, out_hi
// mov out_hi, 0
let sign_eq = ctx.alloc_tmp(I64).only_reg().unwrap();
let xzr = writable_zero_reg();
ctx.emit(Inst::BitRR {
rd: out_lo,
rn: in_lo,
op: bitop,
});
ctx.emit(Inst::BitRR {
rd: out_hi,
rn: in_hi,
op: bitop,
});
ctx.emit(Inst::AluRRR {
alu_op: ALUOp::EorNot64,
rd: sign_eq,
rn: in_hi,
rm: in_lo,
});
ctx.emit(Inst::AluRRImmShift {
alu_op: ALUOp::Lsr64,
rd: sign_eq,
rn: sign_eq.to_reg(),
immshift: ImmShift::maybe_from_u64(63).unwrap(),
});
ctx.emit(Inst::AluRRRR {
alu_op: ALUOp3::MAdd64,
rd: out_lo,
rn: out_lo.to_reg(),
rm: sign_eq.to_reg(),
ra: sign_eq.to_reg(),
});
ctx.emit(Inst::AluRRImm12 {
alu_op: ALUOp::SubS64,
rd: xzr,
rn: out_hi.to_reg(),
imm12: Imm12::maybe_from_u64(63).unwrap(),
});
ctx.emit(Inst::CSel {
cond: Cond::Eq,
rd: out_lo,
rn: out_lo.to_reg(),
rm: xzr.to_reg(),
});
ctx.emit(Inst::AluRRR {
alu_op: ALUOp::Add64,
rd: out_lo,
rn: out_lo.to_reg(),
rm: out_hi.to_reg(),
});
lower_constant_u64(ctx, out_hi, 0);
}
} else {
let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let needs_zext = match op {
Opcode::Bitrev | Opcode::Ctz => false,
Opcode::Clz | Opcode::Cls => true,
_ => unreachable!(),
};
let narrow_mode = if needs_zext && ty_bits(ty) == 64 {
NarrowValueMode::ZeroExtend64
} else if needs_zext {
NarrowValueMode::ZeroExtend32
} else {
NarrowValueMode::None
};
let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
ctx.emit(Inst::BitRR { rd, rn, op: bitop });
// Both bitrev and ctz use a bit-reverse (rbit) instruction; ctz to reduce the problem
// to a clz, and bitrev as the main operation.
if op == Opcode::Bitrev || op == Opcode::Ctz {
// Reversing an n-bit value (n < 32) with a 32-bit bitrev instruction will place
// the reversed result in the highest n bits, so we need to shift them down into
// place.
let right_shift = match ty {
I8 => Some(24),
I16 => Some(16),
I32 => None,
I64 => None,
_ => unreachable!(),
};
if let Some(s) = right_shift {
ctx.emit(Inst::AluRRImmShift {
alu_op: ALUOp::Lsr32,
rd,
rn: rd.to_reg(),
immshift: ImmShift::maybe_from_u64(s).unwrap(),
});
}
}
if op == Opcode::Ctz {
ctx.emit(Inst::BitRR {
op: BitOp::from((Opcode::Clz, op_ty)),
rd,
rn: rd.to_reg(),
});
}
}
}
Opcode::Bitrev | Opcode::Clz | Opcode::Cls | Opcode::Ctz => implemented_in_isle(ctx),
Opcode::Popcnt => {
let ty = ty.unwrap();

View File

@@ -59,6 +59,7 @@ block0(v0: i8):
; check: uxtb w0, w0
; nextln: clz w0, w0
; nextln: sub w0, w0, #24
; nextln: ret
function %b(i16) -> i16 {
@@ -69,6 +70,7 @@ block0(v0: i16):
; check: uxth w0, w0
; nextln: clz w0, w0
; nextln: sub w0, w0, #16
; nextln: ret
function %b(i32) -> i32 {
@@ -110,6 +112,7 @@ block0(v0: i8):
; check: uxtb w0, w0
; nextln: cls w0, w0
; nextln: sub w0, w0, #24
; nextln: ret
function %c(i16) -> i16 {
@@ -120,6 +123,7 @@ block0(v0: i16):
; check: uxth w0, w0
; nextln: cls w0, w0
; nextln: sub w0, w0, #16
; nextln: ret
function %c(i32) -> i32 {
@@ -164,7 +168,7 @@ block0(v0: i8):
}
; check: rbit w0, w0
; nextln: lsr w0, w0, #24
; nextln: orr w0, w0, #8388608
; nextln: clz w0, w0
; nextln: ret
@@ -175,7 +179,7 @@ block0(v0: i16):
}
; check: rbit w0, w0
; nextln: lsr w0, w0, #16
; nextln: orr w0, w0, #32768
; nextln: clz w0, w0
; nextln: ret

View File

@@ -3,6 +3,24 @@ test run
target aarch64
target x86_64
function %clz_i8(i8) -> i8 {
block0(v0: i8):
v1 = clz v0
return v1
}
; run: %clz_i8(1) == 7
; run: %clz_i8(0x40) == 1
; run: %clz_i8(-1) == 0
function %clz_i16(i16) -> i16 {
block0(v0: i16):
v1 = clz v0
return v1
}
; run: %clz_i16(1) == 15
; run: %clz_i16(0x4000) == 1
; run: %clz_i16(-1) == 0
function %clz_i32(i32) -> i32 {
block0(v0: i32):
v1 = clz v0

View File

@@ -3,6 +3,24 @@ test run
target aarch64
target x86_64
function %ctz_i8(i8) -> i8 {
block0(v0: i8):
v1 = ctz v0
return v1
}
; run: %ctz_i8(1) == 0
; run: %ctz_i8(0x40) == 6
; run: %ctz_i8(-1) == 0
function %ctz_i16(i16) -> i16 {
block0(v0: i16):
v1 = ctz v0
return v1
}
; run: %ctz_i16(1) == 0
; run: %ctz_i16(0x4000) == 14
; run: %ctz_i16(-1) == 0
function %ctz_i32(i32) -> i32 {
block0(v0: i32):
v1 = ctz v0