Cranelift AArch64: Migrate Splat to ISLE (#4521)

Copyright (c) 2022, Arm Limited.
This commit is contained in:
Anton Kirilov
2022-07-26 18:57:15 +01:00
committed by GitHub
parent 1321c234e5
commit ead6edb0c5
21 changed files with 593 additions and 338 deletions

View File

@@ -171,8 +171,8 @@ impl Type {
self.replace_lanes(match self.lane_type() {
I8 | B1 | B8 => I8,
I16 | B16 => I16,
I32 | B32 => I32,
I64 | B64 => I64,
I32 | B32 | F32 => I32,
I64 | B64 | F64 => I64,
I128 | B128 => I128,
_ => unimplemented!(),
})

View File

@@ -627,7 +627,8 @@
(VecLoadReplicate
(rd WritableReg)
(rn Reg)
(size VectorSize))
(size VectorSize)
(flags MemFlags))
;; Vector conditional select, 128 bit. A synthetic instruction, which generates a 4-insn
;; control-flow diamond.
@@ -1376,6 +1377,16 @@
(decl cond_br_cond (Cond) CondBrKind)
(extern constructor cond_br_cond cond_br_cond)
;; Lower the address of a load or a store.
;;
;; The arguments are a type, the memory instruction whose inputs describe the
;; address, and a constant offset; the external implementation forwards them
;; to `lower_address()`.
(decl amode (Type Inst u32) AMode)
;; TODO: Port lower_address() to ISLE.
(extern constructor amode amode)
;; Matches an `AMode` that is just a register.
;;
;; Yields that register on a match. The constructor is declared `pure`.
(decl pure amode_is_reg (AMode) Reg)
;; TODO: Implement in ISLE.
(extern constructor amode_is_reg amode_is_reg)
;; Instruction creation helpers ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Helper for creating the zero register.
@@ -1481,6 +1492,13 @@
(_ Unit (emit (MInst.VecDup dst src size))))
dst))
;; Emit an `MInst.VecDupFromFpu` of `src` with the given vector size into a
;; fresh temporary and return that temporary.
(decl vec_dup_from_fpu (Reg VectorSize) Reg)
(rule (vec_dup_from_fpu src size)
      (let ((rd WritableReg (temp_writable_reg $I8X16))
            (_ Unit (emit (MInst.VecDupFromFpu rd src size))))
        rd))
;; Helper for emitting `MInst.AluRRImm12` instructions.
(decl alu_rr_imm12 (ALUOp Type Reg Imm12) Reg)
(rule (alu_rr_imm12 op ty src imm)
@@ -2167,7 +2185,7 @@
(decl sinkable_atomic_load (SinkableAtomicLoad) Value)
(extern extractor sinkable_atomic_load sinkable_atomic_load)
;; Sink a `SinkableLoad` into a `Reg`.
;; Sink a `SinkableAtomicLoad` into a `Reg`.
;;
;; This is a side-effectful operation that notifies the context that the
;; instruction that produced the `SinkableAtomicLoad` has been sunk into another
@@ -2230,6 +2248,29 @@
(alu_rrr op ty x_lo y_lo)
(alu_rrr op ty x_hi y_hi))))
;; Emit an `MInst.VecLoadReplicate` that reads through the address register
;; `src` with the given vector size and memory flags, writing a fresh
;; temporary that is returned.
(decl ld1r (Reg VectorSize MemFlags) Reg)
(rule (ld1r src size flags)
      (let ((rd WritableReg (temp_writable_reg $I8X16))
            (_ Unit (emit (MInst.VecLoadReplicate rd src size flags))))
        rd))
;; Materialize the address described by an `AMode` in a register.
(decl load_addr (AMode) Reg)
;; General case: emit an `MInst.LoadAddr` into a fresh `$I64` temporary.
(rule (load_addr addr)
      (let ((rd WritableReg (temp_writable_reg $I64))
            (_ Unit (emit (MInst.LoadAddr rd addr))))
        rd))
;; The `AMode` is just a register: return it without emitting anything.
(rule (load_addr addr)
      (if-let base (amode_is_reg addr))
      base)
;; Lower a vector splat with a constant parameter.
;;
;; Takes the constant as a `u64` together with the destination vector size and
;; returns the register holding the result.
(decl splat_const (u64 VectorSize) Reg)
;; TODO: Port lower_splat_const() to ISLE.
(extern constructor splat_const splat_const)
;; Generate the comparison-to-zero operator (a `VecMisc2`) from the input
;; floating-point condition code (`FloatCC`).
(decl float_cc_cmp_zero_to_vec_misc_op (FloatCC) VecMisc2)
(extern constructor float_cc_cmp_zero_to_vec_misc_op float_cc_cmp_zero_to_vec_misc_op)

View File

@@ -2258,10 +2258,10 @@ impl MachInstEmit for Inst {
ScalarSize::Size16 => 0b00010,
ScalarSize::Size32 => 0b00100,
ScalarSize::Size64 => 0b01000,
_ => unimplemented!("Unexpected VectorSize: {:?}", size),
_ => unreachable!(),
};
sink.put4(
0b000_01110000_00000_000011_00000_00000
0b0_0_0_01110000_00000_000011_00000_00000
| (q << 30)
| (imm5 << 16)
| (machreg_to_gpr(rn) << 5)
@@ -2625,13 +2625,18 @@ impl MachInstEmit for Inst {
};
sink.put4(enc_vec_rrr(top11 | q << 9, rm, bit15_10, rn, rd));
}
&Inst::VecLoadReplicate { rd, rn, size } => {
&Inst::VecLoadReplicate {
rd,
rn,
size,
flags,
} => {
let rd = allocs.next_writable(rd);
let rn = allocs.next(rn);
let (q, size) = size.enc_size();
let srcloc = state.cur_srcloc();
if srcloc != SourceLoc::default() {
if srcloc != SourceLoc::default() && !flags.notrap() {
// Register the offset at which the actual load instruction starts.
sink.add_trap(TrapCode::HeapOutOfBounds);
}

View File

@@ -2351,10 +2351,10 @@ fn test_aarch64_binemit() {
Inst::VecDup {
rd: writable_vreg(25),
rn: xreg(7),
size: VectorSize::Size8x16,
size: VectorSize::Size8x8,
},
"F90C014E",
"dup v25.16b, w7",
"F90C010E",
"dup v25.8b, w7",
));
insns.push((
Inst::VecDup {
@@ -2387,10 +2387,10 @@ fn test_aarch64_binemit() {
Inst::VecDup {
rd: writable_vreg(0),
rn: xreg(28),
size: VectorSize::Size32x4,
size: VectorSize::Size32x2,
},
"800F044E",
"dup v0.4s, w28",
"800F040E",
"dup v0.2s, w28",
));
insns.push((
Inst::VecDup {
@@ -5199,8 +5199,8 @@ fn test_aarch64_binemit() {
Inst::VecLoadReplicate {
rd: writable_vreg(31),
rn: xreg(0),
size: VectorSize::Size64x2,
flags: MemFlags::trusted(),
},
"1FCC404D",
"ld1r { v31.2d }, [x0]",
@@ -5210,8 +5210,8 @@ fn test_aarch64_binemit() {
Inst::VecLoadReplicate {
rd: writable_vreg(0),
rn: xreg(25),
size: VectorSize::Size8x8,
flags: MemFlags::trusted(),
},
"20C3400D",
"ld1r { v0.8b }, [x25]",

View File

@@ -530,17 +530,6 @@ impl Inst {
}
}
}
/// Produce the register holding the address described by `mem`.
///
/// When `mem` is just a register, that register is returned and no
/// instruction is needed. Otherwise the result is `rd` together with the
/// `LoadAddr` instruction that computes the address into it.
pub fn gen_load_addr(rd: Writable<Reg>, mem: AMode) -> (Reg, Option<Inst>) {
    match mem.is_reg() {
        Some(base) => (base, None),
        None => (rd.to_reg(), Some(Inst::LoadAddr { rd, mem })),
    }
}
}
//=============================================================================

View File

@@ -165,6 +165,8 @@ pub fn create_reg_env(flags: &settings::Flags) -> MachineEnv {
preg(xreg(14)),
preg(xreg(15)),
// x16 and x17 are spilltmp and tmp2 (see above).
// x18 could be used by the platform to carry inter-procedural state;
// conservatively assume so and make it not allocatable.
// x19-28 are callee-saved and so not preferred.
// x21 is the pinned register (if enabled) and not allocatable if so.
// x29 is FP, x30 is LR, x31 is SP/ZR.
@@ -178,30 +180,7 @@ pub fn create_reg_env(flags: &settings::Flags) -> MachineEnv {
preg(vreg(5)),
preg(vreg(6)),
preg(vreg(7)),
preg(vreg(8)),
preg(vreg(9)),
preg(vreg(10)),
preg(vreg(11)),
preg(vreg(12)),
preg(vreg(13)),
preg(vreg(14)),
preg(vreg(15)),
],
],
non_preferred_regs_by_class: [
vec![
preg(xreg(19)),
preg(xreg(20)),
// x21 is pinned reg if enabled; we add to this list below if not.
preg(xreg(22)),
preg(xreg(23)),
preg(xreg(24)),
preg(xreg(25)),
preg(xreg(26)),
preg(xreg(27)),
preg(xreg(28)),
],
vec![
// v8-15 are callee-saved and so not preferred.
preg(vreg(16)),
preg(vreg(17)),
preg(vreg(18)),
@@ -220,6 +199,30 @@ pub fn create_reg_env(flags: &settings::Flags) -> MachineEnv {
preg(vreg(31)),
],
],
non_preferred_regs_by_class: [
vec![
preg(xreg(19)),
preg(xreg(20)),
// x21 is pinned reg if enabled; we add to this list below if not.
preg(xreg(22)),
preg(xreg(23)),
preg(xreg(24)),
preg(xreg(25)),
preg(xreg(26)),
preg(xreg(27)),
preg(xreg(28)),
],
vec![
preg(vreg(8)),
preg(vreg(9)),
preg(vreg(10)),
preg(vreg(11)),
preg(vreg(12)),
preg(vreg(13)),
preg(vreg(14)),
preg(vreg(15)),
],
],
fixed_stack_slots: vec![],
};

View File

@@ -1423,7 +1423,8 @@
;;;; Rules for `bitselect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type (ty_int_bool_ref_scalar_64 ty) (bitselect c x y)))
(rule (lower (has_type ty (bitselect c x y)))
(if (ty_int_bool_ref_scalar_64 ty))
(let ((tmp1 Reg (and_reg ty x c))
(tmp2 Reg (bic ty y c)))
(orr ty tmp1 tmp2)))
@@ -1441,12 +1442,14 @@
;; T -> I{64,32,16,8}: We can simply pass through the value: values
;; are always stored with high bits undefined, so we can just leave
;; them be.
(rule (lower (has_type (ty_int_bool_ref_scalar_64 ty) (ireduce src)))
(rule (lower (has_type ty (ireduce src)))
(if (ty_int_bool_ref_scalar_64 ty))
(value_regs_get src 0))
;; Likewise for breduce.
(rule (lower (has_type (ty_int_bool_ref_scalar_64 ty) (breduce src)))
(rule (lower (has_type ty (breduce src)))
(if (ty_int_bool_ref_scalar_64 ty))
(value_regs_get src 0))
@@ -1515,6 +1518,39 @@
(let ((use_allocated_encoding bool (is_not_baldrdash_call_conv)))
(side_effect (udf use_allocated_encoding trap_code))))
;;;; Rules for `splat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Input whose type satisfies `ty_int_bool_ref_scalar_64`: duplicate it with
;; `vec_dup`.
(rule (lower (has_type ty (splat x @ (value_type in_ty))))
(if (ty_int_bool_ref_scalar_64 in_ty))
(vec_dup x (vector_size ty)))
;; Scalar floating-point input: duplicate it with `vec_dup_from_fpu`.
(rule (lower (has_type ty (splat x @ (value_type (ty_scalar_float _)))))
(vec_dup_from_fpu x (vector_size ty)))
;; Constant inputs (`bconst`, `f32const`, `f64const`, `iconst`), optionally
;; behind a `breduce`/`ireduce`: hand the constant bits to `splat_const`.
(rule (lower (has_type ty (splat (bconst (u64_from_bool n)))))
(splat_const n (vector_size ty)))
(rule (lower (has_type ty (splat (breduce (bconst (u64_from_bool n))))))
(splat_const n (vector_size ty)))
(rule (lower (has_type ty (splat (f32const (u64_from_ieee32 n)))))
(splat_const n (vector_size ty)))
(rule (lower (has_type ty (splat (f64const (u64_from_ieee64 n)))))
(splat_const n (vector_size ty)))
(rule (lower (has_type ty (splat (iconst (u64_from_imm64 n)))))
(splat_const n (vector_size ty)))
(rule (lower (has_type ty (splat (ireduce (iconst (u64_from_imm64 n))))))
(splat_const n (vector_size ty)))
;; Input produced by a `load` that `is_sinkable_inst` accepts: sink the load,
;; lower its address (using the lane type of `ty`) into a register, and emit
;; a load-and-replicate (`ld1r`) carrying the load's memory flags.
(rule (lower (has_type ty (splat x @ (load flags _addr offset))))
(if-let mem_op (is_sinkable_inst x))
(let ((_ Unit (sink_inst mem_op))
(addr AMode (amode (lane_type ty) mem_op offset))
(address Reg (load_addr addr)))
(ld1r address (vector_size ty) flags)))
;;;; Rules for `AtomicLoad` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type (valid_atomic_transaction ty) (atomic_load flags addr)))
@@ -1527,7 +1563,6 @@
addr))
(side_effect (store_release ty src addr)))
;;;; Rules for `AtomicRMW` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule 1 (lower (and (use_lse)

View File

@@ -5,12 +5,13 @@ pub mod generated_code;
// Types that the generated ISLE code uses via `use super::*`.
use super::{
writable_zero_reg, zero_reg, AMode, ASIMDFPModImm, ASIMDMovModImm, BranchTarget, CallIndInfo,
CallInfo, Cond, CondBrKind, ExtendOp, FPUOpRI, FloatCC, Imm12, ImmLogic, ImmShift,
insn_inputs, writable_zero_reg, zero_reg, AMode, ASIMDFPModImm, ASIMDMovModImm, BranchTarget,
CallIndInfo, CallInfo, Cond, CondBrKind, ExtendOp, FPUOpRI, FloatCC, Imm12, ImmLogic, ImmShift,
Inst as MInst, IntCC, JTSequenceInfo, MachLabel, MoveWideConst, MoveWideOp, NarrowValueMode,
Opcode, OperandSize, PairAMode, Reg, ScalarSize, ShiftOpAndAmt, UImm5, VecMisc2, VectorSize,
NZCV,
};
use crate::isa::aarch64::lower::{lower_address, lower_splat_const};
use crate::isa::aarch64::settings::Flags as IsaFlags;
use crate::machinst::{isle::*, InputSourceInst};
use crate::settings::Flags;
@@ -442,4 +443,25 @@ where
_ => panic!(),
}
}
/// Lower the address of the memory instruction `mem_op`.
///
/// Gathers the inputs of `mem_op` and passes them, along with `ty` and
/// `offset` (cast to `i32`), to `lower_address`.
fn amode(&mut self, ty: Type, mem_op: Inst, offset: u32) -> AMode {
    let inputs = insn_inputs(self.lower_ctx, mem_op);
    lower_address(self.lower_ctx, ty, &inputs[..], offset as i32)
}
/// Forward to `AMode::is_reg`, returning whatever it yields for `address`.
fn amode_is_reg(&mut self, address: &AMode) -> Option<Reg> {
address.is_reg()
}
/// Splat the constant `value` across a vector of the given `size`.
///
/// Allocates an `I8X16` temporary, lets `lower_splat_const` fill it, and
/// returns the temporary as a plain register.
fn splat_const(&mut self, value: u64, size: &VectorSize) -> Reg {
    let dst = self.temp_writable_reg(I8X16);
    lower_splat_const(self.lower_ctx, dst, value, *size);
    dst.to_reg()
}
}

View File

@@ -741,80 +741,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
}
}
Opcode::Splat => {
let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let ty = ty.unwrap();
// TODO: Handle SVE Dup.
let ty = if ty.is_dynamic_vector() {
dynamic_to_fixed(ty)
} else {
ty
};
let size = VectorSize::from_ty(ty);
if let Some((_, insn)) = maybe_input_insn_multi(
ctx,
inputs[0],
&[
Opcode::Bconst,
Opcode::F32const,
Opcode::F64const,
Opcode::Iconst,
],
) {
lower_splat_const(ctx, rd, ctx.get_constant(insn).unwrap(), size);
} else if let Some(insn) =
maybe_input_insn_via_conv(ctx, inputs[0], Opcode::Iconst, Opcode::Ireduce)
{
lower_splat_const(ctx, rd, ctx.get_constant(insn).unwrap(), size);
} else if let Some(insn) =
maybe_input_insn_via_conv(ctx, inputs[0], Opcode::Bconst, Opcode::Breduce)
{
lower_splat_const(ctx, rd, ctx.get_constant(insn).unwrap(), size);
} else if let Some((_, insn)) = maybe_input_insn_multi(
ctx,
inputs[0],
&[
Opcode::Uload8,
Opcode::Sload8,
Opcode::Uload16,
Opcode::Sload16,
Opcode::Uload32,
Opcode::Sload32,
Opcode::Load,
],
) {
ctx.sink_inst(insn);
let load_inputs = insn_inputs(ctx, insn);
let load_outputs = insn_outputs(ctx, insn);
lower_load(
ctx,
insn,
&load_inputs[..],
load_outputs[0],
|ctx, _rd, _elem_ty, mem| {
let tmp = ctx.alloc_tmp(I64).only_reg().unwrap();
let (addr, addr_inst) = Inst::gen_load_addr(tmp, mem);
if let Some(addr_inst) = addr_inst {
ctx.emit(addr_inst);
}
ctx.emit(Inst::VecLoadReplicate { rd, rn: addr, size });
Ok(())
},
)?;
} else {
let input_ty = ctx.input_ty(insn, 0);
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
let inst = if ty_has_int_representation(input_ty) {
Inst::VecDup { rd, rn, size }
} else {
Inst::VecDupFromFpu { rd, rn, size }
};
ctx.emit(inst);
}
}
Opcode::Splat => implemented_in_isle(ctx),
Opcode::ScalarToVector => implemented_in_isle(ctx),

View File

@@ -868,7 +868,7 @@
;; Pseudoinstruction to keep a value alive.
(DummyUse
(reg Reg))
;; An unwind pseudoinstruction describing the state of the
;; machine at this program point.
(Unwind
@@ -1641,15 +1641,6 @@
(decl sinkable_inst (Inst) Value)
(extern extractor sinkable_inst sinkable_inst)
;; Sink a sinkable instruction.
;;
;; This is a side-effectful operation that notifies the context that the
;; sinkable instruction has been sunk into another instruction, and no longer
;; needs to be lowered.
(decl sink_inst (Inst) Unit)
(extern constructor sink_inst sink_inst)
;; Sinkable big-endian load instruction.
(decl sinkable_load (Inst) Value)
(extractor (sinkable_load inst)

View File

@@ -1656,8 +1656,9 @@
;; Insert vector lane from general-purpose register.
(rule (lower (insertlane x @ (value_type ty)
y @ (value_type (ty_int_bool_ref_scalar_64 _))
y @ (value_type in_ty)
(u8_from_uimm8 idx)))
(if (ty_int_bool_ref_scalar_64 in_ty))
(vec_insert_lane ty x y (be_lane_idx ty idx) (zero_reg)))
;; Insert vector lane from floating-point register.
@@ -1771,8 +1772,9 @@
;;;; Rules for `extractlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Extract vector lane to general-purpose register.
(rule (lower (has_type (ty_int_bool_ref_scalar_64 _)
(rule (lower (has_type out_ty
(extractlane x @ (value_type ty) (u8_from_uimm8 idx))))
(if (ty_int_bool_ref_scalar_64 out_ty))
(vec_extract_lane ty x (be_lane_idx ty idx) (zero_reg)))
;; Extract vector lane to floating-point register.
@@ -1828,8 +1830,8 @@
;;;; Rules for `splat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Load replicated value from general-purpose register.
(rule (lower (has_type ty (splat
x @ (value_type (ty_int_bool_ref_scalar_64 _)))))
(rule (lower (has_type ty (splat x @ (value_type in_ty))))
(if (ty_int_bool_ref_scalar_64 in_ty))
(vec_replicate_lane ty (vec_insert_lane_undef ty x 0 (zero_reg)) 0))
;; Load replicated value from floating-point register.
@@ -1888,7 +1890,8 @@
;; Load scalar value from general-purpose register.
(rule (lower (has_type ty (scalar_to_vector
x @ (value_type (ty_int_bool_ref_scalar_64 _)))))
x @ (value_type in_ty))))
(if (ty_int_bool_ref_scalar_64 in_ty))
(vec_insert_lane ty (vec_imm ty 0) x (be_lane_idx ty 0) (zero_reg)))
;; Load scalar value from floating-point register.

View File

@@ -666,11 +666,6 @@ where
None
}
// Notify the lowering context that `inst` has been sunk.
#[inline]
fn sink_inst(&mut self, inst: Inst) -> Unit {
self.lower_ctx.sink_inst(inst);
}
#[inline]
fn emit(&mut self, inst: &MInst) -> Unit {
self.lower_ctx.emit(inst.clone());

View File

@@ -11,7 +11,9 @@ pub use crate::ir::{
SigRef, StackSlot,
};
pub use crate::isa::unwind::UnwindInst;
pub use crate::machinst::{ABIArg, ABIArgSlot, ABISig, RealReg, Reg, RelocDistance, Writable};
pub use crate::machinst::{
ABIArg, ABIArgSlot, ABISig, InputSourceInst, RealReg, Reg, RelocDistance, Writable,
};
pub type Unit = ();
pub type ValueSlice = (ValueList, usize);
@@ -425,6 +427,15 @@ macro_rules! isle_prelude_methods {
imm.bits() as u64
}
// Widen a `bool` to a `u64`: every bit set for `true`, zero for `false`.
#[inline]
fn u64_from_bool(&mut self, b: bool) -> u64 {
    match b {
        true => u64::MAX,
        false => 0,
    }
}
#[inline]
fn inst_results(&mut self, inst: Inst) -> ValueSlice {
(self.lower_ctx.dfg().inst_results_list(inst), 0)
@@ -854,6 +865,21 @@ macro_rules! isle_prelude_methods {
fn real_reg_to_writable_reg(&mut self, reg: RealReg) -> WritableReg {
Writable::from_reg(Reg::from(reg))
}
// Return the instruction producing `val` when the lowering context reports
// it as `InputSourceInst::UniqueUse`; otherwise return `None`.
fn is_sinkable_inst(&mut self, val: Value) -> Option<Inst> {
    match self.lower_ctx.get_value_as_source_or_const(val).inst {
        InputSourceInst::UniqueUse(producer, _) => Some(producer),
        _ => None,
    }
}
// Notify the lowering context that `inst` has been sunk.
#[inline]
fn sink_inst(&mut self, inst: Inst) {
self.lower_ctx.sink_inst(inst);
}
};
}

View File

@@ -308,10 +308,10 @@
(decl fits_in_64 (Type) Type)
(extern extractor fits_in_64 fits_in_64)
;; An extractor that only matches scalar booleans, integers, and references that
;; can fit in 64 bits.
(decl ty_int_bool_ref_scalar_64 (Type) Type)
(extern extractor ty_int_bool_ref_scalar_64 ty_int_bool_ref_scalar_64)
;; A pure constructor that only matches scalar booleans, integers, and
;; references that can fit in 64 bits.
(decl pure ty_int_bool_ref_scalar_64 (Type) Type)
(extern constructor ty_int_bool_ref_scalar_64 ty_int_bool_ref_scalar_64)
;; An extractor that matches 32- and 64-bit types only.
(decl ty_32_or_64 (Type) Type)
@@ -407,6 +407,10 @@
(decl u8_from_uimm8 (u8) Uimm8)
(extern extractor infallible u8_from_uimm8 u8_from_uimm8)
;; Extract a `u64` from a `bool`.
;;
;; The external implementation yields all ones (`u64::MAX`) for `true` and 0
;; for `false`.
(decl u64_from_bool (u64) bool)
(extern extractor infallible u64_from_bool u64_from_bool)
;; Extract a `u64` from an `Imm64`.
(decl u64_from_imm64 (u64) Imm64)
(extern extractor infallible u64_from_imm64 u64_from_imm64)
@@ -498,6 +502,10 @@
(decl pure zero_value (Value) Value)
(extern constructor zero_value zero_value)
;; Match a sinkable instruction from a value operand.
;;
;; The external implementation matches when the lowering context reports the
;; value's source as `InputSourceInst::UniqueUse`, and yields that producing
;; instruction.
(decl pure is_sinkable_inst (Value) Inst)
(extern constructor is_sinkable_inst is_sinkable_inst)
;; Instruction creation helpers ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Emit an instruction.
@@ -508,6 +516,14 @@
(decl emit (MInst) Unit)
(extern constructor emit emit)
;; Sink an instruction.
;;
;; This is a side-effectful operation that notifies the context that the
;; instruction has been sunk into another instruction, and no longer needs to
;; be lowered.
(decl sink_inst (Inst) Unit)
(extern constructor sink_inst sink_inst)
;; Constant pool emission.
(type VCodeConstant (primitive VCodeConstant))