Merge pull request #2278 from akirilov-arm/load_splat
Introduce the Cranelift IR instruction `LoadSplat`
This commit is contained in:
@@ -680,4 +680,19 @@ impl VectorSize {
|
||||
_ => *self,
|
||||
}
|
||||
}
|
||||
|
||||
/// Return the encoding bits that are used by some SIMD instructions
|
||||
/// for a particular operand size.
|
||||
pub fn enc_size(&self) -> (u32, u32) {
|
||||
let q = self.is_128bits() as u32;
|
||||
let size = match self.lane_size() {
|
||||
ScalarSize::Size8 => 0b00,
|
||||
ScalarSize::Size16 => 0b01,
|
||||
ScalarSize::Size32 => 0b10,
|
||||
ScalarSize::Size64 => 0b11,
|
||||
_ => unreachable!(),
|
||||
};
|
||||
|
||||
(q, size)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -248,6 +248,16 @@ fn enc_ldst_imm19(op_31_24: u32, imm19: u32, rd: Reg) -> u32 {
|
||||
(op_31_24 << 24) | (imm19 << 5) | machreg_to_gpr_or_vec(rd)
|
||||
}
|
||||
|
||||
fn enc_ldst_vec(q: u32, size: u32, rn: Reg, rt: Writable<Reg>) -> u32 {
|
||||
debug_assert_eq!(q & 0b1, q);
|
||||
debug_assert_eq!(size & 0b11, size);
|
||||
0b0_0_0011010_10_00000_110_0_00_00000_00000
|
||||
| q << 30
|
||||
| size << 10
|
||||
| machreg_to_gpr(rn) << 5
|
||||
| machreg_to_vec(rt.to_reg())
|
||||
}
|
||||
|
||||
fn enc_extend(top22: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
|
||||
(top22 << 10) | (machreg_to_gpr(rn) << 5) | machreg_to_gpr(rd.to_reg())
|
||||
}
|
||||
@@ -1381,14 +1391,7 @@ impl MachInstEmit for Inst {
|
||||
sink.put4(enc_fpurrrr(top17, rd, rn, rm, ra));
|
||||
}
|
||||
&Inst::VecMisc { op, rd, rn, size } => {
|
||||
let enc_size = match size.lane_size() {
|
||||
ScalarSize::Size8 => 0b00,
|
||||
ScalarSize::Size16 => 0b01,
|
||||
ScalarSize::Size32 => 0b10,
|
||||
ScalarSize::Size64 => 0b11,
|
||||
_ => unreachable!(),
|
||||
};
|
||||
let q = if size.is_128bits() { 1 } else { 0 };
|
||||
let (q, enc_size) = size.enc_size();
|
||||
let (u, bits_12_16, size) = match op {
|
||||
VecMisc2::Not => (0b1, 0b00101, 0b00),
|
||||
VecMisc2::Neg => (0b1, 0b01011, enc_size),
|
||||
@@ -1831,13 +1834,7 @@ impl MachInstEmit for Inst {
|
||||
alu_op,
|
||||
size,
|
||||
} => {
|
||||
let enc_size = match size.lane_size() {
|
||||
ScalarSize::Size8 => 0b00,
|
||||
ScalarSize::Size16 => 0b01,
|
||||
ScalarSize::Size32 => 0b10,
|
||||
ScalarSize::Size64 => 0b11,
|
||||
_ => unreachable!(),
|
||||
};
|
||||
let (q, enc_size) = size.enc_size();
|
||||
let is_float = match alu_op {
|
||||
VecALUOp::Fcmeq
|
||||
| VecALUOp::Fcmgt
|
||||
@@ -1851,6 +1848,7 @@ impl MachInstEmit for Inst {
|
||||
_ => false,
|
||||
};
|
||||
let enc_float_size = match (is_float, size) {
|
||||
(true, VectorSize::Size32x2) => 0b0,
|
||||
(true, VectorSize::Size32x4) => 0b0,
|
||||
(true, VectorSize::Size64x2) => 0b1,
|
||||
(true, _) => unimplemented!(),
|
||||
@@ -1858,46 +1856,46 @@ impl MachInstEmit for Inst {
|
||||
};
|
||||
|
||||
let (top11, bit15_10) = match alu_op {
|
||||
VecALUOp::Sqadd => (0b010_01110_00_1 | enc_size << 1, 0b000011),
|
||||
VecALUOp::Sqsub => (0b010_01110_00_1 | enc_size << 1, 0b001011),
|
||||
VecALUOp::Uqadd => (0b011_01110_00_1 | enc_size << 1, 0b000011),
|
||||
VecALUOp::Uqsub => (0b011_01110_00_1 | enc_size << 1, 0b001011),
|
||||
VecALUOp::Cmeq => (0b011_01110_00_1 | enc_size << 1, 0b100011),
|
||||
VecALUOp::Cmge => (0b010_01110_00_1 | enc_size << 1, 0b001111),
|
||||
VecALUOp::Cmgt => (0b010_01110_00_1 | enc_size << 1, 0b001101),
|
||||
VecALUOp::Cmhi => (0b011_01110_00_1 | enc_size << 1, 0b001101),
|
||||
VecALUOp::Cmhs => (0b011_01110_00_1 | enc_size << 1, 0b001111),
|
||||
VecALUOp::Fcmeq => (0b010_01110_00_1, 0b111001),
|
||||
VecALUOp::Fcmgt => (0b011_01110_10_1, 0b111001),
|
||||
VecALUOp::Fcmge => (0b011_01110_00_1, 0b111001),
|
||||
VecALUOp::Sqadd => (0b000_01110_00_1 | enc_size << 1, 0b000011),
|
||||
VecALUOp::Sqsub => (0b000_01110_00_1 | enc_size << 1, 0b001011),
|
||||
VecALUOp::Uqadd => (0b001_01110_00_1 | enc_size << 1, 0b000011),
|
||||
VecALUOp::Uqsub => (0b001_01110_00_1 | enc_size << 1, 0b001011),
|
||||
VecALUOp::Cmeq => (0b001_01110_00_1 | enc_size << 1, 0b100011),
|
||||
VecALUOp::Cmge => (0b000_01110_00_1 | enc_size << 1, 0b001111),
|
||||
VecALUOp::Cmgt => (0b000_01110_00_1 | enc_size << 1, 0b001101),
|
||||
VecALUOp::Cmhi => (0b001_01110_00_1 | enc_size << 1, 0b001101),
|
||||
VecALUOp::Cmhs => (0b001_01110_00_1 | enc_size << 1, 0b001111),
|
||||
VecALUOp::Fcmeq => (0b000_01110_00_1, 0b111001),
|
||||
VecALUOp::Fcmgt => (0b001_01110_10_1, 0b111001),
|
||||
VecALUOp::Fcmge => (0b001_01110_00_1, 0b111001),
|
||||
// The following logical instructions operate on bytes, so are not encoded differently
|
||||
// for the different vector types.
|
||||
VecALUOp::And => (0b010_01110_00_1, 0b000111),
|
||||
VecALUOp::Bic => (0b010_01110_01_1, 0b000111),
|
||||
VecALUOp::Orr => (0b010_01110_10_1, 0b000111),
|
||||
VecALUOp::Eor => (0b011_01110_00_1, 0b000111),
|
||||
VecALUOp::Bsl => (0b011_01110_01_1, 0b000111),
|
||||
VecALUOp::Umaxp => (0b011_01110_00_1 | enc_size << 1, 0b101001),
|
||||
VecALUOp::Add => (0b010_01110_00_1 | enc_size << 1, 0b100001),
|
||||
VecALUOp::Sub => (0b011_01110_00_1 | enc_size << 1, 0b100001),
|
||||
VecALUOp::And => (0b000_01110_00_1, 0b000111),
|
||||
VecALUOp::Bic => (0b000_01110_01_1, 0b000111),
|
||||
VecALUOp::Orr => (0b000_01110_10_1, 0b000111),
|
||||
VecALUOp::Eor => (0b001_01110_00_1, 0b000111),
|
||||
VecALUOp::Bsl => (0b001_01110_01_1, 0b000111),
|
||||
VecALUOp::Umaxp => (0b001_01110_00_1 | enc_size << 1, 0b101001),
|
||||
VecALUOp::Add => (0b000_01110_00_1 | enc_size << 1, 0b100001),
|
||||
VecALUOp::Sub => (0b001_01110_00_1 | enc_size << 1, 0b100001),
|
||||
VecALUOp::Mul => {
|
||||
debug_assert_ne!(size, VectorSize::Size64x2);
|
||||
(0b010_01110_00_1 | enc_size << 1, 0b100111)
|
||||
(0b000_01110_00_1 | enc_size << 1, 0b100111)
|
||||
}
|
||||
VecALUOp::Sshl => (0b010_01110_00_1 | enc_size << 1, 0b010001),
|
||||
VecALUOp::Ushl => (0b011_01110_00_1 | enc_size << 1, 0b010001),
|
||||
VecALUOp::Umin => (0b011_01110_00_1 | enc_size << 1, 0b011011),
|
||||
VecALUOp::Smin => (0b010_01110_00_1 | enc_size << 1, 0b011011),
|
||||
VecALUOp::Umax => (0b011_01110_00_1 | enc_size << 1, 0b011001),
|
||||
VecALUOp::Smax => (0b010_01110_00_1 | enc_size << 1, 0b011001),
|
||||
VecALUOp::Urhadd => (0b011_01110_00_1 | enc_size << 1, 0b000101),
|
||||
VecALUOp::Fadd => (0b010_01110_00_1, 0b110101),
|
||||
VecALUOp::Fsub => (0b010_01110_10_1, 0b110101),
|
||||
VecALUOp::Fdiv => (0b011_01110_00_1, 0b111111),
|
||||
VecALUOp::Fmax => (0b010_01110_00_1, 0b111101),
|
||||
VecALUOp::Fmin => (0b010_01110_10_1, 0b111101),
|
||||
VecALUOp::Fmul => (0b011_01110_00_1, 0b110111),
|
||||
VecALUOp::Addp => (0b010_01110_00_1 | enc_size << 1, 0b101111),
|
||||
VecALUOp::Sshl => (0b000_01110_00_1 | enc_size << 1, 0b010001),
|
||||
VecALUOp::Ushl => (0b001_01110_00_1 | enc_size << 1, 0b010001),
|
||||
VecALUOp::Umin => (0b001_01110_00_1 | enc_size << 1, 0b011011),
|
||||
VecALUOp::Smin => (0b000_01110_00_1 | enc_size << 1, 0b011011),
|
||||
VecALUOp::Umax => (0b001_01110_00_1 | enc_size << 1, 0b011001),
|
||||
VecALUOp::Smax => (0b000_01110_00_1 | enc_size << 1, 0b011001),
|
||||
VecALUOp::Urhadd => (0b001_01110_00_1 | enc_size << 1, 0b000101),
|
||||
VecALUOp::Fadd => (0b000_01110_00_1, 0b110101),
|
||||
VecALUOp::Fsub => (0b000_01110_10_1, 0b110101),
|
||||
VecALUOp::Fdiv => (0b001_01110_00_1, 0b111111),
|
||||
VecALUOp::Fmax => (0b000_01110_00_1, 0b111101),
|
||||
VecALUOp::Fmin => (0b000_01110_10_1, 0b111101),
|
||||
VecALUOp::Fmul => (0b001_01110_00_1, 0b110111),
|
||||
VecALUOp::Addp => (0b000_01110_00_1 | enc_size << 1, 0b101111),
|
||||
VecALUOp::Umlal => {
|
||||
debug_assert!(!size.is_128bits());
|
||||
(0b001_01110_00_1 | enc_size << 1, 0b100000)
|
||||
@@ -1905,12 +1903,27 @@ impl MachInstEmit for Inst {
|
||||
VecALUOp::Zip1 => (0b01001110_00_0 | enc_size << 1, 0b001110),
|
||||
};
|
||||
let top11 = if is_float {
|
||||
top11 | enc_float_size << 1
|
||||
top11 | (q << 9) | enc_float_size << 1
|
||||
} else {
|
||||
top11
|
||||
top11 | (q << 9)
|
||||
};
|
||||
sink.put4(enc_vec_rrr(top11, rm, bit15_10, rn, rd));
|
||||
}
|
||||
&Inst::VecLoadReplicate {
|
||||
rd,
|
||||
rn,
|
||||
size,
|
||||
srcloc,
|
||||
} => {
|
||||
let (q, size) = size.enc_size();
|
||||
|
||||
if let Some(srcloc) = srcloc {
|
||||
// Register the offset at which the actual load instruction starts.
|
||||
sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
|
||||
}
|
||||
|
||||
sink.put4(enc_ldst_vec(q, size, rn, rd));
|
||||
}
|
||||
&Inst::MovToNZCV { rn } => {
|
||||
sink.put4(0xd51b4200 | machreg_to_gpr(rn));
|
||||
}
|
||||
@@ -2195,9 +2208,12 @@ impl MachInstEmit for Inst {
|
||||
inst.emit(sink, emit_info, state);
|
||||
}
|
||||
|
||||
let (reg, offset) = match mem {
|
||||
AMode::Unscaled(r, simm9) => (r, simm9.value()),
|
||||
AMode::UnsignedOffset(r, uimm12scaled) => (r, uimm12scaled.value() as i32),
|
||||
let (reg, index_reg, offset) = match mem {
|
||||
AMode::RegExtended(r, idx, extendop) => (r, Some((idx, extendop)), 0),
|
||||
AMode::Unscaled(r, simm9) => (r, None, simm9.value()),
|
||||
AMode::UnsignedOffset(r, uimm12scaled) => {
|
||||
(r, None, uimm12scaled.value() as i32)
|
||||
}
|
||||
_ => panic!("Unsupported case for LoadAddr: {:?}", mem),
|
||||
};
|
||||
let abs_offset = if offset < 0 {
|
||||
@@ -2211,9 +2227,22 @@ impl MachInstEmit for Inst {
|
||||
ALUOp::Add64
|
||||
};
|
||||
|
||||
if offset == 0 {
|
||||
let mov = Inst::mov(rd, reg);
|
||||
mov.emit(sink, emit_info, state);
|
||||
if let Some((idx, extendop)) = index_reg {
|
||||
let add = Inst::AluRRRExtend {
|
||||
alu_op: ALUOp::Add64,
|
||||
rd,
|
||||
rn: reg,
|
||||
rm: idx,
|
||||
extendop,
|
||||
};
|
||||
|
||||
add.emit(sink, emit_info, state);
|
||||
} else if offset == 0 {
|
||||
if reg != rd.to_reg() {
|
||||
let mov = Inst::mov(rd, reg);
|
||||
|
||||
mov.emit(sink, emit_info, state);
|
||||
}
|
||||
} else if let Some(imm12) = Imm12::maybe_from_u64(abs_offset) {
|
||||
let add = Inst::AluRRImm12 {
|
||||
alu_op,
|
||||
|
||||
@@ -2533,10 +2533,10 @@ fn test_aarch64_binemit() {
|
||||
rd: writable_vreg(28),
|
||||
rn: vreg(12),
|
||||
rm: vreg(4),
|
||||
size: VectorSize::Size32x4,
|
||||
size: VectorSize::Size32x2,
|
||||
},
|
||||
"9CE5244E",
|
||||
"fcmeq v28.4s, v12.4s, v4.4s",
|
||||
"9CE5240E",
|
||||
"fcmeq v28.2s, v12.2s, v4.2s",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
@@ -2965,10 +2965,10 @@ fn test_aarch64_binemit() {
|
||||
rd: writable_vreg(6),
|
||||
rn: vreg(9),
|
||||
rm: vreg(8),
|
||||
size: VectorSize::Size8x16,
|
||||
size: VectorSize::Size8x8,
|
||||
},
|
||||
"2665286E",
|
||||
"umax v6.16b, v9.16b, v8.16b",
|
||||
"2665282E",
|
||||
"umax v6.8b, v9.8b, v8.8b",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
@@ -3805,6 +3805,28 @@ fn test_aarch64_binemit() {
|
||||
"tbx v3.16b, { v11.16b, v12.16b }, v19.16b",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::VecLoadReplicate {
|
||||
rd: writable_vreg(31),
|
||||
rn: xreg(0),
|
||||
srcloc: None,
|
||||
size: VectorSize::Size64x2,
|
||||
},
|
||||
"1FCC404D",
|
||||
"ld1r { v31.2d }, [x0]",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::VecLoadReplicate {
|
||||
rd: writable_vreg(0),
|
||||
rn: xreg(25),
|
||||
srcloc: None,
|
||||
size: VectorSize::Size8x8,
|
||||
},
|
||||
"20C3400D",
|
||||
"ld1r { v0.8b }, [x25]",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::Extend {
|
||||
rd: writable_xreg(1),
|
||||
|
||||
@@ -1021,6 +1021,14 @@ pub enum Inst {
|
||||
is_extension: bool,
|
||||
},
|
||||
|
||||
/// Load an element and replicate to all lanes of a vector.
|
||||
VecLoadReplicate {
|
||||
rd: Writable<Reg>,
|
||||
rn: Reg,
|
||||
size: VectorSize,
|
||||
srcloc: Option<SourceLoc>,
|
||||
},
|
||||
|
||||
/// Move to the NZCV flags (actually a `MSR NZCV, Xn` insn).
|
||||
MovToNZCV {
|
||||
rn: Reg,
|
||||
@@ -1664,7 +1672,10 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
|
||||
collector.add_def(rd);
|
||||
}
|
||||
}
|
||||
|
||||
&Inst::VecLoadReplicate { rd, rn, .. } => {
|
||||
collector.add_def(rd);
|
||||
collector.add_use(rn);
|
||||
}
|
||||
&Inst::FpuCmp32 { rn, rm } | &Inst::FpuCmp64 { rn, rm } => {
|
||||
collector.add_use(rn);
|
||||
collector.add_use(rm);
|
||||
@@ -1817,8 +1828,9 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
|
||||
&Inst::LoadExtName { rd, .. } => {
|
||||
collector.add_def(rd);
|
||||
}
|
||||
&Inst::LoadAddr { rd, mem: _ } => {
|
||||
&Inst::LoadAddr { rd, ref mem } => {
|
||||
collector.add_def(rd);
|
||||
memarg_regs(mem, collector);
|
||||
}
|
||||
&Inst::VirtualSPOffsetAdj { .. } => {}
|
||||
&Inst::EmitIsland { .. } => {}
|
||||
@@ -2262,6 +2274,14 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
|
||||
map_def(mapper, rd);
|
||||
}
|
||||
}
|
||||
&mut Inst::VecLoadReplicate {
|
||||
ref mut rd,
|
||||
ref mut rn,
|
||||
..
|
||||
} => {
|
||||
map_def(mapper, rd);
|
||||
map_use(mapper, rn);
|
||||
}
|
||||
&mut Inst::FpuCmp32 {
|
||||
ref mut rn,
|
||||
ref mut rm,
|
||||
@@ -3507,6 +3527,12 @@ impl Inst {
|
||||
let rm = show_vreg_vector(rm, mb_rru, VectorSize::Size8x16);
|
||||
format!("{} {}, {{ {}, {} }}, {}", op, rd, rn, rn2, rm)
|
||||
}
|
||||
&Inst::VecLoadReplicate { rd, rn, size, .. } => {
|
||||
let rd = show_vreg_vector(rd.to_reg(), mb_rru, size);
|
||||
let rn = rn.show_rru(mb_rru);
|
||||
|
||||
format!("ld1r {{ {} }}, [{}]", rd, rn)
|
||||
}
|
||||
&Inst::MovToNZCV { rn } => {
|
||||
let rn = rn.show_rru(mb_rru);
|
||||
format!("msr nzcv, {}", rn)
|
||||
|
||||
@@ -1197,6 +1197,29 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
||||
}
|
||||
}
|
||||
|
||||
Opcode::LoadSplat => {
|
||||
let off = ctx.data(insn).load_store_offset().unwrap();
|
||||
let ty = ty.unwrap();
|
||||
let mem = lower_address(ctx, ty.lane_type(), &inputs[..], off);
|
||||
let memflags = ctx.memflags(insn).expect("memory flags");
|
||||
let rd = get_output_reg(ctx, outputs[0]);
|
||||
let size = VectorSize::from_ty(ty);
|
||||
let srcloc = if memflags.notrap() {
|
||||
None
|
||||
} else {
|
||||
Some(ctx.srcloc(insn))
|
||||
};
|
||||
let tmp = ctx.alloc_tmp(RegClass::I64, I64);
|
||||
|
||||
ctx.emit(Inst::LoadAddr { rd: tmp, mem });
|
||||
ctx.emit(Inst::VecLoadReplicate {
|
||||
rd,
|
||||
rn: tmp.to_reg(),
|
||||
size,
|
||||
srcloc,
|
||||
});
|
||||
}
|
||||
|
||||
Opcode::Store
|
||||
| Opcode::Istore8
|
||||
| Opcode::Istore16
|
||||
|
||||
Reference in New Issue
Block a user