Merge pull request #2278 from akirilov-arm/load_splat

Introduce the Cranelift IR instruction `LoadSplat`
This commit is contained in:
Chris Fallin
2020-10-28 12:54:03 -07:00
committed by GitHub
13 changed files with 554 additions and 222 deletions

View File

@@ -680,4 +680,19 @@ impl VectorSize {
_ => *self,
}
}
/// Return the encoding bits that are used by some SIMD instructions
/// for a particular operand size.
pub fn enc_size(&self) -> (u32, u32) {
let q = self.is_128bits() as u32;
let size = match self.lane_size() {
ScalarSize::Size8 => 0b00,
ScalarSize::Size16 => 0b01,
ScalarSize::Size32 => 0b10,
ScalarSize::Size64 => 0b11,
_ => unreachable!(),
};
(q, size)
}
}

View File

@@ -248,6 +248,16 @@ fn enc_ldst_imm19(op_31_24: u32, imm19: u32, rd: Reg) -> u32 {
(op_31_24 << 24) | (imm19 << 5) | machreg_to_gpr_or_vec(rd)
}
fn enc_ldst_vec(q: u32, size: u32, rn: Reg, rt: Writable<Reg>) -> u32 {
debug_assert_eq!(q & 0b1, q);
debug_assert_eq!(size & 0b11, size);
0b0_0_0011010_10_00000_110_0_00_00000_00000
| q << 30
| size << 10
| machreg_to_gpr(rn) << 5
| machreg_to_vec(rt.to_reg())
}
fn enc_extend(top22: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
(top22 << 10) | (machreg_to_gpr(rn) << 5) | machreg_to_gpr(rd.to_reg())
}
@@ -1381,14 +1391,7 @@ impl MachInstEmit for Inst {
sink.put4(enc_fpurrrr(top17, rd, rn, rm, ra));
}
&Inst::VecMisc { op, rd, rn, size } => {
let enc_size = match size.lane_size() {
ScalarSize::Size8 => 0b00,
ScalarSize::Size16 => 0b01,
ScalarSize::Size32 => 0b10,
ScalarSize::Size64 => 0b11,
_ => unreachable!(),
};
let q = if size.is_128bits() { 1 } else { 0 };
let (q, enc_size) = size.enc_size();
let (u, bits_12_16, size) = match op {
VecMisc2::Not => (0b1, 0b00101, 0b00),
VecMisc2::Neg => (0b1, 0b01011, enc_size),
@@ -1831,13 +1834,7 @@ impl MachInstEmit for Inst {
alu_op,
size,
} => {
let enc_size = match size.lane_size() {
ScalarSize::Size8 => 0b00,
ScalarSize::Size16 => 0b01,
ScalarSize::Size32 => 0b10,
ScalarSize::Size64 => 0b11,
_ => unreachable!(),
};
let (q, enc_size) = size.enc_size();
let is_float = match alu_op {
VecALUOp::Fcmeq
| VecALUOp::Fcmgt
@@ -1851,6 +1848,7 @@ impl MachInstEmit for Inst {
_ => false,
};
let enc_float_size = match (is_float, size) {
(true, VectorSize::Size32x2) => 0b0,
(true, VectorSize::Size32x4) => 0b0,
(true, VectorSize::Size64x2) => 0b1,
(true, _) => unimplemented!(),
@@ -1858,46 +1856,46 @@ impl MachInstEmit for Inst {
};
let (top11, bit15_10) = match alu_op {
VecALUOp::Sqadd => (0b010_01110_00_1 | enc_size << 1, 0b000011),
VecALUOp::Sqsub => (0b010_01110_00_1 | enc_size << 1, 0b001011),
VecALUOp::Uqadd => (0b011_01110_00_1 | enc_size << 1, 0b000011),
VecALUOp::Uqsub => (0b011_01110_00_1 | enc_size << 1, 0b001011),
VecALUOp::Cmeq => (0b011_01110_00_1 | enc_size << 1, 0b100011),
VecALUOp::Cmge => (0b010_01110_00_1 | enc_size << 1, 0b001111),
VecALUOp::Cmgt => (0b010_01110_00_1 | enc_size << 1, 0b001101),
VecALUOp::Cmhi => (0b011_01110_00_1 | enc_size << 1, 0b001101),
VecALUOp::Cmhs => (0b011_01110_00_1 | enc_size << 1, 0b001111),
VecALUOp::Fcmeq => (0b010_01110_00_1, 0b111001),
VecALUOp::Fcmgt => (0b011_01110_10_1, 0b111001),
VecALUOp::Fcmge => (0b011_01110_00_1, 0b111001),
VecALUOp::Sqadd => (0b000_01110_00_1 | enc_size << 1, 0b000011),
VecALUOp::Sqsub => (0b000_01110_00_1 | enc_size << 1, 0b001011),
VecALUOp::Uqadd => (0b001_01110_00_1 | enc_size << 1, 0b000011),
VecALUOp::Uqsub => (0b001_01110_00_1 | enc_size << 1, 0b001011),
VecALUOp::Cmeq => (0b001_01110_00_1 | enc_size << 1, 0b100011),
VecALUOp::Cmge => (0b000_01110_00_1 | enc_size << 1, 0b001111),
VecALUOp::Cmgt => (0b000_01110_00_1 | enc_size << 1, 0b001101),
VecALUOp::Cmhi => (0b001_01110_00_1 | enc_size << 1, 0b001101),
VecALUOp::Cmhs => (0b001_01110_00_1 | enc_size << 1, 0b001111),
VecALUOp::Fcmeq => (0b000_01110_00_1, 0b111001),
VecALUOp::Fcmgt => (0b001_01110_10_1, 0b111001),
VecALUOp::Fcmge => (0b001_01110_00_1, 0b111001),
// The following logical instructions operate on bytes, so are not encoded differently
// for the different vector types.
VecALUOp::And => (0b010_01110_00_1, 0b000111),
VecALUOp::Bic => (0b010_01110_01_1, 0b000111),
VecALUOp::Orr => (0b010_01110_10_1, 0b000111),
VecALUOp::Eor => (0b011_01110_00_1, 0b000111),
VecALUOp::Bsl => (0b011_01110_01_1, 0b000111),
VecALUOp::Umaxp => (0b011_01110_00_1 | enc_size << 1, 0b101001),
VecALUOp::Add => (0b010_01110_00_1 | enc_size << 1, 0b100001),
VecALUOp::Sub => (0b011_01110_00_1 | enc_size << 1, 0b100001),
VecALUOp::And => (0b000_01110_00_1, 0b000111),
VecALUOp::Bic => (0b000_01110_01_1, 0b000111),
VecALUOp::Orr => (0b000_01110_10_1, 0b000111),
VecALUOp::Eor => (0b001_01110_00_1, 0b000111),
VecALUOp::Bsl => (0b001_01110_01_1, 0b000111),
VecALUOp::Umaxp => (0b001_01110_00_1 | enc_size << 1, 0b101001),
VecALUOp::Add => (0b000_01110_00_1 | enc_size << 1, 0b100001),
VecALUOp::Sub => (0b001_01110_00_1 | enc_size << 1, 0b100001),
VecALUOp::Mul => {
debug_assert_ne!(size, VectorSize::Size64x2);
(0b010_01110_00_1 | enc_size << 1, 0b100111)
(0b000_01110_00_1 | enc_size << 1, 0b100111)
}
VecALUOp::Sshl => (0b010_01110_00_1 | enc_size << 1, 0b010001),
VecALUOp::Ushl => (0b011_01110_00_1 | enc_size << 1, 0b010001),
VecALUOp::Umin => (0b011_01110_00_1 | enc_size << 1, 0b011011),
VecALUOp::Smin => (0b010_01110_00_1 | enc_size << 1, 0b011011),
VecALUOp::Umax => (0b011_01110_00_1 | enc_size << 1, 0b011001),
VecALUOp::Smax => (0b010_01110_00_1 | enc_size << 1, 0b011001),
VecALUOp::Urhadd => (0b011_01110_00_1 | enc_size << 1, 0b000101),
VecALUOp::Fadd => (0b010_01110_00_1, 0b110101),
VecALUOp::Fsub => (0b010_01110_10_1, 0b110101),
VecALUOp::Fdiv => (0b011_01110_00_1, 0b111111),
VecALUOp::Fmax => (0b010_01110_00_1, 0b111101),
VecALUOp::Fmin => (0b010_01110_10_1, 0b111101),
VecALUOp::Fmul => (0b011_01110_00_1, 0b110111),
VecALUOp::Addp => (0b010_01110_00_1 | enc_size << 1, 0b101111),
VecALUOp::Sshl => (0b000_01110_00_1 | enc_size << 1, 0b010001),
VecALUOp::Ushl => (0b001_01110_00_1 | enc_size << 1, 0b010001),
VecALUOp::Umin => (0b001_01110_00_1 | enc_size << 1, 0b011011),
VecALUOp::Smin => (0b000_01110_00_1 | enc_size << 1, 0b011011),
VecALUOp::Umax => (0b001_01110_00_1 | enc_size << 1, 0b011001),
VecALUOp::Smax => (0b000_01110_00_1 | enc_size << 1, 0b011001),
VecALUOp::Urhadd => (0b001_01110_00_1 | enc_size << 1, 0b000101),
VecALUOp::Fadd => (0b000_01110_00_1, 0b110101),
VecALUOp::Fsub => (0b000_01110_10_1, 0b110101),
VecALUOp::Fdiv => (0b001_01110_00_1, 0b111111),
VecALUOp::Fmax => (0b000_01110_00_1, 0b111101),
VecALUOp::Fmin => (0b000_01110_10_1, 0b111101),
VecALUOp::Fmul => (0b001_01110_00_1, 0b110111),
VecALUOp::Addp => (0b000_01110_00_1 | enc_size << 1, 0b101111),
VecALUOp::Umlal => {
debug_assert!(!size.is_128bits());
(0b001_01110_00_1 | enc_size << 1, 0b100000)
@@ -1905,12 +1903,27 @@ impl MachInstEmit for Inst {
VecALUOp::Zip1 => (0b01001110_00_0 | enc_size << 1, 0b001110),
};
let top11 = if is_float {
top11 | enc_float_size << 1
top11 | (q << 9) | enc_float_size << 1
} else {
top11
top11 | (q << 9)
};
sink.put4(enc_vec_rrr(top11, rm, bit15_10, rn, rd));
}
&Inst::VecLoadReplicate {
rd,
rn,
size,
srcloc,
} => {
let (q, size) = size.enc_size();
if let Some(srcloc) = srcloc {
// Register the offset at which the actual load instruction starts.
sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
}
sink.put4(enc_ldst_vec(q, size, rn, rd));
}
&Inst::MovToNZCV { rn } => {
sink.put4(0xd51b4200 | machreg_to_gpr(rn));
}
@@ -2195,9 +2208,12 @@ impl MachInstEmit for Inst {
inst.emit(sink, emit_info, state);
}
let (reg, offset) = match mem {
AMode::Unscaled(r, simm9) => (r, simm9.value()),
AMode::UnsignedOffset(r, uimm12scaled) => (r, uimm12scaled.value() as i32),
let (reg, index_reg, offset) = match mem {
AMode::RegExtended(r, idx, extendop) => (r, Some((idx, extendop)), 0),
AMode::Unscaled(r, simm9) => (r, None, simm9.value()),
AMode::UnsignedOffset(r, uimm12scaled) => {
(r, None, uimm12scaled.value() as i32)
}
_ => panic!("Unsupported case for LoadAddr: {:?}", mem),
};
let abs_offset = if offset < 0 {
@@ -2211,9 +2227,22 @@ impl MachInstEmit for Inst {
ALUOp::Add64
};
if offset == 0 {
let mov = Inst::mov(rd, reg);
mov.emit(sink, emit_info, state);
if let Some((idx, extendop)) = index_reg {
let add = Inst::AluRRRExtend {
alu_op: ALUOp::Add64,
rd,
rn: reg,
rm: idx,
extendop,
};
add.emit(sink, emit_info, state);
} else if offset == 0 {
if reg != rd.to_reg() {
let mov = Inst::mov(rd, reg);
mov.emit(sink, emit_info, state);
}
} else if let Some(imm12) = Imm12::maybe_from_u64(abs_offset) {
let add = Inst::AluRRImm12 {
alu_op,

View File

@@ -2533,10 +2533,10 @@ fn test_aarch64_binemit() {
rd: writable_vreg(28),
rn: vreg(12),
rm: vreg(4),
size: VectorSize::Size32x4,
size: VectorSize::Size32x2,
},
"9CE5244E",
"fcmeq v28.4s, v12.4s, v4.4s",
"9CE5240E",
"fcmeq v28.2s, v12.2s, v4.2s",
));
insns.push((
@@ -2965,10 +2965,10 @@ fn test_aarch64_binemit() {
rd: writable_vreg(6),
rn: vreg(9),
rm: vreg(8),
size: VectorSize::Size8x16,
size: VectorSize::Size8x8,
},
"2665286E",
"umax v6.16b, v9.16b, v8.16b",
"2665282E",
"umax v6.8b, v9.8b, v8.8b",
));
insns.push((
@@ -3805,6 +3805,28 @@ fn test_aarch64_binemit() {
"tbx v3.16b, { v11.16b, v12.16b }, v19.16b",
));
insns.push((
Inst::VecLoadReplicate {
rd: writable_vreg(31),
rn: xreg(0),
srcloc: None,
size: VectorSize::Size64x2,
},
"1FCC404D",
"ld1r { v31.2d }, [x0]",
));
insns.push((
Inst::VecLoadReplicate {
rd: writable_vreg(0),
rn: xreg(25),
srcloc: None,
size: VectorSize::Size8x8,
},
"20C3400D",
"ld1r { v0.8b }, [x25]",
));
insns.push((
Inst::Extend {
rd: writable_xreg(1),

View File

@@ -1021,6 +1021,14 @@ pub enum Inst {
is_extension: bool,
},
/// Load an element and replicate to all lanes of a vector.
VecLoadReplicate {
rd: Writable<Reg>,
rn: Reg,
size: VectorSize,
srcloc: Option<SourceLoc>,
},
/// Move to the NZCV flags (actually a `MSR NZCV, Xn` insn).
MovToNZCV {
rn: Reg,
@@ -1664,7 +1672,10 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
collector.add_def(rd);
}
}
&Inst::VecLoadReplicate { rd, rn, .. } => {
collector.add_def(rd);
collector.add_use(rn);
}
&Inst::FpuCmp32 { rn, rm } | &Inst::FpuCmp64 { rn, rm } => {
collector.add_use(rn);
collector.add_use(rm);
@@ -1817,8 +1828,9 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
&Inst::LoadExtName { rd, .. } => {
collector.add_def(rd);
}
&Inst::LoadAddr { rd, mem: _ } => {
&Inst::LoadAddr { rd, ref mem } => {
collector.add_def(rd);
memarg_regs(mem, collector);
}
&Inst::VirtualSPOffsetAdj { .. } => {}
&Inst::EmitIsland { .. } => {}
@@ -2262,6 +2274,14 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
map_def(mapper, rd);
}
}
&mut Inst::VecLoadReplicate {
ref mut rd,
ref mut rn,
..
} => {
map_def(mapper, rd);
map_use(mapper, rn);
}
&mut Inst::FpuCmp32 {
ref mut rn,
ref mut rm,
@@ -3507,6 +3527,12 @@ impl Inst {
let rm = show_vreg_vector(rm, mb_rru, VectorSize::Size8x16);
format!("{} {}, {{ {}, {} }}, {}", op, rd, rn, rn2, rm)
}
&Inst::VecLoadReplicate { rd, rn, size, .. } => {
let rd = show_vreg_vector(rd.to_reg(), mb_rru, size);
let rn = rn.show_rru(mb_rru);
format!("ld1r {{ {} }}, [{}]", rd, rn)
}
&Inst::MovToNZCV { rn } => {
let rn = rn.show_rru(mb_rru);
format!("msr nzcv, {}", rn)

View File

@@ -1197,6 +1197,29 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
}
}
Opcode::LoadSplat => {
let off = ctx.data(insn).load_store_offset().unwrap();
let ty = ty.unwrap();
let mem = lower_address(ctx, ty.lane_type(), &inputs[..], off);
let memflags = ctx.memflags(insn).expect("memory flags");
let rd = get_output_reg(ctx, outputs[0]);
let size = VectorSize::from_ty(ty);
let srcloc = if memflags.notrap() {
None
} else {
Some(ctx.srcloc(insn))
};
let tmp = ctx.alloc_tmp(RegClass::I64, I64);
ctx.emit(Inst::LoadAddr { rd: tmp, mem });
ctx.emit(Inst::VecLoadReplicate {
rd,
rn: tmp.to_reg(),
size,
srcloc,
});
}
Opcode::Store
| Opcode::Istore8
| Opcode::Istore16