Merge pull request #2376 from cfallin/loadsplat
AArch64 SIMD: replace `LoadSplat` with pattern-matching on load+splat
This commit is contained in:
@@ -396,7 +396,6 @@ fn define_simd(
|
|||||||
let insertlane = insts.by_name("insertlane");
|
let insertlane = insts.by_name("insertlane");
|
||||||
let ishl = insts.by_name("ishl");
|
let ishl = insts.by_name("ishl");
|
||||||
let ishl_imm = insts.by_name("ishl_imm");
|
let ishl_imm = insts.by_name("ishl_imm");
|
||||||
let load_splat = insts.by_name("load_splat");
|
|
||||||
let raw_bitcast = insts.by_name("raw_bitcast");
|
let raw_bitcast = insts.by_name("raw_bitcast");
|
||||||
let scalar_to_vector = insts.by_name("scalar_to_vector");
|
let scalar_to_vector = insts.by_name("scalar_to_vector");
|
||||||
let splat = insts.by_name("splat");
|
let splat = insts.by_name("splat");
|
||||||
@@ -821,7 +820,6 @@ fn define_simd(
|
|||||||
narrow.custom_legalize(fcvt_to_sint_sat, "expand_fcvt_to_sint_sat_vector");
|
narrow.custom_legalize(fcvt_to_sint_sat, "expand_fcvt_to_sint_sat_vector");
|
||||||
narrow.custom_legalize(fmin, "expand_minmax_vector");
|
narrow.custom_legalize(fmin, "expand_minmax_vector");
|
||||||
narrow.custom_legalize(fmax, "expand_minmax_vector");
|
narrow.custom_legalize(fmax, "expand_minmax_vector");
|
||||||
narrow.custom_legalize(load_splat, "expand_load_splat");
|
|
||||||
|
|
||||||
narrow_avx.custom_legalize(imul, "convert_i64x2_imul");
|
narrow_avx.custom_legalize(imul, "convert_i64x2_imul");
|
||||||
narrow_avx.custom_legalize(fcvt_from_uint, "expand_fcvt_from_uint_vector");
|
narrow_avx.custom_legalize(fcvt_from_uint, "expand_fcvt_from_uint_vector");
|
||||||
|
|||||||
@@ -4491,24 +4491,5 @@ pub(crate) fn define(
|
|||||||
.other_side_effects(true),
|
.other_side_effects(true),
|
||||||
);
|
);
|
||||||
|
|
||||||
let Offset = &Operand::new("Offset", &imm.offset32).with_doc("Byte offset from base address");
|
|
||||||
let a = &Operand::new("a", TxN);
|
|
||||||
|
|
||||||
ig.push(
|
|
||||||
Inst::new(
|
|
||||||
"load_splat",
|
|
||||||
r#"
|
|
||||||
Load an element from memory at ``p + Offset`` and return a vector
|
|
||||||
whose lanes are all set to that element.
|
|
||||||
|
|
||||||
This is equivalent to ``load`` followed by ``splat``.
|
|
||||||
"#,
|
|
||||||
&formats.load,
|
|
||||||
)
|
|
||||||
.operands_in(vec![MemFlags, p, Offset])
|
|
||||||
.operands_out(vec![a])
|
|
||||||
.can_load(true),
|
|
||||||
);
|
|
||||||
|
|
||||||
ig.build()
|
ig.build()
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -209,6 +209,19 @@ impl AMode {
|
|||||||
pub fn label(label: MemLabel) -> AMode {
|
pub fn label(label: MemLabel) -> AMode {
|
||||||
AMode::Label(label)
|
AMode::Label(label)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Does the address resolve to just a register value, with no offset or
|
||||||
|
/// other computation?
///
/// Returns `Some(reg)` for the zero-offset forms matched below: the base
/// register of `UnsignedOffset`, `Unscaled` and `RegOffset`, the register
/// returned by `fp_reg()` for `FPOffset`, and the register returned by
/// `stack_reg()` for `SPOffset`. Returns `None` for everything else.
|
||||||
|
pub fn is_reg(&self) -> Option<Reg> {
|
||||||
|
match self {
|
||||||
|
&AMode::UnsignedOffset(r, uimm12) if uimm12.value() == 0 => Some(r),
|
||||||
|
&AMode::Unscaled(r, imm9) if imm9.value() == 0 => Some(r),
|
||||||
|
&AMode::RegOffset(r, off, _) if off == 0 => Some(r),
|
||||||
|
&AMode::FPOffset(off, _) if off == 0 => Some(fp_reg()),
|
||||||
|
&AMode::SPOffset(off, _) if off == 0 => Some(stack_reg()),
|
||||||
|
// Nonzero offsets and all remaining addressing modes are not a bare register.
_ => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// A memory argument to a load/store-pair.
|
/// A memory argument to a load/store-pair.
|
||||||
|
|||||||
@@ -1463,6 +1463,17 @@ impl Inst {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Generate a LoadAddr instruction (load address of an amode into a
|
||||||
|
/// register), eliding it when `mem.is_reg()` reports that the amode is
|
||||||
|
/// already just a register.
///
/// Returns a pair `(addr, inst)`:
/// - `addr` is the register holding the address: either `rd` or a register
///   taken directly from the amode.
/// - `inst` is `Some(Inst::LoadAddr { rd, mem })` when an instruction is
///   needed, and `None` when it was elided. Nothing is emitted here; the
///   instruction is only constructed and handed back to the caller.
|
||||||
|
pub fn gen_load_addr(rd: Writable<Reg>, mem: AMode) -> (Reg, Option<Inst>) {
|
||||||
|
if let Some(r) = mem.is_reg() {
|
||||||
|
// The amode is a bare register: reuse it and leave `rd` unwritten.
(r, None)
|
||||||
|
} else {
|
||||||
|
(rd.to_reg(), Some(Inst::LoadAddr { rd, mem }))
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//=============================================================================
|
//=============================================================================
|
||||||
|
|||||||
@@ -1169,6 +1169,59 @@ pub(crate) fn normalize_bool_result<C: LowerCtx<I = Inst>>(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns `true` only when `ty` is one of `I8`, `I16`, `I32` or `I64`.
///
/// This is target-word-size dependent, and it excludes booleans and reftypes.
|
||||||
|
pub(crate) fn is_valid_atomic_transaction_ty(ty: Type) -> bool {
|
||||||
|
match ty {
|
||||||
|
I8 | I16 | I32 | I64 => true,
|
||||||
|
_ => false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Map a load opcode to the type implied by the opcode itself: `I8`, `I16`
/// or `I32` for the 8/16/32-bit extending loads, and `I8X8`, `I16X4` or
/// `I32X2` for the widening vector loads (plain and `Complex` variants alike).
///
/// Returns `None` when the opcode does not determine a type: `Load`,
/// `LoadComplex`, and any opcode not listed here. `lower_load` then falls
/// back to the type of the instruction's first result.
fn load_op_to_ty(op: Opcode) -> Option<Type> {
|
||||||
|
match op {
|
||||||
|
Opcode::Sload8 | Opcode::Uload8 | Opcode::Sload8Complex | Opcode::Uload8Complex => Some(I8),
|
||||||
|
Opcode::Sload16 | Opcode::Uload16 | Opcode::Sload16Complex | Opcode::Uload16Complex => {
|
||||||
|
Some(I16)
|
||||||
|
}
|
||||||
|
Opcode::Sload32 | Opcode::Uload32 | Opcode::Sload32Complex | Opcode::Uload32Complex => {
|
||||||
|
Some(I32)
|
||||||
|
}
|
||||||
|
// Plain loads carry no type in the opcode; the caller supplies it.
Opcode::Load | Opcode::LoadComplex => None,
|
||||||
|
Opcode::Sload8x8 | Opcode::Uload8x8 | Opcode::Sload8x8Complex | Opcode::Uload8x8Complex => {
|
||||||
|
Some(I8X8)
|
||||||
|
}
|
||||||
|
Opcode::Sload16x4
|
||||||
|
| Opcode::Uload16x4
|
||||||
|
| Opcode::Sload16x4Complex
|
||||||
|
| Opcode::Uload16x4Complex => Some(I16X4),
|
||||||
|
Opcode::Sload32x2
|
||||||
|
| Opcode::Uload32x2
|
||||||
|
| Opcode::Sload32x2Complex
|
||||||
|
| Opcode::Uload32x2Complex => Some(I32X2),
|
||||||
|
_ => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Helper to lower a load instruction; this is used in several places, because
|
||||||
|
/// a load can sometimes be merged into another operation.
///
/// The helper works out the element type, the addressing mode and the output
/// register of the load, then calls `f` exactly once with
/// `(ctx, rd, elem_ty, mem)`. The choice of machine instruction is left
/// entirely to `f`.
///
/// - `ir_inst`: the IR load instruction being lowered.
/// - `inputs`: the inputs of `ir_inst`, passed on to `lower_address`.
/// - `output`: the output slot whose register is passed to `f` as `rd`.
///
/// Panics if `ir_inst` has no load/store offset (the `unwrap` below).
|
||||||
|
pub(crate) fn lower_load<C: LowerCtx<I = Inst>, F: FnMut(&mut C, Writable<Reg>, Type, AMode)>(
|
||||||
|
ctx: &mut C,
|
||||||
|
ir_inst: IRInst,
|
||||||
|
inputs: &[InsnInput],
|
||||||
|
output: InsnOutput,
|
||||||
|
mut f: F,
|
||||||
|
) {
|
||||||
|
let op = ctx.data(ir_inst).opcode();
|
||||||
|
|
||||||
|
// A type fixed by the opcode wins; otherwise use the type of result 0.
let elem_ty = load_op_to_ty(op).unwrap_or_else(|| ctx.output_ty(ir_inst, 0));
|
||||||
|
|
||||||
|
let off = ctx.data(ir_inst).load_store_offset().unwrap();
|
||||||
|
let mem = lower_address(ctx, elem_ty, &inputs[..], off);
|
||||||
|
let rd = get_output_reg(ctx, output);
|
||||||
|
|
||||||
|
// The caller-supplied closure decides which instruction(s) to emit.
f(ctx, rd, elem_ty, mem);
|
||||||
|
}
|
||||||
|
|
||||||
//=============================================================================
|
//=============================================================================
|
||||||
// Lowering-backend trait implementation.
|
// Lowering-backend trait implementation.
|
||||||
|
|
||||||
|
|||||||
@@ -17,30 +17,17 @@ use regalloc::{RegClass, Writable};
|
|||||||
use alloc::boxed::Box;
|
use alloc::boxed::Box;
|
||||||
use alloc::vec::Vec;
|
use alloc::vec::Vec;
|
||||||
use core::convert::TryFrom;
|
use core::convert::TryFrom;
|
||||||
use smallvec::SmallVec;
|
|
||||||
|
|
||||||
use super::lower::*;
|
use super::lower::*;
|
||||||
|
|
||||||
/// This is target-word-size dependent. And it excludes booleans and reftypes.
|
|
||||||
fn is_valid_atomic_transaction_ty(ty: Type) -> bool {
|
|
||||||
match ty {
|
|
||||||
I8 | I16 | I32 | I64 => true,
|
|
||||||
_ => false,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Actually codegen an instruction's results into registers.
|
/// Actually codegen an instruction's results into registers.
|
||||||
pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
||||||
ctx: &mut C,
|
ctx: &mut C,
|
||||||
insn: IRInst,
|
insn: IRInst,
|
||||||
) -> CodegenResult<()> {
|
) -> CodegenResult<()> {
|
||||||
let op = ctx.data(insn).opcode();
|
let op = ctx.data(insn).opcode();
|
||||||
let inputs: SmallVec<[InsnInput; 4]> = (0..ctx.num_inputs(insn))
|
let inputs = insn_inputs(ctx, insn);
|
||||||
.map(|i| InsnInput { insn, input: i })
|
let outputs = insn_outputs(ctx, insn);
|
||||||
.collect();
|
|
||||||
let outputs: SmallVec<[InsnOutput; 2]> = (0..ctx.num_outputs(insn))
|
|
||||||
.map(|i| InsnOutput { insn, output: i })
|
|
||||||
.collect();
|
|
||||||
let ty = if outputs.len() > 0 {
|
let ty = if outputs.len() > 0 {
|
||||||
Some(ctx.output_ty(insn, 0))
|
Some(ctx.output_ty(insn, 0))
|
||||||
} else {
|
} else {
|
||||||
@@ -1134,34 +1121,6 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
| Opcode::Sload16x4Complex
|
| Opcode::Sload16x4Complex
|
||||||
| Opcode::Uload32x2Complex
|
| Opcode::Uload32x2Complex
|
||||||
| Opcode::Sload32x2Complex => {
|
| Opcode::Sload32x2Complex => {
|
||||||
let off = ctx.data(insn).load_store_offset().unwrap();
|
|
||||||
let elem_ty = match op {
|
|
||||||
Opcode::Sload8 | Opcode::Uload8 | Opcode::Sload8Complex | Opcode::Uload8Complex => {
|
|
||||||
I8
|
|
||||||
}
|
|
||||||
Opcode::Sload16
|
|
||||||
| Opcode::Uload16
|
|
||||||
| Opcode::Sload16Complex
|
|
||||||
| Opcode::Uload16Complex => I16,
|
|
||||||
Opcode::Sload32
|
|
||||||
| Opcode::Uload32
|
|
||||||
| Opcode::Sload32Complex
|
|
||||||
| Opcode::Uload32Complex => I32,
|
|
||||||
Opcode::Load | Opcode::LoadComplex => ctx.output_ty(insn, 0),
|
|
||||||
Opcode::Sload8x8
|
|
||||||
| Opcode::Uload8x8
|
|
||||||
| Opcode::Sload8x8Complex
|
|
||||||
| Opcode::Uload8x8Complex => I8X8,
|
|
||||||
Opcode::Sload16x4
|
|
||||||
| Opcode::Uload16x4
|
|
||||||
| Opcode::Sload16x4Complex
|
|
||||||
| Opcode::Uload16x4Complex => I16X4,
|
|
||||||
Opcode::Sload32x2
|
|
||||||
| Opcode::Uload32x2
|
|
||||||
| Opcode::Sload32x2Complex
|
|
||||||
| Opcode::Uload32x2Complex => I32X2,
|
|
||||||
_ => unreachable!(),
|
|
||||||
};
|
|
||||||
let sign_extend = match op {
|
let sign_extend = match op {
|
||||||
Opcode::Sload8
|
Opcode::Sload8
|
||||||
| Opcode::Sload8Complex
|
| Opcode::Sload8Complex
|
||||||
@@ -1171,68 +1130,57 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
| Opcode::Sload32Complex => true,
|
| Opcode::Sload32Complex => true,
|
||||||
_ => false,
|
_ => false,
|
||||||
};
|
};
|
||||||
let is_float = ty_has_float_or_vec_representation(elem_ty);
|
|
||||||
|
|
||||||
let mem = lower_address(ctx, elem_ty, &inputs[..], off);
|
lower_load(
|
||||||
let rd = get_output_reg(ctx, outputs[0]);
|
ctx,
|
||||||
|
insn,
|
||||||
|
&inputs[..],
|
||||||
|
outputs[0],
|
||||||
|
|ctx, rd, elem_ty, mem| {
|
||||||
|
let is_float = ty_has_float_or_vec_representation(elem_ty);
|
||||||
|
ctx.emit(match (ty_bits(elem_ty), sign_extend, is_float) {
|
||||||
|
(1, _, _) => Inst::ULoad8 { rd, mem },
|
||||||
|
(8, false, _) => Inst::ULoad8 { rd, mem },
|
||||||
|
(8, true, _) => Inst::SLoad8 { rd, mem },
|
||||||
|
(16, false, _) => Inst::ULoad16 { rd, mem },
|
||||||
|
(16, true, _) => Inst::SLoad16 { rd, mem },
|
||||||
|
(32, false, false) => Inst::ULoad32 { rd, mem },
|
||||||
|
(32, true, false) => Inst::SLoad32 { rd, mem },
|
||||||
|
(32, _, true) => Inst::FpuLoad32 { rd, mem },
|
||||||
|
(64, _, false) => Inst::ULoad64 { rd, mem },
|
||||||
|
// Note that we treat some of the vector loads as scalar floating-point loads,
|
||||||
|
// which is correct in a little endian environment.
|
||||||
|
(64, _, true) => Inst::FpuLoad64 { rd, mem },
|
||||||
|
(128, _, _) => Inst::FpuLoad128 { rd, mem },
|
||||||
|
_ => panic!("Unsupported size in load"),
|
||||||
|
});
|
||||||
|
|
||||||
ctx.emit(match (ty_bits(elem_ty), sign_extend, is_float) {
|
let vec_extend = match op {
|
||||||
(1, _, _) => Inst::ULoad8 { rd, mem },
|
Opcode::Sload8x8 => Some(VecExtendOp::Sxtl8),
|
||||||
(8, false, _) => Inst::ULoad8 { rd, mem },
|
Opcode::Sload8x8Complex => Some(VecExtendOp::Sxtl8),
|
||||||
(8, true, _) => Inst::SLoad8 { rd, mem },
|
Opcode::Uload8x8 => Some(VecExtendOp::Uxtl8),
|
||||||
(16, false, _) => Inst::ULoad16 { rd, mem },
|
Opcode::Uload8x8Complex => Some(VecExtendOp::Uxtl8),
|
||||||
(16, true, _) => Inst::SLoad16 { rd, mem },
|
Opcode::Sload16x4 => Some(VecExtendOp::Sxtl16),
|
||||||
(32, false, false) => Inst::ULoad32 { rd, mem },
|
Opcode::Sload16x4Complex => Some(VecExtendOp::Sxtl16),
|
||||||
(32, true, false) => Inst::SLoad32 { rd, mem },
|
Opcode::Uload16x4 => Some(VecExtendOp::Uxtl16),
|
||||||
(32, _, true) => Inst::FpuLoad32 { rd, mem },
|
Opcode::Uload16x4Complex => Some(VecExtendOp::Uxtl16),
|
||||||
(64, _, false) => Inst::ULoad64 { rd, mem },
|
Opcode::Sload32x2 => Some(VecExtendOp::Sxtl32),
|
||||||
// Note that we treat some of the vector loads as scalar floating-point loads,
|
Opcode::Sload32x2Complex => Some(VecExtendOp::Sxtl32),
|
||||||
// which is correct in a little endian environment.
|
Opcode::Uload32x2 => Some(VecExtendOp::Uxtl32),
|
||||||
(64, _, true) => Inst::FpuLoad64 { rd, mem },
|
Opcode::Uload32x2Complex => Some(VecExtendOp::Uxtl32),
|
||||||
(128, _, _) => Inst::FpuLoad128 { rd, mem },
|
_ => None,
|
||||||
_ => panic!("Unsupported size in load"),
|
};
|
||||||
});
|
|
||||||
|
|
||||||
let vec_extend = match op {
|
if let Some(t) = vec_extend {
|
||||||
Opcode::Sload8x8 => Some(VecExtendOp::Sxtl8),
|
ctx.emit(Inst::VecExtend {
|
||||||
Opcode::Sload8x8Complex => Some(VecExtendOp::Sxtl8),
|
t,
|
||||||
Opcode::Uload8x8 => Some(VecExtendOp::Uxtl8),
|
rd,
|
||||||
Opcode::Uload8x8Complex => Some(VecExtendOp::Uxtl8),
|
rn: rd.to_reg(),
|
||||||
Opcode::Sload16x4 => Some(VecExtendOp::Sxtl16),
|
high_half: false,
|
||||||
Opcode::Sload16x4Complex => Some(VecExtendOp::Sxtl16),
|
});
|
||||||
Opcode::Uload16x4 => Some(VecExtendOp::Uxtl16),
|
}
|
||||||
Opcode::Uload16x4Complex => Some(VecExtendOp::Uxtl16),
|
},
|
||||||
Opcode::Sload32x2 => Some(VecExtendOp::Sxtl32),
|
);
|
||||||
Opcode::Sload32x2Complex => Some(VecExtendOp::Sxtl32),
|
|
||||||
Opcode::Uload32x2 => Some(VecExtendOp::Uxtl32),
|
|
||||||
Opcode::Uload32x2Complex => Some(VecExtendOp::Uxtl32),
|
|
||||||
_ => None,
|
|
||||||
};
|
|
||||||
|
|
||||||
if let Some(t) = vec_extend {
|
|
||||||
ctx.emit(Inst::VecExtend {
|
|
||||||
t,
|
|
||||||
rd,
|
|
||||||
rn: rd.to_reg(),
|
|
||||||
high_half: false,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Opcode::LoadSplat => {
|
|
||||||
let off = ctx.data(insn).load_store_offset().unwrap();
|
|
||||||
let ty = ty.unwrap();
|
|
||||||
let mem = lower_address(ctx, ty.lane_type(), &inputs[..], off);
|
|
||||||
let rd = get_output_reg(ctx, outputs[0]);
|
|
||||||
let size = VectorSize::from_ty(ty);
|
|
||||||
let tmp = ctx.alloc_tmp(RegClass::I64, I64);
|
|
||||||
|
|
||||||
ctx.emit(Inst::LoadAddr { rd: tmp, mem });
|
|
||||||
ctx.emit(Inst::VecLoadReplicate {
|
|
||||||
rd,
|
|
||||||
rn: tmp.to_reg(),
|
|
||||||
size,
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Opcode::Store
|
Opcode::Store
|
||||||
@@ -2026,6 +1974,36 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
maybe_input_insn_via_conv(ctx, inputs[0], Opcode::Bconst, Opcode::Breduce)
|
maybe_input_insn_via_conv(ctx, inputs[0], Opcode::Bconst, Opcode::Breduce)
|
||||||
{
|
{
|
||||||
lower_splat_const(ctx, rd, ctx.get_constant(insn).unwrap(), size);
|
lower_splat_const(ctx, rd, ctx.get_constant(insn).unwrap(), size);
|
||||||
|
} else if let Some((_, insn)) = maybe_input_insn_multi(
|
||||||
|
ctx,
|
||||||
|
inputs[0],
|
||||||
|
&[
|
||||||
|
Opcode::Uload8,
|
||||||
|
Opcode::Sload8,
|
||||||
|
Opcode::Uload16,
|
||||||
|
Opcode::Sload16,
|
||||||
|
Opcode::Uload32,
|
||||||
|
Opcode::Sload32,
|
||||||
|
Opcode::Load,
|
||||||
|
],
|
||||||
|
) {
|
||||||
|
ctx.sink_inst(insn);
|
||||||
|
let load_inputs = insn_inputs(ctx, insn);
|
||||||
|
let load_outputs = insn_outputs(ctx, insn);
|
||||||
|
lower_load(
|
||||||
|
ctx,
|
||||||
|
insn,
|
||||||
|
&load_inputs[..],
|
||||||
|
load_outputs[0],
|
||||||
|
|ctx, _rd, _elem_ty, mem| {
|
||||||
|
let tmp = ctx.alloc_tmp(RegClass::I64, I64);
|
||||||
|
let (addr, addr_inst) = Inst::gen_load_addr(tmp, mem);
|
||||||
|
if let Some(addr_inst) = addr_inst {
|
||||||
|
ctx.emit(addr_inst);
|
||||||
|
}
|
||||||
|
ctx.emit(Inst::VecLoadReplicate { rd, rn: addr, size });
|
||||||
|
},
|
||||||
|
);
|
||||||
} else {
|
} else {
|
||||||
let input_ty = ctx.input_ty(insn, 0);
|
let input_ty = ctx.input_ty(insn, 0);
|
||||||
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
|
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
|
||||||
|
|||||||
@@ -3661,21 +3661,13 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
emit_extract_lane(ctx, src, dst, lane, ty);
|
emit_extract_lane(ctx, src, dst, lane, ty);
|
||||||
}
|
}
|
||||||
|
|
||||||
Opcode::Splat | Opcode::LoadSplat => {
|
Opcode::Splat => {
|
||||||
let ty = ty.unwrap();
|
let ty = ty.unwrap();
|
||||||
assert_eq!(ty.bits(), 128);
|
assert_eq!(ty.bits(), 128);
|
||||||
let src_ty = ctx.input_ty(insn, 0);
|
let src_ty = ctx.input_ty(insn, 0);
|
||||||
assert!(src_ty.bits() < 128);
|
assert!(src_ty.bits() < 128);
|
||||||
|
|
||||||
let src = match op {
|
let src = input_to_reg_mem(ctx, inputs[0]);
|
||||||
Opcode::Splat => input_to_reg_mem(ctx, inputs[0]),
|
|
||||||
Opcode::LoadSplat => {
|
|
||||||
let offset = ctx.data(insn).load_store_offset().unwrap();
|
|
||||||
let amode = lower_to_amode(ctx, inputs[0], offset);
|
|
||||||
RegMem::mem(amode)
|
|
||||||
}
|
|
||||||
_ => unreachable!(),
|
|
||||||
};
|
|
||||||
let dst = get_output_reg(ctx, outputs[0]);
|
let dst = get_output_reg(ctx, outputs[0]);
|
||||||
|
|
||||||
// We know that splat will overwrite all of the lanes of `dst` but it takes several
|
// We know that splat will overwrite all of the lanes of `dst` but it takes several
|
||||||
|
|||||||
@@ -1892,31 +1892,3 @@ fn expand_tls_value(
|
|||||||
unreachable!();
|
unreachable!();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn expand_load_splat(
|
|
||||||
inst: ir::Inst,
|
|
||||||
func: &mut ir::Function,
|
|
||||||
_cfg: &mut ControlFlowGraph,
|
|
||||||
_isa: &dyn TargetIsa,
|
|
||||||
) {
|
|
||||||
let mut pos = FuncCursor::new(func).at_inst(inst);
|
|
||||||
|
|
||||||
pos.use_srcloc(inst);
|
|
||||||
|
|
||||||
let (ptr, offset, flags) = match pos.func.dfg[inst] {
|
|
||||||
ir::InstructionData::Load {
|
|
||||||
opcode: ir::Opcode::LoadSplat,
|
|
||||||
arg,
|
|
||||||
offset,
|
|
||||||
flags,
|
|
||||||
} => (arg, offset, flags),
|
|
||||||
_ => panic!(
|
|
||||||
"Expected load_splat: {}",
|
|
||||||
pos.func.dfg.display_inst(inst, None)
|
|
||||||
),
|
|
||||||
};
|
|
||||||
let ty = pos.func.dfg.ctrl_typevar(inst);
|
|
||||||
let load = pos.ins().load(ty.lane_type(), flags, ptr, offset);
|
|
||||||
|
|
||||||
pos.func.dfg.replace(inst).splat(ty, load);
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -1,6 +1,8 @@
|
|||||||
//! A place to park MachInst::Inst fragments which are common across multiple architectures.
|
//! A place to park MachInst::Inst fragments which are common across multiple architectures.
|
||||||
|
|
||||||
|
use super::{LowerCtx, VCodeInst};
|
||||||
use crate::ir::{self, Inst as IRInst};
|
use crate::ir::{self, Inst as IRInst};
|
||||||
|
use smallvec::SmallVec;
|
||||||
|
|
||||||
//============================================================================
|
//============================================================================
|
||||||
// Instruction input "slots".
|
// Instruction input "slots".
|
||||||
@@ -22,6 +24,24 @@ pub(crate) struct InsnOutput {
|
|||||||
pub(crate) output: usize,
|
pub(crate) output: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Build the list of input slots of `insn`: one `InsnInput { insn, input: i }`
/// for each `i` in `0..ctx.num_inputs(insn)`, in increasing index order.
pub(crate) fn insn_inputs<I: VCodeInst, C: LowerCtx<I = I>>(
|
||||||
|
ctx: &C,
|
||||||
|
insn: IRInst,
|
||||||
|
) -> SmallVec<[InsnInput; 4]> {
|
||||||
|
(0..ctx.num_inputs(insn))
|
||||||
|
.map(|i| InsnInput { insn, input: i })
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Build the list of output slots of `insn`: one
/// `InsnOutput { insn, output: i }` for each `i` in
/// `0..ctx.num_outputs(insn)`, in increasing index order.
pub(crate) fn insn_outputs<I: VCodeInst, C: LowerCtx<I = I>>(
|
||||||
|
ctx: &C,
|
||||||
|
insn: IRInst,
|
||||||
|
) -> SmallVec<[InsnOutput; 4]> {
|
||||||
|
(0..ctx.num_outputs(insn))
|
||||||
|
.map(|i| InsnOutput { insn, output: i })
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
//============================================================================
|
//============================================================================
|
||||||
// Atomic instructions.
|
// Atomic instructions.
|
||||||
|
|
||||||
|
|||||||
@@ -147,9 +147,10 @@ pub trait LowerCtx {
|
|||||||
/// Emit a machine instruction that is a safepoint.
|
/// Emit a machine instruction that is a safepoint.
|
||||||
fn emit_safepoint(&mut self, mach_inst: Self::I);
|
fn emit_safepoint(&mut self, mach_inst: Self::I);
|
||||||
/// Indicate that the side-effect of an instruction has been sunk to the
|
/// Indicate that the side-effect of an instruction has been sunk to the
|
||||||
/// current scan location. This can only be done to an instruction with no
|
/// current scan location. This should only be done when the instruction's
|
||||||
/// uses of its result register(s), because it will cause the instruction
|
/// original results are not used (i.e., `put_input_in_reg` is not invoked
|
||||||
/// not to be codegen'd at its original location.
|
/// for the input produced by the sunk instruction); otherwise the
|
||||||
|
/// side-effect will occur twice.
|
||||||
fn sink_inst(&mut self, ir_inst: Inst);
|
fn sink_inst(&mut self, ir_inst: Inst);
|
||||||
/// Retrieve constant data given a handle.
|
/// Retrieve constant data given a handle.
|
||||||
fn get_constant_data(&self, constant_handle: Constant) -> &ConstantData;
|
fn get_constant_data(&self, constant_handle: Constant) -> &ConstantData;
|
||||||
|
|||||||
@@ -61,3 +61,69 @@ block0(v0: i32, v1: i8x16, v2: i8x16):
|
|||||||
; nextln: mov sp, fp
|
; nextln: mov sp, fp
|
||||||
; nextln: ldp fp, lr, [sp], #16
|
; nextln: ldp fp, lr, [sp], #16
|
||||||
; nextln: ret
|
; nextln: ret
|
||||||
|
|
||||||
|
function %f5(i64) -> i8x16 {
|
||||||
|
block0(v0: i64):
|
||||||
|
v1 = load.i8 v0
|
||||||
|
v2 = splat.i8x16 v1
|
||||||
|
return v2
|
||||||
|
}
|
||||||
|
|
||||||
|
; check: stp fp, lr, [sp, #-16]!
|
||||||
|
; nextln: mov fp, sp
|
||||||
|
; nextln: ld1r { v0.16b }, [x0]
|
||||||
|
; nextln: mov sp, fp
|
||||||
|
; nextln: ldp fp, lr, [sp], #16
|
||||||
|
; nextln: ret
|
||||||
|
|
||||||
|
function %f6(i64, i64) -> i8x16, i8x16 {
|
||||||
|
block0(v0: i64, v1: i64):
|
||||||
|
v2 = load.i8 v0
|
||||||
|
v3 = load.i8 v1
|
||||||
|
v4 = splat.i8x16 v2
|
||||||
|
v5 = splat.i8x16 v3
|
||||||
|
return v4, v5
|
||||||
|
}
|
||||||
|
|
||||||
|
; check: stp fp, lr, [sp, #-16]!
|
||||||
|
; nextln: mov fp, sp
|
||||||
|
; nextln: ld1r { v0.16b }, [x0]
|
||||||
|
; nextln: ld1r { v1.16b }, [x1]
|
||||||
|
; nextln: mov sp, fp
|
||||||
|
; nextln: ldp fp, lr, [sp], #16
|
||||||
|
; nextln: ret
|
||||||
|
|
||||||
|
function %f7(i64, i64) -> i8x16, i8x16 {
|
||||||
|
block0(v0: i64, v1: i64):
|
||||||
|
v2 = load.i8 v0
|
||||||
|
v3 = load.i8 v1
|
||||||
|
v4 = splat.i8x16 v3
|
||||||
|
v5 = splat.i8x16 v2
|
||||||
|
return v4, v5
|
||||||
|
}
|
||||||
|
|
||||||
|
; check: stp fp, lr, [sp, #-16]!
|
||||||
|
; nextln: mov fp, sp
|
||||||
|
; nextln: ldrb w0, [x0]
|
||||||
|
; nextln: ld1r { v0.16b }, [x1]
|
||||||
|
; nextln: dup v1.16b, w0
|
||||||
|
; nextln: mov sp, fp
|
||||||
|
; nextln: ldp fp, lr, [sp], #16
|
||||||
|
; nextln: ret
|
||||||
|
|
||||||
|
function %f8(i64, i64) -> i8x16, i8x16 {
|
||||||
|
block0(v0: i64, v1: i64):
|
||||||
|
v2 = load.i8 v0
|
||||||
|
v3 = splat.i8x16 v2
|
||||||
|
v4 = splat.i8x16 v2
|
||||||
|
return v3, v4
|
||||||
|
}
|
||||||
|
|
||||||
|
; check: stp fp, lr, [sp, #-16]!
|
||||||
|
; nextln: mov fp, sp
|
||||||
|
; nextln: ldrb w0, [x0]
|
||||||
|
; nextln: dup v0.16b, w0
|
||||||
|
; nextln: dup v1.16b, w0
|
||||||
|
; nextln: mov sp, fp
|
||||||
|
; nextln: ldp fp, lr, [sp], #16
|
||||||
|
; nextln: ret
|
||||||
|
|||||||
@@ -545,7 +545,6 @@ where
|
|||||||
Opcode::Shuffle => unimplemented!("Shuffle"),
|
Opcode::Shuffle => unimplemented!("Shuffle"),
|
||||||
Opcode::Swizzle => unimplemented!("Swizzle"),
|
Opcode::Swizzle => unimplemented!("Swizzle"),
|
||||||
Opcode::Splat => unimplemented!("Splat"),
|
Opcode::Splat => unimplemented!("Splat"),
|
||||||
Opcode::LoadSplat => unimplemented!("LoadSplat"),
|
|
||||||
Opcode::Insertlane => unimplemented!("Insertlane"),
|
Opcode::Insertlane => unimplemented!("Insertlane"),
|
||||||
Opcode::Extractlane => unimplemented!("Extractlane"),
|
Opcode::Extractlane => unimplemented!("Extractlane"),
|
||||||
Opcode::VhighBits => unimplemented!("VhighBits"),
|
Opcode::VhighBits => unimplemented!("VhighBits"),
|
||||||
|
|||||||
@@ -1414,17 +1414,16 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
|
|||||||
| Operator::V128Load16Splat { memarg }
|
| Operator::V128Load16Splat { memarg }
|
||||||
| Operator::V128Load32Splat { memarg }
|
| Operator::V128Load32Splat { memarg }
|
||||||
| Operator::V128Load64Splat { memarg } => {
|
| Operator::V128Load64Splat { memarg } => {
|
||||||
let opcode = ir::Opcode::LoadSplat;
|
translate_load(
|
||||||
let result_ty = type_of(op);
|
|
||||||
let (flags, base, offset) = prepare_load(
|
|
||||||
memarg,
|
memarg,
|
||||||
mem_op_size(opcode, result_ty.lane_type()),
|
ir::Opcode::Load,
|
||||||
|
type_of(op).lane_type(),
|
||||||
builder,
|
builder,
|
||||||
state,
|
state,
|
||||||
environ,
|
environ,
|
||||||
)?;
|
)?;
|
||||||
let (load, dfg) = builder.ins().Load(opcode, result_ty, flags, offset, base);
|
let splatted = builder.ins().splat(type_of(op), state.pop1());
|
||||||
state.push1(dfg.first_result(load))
|
state.push1(splatted)
|
||||||
}
|
}
|
||||||
Operator::V128Load32Zero { memarg } | Operator::V128Load64Zero { memarg } => {
|
Operator::V128Load32Zero { memarg } | Operator::V128Load64Zero { memarg } => {
|
||||||
translate_load(
|
translate_load(
|
||||||
@@ -2103,7 +2102,7 @@ fn mem_op_size(opcode: ir::Opcode, ty: Type) -> u32 {
|
|||||||
ir::Opcode::Istore8 | ir::Opcode::Sload8 | ir::Opcode::Uload8 => 1,
|
ir::Opcode::Istore8 | ir::Opcode::Sload8 | ir::Opcode::Uload8 => 1,
|
||||||
ir::Opcode::Istore16 | ir::Opcode::Sload16 | ir::Opcode::Uload16 => 2,
|
ir::Opcode::Istore16 | ir::Opcode::Sload16 | ir::Opcode::Uload16 => 2,
|
||||||
ir::Opcode::Istore32 | ir::Opcode::Sload32 | ir::Opcode::Uload32 => 4,
|
ir::Opcode::Istore32 | ir::Opcode::Sload32 | ir::Opcode::Uload32 => 4,
|
||||||
ir::Opcode::Store | ir::Opcode::Load | ir::Opcode::LoadSplat => ty.bytes(),
|
ir::Opcode::Store | ir::Opcode::Load => ty.bytes(),
|
||||||
_ => panic!("unknown size of mem op for {:?}", opcode),
|
_ => panic!("unknown size of mem op for {:?}", opcode),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user