Cranelift AArch64: Various small fixes

* Use FMOV to move 64-bit FP registers and SIMD vectors.
* Add support for the *Complex addressing-mode variants of the extending vector loads.
* Fix the printing of Inst::LoadAddr.

Copyright (c) 2020, Arm Limited.
Anton Kirilov
2020-10-30 13:14:51 +00:00
parent 19640367db
commit edaada3f57
5 changed files with 141 additions and 66 deletions
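The FMOV change replaces untyped register moves (`Inst::mov`, `Inst::FpuMove64`) with the type-aware `Inst::gen_move`, so a move between floating-point/SIMD registers is emitted as an FMOV (or a vector MOV) rather than an integer MOV. As a rough illustration of that distinction, here is a minimal sketch of a type-aware move helper; the names, the simplified `RegClass`, and the string-returning signature are all hypothetical, not Cranelift's actual API:

```rust
// Hypothetical sketch of a type-aware register move, in the spirit of
// Inst::gen_move; names and signatures here are illustrative only.

#[derive(Clone, Copy)]
enum RegClass {
    Int, // x0..x30: moved with MOV (an alias of ORR with XZR)
    Vec, // v0..v31: moved with FMOV (scalar FP) or a vector MOV
}

fn move_asm(dst: u8, src: u8, class: RegClass, bits: u16) -> String {
    match class {
        // Integer values stay in the integer pipeline.
        RegClass::Int => format!("mov x{}, x{}", dst, src),
        // A 64-bit FP value fits in a scalar FMOV; a full 128-bit
        // vector needs a vector-register move instead.
        RegClass::Vec if bits <= 64 => format!("fmov d{}, d{}", dst, src),
        RegClass::Vec => format!("mov v{}.16b, v{}.16b", dst, src),
    }
}

fn main() {
    assert_eq!(move_asm(0, 1, RegClass::Int, 64), "mov x0, x1");
    assert_eq!(move_asm(0, 1, RegClass::Vec, 64), "fmov d0, d1");
    assert_eq!(move_asm(0, 1, RegClass::Vec, 128), "mov v0.16b, v1.16b");
}
```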


@@ -1127,7 +1127,13 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         | Opcode::Sload16x4
         | Opcode::Uload16x4
         | Opcode::Sload32x2
-        | Opcode::Uload32x2 => {
+        | Opcode::Uload32x2
+        | Opcode::Uload8x8Complex
+        | Opcode::Sload8x8Complex
+        | Opcode::Uload16x4Complex
+        | Opcode::Sload16x4Complex
+        | Opcode::Uload32x2Complex
+        | Opcode::Sload32x2Complex => {
             let off = ctx.data(insn).load_store_offset().unwrap();
             let elem_ty = match op {
                 Opcode::Sload8 | Opcode::Uload8 | Opcode::Sload8Complex | Opcode::Uload8Complex => {
@@ -1142,9 +1148,18 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 | Opcode::Sload32Complex
                 | Opcode::Uload32Complex => I32,
                 Opcode::Load | Opcode::LoadComplex => ctx.output_ty(insn, 0),
-                Opcode::Sload8x8 | Opcode::Uload8x8 => I8X8,
-                Opcode::Sload16x4 | Opcode::Uload16x4 => I16X4,
-                Opcode::Sload32x2 | Opcode::Uload32x2 => I32X2,
+                Opcode::Sload8x8
+                | Opcode::Uload8x8
+                | Opcode::Sload8x8Complex
+                | Opcode::Uload8x8Complex => I8X8,
+                Opcode::Sload16x4
+                | Opcode::Uload16x4
+                | Opcode::Sload16x4Complex
+                | Opcode::Uload16x4Complex => I16X4,
+                Opcode::Sload32x2
+                | Opcode::Uload32x2
+                | Opcode::Sload32x2Complex
+                | Opcode::Uload32x2Complex => I32X2,
                 _ => unreachable!(),
             };
             let sign_extend = match op {
@@ -1180,11 +1195,17 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             let vec_extend = match op {
                 Opcode::Sload8x8 => Some(VecExtendOp::Sxtl8),
+                Opcode::Sload8x8Complex => Some(VecExtendOp::Sxtl8),
                 Opcode::Uload8x8 => Some(VecExtendOp::Uxtl8),
+                Opcode::Uload8x8Complex => Some(VecExtendOp::Uxtl8),
                 Opcode::Sload16x4 => Some(VecExtendOp::Sxtl16),
+                Opcode::Sload16x4Complex => Some(VecExtendOp::Sxtl16),
                 Opcode::Uload16x4 => Some(VecExtendOp::Uxtl16),
+                Opcode::Uload16x4Complex => Some(VecExtendOp::Uxtl16),
                 Opcode::Sload32x2 => Some(VecExtendOp::Sxtl32),
+                Opcode::Sload32x2Complex => Some(VecExtendOp::Sxtl32),
                 Opcode::Uload32x2 => Some(VecExtendOp::Uxtl32),
+                Opcode::Uload32x2Complex => Some(VecExtendOp::Uxtl32),
                 _ => None,
             };
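Taken together, the three hunks above extend the widening-load lowering so that each *Complex addressing-mode variant reuses the element type and extend operation of its simple counterpart. The sketch below restates that mapping with simplified stand-in enums; the real code matches on `Opcode` and Cranelift's type constants rather than strings:

```rust
// Simplified restatement of the opcode -> (element type, extend op)
// mapping implemented above; the enums are stand-ins, not Cranelift types.

#[derive(Debug, PartialEq)]
enum ElemTy { I8X8, I16X4, I32X2 }

#[derive(Debug, PartialEq)]
enum VecExtend { Sxtl8, Uxtl8, Sxtl16, Uxtl16, Sxtl32, Uxtl32 }

// The `*_complex` variants differ only in how the address is formed,
// so each shares a row with its simple counterpart.
fn widening_load(op: &str) -> Option<(ElemTy, VecExtend)> {
    Some(match op {
        "sload8x8" | "sload8x8_complex" => (ElemTy::I8X8, VecExtend::Sxtl8),
        "uload8x8" | "uload8x8_complex" => (ElemTy::I8X8, VecExtend::Uxtl8),
        "sload16x4" | "sload16x4_complex" => (ElemTy::I16X4, VecExtend::Sxtl16),
        "uload16x4" | "uload16x4_complex" => (ElemTy::I16X4, VecExtend::Uxtl16),
        "sload32x2" | "sload32x2_complex" => (ElemTy::I32X2, VecExtend::Sxtl32),
        "uload32x2" | "uload32x2_complex" => (ElemTy::I32X2, VecExtend::Uxtl32),
        _ => return None,
    })
}

fn main() {
    assert_eq!(
        widening_load("uload16x4_complex"),
        Some((ElemTy::I16X4, VecExtend::Uxtl16))
    );
}
```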
@@ -1641,11 +1662,16 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             let rd = get_output_reg(ctx, outputs[0]);
             let ity = ctx.input_ty(insn, 0);
             let oty = ctx.output_ty(insn, 0);
+            let ity_bits = ty_bits(ity);
             let ity_vec_reg = ty_has_float_or_vec_representation(ity);
+            let oty_bits = ty_bits(oty);
             let oty_vec_reg = ty_has_float_or_vec_representation(oty);
+            debug_assert_eq!(ity_bits, oty_bits);
             match (ity_vec_reg, oty_vec_reg) {
                 (true, true) => {
-                    let narrow_mode = if ty_bits(ity) <= 32 && ty_bits(oty) <= 32 {
+                    let narrow_mode = if ity_bits <= 32 {
                         NarrowValueMode::ZeroExtend32
                     } else {
                         NarrowValueMode::ZeroExtend64
@@ -1667,11 +1693,13 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 }
                 (true, false) => {
                     let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+                    let size = VectorSize::from_lane_size(ScalarSize::from_bits(oty_bits), true);
                     ctx.emit(Inst::MovFromVec {
                         rd,
                         rn,
                         idx: 0,
-                        size: VectorSize::Size64x2,
+                        size,
                     });
                 }
             }
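The `(true, false)` arm handles a bitcast from a vector register to an integer register. Previously the `MovFromVec` lane size was hardcoded to `Size64x2`; deriving it from the output width means, for example, that a 32-bit bitcast reads a 32-bit lane. A minimal sketch of the width-to-lane mapping at work here; the lane letters are standard AArch64, but the function itself is illustrative:

```rust
// Illustrative mapping from the bitcast's output width to the UMOV lane
// arrangement; the function is hypothetical, the lane letters are the
// standard AArch64 ones.

fn umov_lane(oty_bits: u16) -> &'static str {
    match oty_bits {
        8 => "b",  // umov wd, vn.b[0]
        16 => "h", // umov wd, vn.h[0]
        32 => "s", // umov wd, vn.s[0]
        64 => "d", // umov xd, vn.d[0]
        _ => unreachable!("unsupported bitcast width"),
    }
}

fn main() {
    // With the hardcoded Size64x2, a 32-bit bitcast would have read a D
    // lane; deriving the size from `oty_bits` selects the S lane instead.
    assert_eq!(umov_lane(32), "s");
    assert_eq!(umov_lane(64), "d");
}
```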
@@ -1877,12 +1905,12 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         Opcode::GetPinnedReg => {
             let rd = get_output_reg(ctx, outputs[0]);
-            ctx.emit(Inst::mov(rd, xreg(PINNED_REG)));
+            ctx.emit(Inst::gen_move(rd, xreg(PINNED_REG), I64));
         }
         Opcode::SetPinnedReg => {
             let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-            ctx.emit(Inst::mov(writable_xreg(PINNED_REG), rm));
+            ctx.emit(Inst::gen_move(writable_xreg(PINNED_REG), rm, I64));
         }
         Opcode::Spill
@@ -2314,14 +2342,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             });
         }
-        Opcode::Vsplit
-        | Opcode::Vconcat
-        | Opcode::Uload8x8Complex
-        | Opcode::Sload8x8Complex
-        | Opcode::Uload16x4Complex
-        | Opcode::Sload16x4Complex
-        | Opcode::Uload32x2Complex
-        | Opcode::Sload32x2Complex => {
+        Opcode::Vsplit | Opcode::Vconcat => {
             // TODO
             panic!("Vector ops not implemented.");
         }
@@ -2569,6 +2590,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             //
             // This is a scalar Fcopysign.
            // This uses scalar NEON operations for 64-bit and vector operations (2S) for 32-bit.
+            // In the latter case it still sets all bits except the lowest 32 to 0.
             //
             // mov vd, vn
             // ushr vtmp, vm, #63 / #31
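To make the commented instruction sequence concrete, the sketch below models the `ushr`/`sli` pair with ordinary 64-bit bit twiddling: shift the sign bit of the RHS down to bit 0, then shift-left-insert it into the sign position of the LHS copy. This is only a scalar model of the idea, not the code the backend emits:

```rust
// Scalar model of the ushr/sli sequence described in the comment above,
// using 64-bit integer bit twiddling instead of NEON instructions.

fn copysign_f64(vn: f64, vm: f64) -> f64 {
    let shift = 63; // sign-bit position for f64 (#31 for f32)
    let vd = vn.to_bits();
    // ushr vtmp, vm, #63: bring the sign bit of vm down to bit 0.
    let vtmp = vm.to_bits() >> shift;
    // sli vd, vtmp, #63: shift vtmp left by 63 and insert it, replacing
    // only bit 63 of vd while keeping vn's magnitude bits intact.
    let mask = 1u64 << shift;
    f64::from_bits((vd & !mask) | (vtmp << shift))
}

fn main() {
    assert_eq!(copysign_f64(1.5, -0.0), -1.5);
    assert_eq!(copysign_f64(-2.0, 3.0), 2.0);
}
```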
@@ -2583,7 +2605,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             let tmp = ctx.alloc_tmp(RegClass::V128, F64);
             // Copy LHS to rd.
-            ctx.emit(Inst::FpuMove64 { rd, rn });
+            ctx.emit(Inst::gen_move(rd, rn, ty));
             // Copy the sign bit to the lowest bit in tmp.
             let imm = FPURightShiftImm::maybe_from_u8(bits - 1, bits).unwrap();