Cranelift AArch64: Various small fixes

* Use FMOV to move 64-bit FP registers and SIMD vectors.
* Add support for additional vector load types.
* Fix the printing of Inst::LoadAddr.

Copyright (c) 2020, Arm Limited.
Author: Anton Kirilov
Date: 2020-10-30 13:14:51 +00:00
parent 19640367db
commit edaada3f57

5 changed files with 141 additions and 66 deletions
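For context on the first bullet: after this change, `MachInst::gen_move` picks the move instruction from the value's register class and bit width, so floating-point and narrow vector values (64 bits or fewer) get a scalar FMOV, while full 128-bit vectors keep the 16-byte vector move. Below is a minimal, self-contained sketch of that selection logic; `RegClass` and `Move` here are illustrative stand-ins, not the actual Cranelift definitions.

```rust
// Sketch of the type-driven move selection (stand-in types, not Cranelift's).
#[derive(Debug, Clone, Copy, PartialEq)]
enum RegClass {
    I64,
    V128,
}

#[derive(Debug, PartialEq)]
enum Move {
    /// mov xd, xm
    Mov64,
    /// fmov dd, dn
    FpuMove64,
    /// mov vd.16b, vn.16b
    FpuMove128,
}

fn select_move(class: RegClass, type_bits: u32) -> Move {
    assert!(type_bits <= 128);
    match class {
        // Integer values stay in integer registers: plain register mov.
        RegClass::I64 => Move::Mov64,
        // Full 128-bit vectors still need the 16-byte vector mov.
        RegClass::V128 if type_bits > 64 => Move::FpuMove128,
        // Scalar FP and 64-bit (or narrower) vectors now use a scalar FMOV.
        RegClass::V128 => Move::FpuMove64,
    }
}

fn main() {
    assert_eq!(select_move(RegClass::I64, 64), Move::Mov64);
    assert_eq!(select_move(RegClass::V128, 64), Move::FpuMove64);
    assert_eq!(select_move(RegClass::V128, 128), Move::FpuMove128);
    println!("move selection checks passed");
}
```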

File 1 of 5

@@ -1239,7 +1239,7 @@ impl MachInstEmit for Inst {
             sink.put4(enc_dmb_ish()); // dmb ish
         }
         &Inst::FpuMove64 { rd, rn } => {
-            sink.put4(enc_vecmov(/* 16b = */ false, rd, rn));
+            sink.put4(enc_fpurr(0b000_11110_01_1_000000_10000, rd, rn));
         }
         &Inst::FpuMove128 { rd, rn } => {
             sink.put4(enc_vecmov(/* 16b = */ true, rd, rn));
@@ -1984,7 +1984,9 @@ impl MachInstEmit for Inst {
                 if top22 != 0 {
                     sink.put4(enc_extend(top22, rd, rn));
                 } else {
-                    Inst::mov32(rd, rn).emit(sink, emit_info, state);
+                    let mov = Inst::Mov32 { rd, rm: rn };
+                    mov.emit(sink, emit_info, state);
                 }
             }
             &Inst::Extend {
@@ -2264,7 +2266,7 @@ impl MachInstEmit for Inst {
                     add.emit(sink, emit_info, state);
                 } else if offset == 0 {
                     if reg != rd.to_reg() {
-                        let mov = Inst::mov(rd, reg);
+                        let mov = Inst::Mov64 { rd, rm: reg };
                         mov.emit(sink, emit_info, state);
                     }

File 2 of 5

@@ -4219,8 +4219,8 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(8),
             rn: vreg(4),
         },
-        "881CA40E",
-        "mov v8.8b, v4.8b",
+        "8840601E",
+        "fmov d8, d4",
     ));
     insns.push((
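As a quick cross-check of the new expected byte string above (not part of the patch): FMOV (register, double precision) encodes as 0x1E60_4000 plus the register fields, which is the top22 constant 0b000_11110_01_1_000000_10000 from the emit change shifted left by 10 bits; for d8, d4 the little-endian bytes spell "8840601E".

```rust
// Illustrative cross-check of the `fmov d8, d4` encoding used in the test.
fn enc_fmov_d(rd: u32, rn: u32) -> u32 {
    assert!(rd < 32 && rn < 32);
    // FMOV (register, double precision) base opcode, plus Rn and Rd fields.
    0x1E60_4000 | (rn << 5) | rd
}

fn main() {
    let word = enc_fmov_d(8, 4); // fmov d8, d4
    // The binemit tests compare against a hex dump of the little-endian bytes.
    let hex: String = word.to_le_bytes().iter().map(|b| format!("{:02X}", b)).collect();
    assert_eq!(hex, "8840601E");
    println!("fmov d8, d4 encodes as {}", hex);
}
```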

File 3 of 5

@@ -5,8 +5,9 @@
 use crate::binemit::CodeOffset;
 use crate::ir::types::{
-    B1, B16, B16X8, B32, B32X4, B64, B64X2, B8, B8X16, F32, F32X4, F64, F64X2, FFLAGS, I16, I16X8,
-    I32, I32X4, I64, I64X2, I8, I8X16, IFLAGS, R32, R64,
+    B1, B16, B16X4, B16X8, B32, B32X2, B32X4, B64, B64X2, B8, B8X16, B8X8, F32, F32X2, F32X4, F64,
+    F64X2, FFLAGS, I16, I16X4, I16X8, I32, I32X2, I32X4, I64, I64X2, I8, I8X16, I8X8, IFLAGS, R32,
+    R64,
 };
 use crate::ir::{ExternalName, Opcode, SourceLoc, TrapCode, Type};
 use crate::isa::CallConv;
@@ -1192,35 +1193,6 @@ fn inst_size_test() {
 }

 impl Inst {
-    /// Create a move instruction.
-    pub fn mov(to_reg: Writable<Reg>, from_reg: Reg) -> Inst {
-        assert!(to_reg.to_reg().get_class() == from_reg.get_class());
-        if from_reg.get_class() == RegClass::I64 {
-            Inst::Mov64 {
-                rd: to_reg,
-                rm: from_reg,
-            }
-        } else if from_reg.get_class() == RegClass::V128 {
-            Inst::FpuMove128 {
-                rd: to_reg,
-                rn: from_reg,
-            }
-        } else {
-            Inst::FpuMove64 {
-                rd: to_reg,
-                rn: from_reg,
-            }
-        }
-    }
-
-    /// Create a 32-bit move instruction.
-    pub fn mov32(to_reg: Writable<Reg>, from_reg: Reg) -> Inst {
-        Inst::Mov32 {
-            rd: to_reg,
-            rm: from_reg,
-        }
-    }
-
     /// Create an instruction that loads a constant, using one of several options (MOVZ, MOVN,
     /// logical immediate, or constant pool).
     pub fn load_constant(rd: Writable<Reg>, value: u64) -> SmallVec<[Inst; 4]> {
@@ -2709,8 +2681,31 @@ impl MachInst for Inst {
     }

     fn gen_move(to_reg: Writable<Reg>, from_reg: Reg, ty: Type) -> Inst {
-        assert!(ty.bits() <= 128);
-        Inst::mov(to_reg, from_reg)
+        let bits = ty.bits();
+
+        assert!(bits <= 128);
+        assert!(to_reg.to_reg().get_class() == from_reg.get_class());
+
+        if from_reg.get_class() == RegClass::I64 {
+            Inst::Mov64 {
+                rd: to_reg,
+                rm: from_reg,
+            }
+        } else if from_reg.get_class() == RegClass::V128 {
+            if bits > 64 {
+                Inst::FpuMove128 {
+                    rd: to_reg,
+                    rn: from_reg,
+                }
+            } else {
+                Inst::FpuMove64 {
+                    rd: to_reg,
+                    rn: from_reg,
+                }
+            }
+        } else {
+            panic!("Unexpected register class: {:?}", from_reg.get_class());
+        }
     }

     fn gen_constant<F: FnMut(RegClass, Type) -> Writable<Reg>>(
@@ -2761,9 +2756,9 @@ impl MachInst for Inst {
             I8 | I16 | I32 | I64 | B1 | B8 | B16 | B32 | B64 | R32 | R64 => Ok(RegClass::I64),
             F32 | F64 => Ok(RegClass::V128),
             IFLAGS | FFLAGS => Ok(RegClass::I64),
-            B8X16 | I8X16 | B16X8 | I16X8 | B32X4 | I32X4 | B64X2 | I64X2 | F32X4 | F64X2 => {
-                Ok(RegClass::V128)
-            }
+            B8X8 | B8X16 | B16X4 | B16X8 | B32X2 | B32X4 | B64X2 => Ok(RegClass::V128),
+            F32X2 | I8X8 | I16X4 | I32X2 => Ok(RegClass::V128),
+            F32X4 | F64X2 | I8X16 | I16X8 | I32X4 | I64X2 => Ok(RegClass::V128),
             _ => Err(CodegenError::Unsupported(format!(
                 "Unexpected SSA-value type: {}",
                 ty
@@ -3149,9 +3144,9 @@ impl Inst {
format!("dmb ish") format!("dmb ish")
} }
&Inst::FpuMove64 { rd, rn } => { &Inst::FpuMove64 { rd, rn } => {
let rd = rd.to_reg().show_rru(mb_rru); let rd = show_vreg_scalar(rd.to_reg(), mb_rru, ScalarSize::Size64);
let rn = rn.show_rru(mb_rru); let rn = show_vreg_scalar(rn, mb_rru, ScalarSize::Size64);
format!("mov {}.8b, {}.8b", rd, rn) format!("fmov {}, {}", rd, rn)
} }
&Inst::FpuMove128 { rd, rn } => { &Inst::FpuMove128 { rd, rn } => {
let rd = rd.to_reg().show_rru(mb_rru); let rd = rd.to_reg().show_rru(mb_rru);
@@ -3800,9 +3795,10 @@ impl Inst {
         for inst in mem_insts.into_iter() {
             ret.push_str(&inst.show_rru(mb_rru));
         }
-        let (reg, offset) = match mem {
-            AMode::Unscaled(r, simm9) => (r, simm9.value()),
-            AMode::UnsignedOffset(r, uimm12scaled) => (r, uimm12scaled.value() as i32),
+        let (reg, index_reg, offset) = match mem {
+            AMode::RegExtended(r, idx, extendop) => (r, Some((idx, extendop)), 0),
+            AMode::Unscaled(r, simm9) => (r, None, simm9.value()),
+            AMode::UnsignedOffset(r, uimm12scaled) => (r, None, uimm12scaled.value() as i32),
             _ => panic!("Unsupported case for LoadAddr: {:?}", mem),
         };
         let abs_offset = if offset < 0 {
@@ -3816,8 +3812,18 @@ impl Inst {
             ALUOp::Add64
         };

-        if offset == 0 {
-            let mov = Inst::mov(rd, reg);
+        if let Some((idx, extendop)) = index_reg {
+            let add = Inst::AluRRRExtend {
+                alu_op: ALUOp::Add64,
+                rd,
+                rn: reg,
+                rm: idx,
+                extendop,
+            };
+
+            ret.push_str(&add.show_rru(mb_rru));
+        } else if offset == 0 {
+            let mov = Inst::gen_move(rd, reg, I64);
             ret.push_str(&mov.show_rru(mb_rru));
         } else if let Some(imm12) = Imm12::maybe_from_u64(abs_offset) {
             let add = Inst::AluRRImm12 {

File 4 of 5

@@ -1127,7 +1127,13 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         | Opcode::Sload16x4
         | Opcode::Uload16x4
         | Opcode::Sload32x2
-        | Opcode::Uload32x2 => {
+        | Opcode::Uload32x2
+        | Opcode::Uload8x8Complex
+        | Opcode::Sload8x8Complex
+        | Opcode::Uload16x4Complex
+        | Opcode::Sload16x4Complex
+        | Opcode::Uload32x2Complex
+        | Opcode::Sload32x2Complex => {
             let off = ctx.data(insn).load_store_offset().unwrap();
             let elem_ty = match op {
                 Opcode::Sload8 | Opcode::Uload8 | Opcode::Sload8Complex | Opcode::Uload8Complex => {
@@ -1142,9 +1148,18 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 | Opcode::Sload32Complex
                 | Opcode::Uload32Complex => I32,
                 Opcode::Load | Opcode::LoadComplex => ctx.output_ty(insn, 0),
-                Opcode::Sload8x8 | Opcode::Uload8x8 => I8X8,
-                Opcode::Sload16x4 | Opcode::Uload16x4 => I16X4,
-                Opcode::Sload32x2 | Opcode::Uload32x2 => I32X2,
+                Opcode::Sload8x8
+                | Opcode::Uload8x8
+                | Opcode::Sload8x8Complex
+                | Opcode::Uload8x8Complex => I8X8,
+                Opcode::Sload16x4
+                | Opcode::Uload16x4
+                | Opcode::Sload16x4Complex
+                | Opcode::Uload16x4Complex => I16X4,
+                Opcode::Sload32x2
+                | Opcode::Uload32x2
+                | Opcode::Sload32x2Complex
+                | Opcode::Uload32x2Complex => I32X2,
                 _ => unreachable!(),
             };
             let sign_extend = match op {
@@ -1180,11 +1195,17 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             let vec_extend = match op {
                 Opcode::Sload8x8 => Some(VecExtendOp::Sxtl8),
+                Opcode::Sload8x8Complex => Some(VecExtendOp::Sxtl8),
                 Opcode::Uload8x8 => Some(VecExtendOp::Uxtl8),
+                Opcode::Uload8x8Complex => Some(VecExtendOp::Uxtl8),
                 Opcode::Sload16x4 => Some(VecExtendOp::Sxtl16),
+                Opcode::Sload16x4Complex => Some(VecExtendOp::Sxtl16),
                 Opcode::Uload16x4 => Some(VecExtendOp::Uxtl16),
+                Opcode::Uload16x4Complex => Some(VecExtendOp::Uxtl16),
                 Opcode::Sload32x2 => Some(VecExtendOp::Sxtl32),
+                Opcode::Sload32x2Complex => Some(VecExtendOp::Sxtl32),
                 Opcode::Uload32x2 => Some(VecExtendOp::Uxtl32),
+                Opcode::Uload32x2Complex => Some(VecExtendOp::Uxtl32),
                 _ => None,
             };
@@ -1641,11 +1662,16 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             let rd = get_output_reg(ctx, outputs[0]);
             let ity = ctx.input_ty(insn, 0);
             let oty = ctx.output_ty(insn, 0);
+            let ity_bits = ty_bits(ity);
             let ity_vec_reg = ty_has_float_or_vec_representation(ity);
+            let oty_bits = ty_bits(oty);
             let oty_vec_reg = ty_has_float_or_vec_representation(oty);
+
+            debug_assert_eq!(ity_bits, oty_bits);
+
             match (ity_vec_reg, oty_vec_reg) {
                 (true, true) => {
-                    let narrow_mode = if ty_bits(ity) <= 32 && ty_bits(oty) <= 32 {
+                    let narrow_mode = if ity_bits <= 32 {
                         NarrowValueMode::ZeroExtend32
                     } else {
                         NarrowValueMode::ZeroExtend64
@@ -1667,11 +1693,13 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 }
                 (true, false) => {
                     let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+                    let size = VectorSize::from_lane_size(ScalarSize::from_bits(oty_bits), true);
+
                     ctx.emit(Inst::MovFromVec {
                         rd,
                         rn,
                         idx: 0,
-                        size: VectorSize::Size64x2,
+                        size,
                     });
                 }
             }
@@ -1877,12 +1905,12 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         Opcode::GetPinnedReg => {
             let rd = get_output_reg(ctx, outputs[0]);
-            ctx.emit(Inst::mov(rd, xreg(PINNED_REG)));
+            ctx.emit(Inst::gen_move(rd, xreg(PINNED_REG), I64));
         }

         Opcode::SetPinnedReg => {
             let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-            ctx.emit(Inst::mov(writable_xreg(PINNED_REG), rm));
+            ctx.emit(Inst::gen_move(writable_xreg(PINNED_REG), rm, I64));
         }

         Opcode::Spill
@@ -2314,14 +2342,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             });
         }

-        Opcode::Vsplit
-        | Opcode::Vconcat
-        | Opcode::Uload8x8Complex
-        | Opcode::Sload8x8Complex
-        | Opcode::Uload16x4Complex
-        | Opcode::Sload16x4Complex
-        | Opcode::Uload32x2Complex
-        | Opcode::Sload32x2Complex => {
+        Opcode::Vsplit | Opcode::Vconcat => {
             // TODO
             panic!("Vector ops not implemented.");
         }
@@ -2569,6 +2590,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             //
             // This is a scalar Fcopysign.
             // This uses scalar NEON operations for 64-bit and vector operations (2S) for 32-bit.
+            // In the latter case it still sets all bits except the lowest 32 to 0.
             //
             // mov vd, vn
             // ushr vtmp, vm, #63 / #31
@@ -2583,7 +2605,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             let tmp = ctx.alloc_tmp(RegClass::V128, F64);

             // Copy LHS to rd.
-            ctx.emit(Inst::FpuMove64 { rd, rn });
+            ctx.emit(Inst::gen_move(rd, rn, ty));

             // Copy the sign bit to the lowest bit in tmp.
             let imm = FPURightShiftImm::maybe_from_u8(bits - 1, bits).unwrap();
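For readers following the Fcopysign comment above, here is a bit-level sketch (plain Rust, illustrative only, not the backend code) of what the ushr step and the subsequent bit insert compute for the 64-bit case: ushr isolates the sign bit of the right-hand operand in bit 0, and a shift-left-and-insert style step then places it into bit 63 of the destination while leaving the magnitude bits untouched.

```rust
// Mirrors, with plain integer ops, the 64-bit sequence sketched in the
// comment above: mov vd, vn; ushr vtmp, vm, #63; insert vtmp into bit 63 of vd.
fn copysign_f64(mag: f64, sign: f64) -> f64 {
    let vd = mag.to_bits();                       // mov vd, vn
    let vtmp = sign.to_bits() >> 63;              // ushr vtmp, vm, #63
    let vd = (vd & !(1u64 << 63)) | (vtmp << 63); // insert the sign bit at bit 63
    f64::from_bits(vd)
}

fn main() {
    assert_eq!(copysign_f64(1.5, -0.0), -1.5);
    assert_eq!(copysign_f64(-2.0, 3.0), 2.0);
    println!("copysign sketch ok");
}
```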

File 5 of 5

@@ -299,3 +299,48 @@ block0(v0: i64):
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
+
+function %f18(i64, i32) -> i16x8 {
+block0(v0: i64, v1: i32):
+  v2 = uextend.i64 v1
+  v3 = sload8x8_complex v2+v0
+  return v3
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: ldr d0, [x0, w1, UXTW]
+; nextln: sxtl v0.8h, v0.8b
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f19(i64, i64) -> i32x4 {
+block0(v0: i64, v1: i64):
+  v2 = uload16x4_complex v0+v1+8
+  return v2
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: add x0, x0, x1
+; nextln: ldr d0, [x0, #8]
+; nextln: uxtl v0.4s, v0.4h
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f20(i64, i32) -> i64x2 {
+block0(v0: i64, v1: i32):
+  v2 = sextend.i64 v1
+  v3 = uload32x2_complex v2+v0
+  return v3
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: ldr d0, [x0, w1, SXTW]
+; nextln: uxtl v0.2d, v0.2s
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret