From edaada3f57af3a49031adbb83af237a820ff91dd Mon Sep 17 00:00:00 2001
From: Anton Kirilov
Date: Fri, 30 Oct 2020 13:14:51 +0000
Subject: [PATCH] Cranelift AArch64: Various small fixes

* Use FMOV to move 64-bit FP registers and SIMD vectors.
* Add support for additional vector load types.
* Fix the printing of Inst::LoadAddr.

Copyright (c) 2020, Arm Limited.
---
 .../codegen/src/isa/aarch64/inst/emit.rs      |  8 +-
 .../src/isa/aarch64/inst/emit_tests.rs        |  4 +-
 cranelift/codegen/src/isa/aarch64/inst/mod.rs | 94 ++++++++++---------
 .../codegen/src/isa/aarch64/lower_inst.rs     | 56 +++++++----
 .../filetests/isa/aarch64/amodes.clif         | 45 +++++++++
 5 files changed, 141 insertions(+), 66 deletions(-)

diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
index d5aec6f1fc..597c9ac592 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -1239,7 +1239,7 @@ impl MachInstEmit for Inst {
                 sink.put4(enc_dmb_ish()); // dmb ish
             }
             &Inst::FpuMove64 { rd, rn } => {
-                sink.put4(enc_vecmov(/* 16b = */ false, rd, rn));
+                sink.put4(enc_fpurr(0b000_11110_01_1_000000_10000, rd, rn));
             }
             &Inst::FpuMove128 { rd, rn } => {
                 sink.put4(enc_vecmov(/* 16b = */ true, rd, rn));
@@ -1984,7 +1984,9 @@ impl MachInstEmit for Inst {
                 if top22 != 0 {
                     sink.put4(enc_extend(top22, rd, rn));
                 } else {
-                    Inst::mov32(rd, rn).emit(sink, emit_info, state);
+                    let mov = Inst::Mov32 { rd, rm: rn };
+
+                    mov.emit(sink, emit_info, state);
                 }
             }
             &Inst::Extend {
@@ -2264,7 +2266,7 @@ impl MachInstEmit for Inst {
                     add.emit(sink, emit_info, state);
                 } else if offset == 0 {
                     if reg != rd.to_reg() {
-                        let mov = Inst::mov(rd, reg);
+                        let mov = Inst::Mov64 { rd, rm: reg };
 
                         mov.emit(sink, emit_info, state);
                     }
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
index 9248c2a199..74aac428ef 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
@@ -4219,8 +4219,8 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(8),
             rn: vreg(4),
         },
-        "881CA40E",
-        "mov v8.8b, v4.8b",
+        "8840601E",
+        "fmov d8, d4",
     ));
 
     insns.push((
diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
index 05c922912b..d09637298c 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -5,8 +5,9 @@
 use crate::binemit::CodeOffset;
 use crate::ir::types::{
-    B1, B16, B16X8, B32, B32X4, B64, B64X2, B8, B8X16, F32, F32X4, F64, F64X2, FFLAGS, I16, I16X8,
-    I32, I32X4, I64, I64X2, I8, I8X16, IFLAGS, R32, R64,
+    B1, B16, B16X4, B16X8, B32, B32X2, B32X4, B64, B64X2, B8, B8X16, B8X8, F32, F32X2, F32X4, F64,
+    F64X2, FFLAGS, I16, I16X4, I16X8, I32, I32X2, I32X4, I64, I64X2, I8, I8X16, I8X8, IFLAGS, R32,
+    R64,
 };
 use crate::ir::{ExternalName, Opcode, SourceLoc, TrapCode, Type};
 use crate::isa::CallConv;
@@ -1192,35 +1193,6 @@ fn inst_size_test() {
 }
 
 impl Inst {
-    /// Create a move instruction.
-    pub fn mov(to_reg: Writable<Reg>, from_reg: Reg) -> Inst {
-        assert!(to_reg.to_reg().get_class() == from_reg.get_class());
-        if from_reg.get_class() == RegClass::I64 {
-            Inst::Mov64 {
-                rd: to_reg,
-                rm: from_reg,
-            }
-        } else if from_reg.get_class() == RegClass::V128 {
-            Inst::FpuMove128 {
-                rd: to_reg,
-                rn: from_reg,
-            }
-        } else {
-            Inst::FpuMove64 {
-                rd: to_reg,
-                rn: from_reg,
-            }
-        }
-    }
-
-    /// Create a 32-bit move instruction.
-    pub fn mov32(to_reg: Writable<Reg>, from_reg: Reg) -> Inst {
-        Inst::Mov32 {
-            rd: to_reg,
-            rm: from_reg,
-        }
-    }
-
     /// Create an instruction that loads a constant, using one of several options (MOVZ, MOVN,
     /// logical immediate, or constant pool).
     pub fn load_constant(rd: Writable<Reg>, value: u64) -> SmallVec<[Inst; 4]> {
@@ -2709,8 +2681,31 @@ impl MachInst for Inst {
     }
 
     fn gen_move(to_reg: Writable<Reg>, from_reg: Reg, ty: Type) -> Inst {
-        assert!(ty.bits() <= 128);
-        Inst::mov(to_reg, from_reg)
+        let bits = ty.bits();
+
+        assert!(bits <= 128);
+        assert!(to_reg.to_reg().get_class() == from_reg.get_class());
+
+        if from_reg.get_class() == RegClass::I64 {
+            Inst::Mov64 {
+                rd: to_reg,
+                rm: from_reg,
+            }
+        } else if from_reg.get_class() == RegClass::V128 {
+            if bits > 64 {
+                Inst::FpuMove128 {
+                    rd: to_reg,
+                    rn: from_reg,
+                }
+            } else {
+                Inst::FpuMove64 {
+                    rd: to_reg,
+                    rn: from_reg,
+                }
+            }
+        } else {
+            panic!("Unexpected register class: {:?}", from_reg.get_class());
+        }
     }
 
     fn gen_constant<F: FnMut(RegClass, Type) -> Writable<Reg>>(
@@ -2761,9 +2756,9 @@ impl MachInst for Inst {
             I8 | I16 | I32 | I64 | B1 | B8 | B16 | B32 | B64 | R32 | R64 => Ok(RegClass::I64),
             F32 | F64 => Ok(RegClass::V128),
             IFLAGS | FFLAGS => Ok(RegClass::I64),
-            B8X16 | I8X16 | B16X8 | I16X8 | B32X4 | I32X4 | B64X2 | I64X2 | F32X4 | F64X2 => {
-                Ok(RegClass::V128)
-            }
+            B8X8 | B8X16 | B16X4 | B16X8 | B32X2 | B32X4 | B64X2 => Ok(RegClass::V128),
+            F32X2 | I8X8 | I16X4 | I32X2 => Ok(RegClass::V128),
+            F32X4 | F64X2 | I8X16 | I16X8 | I32X4 | I64X2 => Ok(RegClass::V128),
             _ => Err(CodegenError::Unsupported(format!(
                 "Unexpected SSA-value type: {}",
                 ty
@@ -3149,9 +3144,9 @@ impl Inst {
                 format!("dmb ish")
             }
             &Inst::FpuMove64 { rd, rn } => {
-                let rd = rd.to_reg().show_rru(mb_rru);
-                let rn = rn.show_rru(mb_rru);
-                format!("mov {}.8b, {}.8b", rd, rn)
+                let rd = show_vreg_scalar(rd.to_reg(), mb_rru, ScalarSize::Size64);
+                let rn = show_vreg_scalar(rn, mb_rru, ScalarSize::Size64);
+                format!("fmov {}, {}", rd, rn)
             }
             &Inst::FpuMove128 { rd, rn } => {
                 let rd = rd.to_reg().show_rru(mb_rru);
@@ -3800,9 +3795,10 @@ impl Inst {
                 for inst in mem_insts.into_iter() {
                     ret.push_str(&inst.show_rru(mb_rru));
                 }
-                let (reg, offset) = match mem {
-                    AMode::Unscaled(r, simm9) => (r, simm9.value()),
-                    AMode::UnsignedOffset(r, uimm12scaled) => (r, uimm12scaled.value() as i32),
+                let (reg, index_reg, offset) = match mem {
+                    AMode::RegExtended(r, idx, extendop) => (r, Some((idx, extendop)), 0),
+                    AMode::Unscaled(r, simm9) => (r, None, simm9.value()),
+                    AMode::UnsignedOffset(r, uimm12scaled) => (r, None, uimm12scaled.value() as i32),
                     _ => panic!("Unsupported case for LoadAddr: {:?}", mem),
                 };
                 let abs_offset = if offset < 0 {
@@ -3816,8 +3812,18 @@ impl Inst {
                     ALUOp::Add64
                 };
 
-                if offset == 0 {
-                    let mov = Inst::mov(rd, reg);
+                if let Some((idx, extendop)) = index_reg {
+                    let add = Inst::AluRRRExtend {
+                        alu_op: ALUOp::Add64,
+                        rd,
+                        rn: reg,
+                        rm: idx,
+                        extendop,
+                    };
+
+                    ret.push_str(&add.show_rru(mb_rru));
+                } else if offset == 0 {
+                    let mov = Inst::gen_move(rd, reg, I64);
 
                     ret.push_str(&mov.show_rru(mb_rru));
                 } else if let Some(imm12) = Imm12::maybe_from_u64(abs_offset) {
diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
index 90ee97b83c..88fc8358b1 100644
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -1127,7 +1127,13 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         | Opcode::Sload16x4
         | Opcode::Uload16x4
         | Opcode::Sload32x2
-        | Opcode::Uload32x2 => {
+        | Opcode::Uload32x2
+        | Opcode::Uload8x8Complex
+        | Opcode::Sload8x8Complex
+        | Opcode::Uload16x4Complex
+        | Opcode::Sload16x4Complex
+        | Opcode::Uload32x2Complex
+        | Opcode::Sload32x2Complex => {
             let off = ctx.data(insn).load_store_offset().unwrap();
             let elem_ty = match op {
                 Opcode::Sload8 | Opcode::Uload8 | Opcode::Sload8Complex | Opcode::Uload8Complex => {
@@ -1142,9 +1148,18 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 | Opcode::Sload32Complex
                 | Opcode::Uload32Complex => I32,
                 Opcode::Load | Opcode::LoadComplex => ctx.output_ty(insn, 0),
-                Opcode::Sload8x8 | Opcode::Uload8x8 => I8X8,
-                Opcode::Sload16x4 | Opcode::Uload16x4 => I16X4,
-                Opcode::Sload32x2 | Opcode::Uload32x2 => I32X2,
+                Opcode::Sload8x8
+                | Opcode::Uload8x8
+                | Opcode::Sload8x8Complex
+                | Opcode::Uload8x8Complex => I8X8,
+                Opcode::Sload16x4
+                | Opcode::Uload16x4
+                | Opcode::Sload16x4Complex
+                | Opcode::Uload16x4Complex => I16X4,
+                Opcode::Sload32x2
+                | Opcode::Uload32x2
+                | Opcode::Sload32x2Complex
+                | Opcode::Uload32x2Complex => I32X2,
                 _ => unreachable!(),
             };
             let sign_extend = match op {
@@ -1180,11 +1195,17 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
 
             let vec_extend = match op {
                 Opcode::Sload8x8 => Some(VecExtendOp::Sxtl8),
+                Opcode::Sload8x8Complex => Some(VecExtendOp::Sxtl8),
                 Opcode::Uload8x8 => Some(VecExtendOp::Uxtl8),
+                Opcode::Uload8x8Complex => Some(VecExtendOp::Uxtl8),
                 Opcode::Sload16x4 => Some(VecExtendOp::Sxtl16),
+                Opcode::Sload16x4Complex => Some(VecExtendOp::Sxtl16),
                 Opcode::Uload16x4 => Some(VecExtendOp::Uxtl16),
+                Opcode::Uload16x4Complex => Some(VecExtendOp::Uxtl16),
                 Opcode::Sload32x2 => Some(VecExtendOp::Sxtl32),
+                Opcode::Sload32x2Complex => Some(VecExtendOp::Sxtl32),
                 Opcode::Uload32x2 => Some(VecExtendOp::Uxtl32),
+                Opcode::Uload32x2Complex => Some(VecExtendOp::Uxtl32),
                 _ => None,
             };
 
@@ -1641,11 +1662,16 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             let rd = get_output_reg(ctx, outputs[0]);
             let ity = ctx.input_ty(insn, 0);
             let oty = ctx.output_ty(insn, 0);
+            let ity_bits = ty_bits(ity);
             let ity_vec_reg = ty_has_float_or_vec_representation(ity);
+            let oty_bits = ty_bits(oty);
             let oty_vec_reg = ty_has_float_or_vec_representation(oty);
+
+            debug_assert_eq!(ity_bits, oty_bits);
+
             match (ity_vec_reg, oty_vec_reg) {
                 (true, true) => {
-                    let narrow_mode = if ty_bits(ity) <= 32 && ty_bits(oty) <= 32 {
+                    let narrow_mode = if ity_bits <= 32 {
                         NarrowValueMode::ZeroExtend32
                     } else {
                         NarrowValueMode::ZeroExtend64
@@ -1667,11 +1693,13 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 }
                 (true, false) => {
                     let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+                    let size = VectorSize::from_lane_size(ScalarSize::from_bits(oty_bits), true);
+
                     ctx.emit(Inst::MovFromVec {
                         rd,
                         rn,
                         idx: 0,
-                        size: VectorSize::Size64x2,
+                        size,
                     });
                 }
             }
@@ -1877,12 +1905,12 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
 
         Opcode::GetPinnedReg => {
            let rd = get_output_reg(ctx, outputs[0]);
-            ctx.emit(Inst::mov(rd, xreg(PINNED_REG)));
+            ctx.emit(Inst::gen_move(rd, xreg(PINNED_REG), I64));
         }
 
         Opcode::SetPinnedReg => {
             let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-            ctx.emit(Inst::mov(writable_xreg(PINNED_REG), rm));
+            ctx.emit(Inst::gen_move(writable_xreg(PINNED_REG), rm, I64));
         }
 
         Opcode::Spill
@@ -2314,14 +2342,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             });
         }
 
-        Opcode::Vsplit
-        | Opcode::Vconcat
-        | Opcode::Uload8x8Complex
-        | Opcode::Sload8x8Complex
-        | Opcode::Uload16x4Complex
-        | Opcode::Sload16x4Complex
-        | Opcode::Uload32x2Complex
-        | Opcode::Sload32x2Complex => {
+        Opcode::Vsplit | Opcode::Vconcat => {
             // TODO
             panic!("Vector ops not implemented.");
         }
@@ -2569,6 +2590,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             //
             // This is a scalar Fcopysign.
             // This uses scalar NEON operations for 64-bit and vector operations (2S) for 32-bit.
+            // In the latter case it still sets all bits except the lowest 32 to 0.
             //
             //     mov vd, vn
             //     ushr vtmp, vm, #63 / #31
@@ -2583,7 +2605,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             let tmp = ctx.alloc_tmp(RegClass::V128, F64);
 
             // Copy LHS to rd.
-            ctx.emit(Inst::FpuMove64 { rd, rn });
+            ctx.emit(Inst::gen_move(rd, rn, ty));
 
             // Copy the sign bit to the lowest bit in tmp.
             let imm = FPURightShiftImm::maybe_from_u8(bits - 1, bits).unwrap();
diff --git a/cranelift/filetests/filetests/isa/aarch64/amodes.clif b/cranelift/filetests/filetests/isa/aarch64/amodes.clif
index 6cb728c45d..ad109e340e 100644
--- a/cranelift/filetests/filetests/isa/aarch64/amodes.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/amodes.clif
@@ -299,3 +299,48 @@ block0(v0: i64):
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
+
+function %f18(i64, i32) -> i16x8 {
+block0(v0: i64, v1: i32):
+    v2 = uextend.i64 v1
+    v3 = sload8x8_complex v2+v0
+    return v3
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: ldr d0, [x0, w1, UXTW]
+; nextln: sxtl v0.8h, v0.8b
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f19(i64, i64) -> i32x4 {
+block0(v0: i64, v1: i64):
+    v2 = uload16x4_complex v0+v1+8
+    return v2
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: add x0, x0, x1
+; nextln: ldr d0, [x0, #8]
+; nextln: uxtl v0.4s, v0.4h
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f20(i64, i32) -> i64x2 {
+block0(v0: i64, v1: i32):
+    v2 = sextend.i64 v1
+    v3 = uload32x2_complex v2+v0
+    return v3
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: ldr d0, [x0, w1, SXTW]
+; nextln: uxtl v0.2d, v0.2s
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
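
A quick standalone check of the new FpuMove64 encoding (not part of the patch itself): assuming enc_fpurr lays out an instruction as (top22 << 10) | (rn << 5) | rd, which matches how it is called in emit.rs above, the top22 constant produces exactly the `fmov d8, d4` bytes expected by the updated emit test. The enc_fmov64 helper below is an illustrative stand-in, not a Cranelift function.

    // Minimal sketch: rebuild the FpuMove64 encoding outside of Cranelift.
    // rd and rn are vector register numbers in the range 0..=31.
    fn enc_fmov64(rd: u32, rn: u32) -> u32 {
        // The same top22 bits as passed to enc_fpurr above:
        // FMOV (register), double precision.
        let top22: u32 = 0b000_11110_01_1_000000_10000;
        (top22 << 10) | (rn << 5) | rd
    }

    fn main() {
        let insn = enc_fmov64(8, 4); // fmov d8, d4
        assert_eq!(insn, 0x1E60_4088);
        // Little-endian byte order, matching the "8840601E" test string.
        assert_eq!(insn.to_le_bytes(), [0x88, 0x40, 0x60, 0x1E]);
    }

The switch is behavior-preserving: the old `mov v8.8b, v4.8b` (a 64-bit vector ORR, bytes 881CA40E) and the new `fmov d8, d4` both copy the low 64 bits and zero the upper half of the destination register.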