AArch64: port misc ops to ISLE. (#4796)

* Add some precise-output compile tests for aarch64.

* AArch64: port misc ops to ISLE.

- get_pinned_reg / set_pinned_reg
- bitcast
- stack_addr
- extractlane
- insertlane
- vhigh_bits
- iadd_ifcout
- fcvt_low_from_sint
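
Porting an op to ISLE replaces a hand-written Rust match arm in the lowering code with a declarative rule; each deleted arm below becomes `implemented_in_isle(ctx)`, signaling that the ISLE rules now handle the instruction. As a minimal sketch of the shape of such rules — helper names here are illustrative assumptions, not necessarily the exact constructors this commit adds — the pinned-register ops could look like:

    ;; Sketch only: `mov_from_preg`, `pinned_reg`, and `write_pinned_reg` are
    ;; assumed helper names, not necessarily those in this commit.
    (rule (lower (get_pinned_reg))
          (mov_from_preg (pinned_reg)))

    (rule (lower (set_pinned_reg val))
          (side_effect (write_pinned_reg val)))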
Author:    Chris Fallin
Committed: 2022-08-29 12:56:39 -07:00 (via GitHub)
Parent:    6368c6b188
Commit:    a6eb24bd4f
18 changed files with 1362 additions and 662 deletions


@@ -4,7 +4,7 @@ use super::lower::*;
 use crate::binemit::CodeOffset;
 use crate::ir::types::*;
 use crate::ir::Inst as IRInst;
-use crate::ir::{InstructionData, Opcode};
+use crate::ir::Opcode;
 use crate::isa::aarch64::inst::*;
 use crate::isa::aarch64::settings as aarch64_settings;
 use crate::machinst::lower::*;
@@ -13,7 +13,6 @@ use crate::settings::{Flags, TlsModel};
 use crate::{CodegenError, CodegenResult};
 use alloc::boxed::Box;
 use alloc::vec::Vec;
-use core::convert::TryFrom;
 use target_lexicon::Triple;

 /// Actually codegen an instruction's results into registers.
@@ -231,23 +230,7 @@ pub(crate) fn lower_insn_to_regs(
             }
         }
-        Opcode::StackAddr => {
-            let (stack_slot, offset) = match *ctx.data(insn) {
-                InstructionData::StackLoad {
-                    opcode: Opcode::StackAddr,
-                    stack_slot,
-                    offset,
-                } => (stack_slot, offset),
-                _ => unreachable!(),
-            };
-            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            let offset: i32 = offset.into();
-            assert!(ctx.abi().sized_stackslot_offsets().is_valid(stack_slot));
-            let inst =
-                ctx.abi()
-                    .sized_stackslot_addr(stack_slot, u32::try_from(offset).unwrap(), rd);
-            ctx.emit(inst);
-        }
+        Opcode::StackAddr => implemented_in_isle(ctx),
         Opcode::DynamicStackAddr => implemented_in_isle(ctx),
@@ -421,52 +404,7 @@ pub(crate) fn lower_insn_to_regs(
         Opcode::Bint => implemented_in_isle(ctx),
-        Opcode::Bitcast => {
-            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            let ity = ctx.input_ty(insn, 0);
-            let oty = ctx.output_ty(insn, 0);
-            let ity_bits = ty_bits(ity);
-            let ity_vec_reg = ty_has_float_or_vec_representation(ity);
-            let oty_bits = ty_bits(oty);
-            let oty_vec_reg = ty_has_float_or_vec_representation(oty);
-
-            debug_assert_eq!(ity_bits, oty_bits);
-
-            match (ity_vec_reg, oty_vec_reg) {
-                (true, true) => {
-                    let narrow_mode = if ity_bits <= 32 {
-                        NarrowValueMode::ZeroExtend32
-                    } else {
-                        NarrowValueMode::ZeroExtend64
-                    };
-                    let rm = put_input_in_reg(ctx, inputs[0], narrow_mode);
-                    ctx.emit(Inst::gen_move(rd, rm, oty));
-                }
-                (false, false) => {
-                    let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-                    ctx.emit(Inst::gen_move(rd, rm, oty));
-                }
-                (false, true) => {
-                    let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend64);
-                    ctx.emit(Inst::MovToFpu {
-                        rd,
-                        rn,
-                        size: ScalarSize::Size64,
-                    });
-                }
-                (true, false) => {
-                    let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-                    let size = ScalarSize::from_bits(oty_bits);
-                    ctx.emit(Inst::MovFromVec {
-                        rd,
-                        rn,
-                        idx: 0,
-                        size,
-                    });
-                }
-            }
-        }
+        Opcode::Bitcast => implemented_in_isle(ctx),
         Opcode::Return => implemented_in_isle(ctx),
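
The deleted `Bitcast` arm above dispatches on whether each side of the cast occupies an integer or a vector/FP register. In ISLE this becomes separate type-predicated rules; a sketch of two of the four cases, with assumed extractor and constructor names:

    ;; Sketch only: `ty_int`, `ty_float_or_vec`, `mov_to_fpu`, `mov_from_vec`,
    ;; and `scalar_size` are illustrative names.
    ;; Integer bits into an FP/vector register: one move into the FPU file.
    (rule (lower (has_type (ty_float_or_vec _) (bitcast x @ (value_type (ty_int _)))))
          (mov_to_fpu x (ScalarSize.Size64)))

    ;; FP/vector bits back to an integer register: read lane 0.
    (rule (lower (has_type (ty_int out_ty) (bitcast x @ (value_type (ty_float_or_vec _)))))
          (mov_from_vec x 0 (scalar_size out_ty)))

When both sides live in the same register class, a plain register move suffices, as in the deleted code.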
@@ -556,15 +494,7 @@ pub(crate) fn lower_insn_to_regs(
         Opcode::Call | Opcode::CallIndirect => implemented_in_isle(ctx),
-        Opcode::GetPinnedReg => {
-            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            ctx.emit(Inst::gen_move(rd, xreg(PINNED_REG), I64));
-        }
-        Opcode::SetPinnedReg => {
-            let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-            ctx.emit(Inst::gen_move(writable_xreg(PINNED_REG), rm, I64));
-        }
+        Opcode::GetPinnedReg | Opcode::SetPinnedReg => implemented_in_isle(ctx),
         Opcode::Jump
         | Opcode::Brz
@@ -578,67 +508,11 @@ pub(crate) fn lower_insn_to_regs(
         Opcode::Vconst => implemented_in_isle(ctx),
-        Opcode::RawBitcast => {
-            let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            let ty = ctx.input_ty(insn, 0);
-            ctx.emit(Inst::gen_move(rd, rm, ty));
-        }
+        Opcode::RawBitcast => implemented_in_isle(ctx),
-        Opcode::Extractlane => {
-            if let InstructionData::BinaryImm8 { imm, .. } = ctx.data(insn) {
-                let idx = *imm;
-                let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-                let input_ty = ctx.input_ty(insn, 0);
-                let size = VectorSize::from_ty(input_ty);
-                let ty = ty.unwrap();
-
-                if ty_has_int_representation(ty) {
-                    ctx.emit(Inst::MovFromVec {
-                        rd,
-                        rn,
-                        idx,
-                        size: size.lane_size(),
-                    });
-                // Plain moves are faster on some processors.
-                } else if idx == 0 {
-                    ctx.emit(Inst::gen_move(rd, rn, ty));
-                } else {
-                    ctx.emit(Inst::FpuMoveFromVec { rd, rn, idx, size });
-                }
-            } else {
-                unreachable!();
-            }
-        }
+        Opcode::Extractlane => implemented_in_isle(ctx),
-        Opcode::Insertlane => {
-            let idx = if let InstructionData::TernaryImm8 { imm, .. } = ctx.data(insn) {
-                *imm
-            } else {
-                unreachable!();
-            };
-            let input_ty = ctx.input_ty(insn, 1);
-            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-            let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
-            let ty = ty.unwrap();
-            let size = VectorSize::from_ty(ty);
-
-            ctx.emit(Inst::gen_move(rd, rm, ty));
-
-            if ty_has_int_representation(input_ty) {
-                ctx.emit(Inst::MovToVec { rd, rn, idx, size });
-            } else {
-                ctx.emit(Inst::VecMovElement {
-                    rd,
-                    rn,
-                    dest_idx: idx,
-                    src_idx: 0,
-                    size,
-                });
-            }
-        }
+        Opcode::Insertlane => implemented_in_isle(ctx),
         Opcode::Splat => implemented_in_isle(ctx),
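
The deleted lane ops above distinguish integer lanes (a `MovFromVec`, i.e. `umov`) from floating-point lanes, with a lane-0 fast path because plain moves are faster on some processors. In ISLE that dispatch becomes separate rules with priorities; a sketch under assumed helper names:

    ;; Sketch only: `ty_vec_int`, `ty_vec_float`, `mov_from_vec`, `fpu_move`,
    ;; and `lane_size` are illustrative names. The higher-priority rule
    ;; captures the float lane-0 fast path.
    (rule (lower (extractlane x @ (value_type (ty_vec_int ty)) (u8_from_uimm8 idx)))
          (mov_from_vec x idx (lane_size ty)))

    (rule 1 (lower (extractlane x @ (value_type (ty_vec_float ty)) (u8_from_uimm8 0)))
          (fpu_move x ty))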
@@ -646,240 +520,7 @@ pub(crate) fn lower_insn_to_regs(
         Opcode::VallTrue | Opcode::VanyTrue => implemented_in_isle(ctx),
-        Opcode::VhighBits => {
-            let dst_r = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            let src_v = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-            let ty = ctx.input_ty(insn, 0);
-            // All three sequences use one integer temporary and two vector temporaries. The
-            // shift is done early so as to give the register allocator the possibility of using
-            // the same reg for `tmp_v1` and `src_v` in the case that this is the last use of
-            // `src_v`. See https://github.com/WebAssembly/simd/pull/201 for the background and
-            // derivation of these sequences. Alternative sequences are discussed in
-            // https://github.com/bytecodealliance/wasmtime/issues/2296, although they are not
-            // used here.
-            let tmp_r0 = ctx.alloc_tmp(I64).only_reg().unwrap();
-            let tmp_v0 = ctx.alloc_tmp(I8X16).only_reg().unwrap();
-            let tmp_v1 = ctx.alloc_tmp(I8X16).only_reg().unwrap();
-            match ty {
-                I8X16 => {
-                    // sshr tmp_v1.16b, src_v.16b, #7
-                    // mov tmp_r0, #0x0201
-                    // movk tmp_r0, #0x0804, lsl 16
-                    // movk tmp_r0, #0x2010, lsl 32
-                    // movk tmp_r0, #0x8040, lsl 48
-                    // dup tmp_v0.2d, tmp_r0
-                    // and tmp_v1.16b, tmp_v1.16b, tmp_v0.16b
-                    // ext tmp_v0.16b, tmp_v1.16b, tmp_v1.16b, #8
-                    // zip1 tmp_v0.16b, tmp_v1.16b, tmp_v0.16b
-                    // addv tmp_v0h, tmp_v0.8h
-                    // mov dst_r, tmp_v0.h[0]
-                    ctx.emit(Inst::VecShiftImm {
-                        op: VecShiftImmOp::Sshr,
-                        rd: tmp_v1,
-                        rn: src_v,
-                        size: VectorSize::Size8x16,
-                        imm: 7,
-                    });
-                    lower_splat_const(ctx, tmp_v0, 0x8040201008040201u64, VectorSize::Size64x2);
-                    ctx.emit(Inst::VecRRR {
-                        alu_op: VecALUOp::And,
-                        rd: tmp_v1,
-                        rn: tmp_v1.to_reg(),
-                        rm: tmp_v0.to_reg(),
-                        size: VectorSize::Size8x16,
-                    });
-                    ctx.emit(Inst::VecExtract {
-                        rd: tmp_v0,
-                        rn: tmp_v1.to_reg(),
-                        rm: tmp_v1.to_reg(),
-                        imm4: 8,
-                    });
-                    ctx.emit(Inst::VecRRR {
-                        alu_op: VecALUOp::Zip1,
-                        rd: tmp_v0,
-                        rn: tmp_v1.to_reg(),
-                        rm: tmp_v0.to_reg(),
-                        size: VectorSize::Size8x16,
-                    });
-                    ctx.emit(Inst::VecLanes {
-                        op: VecLanesOp::Addv,
-                        rd: tmp_v0,
-                        rn: tmp_v0.to_reg(),
-                        size: VectorSize::Size16x8,
-                    });
-                    ctx.emit(Inst::MovFromVec {
-                        rd: dst_r,
-                        rn: tmp_v0.to_reg(),
-                        idx: 0,
-                        size: ScalarSize::Size16,
-                    });
-                }
-                I16X8 => {
-                    // sshr tmp_v1.8h, src_v.8h, #15
-                    // mov tmp_r0, #0x1
-                    // movk tmp_r0, #0x2, lsl 16
-                    // movk tmp_r0, #0x4, lsl 32
-                    // movk tmp_r0, #0x8, lsl 48
-                    // dup tmp_v0.2d, tmp_r0
-                    // shl tmp_r0, tmp_r0, #4
-                    // mov tmp_v0.d[1], tmp_r0
-                    // and tmp_v0.16b, tmp_v1.16b, tmp_v0.16b
-                    // addv tmp_v0h, tmp_v0.8h
-                    // mov dst_r, tmp_v0.h[0]
-                    ctx.emit(Inst::VecShiftImm {
-                        op: VecShiftImmOp::Sshr,
-                        rd: tmp_v1,
-                        rn: src_v,
-                        size: VectorSize::Size16x8,
-                        imm: 15,
-                    });
-                    lower_constant_u64(ctx, tmp_r0, 0x0008000400020001u64);
-                    ctx.emit(Inst::VecDup {
-                        rd: tmp_v0,
-                        rn: tmp_r0.to_reg(),
-                        size: VectorSize::Size64x2,
-                    });
-                    ctx.emit(Inst::AluRRImmShift {
-                        alu_op: ALUOp::Lsl,
-                        size: OperandSize::Size64,
-                        rd: tmp_r0,
-                        rn: tmp_r0.to_reg(),
-                        immshift: ImmShift { imm: 4 },
-                    });
-                    ctx.emit(Inst::MovToVec {
-                        rd: tmp_v0,
-                        rn: tmp_r0.to_reg(),
-                        idx: 1,
-                        size: VectorSize::Size64x2,
-                    });
-                    ctx.emit(Inst::VecRRR {
-                        alu_op: VecALUOp::And,
-                        rd: tmp_v0,
-                        rn: tmp_v1.to_reg(),
-                        rm: tmp_v0.to_reg(),
-                        size: VectorSize::Size8x16,
-                    });
-                    ctx.emit(Inst::VecLanes {
-                        op: VecLanesOp::Addv,
-                        rd: tmp_v0,
-                        rn: tmp_v0.to_reg(),
-                        size: VectorSize::Size16x8,
-                    });
-                    ctx.emit(Inst::MovFromVec {
-                        rd: dst_r,
-                        rn: tmp_v0.to_reg(),
-                        idx: 0,
-                        size: ScalarSize::Size16,
-                    });
-                }
-                I32X4 => {
-                    // sshr tmp_v1.4s, src_v.4s, #31
-                    // mov tmp_r0, #0x1
-                    // movk tmp_r0, #0x2, lsl 32
-                    // dup tmp_v0.2d, tmp_r0
-                    // shl tmp_r0, tmp_r0, #2
-                    // mov tmp_v0.d[1], tmp_r0
-                    // and tmp_v0.16b, tmp_v1.16b, tmp_v0.16b
-                    // addv tmp_v0s, tmp_v0.4s
-                    // mov dst_r, tmp_v0.s[0]
-                    ctx.emit(Inst::VecShiftImm {
-                        op: VecShiftImmOp::Sshr,
-                        rd: tmp_v1,
-                        rn: src_v,
-                        size: VectorSize::Size32x4,
-                        imm: 31,
-                    });
-                    lower_constant_u64(ctx, tmp_r0, 0x0000000200000001u64);
-                    ctx.emit(Inst::VecDup {
-                        rd: tmp_v0,
-                        rn: tmp_r0.to_reg(),
-                        size: VectorSize::Size64x2,
-                    });
-                    ctx.emit(Inst::AluRRImmShift {
-                        alu_op: ALUOp::Lsl,
-                        size: OperandSize::Size64,
-                        rd: tmp_r0,
-                        rn: tmp_r0.to_reg(),
-                        immshift: ImmShift { imm: 2 },
-                    });
-                    ctx.emit(Inst::MovToVec {
-                        rd: tmp_v0,
-                        rn: tmp_r0.to_reg(),
-                        idx: 1,
-                        size: VectorSize::Size64x2,
-                    });
-                    ctx.emit(Inst::VecRRR {
-                        alu_op: VecALUOp::And,
-                        rd: tmp_v0,
-                        rn: tmp_v1.to_reg(),
-                        rm: tmp_v0.to_reg(),
-                        size: VectorSize::Size8x16,
-                    });
-                    ctx.emit(Inst::VecLanes {
-                        op: VecLanesOp::Addv,
-                        rd: tmp_v0,
-                        rn: tmp_v0.to_reg(),
-                        size: VectorSize::Size32x4,
-                    });
-                    ctx.emit(Inst::MovFromVec {
-                        rd: dst_r,
-                        rn: tmp_v0.to_reg(),
-                        idx: 0,
-                        size: ScalarSize::Size32,
-                    });
-                }
-                I64X2 => {
-                    // mov dst_r, src_v.d[0]
-                    // mov tmp_r0, src_v.d[1]
-                    // lsr dst_r, dst_r, #63
-                    // lsr tmp_r0, tmp_r0, #63
-                    // add dst_r, dst_r, tmp_r0, lsl #1
-                    ctx.emit(Inst::MovFromVec {
-                        rd: dst_r,
-                        rn: src_v,
-                        idx: 0,
-                        size: ScalarSize::Size64,
-                    });
-                    ctx.emit(Inst::MovFromVec {
-                        rd: tmp_r0,
-                        rn: src_v,
-                        idx: 1,
-                        size: ScalarSize::Size64,
-                    });
-                    ctx.emit(Inst::AluRRImmShift {
-                        alu_op: ALUOp::Lsr,
-                        size: OperandSize::Size64,
-                        rd: dst_r,
-                        rn: dst_r.to_reg(),
-                        immshift: ImmShift::maybe_from_u64(63).unwrap(),
-                    });
-                    ctx.emit(Inst::AluRRImmShift {
-                        alu_op: ALUOp::Lsr,
-                        size: OperandSize::Size64,
-                        rd: tmp_r0,
-                        rn: tmp_r0.to_reg(),
-                        immshift: ImmShift::maybe_from_u64(63).unwrap(),
-                    });
-                    ctx.emit(Inst::AluRRRShift {
-                        alu_op: ALUOp::Add,
-                        size: OperandSize::Size32,
-                        rd: dst_r,
-                        rn: dst_r.to_reg(),
-                        rm: tmp_r0.to_reg(),
-                        shiftop: ShiftOpAndAmt::new(
-                            ShiftOp::LSL,
-                            ShiftOpShiftImm::maybe_from_shift(1).unwrap(),
-                        ),
-                    });
-                }
-                _ => {
-                    return Err(CodegenError::Unsupported(format!(
-                        "VhighBits: Unsupported type: {:?}",
-                        ty
-                    )))
-                }
-            }
-        }
+        Opcode::VhighBits => implemented_in_isle(ctx),
         Opcode::Shuffle => implemented_in_isle(ctx),
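
The long comment in the deleted `VhighBits` arm above explains the per-type instruction sequences (see WebAssembly/simd#201). In ISLE, each sequence becomes one type-specialized rule; the I64X2 case, which needs no vector temporaries, could sketch as follows, with assumed helper names:

    ;; Sketch only: `mov_from_vec`, `lsr_imm`, `add_shift`, and the shift-amount
    ;; constructors are illustrative names.
    ;; mov dst, src.d[0]; mov tmp, src.d[1]; lsr; lsr; add dst, dst, tmp, lsl #1
    (rule (lower (vhigh_bits x @ (value_type $I64X2)))
          (let ((lo Reg (mov_from_vec x 0 (ScalarSize.Size64)))
                (hi Reg (mov_from_vec x 1 (ScalarSize.Size64)))
                (lo_bit Reg (lsr_imm $I64 lo (imm_shift_from_u8 63)))
                (hi_bit Reg (lsr_imm $I64 hi (imm_shift_from_u8 63))))
            (add_shift $I64 lo_bit hi_bit (lshl_from_u64 $I64 1))))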
@@ -917,34 +558,7 @@ pub(crate) fn lower_insn_to_regs(
         Opcode::FcvtToUintSat | Opcode::FcvtToSintSat => implemented_in_isle(ctx),
-        Opcode::IaddIfcout => {
-            // This is a two-output instruction that is needed for the
-            // legalizer's explicit heap-check sequence, among possible other
-            // uses. Its second output is a flags output only ever meant to
-            // check for overflow using the
-            // `backend.unsigned_add_overflow_condition()` condition.
-            //
-            // Note that the CLIF validation will ensure that no flag-setting
-            // operation comes between this IaddIfcout and its use (e.g., a
-            // Trapif). Thus, we can rely on implicit communication through the
-            // processor flags rather than explicitly generating flags into a
-            // register. We simply use the variant of the add instruction that
-            // sets flags (`adds`) here.
-            //
-            // Note that the second output (the flags) need not be generated,
-            // because flags are never materialized into a register; the only
-            // instructions that can use a value of type `iflags` or `fflags`
-            // will look directly for the flags-producing instruction (which can
-            // always be found, by construction) and merge it.
-            //
-            // Now handle the iadd as above, except use an AddS opcode that sets
-            // flags.
-            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-            let rm = put_input_in_rse_imm12(ctx, inputs[1], NarrowValueMode::None);
-            let ty = ty.unwrap();
-            ctx.emit(alu_inst_imm12(ALUOp::AddS, ty, rd, rn, rm));
-        }
+        Opcode::IaddIfcout => implemented_in_isle(ctx),
         Opcode::IaddImm
         | Opcode::ImulImm
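
The deleted comment above explains why `iadd_ifcout` can lower to a bare `adds`: CLIF validation guarantees no flag-setting instruction intervenes before the flags consumer, and the `iflags` output is never materialized into a register. A sketch of the corresponding ISLE rule, under assumed names:

    ;; Sketch only: `add_with_flags` and `invalid_reg` are illustrative names.
    ;; Output 0 is the sum; output 1 (the flags) is deliberately left
    ;; unmaterialized, since consumers read the processor flags directly.
    (rule (lower (has_type (fits_in_64 ty) (iadd_ifcout x y)))
          (output_pair (add_with_flags ty x y) (invalid_reg)))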
@@ -1006,47 +620,9 @@ pub(crate) fn lower_insn_to_regs(
         Opcode::SqmulRoundSat => implemented_in_isle(ctx),
-        Opcode::FcvtLowFromSint => {
-            let ty = ty.unwrap();
-
-            if ty != F64X2 {
-                return Err(CodegenError::Unsupported(format!(
-                    "FcvtLowFromSint: Unsupported type: {:?}",
-                    ty
-                )));
-            }
-
-            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-
-            ctx.emit(Inst::VecExtend {
-                t: VecExtendOp::Sxtl,
-                rd,
-                rn,
-                high_half: false,
-                lane_size: ScalarSize::Size64,
-            });
-            ctx.emit(Inst::VecMisc {
-                op: VecMisc2::Scvtf,
-                rd,
-                rn: rd.to_reg(),
-                size: VectorSize::Size64x2,
-            });
-        }
+        Opcode::FcvtLowFromSint => implemented_in_isle(ctx),
-        Opcode::FvpromoteLow => {
-            debug_assert_eq!(ty.unwrap(), F64X2);
-            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-            ctx.emit(Inst::VecRRLong {
-                op: VecRRLongOp::Fcvtl32,
-                rd,
-                rn,
-                high_half: false,
-            });
-        }
+        Opcode::FvpromoteLow => implemented_in_isle(ctx),
         Opcode::Fvdemote => implemented_in_isle(ctx),
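
The deleted `FcvtLowFromSint` arm emits `sxtl` (sign-extend the low two lanes to 64 bits) followed by `scvtf` (signed-integer-to-float conversion). As an ISLE sketch, with constructor names assumed to mirror the `Inst::VecExtend` and `Inst::VecMisc` variants:

    ;; Sketch only: `vec_extend` and `vec_misc` are assumed to wrap
    ;; Inst::VecExtend and Inst::VecMisc respectively; `$false` selects the
    ;; low half for the extend.
    (rule (lower (has_type $F64X2 (fcvt_low_from_sint x)))
          (vec_misc (VecMisc2.Scvtf)
                    (vec_extend (VecExtendOp.Sxtl) x $false (ScalarSize.Size64))
                    (VectorSize.Size64x2)))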