AArch64: port misc ops to ISLE. (#4796)

* Add some precise-output compile tests for aarch64.

* AArch64: port misc ops to ISLE.

- get_pinned_reg / set_pinned_reg
- bitcast
- stack_addr
- extractlane
- insertlane
- vhigh_bits
- iadd_ifcout
- fcvt_low_from_sint
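
Porting an op to ISLE replaces a hand-written Rust match arm in the lowering code with a declarative rule; each deleted arm below becomes `implemented_in_isle(ctx)`, signaling that the ISLE rules now handle the instruction. As a minimal sketch of the shape of such rules — helper names here are illustrative assumptions, not necessarily the exact constructors this commit adds — the pinned-register ops could look like:

    ;; Sketch only: `mov_from_preg`, `pinned_reg`, and `write_pinned_reg` are
    ;; assumed helper names, not necessarily those in this commit.
    (rule (lower (get_pinned_reg))
          (mov_from_preg (pinned_reg)))

    (rule (lower (set_pinned_reg val))
          (side_effect (write_pinned_reg val)))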
Author:    Chris Fallin
Committed: 2022-08-29 12:56:39 -07:00 (via GitHub)
Parent:    6368c6b188
Commit:    a6eb24bd4f
18 changed files with 1362 additions and 662 deletions


@@ -4,7 +4,7 @@ use super::lower::*;
 use crate::binemit::CodeOffset;
 use crate::ir::types::*;
 use crate::ir::Inst as IRInst;
-use crate::ir::{InstructionData, Opcode};
+use crate::ir::Opcode;
 use crate::isa::aarch64::inst::*;
 use crate::isa::aarch64::settings as aarch64_settings;
 use crate::machinst::lower::*;
@@ -13,7 +13,6 @@ use crate::settings::{Flags, TlsModel};
 use crate::{CodegenError, CodegenResult};
 use alloc::boxed::Box;
 use alloc::vec::Vec;
-use core::convert::TryFrom;
 use target_lexicon::Triple;

 /// Actually codegen an instruction's results into registers.
@@ -231,23 +230,7 @@ pub(crate) fn lower_insn_to_regs(
             }
         }
-        Opcode::StackAddr => {
-            let (stack_slot, offset) = match *ctx.data(insn) {
-                InstructionData::StackLoad {
-                    opcode: Opcode::StackAddr,
-                    stack_slot,
-                    offset,
-                } => (stack_slot, offset),
-                _ => unreachable!(),
-            };
-            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            let offset: i32 = offset.into();
-            assert!(ctx.abi().sized_stackslot_offsets().is_valid(stack_slot));
-            let inst =
-                ctx.abi()
-                    .sized_stackslot_addr(stack_slot, u32::try_from(offset).unwrap(), rd);
-            ctx.emit(inst);
-        }
+        Opcode::StackAddr => implemented_in_isle(ctx),
         Opcode::DynamicStackAddr => implemented_in_isle(ctx),
@@ -421,52 +404,7 @@ pub(crate) fn lower_insn_to_regs(
         Opcode::Bint => implemented_in_isle(ctx),
-        Opcode::Bitcast => {
-            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            let ity = ctx.input_ty(insn, 0);
-            let oty = ctx.output_ty(insn, 0);
-            let ity_bits = ty_bits(ity);
-            let ity_vec_reg = ty_has_float_or_vec_representation(ity);
-            let oty_bits = ty_bits(oty);
-            let oty_vec_reg = ty_has_float_or_vec_representation(oty);
-
-            debug_assert_eq!(ity_bits, oty_bits);
-
-            match (ity_vec_reg, oty_vec_reg) {
-                (true, true) => {
-                    let narrow_mode = if ity_bits <= 32 {
-                        NarrowValueMode::ZeroExtend32
-                    } else {
-                        NarrowValueMode::ZeroExtend64
-                    };
-                    let rm = put_input_in_reg(ctx, inputs[0], narrow_mode);
-                    ctx.emit(Inst::gen_move(rd, rm, oty));
-                }
-                (false, false) => {
-                    let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-                    ctx.emit(Inst::gen_move(rd, rm, oty));
-                }
-                (false, true) => {
-                    let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend64);
-                    ctx.emit(Inst::MovToFpu {
-                        rd,
-                        rn,
-                        size: ScalarSize::Size64,
-                    });
-                }
-                (true, false) => {
-                    let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-                    let size = ScalarSize::from_bits(oty_bits);
-                    ctx.emit(Inst::MovFromVec {
-                        rd,
-                        rn,
-                        idx: 0,
-                        size,
-                    });
-                }
-            }
-        }
+        Opcode::Bitcast => implemented_in_isle(ctx),
         Opcode::Return => implemented_in_isle(ctx),
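
The deleted `Bitcast` arm above dispatches on whether each side of the cast occupies an integer or a vector/FP register. In ISLE this becomes separate type-predicated rules; a sketch of two of the four cases, with assumed extractor and constructor names:

    ;; Sketch only: `ty_int`, `ty_float_or_vec`, `mov_to_fpu`, `mov_from_vec`,
    ;; and `scalar_size` are illustrative names.
    ;; Integer bits into an FP/vector register: one move into the FPU file.
    (rule (lower (has_type (ty_float_or_vec _) (bitcast x @ (value_type (ty_int _)))))
          (mov_to_fpu x (ScalarSize.Size64)))

    ;; FP/vector bits back to an integer register: read lane 0.
    (rule (lower (has_type (ty_int out_ty) (bitcast x @ (value_type (ty_float_or_vec _)))))
          (mov_from_vec x 0 (scalar_size out_ty)))

When both sides live in the same register class, a plain register move suffices, as in the deleted code.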
@@ -556,15 +494,7 @@ pub(crate) fn lower_insn_to_regs(
         Opcode::Call | Opcode::CallIndirect => implemented_in_isle(ctx),
-        Opcode::GetPinnedReg => {
-            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            ctx.emit(Inst::gen_move(rd, xreg(PINNED_REG), I64));
-        }
-        Opcode::SetPinnedReg => {
-            let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-            ctx.emit(Inst::gen_move(writable_xreg(PINNED_REG), rm, I64));
-        }
+        Opcode::GetPinnedReg | Opcode::SetPinnedReg => implemented_in_isle(ctx),
         Opcode::Jump
         | Opcode::Brz
@@ -578,67 +508,11 @@ pub(crate) fn lower_insn_to_regs(
         Opcode::Vconst => implemented_in_isle(ctx),
-        Opcode::RawBitcast => {
-            let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            let ty = ctx.input_ty(insn, 0);
-            ctx.emit(Inst::gen_move(rd, rm, ty));
-        }
+        Opcode::RawBitcast => implemented_in_isle(ctx),
-        Opcode::Extractlane => {
-            if let InstructionData::BinaryImm8 { imm, .. } = ctx.data(insn) {
-                let idx = *imm;
-                let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-                let input_ty = ctx.input_ty(insn, 0);
-                let size = VectorSize::from_ty(input_ty);
-                let ty = ty.unwrap();
-
-                if ty_has_int_representation(ty) {
-                    ctx.emit(Inst::MovFromVec {
-                        rd,
-                        rn,
-                        idx,
-                        size: size.lane_size(),
-                    });
-                // Plain moves are faster on some processors.
-                } else if idx == 0 {
-                    ctx.emit(Inst::gen_move(rd, rn, ty));
-                } else {
-                    ctx.emit(Inst::FpuMoveFromVec { rd, rn, idx, size });
-                }
-            } else {
-                unreachable!();
-            }
-        }
+        Opcode::Extractlane => implemented_in_isle(ctx),
-        Opcode::Insertlane => {
-            let idx = if let InstructionData::TernaryImm8 { imm, .. } = ctx.data(insn) {
-                *imm
-            } else {
-                unreachable!();
-            };
-            let input_ty = ctx.input_ty(insn, 1);
-            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-            let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
-            let ty = ty.unwrap();
-            let size = VectorSize::from_ty(ty);
-
-            ctx.emit(Inst::gen_move(rd, rm, ty));
-
-            if ty_has_int_representation(input_ty) {
-                ctx.emit(Inst::MovToVec { rd, rn, idx, size });
-            } else {
-                ctx.emit(Inst::VecMovElement {
-                    rd,
-                    rn,
-                    dest_idx: idx,
-                    src_idx: 0,
-                    size,
-                });
-            }
-        }
+        Opcode::Insertlane => implemented_in_isle(ctx),
         Opcode::Splat => implemented_in_isle(ctx),
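
The deleted lane ops above distinguish integer lanes (a `MovFromVec`, i.e. `umov`) from floating-point lanes, with a lane-0 fast path because plain moves are faster on some processors. In ISLE that dispatch becomes separate rules with priorities; a sketch under assumed helper names:

    ;; Sketch only: `ty_vec_int`, `ty_vec_float`, `mov_from_vec`, `fpu_move`,
    ;; and `lane_size` are illustrative names. The higher-priority rule
    ;; captures the float lane-0 fast path.
    (rule (lower (extractlane x @ (value_type (ty_vec_int ty)) (u8_from_uimm8 idx)))
          (mov_from_vec x idx (lane_size ty)))

    (rule 1 (lower (extractlane x @ (value_type (ty_vec_float ty)) (u8_from_uimm8 0)))
          (fpu_move x ty))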
@@ -646,240 +520,7 @@ pub(crate) fn lower_insn_to_regs(
         Opcode::VallTrue | Opcode::VanyTrue => implemented_in_isle(ctx),
-        Opcode::VhighBits => {
-            let dst_r = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            let src_v = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-            let ty = ctx.input_ty(insn, 0);
-            // All three sequences use one integer temporary and two vector temporaries. The
-            // shift is done early so as to give the register allocator the possibility of using
-            // the same reg for `tmp_v1` and `src_v` in the case that this is the last use of
-            // `src_v`. See https://github.com/WebAssembly/simd/pull/201 for the background and
-            // derivation of these sequences. Alternative sequences are discussed in
-            // https://github.com/bytecodealliance/wasmtime/issues/2296, although they are not
-            // used here.
-            let tmp_r0 = ctx.alloc_tmp(I64).only_reg().unwrap();
-            let tmp_v0 = ctx.alloc_tmp(I8X16).only_reg().unwrap();
-            let tmp_v1 = ctx.alloc_tmp(I8X16).only_reg().unwrap();
-            match ty {
-                I8X16 => {
-                    // sshr tmp_v1.16b, src_v.16b, #7
-                    // mov tmp_r0, #0x0201
-                    // movk tmp_r0, #0x0804, lsl 16
-                    // movk tmp_r0, #0x2010, lsl 32
-                    // movk tmp_r0, #0x8040, lsl 48
-                    // dup tmp_v0.2d, tmp_r0
-                    // and tmp_v1.16b, tmp_v1.16b, tmp_v0.16b
-                    // ext tmp_v0.16b, tmp_v1.16b, tmp_v1.16b, #8
-                    // zip1 tmp_v0.16b, tmp_v1.16b, tmp_v0.16b
-                    // addv tmp_v0h, tmp_v0.8h
-                    // mov dst_r, tmp_v0.h[0]
-                    ctx.emit(Inst::VecShiftImm {
-                        op: VecShiftImmOp::Sshr,
-                        rd: tmp_v1,
-                        rn: src_v,
-                        size: VectorSize::Size8x16,
-                        imm: 7,
-                    });
-                    lower_splat_const(ctx, tmp_v0, 0x8040201008040201u64, VectorSize::Size64x2);
-                    ctx.emit(Inst::VecRRR {
-                        alu_op: VecALUOp::And,
-                        rd: tmp_v1,
-                        rn: tmp_v1.to_reg(),
-                        rm: tmp_v0.to_reg(),
-                        size: VectorSize::Size8x16,
-                    });
-                    ctx.emit(Inst::VecExtract {
-                        rd: tmp_v0,
-                        rn: tmp_v1.to_reg(),
-                        rm: tmp_v1.to_reg(),
-                        imm4: 8,
-                    });
-                    ctx.emit(Inst::VecRRR {
-                        alu_op: VecALUOp::Zip1,
-                        rd: tmp_v0,
-                        rn: tmp_v1.to_reg(),
-                        rm: tmp_v0.to_reg(),
-                        size: VectorSize::Size8x16,
-                    });
-                    ctx.emit(Inst::VecLanes {
-                        op: VecLanesOp::Addv,
-                        rd: tmp_v0,
-                        rn: tmp_v0.to_reg(),
-                        size: VectorSize::Size16x8,
-                    });
-                    ctx.emit(Inst::MovFromVec {
-                        rd: dst_r,
-                        rn: tmp_v0.to_reg(),
-                        idx: 0,
-                        size: ScalarSize::Size16,
-                    });
-                }
-                I16X8 => {
-                    // sshr tmp_v1.8h, src_v.8h, #15
-                    // mov tmp_r0, #0x1
-                    // movk tmp_r0, #0x2, lsl 16
-                    // movk tmp_r0, #0x4, lsl 32
-                    // movk tmp_r0, #0x8, lsl 48
-                    // dup tmp_v0.2d, tmp_r0
-                    // shl tmp_r0, tmp_r0, #4
-                    // mov tmp_v0.d[1], tmp_r0
-                    // and tmp_v0.16b, tmp_v1.16b, tmp_v0.16b
-                    // addv tmp_v0h, tmp_v0.8h
-                    // mov dst_r, tmp_v0.h[0]
-                    ctx.emit(Inst::VecShiftImm {
-                        op: VecShiftImmOp::Sshr,
-                        rd: tmp_v1,
-                        rn: src_v,
-                        size: VectorSize::Size16x8,
-                        imm: 15,
-                    });
-                    lower_constant_u64(ctx, tmp_r0, 0x0008000400020001u64);
-                    ctx.emit(Inst::VecDup {
-                        rd: tmp_v0,
-                        rn: tmp_r0.to_reg(),
-                        size: VectorSize::Size64x2,
-                    });
-                    ctx.emit(Inst::AluRRImmShift {
-                        alu_op: ALUOp::Lsl,
-                        size: OperandSize::Size64,
-                        rd: tmp_r0,
-                        rn: tmp_r0.to_reg(),
-                        immshift: ImmShift { imm: 4 },
-                    });
-                    ctx.emit(Inst::MovToVec {
-                        rd: tmp_v0,
-                        rn: tmp_r0.to_reg(),
-                        idx: 1,
-                        size: VectorSize::Size64x2,
-                    });
-                    ctx.emit(Inst::VecRRR {
-                        alu_op: VecALUOp::And,
-                        rd: tmp_v0,
-                        rn: tmp_v1.to_reg(),
-                        rm: tmp_v0.to_reg(),
-                        size: VectorSize::Size8x16,
-                    });
-                    ctx.emit(Inst::VecLanes {
-                        op: VecLanesOp::Addv,
-                        rd: tmp_v0,
-                        rn: tmp_v0.to_reg(),
-                        size: VectorSize::Size16x8,
-                    });
-                    ctx.emit(Inst::MovFromVec {
-                        rd: dst_r,
-                        rn: tmp_v0.to_reg(),
-                        idx: 0,
-                        size: ScalarSize::Size16,
-                    });
-                }
-                I32X4 => {
-                    // sshr tmp_v1.4s, src_v.4s, #31
-                    // mov tmp_r0, #0x1
-                    // movk tmp_r0, #0x2, lsl 32
-                    // dup tmp_v0.2d, tmp_r0
-                    // shl tmp_r0, tmp_r0, #2
-                    // mov tmp_v0.d[1], tmp_r0
-                    // and tmp_v0.16b, tmp_v1.16b, tmp_v0.16b
-                    // addv tmp_v0s, tmp_v0.4s
-                    // mov dst_r, tmp_v0.s[0]
-                    ctx.emit(Inst::VecShiftImm {
-                        op: VecShiftImmOp::Sshr,
-                        rd: tmp_v1,
-                        rn: src_v,
-                        size: VectorSize::Size32x4,
-                        imm: 31,
-                    });
-                    lower_constant_u64(ctx, tmp_r0, 0x0000000200000001u64);
-                    ctx.emit(Inst::VecDup {
-                        rd: tmp_v0,
-                        rn: tmp_r0.to_reg(),
-                        size: VectorSize::Size64x2,
-                    });
-                    ctx.emit(Inst::AluRRImmShift {
-                        alu_op: ALUOp::Lsl,
-                        size: OperandSize::Size64,
-                        rd: tmp_r0,
-                        rn: tmp_r0.to_reg(),
-                        immshift: ImmShift { imm: 2 },
-                    });
-                    ctx.emit(Inst::MovToVec {
-                        rd: tmp_v0,
-                        rn: tmp_r0.to_reg(),
-                        idx: 1,
-                        size: VectorSize::Size64x2,
-                    });
-                    ctx.emit(Inst::VecRRR {
-                        alu_op: VecALUOp::And,
-                        rd: tmp_v0,
-                        rn: tmp_v1.to_reg(),
-                        rm: tmp_v0.to_reg(),
-                        size: VectorSize::Size8x16,
-                    });
-                    ctx.emit(Inst::VecLanes {
-                        op: VecLanesOp::Addv,
-                        rd: tmp_v0,
-                        rn: tmp_v0.to_reg(),
-                        size: VectorSize::Size32x4,
-                    });
-                    ctx.emit(Inst::MovFromVec {
-                        rd: dst_r,
-                        rn: tmp_v0.to_reg(),
-                        idx: 0,
-                        size: ScalarSize::Size32,
-                    });
-                }
-                I64X2 => {
-                    // mov dst_r, src_v.d[0]
-                    // mov tmp_r0, src_v.d[1]
-                    // lsr dst_r, dst_r, #63
-                    // lsr tmp_r0, tmp_r0, #63
-                    // add dst_r, dst_r, tmp_r0, lsl #1
-                    ctx.emit(Inst::MovFromVec {
-                        rd: dst_r,
-                        rn: src_v,
-                        idx: 0,
-                        size: ScalarSize::Size64,
-                    });
-                    ctx.emit(Inst::MovFromVec {
-                        rd: tmp_r0,
-                        rn: src_v,
-                        idx: 1,
-                        size: ScalarSize::Size64,
-                    });
-                    ctx.emit(Inst::AluRRImmShift {
-                        alu_op: ALUOp::Lsr,
-                        size: OperandSize::Size64,
-                        rd: dst_r,
-                        rn: dst_r.to_reg(),
-                        immshift: ImmShift::maybe_from_u64(63).unwrap(),
-                    });
-                    ctx.emit(Inst::AluRRImmShift {
-                        alu_op: ALUOp::Lsr,
-                        size: OperandSize::Size64,
-                        rd: tmp_r0,
-                        rn: tmp_r0.to_reg(),
-                        immshift: ImmShift::maybe_from_u64(63).unwrap(),
-                    });
-                    ctx.emit(Inst::AluRRRShift {
-                        alu_op: ALUOp::Add,
-                        size: OperandSize::Size32,
-                        rd: dst_r,
-                        rn: dst_r.to_reg(),
-                        rm: tmp_r0.to_reg(),
-                        shiftop: ShiftOpAndAmt::new(
-                            ShiftOp::LSL,
-                            ShiftOpShiftImm::maybe_from_shift(1).unwrap(),
-                        ),
-                    });
-                }
-                _ => {
-                    return Err(CodegenError::Unsupported(format!(
-                        "VhighBits: Unsupported type: {:?}",
-                        ty
-                    )))
-                }
-            }
-        }
+        Opcode::VhighBits => implemented_in_isle(ctx),
         Opcode::Shuffle => implemented_in_isle(ctx),
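
The long comment in the deleted `VhighBits` arm above explains the per-type instruction sequences (see WebAssembly/simd#201). In ISLE, each sequence becomes one type-specialized rule; the I64X2 case, which needs no vector temporaries, could sketch as follows, with assumed helper names:

    ;; Sketch only: `mov_from_vec`, `lsr_imm`, `add_shift`, and the shift-amount
    ;; constructors are illustrative names.
    ;; mov dst, src.d[0]; mov tmp, src.d[1]; lsr; lsr; add dst, dst, tmp, lsl #1
    (rule (lower (vhigh_bits x @ (value_type $I64X2)))
          (let ((lo Reg (mov_from_vec x 0 (ScalarSize.Size64)))
                (hi Reg (mov_from_vec x 1 (ScalarSize.Size64)))
                (lo_bit Reg (lsr_imm $I64 lo (imm_shift_from_u8 63)))
                (hi_bit Reg (lsr_imm $I64 hi (imm_shift_from_u8 63))))
            (add_shift $I64 lo_bit hi_bit (lshl_from_u64 $I64 1))))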
@@ -917,34 +558,7 @@ pub(crate) fn lower_insn_to_regs(
         Opcode::FcvtToUintSat | Opcode::FcvtToSintSat => implemented_in_isle(ctx),
-        Opcode::IaddIfcout => {
-            // This is a two-output instruction that is needed for the
-            // legalizer's explicit heap-check sequence, among possible other
-            // uses. Its second output is a flags output only ever meant to
-            // check for overflow using the
-            // `backend.unsigned_add_overflow_condition()` condition.
-            //
-            // Note that the CLIF validation will ensure that no flag-setting
-            // operation comes between this IaddIfcout and its use (e.g., a
-            // Trapif). Thus, we can rely on implicit communication through the
-            // processor flags rather than explicitly generating flags into a
-            // register. We simply use the variant of the add instruction that
-            // sets flags (`adds`) here.
-            //
-            // Note that the second output (the flags) need not be generated,
-            // because flags are never materialized into a register; the only
-            // instructions that can use a value of type `iflags` or `fflags`
-            // will look directly for the flags-producing instruction (which can
-            // always be found, by construction) and merge it.
-            //
-            // Now handle the iadd as above, except use an AddS opcode that sets
-            // flags.
-            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-            let rm = put_input_in_rse_imm12(ctx, inputs[1], NarrowValueMode::None);
-            let ty = ty.unwrap();
-            ctx.emit(alu_inst_imm12(ALUOp::AddS, ty, rd, rn, rm));
-        }
+        Opcode::IaddIfcout => implemented_in_isle(ctx),
         Opcode::IaddImm
         | Opcode::ImulImm
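
The deleted comment above explains why `iadd_ifcout` can lower to a bare `adds`: CLIF validation guarantees no flag-setting instruction intervenes before the flags consumer, and the `iflags` output is never materialized into a register. A sketch of the corresponding ISLE rule, under assumed names:

    ;; Sketch only: `add_with_flags` and `invalid_reg` are illustrative names.
    ;; Output 0 is the sum; output 1 (the flags) is deliberately left
    ;; unmaterialized, since consumers read the processor flags directly.
    (rule (lower (has_type (fits_in_64 ty) (iadd_ifcout x y)))
          (output_pair (add_with_flags ty x y) (invalid_reg)))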
@@ -1006,47 +620,9 @@ pub(crate) fn lower_insn_to_regs(
         Opcode::SqmulRoundSat => implemented_in_isle(ctx),
-        Opcode::FcvtLowFromSint => {
-            let ty = ty.unwrap();
-
-            if ty != F64X2 {
-                return Err(CodegenError::Unsupported(format!(
-                    "FcvtLowFromSint: Unsupported type: {:?}",
-                    ty
-                )));
-            }
-
-            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-
-            ctx.emit(Inst::VecExtend {
-                t: VecExtendOp::Sxtl,
-                rd,
-                rn,
-                high_half: false,
-                lane_size: ScalarSize::Size64,
-            });
-            ctx.emit(Inst::VecMisc {
-                op: VecMisc2::Scvtf,
-                rd,
-                rn: rd.to_reg(),
-                size: VectorSize::Size64x2,
-            });
-        }
+        Opcode::FcvtLowFromSint => implemented_in_isle(ctx),
-        Opcode::FvpromoteLow => {
-            debug_assert_eq!(ty.unwrap(), F64X2);
-            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-            ctx.emit(Inst::VecRRLong {
-                op: VecRRLongOp::Fcvtl32,
-                rd,
-                rn,
-                high_half: false,
-            });
-        }
+        Opcode::FvpromoteLow => implemented_in_isle(ctx),
         Opcode::Fvdemote => implemented_in_isle(ctx),
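
The deleted `FcvtLowFromSint` arm emits `sxtl` (sign-extend the low two lanes to 64 bits) followed by `scvtf` (signed-integer-to-float conversion). As an ISLE sketch, with constructor names assumed to mirror the `Inst::VecExtend` and `Inst::VecMisc` variants:

    ;; Sketch only: `vec_extend` and `vec_misc` are assumed to wrap
    ;; Inst::VecExtend and Inst::VecMisc respectively; `$false` selects the
    ;; low half for the extend.
    (rule (lower (has_type $F64X2 (fcvt_low_from_sint x)))
          (vec_misc (VecMisc2.Scvtf)
                    (vec_extend (VecExtendOp.Sxtl) x $false (ScalarSize.Size64))
                    (VectorSize.Size64x2)))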