AArch64: port misc ops to ISLE. (#4796)

* Add some precise-output compile tests for aarch64.

* AArch64: port misc ops to ISLE.

- get_pinned_reg / set_pinned_reg
- bitcast
- stack_addr
- extractlane
- insertlane
- vhigh_bits
- iadd_ifcout
- fcvt_low_from_sint
Chris Fallin
2022-08-29 12:56:39 -07:00
committed by GitHub
parent 6368c6b188
commit a6eb24bd4f
18 changed files with 1362 additions and 662 deletions


@@ -952,11 +952,19 @@
;; Helper for calculating the `ScalarSize` corresponding to a type
(decl scalar_size (Type) ScalarSize)
(rule (scalar_size $I8) (ScalarSize.Size8))
(rule (scalar_size $I16) (ScalarSize.Size16))
(rule (scalar_size $I32) (ScalarSize.Size32))
(rule (scalar_size $I64) (ScalarSize.Size64))
(rule (scalar_size $I128) (ScalarSize.Size128))
(rule (scalar_size $B8) (ScalarSize.Size8))
(rule (scalar_size $B16) (ScalarSize.Size16))
(rule (scalar_size $B32) (ScalarSize.Size32))
(rule (scalar_size $B64) (ScalarSize.Size64))
(rule (scalar_size $B128) (ScalarSize.Size128))
(rule (scalar_size $F32) (ScalarSize.Size32))
(rule (scalar_size $F64) (ScalarSize.Size64))
@@ -1452,6 +1460,9 @@
(decl pure lshl_from_imm64 (Type Imm64) ShiftOpAndAmt)
(extern constructor lshl_from_imm64 lshl_from_imm64)
(decl pure lshl_from_u64 (Type u64) ShiftOpAndAmt)
(extern constructor lshl_from_u64 lshl_from_u64)
(decl integral_ty (Type) Type)
(extern extractor integral_ty integral_ty)
@@ -1704,6 +1715,14 @@
(MInst.AluRRR (ALUOp.AddS) (operand_size ty) dst src1 src2)
dst)))
;; Helper for emitting `adds` instructions, setting flags in ambient
;; state. Used only for `iadd_ifcout`.
(decl add_with_flags (Type Reg Reg) Reg)
(rule (add_with_flags ty src1 src2)
(let ((dst WritableReg (temp_writable_reg $I64))
(_ Unit (emit (MInst.AluRRR (ALUOp.AddS) (operand_size ty) dst src1 src2))))
dst))
;; Helper for emitting `adc` instructions.
(decl adc_paired (Type Reg Reg) ConsumesFlags)
(rule (adc_paired ty src1 src2)
@@ -1927,6 +1946,13 @@
(_ Unit (emit (MInst.VecExtend op dst src high_half size))))
dst))
;; Helper for emitting `MInst.VecExtract` instructions.
(decl vec_extract (Reg Reg u8) Reg)
(rule (vec_extract src1 src2 idx)
(let ((dst WritableReg (temp_writable_reg $I8X16))
(_ Unit (emit (MInst.VecExtract dst src1 src2 idx))))
dst))
;; Helper for emitting `MInst.LoadAcquire` instructions.
(decl load_acquire (Type Reg) Reg)
(rule (load_acquire ty addr)
@@ -2118,6 +2144,10 @@
(decl addp (Reg Reg VectorSize) Reg)
(rule (addp x y size) (vec_rrr (VecALUOp.Addp) x y size))
;; Helper for generating `zip1` instructions.
(decl zip1 (Reg Reg VectorSize) Reg)
(rule (zip1 x y size) (vec_rrr (VecALUOp.Zip1) x y size))
;; Helper for generating vector `abs` instructions.
(decl vec_abs (Reg VectorSize) Reg)
(rule (vec_abs x size) (vec_misc (VecMisc2.Abs) x size))
@@ -2826,3 +2856,24 @@
(decl gen_call_indirect (SigRef Value ValueSlice) InstOutput)
(extern constructor gen_call_indirect gen_call_indirect)
;; Helpers for pinned register manipulation.
(decl writable_pinned_reg () WritableReg)
(extern constructor writable_pinned_reg writable_pinned_reg)
(decl pinned_reg () Reg)
(rule (pinned_reg) (writable_pinned_reg))
(decl write_pinned_reg (Reg) SideEffectNoResult)
(rule (write_pinned_reg val)
(let ((dst WritableReg (writable_pinned_reg)))
(SideEffectNoResult.Inst (gen_move $I64 dst val))))
;; Helpers for stackslot effective address generation.
(decl compute_stack_addr (StackSlot Offset32) Reg)
(rule (compute_stack_addr stack_slot offset)
(let ((dst WritableReg (temp_writable_reg $I64))
(_ Unit (emit (abi_stackslot_addr dst stack_slot offset))))
dst))


@@ -2030,3 +2030,212 @@
;; N.B.: the Ret itself is generated by the ABI.
(rule (lower (return args))
(lower_return (range 0 (value_slice_len args)) args))
;;; Rules for `{get,set}_pinned_reg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (get_pinned_reg))
(pinned_reg))
(rule (lower (set_pinned_reg val))
(side_effect (write_pinned_reg val)))
;;; Rules for `bitcast` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type $I32 (bitcast src @ (value_type $F32))))
(mov_from_vec src 0 (ScalarSize.Size32)))
(rule (lower (has_type $F32 (bitcast src @ (value_type $I32))))
(mov_to_fpu src (ScalarSize.Size32)))
(rule (lower (has_type $I64 (bitcast src @ (value_type $F64))))
(mov_from_vec src 0 (ScalarSize.Size64)))
(rule (lower (has_type $F64 (bitcast src @ (value_type $I64))))
(mov_to_fpu src (ScalarSize.Size64)))
;;; Rules for `raw_bitcast` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (raw_bitcast val))
val)
;;; Rules for `extractlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; extractlane with lane 0 can pass through the value unchanged; upper
;; bits are undefined when a narrower type is in a wider register.
(rule (lower (has_type (ty_scalar_float _) (extractlane val (u8_from_uimm8 0))))
val)
(rule (lower (has_type (ty_int_bool ty)
(extractlane val
(u8_from_uimm8 lane))))
(mov_from_vec val lane (scalar_size ty)))
(rule (lower (has_type (ty_scalar_float ty)
(extractlane val @ (value_type vty)
(u8_from_uimm8 lane))))
(fpu_move_from_vec val lane (vector_size vty)))
;;; Rules for `insertlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (insertlane vec @ (value_type vty)
val @ (value_type (ty_int_bool _))
(u8_from_uimm8 lane)))
(mov_to_vec vec val lane (vector_size vty)))
(rule (lower (insertlane vec @ (value_type vty)
val @ (value_type (ty_scalar_float _))
(u8_from_uimm8 lane)))
(mov_vec_elem vec val lane 0 (vector_size vty)))
;;; Rules for `copy` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (copy x))
x)
;;; Rules for `stack_addr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (stack_addr stack_slot offset))
(compute_stack_addr stack_slot offset))
;;; Rules for `vhigh_bits` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; All three vector-mask sequences use one integer temporary and two
;; vector temporaries. The shift is done early so as to give the
;; register allocator the possibility of reusing the source vector's
;; register for the shifted value in the case that this is the last
;; use of the source. See https://github.com/WebAssembly/simd/pull/201
;; for the background and derivation of these sequences. Alternative
;; sequences are discussed in
;; https://github.com/bytecodealliance/wasmtime/issues/2296, although
;; they are not used here.
(rule (lower (vhigh_bits vec @ (value_type $I8X16)))
(let (
;; Replicate the MSB of each of the 16 byte lanes across
;; the whole lane (sshr is an arithmetic right shift).
(shifted Reg (vec_shift_imm (VecShiftImmOp.Sshr) 7 vec (VectorSize.Size8x16)))
;; Bitwise-and with a mask
;; `0x80402010_08040201_80402010_08040201` to get the bit
;; in the proper location for each group of 8 lanes.
(anded Reg (and_vec shifted (constant_f128 0x80402010_08040201_80402010_08040201) (VectorSize.Size8x16)))
;; Produce a version of `anded` with upper 8 lanes and
;; lower 8 lanes swapped.
(anded_swapped Reg (vec_extract anded anded 8))
;; Zip together the two; with the above this produces the lane permutation:
;; 15 7 14 6 13 5 12 4 11 3 10 2 9 1 8 0
(zipped Reg (zip1 anded anded_swapped (VectorSize.Size8x16)))
;; Add 16-bit lanes together ("add across vector"), so we
;; get, in the low 16 bits, 15+14+...+8 in the high byte
;; and 7+6+...+0 in the low byte. This effectively puts
;; the 16 MSBs together, giving our results.
;;
;; N.B.: `Size16x8` is not a typo!
(result Reg (addv zipped (VectorSize.Size16x8))))
(mov_from_vec result 0 (ScalarSize.Size16))))
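;; Worked example (hypothetical input): if only lanes 3 and 12 of the
;; i8x16 input have their MSB set, the `sshr` leaves bytes 3 and 12 at
;; 0xff, and the AND reduces them to 0x08 and 0x10 respectively. After
;; the extract/zip1, 16-bit lane 3 holds 0x0008 and 16-bit lane 4 holds
;; 0x1000, so the `addv` sum is 0x1008 -- exactly bits 3 and 12 of the
;; movemask.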
(rule (lower (vhigh_bits vec @ (value_type $I16X8)))
(let (
;; Replicate the MSB of each of the 8 16-bit lanes across
;; the whole lane (sshr is an arithmetic right shift).
(shifted Reg (vec_shift_imm (VecShiftImmOp.Sshr) 15 vec (VectorSize.Size16x8)))
;; Bitwise-and with a mask
;; `0x0080_0040_0020_0010_0008_0004_0002_0001` to get the
;; bit in the proper location for each of the 8 lanes.
(anded Reg (and_vec shifted (constant_f128 0x0080_0040_0020_0010_0008_0004_0002_0001) (VectorSize.Size16x8)))
;; Add lanes together to get the 8 MSBs in the low byte.
(result Reg (addv anded (VectorSize.Size16x8))))
(mov_from_vec result 0 (ScalarSize.Size16))))
(rule (lower (vhigh_bits vec @ (value_type $I32X4)))
(let (
;; Replicate the MSB of each of the 4 32-bit lanes across
;; the whole lane (sshr is an arithmetic right shift).
(shifted Reg (vec_shift_imm (VecShiftImmOp.Sshr) 31 vec (VectorSize.Size32x4)))
;; Bitwise-and with a mask
;; `0x00000008_00000004_00000002_00000001` to get the bit
;; in the proper location for each of the 4 lanes.
(anded Reg (and_vec shifted (constant_f128 0x00000008_00000004_00000002_00000001) (VectorSize.Size32x4)))
;; Add lanes together to get the 4 MSBs in the low byte.
(result Reg (addv anded (VectorSize.Size32x4))))
(mov_from_vec result 0 (ScalarSize.Size32))))
(rule (lower (vhigh_bits vec @ (value_type $I64X2)))
(let (
;; Grab the MSB out of each of the two lanes, right-shift it
;; down to the LSB, and add the two together, with the upper
;; lane's MSB shifted back up into bit 1.
(upper_msb Reg (mov_from_vec vec 1 (ScalarSize.Size64)))
(lower_msb Reg (mov_from_vec vec 0 (ScalarSize.Size64)))
(upper_msb Reg (lsr_imm $I64 upper_msb (imm_shift_from_u8 63)))
(lower_msb Reg (lsr_imm $I64 lower_msb (imm_shift_from_u8 63))))
(add_shift $I64 lower_msb upper_msb (lshl_from_u64 $I64 1))))
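;; For example (hypothetical input), lanes (0x8000_0000_0000_0000, 1)
;; give lower_msb = 1 and upper_msb = 0 after the shifts, and the final
;; add-with-shift computes 1 + (0 << 1) = 1, i.e. only bit 0 of the
;; mask is set.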
;;; Rules for `iadd_ifcout` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; This is a two-output instruction that is needed for the
;; legalizer's explicit heap-check sequence, among possible other
;; uses. Its second output is a flags output only ever meant to
;; check for overflow using the
;; `backend.unsigned_add_overflow_condition()` condition.
;;
;; Note that the CLIF validation will ensure that no flag-setting
;; operation comes between this IaddIfcout and its use (e.g., a
;; Trapif). Thus, we can rely on implicit communication through the
;; processor flags rather than explicitly generating flags into a
;; register. We simply use the variant of the add instruction that
;; sets flags (`adds`) here.
;;
;; Note that the second output (the flags) need not be generated,
;; because flags are never materialized into a register; the only
;; instructions that can use a value of type `iflags` or `fflags`
;; will look directly for the flags-producing instruction (which can
;; always be found, by construction) and merge it.
;;
;; Now handle the iadd as above, except use an AddS opcode that sets
;; flags.
(rule (lower (has_type (ty_int ty)
(iadd_ifcout a b)))
(output_pair
(add_with_flags ty a b)
(invalid_reg)))
;;; Rules for `tls_value` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; TODO.
;;; Rules for `fcvt_low_from_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type $F64X2 (fcvt_low_from_sint val)))
(let ((extended Reg (vec_extend (VecExtendOp.Sxtl) val $false (ScalarSize.Size64)))
(converted Reg (vec_misc (VecMisc2.Scvtf) extended (VectorSize.Size64x2))))
converted))
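;; For illustration (hypothetical input), an i32x4 value with lanes
;; (a, b, c, d) is widened by `sxtl` to the i64x2 value (a, b) -- the
;; low half -- and `scvtf` then produces the f64x2 result
;; (a as f64, b as f64).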
;;; Rules for `fvpromote_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (fvpromote_low val))
(vec_rr_long (VecRRLongOp.Fcvtl32) val $false))
;;; Rules for `select` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; TODO: requires icmp/fcmp first.
;;; Rules for `selectif` / `selectif_spectre_guard` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; TODO: requires icmp/fcmp first.
;;; Rules for `trueif` / `trueff` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; TODO: requires icmp/fcmp first.
;;; Rules for `brz`/`brnz`/`brif`/`brff`/`bricmp` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; TODO: requires icmp/fcmp first.
;;; Rules for `jump` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; TODO.
;;; Rules for `br_table` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; TODO.


@@ -128,7 +128,11 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> {
}
fn lshl_from_imm64(&mut self, ty: Type, n: Imm64) -> Option<ShiftOpAndAmt> {
let shiftimm = ShiftOpShiftImm::maybe_from_shift(n.bits() as u64)?;
self.lshl_from_u64(ty, n.bits() as u64)
}
fn lshl_from_u64(&mut self, ty: Type, n: u64) -> Option<ShiftOpAndAmt> {
let shiftimm = ShiftOpShiftImm::maybe_from_shift(n)?;
let shiftee_bits = ty_bits(ty);
if shiftee_bits <= std::u8::MAX as usize {
let shiftimm = shiftimm.mask(shiftee_bits as u8);
@@ -722,4 +726,8 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> {
);
}
}
fn writable_pinned_reg(&mut self) -> WritableReg {
super::regs::writable_xreg(super::regs::PINNED_REG)
}
}


@@ -4,7 +4,7 @@ use super::lower::*;
use crate::binemit::CodeOffset;
use crate::ir::types::*;
use crate::ir::Inst as IRInst;
use crate::ir::{InstructionData, Opcode};
use crate::ir::Opcode;
use crate::isa::aarch64::inst::*;
use crate::isa::aarch64::settings as aarch64_settings;
use crate::machinst::lower::*;
@@ -13,7 +13,6 @@ use crate::settings::{Flags, TlsModel};
use crate::{CodegenError, CodegenResult};
use alloc::boxed::Box;
use alloc::vec::Vec;
use core::convert::TryFrom;
use target_lexicon::Triple;
/// Actually codegen an instruction's results into registers.
@@ -231,23 +230,7 @@ pub(crate) fn lower_insn_to_regs(
}
}
Opcode::StackAddr => {
let (stack_slot, offset) = match *ctx.data(insn) {
InstructionData::StackLoad {
opcode: Opcode::StackAddr,
stack_slot,
offset,
} => (stack_slot, offset),
_ => unreachable!(),
};
let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let offset: i32 = offset.into();
assert!(ctx.abi().sized_stackslot_offsets().is_valid(stack_slot));
let inst =
ctx.abi()
.sized_stackslot_addr(stack_slot, u32::try_from(offset).unwrap(), rd);
ctx.emit(inst);
}
Opcode::StackAddr => implemented_in_isle(ctx),
Opcode::DynamicStackAddr => implemented_in_isle(ctx),
@@ -421,52 +404,7 @@ pub(crate) fn lower_insn_to_regs(
Opcode::Bint => implemented_in_isle(ctx),
Opcode::Bitcast => {
let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let ity = ctx.input_ty(insn, 0);
let oty = ctx.output_ty(insn, 0);
let ity_bits = ty_bits(ity);
let ity_vec_reg = ty_has_float_or_vec_representation(ity);
let oty_bits = ty_bits(oty);
let oty_vec_reg = ty_has_float_or_vec_representation(oty);
debug_assert_eq!(ity_bits, oty_bits);
match (ity_vec_reg, oty_vec_reg) {
(true, true) => {
let narrow_mode = if ity_bits <= 32 {
NarrowValueMode::ZeroExtend32
} else {
NarrowValueMode::ZeroExtend64
};
let rm = put_input_in_reg(ctx, inputs[0], narrow_mode);
ctx.emit(Inst::gen_move(rd, rm, oty));
}
(false, false) => {
let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
ctx.emit(Inst::gen_move(rd, rm, oty));
}
(false, true) => {
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend64);
ctx.emit(Inst::MovToFpu {
rd,
rn,
size: ScalarSize::Size64,
});
}
(true, false) => {
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
let size = ScalarSize::from_bits(oty_bits);
ctx.emit(Inst::MovFromVec {
rd,
rn,
idx: 0,
size,
});
}
}
}
Opcode::Bitcast => implemented_in_isle(ctx),
Opcode::Return => implemented_in_isle(ctx),
@@ -556,15 +494,7 @@ pub(crate) fn lower_insn_to_regs(
Opcode::Call | Opcode::CallIndirect => implemented_in_isle(ctx),
Opcode::GetPinnedReg => {
let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
ctx.emit(Inst::gen_move(rd, xreg(PINNED_REG), I64));
}
Opcode::SetPinnedReg => {
let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
ctx.emit(Inst::gen_move(writable_xreg(PINNED_REG), rm, I64));
}
Opcode::GetPinnedReg | Opcode::SetPinnedReg => implemented_in_isle(ctx),
Opcode::Jump
| Opcode::Brz
@@ -578,67 +508,11 @@ pub(crate) fn lower_insn_to_regs(
Opcode::Vconst => implemented_in_isle(ctx),
Opcode::RawBitcast => {
let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let ty = ctx.input_ty(insn, 0);
ctx.emit(Inst::gen_move(rd, rm, ty));
}
Opcode::RawBitcast => implemented_in_isle(ctx),
Opcode::Extractlane => {
if let InstructionData::BinaryImm8 { imm, .. } = ctx.data(insn) {
let idx = *imm;
let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
let input_ty = ctx.input_ty(insn, 0);
let size = VectorSize::from_ty(input_ty);
let ty = ty.unwrap();
if ty_has_int_representation(ty) {
ctx.emit(Inst::MovFromVec {
rd,
rn,
idx,
size: size.lane_size(),
});
// Plain moves are faster on some processors.
} else if idx == 0 {
ctx.emit(Inst::gen_move(rd, rn, ty));
} else {
ctx.emit(Inst::FpuMoveFromVec { rd, rn, idx, size });
}
} else {
unreachable!();
}
}
Opcode::Extractlane => implemented_in_isle(ctx),
Opcode::Insertlane => {
let idx = if let InstructionData::TernaryImm8 { imm, .. } = ctx.data(insn) {
*imm
} else {
unreachable!();
};
let input_ty = ctx.input_ty(insn, 1);
let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
let ty = ty.unwrap();
let size = VectorSize::from_ty(ty);
ctx.emit(Inst::gen_move(rd, rm, ty));
if ty_has_int_representation(input_ty) {
ctx.emit(Inst::MovToVec { rd, rn, idx, size });
} else {
ctx.emit(Inst::VecMovElement {
rd,
rn,
dest_idx: idx,
src_idx: 0,
size,
});
}
}
Opcode::Insertlane => implemented_in_isle(ctx),
Opcode::Splat => implemented_in_isle(ctx),
@@ -646,240 +520,7 @@ pub(crate) fn lower_insn_to_regs(
Opcode::VallTrue | Opcode::VanyTrue => implemented_in_isle(ctx),
Opcode::VhighBits => {
let dst_r = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let src_v = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
let ty = ctx.input_ty(insn, 0);
// All three sequences use one integer temporary and two vector temporaries. The
// shift is done early so as to give the register allocator the possibility of using
// the same reg for `tmp_v1` and `src_v` in the case that this is the last use of
// `src_v`. See https://github.com/WebAssembly/simd/pull/201 for the background and
// derivation of these sequences. Alternative sequences are discussed in
// https://github.com/bytecodealliance/wasmtime/issues/2296, although they are not
// used here.
let tmp_r0 = ctx.alloc_tmp(I64).only_reg().unwrap();
let tmp_v0 = ctx.alloc_tmp(I8X16).only_reg().unwrap();
let tmp_v1 = ctx.alloc_tmp(I8X16).only_reg().unwrap();
match ty {
I8X16 => {
// sshr tmp_v1.16b, src_v.16b, #7
// mov tmp_r0, #0x0201
// movk tmp_r0, #0x0804, lsl 16
// movk tmp_r0, #0x2010, lsl 32
// movk tmp_r0, #0x8040, lsl 48
// dup tmp_v0.2d, tmp_r0
// and tmp_v1.16b, tmp_v1.16b, tmp_v0.16b
// ext tmp_v0.16b, tmp_v1.16b, tmp_v1.16b, #8
// zip1 tmp_v0.16b, tmp_v1.16b, tmp_v0.16b
// addv tmp_v0h, tmp_v0.8h
// mov dst_r, tmp_v0.h[0]
ctx.emit(Inst::VecShiftImm {
op: VecShiftImmOp::Sshr,
rd: tmp_v1,
rn: src_v,
size: VectorSize::Size8x16,
imm: 7,
});
lower_splat_const(ctx, tmp_v0, 0x8040201008040201u64, VectorSize::Size64x2);
ctx.emit(Inst::VecRRR {
alu_op: VecALUOp::And,
rd: tmp_v1,
rn: tmp_v1.to_reg(),
rm: tmp_v0.to_reg(),
size: VectorSize::Size8x16,
});
ctx.emit(Inst::VecExtract {
rd: tmp_v0,
rn: tmp_v1.to_reg(),
rm: tmp_v1.to_reg(),
imm4: 8,
});
ctx.emit(Inst::VecRRR {
alu_op: VecALUOp::Zip1,
rd: tmp_v0,
rn: tmp_v1.to_reg(),
rm: tmp_v0.to_reg(),
size: VectorSize::Size8x16,
});
ctx.emit(Inst::VecLanes {
op: VecLanesOp::Addv,
rd: tmp_v0,
rn: tmp_v0.to_reg(),
size: VectorSize::Size16x8,
});
ctx.emit(Inst::MovFromVec {
rd: dst_r,
rn: tmp_v0.to_reg(),
idx: 0,
size: ScalarSize::Size16,
});
}
I16X8 => {
// sshr tmp_v1.8h, src_v.8h, #15
// mov tmp_r0, #0x1
// movk tmp_r0, #0x2, lsl 16
// movk tmp_r0, #0x4, lsl 32
// movk tmp_r0, #0x8, lsl 48
// dup tmp_v0.2d, tmp_r0
// shl tmp_r0, tmp_r0, #4
// mov tmp_v0.d[1], tmp_r0
// and tmp_v0.16b, tmp_v1.16b, tmp_v0.16b
// addv tmp_v0h, tmp_v0.8h
// mov dst_r, tmp_v0.h[0]
ctx.emit(Inst::VecShiftImm {
op: VecShiftImmOp::Sshr,
rd: tmp_v1,
rn: src_v,
size: VectorSize::Size16x8,
imm: 15,
});
lower_constant_u64(ctx, tmp_r0, 0x0008000400020001u64);
ctx.emit(Inst::VecDup {
rd: tmp_v0,
rn: tmp_r0.to_reg(),
size: VectorSize::Size64x2,
});
ctx.emit(Inst::AluRRImmShift {
alu_op: ALUOp::Lsl,
size: OperandSize::Size64,
rd: tmp_r0,
rn: tmp_r0.to_reg(),
immshift: ImmShift { imm: 4 },
});
ctx.emit(Inst::MovToVec {
rd: tmp_v0,
rn: tmp_r0.to_reg(),
idx: 1,
size: VectorSize::Size64x2,
});
ctx.emit(Inst::VecRRR {
alu_op: VecALUOp::And,
rd: tmp_v0,
rn: tmp_v1.to_reg(),
rm: tmp_v0.to_reg(),
size: VectorSize::Size8x16,
});
ctx.emit(Inst::VecLanes {
op: VecLanesOp::Addv,
rd: tmp_v0,
rn: tmp_v0.to_reg(),
size: VectorSize::Size16x8,
});
ctx.emit(Inst::MovFromVec {
rd: dst_r,
rn: tmp_v0.to_reg(),
idx: 0,
size: ScalarSize::Size16,
});
}
I32X4 => {
// sshr tmp_v1.4s, src_v.4s, #31
// mov tmp_r0, #0x1
// movk tmp_r0, #0x2, lsl 32
// dup tmp_v0.2d, tmp_r0
// shl tmp_r0, tmp_r0, #2
// mov tmp_v0.d[1], tmp_r0
// and tmp_v0.16b, tmp_v1.16b, tmp_v0.16b
// addv tmp_v0s, tmp_v0.4s
// mov dst_r, tmp_v0.s[0]
ctx.emit(Inst::VecShiftImm {
op: VecShiftImmOp::Sshr,
rd: tmp_v1,
rn: src_v,
size: VectorSize::Size32x4,
imm: 31,
});
lower_constant_u64(ctx, tmp_r0, 0x0000000200000001u64);
ctx.emit(Inst::VecDup {
rd: tmp_v0,
rn: tmp_r0.to_reg(),
size: VectorSize::Size64x2,
});
ctx.emit(Inst::AluRRImmShift {
alu_op: ALUOp::Lsl,
size: OperandSize::Size64,
rd: tmp_r0,
rn: tmp_r0.to_reg(),
immshift: ImmShift { imm: 2 },
});
ctx.emit(Inst::MovToVec {
rd: tmp_v0,
rn: tmp_r0.to_reg(),
idx: 1,
size: VectorSize::Size64x2,
});
ctx.emit(Inst::VecRRR {
alu_op: VecALUOp::And,
rd: tmp_v0,
rn: tmp_v1.to_reg(),
rm: tmp_v0.to_reg(),
size: VectorSize::Size8x16,
});
ctx.emit(Inst::VecLanes {
op: VecLanesOp::Addv,
rd: tmp_v0,
rn: tmp_v0.to_reg(),
size: VectorSize::Size32x4,
});
ctx.emit(Inst::MovFromVec {
rd: dst_r,
rn: tmp_v0.to_reg(),
idx: 0,
size: ScalarSize::Size32,
});
}
I64X2 => {
// mov dst_r, src_v.d[0]
// mov tmp_r0, src_v.d[1]
// lsr dst_r, dst_r, #63
// lsr tmp_r0, tmp_r0, #63
// add dst_r, dst_r, tmp_r0, lsl #1
ctx.emit(Inst::MovFromVec {
rd: dst_r,
rn: src_v,
idx: 0,
size: ScalarSize::Size64,
});
ctx.emit(Inst::MovFromVec {
rd: tmp_r0,
rn: src_v,
idx: 1,
size: ScalarSize::Size64,
});
ctx.emit(Inst::AluRRImmShift {
alu_op: ALUOp::Lsr,
size: OperandSize::Size64,
rd: dst_r,
rn: dst_r.to_reg(),
immshift: ImmShift::maybe_from_u64(63).unwrap(),
});
ctx.emit(Inst::AluRRImmShift {
alu_op: ALUOp::Lsr,
size: OperandSize::Size64,
rd: tmp_r0,
rn: tmp_r0.to_reg(),
immshift: ImmShift::maybe_from_u64(63).unwrap(),
});
ctx.emit(Inst::AluRRRShift {
alu_op: ALUOp::Add,
size: OperandSize::Size32,
rd: dst_r,
rn: dst_r.to_reg(),
rm: tmp_r0.to_reg(),
shiftop: ShiftOpAndAmt::new(
ShiftOp::LSL,
ShiftOpShiftImm::maybe_from_shift(1).unwrap(),
),
});
}
_ => {
return Err(CodegenError::Unsupported(format!(
"VhighBits: Unsupported type: {:?}",
ty
)))
}
}
}
Opcode::VhighBits => implemented_in_isle(ctx),
Opcode::Shuffle => implemented_in_isle(ctx),
@@ -917,34 +558,7 @@ pub(crate) fn lower_insn_to_regs(
Opcode::FcvtToUintSat | Opcode::FcvtToSintSat => implemented_in_isle(ctx),
Opcode::IaddIfcout => {
// This is a two-output instruction that is needed for the
// legalizer's explicit heap-check sequence, among possible other
// uses. Its second output is a flags output only ever meant to
// check for overflow using the
// `backend.unsigned_add_overflow_condition()` condition.
//
// Note that the CLIF validation will ensure that no flag-setting
// operation comes between this IaddIfcout and its use (e.g., a
// Trapif). Thus, we can rely on implicit communication through the
// processor flags rather than explicitly generating flags into a
// register. We simply use the variant of the add instruction that
// sets flags (`adds`) here.
// Note that the second output (the flags) need not be generated,
// because flags are never materialized into a register; the only
// instructions that can use a value of type `iflags` or `fflags`
// will look directly for the flags-producing instruction (which can
// always be found, by construction) and merge it.
// Now handle the iadd as above, except use an AddS opcode that sets
// flags.
let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
let rm = put_input_in_rse_imm12(ctx, inputs[1], NarrowValueMode::None);
let ty = ty.unwrap();
ctx.emit(alu_inst_imm12(ALUOp::AddS, ty, rd, rn, rm));
}
Opcode::IaddIfcout => implemented_in_isle(ctx),
Opcode::IaddImm
| Opcode::ImulImm
@@ -1006,47 +620,9 @@ pub(crate) fn lower_insn_to_regs(
Opcode::SqmulRoundSat => implemented_in_isle(ctx),
Opcode::FcvtLowFromSint => {
let ty = ty.unwrap();
if ty != F64X2 {
return Err(CodegenError::Unsupported(format!(
"FcvtLowFromSint: Unsupported type: {:?}",
ty
)));
}
let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
ctx.emit(Inst::VecExtend {
t: VecExtendOp::Sxtl,
rd,
rn,
high_half: false,
lane_size: ScalarSize::Size64,
});
ctx.emit(Inst::VecMisc {
op: VecMisc2::Scvtf,
rd,
rn: rd.to_reg(),
size: VectorSize::Size64x2,
});
}
Opcode::FcvtLowFromSint => implemented_in_isle(ctx),
Opcode::FvpromoteLow => {
debug_assert_eq!(ty.unwrap(), F64X2);
let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
ctx.emit(Inst::VecRRLong {
op: VecRRLongOp::Fcvtl32,
rd,
rn,
high_half: false,
});
}
Opcode::FvpromoteLow => implemented_in_isle(ctx),
Opcode::Fvdemote => implemented_in_isle(ctx),


@@ -371,6 +371,15 @@ macro_rules! isle_prelude_methods {
ty.is_int().then(|| ty)
}
#[inline]
fn ty_int_bool(&mut self, ty: Type) -> Option<Type> {
if ty.is_int() || ty.is_bool() {
Some(ty)
} else {
None
}
}
#[inline]
fn ty_scalar_float(&mut self, ty: Type) -> Option<Type> {
match ty {
@@ -379,6 +388,15 @@ macro_rules! isle_prelude_methods {
}
}
#[inline]
fn ty_float_or_vec(&mut self, ty: Type) -> Option<Type> {
match ty {
F32 | F64 => Some(ty),
ty if ty.is_vector() => Some(ty),
_ => None,
}
}
#[inline]
fn ty_vec64(&mut self, ty: Type) -> Option<Type> {
if ty.is_vector() && ty.bits() == 64 {


@@ -365,6 +365,10 @@
(decl ty_int_bool_128 (Type) Type)
(extern extractor ty_int_bool_128 ty_int_bool_128)
;; An extractor that matches any int or bool.
(decl ty_int_bool (Type) Type)
(extern extractor ty_int_bool ty_int_bool)
;; An extractor that only matches integers.
(decl ty_int (Type) Type)
(extern extractor ty_int ty_int)
@@ -373,6 +377,10 @@
(decl ty_scalar_float (Type) Type)
(extern extractor ty_scalar_float ty_scalar_float)
;; An extractor that matches scalar floating-point types or vector types.
(decl ty_float_or_vec (Type) Type)
(extern extractor ty_float_or_vec ty_float_or_vec)
;; A pure constructor that only matches 64-bit vector types.
(decl pure ty_vec64 (Type) Type)
(extern constructor ty_vec64 ty_vec64)