From a6eb24bd4fd2255dfd7c86638a6965ff9e27bff2 Mon Sep 17 00:00:00 2001 From: Chris Fallin Date: Mon, 29 Aug 2022 12:56:39 -0700 Subject: [PATCH] AArch64: port misc ops to ISLE. (#4796) * Add some precise-output compile tests for aarch64. * AArch64: port misc ops to ISLE. - get_pinned_reg / set_pinned_reg - bitcast - stack_addr - extractlane - insertlane - vhigh_bits - iadd_ifcout - fcvt_low_from_sint --- cranelift/codegen/src/isa/aarch64/inst.isle | 51 ++ cranelift/codegen/src/isa/aarch64/lower.isle | 209 ++++++++ .../codegen/src/isa/aarch64/lower/isle.rs | 10 +- .../codegen/src/isa/aarch64/lower_inst.rs | 446 +---------------- cranelift/codegen/src/machinst/isle.rs | 18 + cranelift/codegen/src/prelude.isle | 8 + .../filetests/isa/aarch64/bitcast.clif | 43 ++ .../filetests/isa/aarch64/dynamic-slot.clif | 8 +- .../filetests/filetests/isa/aarch64/fcvt.clif | 461 ++++++++++++++++++ .../filetests/isa/aarch64/reftypes.clif | 12 +- .../isa/aarch64/simd-bitwise-compile.clif | 213 ++++++++ .../isa/aarch64/simd-comparison-legalize.clif | 45 ++ .../isa/aarch64/simd-lane-access-compile.clif | 124 +++++ .../isa/aarch64/simd-logical-compile.clif | 40 ++ .../isa/aarch64/simd-pairwise-add.clif | 194 +------- .../filetests/isa/aarch64/stack.clif | 32 +- .../filetests/isa/aarch64/traps.clif | 25 +- .../filetests/isa/aarch64/vhigh_bits.clif | 85 ++++ 18 files changed, 1362 insertions(+), 662 deletions(-) create mode 100644 cranelift/filetests/filetests/isa/aarch64/bitcast.clif create mode 100644 cranelift/filetests/filetests/isa/aarch64/fcvt.clif create mode 100644 cranelift/filetests/filetests/isa/aarch64/simd-bitwise-compile.clif create mode 100644 cranelift/filetests/filetests/isa/aarch64/simd-comparison-legalize.clif create mode 100644 cranelift/filetests/filetests/isa/aarch64/simd-lane-access-compile.clif create mode 100644 cranelift/filetests/filetests/isa/aarch64/simd-logical-compile.clif create mode 100644 cranelift/filetests/filetests/isa/aarch64/vhigh_bits.clif diff --git 
a/cranelift/codegen/src/isa/aarch64/inst.isle b/cranelift/codegen/src/isa/aarch64/inst.isle index a21658b8de..856516ef2b 100644 --- a/cranelift/codegen/src/isa/aarch64/inst.isle +++ b/cranelift/codegen/src/isa/aarch64/inst.isle @@ -952,11 +952,19 @@ ;; Helper for calculating the `ScalarSize` corresponding to a type (decl scalar_size (Type) ScalarSize) + (rule (scalar_size $I8) (ScalarSize.Size8)) (rule (scalar_size $I16) (ScalarSize.Size16)) (rule (scalar_size $I32) (ScalarSize.Size32)) (rule (scalar_size $I64) (ScalarSize.Size64)) (rule (scalar_size $I128) (ScalarSize.Size128)) + +(rule (scalar_size $B8) (ScalarSize.Size8)) +(rule (scalar_size $B16) (ScalarSize.Size16)) +(rule (scalar_size $B32) (ScalarSize.Size32)) +(rule (scalar_size $B64) (ScalarSize.Size64)) +(rule (scalar_size $B128) (ScalarSize.Size128)) + (rule (scalar_size $F32) (ScalarSize.Size32)) (rule (scalar_size $F64) (ScalarSize.Size64)) @@ -1452,6 +1460,9 @@ (decl pure lshl_from_imm64 (Type Imm64) ShiftOpAndAmt) (extern constructor lshl_from_imm64 lshl_from_imm64) +(decl pure lshl_from_u64 (Type u64) ShiftOpAndAmt) +(extern constructor lshl_from_u64 lshl_from_u64) + (decl integral_ty (Type) Type) (extern extractor integral_ty integral_ty) @@ -1704,6 +1715,14 @@ (MInst.AluRRR (ALUOp.AddS) (operand_size ty) dst src1 src2) dst))) +;; Helper for emitting `adds` instructions, setting flags in ambient +;; state. Used only for `iadd_ifcout`. +(decl add_with_flags (Type Reg Reg) Reg) +(rule (add_with_flags ty src1 src2) + (let ((dst WritableReg (temp_writable_reg $I64)) + (_ Unit (emit (MInst.AluRRR (ALUOp.AddS) (operand_size ty) dst src1 src2)))) + dst)) + ;; Helper for emitting `adc` instructions. (decl adc_paired (Type Reg Reg) ConsumesFlags) (rule (adc_paired ty src1 src2) @@ -1927,6 +1946,13 @@ (_ Unit (emit (MInst.VecExtend op dst src high_half size)))) dst)) +;; Helper for emitting `MInst.VecExtract` instructions. 
+(decl vec_extract (Reg Reg u8) Reg) +(rule (vec_extract src1 src2 idx) + (let ((dst WritableReg (temp_writable_reg $I8X16)) + (_ Unit (emit (MInst.VecExtract dst src1 src2 idx)))) + dst)) + ;; Helper for emitting `MInst.LoadAcquire` instructions. (decl load_acquire (Type Reg) Reg) (rule (load_acquire ty addr) @@ -2118,6 +2144,10 @@ (decl addp (Reg Reg VectorSize) Reg) (rule (addp x y size) (vec_rrr (VecALUOp.Addp) x y size)) +;; Helper for generating `zip1` instructions. +(decl zip1 (Reg Reg VectorSize) Reg) +(rule (zip1 x y size) (vec_rrr (VecALUOp.Zip1) x y size)) + ;; Helper for generating vector `abs` instructions. (decl vec_abs (Reg VectorSize) Reg) (rule (vec_abs x size) (vec_misc (VecMisc2.Abs) x size)) @@ -2826,3 +2856,24 @@ (decl gen_call_indirect (SigRef Value ValueSlice) InstOutput) (extern constructor gen_call_indirect gen_call_indirect) + +;; Helpers for pinned register manipulation. + +(decl writable_pinned_reg () WritableReg) +(extern constructor writable_pinned_reg writable_pinned_reg) + +(decl pinned_reg () Reg) +(rule (pinned_reg) (writable_pinned_reg)) + +(decl write_pinned_reg (Reg) SideEffectNoResult) +(rule (write_pinned_reg val) + (let ((dst WritableReg (writable_pinned_reg))) + (SideEffectNoResult.Inst (gen_move $I64 dst val)))) + +;; Helpers for stackslot effective address generation. + +(decl compute_stack_addr (StackSlot Offset32) Reg) +(rule (compute_stack_addr stack_slot offset) + (let ((dst WritableReg (temp_writable_reg $I64)) + (_ Unit (emit (abi_stackslot_addr dst stack_slot offset)))) + dst)) diff --git a/cranelift/codegen/src/isa/aarch64/lower.isle b/cranelift/codegen/src/isa/aarch64/lower.isle index 955f453532..d86de45a68 100644 --- a/cranelift/codegen/src/isa/aarch64/lower.isle +++ b/cranelift/codegen/src/isa/aarch64/lower.isle @@ -2030,3 +2030,212 @@ ;; N.B.: the Ret itself is generated by the ABI. 
(rule (lower (return args)) (lower_return (range 0 (value_slice_len args)) args)) + +;;; Rules for `{get,set}_pinned_reg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (get_pinned_reg)) + (pinned_reg)) + +(rule (lower (set_pinned_reg val)) + (side_effect (write_pinned_reg val))) + +;;; Rules for `bitcast` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (has_type $I32 (bitcast src @ (value_type $F32)))) + (mov_from_vec src 0 (ScalarSize.Size32))) + +(rule (lower (has_type $F32 (bitcast src @ (value_type $I32)))) + (mov_to_fpu src (ScalarSize.Size32))) + +(rule (lower (has_type $I64 (bitcast src @ (value_type $F64)))) + (mov_from_vec src 0 (ScalarSize.Size64))) + +(rule (lower (has_type $F64 (bitcast src @ (value_type $I64)))) + (mov_to_fpu src (ScalarSize.Size64))) + +;;; Rules for `raw_bitcast` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (raw_bitcast val)) + val) + +;;; Rules for `extractlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; extractlane with lane 0 can pass through the value unchanged; upper +;; bits are undefined when a narrower type is in a wider register. 
+(rule (lower (has_type (ty_scalar_float _) (extractlane val (u8_from_uimm8 0)))) + val) + +(rule (lower (has_type (ty_int_bool ty) + (extractlane val + (u8_from_uimm8 lane)))) + (mov_from_vec val lane (scalar_size ty))) + +(rule (lower (has_type (ty_scalar_float ty) + (extractlane val @ (value_type vty) + (u8_from_uimm8 lane)))) + (fpu_move_from_vec val lane (vector_size vty))) + +;;; Rules for `insertlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (insertlane vec @ (value_type vty) + val @ (value_type (ty_int_bool _)) + (u8_from_uimm8 lane))) + (mov_to_vec vec val lane (vector_size vty))) + +(rule (lower (insertlane vec @ (value_type vty) + val @ (value_type (ty_scalar_float _)) + (u8_from_uimm8 lane))) + (mov_vec_elem vec val lane 0 (vector_size vty))) + +;;; Rules for `copy` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (copy x)) + x) + +;;; Rules for `stack_addr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (stack_addr stack_slot offset)) + (compute_stack_addr stack_slot offset)) + +;;; Rules for `vhigh_bits` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; All three sequences use one integer temporary and two vector +;; temporaries. The shift is done early so as to give the register +;; allocator the possibility of using the same reg for `tmp_v1` and +;; `src_v` in the case that this is the last use of `src_v`. See +;; https://github.com/WebAssembly/simd/pull/201 for the background and +;; derivation of these sequences. Alternative sequences are discussed +;; in https://github.com/bytecodealliance/wasmtime/issues/2296, +;; although they are not used here. + +(rule (lower (vhigh_bits vec @ (value_type $I8X16))) + (let ( + ;; Replicate the MSB of each of the 16 byte lanes across + ;; the whole lane (sshr is an arithmetic right shift). 
+ (shifted Reg (vec_shift_imm (VecShiftImmOp.Sshr) 7 vec (VectorSize.Size8x16))) + ;; Bitwise-and with a mask + ;; `0x80402010_08040201_80402010_08040201` to get the bit + ;; in the proper location for each group of 8 lanes. + (anded Reg (and_vec shifted (constant_f128 0x80402010_08040201_80402010_08040201) (VectorSize.Size8x16))) + ;; Produce a version of `anded` with upper 8 lanes and + ;; lower 8 lanes swapped. + (anded_swapped Reg (vec_extract anded anded 8)) + ;; Zip together the two; with the above this produces the lane permutation: + ;; 15 7 14 6 13 5 12 4 11 3 10 2 9 1 8 0 + (zipped Reg (zip1 anded anded_swapped (VectorSize.Size8x16))) + ;; Add 16-bit lanes together ("add across vector"), so we + ;; get, in the low 16 bits, 15+14+...+8 in the high byte + ;; and 7+6+...+0 in the low byte. This effectively puts + ;; the 16 MSBs together, giving our results. + ;; + ;; N.B.: `Size16x8` is not a typo! + (result Reg (addv zipped (VectorSize.Size16x8)))) + (mov_from_vec result 0 (ScalarSize.Size16)))) + +(rule (lower (vhigh_bits vec @ (value_type $I16X8))) + (let ( + ;; Replicate the MSB of each of the 8 16-bit lanes across + ;; the whole lane (sshr is an arithmetic right shift). + (shifted Reg (vec_shift_imm (VecShiftImmOp.Sshr) 15 vec (VectorSize.Size16x8))) + ;; Bitwise-and with a mask + ;; `0x0080_0040_0020_0010_0008_0004_0002_0001` to get the + ;; bit in the proper location for each group of 4 lanes. + (anded Reg (and_vec shifted (constant_f128 0x0080_0040_0020_0010_0008_0004_0002_0001) (VectorSize.Size16x8))) + ;; Add lanes together to get the 8 MSBs in the low byte. + (result Reg (addv anded (VectorSize.Size16x8)))) + (mov_from_vec result 0 (ScalarSize.Size16)))) + +(rule (lower (vhigh_bits vec @ (value_type $I32X4))) + (let ( + ;; Replicate the MSB of each of the 4 32-bit lanes across + ;; the whole lane (sshr is an arithmetic right shift). 
+ (shifted Reg (vec_shift_imm (VecShiftImmOp.Sshr) 31 vec (VectorSize.Size32x4))) + ;; Bitwise-and with a mask + ;; `0x00000008_00000004_00000002_00000001` to get the bit + ;; in the proper location for each group of 4 lanes. + (anded Reg (and_vec shifted (constant_f128 0x00000008_00000004_00000002_00000001) (VectorSize.Size32x4))) + ;; Add lanes together to get the 4 MSBs in the low byte. + (result Reg (addv anded (VectorSize.Size32x4)))) + (mov_from_vec result 0 (ScalarSize.Size32)))) + +(rule (lower (vhigh_bits vec @ (value_type $I64X2))) + (let ( + ;; Grab the MSB out of each of the two lanes, right-shift + ;; each down to the LSB, then add the upper lane's MSB, + ;; left-shifted back to bit 1, onto the lower lane's MSB + ;; to form the two-bit result. + (upper_msb Reg (mov_from_vec vec 1 (ScalarSize.Size64))) + (lower_msb Reg (mov_from_vec vec 0 (ScalarSize.Size64))) + (upper_msb Reg (lsr_imm $I64 upper_msb (imm_shift_from_u8 63))) + (lower_msb Reg (lsr_imm $I64 lower_msb (imm_shift_from_u8 63)))) + (add_shift $I64 lower_msb upper_msb (lshl_from_u64 $I64 1)))) + +;;; Rules for `iadd_ifcout` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; This is a two-output instruction that is needed for the +;; legalizer's explicit heap-check sequence, among possible other +;; uses. Its second output is a flags output only ever meant to +;; check for overflow using the +;; `backend.unsigned_add_overflow_condition()` condition. +;; +;; Note that the CLIF validation will ensure that no flag-setting +;; operation comes between this IaddIfcout and its use (e.g., a +;; Trapif). Thus, we can rely on implicit communication through the +;; processor flags rather than explicitly generating flags into a +;; register. We simply use the variant of the add instruction that +;; sets flags (`adds`) here. 
+;; +;; Note that the second output (the flags) need not be generated, +;; because flags are never materialized into a register; the only +;; instructions that can use a value of type `iflags` or `fflags` +;; will look directly for the flags-producing instruction (which can +;; always be found, by construction) and merge it. +;; +;; Now handle the iadd as above, except use an AddS opcode that sets +;; flags. + +(rule (lower (has_type (ty_int ty) + (iadd_ifcout a b))) + (output_pair + (add_with_flags ty a b) + (invalid_reg))) + +;;; Rules for `tls_value` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; TODO. + +;;; Rules for `fcvt_low_from_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (has_type $F64X2 (fcvt_low_from_sint val))) + (let ((extended Reg (vec_extend (VecExtendOp.Sxtl) val $false (ScalarSize.Size64))) + (converted Reg (vec_misc (VecMisc2.Scvtf) extended (VectorSize.Size64x2)))) + converted)) + +;;; Rules for `fvpromote_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (fvpromote_low val)) + (vec_rr_long (VecRRLongOp.Fcvtl32) val $false)) + +;;; Rules for `select` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; TODO: requires icmp/fcmp first. + +;;; Rules for `selectif` / `selectif_spectre_guard` ;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; TODO: requires icmp/fcmp first. + +;;; Rules for `trueif` / `trueff` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; TODO: requires icmp/fcmp first. + +;;; Rules for `brz`/`brnz`/`brif`/`brff`/`bricmp` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; TODO: requires icmp/fcmp first. + +;;; Rules for `jump` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; TODO. + +;;; Rules for `br_table` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; TODO. 
diff --git a/cranelift/codegen/src/isa/aarch64/lower/isle.rs b/cranelift/codegen/src/isa/aarch64/lower/isle.rs index 3b876bfbc8..8c54d9a3d9 100644 --- a/cranelift/codegen/src/isa/aarch64/lower/isle.rs +++ b/cranelift/codegen/src/isa/aarch64/lower/isle.rs @@ -128,7 +128,11 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> { } fn lshl_from_imm64(&mut self, ty: Type, n: Imm64) -> Option { - let shiftimm = ShiftOpShiftImm::maybe_from_shift(n.bits() as u64)?; + self.lshl_from_u64(ty, n.bits() as u64) + } + + fn lshl_from_u64(&mut self, ty: Type, n: u64) -> Option { + let shiftimm = ShiftOpShiftImm::maybe_from_shift(n)?; let shiftee_bits = ty_bits(ty); if shiftee_bits <= std::u8::MAX as usize { let shiftimm = shiftimm.mask(shiftee_bits as u8); @@ -722,4 +726,8 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> { ); } } + + fn writable_pinned_reg(&mut self) -> WritableReg { + super::regs::writable_xreg(super::regs::PINNED_REG) + } } diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs index 30ea4a2a11..c72ddea6ef 100644 --- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs +++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs @@ -4,7 +4,7 @@ use super::lower::*; use crate::binemit::CodeOffset; use crate::ir::types::*; use crate::ir::Inst as IRInst; -use crate::ir::{InstructionData, Opcode}; +use crate::ir::Opcode; use crate::isa::aarch64::inst::*; use crate::isa::aarch64::settings as aarch64_settings; use crate::machinst::lower::*; @@ -13,7 +13,6 @@ use crate::settings::{Flags, TlsModel}; use crate::{CodegenError, CodegenResult}; use alloc::boxed::Box; use alloc::vec::Vec; -use core::convert::TryFrom; use target_lexicon::Triple; /// Actually codegen an instruction's results into registers. 
@@ -231,23 +230,7 @@ pub(crate) fn lower_insn_to_regs( } } - Opcode::StackAddr => { - let (stack_slot, offset) = match *ctx.data(insn) { - InstructionData::StackLoad { - opcode: Opcode::StackAddr, - stack_slot, - offset, - } => (stack_slot, offset), - _ => unreachable!(), - }; - let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - let offset: i32 = offset.into(); - assert!(ctx.abi().sized_stackslot_offsets().is_valid(stack_slot)); - let inst = - ctx.abi() - .sized_stackslot_addr(stack_slot, u32::try_from(offset).unwrap(), rd); - ctx.emit(inst); - } + Opcode::StackAddr => implemented_in_isle(ctx), Opcode::DynamicStackAddr => implemented_in_isle(ctx), @@ -421,52 +404,7 @@ pub(crate) fn lower_insn_to_regs( Opcode::Bint => implemented_in_isle(ctx), - Opcode::Bitcast => { - let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - let ity = ctx.input_ty(insn, 0); - let oty = ctx.output_ty(insn, 0); - let ity_bits = ty_bits(ity); - let ity_vec_reg = ty_has_float_or_vec_representation(ity); - let oty_bits = ty_bits(oty); - let oty_vec_reg = ty_has_float_or_vec_representation(oty); - - debug_assert_eq!(ity_bits, oty_bits); - - match (ity_vec_reg, oty_vec_reg) { - (true, true) => { - let narrow_mode = if ity_bits <= 32 { - NarrowValueMode::ZeroExtend32 - } else { - NarrowValueMode::ZeroExtend64 - }; - let rm = put_input_in_reg(ctx, inputs[0], narrow_mode); - ctx.emit(Inst::gen_move(rd, rm, oty)); - } - (false, false) => { - let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); - ctx.emit(Inst::gen_move(rd, rm, oty)); - } - (false, true) => { - let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend64); - ctx.emit(Inst::MovToFpu { - rd, - rn, - size: ScalarSize::Size64, - }); - } - (true, false) => { - let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); - let size = ScalarSize::from_bits(oty_bits); - - ctx.emit(Inst::MovFromVec { - rd, - rn, - idx: 0, - size, - }); - } - } - } + Opcode::Bitcast => 
implemented_in_isle(ctx), Opcode::Return => implemented_in_isle(ctx), @@ -556,15 +494,7 @@ pub(crate) fn lower_insn_to_regs( Opcode::Call | Opcode::CallIndirect => implemented_in_isle(ctx), - Opcode::GetPinnedReg => { - let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - ctx.emit(Inst::gen_move(rd, xreg(PINNED_REG), I64)); - } - - Opcode::SetPinnedReg => { - let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); - ctx.emit(Inst::gen_move(writable_xreg(PINNED_REG), rm, I64)); - } + Opcode::GetPinnedReg | Opcode::SetPinnedReg => implemented_in_isle(ctx), Opcode::Jump | Opcode::Brz @@ -578,67 +508,11 @@ pub(crate) fn lower_insn_to_regs( Opcode::Vconst => implemented_in_isle(ctx), - Opcode::RawBitcast => { - let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); - let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - let ty = ctx.input_ty(insn, 0); - ctx.emit(Inst::gen_move(rd, rm, ty)); - } + Opcode::RawBitcast => implemented_in_isle(ctx), - Opcode::Extractlane => { - if let InstructionData::BinaryImm8 { imm, .. } = ctx.data(insn) { - let idx = *imm; - let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); - let input_ty = ctx.input_ty(insn, 0); - let size = VectorSize::from_ty(input_ty); - let ty = ty.unwrap(); + Opcode::Extractlane => implemented_in_isle(ctx), - if ty_has_int_representation(ty) { - ctx.emit(Inst::MovFromVec { - rd, - rn, - idx, - size: size.lane_size(), - }); - // Plain moves are faster on some processors. - } else if idx == 0 { - ctx.emit(Inst::gen_move(rd, rn, ty)); - } else { - ctx.emit(Inst::FpuMoveFromVec { rd, rn, idx, size }); - } - } else { - unreachable!(); - } - } - - Opcode::Insertlane => { - let idx = if let InstructionData::TernaryImm8 { imm, .. 
} = ctx.data(insn) { - *imm - } else { - unreachable!(); - }; - let input_ty = ctx.input_ty(insn, 1); - let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); - let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); - let ty = ty.unwrap(); - let size = VectorSize::from_ty(ty); - - ctx.emit(Inst::gen_move(rd, rm, ty)); - - if ty_has_int_representation(input_ty) { - ctx.emit(Inst::MovToVec { rd, rn, idx, size }); - } else { - ctx.emit(Inst::VecMovElement { - rd, - rn, - dest_idx: idx, - src_idx: 0, - size, - }); - } - } + Opcode::Insertlane => implemented_in_isle(ctx), Opcode::Splat => implemented_in_isle(ctx), @@ -646,240 +520,7 @@ pub(crate) fn lower_insn_to_regs( Opcode::VallTrue | Opcode::VanyTrue => implemented_in_isle(ctx), - Opcode::VhighBits => { - let dst_r = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - let src_v = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); - let ty = ctx.input_ty(insn, 0); - // All three sequences use one integer temporary and two vector temporaries. The - // shift is done early so as to give the register allocator the possibility of using - // the same reg for `tmp_v1` and `src_v` in the case that this is the last use of - // `src_v`. See https://github.com/WebAssembly/simd/pull/201 for the background and - // derivation of these sequences. Alternative sequences are discussed in - // https://github.com/bytecodealliance/wasmtime/issues/2296, although they are not - // used here. 
- let tmp_r0 = ctx.alloc_tmp(I64).only_reg().unwrap(); - let tmp_v0 = ctx.alloc_tmp(I8X16).only_reg().unwrap(); - let tmp_v1 = ctx.alloc_tmp(I8X16).only_reg().unwrap(); - match ty { - I8X16 => { - // sshr tmp_v1.16b, src_v.16b, #7 - // mov tmp_r0, #0x0201 - // movk tmp_r0, #0x0804, lsl 16 - // movk tmp_r0, #0x2010, lsl 32 - // movk tmp_r0, #0x8040, lsl 48 - // dup tmp_v0.2d, tmp_r0 - // and tmp_v1.16b, tmp_v1.16b, tmp_v0.16b - // ext tmp_v0.16b, tmp_v1.16b, tmp_v1.16b, #8 - // zip1 tmp_v0.16b, tmp_v1.16b, tmp_v0.16b - // addv tmp_v0h, tmp_v0.8h - // mov dst_r, tmp_v0.h[0] - ctx.emit(Inst::VecShiftImm { - op: VecShiftImmOp::Sshr, - rd: tmp_v1, - rn: src_v, - size: VectorSize::Size8x16, - imm: 7, - }); - lower_splat_const(ctx, tmp_v0, 0x8040201008040201u64, VectorSize::Size64x2); - ctx.emit(Inst::VecRRR { - alu_op: VecALUOp::And, - rd: tmp_v1, - rn: tmp_v1.to_reg(), - rm: tmp_v0.to_reg(), - size: VectorSize::Size8x16, - }); - ctx.emit(Inst::VecExtract { - rd: tmp_v0, - rn: tmp_v1.to_reg(), - rm: tmp_v1.to_reg(), - imm4: 8, - }); - ctx.emit(Inst::VecRRR { - alu_op: VecALUOp::Zip1, - rd: tmp_v0, - rn: tmp_v1.to_reg(), - rm: tmp_v0.to_reg(), - size: VectorSize::Size8x16, - }); - ctx.emit(Inst::VecLanes { - op: VecLanesOp::Addv, - rd: tmp_v0, - rn: tmp_v0.to_reg(), - size: VectorSize::Size16x8, - }); - ctx.emit(Inst::MovFromVec { - rd: dst_r, - rn: tmp_v0.to_reg(), - idx: 0, - size: ScalarSize::Size16, - }); - } - I16X8 => { - // sshr tmp_v1.8h, src_v.8h, #15 - // mov tmp_r0, #0x1 - // movk tmp_r0, #0x2, lsl 16 - // movk tmp_r0, #0x4, lsl 32 - // movk tmp_r0, #0x8, lsl 48 - // dup tmp_v0.2d, tmp_r0 - // shl tmp_r0, tmp_r0, #4 - // mov tmp_v0.d[1], tmp_r0 - // and tmp_v0.16b, tmp_v1.16b, tmp_v0.16b - // addv tmp_v0h, tmp_v0.8h - // mov dst_r, tmp_v0.h[0] - ctx.emit(Inst::VecShiftImm { - op: VecShiftImmOp::Sshr, - rd: tmp_v1, - rn: src_v, - size: VectorSize::Size16x8, - imm: 15, - }); - lower_constant_u64(ctx, tmp_r0, 0x0008000400020001u64); - ctx.emit(Inst::VecDup { - rd: 
tmp_v0, - rn: tmp_r0.to_reg(), - size: VectorSize::Size64x2, - }); - ctx.emit(Inst::AluRRImmShift { - alu_op: ALUOp::Lsl, - size: OperandSize::Size64, - rd: tmp_r0, - rn: tmp_r0.to_reg(), - immshift: ImmShift { imm: 4 }, - }); - ctx.emit(Inst::MovToVec { - rd: tmp_v0, - rn: tmp_r0.to_reg(), - idx: 1, - size: VectorSize::Size64x2, - }); - ctx.emit(Inst::VecRRR { - alu_op: VecALUOp::And, - rd: tmp_v0, - rn: tmp_v1.to_reg(), - rm: tmp_v0.to_reg(), - size: VectorSize::Size8x16, - }); - ctx.emit(Inst::VecLanes { - op: VecLanesOp::Addv, - rd: tmp_v0, - rn: tmp_v0.to_reg(), - size: VectorSize::Size16x8, - }); - ctx.emit(Inst::MovFromVec { - rd: dst_r, - rn: tmp_v0.to_reg(), - idx: 0, - size: ScalarSize::Size16, - }); - } - I32X4 => { - // sshr tmp_v1.4s, src_v.4s, #31 - // mov tmp_r0, #0x1 - // movk tmp_r0, #0x2, lsl 32 - // dup tmp_v0.2d, tmp_r0 - // shl tmp_r0, tmp_r0, #2 - // mov tmp_v0.d[1], tmp_r0 - // and tmp_v0.16b, tmp_v1.16b, tmp_v0.16b - // addv tmp_v0s, tmp_v0.4s - // mov dst_r, tmp_v0.s[0] - ctx.emit(Inst::VecShiftImm { - op: VecShiftImmOp::Sshr, - rd: tmp_v1, - rn: src_v, - size: VectorSize::Size32x4, - imm: 31, - }); - lower_constant_u64(ctx, tmp_r0, 0x0000000200000001u64); - ctx.emit(Inst::VecDup { - rd: tmp_v0, - rn: tmp_r0.to_reg(), - size: VectorSize::Size64x2, - }); - ctx.emit(Inst::AluRRImmShift { - alu_op: ALUOp::Lsl, - size: OperandSize::Size64, - rd: tmp_r0, - rn: tmp_r0.to_reg(), - immshift: ImmShift { imm: 2 }, - }); - ctx.emit(Inst::MovToVec { - rd: tmp_v0, - rn: tmp_r0.to_reg(), - idx: 1, - size: VectorSize::Size64x2, - }); - ctx.emit(Inst::VecRRR { - alu_op: VecALUOp::And, - rd: tmp_v0, - rn: tmp_v1.to_reg(), - rm: tmp_v0.to_reg(), - size: VectorSize::Size8x16, - }); - ctx.emit(Inst::VecLanes { - op: VecLanesOp::Addv, - rd: tmp_v0, - rn: tmp_v0.to_reg(), - size: VectorSize::Size32x4, - }); - ctx.emit(Inst::MovFromVec { - rd: dst_r, - rn: tmp_v0.to_reg(), - idx: 0, - size: ScalarSize::Size32, - }); - } - I64X2 => { - // mov dst_r, src_v.d[0] - 
// mov tmp_r0, src_v.d[1] - // lsr dst_r, dst_r, #63 - // lsr tmp_r0, tmp_r0, #63 - // add dst_r, dst_r, tmp_r0, lsl #1 - ctx.emit(Inst::MovFromVec { - rd: dst_r, - rn: src_v, - idx: 0, - size: ScalarSize::Size64, - }); - ctx.emit(Inst::MovFromVec { - rd: tmp_r0, - rn: src_v, - idx: 1, - size: ScalarSize::Size64, - }); - ctx.emit(Inst::AluRRImmShift { - alu_op: ALUOp::Lsr, - size: OperandSize::Size64, - rd: dst_r, - rn: dst_r.to_reg(), - immshift: ImmShift::maybe_from_u64(63).unwrap(), - }); - ctx.emit(Inst::AluRRImmShift { - alu_op: ALUOp::Lsr, - size: OperandSize::Size64, - rd: tmp_r0, - rn: tmp_r0.to_reg(), - immshift: ImmShift::maybe_from_u64(63).unwrap(), - }); - ctx.emit(Inst::AluRRRShift { - alu_op: ALUOp::Add, - size: OperandSize::Size32, - rd: dst_r, - rn: dst_r.to_reg(), - rm: tmp_r0.to_reg(), - shiftop: ShiftOpAndAmt::new( - ShiftOp::LSL, - ShiftOpShiftImm::maybe_from_shift(1).unwrap(), - ), - }); - } - _ => { - return Err(CodegenError::Unsupported(format!( - "VhighBits: Unsupported type: {:?}", - ty - ))) - } - } - } + Opcode::VhighBits => implemented_in_isle(ctx), Opcode::Shuffle => implemented_in_isle(ctx), @@ -917,34 +558,7 @@ pub(crate) fn lower_insn_to_regs( Opcode::FcvtToUintSat | Opcode::FcvtToSintSat => implemented_in_isle(ctx), - Opcode::IaddIfcout => { - // This is a two-output instruction that is needed for the - // legalizer's explicit heap-check sequence, among possible other - // uses. Its second output is a flags output only ever meant to - // check for overflow using the - // `backend.unsigned_add_overflow_condition()` condition. - // - // Note that the CLIF validation will ensure that no flag-setting - // operation comes between this IaddIfcout and its use (e.g., a - // Trapif). Thus, we can rely on implicit communication through the - // processor flags rather than explicitly generating flags into a - // register. We simply use the variant of the add instruction that - // sets flags (`adds`) here. 
- - // Note that the second output (the flags) need not be generated, - // because flags are never materialized into a register; the only - // instructions that can use a value of type `iflags` or `fflags` - // will look directly for the flags-producing instruction (which can - // always be found, by construction) and merge it. - - // Now handle the iadd as above, except use an AddS opcode that sets - // flags. - let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); - let rm = put_input_in_rse_imm12(ctx, inputs[1], NarrowValueMode::None); - let ty = ty.unwrap(); - ctx.emit(alu_inst_imm12(ALUOp::AddS, ty, rd, rn, rm)); - } + Opcode::IaddIfcout => implemented_in_isle(ctx), Opcode::IaddImm | Opcode::ImulImm @@ -1006,47 +620,9 @@ pub(crate) fn lower_insn_to_regs( Opcode::SqmulRoundSat => implemented_in_isle(ctx), - Opcode::FcvtLowFromSint => { - let ty = ty.unwrap(); + Opcode::FcvtLowFromSint => implemented_in_isle(ctx), - if ty != F64X2 { - return Err(CodegenError::Unsupported(format!( - "FcvtLowFromSint: Unsupported type: {:?}", - ty - ))); - } - - let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); - - ctx.emit(Inst::VecExtend { - t: VecExtendOp::Sxtl, - rd, - rn, - high_half: false, - lane_size: ScalarSize::Size64, - }); - ctx.emit(Inst::VecMisc { - op: VecMisc2::Scvtf, - rd, - rn: rd.to_reg(), - size: VectorSize::Size64x2, - }); - } - - Opcode::FvpromoteLow => { - debug_assert_eq!(ty.unwrap(), F64X2); - - let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); - - ctx.emit(Inst::VecRRLong { - op: VecRRLongOp::Fcvtl32, - rd, - rn, - high_half: false, - }); - } + Opcode::FvpromoteLow => implemented_in_isle(ctx), Opcode::Fvdemote => implemented_in_isle(ctx), diff --git a/cranelift/codegen/src/machinst/isle.rs 
b/cranelift/codegen/src/machinst/isle.rs index 458c2c1f35..d62b2f831e 100644 --- a/cranelift/codegen/src/machinst/isle.rs +++ b/cranelift/codegen/src/machinst/isle.rs @@ -371,6 +371,15 @@ macro_rules! isle_prelude_methods { ty.is_int().then(|| ty) } + #[inline] + fn ty_int_bool(&mut self, ty: Type) -> Option { + if ty.is_int() || ty.is_bool() { + Some(ty) + } else { + None + } + } + #[inline] fn ty_scalar_float(&mut self, ty: Type) -> Option { match ty { @@ -379,6 +388,15 @@ macro_rules! isle_prelude_methods { } } + #[inline] + fn ty_float_or_vec(&mut self, ty: Type) -> Option { + match ty { + F32 | F64 => Some(ty), + ty if ty.is_vector() => Some(ty), + _ => None, + } + } + #[inline] fn ty_vec64(&mut self, ty: Type) -> Option { if ty.is_vector() && ty.bits() == 64 { diff --git a/cranelift/codegen/src/prelude.isle b/cranelift/codegen/src/prelude.isle index d622de334a..7119a0cb87 100644 --- a/cranelift/codegen/src/prelude.isle +++ b/cranelift/codegen/src/prelude.isle @@ -365,6 +365,10 @@ (decl ty_int_bool_128 (Type) Type) (extern extractor ty_int_bool_128 ty_int_bool_128) +;; An extractor that matches any int or bool. +(decl ty_int_bool (Type) Type) +(extern extractor ty_int_bool ty_int_bool) + ;; An extractor that only matches integers. (decl ty_int (Type) Type) (extern extractor ty_int ty_int) @@ -373,6 +377,10 @@ (decl ty_scalar_float (Type) Type) (extern extractor ty_scalar_float ty_scalar_float) +;; An extractor that matches scalar floating-point types or vector types. +(decl ty_float_or_vec (Type) Type) +(extern extractor ty_float_or_vec ty_float_or_vec) + ;; A pure constructor that only matches 64-bit vector types. 
(decl pure ty_vec64 (Type) Type) (extern constructor ty_vec64 ty_vec64) diff --git a/cranelift/filetests/filetests/isa/aarch64/bitcast.clif b/cranelift/filetests/filetests/isa/aarch64/bitcast.clif new file mode 100644 index 0000000000..e16e088ec3 --- /dev/null +++ b/cranelift/filetests/filetests/isa/aarch64/bitcast.clif @@ -0,0 +1,43 @@ +test compile precise-output +target aarch64 + +function %f1(f32) -> i32 { +block0(v0: f32): + v1 = bitcast.i32 v0 + return v1 +} + +; block0: +; mov w0, v0.s[0] +; ret + +function %f2(i32) -> f32 { +block0(v0: i32): + v1 = bitcast.f32 v0 + return v1 +} + +; block0: +; fmov s0, w0 +; ret + +function %f3(f64) -> i64 { +block0(v0: f64): + v1 = bitcast.i64 v0 + return v1 +} + +; block0: +; mov x0, v0.d[0] +; ret + +function %f4(i64) -> f64 { +block0(v0: i64): + v1 = bitcast.f64 v0 + return v1 +} + +; block0: +; fmov d0, x0 +; ret + diff --git a/cranelift/filetests/filetests/isa/aarch64/dynamic-slot.clif b/cranelift/filetests/filetests/isa/aarch64/dynamic-slot.clif index 51f1f450e8..0072a2b146 100644 --- a/cranelift/filetests/filetests/isa/aarch64/dynamic-slot.clif +++ b/cranelift/filetests/filetests/isa/aarch64/dynamic-slot.clif @@ -15,9 +15,9 @@ block0: ; mov fp, sp ; sub sp, sp, #16 ; block0: -; mov x0, sp +; mov x1, sp ; movz x2, #1 -; str x2, [x0] +; str x2, [x1] ; add sp, sp, #16 ; ldp fp, lr, [sp], #16 ; ret @@ -36,9 +36,9 @@ block0: ; mov fp, sp ; sub sp, sp, #16 ; block0: -; mov x0, sp +; mov x1, sp ; movz x2, #1 -; str x2, [x0] +; str x2, [x1] ; add sp, sp, #16 ; ldp fp, lr, [sp], #16 ; ret diff --git a/cranelift/filetests/filetests/isa/aarch64/fcvt.clif b/cranelift/filetests/filetests/isa/aarch64/fcvt.clif new file mode 100644 index 0000000000..98f4ba119a --- /dev/null +++ b/cranelift/filetests/filetests/isa/aarch64/fcvt.clif @@ -0,0 +1,461 @@ +test compile precise-output +target aarch64 + +function %f1(i8) -> f32 { +block0(v0: i8): + v1 = fcvt_from_sint.f32 v0 + return v1 +} + +; block0: +; sxtb w3, w0 +; scvtf s0, w3 +; ret 
+ +function %f2(i16) -> f32 { +block0(v0: i16): + v1 = fcvt_from_sint.f32 v0 + return v1 +} + +; block0: +; sxth w3, w0 +; scvtf s0, w3 +; ret + +function %f3(i32) -> f32 { +block0(v0: i32): + v1 = fcvt_from_sint.f32 v0 + return v1 +} + +; block0: +; scvtf s0, w0 +; ret + +function %f4(i64) -> f32 { +block0(v0: i64): + v1 = fcvt_from_sint.f32 v0 + return v1 +} + +; block0: +; scvtf s0, x0 +; ret + +function %f5(i8) -> f64 { +block0(v0: i8): + v1 = fcvt_from_sint.f64 v0 + return v1 +} + +; block0: +; sxtb w3, w0 +; scvtf d0, w3 +; ret + +function %f6(i16) -> f64 { +block0(v0: i16): + v1 = fcvt_from_sint.f64 v0 + return v1 +} + +; block0: +; sxth w3, w0 +; scvtf d0, w3 +; ret + +function %f7(i32) -> f64 { +block0(v0: i32): + v1 = fcvt_from_sint.f64 v0 + return v1 +} + +; block0: +; scvtf d0, w0 +; ret + +function %f8(i64) -> f64 { +block0(v0: i64): + v1 = fcvt_from_sint.f64 v0 + return v1 +} + +; block0: +; scvtf d0, x0 +; ret + +function %f9(i32x4) -> f64x2 { +block0(v0: i32x4): + v1 = fcvt_low_from_sint.f64x2 v0 + return v1 +} + +; block0: +; sxtl v3.2d, v0.2s +; scvtf v0.2d, v3.2d +; ret + +function %f10(i8, i16, i32, i64) -> f32 { +block0(v0: i8, v1: i16, v2: i32, v3: i64): + v4 = fcvt_from_uint.f32 v0 + v5 = fcvt_from_uint.f32 v1 + v6 = fcvt_from_uint.f32 v2 + v7 = fcvt_from_uint.f32 v3 + v8 = fadd.f32 v4, v5 + v9 = fadd.f32 v8, v6 + v10 = fadd.f32 v9, v7 + return v10 +} + +; block0: +; uxtb w0, w0 +; ucvtf s26, w0 +; uxth w0, w1 +; ucvtf s27, w0 +; ucvtf s25, w2 +; ucvtf s28, x3 +; fadd s26, s26, s27 +; fadd s25, s26, s25 +; fadd s0, s25, s28 +; ret + +function %f11(i32x4) -> f64x2 { +block0(v0: i32x4): + v1 = uwiden_low v0 + v2 = fcvt_from_uint.f64x2 v1 + return v2 +} + +; block0: +; uxtl v4.2d, v0.2s +; ucvtf v0.2d, v4.2d +; ret + +function %f12(i32x4) -> f32x4 { +block0(v0: i32x4): + v1 = fcvt_from_uint.f32x4 v0 + return v1 +} + +; block0: +; ucvtf v0.4s, v0.4s +; ret + +function %f13(f32) -> i32 { +block0(v0: f32): + v1 = fcvt_to_uint.i32 v0 + return v1 +} 
+ +; block0: +; fcmp s0, s0 +; b.vc 8 ; udf +; fmov s5, #-1 +; fcmp s0, s5 +; b.gt 8 ; udf +; movz x10, #20352, LSL #16 +; fmov s18, w10 +; fcmp s0, s18 +; b.lt 8 ; udf +; fcvtzu w0, s0 +; ret + +function %f14(f32) -> i64 { +block0(v0: f32): + v1 = fcvt_to_uint.i64 v0 + return v1 +} + +; block0: +; fcmp s0, s0 +; b.vc 8 ; udf +; fmov s5, #-1 +; fcmp s0, s5 +; b.gt 8 ; udf +; movz x10, #24448, LSL #16 +; fmov s18, w10 +; fcmp s0, s18 +; b.lt 8 ; udf +; fcvtzu x0, s0 +; ret + +function %f15(f64) -> i32 { +block0(v0: f64): + v1 = fcvt_to_uint.i32 v0 + return v1 +} + +; block0: +; fcmp d0, d0 +; b.vc 8 ; udf +; fmov d5, #-1 +; fcmp d0, d5 +; b.gt 8 ; udf +; movz x10, #16880, LSL #48 +; fmov d18, x10 +; fcmp d0, d18 +; b.lt 8 ; udf +; fcvtzu w0, d0 +; ret + +function %f16(f64) -> i64 { +block0(v0: f64): + v1 = fcvt_to_uint.i64 v0 + return v1 +} + +; block0: +; fcmp d0, d0 +; b.vc 8 ; udf +; fmov d5, #-1 +; fcmp d0, d5 +; b.gt 8 ; udf +; movz x10, #17392, LSL #48 +; fmov d18, x10 +; fcmp d0, d18 +; b.lt 8 ; udf +; fcvtzu x0, d0 +; ret + +function %f17(f32) -> i32 { +block0(v0: f32): + v1 = fcvt_to_uint_sat.i32 v0 + return v1 +} + +; block0: +; movz x4, #20352, LSL #16 +; fmov s4, w4 +; fmin s7, s0, s4 +; movi v17.2s, #0 +; fmax s19, s7, s17 +; fcmp s0, s0 +; fcsel s22, s17, s19, ne +; fcvtzu w0, s22 +; ret + +function %f18(f32) -> i64 { +block0(v0: f32): + v1 = fcvt_to_uint_sat.i64 v0 + return v1 +} + +; block0: +; movz x4, #24448, LSL #16 +; fmov s4, w4 +; fmin s7, s0, s4 +; movi v17.2s, #0 +; fmax s19, s7, s17 +; fcmp s0, s0 +; fcsel s22, s17, s19, ne +; fcvtzu x0, s22 +; ret + +function %f19(f64) -> i32 { +block0(v0: f64): + v1 = fcvt_to_uint_sat.i32 v0 + return v1 +} + +; block0: +; ldr d3, pc+8 ; b 12 ; data.f64 4294967295 +; fmin d5, d0, d3 +; movi v7.2s, #0 +; fmax d17, d5, d7 +; fcmp d0, d0 +; fcsel d20, d7, d17, ne +; fcvtzu w0, d20 +; ret + +function %f20(f64) -> i64 { +block0(v0: f64): + v1 = fcvt_to_uint_sat.i64 v0 + return v1 +} + +; block0: +; movz x4, 
#17392, LSL #48 +; fmov d4, x4 +; fmin d7, d0, d4 +; movi v17.2s, #0 +; fmax d19, d7, d17 +; fcmp d0, d0 +; fcsel d22, d17, d19, ne +; fcvtzu x0, d22 +; ret + +function %f21(f32) -> i32 { +block0(v0: f32): + v1 = fcvt_to_sint.i32 v0 + return v1 +} + +; block0: +; fcmp s0, s0 +; b.vc 8 ; udf +; movz x6, #52992, LSL #16 +; fmov s6, w6 +; fcmp s0, s6 +; b.ge 8 ; udf +; movz x12, #20224, LSL #16 +; fmov s20, w12 +; fcmp s0, s20 +; b.lt 8 ; udf +; fcvtzs w0, s0 +; ret + +function %f22(f32) -> i64 { +block0(v0: f32): + v1 = fcvt_to_sint.i64 v0 + return v1 +} + +; block0: +; fcmp s0, s0 +; b.vc 8 ; udf +; movz x6, #57088, LSL #16 +; fmov s6, w6 +; fcmp s0, s6 +; b.ge 8 ; udf +; movz x12, #24320, LSL #16 +; fmov s20, w12 +; fcmp s0, s20 +; b.lt 8 ; udf +; fcvtzs x0, s0 +; ret + +function %f23(f64) -> i32 { +block0(v0: f64): + v1 = fcvt_to_sint.i32 v0 + return v1 +} + +; block0: +; fcmp d0, d0 +; b.vc 8 ; udf +; ldr d5, pc+8 ; b 12 ; data.f64 -2147483649 +; fcmp d0, d5 +; b.gt 8 ; udf +; movz x10, #16864, LSL #48 +; fmov d18, x10 +; fcmp d0, d18 +; b.lt 8 ; udf +; fcvtzs w0, d0 +; ret + +function %f24(f64) -> i64 { +block0(v0: f64): + v1 = fcvt_to_sint.i64 v0 + return v1 +} + +; block0: +; fcmp d0, d0 +; b.vc 8 ; udf +; movz x6, #50144, LSL #48 +; fmov d6, x6 +; fcmp d0, d6 +; b.ge 8 ; udf +; movz x12, #17376, LSL #48 +; fmov d20, x12 +; fcmp d0, d20 +; b.lt 8 ; udf +; fcvtzs x0, d0 +; ret + +function %f25(f32) -> i32 { +block0(v0: f32): + v1 = fcvt_to_sint_sat.i32 v0 + return v1 +} + +; block0: +; movz x4, #20224, LSL #16 +; fmov s4, w4 +; fmin s7, s0, s4 +; movz x10, #52992, LSL #16 +; fmov s18, w10 +; fmax s21, s7, s18 +; movi v23.16b, #0 +; fcmp s0, s0 +; fcsel s26, s23, s21, ne +; fcvtzs w0, s26 +; ret + +function %f26(f32) -> i64 { +block0(v0: f32): + v1 = fcvt_to_sint_sat.i64 v0 + return v1 +} + +; block0: +; movz x4, #24320, LSL #16 +; fmov s4, w4 +; fmin s7, s0, s4 +; movz x10, #57088, LSL #16 +; fmov s18, w10 +; fmax s21, s7, s18 +; movi v23.16b, #0 +; fcmp s0, s0 
+; fcsel s26, s23, s21, ne +; fcvtzs x0, s26 +; ret + +function %f27(f64) -> i32 { +block0(v0: f64): + v1 = fcvt_to_sint_sat.i32 v0 + return v1 +} + +; block0: +; ldr d3, pc+8 ; b 12 ; data.f64 2147483647 +; fmin d5, d0, d3 +; movz x8, #49632, LSL #48 +; fmov d16, x8 +; fmax d19, d5, d16 +; movi v21.16b, #0 +; fcmp d0, d0 +; fcsel d24, d21, d19, ne +; fcvtzs w0, d24 +; ret + +function %f28(f64) -> i64 { +block0(v0: f64): + v1 = fcvt_to_sint_sat.i64 v0 + return v1 +} + +; block0: +; movz x4, #17376, LSL #48 +; fmov d4, x4 +; fmin d7, d0, d4 +; movz x10, #50144, LSL #48 +; fmov d18, x10 +; fmax d21, d7, d18 +; movi v23.16b, #0 +; fcmp d0, d0 +; fcsel d26, d23, d21, ne +; fcvtzs x0, d26 +; ret + +function %f29(f32x4) -> i32x4 { +block0(v0: f32x4): + v1 = fcvt_to_uint_sat.i32x4 v0 + return v1 +} + +; block0: +; fcvtzu v0.4s, v0.4s +; ret + +function %f30(f32x4) -> i32x4 { +block0(v0: f32x4): + v1 = fcvt_to_sint_sat.i32x4 v0 + return v1 +} + +; block0: +; fcvtzs v0.4s, v0.4s +; ret + diff --git a/cranelift/filetests/filetests/isa/aarch64/reftypes.clif b/cranelift/filetests/filetests/isa/aarch64/reftypes.clif index 474629ed1c..1fa7db80b2 100644 --- a/cranelift/filetests/filetests/isa/aarch64/reftypes.clif +++ b/cranelift/filetests/filetests/isa/aarch64/reftypes.clif @@ -71,11 +71,11 @@ block3(v7: r64, v8: r64): ; str x0, [sp, #8] ; ldr x2, 8 ; b 12 ; data TestCase(%f) + 0 ; blr x2 -; mov x8, sp +; mov x4, sp ; ldr x11, [sp, #8] -; str x11, [x8] -; and w6, w0, #1 -; cbz x6, label1 ; b label3 +; str x11, [x4] +; and w5, w0, #1 +; cbz x5, label1 ; b label3 ; block1: ; b label2 ; block2: @@ -89,8 +89,8 @@ block3(v7: r64, v8: r64): ; ldr x1, [sp, #16] ; b label5 ; block5: -; mov x3, sp -; ldr x2, [x3] +; mov x6, sp +; ldr x2, [x6] ; add sp, sp, #32 ; ldp fp, lr, [sp], #16 ; ret diff --git a/cranelift/filetests/filetests/isa/aarch64/simd-bitwise-compile.clif b/cranelift/filetests/filetests/isa/aarch64/simd-bitwise-compile.clif new file mode 100644 index 0000000000..f9e9967ffe 
--- /dev/null +++ b/cranelift/filetests/filetests/isa/aarch64/simd-bitwise-compile.clif @@ -0,0 +1,213 @@ +test compile precise-output +set enable_simd +target aarch64 + +function %band_f32x4(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = band v0, v1 + return v2 +} + +; block0: +; and v0.16b, v0.16b, v1.16b +; ret + +function %band_f64x2(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = band v0, v1 + return v2 +} + +; block0: +; and v0.16b, v0.16b, v1.16b +; ret + +function %band_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = band v0, v1 + return v2 +} + +; block0: +; and v0.16b, v0.16b, v1.16b +; ret + +function %bor_f32x4(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = bor v0, v1 + return v2 +} + +; block0: +; orr v0.16b, v0.16b, v1.16b +; ret + +function %bor_f64x2(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = bor v0, v1 + return v2 +} + +; block0: +; orr v0.16b, v0.16b, v1.16b +; ret + +function %bor_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bor v0, v1 + return v2 +} + +; block0: +; orr v0.16b, v0.16b, v1.16b +; ret + +function %bxor_f32x4(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = bxor v0, v1 + return v2 +} + +; block0: +; eor v0.16b, v0.16b, v1.16b +; ret + +function %bxor_f64x2(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = bxor v0, v1 + return v2 +} + +; block0: +; eor v0.16b, v0.16b, v1.16b +; ret + +function %bxor_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bxor v0, v1 + return v2 +} + +; block0: +; eor v0.16b, v0.16b, v1.16b +; ret + +function %bitselect_i16x8() -> i16x8 { +block0: + v0 = vconst.i16x8 [0 0 0 0 0 0 0 0] + v1 = vconst.i16x8 [0 0 0 0 0 0 0 0] + v2 = vconst.i16x8 [0 0 0 0 0 0 0 0] + v3 = bitselect v0, v1, v2 + return v3 +} + +; block0: +; movi v0.16b, #0 +; movi v4.16b, #0 +; movi v5.16b, #0 +; bsl v0.16b, v4.16b, v5.16b +; ret + +function %vselect_i16x8(b16x8, i16x8, 
i16x8) -> i16x8 { +block0(v0: b16x8, v1: i16x8, v2: i16x8): + v3 = vselect v0, v1, v2 + return v3 +} + +; block0: +; bsl v0.16b, v1.16b, v2.16b +; ret + +function %vselect_f32x4(b32x4, f32x4, f32x4) -> f32x4 { +block0(v0: b32x4, v1: f32x4, v2: f32x4): + v3 = vselect v0, v1, v2 + return v3 +} + +; block0: +; bsl v0.16b, v1.16b, v2.16b +; ret + +function %vselect_f64x2(b64x2, f64x2, f64x2) -> f64x2 { +block0(v0: b64x2, v1: f64x2, v2: f64x2): + v3 = vselect v0, v1, v2 + return v3 +} + +; block0: +; bsl v0.16b, v1.16b, v2.16b +; ret + +function %ishl_i8x16(i32) -> i8x16 { +block0(v0: i32): + v1 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15] + v2 = ishl v1, v0 + return v2 +} + +; block0: +; ldr q6, pc+8 ; b 20 ; data.f128 0x0f0e0d0c0b0a09080706050403020100 +; and w4, w0, #7 +; dup v7.16b, w4 +; sshl v0.16b, v6.16b, v7.16b +; ret + +function %ushr_i8x16_imm() -> i8x16 { +block0: + v0 = iconst.i32 1 + v1 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15] + v2 = ushr v1, v0 + return v2 +} + +; block0: +; ldr q6, pc+8 ; b 20 ; data.f128 0x0f0e0d0c0b0a09080706050403020100 +; movz x2, #1 +; and w4, w2, #7 +; sub x6, xzr, x4 +; dup v16.16b, w6 +; ushl v0.16b, v6.16b, v16.16b +; ret + +function %sshr_i8x16(i32) -> i8x16 { +block0(v0: i32): + v1 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15] + v2 = sshr v1, v0 + return v2 +} + +; block0: +; ldr q7, pc+8 ; b 20 ; data.f128 0x0f0e0d0c0b0a09080706050403020100 +; and w4, w0, #7 +; sub x6, xzr, x4 +; dup v16.16b, w6 +; sshl v0.16b, v7.16b, v16.16b +; ret + +function %sshr_i8x16_imm(i8x16, i32) -> i8x16 { +block0(v0: i8x16, v1: i32): + v2 = sshr_imm v0, 3 + return v2 +} + +; block0: +; movz x5, #3 +; and w7, w5, #7 +; sub x9, xzr, x7 +; dup v19.16b, w9 +; sshl v0.16b, v0.16b, v19.16b +; ret + +function %sshr_i64x2(i64x2, i32) -> i64x2 { +block0(v0: i64x2, v1: i32): + v2 = sshr v0, v1 + return v2 +} + +; block0: +; and w5, w0, #63 +; sub x7, xzr, x5 +; dup v17.2d, x7 +; sshl v0.2d, v0.2d, v17.2d +; ret + diff --git 
a/cranelift/filetests/filetests/isa/aarch64/simd-comparison-legalize.clif b/cranelift/filetests/filetests/isa/aarch64/simd-comparison-legalize.clif new file mode 100644 index 0000000000..b16a6bef53 --- /dev/null +++ b/cranelift/filetests/filetests/isa/aarch64/simd-comparison-legalize.clif @@ -0,0 +1,45 @@ +test compile precise-output +set enable_simd +target aarch64 + +function %icmp_ne_32x4(i32x4, i32x4) -> b32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = icmp ne v0, v1 + return v2 +} + +; block0: +; cmeq v0.4s, v0.4s, v1.4s +; mvn v0.16b, v0.16b +; ret + +function %icmp_ugt_i32x4(i32x4, i32x4) -> b32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = icmp ugt v0, v1 + return v2 +} + +; block0: +; cmhi v0.4s, v0.4s, v1.4s +; ret + +function %icmp_sge_i16x8(i16x8, i16x8) -> b16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = icmp sge v0, v1 + return v2 +} + +; block0: +; cmge v0.8h, v0.8h, v1.8h +; ret + +function %icmp_uge_i8x16(i8x16, i8x16) -> b8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = icmp uge v0, v1 + return v2 +} + +; block0: +; cmhs v0.16b, v0.16b, v1.16b +; ret + diff --git a/cranelift/filetests/filetests/isa/aarch64/simd-lane-access-compile.clif b/cranelift/filetests/filetests/isa/aarch64/simd-lane-access-compile.clif new file mode 100644 index 0000000000..76eb32cb8e --- /dev/null +++ b/cranelift/filetests/filetests/isa/aarch64/simd-lane-access-compile.clif @@ -0,0 +1,124 @@ +test compile precise-output +set enable_simd +target aarch64 + +;; shuffle + +function %shuffle_different_ssa_values() -> i8x16 { +block0: + v0 = vconst.i8x16 0x00 + v1 = vconst.i8x16 0x01 + v2 = shuffle v0, v1, 0x11000000000000000000000000000000 ;; pick the second lane of v1, the rest use the first lane of v0 + return v2 +} + +; block0: +; movi v30.16b, #0 +; movz x5, #1 +; fmov s31, w5 +; ldr q4, pc+8 ; b 20 ; data.f128 0x11000000000000000000000000000000 +; tbl v0.16b, { v30.16b, v31.16b }, v4.16b +; ret + +function %shuffle_same_ssa_value() -> i8x16 { +block0: + v1 = vconst.i8x16 0x01 + v2 = 
shuffle v1, v1, 0x13000000000000000000000000000000 ;; pick the fourth lane of v1 and the rest from the first lane of v1 + return v2 +} + +; block0: +; movz x4, #1 +; fmov s30, w4 +; ldr q3, pc+8 ; b 20 ; data.f128 0x13000000000000000000000000000000 +; mov v31.16b, v30.16b +; tbl v0.16b, { v30.16b, v31.16b }, v3.16b +; ret + +function %swizzle() -> i8x16 { +block0: + v0 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15] + v1 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15] + v2 = swizzle.i8x16 v0, v1 + return v2 +} + +; block0: +; ldr q3, pc+8 ; b 20 ; data.f128 0x0f0e0d0c0b0a09080706050403020100 +; ldr q4, pc+8 ; b 20 ; data.f128 0x0f0e0d0c0b0a09080706050403020100 +; tbl v0.16b, { v3.16b }, v4.16b +; ret + +function %splat_i8(i8) -> i8x16 { +block0(v0: i8): + v1 = splat.i8x16 v0 + return v1 +} + +; block0: +; dup v0.16b, w0 +; ret + +function %splat_b16() -> b16x8 { +block0: + v0 = bconst.b16 true + v1 = splat.b16x8 v0 + return v1 +} + +; block0: +; movi v0.16b, #255 +; ret + +function %splat_i32(i32) -> i32x4 { +block0(v0: i32): + v1 = splat.i32x4 v0 + return v1 +} + +; block0: +; dup v0.4s, w0 +; ret + +function %splat_f64(f64) -> f64x2 { +block0(v0: f64): + v1 = splat.f64x2 v0 + return v1 +} + +; block0: +; dup v0.2d, v0.d[0] +; ret + +function %load32_zero_coalesced(i64) -> i32x4 { +block0(v0: i64): + v1 = load.i32 v0 + v2 = scalar_to_vector.i32x4 v1 + return v2 +} + +; block0: +; ldr w2, [x0] +; fmov s0, w2 +; ret + +function %load32_zero_int(i32) -> i32x4 { +block0(v0: i32): + v1 = scalar_to_vector.i32x4 v0 + return v1 +} + +; block0: +; fmov s0, w0 +; ret + +function %load32_zero_float(f32) -> f32x4 { +block0(v0: f32): + v1 = scalar_to_vector.f32x4 v0 + return v1 +} + +; block0: +; fmov s0, s0 +; ret + diff --git a/cranelift/filetests/filetests/isa/aarch64/simd-logical-compile.clif b/cranelift/filetests/filetests/isa/aarch64/simd-logical-compile.clif new file mode 100644 index 0000000000..a1fa05f266 --- /dev/null +++ 
b/cranelift/filetests/filetests/isa/aarch64/simd-logical-compile.clif @@ -0,0 +1,40 @@ +test compile precise-output +set enable_simd +target aarch64 + +function %bnot_b32x4(b32x4) -> b32x4 { +block0(v0: b32x4): + v1 = bnot v0 + return v1 +} + +; block0: +; mvn v0.16b, v0.16b +; ret + +function %vany_true_b32x4(b32x4) -> b1 { +block0(v0: b32x4): + v1 = vany_true v0 + return v1 +} + +; block0: +; umaxp v3.4s, v0.4s, v0.4s +; mov x5, v3.d[0] +; subs xzr, x5, #0 +; csetm x0, ne +; ret + +function %vall_true_i64x2(i64x2) -> b1 { +block0(v0: i64x2): + v1 = vall_true v0 + return v1 +} + +; block0: +; cmeq v3.2d, v0.2d, #0 +; addp v5.2d, v3.2d, v3.2d +; fcmp d5, d5 +; cset x0, eq +; ret + diff --git a/cranelift/filetests/filetests/isa/aarch64/simd-pairwise-add.clif b/cranelift/filetests/filetests/isa/aarch64/simd-pairwise-add.clif index 5ba3712851..9b3f24d5be 100644 --- a/cranelift/filetests/filetests/isa/aarch64/simd-pairwise-add.clif +++ b/cranelift/filetests/filetests/isa/aarch64/simd-pairwise-add.clif @@ -1,8 +1,6 @@ test compile precise-output -set unwind_info=false target aarch64 - function %fn1(i8x16) -> i16x8 { block0(v0: i8x16): v1 = swiden_low v0 @@ -15,19 +13,7 @@ block0(v0: i8x16): ; saddlp v0.8h, v0.16b ; ret -function %fn2(i8x16) -> i16x8 { -block0(v0: i8x16): - v1 = uwiden_low v0 - v2 = uwiden_high v0 - v3 = iadd_pairwise v1, v2 - return v3 -} - -; block0: -; uaddlp v0.8h, v0.16b -; ret - -function %fn3(i16x8) -> i32x4 { +function %fn2(i16x8) -> i32x4 { block0(v0: i16x8): v1 = swiden_low v0 v2 = swiden_high v0 @@ -39,6 +25,18 @@ block0(v0: i16x8): ; saddlp v0.4s, v0.8h ; ret +function %fn3(i8x16) -> i16x8 { +block0(v0: i8x16): + v1 = uwiden_low v0 + v2 = uwiden_high v0 + v3 = iadd_pairwise v1, v2 + return v3 +} + +; block0: +; uaddlp v0.8h, v0.16b +; ret + function %fn4(i16x8) -> i32x4 { block0(v0: i16x8): v1 = uwiden_low v0 @@ -51,169 +49,3 @@ block0(v0: i16x8): ; uaddlp v0.4s, v0.8h ; ret -function %fn5(i8x16, i8x16) -> i16x8 { -block0(v0: i8x16, v1: 
i8x16): - v2 = swiden_low v0 - v3 = swiden_high v1 - v4 = iadd_pairwise v2, v3 - return v4 -} - -; block0: -; sxtl v7.8h, v0.8b -; sxtl2 v16.8h, v1.16b -; addp v0.8h, v7.8h, v16.8h -; ret - -function %fn6(i8x16, i8x16) -> i16x8 { -block0(v0: i8x16, v1: i8x16): - v2 = uwiden_low v0 - v3 = uwiden_high v1 - v4 = iadd_pairwise v2, v3 - return v4 -} - -; block0: -; uxtl v7.8h, v0.8b -; uxtl2 v16.8h, v1.16b -; addp v0.8h, v7.8h, v16.8h -; ret - -function %fn7(i8x16) -> i16x8 { -block0(v0: i8x16): - v1 = uwiden_low v0 - v2 = swiden_high v0 - v3 = iadd_pairwise v1, v2 - return v3 -} - -; block0: -; uxtl v5.8h, v0.8b -; sxtl2 v6.8h, v0.16b -; addp v0.8h, v5.8h, v6.8h -; ret - -function %fn8(i8x16) -> i16x8 { -block0(v0: i8x16): - v1 = swiden_low v0 - v2 = uwiden_high v0 - v3 = iadd_pairwise v1, v2 - return v3 -} - -; block0: -; sxtl v5.8h, v0.8b -; uxtl2 v6.8h, v0.16b -; addp v0.8h, v5.8h, v6.8h -; ret - -function %fn9(i8x8, i8x8) -> i8x8 { -block0(v0: i8x8, v1: i8x8): - v2 = iadd_pairwise v0, v1 - return v2 -} - -; block0: -; addp v0.8b, v0.8b, v1.8b -; ret - -function %fn10(i8x16, i8x16) -> i8x16 { -block0(v0: i8x16, v1: i8x16): - v2 = iadd_pairwise v0, v1 - return v2 -} - -; block0: -; addp v0.16b, v0.16b, v1.16b -; ret - -function %fn11(i16x4, i16x4) -> i16x4 { -block0(v0: i16x4, v1: i16x4): - v2 = iadd_pairwise v0, v1 - return v2 -} - -; block0: -; addp v0.4h, v0.4h, v1.4h -; ret - -function %fn12(i16x8, i16x8) -> i16x8 { -block0(v0: i16x8, v1: i16x8): - v2 = iadd_pairwise v0, v1 - return v2 -} - -; block0: -; addp v0.8h, v0.8h, v1.8h -; ret - -function %fn14(i32x4, i32x4) -> i32x4 { -block0(v0: i32x4, v1: i32x4): - v2 = iadd_pairwise v0, v1 - return v2 -} - -; block0: -; addp v0.4s, v0.4s, v1.4s -; ret - -function %fn15(i8x8, i8x8) -> i16x4 { -block0(v0: i8x8, v1: i8x8): - v2 = swiden_low v0 - v3 = swiden_high v1 - v4 = iadd_pairwise v2, v3 - return v4 -} - -; block0: -; sxtl v16.8h, v0.8b -; mov s7, v1.s[1] -; sxtl v17.8h, v7.8b -; addp v0.4h, v16.4h, v17.4h -; ret - 
-function %fn16(i8x8, i8x8) -> i16x4 { -block0(v0: i8x8, v1: i8x8): - v2 = uwiden_low v0 - v3 = uwiden_high v1 - v4 = iadd_pairwise v2, v3 - return v4 -} - -; block0: -; uxtl v16.8h, v0.8b -; mov s7, v1.s[1] -; uxtl v17.8h, v7.8b -; addp v0.4h, v16.4h, v17.4h -; ret - -function %fn17(i8x8) -> i16x4 { -block0(v0: i8x8): - v1 = uwiden_low v0 - v2 = swiden_high v0 - v3 = iadd_pairwise v1, v2 - return v3 -} - -; block0: -; uxtl v6.8h, v0.8b -; mov s5, v0.s[1] -; sxtl v7.8h, v5.8b -; addp v0.4h, v6.4h, v7.4h -; ret - -function %fn18(i8x8) -> i16x4 { -block0(v0: i8x8): - v1 = swiden_low v0 - v2 = uwiden_high v0 - v3 = iadd_pairwise v1, v2 - return v3 -} - -; block0: -; sxtl v6.8h, v0.8b -; mov s5, v0.s[1] -; uxtl v7.8h, v5.8b -; addp v0.4h, v6.4h, v7.4h -; ret - diff --git a/cranelift/filetests/filetests/isa/aarch64/stack.clif b/cranelift/filetests/filetests/isa/aarch64/stack.clif index a5ebd29a9e..a1478dccda 100644 --- a/cranelift/filetests/filetests/isa/aarch64/stack.clif +++ b/cranelift/filetests/filetests/isa/aarch64/stack.clif @@ -53,8 +53,8 @@ block0: ; mov fp, sp ; sub sp, sp, #16 ; block0: -; mov x0, sp -; ldr x0, [x0] +; mov x2, sp +; ldr x0, [x2] ; add sp, sp, #16 ; ldp fp, lr, [sp], #16 ; ret @@ -74,8 +74,8 @@ block0: ; movk w16, #1, LSL #16 ; sub sp, sp, x16, UXTX ; block0: -; mov x0, sp -; ldr x0, [x0] +; mov x2, sp +; ldr x0, [x2] ; movz w16, #34480 ; movk w16, #1, LSL #16 ; add sp, sp, x16, UXTX @@ -442,8 +442,8 @@ block0(v0: i128): ; mov fp, sp ; sub sp, sp, #16 ; block0: -; mov x4, sp -; stp x0, x1, [x4] +; mov x5, sp +; stp x0, x1, [x5] ; add sp, sp, #16 ; ldp fp, lr, [sp], #16 ; ret @@ -461,8 +461,8 @@ block0(v0: i128): ; mov fp, sp ; sub sp, sp, #32 ; block0: -; add x4, sp, #32 -; stp x0, x1, [x4] +; add x5, sp, #32 +; stp x0, x1, [x5] ; add sp, sp, #32 ; ldp fp, lr, [sp], #16 ; ret @@ -482,8 +482,8 @@ block0(v0: i128): ; movk w16, #1, LSL #16 ; sub sp, sp, x16, UXTX ; block0: -; mov x4, sp -; stp x0, x1, [x4] +; mov x5, sp +; stp x0, x1, [x5] ; movz 
w16, #34480 ; movk w16, #1, LSL #16 ; add sp, sp, x16, UXTX @@ -502,8 +502,8 @@ block0: ; mov fp, sp ; sub sp, sp, #16 ; block0: -; mov x0, sp -; ldp x0, x1, [x0] +; mov x5, sp +; ldp x0, x1, [x5] ; add sp, sp, #16 ; ldp fp, lr, [sp], #16 ; ret @@ -521,8 +521,8 @@ block0: ; mov fp, sp ; sub sp, sp, #32 ; block0: -; add x0, sp, #32 -; ldp x0, x1, [x0] +; add x5, sp, #32 +; ldp x0, x1, [x5] ; add sp, sp, #32 ; ldp fp, lr, [sp], #16 ; ret @@ -542,8 +542,8 @@ block0: ; movk w16, #1, LSL #16 ; sub sp, sp, x16, UXTX ; block0: -; mov x0, sp -; ldp x0, x1, [x0] +; mov x5, sp +; ldp x0, x1, [x5] ; movz w16, #34480 ; movk w16, #1, LSL #16 ; add sp, sp, x16, UXTX diff --git a/cranelift/filetests/filetests/isa/aarch64/traps.clif b/cranelift/filetests/filetests/isa/aarch64/traps.clif index 206ee938f2..cdc465efce 100644 --- a/cranelift/filetests/filetests/isa/aarch64/traps.clif +++ b/cranelift/filetests/filetests/isa/aarch64/traps.clif @@ -1,8 +1,7 @@ test compile precise-output -set unwind_info=false target aarch64 -function %f() { +function %trap() { block0: trap user0 } @@ -10,26 +9,14 @@ block0: ; block0: ; udf #0xc11f -function %g(i64) { -block0(v0: i64): - v1 = iconst.i64 42 - v2 = ifcmp v0, v1 - trapif eq v2, user0 +function %trap_iadd_ifcout(i64, i64) { +block0(v0: i64, v1: i64): + v2, v3 = iadd_ifcout v0, v1 + trapif of v3, user0 return } ; block0: -; subs xzr, x0, #42 -; b.ne 8 ; udf -; ret - -function %h() { -block0: - debugtrap - return -} - -; block0: -; brk #0 +; b.vc 8 ; udf ; ret diff --git a/cranelift/filetests/filetests/isa/aarch64/vhigh_bits.clif b/cranelift/filetests/filetests/isa/aarch64/vhigh_bits.clif new file mode 100644 index 0000000000..53a99fe2c8 --- /dev/null +++ b/cranelift/filetests/filetests/isa/aarch64/vhigh_bits.clif @@ -0,0 +1,85 @@ +test compile precise-output +target aarch64 + +function %f1(i8x16) -> i8 { +block0(v0: i8x16): + v1 = vhigh_bits.i8 v0 + return v1 +} + +; block0: +; sshr v3.16b, v0.16b, #7 +; movz x6, #513 +; movk x6, #2052, LSL 
#16 +; movk x6, #8208, LSL #32 +; movk x6, #32832, LSL #48 +; dup v17.2d, x6 +; and v20.16b, v3.16b, v17.16b +; ext v22.16b, v20.16b, v20.16b, #8 +; zip1 v24.16b, v20.16b, v22.16b +; addv h26, v24.8h +; umov w0, v26.h[0] +; ret + +function %f2(i8x16) -> i16 { +block0(v0: i8x16): + v1 = vhigh_bits.i16 v0 + return v1 +} + +; block0: +; sshr v3.16b, v0.16b, #7 +; movz x6, #513 +; movk x6, #2052, LSL #16 +; movk x6, #8208, LSL #32 +; movk x6, #32832, LSL #48 +; dup v17.2d, x6 +; and v20.16b, v3.16b, v17.16b +; ext v22.16b, v20.16b, v20.16b, #8 +; zip1 v24.16b, v20.16b, v22.16b +; addv h26, v24.8h +; umov w0, v26.h[0] +; ret + +function %f3(i16x8) -> i8 { +block0(v0: i16x8): + v1 = vhigh_bits.i8 v0 + return v1 +} + +; block0: +; sshr v3.8h, v0.8h, #15 +; ldr q5, pc+8 ; b 20 ; data.f128 0x00800040002000100008000400020001 +; and v7.16b, v3.16b, v5.16b +; addv h17, v7.8h +; umov w0, v17.h[0] +; ret + +function %f4(i32x4) -> i8 { +block0(v0: i32x4): + v1 = vhigh_bits.i8 v0 + return v1 +} + +; block0: +; sshr v3.4s, v0.4s, #31 +; ldr q5, pc+8 ; b 20 ; data.f128 0x00000008000000040000000200000001 +; and v7.16b, v3.16b, v5.16b +; addv s17, v7.4s +; mov w0, v17.s[0] +; ret + +function %f5(i64x2) -> i8 { +block0(v0: i64x2): + v1 = vhigh_bits.i8 v0 + return v1 +} + +; block0: +; mov x3, v0.d[1] +; mov x5, v0.d[0] +; lsr x7, x3, #63 +; lsr x9, x5, #63 +; add x0, x9, x7, LSL 1 +; ret +