From ead6edb0c5cd6d3842ecba9b030367fdcd53fc80 Mon Sep 17 00:00:00 2001 From: Anton Kirilov Date: Tue, 26 Jul 2022 18:57:15 +0100 Subject: [PATCH] Cranelift AArch64: Migrate Splat to ISLE (#4521) Copyright (c) 2022, Arm Limited. --- cranelift/codegen/src/ir/types.rs | 4 +- cranelift/codegen/src/isa/aarch64/inst.isle | 45 +++- .../codegen/src/isa/aarch64/inst/emit.rs | 13 +- .../src/isa/aarch64/inst/emit_tests.rs | 16 +- cranelift/codegen/src/isa/aarch64/inst/mod.rs | 11 - .../codegen/src/isa/aarch64/inst/regs.rs | 51 ++--- cranelift/codegen/src/isa/aarch64/lower.isle | 43 +++- .../codegen/src/isa/aarch64/lower/isle.rs | 26 ++- .../codegen/src/isa/aarch64/lower_inst.rs | 75 +------ cranelift/codegen/src/isa/s390x/inst.isle | 11 +- cranelift/codegen/src/isa/s390x/lower.isle | 13 +- cranelift/codegen/src/isa/s390x/lower/isle.rs | 5 - cranelift/codegen/src/machinst/isle.rs | 28 ++- cranelift/codegen/src/prelude.isle | 24 ++- .../filetests/isa/aarch64/bitops.clif | 11 +- .../isa/aarch64/dynamic-simd-narrow.clif | 91 ++++----- .../isa/aarch64/dynamic-simd-neon.clif | 114 ++++++----- .../isa/aarch64/dynamic-simd-widen.clif | 44 ++-- .../filetests/isa/aarch64/dynamic-slot.clif | 12 +- .../filetests/isa/aarch64/prologue.clif | 101 +++++---- .../filetests/runtests/simd-splat.clif | 193 +++++++++++++++++- 21 files changed, 593 insertions(+), 338 deletions(-) diff --git a/cranelift/codegen/src/ir/types.rs b/cranelift/codegen/src/ir/types.rs index cacafb5bfb..311addadf7 100644 --- a/cranelift/codegen/src/ir/types.rs +++ b/cranelift/codegen/src/ir/types.rs @@ -171,8 +171,8 @@ impl Type { self.replace_lanes(match self.lane_type() { I8 | B1 | B8 => I8, I16 | B16 => I16, - I32 | B32 => I32, - I64 | B64 => I64, + I32 | B32 | F32 => I32, + I64 | B64 | F64 => I64, I128 | B128 => I128, _ => unimplemented!(), }) diff --git a/cranelift/codegen/src/isa/aarch64/inst.isle b/cranelift/codegen/src/isa/aarch64/inst.isle index 95d6c5ae44..f6eed6f74e 100644 --- 
a/cranelift/codegen/src/isa/aarch64/inst.isle +++ b/cranelift/codegen/src/isa/aarch64/inst.isle @@ -627,7 +627,8 @@ (VecLoadReplicate (rd WritableReg) (rn Reg) - (size VectorSize)) + (size VectorSize) + (flags MemFlags)) ;; Vector conditional select, 128 bit. A synthetic instruction, which generates a 4-insn ;; control-flow diamond. @@ -1376,6 +1377,16 @@ (decl cond_br_cond (Cond) CondBrKind) (extern constructor cond_br_cond cond_br_cond) +;; Lower the address of a load or a store. +(decl amode (Type Inst u32) AMode) +;; TODO: Port lower_address() to ISLE. +(extern constructor amode amode) + +;; Matches an `AMode` that is just a register. +(decl pure amode_is_reg (AMode) Reg) +;; TODO: Implement in ISLE. +(extern constructor amode_is_reg amode_is_reg) + ;; Instruction creation helpers ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Helper for creating the zero register. @@ -1481,6 +1492,13 @@ (_ Unit (emit (MInst.VecDup dst src size)))) dst)) +;; Helper for emitting `MInst.VecDupFromFpu` instructions. +(decl vec_dup_from_fpu (Reg VectorSize) Reg) +(rule (vec_dup_from_fpu src size) + (let ((dst WritableReg (temp_writable_reg $I8X16)) + (_ Unit (emit (MInst.VecDupFromFpu dst src size)))) + dst)) + ;; Helper for emitting `MInst.AluRRImm12` instructions. (decl alu_rr_imm12 (ALUOp Type Reg Imm12) Reg) (rule (alu_rr_imm12 op ty src imm) @@ -2167,7 +2185,7 @@ (decl sinkable_atomic_load (SinkableAtomicLoad) Value) (extern extractor sinkable_atomic_load sinkable_atomic_load) -;; Sink a `SinkableLoad` into a `Reg`. +;; Sink a `SinkableAtomicLoad` into a `Reg`. ;; ;; This is a side-effectful operation that notifies the context that the ;; instruction that produced the `SinkableAtomicLoad` has been sunk into another @@ -2230,6 +2248,29 @@ (alu_rrr op ty x_lo y_lo) (alu_rrr op ty x_hi y_hi)))) +;; Helper for emitting `MInst.VecLoadReplicate` instructions. 
+(decl ld1r (Reg VectorSize MemFlags) Reg) +(rule (ld1r src size flags) + (let ((dst WritableReg (temp_writable_reg $I8X16)) + (_ Unit (emit (MInst.VecLoadReplicate dst src size flags)))) + dst)) + +;; Helper for emitting `MInst.LoadAddr` instructions. +(decl load_addr (AMode) Reg) +(rule (load_addr addr) + (let ((dst WritableReg (temp_writable_reg $I64)) + (_ Unit (emit (MInst.LoadAddr dst addr)))) + dst)) + +(rule (load_addr addr) + (if-let addr_reg (amode_is_reg addr)) + addr_reg) + +;; Lower a vector splat with a constant parameter. +(decl splat_const (u64 VectorSize) Reg) +;; TODO: Port lower_splat_const() to ISLE. +(extern constructor splat_const splat_const) + ;; Generate comparison to zero operator from input condition code (decl float_cc_cmp_zero_to_vec_misc_op (FloatCC) VecMisc2) (extern constructor float_cc_cmp_zero_to_vec_misc_op float_cc_cmp_zero_to_vec_misc_op) diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs index 9fbbee1849..bcffdc8de0 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs @@ -2258,10 +2258,10 @@ impl MachInstEmit for Inst { ScalarSize::Size16 => 0b00010, ScalarSize::Size32 => 0b00100, ScalarSize::Size64 => 0b01000, - _ => unimplemented!("Unexpected VectorSize: {:?}", size), + _ => unreachable!(), }; sink.put4( - 0b000_01110000_00000_000011_00000_00000 + 0b0_0_0_01110000_00000_000011_00000_00000 | (q << 30) | (imm5 << 16) | (machreg_to_gpr(rn) << 5) @@ -2625,13 +2625,18 @@ impl MachInstEmit for Inst { }; sink.put4(enc_vec_rrr(top11 | q << 9, rm, bit15_10, rn, rd)); } - &Inst::VecLoadReplicate { rd, rn, size } => { + &Inst::VecLoadReplicate { + rd, + rn, + size, + flags, + } => { let rd = allocs.next_writable(rd); let rn = allocs.next(rn); let (q, size) = size.enc_size(); let srcloc = state.cur_srcloc(); - if srcloc != SourceLoc::default() { + if srcloc != SourceLoc::default() && !flags.notrap() { // Register the 
offset at which the actual load instruction starts. sink.add_trap(TrapCode::HeapOutOfBounds); } diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs index 071e4864b0..4217c13810 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs @@ -2351,10 +2351,10 @@ fn test_aarch64_binemit() { Inst::VecDup { rd: writable_vreg(25), rn: xreg(7), - size: VectorSize::Size8x16, + size: VectorSize::Size8x8, }, - "F90C014E", - "dup v25.16b, w7", + "F90C010E", + "dup v25.8b, w7", )); insns.push(( Inst::VecDup { @@ -2387,10 +2387,10 @@ fn test_aarch64_binemit() { Inst::VecDup { rd: writable_vreg(0), rn: xreg(28), - size: VectorSize::Size32x4, + size: VectorSize::Size32x2, }, - "800F044E", - "dup v0.4s, w28", + "800F040E", + "dup v0.2s, w28", )); insns.push(( Inst::VecDup { @@ -5199,8 +5199,8 @@ fn test_aarch64_binemit() { Inst::VecLoadReplicate { rd: writable_vreg(31), rn: xreg(0), - size: VectorSize::Size64x2, + flags: MemFlags::trusted(), }, "1FCC404D", "ld1r { v31.2d }, [x0]", @@ -5210,8 +5210,8 @@ fn test_aarch64_binemit() { Inst::VecLoadReplicate { rd: writable_vreg(0), rn: xreg(25), - size: VectorSize::Size8x8, + flags: MemFlags::trusted(), }, "20C3400D", "ld1r { v0.8b }, [x25]", diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs index b708d6df05..05c51459ce 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs @@ -530,17 +530,6 @@ impl Inst { } } } - - /// Generate a LoadAddr instruction (load address of an amode into - /// register). Elides when possible (when amode is just a register). Returns - /// destination register: either `rd` or a register directly from the amode. 
- pub fn gen_load_addr(rd: Writable<Reg>, mem: AMode) -> (Reg, Option<Inst>) { - if let Some(r) = mem.is_reg() { - (r, None) - } else { - (rd.to_reg(), Some(Inst::LoadAddr { rd, mem })) - } - } } //============================================================================= diff --git a/cranelift/codegen/src/isa/aarch64/inst/regs.rs b/cranelift/codegen/src/isa/aarch64/inst/regs.rs index ba86baeeb4..fbae85ecb7 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/regs.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/regs.rs @@ -165,6 +165,8 @@ pub fn create_reg_env(flags: &settings::Flags) -> MachineEnv { preg(xreg(14)), preg(xreg(15)), // x16 and x17 are spilltmp and tmp2 (see above). + // x18 could be used by the platform to carry inter-procedural state; + // conservatively assume so and make it not allocatable. // x19-28 are callee-saved and so not preferred. // x21 is the pinned register (if enabled) and not allocatable if so. // x29 is FP, x30 is LR, x31 is SP/ZR. @@ -178,30 +180,7 @@ pub fn create_reg_env(flags: &settings::Flags) -> MachineEnv { preg(vreg(5)), preg(vreg(6)), preg(vreg(7)), - preg(vreg(8)), - preg(vreg(9)), - preg(vreg(10)), - preg(vreg(11)), - preg(vreg(12)), - preg(vreg(13)), - preg(vreg(14)), - preg(vreg(15)), - ], - ], - non_preferred_regs_by_class: [ - vec![ - preg(xreg(19)), - preg(xreg(20)), - // x21 is pinned reg if enabled; we add to this list below if not. - preg(xreg(22)), - preg(xreg(23)), - preg(xreg(24)), - preg(xreg(25)), - preg(xreg(26)), - preg(xreg(27)), - preg(xreg(28)), - ], - vec![ + // v8-15 are callee-saved and so not preferred. preg(vreg(16)), preg(vreg(17)), preg(vreg(18)), @@ -220,6 +199,30 @@ pub fn create_reg_env(flags: &settings::Flags) -> MachineEnv { preg(vreg(31)), ], ], + non_preferred_regs_by_class: [ + vec![ + preg(xreg(19)), + preg(xreg(20)), + // x21 is pinned reg if enabled; we add to this list below if not. 
+ preg(xreg(22)), + preg(xreg(23)), + preg(xreg(24)), + preg(xreg(25)), + preg(xreg(26)), + preg(xreg(27)), + preg(xreg(28)), + ], + vec![ + preg(vreg(8)), + preg(vreg(9)), + preg(vreg(10)), + preg(vreg(11)), + preg(vreg(12)), + preg(vreg(13)), + preg(vreg(14)), + preg(vreg(15)), + ], + ], fixed_stack_slots: vec![], }; diff --git a/cranelift/codegen/src/isa/aarch64/lower.isle b/cranelift/codegen/src/isa/aarch64/lower.isle index 51034bd3f9..0874d90254 100644 --- a/cranelift/codegen/src/isa/aarch64/lower.isle +++ b/cranelift/codegen/src/isa/aarch64/lower.isle @@ -1423,7 +1423,8 @@ ;;;; Rules for `bitselect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(rule (lower (has_type (ty_int_bool_ref_scalar_64 ty) (bitselect c x y))) +(rule (lower (has_type ty (bitselect c x y))) + (if (ty_int_bool_ref_scalar_64 ty)) (let ((tmp1 Reg (and_reg ty x c)) (tmp2 Reg (bic ty y c))) (orr ty tmp1 tmp2))) @@ -1441,12 +1442,14 @@ ;; T -> I{64,32,16,8}: We can simply pass through the value: values ;; are always stored with high bits undefined, so we can just leave ;; them be. -(rule (lower (has_type (ty_int_bool_ref_scalar_64 ty) (ireduce src))) +(rule (lower (has_type ty (ireduce src))) + (if (ty_int_bool_ref_scalar_64 ty)) (value_regs_get src 0)) ;; Likewise for breduce. 
-(rule (lower (has_type (ty_int_bool_ref_scalar_64 ty) (breduce src))) +(rule (lower (has_type ty (breduce src))) + (if (ty_int_bool_ref_scalar_64 ty)) (value_regs_get src 0)) @@ -1515,6 +1518,39 @@ (let ((use_allocated_encoding bool (is_not_baldrdash_call_conv))) (side_effect (udf use_allocated_encoding trap_code)))) +;;;; Rules for `splat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (has_type ty (splat x @ (value_type in_ty)))) + (if (ty_int_bool_ref_scalar_64 in_ty)) + (vec_dup x (vector_size ty))) + +(rule (lower (has_type ty (splat x @ (value_type (ty_scalar_float _))))) + (vec_dup_from_fpu x (vector_size ty))) + +(rule (lower (has_type ty (splat (bconst (u64_from_bool n))))) + (splat_const n (vector_size ty))) + +(rule (lower (has_type ty (splat (breduce (bconst (u64_from_bool n)))))) + (splat_const n (vector_size ty))) + +(rule (lower (has_type ty (splat (f32const (u64_from_ieee32 n))))) + (splat_const n (vector_size ty))) + +(rule (lower (has_type ty (splat (f64const (u64_from_ieee64 n))))) + (splat_const n (vector_size ty))) + +(rule (lower (has_type ty (splat (iconst (u64_from_imm64 n))))) + (splat_const n (vector_size ty))) + +(rule (lower (has_type ty (splat (ireduce (iconst (u64_from_imm64 n)))))) + (splat_const n (vector_size ty))) + +(rule (lower (has_type ty (splat x @ (load flags _addr offset)))) + (if-let mem_op (is_sinkable_inst x)) + (let ((_ Unit (sink_inst mem_op)) + (addr AMode (amode (lane_type ty) mem_op offset)) + (address Reg (load_addr addr))) + (ld1r address (vector_size ty) flags))) ;;;; Rules for `AtomicLoad` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type (valid_atomic_transaction ty) (atomic_load flags addr))) @@ -1527,7 +1563,6 @@ addr)) (side_effect (store_release ty src addr))) - ;;;; Rules for `AtomicRMW` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule 1 (lower (and (use_lse) diff --git a/cranelift/codegen/src/isa/aarch64/lower/isle.rs 
b/cranelift/codegen/src/isa/aarch64/lower/isle.rs index a58cb5b8e3..9c0f7a5738 100644 --- a/cranelift/codegen/src/isa/aarch64/lower/isle.rs +++ b/cranelift/codegen/src/isa/aarch64/lower/isle.rs @@ -5,12 +5,13 @@ pub mod generated_code; // Types that the generated ISLE code uses via `use super::*`. use super::{ - writable_zero_reg, zero_reg, AMode, ASIMDFPModImm, ASIMDMovModImm, BranchTarget, CallIndInfo, - CallInfo, Cond, CondBrKind, ExtendOp, FPUOpRI, FloatCC, Imm12, ImmLogic, ImmShift, + insn_inputs, writable_zero_reg, zero_reg, AMode, ASIMDFPModImm, ASIMDMovModImm, BranchTarget, + CallIndInfo, CallInfo, Cond, CondBrKind, ExtendOp, FPUOpRI, FloatCC, Imm12, ImmLogic, ImmShift, Inst as MInst, IntCC, JTSequenceInfo, MachLabel, MoveWideConst, MoveWideOp, NarrowValueMode, Opcode, OperandSize, PairAMode, Reg, ScalarSize, ShiftOpAndAmt, UImm5, VecMisc2, VectorSize, NZCV, }; +use crate::isa::aarch64::lower::{lower_address, lower_splat_const}; use crate::isa::aarch64::settings::Flags as IsaFlags; use crate::machinst::{isle::*, InputSourceInst}; use crate::settings::Flags; @@ -442,4 +443,25 @@ where _ => panic!(), } } + + fn amode(&mut self, ty: Type, mem_op: Inst, offset: u32) -> AMode { + lower_address( + self.lower_ctx, + ty, + &insn_inputs(self.lower_ctx, mem_op)[..], + offset as i32, + ) + } + + fn amode_is_reg(&mut self, address: &AMode) -> Option<Reg> { + address.is_reg() + } + + fn splat_const(&mut self, value: u64, size: &VectorSize) -> Reg { + let rd = self.temp_writable_reg(I8X16); + + lower_splat_const(self.lower_ctx, rd, value, *size); + + rd.to_reg() + } } diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs index 18fa8d24ef..e02de737bb 100644 --- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs +++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs @@ -741,80 +741,7 @@ pub(crate) fn lower_insn_to_regs>( } } - Opcode::Splat => { - let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - let ty = 
ty.unwrap(); - // TODO: Handle SVE Dup. - let ty = if ty.is_dynamic_vector() { - dynamic_to_fixed(ty) - } else { - ty - }; - let size = VectorSize::from_ty(ty); - - if let Some((_, insn)) = maybe_input_insn_multi( - ctx, - inputs[0], - &[ - Opcode::Bconst, - Opcode::F32const, - Opcode::F64const, - Opcode::Iconst, - ], - ) { - lower_splat_const(ctx, rd, ctx.get_constant(insn).unwrap(), size); - } else if let Some(insn) = - maybe_input_insn_via_conv(ctx, inputs[0], Opcode::Iconst, Opcode::Ireduce) - { - lower_splat_const(ctx, rd, ctx.get_constant(insn).unwrap(), size); - } else if let Some(insn) = - maybe_input_insn_via_conv(ctx, inputs[0], Opcode::Bconst, Opcode::Breduce) - { - lower_splat_const(ctx, rd, ctx.get_constant(insn).unwrap(), size); - } else if let Some((_, insn)) = maybe_input_insn_multi( - ctx, - inputs[0], - &[ - Opcode::Uload8, - Opcode::Sload8, - Opcode::Uload16, - Opcode::Sload16, - Opcode::Uload32, - Opcode::Sload32, - Opcode::Load, - ], - ) { - ctx.sink_inst(insn); - let load_inputs = insn_inputs(ctx, insn); - let load_outputs = insn_outputs(ctx, insn); - lower_load( - ctx, - insn, - &load_inputs[..], - load_outputs[0], - |ctx, _rd, _elem_ty, mem| { - let tmp = ctx.alloc_tmp(I64).only_reg().unwrap(); - let (addr, addr_inst) = Inst::gen_load_addr(tmp, mem); - if let Some(addr_inst) = addr_inst { - ctx.emit(addr_inst); - } - ctx.emit(Inst::VecLoadReplicate { rd, rn: addr, size }); - - Ok(()) - }, - )?; - } else { - let input_ty = ctx.input_ty(insn, 0); - let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); - let inst = if ty_has_int_representation(input_ty) { - Inst::VecDup { rd, rn, size } - } else { - Inst::VecDupFromFpu { rd, rn, size } - }; - - ctx.emit(inst); - } - } + Opcode::Splat => implemented_in_isle(ctx), Opcode::ScalarToVector => implemented_in_isle(ctx), diff --git a/cranelift/codegen/src/isa/s390x/inst.isle b/cranelift/codegen/src/isa/s390x/inst.isle index 1e378fd6f5..1ea06e94dd 100644 --- 
a/cranelift/codegen/src/isa/s390x/inst.isle +++ b/cranelift/codegen/src/isa/s390x/inst.isle @@ -868,7 +868,7 @@ ;; Pseudoinstruction to keep a value alive. (DummyUse (reg Reg)) - + ;; An unwind pseudoinstruction describing the state of the ;; machine at this program point. (Unwind @@ -1641,15 +1641,6 @@ (decl sinkable_inst (Inst) Value) (extern extractor sinkable_inst sinkable_inst) -;; Sink a sinkable instruction. -;; -;; This is a side-effectful operation that notifies the context that the -;; sinkable instruction been sunk into another instruction, and no longer -;; needs to be lowered. -(decl sink_inst (Inst) Unit) -(extern constructor sink_inst sink_inst) - - ;; Sinkable big-endian load instruction. (decl sinkable_load (Inst) Value) (extractor (sinkable_load inst) diff --git a/cranelift/codegen/src/isa/s390x/lower.isle b/cranelift/codegen/src/isa/s390x/lower.isle index 1468c60b2a..60cd2aaf5d 100644 --- a/cranelift/codegen/src/isa/s390x/lower.isle +++ b/cranelift/codegen/src/isa/s390x/lower.isle @@ -1656,8 +1656,9 @@ ;; Insert vector lane from general-purpose register. (rule (lower (insertlane x @ (value_type ty) - y @ (value_type (ty_int_bool_ref_scalar_64 _)) + y @ (value_type in_ty) (u8_from_uimm8 idx))) + (if (ty_int_bool_ref_scalar_64 in_ty)) (vec_insert_lane ty x y (be_lane_idx ty idx) (zero_reg))) ;; Insert vector lane from floating-point register. @@ -1771,8 +1772,9 @@ ;;;; Rules for `extractlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Extract vector lane to general-purpose register. -(rule (lower (has_type (ty_int_bool_ref_scalar_64 _) +(rule (lower (has_type out_ty (extractlane x @ (value_type ty) (u8_from_uimm8 idx)))) + (if (ty_int_bool_ref_scalar_64 out_ty)) (vec_extract_lane ty x (be_lane_idx ty idx) (zero_reg))) ;; Extract vector lane to floating-point register. @@ -1828,8 +1830,8 @@ ;;;; Rules for `splat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Load replicated value from general-purpose register. 
-(rule (lower (has_type ty (splat - x @ (value_type (ty_int_bool_ref_scalar_64 _))))) +(rule (lower (has_type ty (splat x @ (value_type in_ty)))) + (if (ty_int_bool_ref_scalar_64 in_ty)) (vec_replicate_lane ty (vec_insert_lane_undef ty x 0 (zero_reg)) 0)) ;; Load replicated value from floating-point register. @@ -1888,7 +1890,8 @@ ;; Load scalar value from general-purpose register. (rule (lower (has_type ty (scalar_to_vector - x @ (value_type (ty_int_bool_ref_scalar_64 _))))) + x @ (value_type in_ty)))) + (if (ty_int_bool_ref_scalar_64 in_ty)) (vec_insert_lane ty (vec_imm ty 0) x (be_lane_idx ty 0) (zero_reg))) ;; Load scalar value from floating-point register. diff --git a/cranelift/codegen/src/isa/s390x/lower/isle.rs b/cranelift/codegen/src/isa/s390x/lower/isle.rs index 00710146a1..34eec0097f 100644 --- a/cranelift/codegen/src/isa/s390x/lower/isle.rs +++ b/cranelift/codegen/src/isa/s390x/lower/isle.rs @@ -666,11 +666,6 @@ where None } - #[inline] - fn sink_inst(&mut self, inst: Inst) -> Unit { - self.lower_ctx.sink_inst(inst); - } - #[inline] fn emit(&mut self, inst: &MInst) -> Unit { self.lower_ctx.emit(inst.clone()); diff --git a/cranelift/codegen/src/machinst/isle.rs b/cranelift/codegen/src/machinst/isle.rs index dc09b8c7c1..796656c0ff 100644 --- a/cranelift/codegen/src/machinst/isle.rs +++ b/cranelift/codegen/src/machinst/isle.rs @@ -11,7 +11,9 @@ pub use crate::ir::{ SigRef, StackSlot, }; pub use crate::isa::unwind::UnwindInst; -pub use crate::machinst::{ABIArg, ABIArgSlot, ABISig, RealReg, Reg, RelocDistance, Writable}; +pub use crate::machinst::{ + ABIArg, ABIArgSlot, ABISig, InputSourceInst, RealReg, Reg, RelocDistance, Writable, +}; pub type Unit = (); pub type ValueSlice = (ValueList, usize); @@ -425,6 +427,15 @@ macro_rules! 
isle_prelude_methods { imm.bits() as u64 } + #[inline] + fn u64_from_bool(&mut self, b: bool) -> u64 { + if b { + u64::MAX + } else { + 0 + } + } + #[inline] fn inst_results(&mut self, inst: Inst) -> ValueSlice { (self.lower_ctx.dfg().inst_results_list(inst), 0) @@ -854,6 +865,21 @@ macro_rules! isle_prelude_methods { fn real_reg_to_writable_reg(&mut self, reg: RealReg) -> WritableReg { Writable::from_reg(Reg::from(reg)) } + + fn is_sinkable_inst(&mut self, val: Value) -> Option<Inst> { + let input = self.lower_ctx.get_value_as_source_or_const(val); + + if let InputSourceInst::UniqueUse(inst, _) = input.inst { + Some(inst) + } else { + None + } + } + + #[inline] + fn sink_inst(&mut self, inst: Inst) { + self.lower_ctx.sink_inst(inst); + } }; } diff --git a/cranelift/codegen/src/prelude.isle b/cranelift/codegen/src/prelude.isle index 1998c31925..4b7ec8fd73 100644 --- a/cranelift/codegen/src/prelude.isle +++ b/cranelift/codegen/src/prelude.isle @@ -308,10 +308,10 @@ (decl fits_in_64 (Type) Type) (extern extractor fits_in_64 fits_in_64) -;; An extractor that only matches scalar booleans, integers, and references that -;; can fit in 64 bits. -(decl ty_int_bool_ref_scalar_64 (Type) Type) -(extern extractor ty_int_bool_ref_scalar_64 ty_int_bool_ref_scalar_64) +;; A pure constructor that only matches scalar booleans, integers, and +;; references that can fit in 64 bits. +(decl pure ty_int_bool_ref_scalar_64 (Type) Type) +(extern constructor ty_int_bool_ref_scalar_64 ty_int_bool_ref_scalar_64) ;; An extractor that matches 32- and 64-bit types only. (decl ty_32_or_64 (Type) Type) @@ -407,6 +407,10 @@ (decl u8_from_uimm8 (u8) Uimm8) (extern extractor infallible u8_from_uimm8 u8_from_uimm8) +;; Extract a `u64` from a `bool`. +(decl u64_from_bool (u64) bool) +(extern extractor infallible u64_from_bool u64_from_bool) + ;; Extract a `u64` from an `Imm64`. 
(decl u64_from_imm64 (u64) Imm64) (extern extractor infallible u64_from_imm64 u64_from_imm64) @@ -498,6 +502,10 @@ (decl pure zero_value (Value) Value) (extern constructor zero_value zero_value) +;; Match a sinkable instruction from a value operand. +(decl pure is_sinkable_inst (Value) Inst) +(extern constructor is_sinkable_inst is_sinkable_inst) + ;; Instruction creation helpers ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Emit an instruction. @@ -508,6 +516,14 @@ (decl emit (MInst) Unit) (extern constructor emit emit) +;; Sink an instruction. +;; +;; This is a side-effectful operation that notifies the context that the +;; instruction has been sunk into another instruction, and no longer needs to +;; be lowered. +(decl sink_inst (Inst) Unit) +(extern constructor sink_inst sink_inst) + ;; Constant pool emission. (type VCodeConstant (primitive VCodeConstant)) diff --git a/cranelift/filetests/filetests/isa/aarch64/bitops.clif b/cranelift/filetests/filetests/isa/aarch64/bitops.clif index 7a88d27be4..5419d077b8 100644 --- a/cranelift/filetests/filetests/isa/aarch64/bitops.clif +++ b/cranelift/filetests/filetests/isa/aarch64/bitops.clif @@ -244,18 +244,13 @@ block0(v0: i128): return v1 } -; stp fp, lr, [sp, #-16]! -; mov fp, sp -; stp d11, d13, [sp, #-16]! 
; block0: ; fmov d6, x0 ; mov v6.d[1], x1 -; cnt v11.16b, v6.16b -; addv b13, v11.16b -; umov w0, v13.b[0] +; cnt v19.16b, v6.16b +; addv b21, v19.16b +; umov w0, v21.b[0] ; movz w1, #0 -; ldp d11, d13, [sp], #16 -; ldp fp, lr, [sp], #16 ; ret function %d(i64) -> i64 { diff --git a/cranelift/filetests/filetests/isa/aarch64/dynamic-simd-narrow.clif b/cranelift/filetests/filetests/isa/aarch64/dynamic-simd-narrow.clif index f9e7b32448..7b041b5a14 100644 --- a/cranelift/filetests/filetests/isa/aarch64/dynamic-simd-narrow.clif +++ b/cranelift/filetests/filetests/isa/aarch64/dynamic-simd-narrow.clif @@ -15,9 +15,9 @@ block0(v0: i16): } ; block0: -; dup v2.4h, w0 -; mov v7.16b, v2.16b -; mov v7.d[1], v2.d[0] +; dup v6.4h, w0 +; mov v7.16b, v6.16b +; mov v7.d[1], v6.d[0] ; sqxtn v0.8b, v7.8h ; ret @@ -35,9 +35,9 @@ block0(v0: i16): } ; block0: -; dup v2.8h, w0 -; sqxtn v0.8b, v2.8h -; sqxtn2 v0.16b, v2.8h +; dup v6.8h, w0 +; sqxtn v0.8b, v6.8h +; sqxtn2 v0.16b, v6.8h ; ret function %snarrow_i32x2(i32) -> i16x4 { @@ -54,9 +54,9 @@ block0(v0: i32): } ; block0: -; dup v2.2s, w0 -; mov v7.16b, v2.16b -; mov v7.d[1], v2.d[0] +; dup v6.2s, w0 +; mov v7.16b, v6.16b +; mov v7.d[1], v6.d[0] ; sqxtn v0.4h, v7.4s ; ret @@ -74,9 +74,9 @@ block0(v0: i32): } ; block0: -; dup v2.4s, w0 -; sqxtn v0.4h, v2.4s -; sqxtn2 v0.8h, v2.4s +; dup v6.4s, w0 +; sqxtn v0.4h, v6.4s +; sqxtn2 v0.8h, v6.4s ; ret function %snarrow_i64x2(i64) -> i32x4 { @@ -93,9 +93,9 @@ block0(v0: i64): } ; block0: -; dup v2.2d, x0 -; sqxtn v0.2s, v2.2d -; sqxtn2 v0.4s, v2.2d +; dup v6.2d, x0 +; sqxtn v0.2s, v6.2d +; sqxtn2 v0.4s, v6.2d ; ret function %unarrow_i16x4(i16) -> i8x8 { @@ -112,9 +112,9 @@ block0(v0: i16): } ; block0: -; dup v2.4h, w0 -; mov v7.16b, v2.16b -; mov v7.d[1], v2.d[0] +; dup v6.4h, w0 +; mov v7.16b, v6.16b +; mov v7.d[1], v6.d[0] ; sqxtun v0.8b, v7.8h ; ret @@ -132,9 +132,9 @@ block0(v0: i16): } ; block0: -; dup v2.8h, w0 -; sqxtun v0.8b, v2.8h -; sqxtun2 v0.16b, v2.8h +; dup v6.8h, w0 +; sqxtun 
v0.8b, v6.8h +; sqxtun2 v0.16b, v6.8h ; ret function %unarrow_i32x2(i32) -> i16x4 { @@ -151,9 +151,9 @@ block0(v0: i32): } ; block0: -; dup v2.2s, w0 -; mov v7.16b, v2.16b -; mov v7.d[1], v2.d[0] +; dup v6.2s, w0 +; mov v7.16b, v6.16b +; mov v7.d[1], v6.d[0] ; sqxtun v0.4h, v7.4s ; ret @@ -171,9 +171,9 @@ block0(v0: i32): } ; block0: -; dup v2.4s, w0 -; sqxtun v0.4h, v2.4s -; sqxtun2 v0.8h, v2.4s +; dup v6.4s, w0 +; sqxtun v0.4h, v6.4s +; sqxtun2 v0.8h, v6.4s ; ret function %unarrow_i64x2(i64) -> i32x4 { @@ -190,9 +190,9 @@ block0(v0: i64): } ; block0: -; dup v2.2d, x0 -; sqxtun v0.2s, v2.2d -; sqxtun2 v0.4s, v2.2d +; dup v6.2d, x0 +; sqxtun v0.2s, v6.2d +; sqxtun2 v0.4s, v6.2d ; ret function %uunarrow_i16x4(i16) -> i8x8 { @@ -209,9 +209,9 @@ block0(v0: i16): } ; block0: -; dup v2.4h, w0 -; mov v7.16b, v2.16b -; mov v7.d[1], v2.d[0] +; dup v6.4h, w0 +; mov v7.16b, v6.16b +; mov v7.d[1], v6.d[0] ; uqxtn v0.8b, v7.8h ; ret @@ -229,9 +229,9 @@ block0(v0: i16): } ; block0: -; dup v2.8h, w0 -; uqxtn v0.8b, v2.8h -; uqxtn2 v0.16b, v2.8h +; dup v6.8h, w0 +; uqxtn v0.8b, v6.8h +; uqxtn2 v0.16b, v6.8h ; ret function %uunarrow_i32x2(i32) -> i16x4 { @@ -248,9 +248,9 @@ block0(v0: i32): } ; block0: -; dup v2.2s, w0 -; mov v7.16b, v2.16b -; mov v7.d[1], v2.d[0] +; dup v6.2s, w0 +; mov v7.16b, v6.16b +; mov v7.d[1], v6.d[0] ; uqxtn v0.4h, v7.4s ; ret @@ -268,9 +268,9 @@ block0(v0: i32): } ; block0: -; dup v2.4s, w0 -; uqxtn v0.4h, v2.4s -; uqxtn2 v0.8h, v2.4s +; dup v6.4s, w0 +; uqxtn v0.4h, v6.4s +; uqxtn2 v0.8h, v6.4s ; ret function %uunarrow_i64x2(i64) -> i32x4 { @@ -287,8 +287,7 @@ block0(v0: i64): } ; block0: -; dup v2.2d, x0 -; uqxtn v0.2s, v2.2d -; uqxtn2 v0.4s, v2.2d +; dup v6.2d, x0 +; uqxtn v0.2s, v6.2d +; uqxtn2 v0.4s, v6.2d ; ret - diff --git a/cranelift/filetests/filetests/isa/aarch64/dynamic-simd-neon.clif b/cranelift/filetests/filetests/isa/aarch64/dynamic-simd-neon.clif index 1f1f64d0cf..0fbcf700bd 100644 --- 
a/cranelift/filetests/filetests/isa/aarch64/dynamic-simd-neon.clif +++ b/cranelift/filetests/filetests/isa/aarch64/dynamic-simd-neon.clif @@ -1,4 +1,4 @@ -test compile +test compile precise-output target aarch64 function %i8x16_splat_add(i8, i8) -> i8x16 { @@ -13,10 +13,11 @@ block0(v0: i8, v1: i8): return v5 } -; check: dup v4.16b, w0 -; nextln: dup v6.16b, w1 -; nextln: add v0.16b, v4.16b, v6.16b -; nextln: ret +; block0: +; dup v16.16b, w0 +; dup v17.16b, w1 +; add v0.16b, v16.16b, v17.16b +; ret function %i16x8_splat_add(i16, i16) -> i16x8 { gv0 = dyn_scale_target_const.i16x8 @@ -30,10 +31,11 @@ block0(v0: i16, v1: i16): return v5 } -; check: dup v4.8h, w0 -; nextln: dup v6.8h, w1 -; nextln: add v0.8h, v4.8h, v6.8h -; nextln: ret +; block0: +; dup v16.8h, w0 +; dup v17.8h, w1 +; add v0.8h, v16.8h, v17.8h +; ret function %i32x4_splat_mul(i32, i32) -> i32x4 { gv0 = dyn_scale_target_const.i32x4 @@ -47,10 +49,11 @@ block0(v0: i32, v1: i32): return v5 } -; check: dup v4.4s, w0 -; nextln: dup v6.4s, w1 -; nextln: mul v0.4s, v4.4s, v6.4s -; nextln: ret +; block0: +; dup v16.4s, w0 +; dup v17.4s, w1 +; mul v0.4s, v16.4s, v17.4s +; ret function %i64x2_splat_sub(i64, i64) -> i64x2 { gv0 = dyn_scale_target_const.i64x2 @@ -64,10 +67,11 @@ block0(v0: i64, v1: i64): return v5 } -; check: dup v4.2d, x0 -; nextln: dup v6.2d, x1 -; nextln: sub v0.2d, v4.2d, v6.2d -; nextln: ret +; block0: +; dup v16.2d, x0 +; dup v17.2d, x1 +; sub v0.2d, v16.2d, v17.2d +; ret function %f32x4_splat_add(f32, f32) -> f32x4 { gv0 = dyn_scale_target_const.f32x4 @@ -81,10 +85,11 @@ block0(v0: f32, v1: f32): return v5 } -; check: dup v4.4s, v0.s[0] -; nextln: dup v6.4s, v1.s[0] -; nextln: fadd v0.4s, v4.4s, v6.4s -; nextln: ret +; block0: +; dup v16.4s, v0.s[0] +; dup v17.4s, v1.s[0] +; fadd v0.4s, v16.4s, v17.4s +; ret function %f64x2_splat_sub(f64, f64) -> f64x2 { gv0 = dyn_scale_target_const.f64x2 @@ -98,10 +103,11 @@ block0(v0: f64, v1: f64): return v5 } -; check: dup v4.2d, v0.d[0] -; nextln: dup 
v6.2d, v1.d[0] -; nextln: fsub v0.2d, v4.2d, v6.2d -; nextln: ret +; block0: +; dup v16.2d, v0.d[0] +; dup v17.2d, v1.d[0] +; fsub v0.2d, v16.2d, v17.2d +; ret function %f64x2_splat_mul(f64, f64) -> f64x2 { gv0 = dyn_scale_target_const.f64x2 @@ -115,10 +121,11 @@ block0(v0: f64, v1: f64): return v5 } -; check: dup v4.2d, v0.d[0] -; nextln: dup v6.2d, v1.d[0] -; nextln: fmul v0.2d, v4.2d, v6.2d -; nextln: ret +; block0: +; dup v16.2d, v0.d[0] +; dup v17.2d, v1.d[0] +; fmul v0.2d, v16.2d, v17.2d +; ret function %f64x2_splat_div(f64, f64) -> f64x2 { gv0 = dyn_scale_target_const.f64x2 @@ -132,10 +139,11 @@ block0(v0: f64, v1: f64): return v5 } -; check: dup v4.2d, v0.d[0] -; nextln: dup v6.2d, v1.d[0] -; nextln: fdiv v0.2d, v4.2d, v6.2d -; nextln: ret +; block0: +; dup v16.2d, v0.d[0] +; dup v17.2d, v1.d[0] +; fdiv v0.2d, v16.2d, v17.2d +; ret function %f64x2_splat_min(f64, f64) -> f64x2 { gv0 = dyn_scale_target_const.f64x2 @@ -149,10 +157,11 @@ block0(v0: f64, v1: f64): return v5 } -; check: dup v4.2d, v0.d[0] -; nextln: dup v6.2d, v1.d[0] -; nextln: fmin v0.2d, v4.2d, v6.2d -; nextln: ret +; block0: +; dup v16.2d, v0.d[0] +; dup v17.2d, v1.d[0] +; fmin v0.2d, v16.2d, v17.2d +; ret function %f64x2_splat_max(f64, f64) -> f64x2 { gv0 = dyn_scale_target_const.f64x2 @@ -166,10 +175,11 @@ block0(v0: f64, v1: f64): return v5 } -; check: dup v4.2d, v0.d[0] -; nextln: dup v6.2d, v1.d[0] -; nextln: fmax v0.2d, v4.2d, v6.2d -; nextln: ret +; block0: +; dup v16.2d, v0.d[0] +; dup v17.2d, v1.d[0] +; fmax v0.2d, v16.2d, v17.2d +; ret function %f64x2_splat_min_pseudo(f64, f64) -> f64x2 { gv0 = dyn_scale_target_const.f64x2 @@ -183,11 +193,12 @@ block0(v0: f64, v1: f64): return v5 } -; check: dup v4.2d, v0.d[0] -; nextln: dup v6.2d, v1.d[0] -; nextln: fcmgt v0.2d, v4.2d, v6.2d -; nextln: bsl v0.16b, v6.16b, v4.16b -; nextln: ret +; block0: +; dup v17.2d, v0.d[0] +; dup v18.2d, v1.d[0] +; fcmgt v0.2d, v17.2d, v18.2d +; bsl v0.16b, v18.16b, v17.16b +; ret function 
%f64x2_splat_max_pseudo(f64, f64) -> f64x2 { gv0 = dyn_scale_target_const.f64x2 @@ -201,8 +212,9 @@ block0(v0: f64, v1: f64): return v5 } -; check: dup v4.2d, v0.d[0] -; nextln: dup v6.2d, v1.d[0] -; nextln: fcmgt v0.2d, v6.2d, v4.2d -; nextln: bsl v0.16b, v6.16b, v4.16b -; nextln: ret +; block0: +; dup v17.2d, v0.d[0] +; dup v18.2d, v1.d[0] +; fcmgt v0.2d, v18.2d, v17.2d +; bsl v0.16b, v18.16b, v17.16b +; ret diff --git a/cranelift/filetests/filetests/isa/aarch64/dynamic-simd-widen.clif b/cranelift/filetests/filetests/isa/aarch64/dynamic-simd-widen.clif index 5161c48ae1..6fda772d85 100644 --- a/cranelift/filetests/filetests/isa/aarch64/dynamic-simd-widen.clif +++ b/cranelift/filetests/filetests/isa/aarch64/dynamic-simd-widen.clif @@ -1,4 +1,4 @@ -test compile +test compile precise-output target aarch64 function %swidenhigh_i8x16(i8) -> i16x8 { @@ -14,9 +14,10 @@ block0(v0: i8): return v3 } -; check: dup v2.16b, w0 -; nextln: sxtl2 v0.8h, v2.16b -; nextln: ret +; block0: +; dup v5.16b, w0 +; sxtl2 v0.8h, v5.16b +; ret function %swidenhigh_i16x8(i16) -> i32x4 { gv0 = dyn_scale_target_const.i32x4 @@ -31,9 +32,10 @@ block0(v0: i16): return v3 } -; check: dup v2.8h, w0 -; nextln: sxtl2 v0.4s, v2.8h -; nextln: ret +; block0: +; dup v5.8h, w0 +; sxtl2 v0.4s, v5.8h +; ret function %swidenhigh_i32x4(i32) -> i64x2 { gv0 = dyn_scale_target_const.i32x4 @@ -48,9 +50,10 @@ block0(v0: i32): return v3 } -; check: dup v2.4s, w0 -; nextln: sxtl2 v0.2d, v2.4s -; nextln: ret +; block0: +; dup v5.4s, w0 +; sxtl2 v0.2d, v5.4s +; ret function %swidenlow_i8x16(i8) -> i16x8 { gv0 = dyn_scale_target_const.i16x8 @@ -65,9 +68,10 @@ block0(v0: i8): return v3 } -; check: dup v2.16b, w0 -; nextln: sxtl v0.8h, v2.8b -; nextln: ret +; block0: +; dup v5.16b, w0 +; sxtl v0.8h, v5.8b +; ret function %swidenlow_i16x8(i16) -> i32x4 { gv0 = dyn_scale_target_const.i32x4 @@ -82,9 +86,10 @@ block0(v0: i16): return v3 } -; check: dup v2.8h, w0 -; nextln: sxtl v0.4s, v2.4h -; nextln: ret +; block0: +; dup 
v5.8h, w0 +; sxtl v0.4s, v5.4h +; ret function %swidenlow_i32x4(i32) -> i64x2 { gv0 = dyn_scale_target_const.i32x4 @@ -99,6 +104,7 @@ block0(v0: i32): return v3 } -; check: dup v2.4s, w0 -; nextln: sxtl v0.2d, v2.2s -; nextln: ret +; block0: +; dup v5.4s, w0 +; sxtl v0.2d, v5.2s +; ret diff --git a/cranelift/filetests/filetests/isa/aarch64/dynamic-slot.clif b/cranelift/filetests/filetests/isa/aarch64/dynamic-slot.clif index 982457c889..51f1f450e8 100644 --- a/cranelift/filetests/filetests/isa/aarch64/dynamic-slot.clif +++ b/cranelift/filetests/filetests/isa/aarch64/dynamic-slot.clif @@ -58,9 +58,9 @@ block0(v0: i32): ; mov fp, sp ; sub sp, sp, #16 ; block0: -; dup v2.4s, w0 -; mov x4, sp -; str q2, [x4] +; dup v3.4s, w0 +; mov x3, sp +; str q3, [x3] ; add sp, sp, #16 ; ldp fp, lr, [sp], #16 ; ret @@ -101,9 +101,9 @@ block0(v0: i32): ; mov fp, sp ; sub sp, sp, #16 ; block0: -; dup v2.4s, w0 -; mov x4, sp -; str q2, [x4] +; dup v3.4s, w0 +; mov x3, sp +; str q3, [x3] ; add sp, sp, #16 ; ldp fp, lr, [sp], #16 ; ret diff --git a/cranelift/filetests/filetests/isa/aarch64/prologue.clif b/cranelift/filetests/filetests/isa/aarch64/prologue.clif index 9b4a52e4e0..519a3970f8 100644 --- a/cranelift/filetests/filetests/isa/aarch64/prologue.clif +++ b/cranelift/filetests/filetests/isa/aarch64/prologue.clif @@ -82,29 +82,6 @@ block0(v0: f64): ; stp d10, d11, [sp, #-16]! ; stp d8, d9, [sp, #-16]! 
; block0: -; fadd d1, d0, d0 -; fadd d2, d0, d0 -; fadd d3, d0, d0 -; fadd d4, d0, d0 -; fadd d5, d0, d0 -; fadd d6, d0, d0 -; fadd d7, d0, d0 -; fadd d8, d0, d0 -; fadd d9, d0, d0 -; fadd d10, d0, d0 -; fadd d11, d0, d0 -; fadd d12, d0, d0 -; fadd d13, d0, d0 -; fadd d14, d0, d0 -; fadd d15, d0, d0 -; fadd d16, d0, d0 -; fadd d17, d0, d0 -; fadd d18, d0, d0 -; fadd d19, d0, d0 -; fadd d20, d0, d0 -; fadd d21, d0, d0 -; fadd d22, d0, d0 -; fadd d23, d0, d0 ; fadd d24, d0, d0 ; fadd d25, d0, d0 ; fadd d26, d0, d0 @@ -113,37 +90,60 @@ block0(v0: f64): ; fadd d29, d0, d0 ; fadd d30, d0, d0 ; fadd d31, d0, d0 -; fadd d0, d0, d1 -; fadd d1, d2, d3 -; fadd d2, d4, d5 -; fadd d3, d6, d7 +; fadd d1, d0, d0 +; fadd d2, d0, d0 +; fadd d3, d0, d0 +; fadd d4, d0, d0 +; fadd d5, d0, d0 +; fadd d6, d0, d0 +; fadd d7, d0, d0 +; fadd d16, d0, d0 +; fadd d17, d0, d0 +; fadd d18, d0, d0 +; fadd d19, d0, d0 +; fadd d20, d0, d0 +; fadd d21, d0, d0 +; fadd d22, d0, d0 +; fadd d23, d0, d0 +; fadd d8, d0, d0 +; fadd d9, d0, d0 +; fadd d10, d0, d0 +; fadd d11, d0, d0 +; fadd d12, d0, d0 +; fadd d13, d0, d0 +; fadd d14, d0, d0 +; fadd d15, d0, d0 +; fadd d24, d0, d24 +; fadd d25, d25, d26 +; fadd d26, d27, d28 +; fadd d27, d29, d30 +; fadd d28, d31, d1 +; fadd d29, d2, d3 +; fadd d30, d4, d5 +; fadd d31, d6, d7 +; fadd d0, d16, d17 +; fadd d1, d18, d19 +; fadd d2, d20, d21 +; fadd d3, d22, d23 ; fadd d4, d8, d9 ; fadd d5, d10, d11 ; fadd d6, d12, d13 ; fadd d7, d14, d15 -; fadd d8, d16, d17 -; fadd d9, d18, d19 -; fadd d10, d20, d21 -; fadd d11, d22, d23 -; fadd d12, d24, d25 -; fadd d13, d26, d27 -; fadd d14, d28, d29 -; fadd d15, d30, d31 -; fadd d0, d0, d1 -; fadd d1, d2, d3 -; fadd d2, d4, d5 -; fadd d3, d6, d7 -; fadd d4, d8, d9 -; fadd d5, d10, d11 -; fadd d6, d12, d13 -; fadd d7, d14, d15 -; fadd d0, d0, d1 -; fadd d1, d2, d3 -; fadd d2, d4, d5 -; fadd d3, d6, d7 -; fadd d0, d0, d1 -; fadd d1, d2, d3 -; fadd d0, d0, d1 +; fadd d24, d24, d25 +; fadd d25, d26, d27 +; fadd d26, d28, 
d29 +; fadd d27, d30, d31 +; fadd d28, d0, d1 +; fadd d29, d2, d3 +; fadd d30, d4, d5 +; fadd d31, d6, d7 +; fadd d24, d24, d25 +; fadd d25, d26, d27 +; fadd d26, d28, d29 +; fadd d27, d30, d31 +; fadd d24, d24, d25 +; fadd d25, d26, d27 +; fadd d0, d24, d25 ; ldp d8, d9, [sp], #16 ; ldp d10, d11, [sp], #16 ; ldp d12, d13, [sp], #16 @@ -242,4 +242,3 @@ block0(v0: i64): ; ldr x28, [sp], #16 ; ldp fp, lr, [sp], #16 ; ret - diff --git a/cranelift/filetests/filetests/runtests/simd-splat.clif b/cranelift/filetests/filetests/runtests/simd-splat.clif index 1cfef52c78..702e229a40 100644 --- a/cranelift/filetests/filetests/runtests/simd-splat.clif +++ b/cranelift/filetests/filetests/runtests/simd-splat.clif @@ -1,4 +1,4 @@ -test interpret +; test interpret TODO: Not yet implemented test run target aarch64 target s390x @@ -10,6 +10,8 @@ block0(v0: i8): v1 = splat.i8x16 v0 return v1 } +; run: %splat_i8x16(-1) == [-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1] +; run: %splat_i8x16(0) == [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] ; run: %splat_i8x16(1) == [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1] function %splat_i16x8(i16) -> i16x8 { @@ -17,6 +19,8 @@ block0(v0: i16): v1 = splat.i16x8 v0 return v1 } +; run: %splat_i16x8(-1) == [-1 -1 -1 -1 -1 -1 -1 -1] +; run: %splat_i16x8(0) == [0 0 0 0 0 0 0 0] ; run: %splat_i16x8(512) == [512 512 512 512 512 512 512 512] function %splat_i32x4(i32) -> i32x4 { @@ -24,6 +28,8 @@ block0(v0: i32): v1 = splat.i32x4 v0 return v1 } +; run: %splat_i32x4(-1) == [-1 -1 -1 -1] +; run: %splat_i32x4(0) == [0 0 0 0] ; run: %splat_i32x4(2000000) == [2000000 2000000 2000000 2000000] function %splat_i64x2(i64) -> i64x2 { @@ -31,4 +37,189 @@ block0(v0: i64): v1 = splat.i64x2 v0 return v1 } +; run: %splat_i64x2(-1) == [-1 -1] +; run: %splat_i64x2(0) == [0 0] ; run: %splat_i64x2(5000000000) == [5000000000 5000000000] + +function %splat_f32x4(f32) -> f32x4 { +block0(v0: f32): + v1 = splat.f32x4 v0 + return v1 +} +; run: %splat_f32x4(-0x0.0) == [-0x0.0 -0x0.0 -0x0.0 -0x0.0] +; 
run: %splat_f32x4(0x1.0) == [0x1.0 0x1.0 0x1.0 0x1.0] +; run: %splat_f32x4(NaN) == [NaN NaN NaN NaN] + +function %splat_f64x2(f64) -> f64x2 { +block0(v0: f64): + v1 = splat.f64x2 v0 + return v1 +} +; run: %splat_f64x2(0x0.0) == [0x0.0 0x0.0] +; run: %splat_f64x2(0x2.0) == [0x2.0 0x2.0] +; run: %splat_f64x2(NaN) == [NaN NaN] + +; TODO: Test combinations of `bconst` and `splat`, potentially with `breduce` in +; the middle + +function %splat_i8x16_2(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = iconst.i8 116 + v2 = splat.i8x16 v1 + v3 = iadd v0, v2 + return v3 +} +; run: %splat_i8x16_2([-128 -101 -75 -59 -22 -12 -7 -1 0 3 17 34 68 92 111 127]) == [-12 15 41 57 94 104 109 115 116 119 -123 -106 -72 -48 -29 -13] + +function %splat_i8x16_3(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = iconst.i16 116 + v2 = ireduce.i8 v1 + v3 = splat.i8x16 v2 + v4 = iadd v0, v3 + return v4 +} +; run: %splat_i8x16_3([-128 -101 -75 -59 -22 -12 -7 -1 0 3 17 34 68 92 111 127]) == [-12 15 41 57 94 104 109 115 116 119 -123 -106 -72 -48 -29 -13] + +function %splat_i16x8_2(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iconst.i16 42 + v2 = splat.i16x8 v1 + v3 = iadd v0, v2 + return v3 +} +; run: %splat_i16x8_2([-32768 -1500 -1 0 42 200 8576 32767]) == [-32726 -1458 41 42 84 242 8618 -32727] + +function %splat_i16x8_3(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iconst.i64 42 + v2 = ireduce.i16 v1 + v3 = splat.i16x8 v2 + v4 = iadd v0, v3 + return v4 +} +; run: %splat_i16x8_3([-32768 -1500 -1 0 42 200 8576 32767]) == [-32726 -1458 41 42 84 242 8618 -32727] + +function %splat_i32x4_2(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iconst.i32 1024 + v2 = splat.i32x4 v1 + v3 = iadd v0, v2 + return v3 +} +; run: %splat_i32x4_2([-2147483648 -1 0 2147483647]) == [-2147482624 1023 1024 -2147482625] + +function %splat_i32x4_3(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iconst.i64 1024 + v2 = ireduce.i32 v1 + v3 = splat.i32x4 v2 + v4 = iadd v0, v3 + return v4 +} +; run: %splat_i32x4_3([-2147483648 -1 0 
2147483647]) == [-2147482624 1023 1024 -2147482625] + +function %splat_i64x2_2(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = iconst.i64 -1 + v2 = splat.i64x2 v1 + v3 = iadd v0, v2 + return v3 +} +; run: %splat_i64x2_2([-1 0]) == [-2 -1] + +function %splat_f32x4_2(f32x4) -> f32x4 { +block0(v0: f32x4): + v1 = f32const 0x1.5 + v2 = splat.f32x4 v1 + v3 = fadd v0, v2 + return v3 +} +; run: %splat_f32x4_2([0x0.0 NaN 0x1.0 0x2.0]) == [0x1.5 NaN 0x2.5 0x3.5] + +function %splat_f64x2_2(f64x2) -> f64x2 { +block0(v0: f64x2): + v1 = f64const 0x7.5 + v2 = splat.f64x2 v1 + v3 = fadd v0, v2 + return v3 +} +; run: %splat_f64x2_2([0x0.0 0x1.0]) == [0x7.5 0x8.5] + +function %load_splat_i8x16(i8) -> i8x16 { + ss0 = explicit_slot 8 + +block0(v0: i8): + stack_store.i8 v0, ss0 + v1 = stack_load.i8 ss0 + v2 = splat.i8x16 v1 + return v2 +} +; run: %load_splat_i8x16(-1) == [-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1] +; run: %load_splat_i8x16(0) == [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] +; run: %load_splat_i8x16(1) == [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1] + +function %load_splat_i16x8(i16) -> i16x8 { + ss0 = explicit_slot 8 + +block0(v0: i16): + stack_store.i16 v0, ss0 + v1 = stack_load.i16 ss0 + v2 = splat.i16x8 v1 + return v2 +} +; run: %load_splat_i16x8(-1) == [-1 -1 -1 -1 -1 -1 -1 -1] +; run: %load_splat_i16x8(0) == [0 0 0 0 0 0 0 0] +; run: %load_splat_i16x8(512) == [512 512 512 512 512 512 512 512] + +function %load_splat_i32x4(i32) -> i32x4 { + ss0 = explicit_slot 8 + +block0(v0: i32): + stack_store.i32 v0, ss0 + v1 = stack_load.i32 ss0 + v2 = splat.i32x4 v1 + return v2 +} +; run: %load_splat_i32x4(-1) == [-1 -1 -1 -1] +; run: %load_splat_i32x4(0) == [0 0 0 0] +; run: %load_splat_i32x4(2000000) == [2000000 2000000 2000000 2000000] + +function %load_splat_i64x2(i64) -> i64x2 { + ss0 = explicit_slot 8 + +block0(v0: i64): + stack_store.i64 v0, ss0 + v1 = stack_load.i64 ss0 + v2 = splat.i64x2 v1 + return v2 +} +; run: %load_splat_i64x2(-1) == [-1 -1] +; run: %load_splat_i64x2(0) == [0 0] +; 
run: %load_splat_i64x2(5000000000) == [5000000000 5000000000] + +function %load_splat_f32x4(f32) -> f32x4 { + ss0 = explicit_slot 8 + +block0(v0: f32): + stack_store.f32 v0, ss0 + v1 = stack_load.f32 ss0 + v2 = splat.f32x4 v1 + return v2 +} +; run: %load_splat_f32x4(-0x0.0) == [-0x0.0 -0x0.0 -0x0.0 -0x0.0] +; run: %load_splat_f32x4(0x1.0) == [0x1.0 0x1.0 0x1.0 0x1.0] +; run: %load_splat_f32x4(NaN) == [NaN NaN NaN NaN] + +function %load_splat_f64x2(f64) -> f64x2 { + ss0 = explicit_slot 8 + +block0(v0: f64): + stack_store.f64 v0, ss0 + v1 = stack_load.f64 ss0 + v2 = splat.f64x2 v1 + return v2 +} +; run: %load_splat_f64x2(0x0.0) == [0x0.0 0x0.0] +; run: %load_splat_f64x2(0x2.0) == [0x2.0 0x2.0] +; run: %load_splat_f64x2(NaN) == [NaN NaN]