diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle index 63580dc633..b0cf19ccc4 100644 --- a/cranelift/codegen/src/isa/x64/inst.isle +++ b/cranelift/codegen/src/isa/x64/inst.isle @@ -919,6 +919,7 @@ Pshuflw Pshufhw Pblendw + Movddup )) (type CmpOpcode extern @@ -1292,6 +1293,11 @@ Vpextrd Vpextrq Vpblendw + Vmovddup + Vpbroadcastb + Vpbroadcastw + Vpbroadcastd + Vbroadcastss )) (type Avx512Opcode extern @@ -1622,6 +1628,9 @@ (decl pure has_avx () bool) (extern constructor has_avx has_avx) +(decl pure has_avx2 () bool) +(extern constructor has_avx2 has_avx2) + ;;;; Helpers for Merging and Sinking Immediates/Loads ;;;;;;;;;;;;;;;;;;;;;;;;; ;; Extract a constant `Imm8Reg.Imm8` from a value operand. @@ -1656,9 +1665,21 @@ ;; Extract a `SinkableLoad` that works with `RegMemImm.Mem` from a value ;; operand. +;; +;; Note that this will only work for 32-bit-types-or-larger since this is +;; pervasively used with operations that load a minimum of 32-bits. For +;; instructions which load exactly the type width necessary use +;; `sinkable_load_exact`. (decl sinkable_load (SinkableLoad) Value) (extern extractor sinkable_load sinkable_load) +;; Same as `sinkable_load` except that all type widths of loads are supported. +;; +;; Only use this when the instruction which performs the load is guaranteed to +;; load the precisely correct size. +(decl sinkable_load_exact (SinkableLoad) Value) +(extern extractor sinkable_load_exact sinkable_load_exact) + ;; Sink a `SinkableLoad` into a `SyntheticAmode`. ;; ;; This is a side-effectful operation that notifies the context that the @@ -1678,6 +1699,9 @@ (decl sink_load_to_reg_mem (SinkableLoad) RegMem) (rule (sink_load_to_reg_mem load) (RegMem.Mem load)) +(decl sink_load_to_gpr_mem (SinkableLoad) GprMem) +(rule (sink_load_to_gpr_mem load) (RegMem.Mem load)) + (decl sink_load_to_reg_mem_imm (SinkableLoad) RegMemImm) (rule (sink_load_to_reg_mem_imm load) (RegMemImm.Mem load)) @@ -4103,6 +4127,34 @@ (rule (trap_if_fcmp (FcmpCondResult.OrCondition producer cc1 cc2) tc) (with_flags_side_effect producer (trap_if_or cc1 cc2 tc))) +;; Helper for creating `movddup` instructions +(decl x64_movddup (XmmMem) Xmm) +(rule (x64_movddup src) + (xmm_unary_rm_r_unaligned (SseOpcode.Movddup) src)) +(rule 1 (x64_movddup src) + (if-let $true (has_avx)) + (xmm_unary_rm_r_vex (AvxOpcode.Vmovddup) src)) + +;; Helper for creating `vpbroadcastb` instructions +(decl x64_vpbroadcastb (XmmMem) Xmm) +(rule (x64_vpbroadcastb src) + (xmm_unary_rm_r_vex (AvxOpcode.Vpbroadcastb) src)) + +;; Helper for creating `vpbroadcastw` instructions +(decl x64_vpbroadcastw (XmmMem) Xmm) +(rule (x64_vpbroadcastw src) + (xmm_unary_rm_r_vex (AvxOpcode.Vpbroadcastw) src)) + +;; Helper for creating `vpbroadcastd` instructions +(decl x64_vpbroadcastd (XmmMem) Xmm) +(rule (x64_vpbroadcastd src) + (xmm_unary_rm_r_vex (AvxOpcode.Vpbroadcastd) src)) + +;; Helper for creating `vbroadcastss` instructions +(decl x64_vbroadcastss (XmmMem) Xmm) +(rule (x64_vbroadcastss src) + (xmm_unary_rm_r_vex (AvxOpcode.Vbroadcastss) src)) + ;;;; Jumps ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Unconditional jump. 
@@ -4664,6 +4716,7 @@ (convert AtomicRmwOp MachAtomicRmwOp atomic_rmw_op_to_mach_atomic_rmw_op) (convert SinkableLoad RegMem sink_load_to_reg_mem) +(convert SinkableLoad GprMem sink_load_to_gpr_mem) (convert SinkableLoad RegMemImm sink_load_to_reg_mem_imm) (convert SinkableLoad GprMemImm sink_load_to_gpr_mem_imm) (convert SinkableLoad XmmMem sink_load_to_xmm_mem) diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs index 0c2db35cd7..d7a851e66d 100644 --- a/cranelift/codegen/src/isa/x64/inst/args.rs +++ b/cranelift/codegen/src/isa/x64/inst/args.rs @@ -928,6 +928,7 @@ pub(crate) enum InstructionSet { BMI2, FMA, AVX, + AVX2, AVX512BITALG, AVX512DQ, AVX512F, @@ -1126,6 +1127,7 @@ pub enum SseOpcode { Pshuflw, Pshufhw, Pblendw, + Movddup, } impl SseOpcode { @@ -1280,7 +1282,8 @@ impl SseOpcode { | SseOpcode::Pmulhrsw | SseOpcode::Pshufb | SseOpcode::Phaddw - | SseOpcode::Phaddd => SSSE3, + | SseOpcode::Phaddd + | SseOpcode::Movddup => SSSE3, SseOpcode::Blendvpd | SseOpcode::Blendvps @@ -1524,6 +1527,7 @@ impl fmt::Debug for SseOpcode { SseOpcode::Pshuflw => "pshuflw", SseOpcode::Pshufhw => "pshufhw", SseOpcode::Pblendw => "pblendw", + SseOpcode::Movddup => "movddup", }; write!(fmt, "{}", name) } @@ -1709,9 +1713,15 @@ impl AvxOpcode { | AvxOpcode::Vpextrw | AvxOpcode::Vpextrd | AvxOpcode::Vpextrq - | AvxOpcode::Vpblendw => { + | AvxOpcode::Vpblendw + | AvxOpcode::Vmovddup + | AvxOpcode::Vbroadcastss => { smallvec![InstructionSet::AVX] } + + AvxOpcode::Vpbroadcastb | AvxOpcode::Vpbroadcastw | AvxOpcode::Vpbroadcastd => { + smallvec![InstructionSet::AVX2] + } } } } diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index bc3db10e30..64c07f1e8c 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -122,6 +122,7 @@ pub(crate) fn emit( InstructionSet::BMI2 => info.isa_flags.has_bmi2(), InstructionSet::FMA => info.isa_flags.has_fma(), InstructionSet::AVX => info.isa_flags.has_avx(), + InstructionSet::AVX2 => info.isa_flags.has_avx2(), InstructionSet::AVX512BITALG => info.isa_flags.has_avx512bitalg(), InstructionSet::AVX512DQ => info.isa_flags.has_avx512dq(), InstructionSet::AVX512F => info.isa_flags.has_avx512f(), @@ -1826,6 +1827,7 @@ pub(crate) fn emit( SseOpcode::Sqrtpd => (LegacyPrefixes::_66, 0x0F51, 2), SseOpcode::Sqrtss => (LegacyPrefixes::_F3, 0x0F51, 2), SseOpcode::Sqrtsd => (LegacyPrefixes::_F2, 0x0F51, 2), + SseOpcode::Movddup => (LegacyPrefixes::_F2, 0x0F12, 2), _ => unimplemented!("Opcode {:?} not implemented", op), }; @@ -2450,6 +2452,13 @@ pub(crate) fn emit( RegisterOrAmode::Amode(_) => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x10), _ => unreachable!(), }, + + AvxOpcode::Vpbroadcastb => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x78), + AvxOpcode::Vpbroadcastw => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x79), + AvxOpcode::Vpbroadcastd => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x58), + AvxOpcode::Vbroadcastss => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x18), + AvxOpcode::Vmovddup => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x12), + _ => panic!("unexpected rmr_imm_vex opcode {op:?}"), }; diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index c532459e05..083566d03b 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -3915,47 +3915,89 @@ ;; Rules for `splat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(rule (lower (has_type 
(multi_lane 8 16) (splat src))) - (let ((vec Xmm (vec_insert_lane $I8X16 (xmm_uninit_value) src 0)) - (zeros Xmm (xmm_zero $I8X16))) - ;; Shuffle the lowest byte lane to all other lanes. - (x64_pshufb vec zeros))) +;; For all the splat rules below one of the goals is that splatting a value +;; doesn't end up accidentally depending on the previous value in a register. +;; This means that instructions are chosen to avoid false dependencies where +;; new values are created fresh or otherwise overwrite previous register +;; contents where possible. +;; +;; Additionally splats are specialized to special-case load-and-splat which +;; has a number of micro-optimizations available. -(rule (lower (has_type (multi_lane 16 8) (splat src))) - (let (;; Force the input into a register so that we don't create a - ;; VCodeConstant. - (src RegMem (RegMem.Reg src)) - (vec Xmm (vec_insert_lane $I16X8 (xmm_uninit_value) src 0)) - (vec Xmm (vec_insert_lane $I16X8 vec src 1))) - ;; Shuffle the lowest two lanes to all other lanes. - (x64_pshufd vec 0))) +;; i8x16 splats: use `vpbroadcastb` on AVX2 and otherwise `pshufb` broadcasts +;; with a mask of zero which is calculated with an xor-against-itself register. +(rule 0 (lower (has_type $I8X16 (splat src))) + (x64_pshufb (bitcast_gpr_to_xmm $I32 src) (xmm_zero $I8X16))) +(rule 1 (lower (has_type $I8X16 (splat src))) + (if-let $true (has_avx2)) + (x64_vpbroadcastb (bitcast_gpr_to_xmm $I32 src))) +(rule 2 (lower (has_type $I8X16 (splat (sinkable_load_exact addr)))) + (x64_pshufb (x64_pinsrb (xmm_uninit_value) addr 0) (xmm_zero $I8X16))) +(rule 3 (lower (has_type $I8X16 (splat (sinkable_load_exact addr)))) + (if-let $true (has_avx2)) + (x64_vpbroadcastb addr)) -(rule 1 (lower (has_type (multi_lane 32 4) (splat src @ (value_type (ty_scalar_float _))))) - (lower_splat_32x4 $F32X4 src)) +;; i16x8 splats: use `vpbroadcastw` on AVX2 and otherwise a 16-bit value is +;; loaded into an xmm register, `pshuflw` broadcasts the low 16-bit lane +;; to the low four lanes, and `pshufd` broadcasts the low 32-bit lane (which +;; at that point is two of the 16-bit values we want to broadcast) to all the +;; lanes. +(rule 0 (lower (has_type $I16X8 (splat src))) + (x64_pshufd (x64_pshuflw (bitcast_gpr_to_xmm $I32 src) 0) 0)) +(rule 1 (lower (has_type $I16X8 (splat src))) + (if-let $true (has_avx2)) + (x64_vpbroadcastw (bitcast_gpr_to_xmm $I32 src))) +(rule 2 (lower (has_type $I16X8 (splat (sinkable_load_exact addr)))) + (x64_pshufd (x64_pshuflw (x64_pinsrw (xmm_uninit_value) addr 0) 0) 0)) +(rule 3 (lower (has_type $I16X8 (splat (sinkable_load_exact addr)))) + (if-let $true (has_avx2)) + (x64_vpbroadcastw addr)) -(rule (lower (has_type (multi_lane 32 4) (splat src))) - (lower_splat_32x4 $I32X4 src)) +;; i32x4.splat - use `vpbroadcastd` on AVX2 and otherwise `pshufd` can be +;; used to broadcast the low lane to all other lanes. +;; +;; Note that sinkable-load cases come later +(rule 0 (lower (has_type $I32X4 (splat src))) + (x64_pshufd (bitcast_gpr_to_xmm $I32 src) 0)) +(rule 1 (lower (has_type $I32X4 (splat src))) + (if-let $true (has_avx2)) + (x64_vpbroadcastd (bitcast_gpr_to_xmm $I32 src))) -(decl lower_splat_32x4 (Type Value) Xmm) -(rule (lower_splat_32x4 ty src) - (let ((src RegMem src) - (vec Xmm (vec_insert_lane ty (xmm_uninit_value) src 0))) - ;; Shuffle the lowest lane to all other lanes. - (x64_pshufd vec 0))) +;; f32x4.splat - the source is already in an xmm register so `shufps` is all +;; that's necessary to complete the splat. 
This is specialized to `vbroadcastss` +;; on AVX2 to leverage that specific instruction for this operation. +(rule 0 (lower (has_type $F32X4 (splat src))) + (let ((tmp Xmm src)) + (x64_shufps src src 0))) +(rule 1 (lower (has_type $F32X4 (splat src))) + (if-let $true (has_avx2)) + (x64_vbroadcastss src)) -(rule 1 (lower (has_type (multi_lane 64 2) (splat src @ (value_type (ty_scalar_float _))))) - (lower_splat_64x2 $F64X2 src)) +;; t32x4.splat of a load - use a `movss` to load into an xmm register and then +;; `shufps` broadcasts to the other lanes. Note that this is used for both i32 +;; and f32 splats. +;; +;; With AVX the `vbroadcastss` instruction suits this purpose precisely. Note +;; that the memory-operand encoding of `vbroadcastss` is usable with AVX, but +;; the register-based encoding is only available with AVX2. With the +;; `sinkable_load` extractor this should be guaranteed to use the memory-based +;; encoding hence the `has_avx` test. +(rule 4 (lower (has_type (multi_lane 32 4) (splat (sinkable_load addr)))) + (let ((tmp Xmm (x64_movss_load addr))) + (x64_shufps tmp tmp 0))) +(rule 5 (lower (has_type (multi_lane 32 4) (splat (sinkable_load addr)))) + (if-let $true (has_avx)) + (x64_vbroadcastss addr)) -(rule (lower (has_type (multi_lane 64 2) (splat src))) - (lower_splat_64x2 $I64X2 src)) - -(decl lower_splat_64x2 (Type Value) Xmm) -(rule (lower_splat_64x2 ty src) - (let (;; Force the input into a register so that we don't create a - ;; VCodeConstant. - (src RegMem (RegMem.Reg src)) - (vec Xmm (vec_insert_lane ty (xmm_uninit_value) src 0))) - (vec_insert_lane ty vec src 1))) +;; t64x2.splat - use `movddup` which is exactly what we want and there's a +;; minor specialization for sinkable loads to avoid going through a gpr for i64 +;; splats +(rule 0 (lower (has_type $I64X2 (splat src))) + (x64_movddup (bitcast_gpr_to_xmm $I64 src))) +(rule 0 (lower (has_type $F64X2 (splat src))) + (x64_movddup src)) +(rule 5 (lower (has_type (multi_lane 64 2) (splat (sinkable_load addr)))) + (x64_movddup addr)) ;; Rules for `vany_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index 868d0860eb..bd3b6e8dcd 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -70,26 +70,42 @@ fn put_input_in_reg(ctx: &mut Lower, spec: InsnInput) -> Reg { .expect("Multi-register value not expected") } +enum MergeableLoadSize { + /// The load size performed by a sinkable load merging operation is + /// precisely the size necessary for the type in question. + Exact, + + /// Narrower-than-32-bit values are handled by ALU insts that are at least + /// 32 bits wide, which is normally OK as we ignore upper bits; but, if we + /// generate, e.g., a direct-from-memory 32-bit add for a byte value and + /// the byte is the last byte in a page, the extra data that we load is + /// incorrectly accessed. So we only allow loads to merge for + /// 32-bit-and-above widths. + Min32, +} + /// Determines whether a load operation (indicated by `src_insn`) can be merged /// into the current lowering point. If so, returns the address-base source (as /// an `InsnInput`) and an offset from that address from which to perform the /// load.
-fn is_mergeable_load(ctx: &mut Lower, src_insn: IRInst) -> Option<(InsnInput, i32)> { +fn is_mergeable_load( + ctx: &mut Lower, + src_insn: IRInst, + size: MergeableLoadSize, +) -> Option<(InsnInput, i32)> { let insn_data = ctx.data(src_insn); let inputs = ctx.num_inputs(src_insn); if inputs != 1 { return None; } + // If this type is too small to get a merged load, don't merge the load. let load_ty = ctx.output_ty(src_insn, 0); if ty_bits(load_ty) < 32 { - // Narrower values are handled by ALU insts that are at least 32 bits - // wide, which is normally OK as we ignore upper buts; but, if we - // generate, e.g., a direct-from-memory 32-bit add for a byte value and - // the byte is the last byte in a page, the extra data that we load is - // incorrectly accessed. So we only allow loads to merge for - // 32-bit-and-above widths. - return None; + match size { + MergeableLoadSize::Exact => {} + MergeableLoadSize::Min32 => return None, + } } // Just testing the opcode is enough, because the width will always match if diff --git a/cranelift/codegen/src/isa/x64/lower/isle.rs b/cranelift/codegen/src/isa/x64/lower/isle.rs index 66b8091f77..904b67a88b 100644 --- a/cranelift/codegen/src/isa/x64/lower/isle.rs +++ b/cranelift/codegen/src/isa/x64/lower/isle.rs @@ -11,7 +11,7 @@ use crate::{isle_common_prelude_methods, isle_lower_prelude_methods}; use generated_code::{Context, MInst, RegisterClass}; // Types that the generated ISLE code uses via `use super::*`. -use super::{is_int_or_ref_ty, is_mergeable_load, lower_to_amode}; +use super::{is_int_or_ref_ty, is_mergeable_load, lower_to_amode, MergeableLoadSize}; use crate::ir::LibCall; use crate::isa::x64::lower::emit_vm_call; use crate::isa::x64::X64Backend; @@ -174,6 +174,11 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> { self.backend.x64_flags.has_avx() } + #[inline] + fn has_avx2(&mut self) -> bool { + self.backend.x64_flags.has_avx2() + } + #[inline] fn avx512vl_enabled(&mut self, _: Type) -> bool { self.backend.x64_flags.use_avx512vl_simd() @@ -268,7 +273,25 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> { fn sinkable_load(&mut self, val: Value) -> Option { let input = self.lower_ctx.get_value_as_source_or_const(val); if let InputSourceInst::UniqueUse(inst, 0) = input.inst { - if let Some((addr_input, offset)) = is_mergeable_load(self.lower_ctx, inst) { + if let Some((addr_input, offset)) = + is_mergeable_load(self.lower_ctx, inst, MergeableLoadSize::Min32) + { + return Some(SinkableLoad { + inst, + addr_input, + offset, + }); + } + } + None + } + + fn sinkable_load_exact(&mut self, val: Value) -> Option { + let input = self.lower_ctx.get_value_as_source_or_const(val); + if let InputSourceInst::UniqueUse(inst, 0) = input.inst { + if let Some((addr_input, offset)) = + is_mergeable_load(self.lower_ctx, inst, MergeableLoadSize::Exact) + { return Some(SinkableLoad { inst, addr_input, diff --git a/cranelift/filetests/filetests/isa/x64/simd-arith-avx.clif b/cranelift/filetests/filetests/isa/x64/simd-arith-avx.clif index c33650d1a8..15f5c84d15 100644 --- a/cranelift/filetests/filetests/isa/x64/simd-arith-avx.clif +++ b/cranelift/filetests/filetests/isa/x64/simd-arith-avx.clif @@ -1315,11 +1315,10 @@ block0(v0: i8): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; uninit %xmm2 -; vpinsrb $0, %xmm2, %rdi, %xmm4 -; uninit %xmm6 -; vpxor %xmm6, %xmm6, %xmm8 -; vpshufb %xmm4, %xmm8, %xmm0 +; movd %edi, %xmm2 +; uninit %xmm4 +; vpxor %xmm4, %xmm4, %xmm6 +; vpshufb %xmm2, %xmm6, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -1329,9 +1328,9 @@ 
block0(v0: i8): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; vpinsrb $0, %edi, %xmm2, %xmm4 -; vpxor %xmm6, %xmm6, %xmm8 -; vpshufb %xmm8, %xmm4, %xmm0 +; movd %edi, %xmm2 +; vpxor %xmm4, %xmm4, %xmm6 +; vpshufb %xmm6, %xmm2, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq diff --git a/cranelift/filetests/filetests/isa/x64/simd-lane-access-compile.clif b/cranelift/filetests/filetests/isa/x64/simd-lane-access-compile.clif index f2ce65fec4..5224705332 100644 --- a/cranelift/filetests/filetests/isa/x64/simd-lane-access-compile.clif +++ b/cranelift/filetests/filetests/isa/x64/simd-lane-access-compile.clif @@ -170,11 +170,10 @@ block0(v0: i8): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; uninit %xmm0 -; pinsrb $0, %xmm0, %rdi, %xmm0 -; uninit %xmm7 -; pxor %xmm7, %xmm7, %xmm7 -; pshufb %xmm0, %xmm7, %xmm0 +; movd %edi, %xmm0 +; uninit %xmm5 +; pxor %xmm5, %xmm5, %xmm5 +; pshufb %xmm0, %xmm5, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -184,9 +183,9 @@ block0(v0: i8): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; pinsrb $0, %edi, %xmm0 -; pxor %xmm7, %xmm7 -; pshufb %xmm7, %xmm0 +; movd %edi, %xmm0 +; pxor %xmm5, %xmm5 +; pshufb %xmm5, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq @@ -203,9 +202,8 @@ block0: ; movq %rsp, %rbp ; block0: ; movl $-1, %esi -; uninit %xmm4 -; pinsrw $0, %xmm4, %rsi, %xmm4 -; pinsrw $1, %xmm4, %rsi, %xmm4 +; movd %esi, %xmm2 +; pshuflw $0, %xmm2, %xmm4 ; pshufd $0, %xmm4, %xmm0 ; movq %rbp, %rsp ; popq %rbp @@ -217,8 +215,8 @@ block0: ; movq %rsp, %rbp ; block1: ; offset 0x4 ; movl $0xffffffff, %esi -; pinsrw $0, %esi, %xmm4 -; pinsrw $1, %esi, %xmm4 +; movd %esi, %xmm2 +; pshuflw $0, %xmm2, %xmm4 ; pshufd $0, %xmm4, %xmm0 ; movq %rbp, %rsp ; popq %rbp @@ -234,9 +232,8 @@ block0(v0: i32): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; uninit %xmm3 -; pinsrd $0, %xmm3, %rdi, %xmm3 -; pshufd $0, %xmm3, %xmm0 +; movd %edi, %xmm2 +; pshufd $0, %xmm2, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -246,8 +243,8 @@ block0(v0: i32): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; pinsrd $0, %edi, %xmm3 -; pshufd $0, %xmm3, %xmm0 +; movd %edi, %xmm2 +; pshufd $0, %xmm2, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq @@ -262,11 +259,7 @@ block0(v0: f64): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movdqa %xmm0, %xmm5 -; uninit %xmm0 -; movdqa %xmm5, %xmm6 -; movsd %xmm0, %xmm6, %xmm0 -; movlhps %xmm0, %xmm6, %xmm0 +; movddup %xmm0, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -276,10 +269,7 @@ block0(v0: f64): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movdqa %xmm0, %xmm5 -; movdqa %xmm5, %xmm6 -; movsd %xmm6, %xmm0 -; movlhps %xmm6, %xmm0 +; movddup %xmm0, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq diff --git a/cranelift/filetests/filetests/isa/x64/simd-splat-avx.clif b/cranelift/filetests/filetests/isa/x64/simd-splat-avx.clif new file mode 100644 index 0000000000..8848b6694a --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/simd-splat-avx.clif @@ -0,0 +1,334 @@ +test compile precise-output +set enable_simd +target x86_64 has_avx + +function %splat_i8(i8) -> i8x16 { +block0(v0: i8): + v1 = splat.i8x16 v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movd %edi, %xmm2 +; uninit %xmm4 +; vpxor %xmm4, %xmm4, %xmm6 +; vpshufb %xmm2, %xmm6, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movd %edi, %xmm2 +; vpxor %xmm4, %xmm4, %xmm6 +; vpshufb %xmm6, %xmm2, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function 
%splat_i16(i16) -> i16x8 { +block0(v0: i16): + v1 = splat.i16x8 v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movd %edi, %xmm2 +; vpshuflw $0, %xmm2, %xmm4 +; vpshufd $0, %xmm4, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movd %edi, %xmm2 +; vpshuflw $0, %xmm2, %xmm4 +; vpshufd $0, %xmm4, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %splat_i32(i32) -> i32x4 { +block0(v0: i32): + v1 = splat.i32x4 v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movd %edi, %xmm2 +; vpshufd $0, %xmm2, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movd %edi, %xmm2 +; vpshufd $0, %xmm2, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %splat_i64(i64) -> i64x2 { +block0(v0: i64): + v1 = splat.i64x2 v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %xmm2 +; vmovddup %xmm2, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movq %rdi, %xmm2 +; vmovddup %xmm2, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %splat_f32(f32) -> f32x4 { +block0(v0: f32): + v1 = splat.f32x4 v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vshufps $0, %xmm0, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vshufps $0, %xmm0, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %splat_f64(f64) -> f64x2 { +block0(v0: f64): + v1 = splat.f64x2 v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vmovddup %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vmovddup %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %load_splat_i8(i64) -> i8x16 { +block0(v0: i64): + v1 = load.i8 v0 + v2 = splat.i8x16 v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; uninit %xmm2 +; vpinsrb $0, %xmm2, 0(%rdi), %xmm4 +; uninit %xmm6 +; vpxor %xmm6, %xmm6, %xmm8 +; vpshufb %xmm4, %xmm8, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpinsrb $0, (%rdi), %xmm2, %xmm4 ; trap: heap_oob +; vpxor %xmm6, %xmm6, %xmm8 +; vpshufb %xmm8, %xmm4, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %load_splat_i16(i64) -> i16x8 { +block0(v0: i64): + v1 = load.i16 v0 + v2 = splat.i16x8 v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; uninit %xmm2 +; vpinsrw $0, %xmm2, 0(%rdi), %xmm4 +; vpshuflw $0, %xmm4, %xmm6 +; vpshufd $0, %xmm6, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpinsrw $0, (%rdi), %xmm2, %xmm4 ; trap: heap_oob +; vpshuflw $0, %xmm4, %xmm6 +; vpshufd $0, %xmm6, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %load_splat_i32(i64) -> i32x4 { +block0(v0: i64): + v1 = load.i32 v0 + v2 = splat.i32x4 v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vbroadcastss 0(%rdi), %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 
+; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vbroadcastss (%rdi), %xmm0 ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq + +function %load_splat_i64(i64) -> i64x2 { +block0(v0: i64): + v1 = load.i64 v0 + v2 = splat.i64x2 v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vmovddup 0(%rdi), %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vmovddup (%rdi), %xmm0 ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq + +function %load_splat_f32(i64) -> f32x4 { +block0(v0: i64): + v1 = load.f32 v0 + v2 = splat.f32x4 v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vbroadcastss 0(%rdi), %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vbroadcastss (%rdi), %xmm0 ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq + +function %load_splat_f64(i64) -> f64x2 { +block0(v0: i64): + v1 = load.f64 v0 + v2 = splat.f64x2 v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vmovddup 0(%rdi), %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vmovddup (%rdi), %xmm0 ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq + diff --git a/cranelift/filetests/filetests/isa/x64/simd-splat-avx2.clif b/cranelift/filetests/filetests/isa/x64/simd-splat-avx2.clif new file mode 100644 index 0000000000..84cdfd3730 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/simd-splat-avx2.clif @@ -0,0 +1,318 @@ +test compile precise-output +set enable_simd +target x86_64 has_avx has_avx2 + +function %splat_i8(i8) -> i8x16 { +block0(v0: i8): + v1 = splat.i8x16 v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movd %edi, %xmm2 +; vpbroadcastb %xmm2, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movd %edi, %xmm2 +; vpbroadcastb %xmm2, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %splat_i16(i16) -> i16x8 { +block0(v0: i16): + v1 = splat.i16x8 v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movd %edi, %xmm2 +; vpbroadcastw %xmm2, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movd %edi, %xmm2 +; vpbroadcastw %xmm2, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %splat_i32(i32) -> i32x4 { +block0(v0: i32): + v1 = splat.i32x4 v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movd %edi, %xmm2 +; vpbroadcastd %xmm2, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movd %edi, %xmm2 +; vpbroadcastd %xmm2, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %splat_i64(i64) -> i64x2 { +block0(v0: i64): + v1 = splat.i64x2 v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %xmm2 +; vmovddup %xmm2, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movq %rdi, %xmm2 +; vmovddup %xmm2, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %splat_f32(f32) -> f32x4 { +block0(v0: f32): + v1 = 
splat.f32x4 v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vbroadcastss %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vbroadcastss %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %splat_f64(f64) -> f64x2 { +block0(v0: f64): + v1 = splat.f64x2 v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vmovddup %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vmovddup %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %load_splat_i8(i64) -> i8x16 { +block0(v0: i64): + v1 = load.i8 v0 + v2 = splat.i8x16 v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpbroadcastb 0(%rdi), %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpbroadcastb (%rdi), %xmm0 ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq + +function %load_splat_i16(i64) -> i16x8 { +block0(v0: i64): + v1 = load.i16 v0 + v2 = splat.i16x8 v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpbroadcastw 0(%rdi), %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpbroadcastw (%rdi), %xmm0 ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq + +function %load_splat_i32(i64) -> i32x4 { +block0(v0: i64): + v1 = load.i32 v0 + v2 = splat.i32x4 v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vbroadcastss 0(%rdi), %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vbroadcastss (%rdi), %xmm0 ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq + +function %load_splat_i64(i64) -> i64x2 { +block0(v0: i64): + v1 = load.i64 v0 + v2 = splat.i64x2 v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vmovddup 0(%rdi), %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vmovddup (%rdi), %xmm0 ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq + +function %load_splat_f32(i64) -> f32x4 { +block0(v0: i64): + v1 = load.f32 v0 + v2 = splat.f32x4 v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vbroadcastss 0(%rdi), %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vbroadcastss (%rdi), %xmm0 ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq + +function %load_splat_f64(i64) -> f64x2 { +block0(v0: i64): + v1 = load.f64 v0 + v2 = splat.f64x2 v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vmovddup 0(%rdi), %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vmovddup (%rdi), %xmm0 ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq + diff --git a/cranelift/filetests/filetests/isa/x64/simd-splat.clif b/cranelift/filetests/filetests/isa/x64/simd-splat.clif new file mode 100644 index 0000000000..8bb3b08905 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/simd-splat.clif @@ -0,0 +1,338 @@ +test 
compile precise-output +set enable_simd +target x86_64 + +function %splat_i8(i8) -> i8x16 { +block0(v0: i8): + v1 = splat.i8x16 v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movd %edi, %xmm0 +; uninit %xmm5 +; pxor %xmm5, %xmm5, %xmm5 +; pshufb %xmm0, %xmm5, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movd %edi, %xmm0 +; pxor %xmm5, %xmm5 +; pshufb %xmm5, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %splat_i16(i16) -> i16x8 { +block0(v0: i16): + v1 = splat.i16x8 v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movd %edi, %xmm2 +; pshuflw $0, %xmm2, %xmm4 +; pshufd $0, %xmm4, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movd %edi, %xmm2 +; pshuflw $0, %xmm2, %xmm4 +; pshufd $0, %xmm4, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %splat_i32(i32) -> i32x4 { +block0(v0: i32): + v1 = splat.i32x4 v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movd %edi, %xmm2 +; pshufd $0, %xmm2, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movd %edi, %xmm2 +; pshufd $0, %xmm2, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %splat_i64(i64) -> i64x2 { +block0(v0: i64): + v1 = splat.i64x2 v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %xmm2 +; movddup %xmm2, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movq %rdi, %xmm2 +; movddup %xmm2, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %splat_f32(f32) -> f32x4 { +block0(v0: f32): + v1 = splat.f32x4 v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; shufps $0, %xmm0, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; shufps $0, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %splat_f64(f64) -> f64x2 { +block0(v0: f64): + v1 = splat.f64x2 v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movddup %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movddup %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %load_splat_i8(i64) -> i8x16 { +block0(v0: i64): + v1 = load.i8 v0 + v2 = splat.i8x16 v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; uninit %xmm0 +; pinsrb $0, %xmm0, 0(%rdi), %xmm0 +; uninit %xmm7 +; pxor %xmm7, %xmm7, %xmm7 +; pshufb %xmm0, %xmm7, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pinsrb $0, (%rdi), %xmm0 ; trap: heap_oob +; pxor %xmm7, %xmm7 +; pshufb %xmm7, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %load_splat_i16(i64) -> i16x8 { +block0(v0: i64): + v1 = load.i16 v0 + v2 = splat.i16x8 v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; uninit %xmm3 +; pinsrw $0, %xmm3, 0(%rdi), %xmm3 +; pshuflw $0, %xmm3, %xmm6 +; pshufd $0, %xmm6, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; 
offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pinsrw $0, (%rdi), %xmm3 ; trap: heap_oob +; pshuflw $0, %xmm3, %xmm6 +; pshufd $0, %xmm6, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %load_splat_i32(i64) -> i32x4 { +block0(v0: i64): + v1 = load.i32 v0 + v2 = splat.i32x4 v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movss 0(%rdi), %xmm0 +; shufps $0, %xmm0, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movss (%rdi), %xmm0 ; trap: heap_oob +; shufps $0, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %load_splat_i64(i64) -> i64x2 { +block0(v0: i64): + v1 = load.i64 v0 + v2 = splat.i64x2 v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movddup 0(%rdi), %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movddup (%rdi), %xmm0 ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq + +function %load_splat_f32(i64) -> f32x4 { +block0(v0: i64): + v1 = load.f32 v0 + v2 = splat.f32x4 v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movss 0(%rdi), %xmm0 +; shufps $0, %xmm0, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movss (%rdi), %xmm0 ; trap: heap_oob +; shufps $0, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %load_splat_f64(i64) -> f64x2 { +block0(v0: i64): + v1 = load.f64 v0 + v2 = splat.f64x2 v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movddup 0(%rdi), %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movddup (%rdi), %xmm0 ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq + diff --git a/cranelift/filetests/filetests/runtests/simd-splat.clif b/cranelift/filetests/filetests/runtests/simd-splat.clif index 37db142ec1..de2b49fd4a 100644 --- a/cranelift/filetests/filetests/runtests/simd-splat.clif +++ b/cranelift/filetests/filetests/runtests/simd-splat.clif @@ -4,6 +4,8 @@ target aarch64 target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 +target x86_64 has_sse3 has_ssse3 has_sse41 has_avx +target x86_64 has_sse3 has_ssse3 has_sse41 has_avx has_avx2 function %splat_i8x16(i8) -> i8x16 { block0(v0: i8):
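
As context for the new filetests: each type gets both a plain-splat and a load-and-splat test because the two take different lowering paths. Integer splats of a register value first move the scalar from a GPR into an XMM register (`movd`/`movq`) and then broadcast it, while load-and-splat uses the `sinkable_load`/`sinkable_load_exact` extractors to fold the memory operand directly into `vpbroadcastb`/`vpbroadcastw`/`vpbroadcastd`, `vbroadcastss`, or `vmovddup`. A minimal CLIF pair exercising both paths for i8 (a sketch that simply mirrors functions already present in the tests above) is:

function %splat_i8(i8) -> i8x16 {
block0(v0: i8):
    v1 = splat.i8x16 v0
    return v1
}

function %load_splat_i8(i64) -> i8x16 {
block0(v0: i64):
    v1 = load.i8 v0
    v2 = splat.i8x16 v1
    return v2
}

The load variant is also where `MergeableLoadSize::Exact` matters: `vpbroadcastb` reads exactly one byte, so sinking the load is safe even when that byte is the last byte of a mapped page, which is why the i8 and i16 rules use `sinkable_load_exact` rather than the 32-bit-minimum `sinkable_load`.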