x64: Improve codegen for splats (#6025)

This commit goes through the lowerings for the CLIF `splat` instruction
and improves the support for each operator. Many of these lowerings are
mirrored from v8/SpiderMonkey and there are a number of improvements:

* AVX2 `v{p,}broadcast*` instructions are added and used when available.
* Float-based splats are much simpler and are now always a single
  instruction.
* Integer-based splats don't insert into an uninitialized xmm value and
  instead start out with a `movd` to move the integer into an `xmm`
  register. This theoretically breaks dependencies with prior instructions
  since `movd` creates a fresh new value in the destination register.
* Loads are now sunk into all of the instructions. A new extractor,
  `sinkable_load_exact`, was added to sink the i8/i16 loads.
Alex Crichton
2023-03-15 16:33:56 -05:00
committed by GitHub
parent a10c50afe9
commit d76f7ee52e
12 changed files with 1216 additions and 82 deletions
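
To make the integer-splat strategy in the bullets above concrete, here is a minimal sketch of the i8x16 case written with Rust's core::arch::x86_64 intrinsics rather than Cranelift's emitter. The function names are invented for illustration; the intrinsics only mirror the instruction sequences the new lowerings pick (`movd` + `pxor` + `pshufb` without AVX2, a single `vpbroadcastb` with it).

#[cfg(target_arch = "x86_64")]
mod splat_sketch {
    use core::arch::x86_64::*;

    #[target_feature(enable = "ssse3")]
    pub unsafe fn splat_i8x16_ssse3(x: i8) -> __m128i {
        // movd: writes a fresh 128-bit value, so there is no dependency on
        // whatever was previously in the destination register.
        let v = _mm_cvtsi32_si128(x as i32);
        // pshufb with an all-zero mask selects byte 0 for every lane.
        _mm_shuffle_epi8(v, _mm_setzero_si128())
    }

    #[target_feature(enable = "avx2")]
    pub unsafe fn splat_i8x16_avx2(x: i8) -> __m128i {
        // vpbroadcastb: broadcast the lowest byte to all 16 lanes.
        _mm_broadcastb_epi8(_mm_cvtsi32_si128(x as i32))
    }
}

Because `movd` overwrites the full destination register, neither path reads the register's previous contents, which is the false-dependency point made above.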


@@ -919,6 +919,7 @@
   Pshuflw
   Pshufhw
   Pblendw
+  Movddup
 ))
 
 (type CmpOpcode extern
@@ -1292,6 +1293,11 @@
   Vpextrd
   Vpextrq
   Vpblendw
+  Vmovddup
+  Vpbroadcastb
+  Vpbroadcastw
+  Vpbroadcastd
+  Vbroadcastss
 ))
 
 (type Avx512Opcode extern
@@ -1622,6 +1628,9 @@
 (decl pure has_avx () bool)
 (extern constructor has_avx has_avx)
 
+(decl pure has_avx2 () bool)
+(extern constructor has_avx2 has_avx2)
+
 ;;;; Helpers for Merging and Sinking Immediates/Loads ;;;;;;;;;;;;;;;;;;;;;;;;;
 
 ;; Extract a constant `Imm8Reg.Imm8` from a value operand.
@@ -1656,9 +1665,21 @@
 ;; Extract a `SinkableLoad` that works with `RegMemImm.Mem` from a value
 ;; operand.
+;;
+;; Note that this will only work for 32-bit-types-or-larger since this is
+;; pervasively used with operations that load a minimum of 32-bits. For
+;; instructions which load exactly the type width necessary use
+;; `sinkable_load_exact`.
 (decl sinkable_load (SinkableLoad) Value)
 (extern extractor sinkable_load sinkable_load)
 
+;; Same as `sinkable_load` except that all type widths of loads are supported.
+;;
+;; Only use this when the instruction which performs the load is guaranteed to
+;; load the precisely correct size.
+(decl sinkable_load_exact (SinkableLoad) Value)
+(extern extractor sinkable_load_exact sinkable_load_exact)
+
 ;; Sink a `SinkableLoad` into a `SyntheticAmode`.
 ;;
 ;; This is a side-effectful operation that notifies the context that the
@@ -1678,6 +1699,9 @@
 (decl sink_load_to_reg_mem (SinkableLoad) RegMem)
 (rule (sink_load_to_reg_mem load) (RegMem.Mem load))
 
+(decl sink_load_to_gpr_mem (SinkableLoad) GprMem)
+(rule (sink_load_to_gpr_mem load) (RegMem.Mem load))
+
 (decl sink_load_to_reg_mem_imm (SinkableLoad) RegMemImm)
 (rule (sink_load_to_reg_mem_imm load) (RegMemImm.Mem load))
@@ -4103,6 +4127,34 @@
 (rule (trap_if_fcmp (FcmpCondResult.OrCondition producer cc1 cc2) tc)
       (with_flags_side_effect producer (trap_if_or cc1 cc2 tc)))
 
+;; Helper for creating `movddup` instructions
+(decl x64_movddup (XmmMem) Xmm)
+(rule (x64_movddup src)
+      (xmm_unary_rm_r_unaligned (SseOpcode.Movddup) src))
+(rule 1 (x64_movddup src)
+      (if-let $true (has_avx))
+      (xmm_unary_rm_r_vex (AvxOpcode.Vmovddup) src))
+
+;; Helper for creating `vpbroadcastb` instructions
+(decl x64_vpbroadcastb (XmmMem) Xmm)
+(rule (x64_vpbroadcastb src)
+      (xmm_unary_rm_r_vex (AvxOpcode.Vpbroadcastb) src))
+
+;; Helper for creating `vpbroadcastw` instructions
+(decl x64_vpbroadcastw (XmmMem) Xmm)
+(rule (x64_vpbroadcastw src)
+      (xmm_unary_rm_r_vex (AvxOpcode.Vpbroadcastw) src))
+
+;; Helper for creating `vpbroadcastd` instructions
+(decl x64_vpbroadcastd (XmmMem) Xmm)
+(rule (x64_vpbroadcastd src)
+      (xmm_unary_rm_r_vex (AvxOpcode.Vpbroadcastd) src))
+
+;; Helper for creating `vbroadcastss` instructions
+(decl x64_vbroadcastss (XmmMem) Xmm)
+(rule (x64_vbroadcastss src)
+      (xmm_unary_rm_r_vex (AvxOpcode.Vbroadcastss) src))
+
 ;;;; Jumps ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 ;; Unconditional jump.
@@ -4664,6 +4716,7 @@
 (convert AtomicRmwOp MachAtomicRmwOp atomic_rmw_op_to_mach_atomic_rmw_op)
 (convert SinkableLoad RegMem sink_load_to_reg_mem)
+(convert SinkableLoad GprMem sink_load_to_gpr_mem)
 (convert SinkableLoad RegMemImm sink_load_to_reg_mem_imm)
 (convert SinkableLoad GprMemImm sink_load_to_gpr_mem_imm)
 (convert SinkableLoad XmmMem sink_load_to_xmm_mem)


@@ -928,6 +928,7 @@ pub(crate) enum InstructionSet {
     BMI2,
     FMA,
     AVX,
+    AVX2,
     AVX512BITALG,
     AVX512DQ,
     AVX512F,
@@ -1126,6 +1127,7 @@ pub enum SseOpcode {
     Pshuflw,
     Pshufhw,
     Pblendw,
+    Movddup,
 }
 
 impl SseOpcode {
@@ -1280,7 +1282,8 @@ impl SseOpcode {
             | SseOpcode::Pmulhrsw
             | SseOpcode::Pshufb
             | SseOpcode::Phaddw
-            | SseOpcode::Phaddd => SSSE3,
+            | SseOpcode::Phaddd
+            | SseOpcode::Movddup => SSSE3,
 
             SseOpcode::Blendvpd
             | SseOpcode::Blendvps
@@ -1524,6 +1527,7 @@ impl fmt::Debug for SseOpcode {
             SseOpcode::Pshuflw => "pshuflw",
             SseOpcode::Pshufhw => "pshufhw",
             SseOpcode::Pblendw => "pblendw",
+            SseOpcode::Movddup => "movddup",
         };
         write!(fmt, "{}", name)
     }
@@ -1709,9 +1713,15 @@ impl AvxOpcode {
             | AvxOpcode::Vpextrw
             | AvxOpcode::Vpextrd
             | AvxOpcode::Vpextrq
-            | AvxOpcode::Vpblendw => {
+            | AvxOpcode::Vpblendw
+            | AvxOpcode::Vmovddup
+            | AvxOpcode::Vbroadcastss => {
                 smallvec![InstructionSet::AVX]
             }
+            AvxOpcode::Vpbroadcastb | AvxOpcode::Vpbroadcastw | AvxOpcode::Vpbroadcastd => {
+                smallvec![InstructionSet::AVX2]
+            }
         }
     }
 }


@@ -122,6 +122,7 @@ pub(crate) fn emit(
         InstructionSet::BMI2 => info.isa_flags.has_bmi2(),
         InstructionSet::FMA => info.isa_flags.has_fma(),
         InstructionSet::AVX => info.isa_flags.has_avx(),
+        InstructionSet::AVX2 => info.isa_flags.has_avx2(),
         InstructionSet::AVX512BITALG => info.isa_flags.has_avx512bitalg(),
        InstructionSet::AVX512DQ => info.isa_flags.has_avx512dq(),
        InstructionSet::AVX512F => info.isa_flags.has_avx512f(),
@@ -1826,6 +1827,7 @@ pub(crate) fn emit(
             SseOpcode::Sqrtpd => (LegacyPrefixes::_66, 0x0F51, 2),
             SseOpcode::Sqrtss => (LegacyPrefixes::_F3, 0x0F51, 2),
             SseOpcode::Sqrtsd => (LegacyPrefixes::_F2, 0x0F51, 2),
+            SseOpcode::Movddup => (LegacyPrefixes::_F2, 0x0F12, 2),
             _ => unimplemented!("Opcode {:?} not implemented", op),
         };
@@ -2450,6 +2452,13 @@ pub(crate) fn emit(
                 RegisterOrAmode::Amode(_) => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x10),
                 _ => unreachable!(),
             },
+            AvxOpcode::Vpbroadcastb => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x78),
+            AvxOpcode::Vpbroadcastw => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x79),
+            AvxOpcode::Vpbroadcastd => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x58),
+            AvxOpcode::Vbroadcastss => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x18),
+            AvxOpcode::Vmovddup => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x12),
             _ => panic!("unexpected rmr_imm_vex opcode {op:?}"),
         };


@@ -3915,47 +3915,89 @@
 ;; Rules for `splat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
-(rule (lower (has_type (multi_lane 8 16) (splat src)))
-      (let ((vec Xmm (vec_insert_lane $I8X16 (xmm_uninit_value) src 0))
-            (zeros Xmm (xmm_zero $I8X16)))
-        ;; Shuffle the lowest byte lane to all other lanes.
-        (x64_pshufb vec zeros)))
-
-(rule (lower (has_type (multi_lane 16 8) (splat src)))
-      (let (;; Force the input into a register so that we don't create a
-            ;; VCodeConstant.
-            (src RegMem (RegMem.Reg src))
-            (vec Xmm (vec_insert_lane $I16X8 (xmm_uninit_value) src 0))
-            (vec Xmm (vec_insert_lane $I16X8 vec src 1)))
-        ;; Shuffle the lowest two lanes to all other lanes.
-        (x64_pshufd vec 0)))
-
-(rule 1 (lower (has_type (multi_lane 32 4) (splat src @ (value_type (ty_scalar_float _)))))
-        (lower_splat_32x4 $F32X4 src))
-
-(rule (lower (has_type (multi_lane 32 4) (splat src)))
-      (lower_splat_32x4 $I32X4 src))
-
-(decl lower_splat_32x4 (Type Value) Xmm)
-(rule (lower_splat_32x4 ty src)
-      (let ((src RegMem src)
-            (vec Xmm (vec_insert_lane ty (xmm_uninit_value) src 0)))
-        ;; Shuffle the lowest lane to all other lanes.
-        (x64_pshufd vec 0)))
-
-(rule 1 (lower (has_type (multi_lane 64 2) (splat src @ (value_type (ty_scalar_float _)))))
-        (lower_splat_64x2 $F64X2 src))
-
-(rule (lower (has_type (multi_lane 64 2) (splat src)))
-      (lower_splat_64x2 $I64X2 src))
-
-(decl lower_splat_64x2 (Type Value) Xmm)
-(rule (lower_splat_64x2 ty src)
-      (let (;; Force the input into a register so that we don't create a
-            ;; VCodeConstant.
-            (src RegMem (RegMem.Reg src))
-            (vec Xmm (vec_insert_lane ty (xmm_uninit_value) src 0)))
-        (vec_insert_lane ty vec src 1)))
+;; For all the splat rules below one of the goals is that splatting a value
+;; doesn't end up accidentally depending on the previous value in a register.
+;; This means that instructions are chosen to avoid false dependencies where
+;; new values are created fresh or otherwise overwrite previous register
+;; contents where possible.
+;;
+;; Additionally splats are specialized to special-case load-and-splat which
+;; has a number of micro-optimizations available.
+
+;; i8x16 splats: use `vpbroadcastb` on AVX2 and otherwise `pshufb` broadcasts
+;; with a mask of zero which is calculated with an xor-against-itself register.
+(rule 0 (lower (has_type $I8X16 (splat src)))
+        (x64_pshufb (bitcast_gpr_to_xmm $I32 src) (xmm_zero $I8X16)))
+(rule 1 (lower (has_type $I8X16 (splat src)))
+        (if-let $true (has_avx2))
+        (x64_vpbroadcastb (bitcast_gpr_to_xmm $I32 src)))
+(rule 2 (lower (has_type $I8X16 (splat (sinkable_load_exact addr))))
+        (x64_pshufb (x64_pinsrb (xmm_uninit_value) addr 0) (xmm_zero $I8X16)))
+(rule 3 (lower (has_type $I8X16 (splat (sinkable_load_exact addr))))
+        (if-let $true (has_avx2))
+        (x64_vpbroadcastb addr))
+
+;; i16x8 splats: use `vpbroadcastw` on AVX2 and otherwise a 16-bit value is
+;; loaded into an xmm register, `pshuflw` broadcasts the low 16-bit lane
+;; to the low four lanes, and `pshufd` broadcasts the low 32-bit lane (which
+;; at that point is two of the 16-bit values we want to broadcast) to all the
+;; lanes.
+(rule 0 (lower (has_type $I16X8 (splat src)))
+        (x64_pshufd (x64_pshuflw (bitcast_gpr_to_xmm $I32 src) 0) 0))
+(rule 1 (lower (has_type $I16X8 (splat src)))
+        (if-let $true (has_avx2))
+        (x64_vpbroadcastw (bitcast_gpr_to_xmm $I32 src)))
+(rule 2 (lower (has_type $I16X8 (splat (sinkable_load_exact addr))))
+        (x64_pshufd (x64_pshuflw (x64_pinsrw (xmm_uninit_value) addr 0) 0) 0))
+(rule 3 (lower (has_type $I16X8 (splat (sinkable_load_exact addr))))
+        (if-let $true (has_avx2))
+        (x64_vpbroadcastw addr))
+
+;; i32x4.splat - use `vpbroadcastd` on AVX2 and otherwise `pshufd` can be
+;; used to broadcast the low lane to all other lanes.
+;;
+;; Note that sinkable-load cases come later
+(rule 0 (lower (has_type $I32X4 (splat src)))
+        (x64_pshufd (bitcast_gpr_to_xmm $I32 src) 0))
+(rule 1 (lower (has_type $I32X4 (splat src)))
+        (if-let $true (has_avx2))
+        (x64_vpbroadcastd (bitcast_gpr_to_xmm $I32 src)))
+
+;; f32x4.splat - the source is already in an xmm register so `shufps` is all
+;; that's necessary to complete the splat. This is specialized to `vbroadcastss`
+;; on AVX2 to leverage that specific instruction for this operation.
+(rule 0 (lower (has_type $F32X4 (splat src)))
+        (let ((tmp Xmm src))
+          (x64_shufps src src 0)))
+(rule 1 (lower (has_type $F32X4 (splat src)))
+        (if-let $true (has_avx2))
+        (x64_vbroadcastss src))
+
+;; t32x4.splat of a load - use a `movss` to load into an xmm register and then
+;; `shufps` broadcasts to the other lanes. Note that this is used for both i32
+;; and f32 splats.
+;;
+;; With AVX the `vbroadcastss` instruction suits this purpose precisely. Note
+;; that the memory-operand encoding of `vbroadcastss` is usable with AVX, but
+;; the register-based encoding is only available with AVX2. With the
+;; `sinkable_load` extractor this should be guaranteed to use the memory-based
+;; encoding hence the `has_avx` test.
+(rule 4 (lower (has_type (multi_lane 32 4) (splat (sinkable_load addr))))
+        (let ((tmp Xmm (x64_movss_load addr)))
+          (x64_shufps tmp tmp 0)))
+(rule 5 (lower (has_type (multi_lane 32 4) (splat (sinkable_load addr))))
+        (if-let $true (has_avx))
+        (x64_vbroadcastss addr))
+
+;; t64x2.splat - use `movddup` which is exactly what we want and there's a
+;; minor specialization for sinkable loads to avoid going through a gpr for i64
+;; splats
+(rule 0 (lower (has_type $I64X2 (splat src)))
+        (x64_movddup (bitcast_gpr_to_xmm $I64 src)))
+(rule 0 (lower (has_type $F64X2 (splat src)))
+        (x64_movddup src))
+(rule 5 (lower (has_type (multi_lane 64 2) (splat (sinkable_load addr))))
+        (x64_movddup addr))
 
 ;; Rules for `vany_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
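
As a companion to the comments in the rules above, the following is a rough intrinsics-level sketch of the non-AVX2 shuffle sequences for the remaining lane widths: `pshuflw` + `pshufd` for i16x8, a self `shufps` for f32x4, and `movddup` for the 64-bit-lane cases. This is illustrative only; the function names are invented and this is not the backend's code.

#[cfg(target_arch = "x86_64")]
mod splat_shuffles {
    use core::arch::x86_64::*;

    // i16x8: movd, then pshuflw $0 broadcasts lane 0 across the low four
    // 16-bit lanes, then pshufd $0 broadcasts the low 32-bit lane (now two
    // copies of the value) across the whole vector.
    #[target_feature(enable = "sse2")]
    pub unsafe fn splat_i16x8(x: i16) -> __m128i {
        let v = _mm_cvtsi32_si128(x as i32);
        _mm_shuffle_epi32::<0>(_mm_shufflelo_epi16::<0>(v))
    }

    // f32x4: the value already lives in an xmm register, so a self-shuffle
    // with immediate 0 (shufps $0) broadcasts lane 0 to all lanes.
    #[target_feature(enable = "sse")]
    pub unsafe fn splat_f32x4(v: __m128) -> __m128 {
        _mm_shuffle_ps::<0>(v, v)
    }

    // f64x2/i64x2: movddup duplicates the low 64-bit lane; the same opcode
    // also has a memory form, which is what the sinkable-load rule uses.
    #[target_feature(enable = "sse3")]
    pub unsafe fn splat_f64x2(v: __m128d) -> __m128d {
        _mm_movedup_pd(v)
    }
}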


@@ -70,26 +70,42 @@ fn put_input_in_reg(ctx: &mut Lower<Inst>, spec: InsnInput) -> Reg {
         .expect("Multi-register value not expected")
 }
 
+enum MergeableLoadSize {
+    /// The load size performed by a sinkable load merging operation is
+    /// precisely the size necessary for the type in question.
+    Exact,
+    /// Narrower-than-32-bit values are handled by ALU insts that are at least
+    /// 32 bits wide, which is normally OK as we ignore upper bits; but, if we
+    /// generate, e.g., a direct-from-memory 32-bit add for a byte value and
+    /// the byte is the last byte in a page, the extra data that we load is
+    /// incorrectly accessed. So we only allow loads to merge for
+    /// 32-bit-and-above widths.
+    Min32,
+}
+
 /// Determines whether a load operation (indicated by `src_insn`) can be merged
 /// into the current lowering point. If so, returns the address-base source (as
 /// an `InsnInput`) and an offset from that address from which to perform the
 /// load.
-fn is_mergeable_load(ctx: &mut Lower<Inst>, src_insn: IRInst) -> Option<(InsnInput, i32)> {
+fn is_mergeable_load(
+    ctx: &mut Lower<Inst>,
+    src_insn: IRInst,
+    size: MergeableLoadSize,
+) -> Option<(InsnInput, i32)> {
     let insn_data = ctx.data(src_insn);
     let inputs = ctx.num_inputs(src_insn);
     if inputs != 1 {
         return None;
     }
 
+    // If this type is too small to get a merged load, don't merge the load.
     let load_ty = ctx.output_ty(src_insn, 0);
     if ty_bits(load_ty) < 32 {
-        // Narrower values are handled by ALU insts that are at least 32 bits
-        // wide, which is normally OK as we ignore upper bits; but, if we
-        // generate, e.g., a direct-from-memory 32-bit add for a byte value and
-        // the byte is the last byte in a page, the extra data that we load is
-        // incorrectly accessed. So we only allow loads to merge for
-        // 32-bit-and-above widths.
-        return None;
+        match size {
+            MergeableLoadSize::Exact => {}
+            MergeableLoadSize::Min32 => return None,
+        }
     }
 
     // Just testing the opcode is enough, because the width will always match if
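
For clarity, the width check above can be read as a small standalone predicate. The snippet below is a hypothetical distillation, not the actual Cranelift helper, which additionally inspects the load instruction itself (its opcode, inputs, and offset).

// Hypothetical distillation of the size check in `is_mergeable_load`.
enum MergeableLoadSize {
    /// The consuming instruction loads exactly the type's width.
    Exact,
    /// The consuming ALU instruction reads at least 32 bits from memory, so
    /// merging a narrower load could touch bytes past the value (e.g. past
    /// the end of a page).
    Min32,
}

fn width_allows_merge(load_bits: u32, size: MergeableLoadSize) -> bool {
    match size {
        MergeableLoadSize::Exact => true,
        MergeableLoadSize::Min32 => load_bits >= 32,
    }
}

fn main() {
    // An i8 load may only merge into instructions that load exactly 8 bits
    // (e.g. `vpbroadcastb`), not into 32-bit-wide ALU memory operands.
    assert!(width_allows_merge(8, MergeableLoadSize::Exact));
    assert!(!width_allows_merge(8, MergeableLoadSize::Min32));
    assert!(width_allows_merge(64, MergeableLoadSize::Min32));
}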


@@ -11,7 +11,7 @@ use crate::{isle_common_prelude_methods, isle_lower_prelude_methods};
 use generated_code::{Context, MInst, RegisterClass};
 
 // Types that the generated ISLE code uses via `use super::*`.
-use super::{is_int_or_ref_ty, is_mergeable_load, lower_to_amode};
+use super::{is_int_or_ref_ty, is_mergeable_load, lower_to_amode, MergeableLoadSize};
 use crate::ir::LibCall;
 use crate::isa::x64::lower::emit_vm_call;
 use crate::isa::x64::X64Backend;
@@ -174,6 +174,11 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> {
         self.backend.x64_flags.has_avx()
     }
 
+    #[inline]
+    fn has_avx2(&mut self) -> bool {
+        self.backend.x64_flags.has_avx2()
+    }
+
     #[inline]
     fn avx512vl_enabled(&mut self, _: Type) -> bool {
         self.backend.x64_flags.use_avx512vl_simd()
@@ -268,7 +273,25 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> {
     fn sinkable_load(&mut self, val: Value) -> Option<SinkableLoad> {
         let input = self.lower_ctx.get_value_as_source_or_const(val);
         if let InputSourceInst::UniqueUse(inst, 0) = input.inst {
-            if let Some((addr_input, offset)) = is_mergeable_load(self.lower_ctx, inst) {
+            if let Some((addr_input, offset)) =
+                is_mergeable_load(self.lower_ctx, inst, MergeableLoadSize::Min32)
+            {
+                return Some(SinkableLoad {
+                    inst,
+                    addr_input,
+                    offset,
+                });
+            }
+        }
+        None
+    }
+
+    fn sinkable_load_exact(&mut self, val: Value) -> Option<SinkableLoad> {
+        let input = self.lower_ctx.get_value_as_source_or_const(val);
+        if let InputSourceInst::UniqueUse(inst, 0) = input.inst {
+            if let Some((addr_input, offset)) =
+                is_mergeable_load(self.lower_ctx, inst, MergeableLoadSize::Exact)
+            {
                 return Some(SinkableLoad {
                     inst,
                     addr_input,


@@ -1315,11 +1315,10 @@ block0(v0: i8):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; uninit %xmm2
-; vpinsrb $0, %xmm2, %rdi, %xmm4
-; uninit %xmm6
-; vpxor %xmm6, %xmm6, %xmm8
-; vpshufb %xmm4, %xmm8, %xmm0
+; movd %edi, %xmm2
+; uninit %xmm4
+; vpxor %xmm4, %xmm4, %xmm6
+; vpshufb %xmm2, %xmm6, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -1329,9 +1328,9 @@ block0(v0: i8):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
-; vpinsrb $0, %edi, %xmm2, %xmm4
-; vpxor %xmm6, %xmm6, %xmm8
-; vpshufb %xmm8, %xmm4, %xmm0
+; movd %edi, %xmm2
+; vpxor %xmm4, %xmm4, %xmm6
+; vpshufb %xmm6, %xmm2, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; retq


@@ -170,11 +170,10 @@ block0(v0: i8):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; uninit %xmm0
-; pinsrb $0, %xmm0, %rdi, %xmm0
-; uninit %xmm7
-; pxor %xmm7, %xmm7, %xmm7
-; pshufb %xmm0, %xmm7, %xmm0
+; movd %edi, %xmm0
+; uninit %xmm5
+; pxor %xmm5, %xmm5, %xmm5
+; pshufb %xmm0, %xmm5, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -184,9 +183,9 @@ block0(v0: i8):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
-; pinsrb $0, %edi, %xmm0
-; pxor %xmm7, %xmm7
-; pshufb %xmm7, %xmm0
+; movd %edi, %xmm0
+; pxor %xmm5, %xmm5
+; pshufb %xmm5, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; retq
@@ -203,9 +202,8 @@ block0:
 ; movq %rsp, %rbp
 ; block0:
 ; movl $-1, %esi
-; uninit %xmm4
-; pinsrw $0, %xmm4, %rsi, %xmm4
-; pinsrw $1, %xmm4, %rsi, %xmm4
+; movd %esi, %xmm2
+; pshuflw $0, %xmm2, %xmm4
 ; pshufd $0, %xmm4, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
@@ -217,8 +215,8 @@ block0:
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
 ; movl $0xffffffff, %esi
-; pinsrw $0, %esi, %xmm4
-; pinsrw $1, %esi, %xmm4
+; movd %esi, %xmm2
+; pshuflw $0, %xmm2, %xmm4
 ; pshufd $0, %xmm4, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
@@ -234,9 +232,8 @@ block0(v0: i32):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; uninit %xmm3
-; pinsrd $0, %xmm3, %rdi, %xmm3
-; pshufd $0, %xmm3, %xmm0
+; movd %edi, %xmm2
+; pshufd $0, %xmm2, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -246,8 +243,8 @@ block0(v0: i32):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
-; pinsrd $0, %edi, %xmm3
-; pshufd $0, %xmm3, %xmm0
+; movd %edi, %xmm2
+; pshufd $0, %xmm2, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; retq
@@ -262,11 +259,7 @@ block0(v0: f64):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; movdqa %xmm0, %xmm5
-; uninit %xmm0
-; movdqa %xmm5, %xmm6
-; movsd %xmm0, %xmm6, %xmm0
-; movlhps %xmm0, %xmm6, %xmm0
+; movddup %xmm0, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -276,10 +269,7 @@ block0(v0: f64):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
-; movdqa %xmm0, %xmm5
-; movdqa %xmm5, %xmm6
-; movsd %xmm6, %xmm0
-; movlhps %xmm6, %xmm0
+; movddup %xmm0, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; retq


@@ -0,0 +1,334 @@
test compile precise-output
set enable_simd
target x86_64 has_avx
function %splat_i8(i8) -> i8x16 {
block0(v0: i8):
v1 = splat.i8x16 v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movd %edi, %xmm2
; uninit %xmm4
; vpxor %xmm4, %xmm4, %xmm6
; vpshufb %xmm2, %xmm6, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movd %edi, %xmm2
; vpxor %xmm4, %xmm4, %xmm6
; vpshufb %xmm6, %xmm2, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %splat_i16(i16) -> i16x8 {
block0(v0: i16):
v1 = splat.i16x8 v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movd %edi, %xmm2
; vpshuflw $0, %xmm2, %xmm4
; vpshufd $0, %xmm4, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movd %edi, %xmm2
; vpshuflw $0, %xmm2, %xmm4
; vpshufd $0, %xmm4, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %splat_i32(i32) -> i32x4 {
block0(v0: i32):
v1 = splat.i32x4 v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movd %edi, %xmm2
; vpshufd $0, %xmm2, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movd %edi, %xmm2
; vpshufd $0, %xmm2, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %splat_i64(i64) -> i64x2 {
block0(v0: i64):
v1 = splat.i64x2 v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %xmm2
; vmovddup %xmm2, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movq %rdi, %xmm2
; vmovddup %xmm2, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %splat_f32(f32) -> f32x4 {
block0(v0: f32):
v1 = splat.f32x4 v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vshufps $0, %xmm0, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vshufps $0, %xmm0, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %splat_f64(f64) -> f64x2 {
block0(v0: f64):
v1 = splat.f64x2 v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vmovddup %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vmovddup %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %load_splat_i8(i64) -> i8x16 {
block0(v0: i64):
v1 = load.i8 v0
v2 = splat.i8x16 v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; uninit %xmm2
; vpinsrb $0, %xmm2, 0(%rdi), %xmm4
; uninit %xmm6
; vpxor %xmm6, %xmm6, %xmm8
; vpshufb %xmm4, %xmm8, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vpinsrb $0, (%rdi), %xmm2, %xmm4 ; trap: heap_oob
; vpxor %xmm6, %xmm6, %xmm8
; vpshufb %xmm8, %xmm4, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %load_splat_i16(i64) -> i16x8 {
block0(v0: i64):
v1 = load.i16 v0
v2 = splat.i16x8 v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; uninit %xmm2
; vpinsrw $0, %xmm2, 0(%rdi), %xmm4
; vpshuflw $0, %xmm4, %xmm6
; vpshufd $0, %xmm6, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vpinsrw $0, (%rdi), %xmm2, %xmm4 ; trap: heap_oob
; vpshuflw $0, %xmm4, %xmm6
; vpshufd $0, %xmm6, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %load_splat_i32(i64) -> i32x4 {
block0(v0: i64):
v1 = load.i32 v0
v2 = splat.i32x4 v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vbroadcastss 0(%rdi), %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vbroadcastss (%rdi), %xmm0 ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq
function %load_splat_i64(i64) -> i64x2 {
block0(v0: i64):
v1 = load.i64 v0
v2 = splat.i64x2 v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vmovddup 0(%rdi), %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vmovddup (%rdi), %xmm0 ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq
function %load_splat_f32(i64) -> f32x4 {
block0(v0: i64):
v1 = load.f32 v0
v2 = splat.f32x4 v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vbroadcastss 0(%rdi), %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vbroadcastss (%rdi), %xmm0 ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq
function %load_splat_f64(i64) -> f64x2 {
block0(v0: i64):
v1 = load.f64 v0
v2 = splat.f64x2 v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vmovddup 0(%rdi), %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vmovddup (%rdi), %xmm0 ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq


@@ -0,0 +1,318 @@
test compile precise-output
set enable_simd
target x86_64 has_avx has_avx2
function %splat_i8(i8) -> i8x16 {
block0(v0: i8):
v1 = splat.i8x16 v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movd %edi, %xmm2
; vpbroadcastb %xmm2, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movd %edi, %xmm2
; vpbroadcastb %xmm2, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %splat_i16(i16) -> i16x8 {
block0(v0: i16):
v1 = splat.i16x8 v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movd %edi, %xmm2
; vpbroadcastw %xmm2, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movd %edi, %xmm2
; vpbroadcastw %xmm2, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %splat_i32(i32) -> i32x4 {
block0(v0: i32):
v1 = splat.i32x4 v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movd %edi, %xmm2
; vpbroadcastd %xmm2, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movd %edi, %xmm2
; vpbroadcastd %xmm2, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %splat_i64(i64) -> i64x2 {
block0(v0: i64):
v1 = splat.i64x2 v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %xmm2
; vmovddup %xmm2, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movq %rdi, %xmm2
; vmovddup %xmm2, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %splat_f32(f32) -> f32x4 {
block0(v0: f32):
v1 = splat.f32x4 v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vbroadcastss %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vbroadcastss %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %splat_f64(f64) -> f64x2 {
block0(v0: f64):
v1 = splat.f64x2 v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vmovddup %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vmovddup %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %load_splat_i8(i64) -> i8x16 {
block0(v0: i64):
v1 = load.i8 v0
v2 = splat.i8x16 v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vpbroadcastb 0(%rdi), %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vpbroadcastb (%rdi), %xmm0 ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq
function %load_splat_i16(i64) -> i16x8 {
block0(v0: i64):
v1 = load.i16 v0
v2 = splat.i16x8 v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vpbroadcastw 0(%rdi), %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vpbroadcastw (%rdi), %xmm0 ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq
function %load_splat_i32(i64) -> i32x4 {
block0(v0: i64):
v1 = load.i32 v0
v2 = splat.i32x4 v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vbroadcastss 0(%rdi), %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vbroadcastss (%rdi), %xmm0 ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq
function %load_splat_i64(i64) -> i64x2 {
block0(v0: i64):
v1 = load.i64 v0
v2 = splat.i64x2 v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vmovddup 0(%rdi), %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vmovddup (%rdi), %xmm0 ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq
function %load_splat_f32(i64) -> f32x4 {
block0(v0: i64):
v1 = load.f32 v0
v2 = splat.f32x4 v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vbroadcastss 0(%rdi), %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vbroadcastss (%rdi), %xmm0 ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq
function %load_splat_f64(i64) -> f64x2 {
block0(v0: i64):
v1 = load.f64 v0
v2 = splat.f64x2 v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vmovddup 0(%rdi), %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vmovddup (%rdi), %xmm0 ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq


@@ -0,0 +1,338 @@
test compile precise-output
set enable_simd
target x86_64
function %splat_i8(i8) -> i8x16 {
block0(v0: i8):
v1 = splat.i8x16 v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movd %edi, %xmm0
; uninit %xmm5
; pxor %xmm5, %xmm5, %xmm5
; pshufb %xmm0, %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movd %edi, %xmm0
; pxor %xmm5, %xmm5
; pshufb %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %splat_i16(i16) -> i16x8 {
block0(v0: i16):
v1 = splat.i16x8 v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movd %edi, %xmm2
; pshuflw $0, %xmm2, %xmm4
; pshufd $0, %xmm4, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movd %edi, %xmm2
; pshuflw $0, %xmm2, %xmm4
; pshufd $0, %xmm4, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %splat_i32(i32) -> i32x4 {
block0(v0: i32):
v1 = splat.i32x4 v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movd %edi, %xmm2
; pshufd $0, %xmm2, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movd %edi, %xmm2
; pshufd $0, %xmm2, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %splat_i64(i64) -> i64x2 {
block0(v0: i64):
v1 = splat.i64x2 v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %xmm2
; movddup %xmm2, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movq %rdi, %xmm2
; movddup %xmm2, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %splat_f32(f32) -> f32x4 {
block0(v0: f32):
v1 = splat.f32x4 v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; shufps $0, %xmm0, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; shufps $0, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %splat_f64(f64) -> f64x2 {
block0(v0: f64):
v1 = splat.f64x2 v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movddup %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movddup %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %load_splat_i8(i64) -> i8x16 {
block0(v0: i64):
v1 = load.i8 v0
v2 = splat.i8x16 v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; uninit %xmm0
; pinsrb $0, %xmm0, 0(%rdi), %xmm0
; uninit %xmm7
; pxor %xmm7, %xmm7, %xmm7
; pshufb %xmm0, %xmm7, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pinsrb $0, (%rdi), %xmm0 ; trap: heap_oob
; pxor %xmm7, %xmm7
; pshufb %xmm7, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %load_splat_i16(i64) -> i16x8 {
block0(v0: i64):
v1 = load.i16 v0
v2 = splat.i16x8 v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; uninit %xmm3
; pinsrw $0, %xmm3, 0(%rdi), %xmm3
; pshuflw $0, %xmm3, %xmm6
; pshufd $0, %xmm6, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pinsrw $0, (%rdi), %xmm3 ; trap: heap_oob
; pshuflw $0, %xmm3, %xmm6
; pshufd $0, %xmm6, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %load_splat_i32(i64) -> i32x4 {
block0(v0: i64):
v1 = load.i32 v0
v2 = splat.i32x4 v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movss 0(%rdi), %xmm0
; shufps $0, %xmm0, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movss (%rdi), %xmm0 ; trap: heap_oob
; shufps $0, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %load_splat_i64(i64) -> i64x2 {
block0(v0: i64):
v1 = load.i64 v0
v2 = splat.i64x2 v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movddup 0(%rdi), %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movddup (%rdi), %xmm0 ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq
function %load_splat_f32(i64) -> f32x4 {
block0(v0: i64):
v1 = load.f32 v0
v2 = splat.f32x4 v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movss 0(%rdi), %xmm0
; shufps $0, %xmm0, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movss (%rdi), %xmm0 ; trap: heap_oob
; shufps $0, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %load_splat_f64(i64) -> f64x2 {
block0(v0: i64):
v1 = load.f64 v0
v2 = splat.f64x2 v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movddup 0(%rdi), %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movddup (%rdi), %xmm0 ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq


@@ -4,6 +4,8 @@ target aarch64
 target s390x
 set enable_simd
 target x86_64 has_sse3 has_ssse3 has_sse41
+target x86_64 has_sse3 has_ssse3 has_sse41 has_avx
+target x86_64 has_sse3 has_ssse3 has_sse41 has_avx has_avx2
 
 function %splat_i8x16(i8) -> i8x16 {
 block0(v0: i8):