x64: Improve codegen for splats (#6025)

This commit goes through the lowerings for the CLIF `splat` instruction
and improves the support for each input type. Many of these lowerings are
mirrored from v8/SpiderMonkey, and there are a number of improvements:

* AVX2 `v{p,}broadcast*` instructions are added and used when available.
* Float-based splats are much simpler and are now always a single
  instruction.
* Integer-based splats don't insert into an uninitialized xmm value and
  instead start with a `movd` to move the integer into an xmm register.
  This theoretically breaks dependencies with prior instructions since
  `movd` creates a fresh new value in the destination register.
* Loads are now sunk into all of these instructions. A new extractor,
  `sinkable_load_exact`, was added to sink the i8/i16 loads; a rough
  sketch of the resulting instruction sequences follows below.
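
As a rough illustration of what these lowerings aim to emit, the sketches
below are hand-written for exposition (Intel syntax; the registers and exact
sequences are assumptions, not output captured from Cranelift):

    ; i32x4.splat of a GPR, baseline SSE2: `movd` writes a fresh value into
    ; xmm0 (no false dependency on its old contents), then `pshufd`
    ; broadcasts lane 0 to all four lanes.
    movd    xmm0, eax
    pshufd  xmm0, xmm0, 0

    ; i16x8.splat, baseline SSE2: two shuffles broadcast the low 16-bit lane.
    movd    xmm0, eax
    pshuflw xmm0, xmm0, 0
    pshufd  xmm0, xmm0, 0

    ; i8x16.splat with AVX2: a single broadcast replaces the shuffle dance.
    vmovd        xmm0, eax
    vpbroadcastb xmm0, xmm0

    ; f32x4.splat: the input is already in an xmm register.
    shufps  xmm0, xmm0, 0

    ; splat of a 32-bit load with AVX: the load sinks into the broadcast
    ; itself (the memory form of `vbroadcastss` needs only AVX, not AVX2).
    vbroadcastss xmm0, dword ptr [rdi]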
Alex Crichton (committed by GitHub), 2023-03-15 16:33:56 -05:00
commit d76f7ee52e, parent a10c50afe9
12 changed files with 1216 additions and 82 deletions

@@ -3915,47 +3915,89 @@
 ;; Rules for `splat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

-(rule (lower (has_type (multi_lane 8 16) (splat src)))
-      (let ((vec Xmm (vec_insert_lane $I8X16 (xmm_uninit_value) src 0))
-            (zeros Xmm (xmm_zero $I8X16)))
-        ;; Shuffle the lowest byte lane to all other lanes.
-        (x64_pshufb vec zeros)))
+;; For all the splat rules below one of the goals is that splatting a value
+;; doesn't end up accidentally depending on the previous value in a register.
+;; This means that instructions are chosen to avoid false dependencies where
+;; new values are created fresh or otherwise overwrite previous register
+;; contents where possible.
+;;
+;; Additionally splats are specialized to special-case load-and-splat which
+;; has a number of micro-optimizations available.

-(rule (lower (has_type (multi_lane 16 8) (splat src)))
-      (let (;; Force the input into a register so that we don't create a
-            ;; VCodeConstant.
-            (src RegMem (RegMem.Reg src))
-            (vec Xmm (vec_insert_lane $I16X8 (xmm_uninit_value) src 0))
-            (vec Xmm (vec_insert_lane $I16X8 vec src 1)))
-        ;; Shuffle the lowest two lanes to all other lanes.
-        (x64_pshufd vec 0)))
+;; i8x16 splats: use `vpbroadcastb` on AVX2 and otherwise `pshufb` broadcasts
+;; with a mask of zero which is calculated with an xor-against-itself register.
+(rule 0 (lower (has_type $I8X16 (splat src)))
+      (x64_pshufb (bitcast_gpr_to_xmm $I32 src) (xmm_zero $I8X16)))
+(rule 1 (lower (has_type $I8X16 (splat src)))
+      (if-let $true (has_avx2))
+      (x64_vpbroadcastb (bitcast_gpr_to_xmm $I32 src)))
+(rule 2 (lower (has_type $I8X16 (splat (sinkable_load_exact addr))))
+      (x64_pshufb (x64_pinsrb (xmm_uninit_value) addr 0) (xmm_zero $I8X16)))
+(rule 3 (lower (has_type $I8X16 (splat (sinkable_load_exact addr))))
+      (if-let $true (has_avx2))
+      (x64_vpbroadcastb addr))

-(rule 1 (lower (has_type (multi_lane 32 4) (splat src @ (value_type (ty_scalar_float _)))))
-      (lower_splat_32x4 $F32X4 src))
+;; i16x8 splats: use `vpbroadcastw` on AVX2 and otherwise a 16-bit value is
+;; loaded into an xmm register, `pshuflw` broadcasts the low 16-bit lane
+;; to the low four lanes, and `pshufd` broadcasts the low 32-bit lane (which
+;; at that point is two of the 16-bit values we want to broadcast) to all the
+;; lanes.
+(rule 0 (lower (has_type $I16X8 (splat src)))
+      (x64_pshufd (x64_pshuflw (bitcast_gpr_to_xmm $I32 src) 0) 0))
+(rule 1 (lower (has_type $I16X8 (splat src)))
+      (if-let $true (has_avx2))
+      (x64_vpbroadcastw (bitcast_gpr_to_xmm $I32 src)))
+(rule 2 (lower (has_type $I16X8 (splat (sinkable_load_exact addr))))
+      (x64_pshufd (x64_pshuflw (x64_pinsrw (xmm_uninit_value) addr 0) 0) 0))
+(rule 3 (lower (has_type $I16X8 (splat (sinkable_load_exact addr))))
+      (if-let $true (has_avx2))
+      (x64_vpbroadcastw addr))

-(rule (lower (has_type (multi_lane 32 4) (splat src)))
-      (lower_splat_32x4 $I32X4 src))
+;; i32x4.splat - use `vpbroadcastd` on AVX2 and otherwise `pshufd` can be
+;; used to broadcast the low lane to all other lanes.
+;;
+;; Note that sinkable-load cases come later.
+(rule 0 (lower (has_type $I32X4 (splat src)))
+      (x64_pshufd (bitcast_gpr_to_xmm $I32 src) 0))
+(rule 1 (lower (has_type $I32X4 (splat src)))
+      (if-let $true (has_avx2))
+      (x64_vpbroadcastd (bitcast_gpr_to_xmm $I32 src)))

-(decl lower_splat_32x4 (Type Value) Xmm)
-(rule (lower_splat_32x4 ty src)
-      (let ((src RegMem src)
-            (vec Xmm (vec_insert_lane ty (xmm_uninit_value) src 0)))
-        ;; Shuffle the lowest lane to all other lanes.
-        (x64_pshufd vec 0)))
+;; f32x4.splat - the source is already in an xmm register so `shufps` is all
+;; that's necessary to complete the splat. This is specialized to `vbroadcastss`
+;; on AVX2 to leverage that specific instruction for this operation.
+(rule 0 (lower (has_type $F32X4 (splat src)))
+      (let ((tmp Xmm src))
+        (x64_shufps src src 0)))
+(rule 1 (lower (has_type $F32X4 (splat src)))
+      (if-let $true (has_avx2))
+      (x64_vbroadcastss src))

-(rule 1 (lower (has_type (multi_lane 64 2) (splat src @ (value_type (ty_scalar_float _)))))
-      (lower_splat_64x2 $F64X2 src))
+;; t32x4.splat of a load - use a `movss` to load into an xmm register and then
+;; `shufps` broadcasts to the other lanes. Note that this is used for both i32
+;; and f32 splats.
+;;
+;; With AVX the `vbroadcastss` instruction suits this purpose precisely. Note
+;; that the memory-operand encoding of `vbroadcastss` is usable with AVX, but
+;; the register-based encoding is only available with AVX2. With the
+;; `sinkable_load` extractor this should be guaranteed to use the memory-based
+;; encoding, hence the `has_avx` test.
+(rule 4 (lower (has_type (multi_lane 32 4) (splat (sinkable_load addr))))
+      (let ((tmp Xmm (x64_movss_load addr)))
+        (x64_shufps tmp tmp 0)))
+(rule 5 (lower (has_type (multi_lane 32 4) (splat (sinkable_load addr))))
+      (if-let $true (has_avx))
+      (x64_vbroadcastss addr))

-(rule (lower (has_type (multi_lane 64 2) (splat src)))
-      (lower_splat_64x2 $I64X2 src))
-(decl lower_splat_64x2 (Type Value) Xmm)
-(rule (lower_splat_64x2 ty src)
-      (let (;; Force the input into a register so that we don't create a
-            ;; VCodeConstant.
-            (src RegMem (RegMem.Reg src))
-            (vec Xmm (vec_insert_lane ty (xmm_uninit_value) src 0)))
-        (vec_insert_lane ty vec src 1)))
+;; t64x2.splat - use `movddup` which is exactly what we want, and there's a
+;; minor specialization for sinkable loads to avoid going through a gpr for
+;; i64 splats.
+(rule 0 (lower (has_type $I64X2 (splat src)))
+      (x64_movddup (bitcast_gpr_to_xmm $I64 src)))
+(rule 0 (lower (has_type $F64X2 (splat src)))
+      (x64_movddup src))
+(rule 5 (lower (has_type (multi_lane 64 2) (splat (sinkable_load addr))))
+      (x64_movddup addr))

 ;; Rules for `vany_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;