x64: Improve codegen for splats (#6025)
This commit goes through the lowerings for the CLIF `splat` instruction
and improves the support for each operator. Many of these lowerings are
mirrored from v8/SpiderMonkey and there are a number of improvements:
* AVX2 `v{p,}broadcast*` instructions are added and used when available.
* Float-based splats are much simpler and always a single instruction.
* Integer-based splats don't insert into an uninit xmm value and instead
start out with a `movd` to move into an `xmm` register. This
theoretically breaks dependencies with prior instructions since `movd`
creates a fresh new value in the destination register.
* Loads are now sunk into all of the instructions. A new extractor,
`sinkable_load_exact`, was added to sink the i8/i16 loads.
This commit is contained in:
@@ -919,6 +919,7 @@
|
||||
Pshuflw
|
||||
Pshufhw
|
||||
Pblendw
|
||||
Movddup
|
||||
))
|
||||
|
||||
(type CmpOpcode extern
|
||||
@@ -1292,6 +1293,11 @@
|
||||
Vpextrd
|
||||
Vpextrq
|
||||
Vpblendw
|
||||
Vmovddup
|
||||
Vpbroadcastb
|
||||
Vpbroadcastw
|
||||
Vpbroadcastd
|
||||
Vbroadcastss
|
||||
))
|
||||
|
||||
(type Avx512Opcode extern
|
||||
@@ -1622,6 +1628,9 @@
|
||||
;; Pure, zero-argument term producing a `bool`. Its implementation is external
;; (`extern constructor`), bound to the name `has_avx`.
;; NOTE(review): presumably reports whether AVX is enabled for the target —
;; confirm against the external implementation.
(decl pure has_avx () bool)
|
||||
(extern constructor has_avx has_avx)
|
||||
|
||||
;; Pure, zero-argument term producing a `bool`. Its implementation is external
;; (`extern constructor`), bound to the name `has_avx2`.
;; NOTE(review): presumably reports whether AVX2 is enabled for the target —
;; confirm against the external implementation.
(decl pure has_avx2 () bool)
|
||||
(extern constructor has_avx2 has_avx2)
|
||||
|
||||
;;;; Helpers for Merging and Sinking Immediates/Loads ;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;; Extract a constant `Imm8Reg.Imm8` from a value operand.
|
||||
@@ -1656,9 +1665,21 @@
|
||||
|
||||
;; Extract a `SinkableLoad` that works with `RegMemImm.Mem` from a value
|
||||
;; operand.
|
||||
;;
|
||||
;; Note that this will only work for 32-bit-types-or-larger since this is
|
||||
;; pervasively used with operations that load a minimum of 32-bits. For
|
||||
;; instructions which load exactly the type width necessary use
|
||||
;; `sinkable_load_exact`.
|
||||
;; Declared with a `SinkableLoad` argument and a `Value` result, and bound as
;; an external extractor: it matches on a `Value` and, when the match
;; succeeds, yields the `SinkableLoad`.
(decl sinkable_load (SinkableLoad) Value)
|
||||
(extern extractor sinkable_load sinkable_load)
|
||||
|
||||
;; Same as `sinkable_load` except that all type widths of loads are supported.
|
||||
;;
|
||||
;; Only use this when the instruction which performs the load is guaranteed to
|
||||
;; load the precisely correct size.
|
||||
;; Same term shape as `sinkable_load`: an external extractor that matches on a
;; `Value` and, when the match succeeds, yields a `SinkableLoad`.
(decl sinkable_load_exact (SinkableLoad) Value)
|
||||
(extern extractor sinkable_load_exact sinkable_load_exact)
|
||||
|
||||
;; Sink a `SinkableLoad` into a `SyntheticAmode`.
|
||||
;;
|
||||
;; This is a side-effectful operation that notifies the context that the
|
||||
@@ -1678,6 +1699,9 @@
|
||||
;; Turn a `SinkableLoad` into a `RegMem` by wrapping it in the `RegMem.Mem`
;; variant.
;; NOTE(review): `load` is passed to `RegMem.Mem` directly; this presumably
;; relies on an implicit conversion from `SinkableLoad` to the address-mode
;; type that `RegMem.Mem` holds — confirm.
(decl sink_load_to_reg_mem (SinkableLoad) RegMem)
|
||||
(rule (sink_load_to_reg_mem load) (RegMem.Mem load))
|
||||
|
||||
;; Turn a `SinkableLoad` into a `GprMem`.
;; NOTE(review): the declared result type is `GprMem`, but the rule body
;; constructs `RegMem.Mem`; this presumably relies on an implicit
;; `RegMem` -> `GprMem` conversion defined elsewhere — confirm.
(decl sink_load_to_gpr_mem (SinkableLoad) GprMem)
|
||||
(rule (sink_load_to_gpr_mem load) (RegMem.Mem load))
|
||||
|
||||
;; Turn a `SinkableLoad` into a `RegMemImm` by wrapping it in the
;; `RegMemImm.Mem` variant.
(decl sink_load_to_reg_mem_imm (SinkableLoad) RegMemImm)
|
||||
(rule (sink_load_to_reg_mem_imm load) (RegMemImm.Mem load))
|
||||
|
||||
@@ -4103,6 +4127,34 @@
|
||||
(rule (trap_if_fcmp (FcmpCondResult.OrCondition producer cc1 cc2) tc)
|
||||
(with_flags_side_effect producer (trap_if_or cc1 cc2 tc)))
|
||||
|
||||
;; Helper for creating `movddup` instructions.
;;
;; Takes an `XmmMem` source operand and produces an `Xmm` result. Two rules
;; exist: a default one using `SseOpcode.Movddup`, and a priority-1 one using
;; `AvxOpcode.Vmovddup` that only applies when `has_avx` yields `$true`.
|
||||
(decl x64_movddup (XmmMem) Xmm)
|
||||
;; Default rule (no explicit priority): delegates to
;; `xmm_unary_rm_r_unaligned` with the SSE opcode.
(rule (x64_movddup src)
|
||||
(xmm_unary_rm_r_unaligned (SseOpcode.Movddup) src))
|
||||
;; Priority-1 rule: takes precedence over the default rule when its `if-let`
;; guard matches, and delegates to `xmm_unary_rm_r_vex` with the AVX opcode.
(rule 1 (x64_movddup src)
|
||||
(if-let $true (has_avx))
|
||||
(xmm_unary_rm_r_vex (AvxOpcode.Vmovddup) src))
|
||||
|
||||
;; Helper for creating `vpbroadcastb` instructions.
;;
;; Takes an `XmmMem` source operand and produces an `Xmm` result through a
;; single unconditional rule that delegates to `xmm_unary_rm_r_vex` with
;; `AvxOpcode.Vpbroadcastb`.
;; NOTE(review): this helper has no feature guard of its own; callers
;; presumably check `has_avx2` before using it — confirm at the call sites.
|
||||
(decl x64_vpbroadcastb (XmmMem) Xmm)
|
||||
(rule (x64_vpbroadcastb src)
|
||||
(xmm_unary_rm_r_vex (AvxOpcode.Vpbroadcastb) src))
|
||||
|
||||
;; Helper for creating `vpbroadcastw` instructions.
;;
;; Takes an `XmmMem` source operand and produces an `Xmm` result through a
;; single unconditional rule that delegates to `xmm_unary_rm_r_vex` with
;; `AvxOpcode.Vpbroadcastw`.
;; NOTE(review): this helper has no feature guard of its own; callers
;; presumably check `has_avx2` before using it — confirm at the call sites.
|
||||
(decl x64_vpbroadcastw (XmmMem) Xmm)
|
||||
(rule (x64_vpbroadcastw src)
|
||||
(xmm_unary_rm_r_vex (AvxOpcode.Vpbroadcastw) src))
|
||||
|
||||
;; Helper for creating `vpbroadcastd` instructions.
;;
;; Takes an `XmmMem` source operand and produces an `Xmm` result through a
;; single unconditional rule that delegates to `xmm_unary_rm_r_vex` with
;; `AvxOpcode.Vpbroadcastd`.
;; NOTE(review): this helper has no feature guard of its own; callers
;; presumably check `has_avx2` before using it — confirm at the call sites.
|
||||
(decl x64_vpbroadcastd (XmmMem) Xmm)
|
||||
(rule (x64_vpbroadcastd src)
|
||||
(xmm_unary_rm_r_vex (AvxOpcode.Vpbroadcastd) src))
|
||||
|
||||
;; Helper for creating `vbroadcastss` instructions.
;;
;; Takes an `XmmMem` source operand and produces an `Xmm` result through a
;; single unconditional rule that delegates to `xmm_unary_rm_r_vex` with
;; `AvxOpcode.Vbroadcastss`.
;; NOTE(review): this helper has no feature guard of its own; callers
;; presumably check the required CPU feature before using it — confirm at the
;; call sites.
|
||||
(decl x64_vbroadcastss (XmmMem) Xmm)
|
||||
(rule (x64_vbroadcastss src)
|
||||
(xmm_unary_rm_r_vex (AvxOpcode.Vbroadcastss) src))
|
||||
|
||||
;;;; Jumps ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;; Unconditional jump.
|
||||
@@ -4664,6 +4716,7 @@
|
||||
(convert AtomicRmwOp MachAtomicRmwOp atomic_rmw_op_to_mach_atomic_rmw_op)
|
||||
|
||||
;; Conversions from `SinkableLoad` to each operand type listed below; every
;; conversion names the `sink_load_to_*` term that performs it.
(convert SinkableLoad RegMem sink_load_to_reg_mem)
|
||||
(convert SinkableLoad GprMem sink_load_to_gpr_mem)
|
||||
(convert SinkableLoad RegMemImm sink_load_to_reg_mem_imm)
|
||||
(convert SinkableLoad GprMemImm sink_load_to_gpr_mem_imm)
|
||||
(convert SinkableLoad XmmMem sink_load_to_xmm_mem)
|
||||
|
||||
Reference in New Issue
Block a user