x64: Lower extractlane, scalar_to_vector, and splat in ISLE (#4780)
Lower extractlane, scalar_to_vector, and splat in ISLE. This PR also makes some changes to the SinkableLoad API: (1) change the return type of sink_load to RegMem, as more functions are available for dealing with RegMem; (2) add reg_mem_to_reg_mem_imm and register it as an automatic conversion.
This commit is contained in:
@@ -3547,3 +3547,99 @@
|
||||
mask
|
||||
(x64_xmm_load_const $I8X16 (swizzle_zero_mask)))))
|
||||
(x64_pshufb src mask)))
|
||||
|
||||
;; Rules for `extractlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Remove the extractlane instruction, leaving the float where it is. The upper
;; bits will remain unchanged; for correctness, this relies on Cranelift type
;; checking to avoid using those bits.
(rule (lower (has_type (ty_scalar_float _) (extractlane val (u8_from_uimm8 0))))
      val)
|
||||
|
||||
;; Cases 2-4 for an F32X4: shuffle the requested lane down into lane 0 with
;; PSHUFD. The immediate's low two bits select the source lane for lane 0;
;; the remaining lanes are don't-cares for a scalar F32 result.
(rule (lower (has_type $F32 (extractlane val @ (value_type (ty_vec128 ty))
                                         (u8_from_uimm8 lane))))
      (x64_pshufd val lane (OperandSize.Size32)))
|
||||
|
||||
;; This is the only remaining case for F64X2: lane 0 was handled by the
;; float pass-through rule above, so only lane 1 needs a shuffle.
(rule (lower (has_type $F64 (extractlane val @ (value_type (ty_vec128 ty))
                                         (u8_from_uimm8 1))))
      ;; 0xee == 0b11_10_11_10: moves the upper 64 bits into the lower 64 bits.
      (x64_pshufd val 0xee (OperandSize.Size32)))
|
||||
|
||||
;; Extract a byte lane with PEXTRB (zero-extends into the destination GPR).
(rule (lower (extractlane val @ (value_type ty @ (multi_lane 8 16)) (u8_from_uimm8 lane)))
      (x64_pextrb ty val lane))
|
||||
|
||||
;; Extract a 16-bit lane with PEXTRW (zero-extends into the destination GPR).
(rule (lower (extractlane val @ (value_type ty @ (multi_lane 16 8)) (u8_from_uimm8 lane)))
      (x64_pextrw ty val lane))
|
||||
|
||||
;; Extract a 32-bit lane with PEXTRD.
(rule (lower (extractlane val @ (value_type ty @ (multi_lane 32 4)) (u8_from_uimm8 lane)))
      (x64_pextrd ty val lane))
|
||||
|
||||
;; Extract a 64-bit lane; x64_pextrd selects the PEXTRQ encoding from `ty`.
(rule (lower (extractlane val @ (value_type ty @ (multi_lane 64 2)) (u8_from_uimm8 lane)))
      (x64_pextrd ty val lane))
|
||||
|
||||
;; Rules for `scalar_to_vector` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Case 1: when moving a scalar float, we simply move from one XMM register
;; to another, expecting the register allocator to elide this. Here we
;; assume that the upper bits of a scalar float have not been munged with
;; (the same assumption the old backend makes).
(rule (lower (scalar_to_vector src @ (value_type (ty_scalar_float _))))
      src)
|
||||
|
||||
;; Case 2: when moving a scalar value of any other type, use MOVD to zero
;; the upper lanes.
(rule (lower (scalar_to_vector src @ (value_type ty)))
      (bitcast_gpr_to_xmm ty src))
|
||||
|
||||
;; Case 3: when presented with `load + scalar_to_vector`, coalesce into a single
;; MOVSS/MOVSD instruction. These loads zero the upper lanes of the XMM
;; destination, giving the semantics `scalar_to_vector` requires.
(rule (lower (scalar_to_vector (and (sinkable_load src) (value_type (ty_32 _)))))
      (x64_movss_load (sink_load_to_xmm_mem src)))
(rule (lower (scalar_to_vector (and (sinkable_load src) (value_type (ty_64 _)))))
      (x64_movsd_load (sink_load_to_xmm_mem src)))
|
||||
|
||||
;; Rules for `splat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; For 8-bit lanes: insert the scalar into lane 0, then PSHUFB with an
;; all-zeros mask, which broadcasts byte 0 to every lane.
(rule (lower (has_type (multi_lane 8 16) (splat src)))
      (let ((vec Xmm (vec_insert_lane $I8X16 (xmm_uninit_value) src 0))
            (zeros Xmm (x64_pxor vec vec)))
        ;; Shuffle the lowest byte lane to all other lanes.
        (x64_pshufb vec zeros)))
|
||||
|
||||
;; For 16-bit lanes: insert the scalar into lanes 0 and 1 (filling the low
;; 32 bits), then PSHUFD broadcasts that 32-bit group to all four positions.
(rule (lower (has_type (multi_lane 16 8) (splat src)))
      (let (;; Force the input into a register so that we don't create a
            ;; VCodeConstant.
            (src RegMem (RegMem.Reg src))
            (vec Xmm (vec_insert_lane $I16X8 (xmm_uninit_value) src 0))
            (vec Xmm (vec_insert_lane $I16X8 vec src 1)))
        ;; Shuffle the lowest two lanes to all other lanes.
        (x64_pshufd vec 0 (OperandSize.Size32))))
|
||||
|
||||
;; 32-bit-lane splats dispatch to a shared helper; the float rule must come
;; first so an F32 source is tagged with the float vector type.
(rule (lower (has_type (multi_lane 32 4) (splat src @ (value_type (ty_scalar_float _)))))
      (lower_splat_32x4 $F32X4 src))

(rule (lower (has_type (multi_lane 32 4) (splat src)))
      (lower_splat_32x4 $I32X4 src))
|
||||
|
||||
;; Shared lowering for 32-bit-lane splats: insert the scalar into lane 0,
;; then broadcast it with PSHUFD imm=0 (all four lanes select source lane 0).
(decl lower_splat_32x4 (Type Value) Xmm)
(rule (lower_splat_32x4 ty src)
      (let ((src RegMem src)
            (vec Xmm (vec_insert_lane ty (xmm_uninit_value) src 0)))
        ;; Shuffle the lowest lane to all other lanes.
        (x64_pshufd vec 0 (OperandSize.Size32))))
|
||||
|
||||
;; 64-bit-lane splats dispatch to a shared helper; the float rule must come
;; first so an F64 source is tagged with the float vector type.
(rule (lower (has_type (multi_lane 64 2) (splat src @ (value_type (ty_scalar_float _)))))
      (lower_splat_64x2 $F64X2 src))

(rule (lower (has_type (multi_lane 64 2) (splat src)))
      (lower_splat_64x2 $I64X2 src))
|
||||
|
||||
;; Shared lowering for 64-bit-lane splats: insert the scalar into both lanes.
;; Only two lanes exist, so two inserts suffice and no shuffle is needed.
(decl lower_splat_64x2 (Type Value) Xmm)
(rule (lower_splat_64x2 ty src)
      (let (;; Force the input into a register so that we don't create a
            ;; VCodeConstant.
            (src RegMem (RegMem.Reg src))
            (vec Xmm (vec_insert_lane ty (xmm_uninit_value) src 0)))
        (vec_insert_lane ty vec src 1)))
|
||||
|
||||
Reference in New Issue
Block a user