x64: Begin to lift SSE 4.1 requirement for SIMD support (#6216)
* x64: Change `use_sse41` to a constructor

  This refactors the existing `use_sse41` extractor to instead be a `constructor` to use with `if-let`.

* x64: Gate the `pblendw` instruction on SSE4.1 being enabled

  This specialization of `shuffle` isn't a base case, so adding an `if-let` here should be sufficient for gating this instruction properly on enabled CPU features.

* x64: Gate `pmuldq` lowerings on SSE 4.1

  The specialized rules using these instructions can fall back to the standard lowerings for non-SSE 4.1 instructions.
This commit is contained in:
@@ -1639,8 +1639,8 @@
|
||||
(decl pure use_fma () bool)
|
||||
(extern constructor use_fma use_fma)
|
||||
|
||||
(decl use_sse41 (bool) Type)
|
||||
(extern extractor infallible use_sse41 use_sse41)
|
||||
(decl pure use_sse41 () bool)
|
||||
(extern constructor use_sse41 use_sse41)
|
||||
|
||||
(decl pure use_sse42 () bool)
|
||||
(extern constructor use_sse42 use_sse42)
|
||||
|
||||
@@ -941,7 +941,7 @@
|
||||
;;
|
||||
;; A * B = (Al * Bl) + (((Ah * Bl) + (Al * Bh)) << 32)
|
||||
;;
|
||||
;; Note, the algorithm will use `pmuldq` which operates directly on the lower
|
||||
;; Note, the algorithm will use `pmuludq` which operates directly on the lower
|
||||
;; 32-bit (`Al` or `Bl`) of a lane and writes the result to the full 64-bits of
|
||||
;; the lane of the destination. For this reason we don't need shifts to isolate
|
||||
;; the lower 32-bits, however, we will need to use shifts to isolate the high
|
||||
@@ -999,6 +999,7 @@
|
||||
x))
|
||||
(swiden_high (and (value_type (multi_lane 32 4))
|
||||
y)))))
|
||||
(if-let $true (use_sse41))
|
||||
(let ((x2 Xmm (x64_pshufd x 0xFA))
|
||||
(y2 Xmm (x64_pshufd y 0xFA)))
|
||||
(x64_pmuldq x2 y2)))
|
||||
@@ -1031,6 +1032,7 @@
|
||||
x))
|
||||
(swiden_low (and (value_type (multi_lane 32 4))
|
||||
y)))))
|
||||
(if-let $true (use_sse41))
|
||||
(let ((x2 Xmm (x64_pshufd x 0x50))
|
||||
(y2 Xmm (x64_pshufd y 0x50)))
|
||||
(x64_pmuldq x2 y2)))
|
||||
@@ -3354,84 +3356,100 @@
|
||||
|
||||
;; Rules for `ceil` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
(rule (lower (has_type (use_sse41 $true) (ceil a @ (value_type $F32))))
|
||||
(x64_roundss a (RoundImm.RoundUp)))
|
||||
(rule 1 (lower (ceil a @ (value_type $F32)))
|
||||
(if-let $true (use_sse41))
|
||||
(x64_roundss a (RoundImm.RoundUp)))
|
||||
|
||||
(rule (lower (has_type (use_sse41 $false) (ceil a @ (value_type $F32))))
|
||||
(rule 1 (lower (ceil a @ (value_type $F64)))
|
||||
(if-let $true (use_sse41))
|
||||
(x64_roundsd a (RoundImm.RoundUp)))
|
||||
|
||||
(rule 1 (lower (ceil a @ (value_type $F32X4)))
|
||||
(if-let $true (use_sse41))
|
||||
(x64_roundps a (RoundImm.RoundUp)))
|
||||
|
||||
(rule 1 (lower (ceil a @ (value_type $F64X2)))
|
||||
(if-let $true (use_sse41))
|
||||
(x64_roundpd a (RoundImm.RoundUp)))
|
||||
|
||||
(rule (lower (ceil a @ (value_type $F32)))
|
||||
(libcall_1 (LibCall.CeilF32) a))
|
||||
|
||||
(rule (lower (has_type (use_sse41 $true) (ceil a @ (value_type $F64))))
|
||||
(x64_roundsd a (RoundImm.RoundUp)))
|
||||
|
||||
(rule (lower (has_type (use_sse41 $false) (ceil a @ (value_type $F64))))
|
||||
(rule (lower (ceil a @ (value_type $F64)))
|
||||
(libcall_1 (LibCall.CeilF64) a))
|
||||
|
||||
(rule (lower (has_type (use_sse41 $true) (ceil a @ (value_type $F32X4))))
|
||||
(x64_roundps a (RoundImm.RoundUp)))
|
||||
|
||||
(rule (lower (has_type (use_sse41 $true) (ceil a @ (value_type $F64X2))))
|
||||
(x64_roundpd a (RoundImm.RoundUp)))
|
||||
|
||||
;; Rules for `floor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
(rule (lower (has_type (use_sse41 $true) (floor a @ (value_type $F32))))
|
||||
(x64_roundss a (RoundImm.RoundDown)))
|
||||
(rule 1 (lower (floor a @ (value_type $F32)))
|
||||
(if-let $true (use_sse41))
|
||||
(x64_roundss a (RoundImm.RoundDown)))
|
||||
|
||||
(rule (lower (has_type (use_sse41 $false) (floor a @ (value_type $F32))))
|
||||
(rule 1 (lower (floor a @ (value_type $F64)))
|
||||
(if-let $true (use_sse41))
|
||||
(x64_roundsd a (RoundImm.RoundDown)))
|
||||
|
||||
(rule 1 (lower (floor a @ (value_type $F32X4)))
|
||||
(if-let $true (use_sse41))
|
||||
(x64_roundps a (RoundImm.RoundDown)))
|
||||
|
||||
(rule 1 (lower (floor a @ (value_type $F64X2)))
|
||||
(if-let $true (use_sse41))
|
||||
(x64_roundpd a (RoundImm.RoundDown)))
|
||||
|
||||
(rule (lower (floor a @ (value_type $F32)))
|
||||
(libcall_1 (LibCall.FloorF32) a))
|
||||
|
||||
(rule (lower (has_type (use_sse41 $true) (floor a @ (value_type $F64))))
|
||||
(x64_roundsd a (RoundImm.RoundDown)))
|
||||
|
||||
(rule (lower (has_type (use_sse41 $false) (floor a @ (value_type $F64))))
|
||||
(rule (lower (floor a @ (value_type $F64)))
|
||||
(libcall_1 (LibCall.FloorF64) a))
|
||||
|
||||
(rule (lower (has_type (use_sse41 $true) (floor a @ (value_type $F32X4))))
|
||||
(x64_roundps a (RoundImm.RoundDown)))
|
||||
|
||||
(rule (lower (has_type (use_sse41 $true) (floor a @ (value_type $F64X2))))
|
||||
(x64_roundpd a (RoundImm.RoundDown)))
|
||||
|
||||
;; Rules for `nearest` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
(rule (lower (has_type (use_sse41 $true) (nearest a @ (value_type $F32))))
|
||||
(x64_roundss a (RoundImm.RoundNearest)))
|
||||
(rule 1 (lower (nearest a @ (value_type $F32)))
|
||||
(if-let $true (use_sse41))
|
||||
(x64_roundss a (RoundImm.RoundNearest)))
|
||||
|
||||
(rule (lower (has_type (use_sse41 $false) (nearest a @ (value_type $F32))))
|
||||
(rule 1 (lower (nearest a @ (value_type $F64)))
|
||||
(if-let $true (use_sse41))
|
||||
(x64_roundsd a (RoundImm.RoundNearest)))
|
||||
|
||||
(rule 1 (lower (nearest a @ (value_type $F32X4)))
|
||||
(if-let $true (use_sse41))
|
||||
(x64_roundps a (RoundImm.RoundNearest)))
|
||||
|
||||
(rule 1 (lower (nearest a @ (value_type $F64X2)))
|
||||
(if-let $true (use_sse41))
|
||||
(x64_roundpd a (RoundImm.RoundNearest)))
|
||||
|
||||
(rule (lower (nearest a @ (value_type $F32)))
|
||||
(libcall_1 (LibCall.NearestF32) a))
|
||||
|
||||
(rule (lower (has_type (use_sse41 $true) (nearest a @ (value_type $F64))))
|
||||
(x64_roundsd a (RoundImm.RoundNearest)))
|
||||
|
||||
(rule (lower (has_type (use_sse41 $false) (nearest a @ (value_type $F64))))
|
||||
(rule (lower (nearest a @ (value_type $F64)))
|
||||
(libcall_1 (LibCall.NearestF64) a))
|
||||
|
||||
(rule (lower (has_type (use_sse41 $true) (nearest a @ (value_type $F32X4))))
|
||||
(x64_roundps a (RoundImm.RoundNearest)))
|
||||
|
||||
(rule (lower (has_type (use_sse41 $true) (nearest a @ (value_type $F64X2))))
|
||||
(x64_roundpd a (RoundImm.RoundNearest)))
|
||||
|
||||
;; Rules for `trunc` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
(rule (lower (has_type (use_sse41 $true) (trunc a @ (value_type $F32))))
|
||||
(x64_roundss a (RoundImm.RoundZero)))
|
||||
(rule 1 (lower (trunc a @ (value_type $F32)))
|
||||
(if-let $true (use_sse41))
|
||||
(x64_roundss a (RoundImm.RoundZero)))
|
||||
|
||||
(rule (lower (has_type (use_sse41 $false) (trunc a @ (value_type $F32))))
|
||||
(rule 1 (lower (trunc a @ (value_type $F64)))
|
||||
(if-let $true (use_sse41))
|
||||
(x64_roundsd a (RoundImm.RoundZero)))
|
||||
|
||||
(rule 1 (lower (trunc a @ (value_type $F32X4)))
|
||||
(if-let $true (use_sse41))
|
||||
(x64_roundps a (RoundImm.RoundZero)))
|
||||
|
||||
(rule 1 (lower (trunc a @ (value_type $F64X2)))
|
||||
(if-let $true (use_sse41))
|
||||
(x64_roundpd a (RoundImm.RoundZero)))
|
||||
|
||||
(rule (lower (trunc a @ (value_type $F32)))
|
||||
(libcall_1 (LibCall.TruncF32) a))
|
||||
|
||||
(rule (lower (has_type (use_sse41 $true) (trunc a @ (value_type $F64))))
|
||||
(x64_roundsd a (RoundImm.RoundZero)))
|
||||
|
||||
(rule (lower (has_type (use_sse41 $false) (trunc a @ (value_type $F64))))
|
||||
(rule (lower (trunc a @ (value_type $F64)))
|
||||
(libcall_1 (LibCall.TruncF64) a))
|
||||
|
||||
(rule (lower (has_type (use_sse41 $true) (trunc a @ (value_type $F32X4))))
|
||||
(x64_roundps a (RoundImm.RoundZero)))
|
||||
|
||||
(rule (lower (has_type (use_sse41 $true) (trunc a @ (value_type $F64X2))))
|
||||
(x64_roundpd a (RoundImm.RoundZero)))
|
||||
|
||||
;; Rules for `stack_addr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
(rule (lower (stack_addr stack_slot offset))
|
||||
@@ -3624,6 +3642,7 @@
|
||||
;; 0 chooses the corresponding 16-bit lane from `a` and a bit of 1 chooses the
|
||||
;; corresponding 16-bit lane from `b`.
|
||||
(rule 14 (lower (shuffle a b (pblendw_imm n)))
|
||||
(if-let $true (use_sse41))
|
||||
(x64_pblendw a b n))
|
||||
(decl pblendw_imm (u8) Immediate)
|
||||
(extern extractor pblendw_imm pblendw_imm)
|
||||
|
||||
@@ -225,7 +225,7 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> {
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn use_sse41(&mut self, _: Type) -> bool {
|
||||
fn use_sse41(&mut self) -> bool {
|
||||
self.backend.x64_flags.use_sse41()
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user