x64: Begin to lift SSE 4.1 requirement for SIMD support (#6216)

* x64: Change `use_sse41` to a constructor

This refactors the existing `use_sse41` extractor to instead be a
`constructor` to use with `if-let`.

* x64: Gate the `pblendw` instruction on SSE4.1 being enabled

This specialization of `shuffle` isn't a base case so adding an `if-let`
here should be sufficient for gating this instruction properly on
enabled CPU features.

* x64: Gate `pmuldq` lowerings on SSE 4.1

The specialized rules using these instructions can fall back to the
standard lowerings when SSE 4.1 is not available.
This commit is contained in:
Alex Crichton
2023-04-17 11:09:58 -05:00
committed by GitHub
parent 85118c8c26
commit 9a4bd7c6df
3 changed files with 75 additions and 56 deletions

View File

@@ -1639,8 +1639,8 @@
(decl pure use_fma () bool)
(extern constructor use_fma use_fma)
(decl use_sse41 (bool) Type)
(extern extractor infallible use_sse41 use_sse41)
(decl pure use_sse41 () bool)
(extern constructor use_sse41 use_sse41)
(decl pure use_sse42 () bool)
(extern constructor use_sse42 use_sse42)

View File

@@ -941,7 +941,7 @@
;;
;; A * B = (Al * Bl) + ((Ah * Bl) + (Al * Bh)) << 32
;;
;; Note, the algorithm will use `pmuldq` which operates directly on the lower
;; Note, the algorithm will use `pmuludq` which operates directly on the lower
;; 32-bit (`Al` or `Bl`) of a lane and writes the result to the full 64-bits of
;; the lane of the destination. For this reason we don't need shifts to isolate
;; the lower 32-bits, however, we will need to use shifts to isolate the high
@@ -999,6 +999,7 @@
x))
(swiden_high (and (value_type (multi_lane 32 4))
y)))))
(if-let $true (use_sse41))
(let ((x2 Xmm (x64_pshufd x 0xFA))
(y2 Xmm (x64_pshufd y 0xFA)))
(x64_pmuldq x2 y2)))
@@ -1031,6 +1032,7 @@
x))
(swiden_low (and (value_type (multi_lane 32 4))
y)))))
(if-let $true (use_sse41))
(let ((x2 Xmm (x64_pshufd x 0x50))
(y2 Xmm (x64_pshufd y 0x50)))
(x64_pmuldq x2 y2)))
@@ -3354,84 +3356,100 @@
;; Rules for `ceil` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type (use_sse41 $true) (ceil a @ (value_type $F32))))
(x64_roundss a (RoundImm.RoundUp)))
(rule 1 (lower (ceil a @ (value_type $F32)))
(if-let $true (use_sse41))
(x64_roundss a (RoundImm.RoundUp)))
(rule (lower (has_type (use_sse41 $false) (ceil a @ (value_type $F32))))
(rule 1 (lower (ceil a @ (value_type $F64)))
(if-let $true (use_sse41))
(x64_roundsd a (RoundImm.RoundUp)))
(rule 1 (lower (ceil a @ (value_type $F32X4)))
(if-let $true (use_sse41))
(x64_roundps a (RoundImm.RoundUp)))
(rule 1 (lower (ceil a @ (value_type $F64X2)))
(if-let $true (use_sse41))
(x64_roundpd a (RoundImm.RoundUp)))
(rule (lower (ceil a @ (value_type $F32)))
(libcall_1 (LibCall.CeilF32) a))
(rule (lower (has_type (use_sse41 $true) (ceil a @ (value_type $F64))))
(x64_roundsd a (RoundImm.RoundUp)))
(rule (lower (has_type (use_sse41 $false) (ceil a @ (value_type $F64))))
(rule (lower (ceil a @ (value_type $F64)))
(libcall_1 (LibCall.CeilF64) a))
(rule (lower (has_type (use_sse41 $true) (ceil a @ (value_type $F32X4))))
(x64_roundps a (RoundImm.RoundUp)))
(rule (lower (has_type (use_sse41 $true) (ceil a @ (value_type $F64X2))))
(x64_roundpd a (RoundImm.RoundUp)))
;; Rules for `floor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type (use_sse41 $true) (floor a @ (value_type $F32))))
(x64_roundss a (RoundImm.RoundDown)))
(rule 1 (lower (floor a @ (value_type $F32)))
(if-let $true (use_sse41))
(x64_roundss a (RoundImm.RoundDown)))
(rule (lower (has_type (use_sse41 $false) (floor a @ (value_type $F32))))
(rule 1 (lower (floor a @ (value_type $F64)))
(if-let $true (use_sse41))
(x64_roundsd a (RoundImm.RoundDown)))
(rule 1 (lower (floor a @ (value_type $F32X4)))
(if-let $true (use_sse41))
(x64_roundps a (RoundImm.RoundDown)))
(rule 1 (lower (floor a @ (value_type $F64X2)))
(if-let $true (use_sse41))
(x64_roundpd a (RoundImm.RoundDown)))
(rule (lower (floor a @ (value_type $F32)))
(libcall_1 (LibCall.FloorF32) a))
(rule (lower (has_type (use_sse41 $true) (floor a @ (value_type $F64))))
(x64_roundsd a (RoundImm.RoundDown)))
(rule (lower (has_type (use_sse41 $false) (floor a @ (value_type $F64))))
(rule (lower (floor a @ (value_type $F64)))
(libcall_1 (LibCall.FloorF64) a))
(rule (lower (has_type (use_sse41 $true) (floor a @ (value_type $F32X4))))
(x64_roundps a (RoundImm.RoundDown)))
(rule (lower (has_type (use_sse41 $true) (floor a @ (value_type $F64X2))))
(x64_roundpd a (RoundImm.RoundDown)))
;; Rules for `nearest` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type (use_sse41 $true) (nearest a @ (value_type $F32))))
(x64_roundss a (RoundImm.RoundNearest)))
(rule 1 (lower (nearest a @ (value_type $F32)))
(if-let $true (use_sse41))
(x64_roundss a (RoundImm.RoundNearest)))
(rule (lower (has_type (use_sse41 $false) (nearest a @ (value_type $F32))))
(rule 1 (lower (nearest a @ (value_type $F64)))
(if-let $true (use_sse41))
(x64_roundsd a (RoundImm.RoundNearest)))
(rule 1 (lower (nearest a @ (value_type $F32X4)))
(if-let $true (use_sse41))
(x64_roundps a (RoundImm.RoundNearest)))
(rule 1 (lower (nearest a @ (value_type $F64X2)))
(if-let $true (use_sse41))
(x64_roundpd a (RoundImm.RoundNearest)))
(rule (lower (nearest a @ (value_type $F32)))
(libcall_1 (LibCall.NearestF32) a))
(rule (lower (has_type (use_sse41 $true) (nearest a @ (value_type $F64))))
(x64_roundsd a (RoundImm.RoundNearest)))
(rule (lower (has_type (use_sse41 $false) (nearest a @ (value_type $F64))))
(rule (lower (nearest a @ (value_type $F64)))
(libcall_1 (LibCall.NearestF64) a))
(rule (lower (has_type (use_sse41 $true) (nearest a @ (value_type $F32X4))))
(x64_roundps a (RoundImm.RoundNearest)))
(rule (lower (has_type (use_sse41 $true) (nearest a @ (value_type $F64X2))))
(x64_roundpd a (RoundImm.RoundNearest)))
;; Rules for `trunc` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type (use_sse41 $true) (trunc a @ (value_type $F32))))
(x64_roundss a (RoundImm.RoundZero)))
(rule 1 (lower (trunc a @ (value_type $F32)))
(if-let $true (use_sse41))
(x64_roundss a (RoundImm.RoundZero)))
(rule (lower (has_type (use_sse41 $false) (trunc a @ (value_type $F32))))
(rule 1 (lower (trunc a @ (value_type $F64)))
(if-let $true (use_sse41))
(x64_roundsd a (RoundImm.RoundZero)))
(rule 1 (lower (trunc a @ (value_type $F32X4)))
(if-let $true (use_sse41))
(x64_roundps a (RoundImm.RoundZero)))
(rule 1 (lower (trunc a @ (value_type $F64X2)))
(if-let $true (use_sse41))
(x64_roundpd a (RoundImm.RoundZero)))
(rule (lower (trunc a @ (value_type $F32)))
(libcall_1 (LibCall.TruncF32) a))
(rule (lower (has_type (use_sse41 $true) (trunc a @ (value_type $F64))))
(x64_roundsd a (RoundImm.RoundZero)))
(rule (lower (has_type (use_sse41 $false) (trunc a @ (value_type $F64))))
(rule (lower (trunc a @ (value_type $F64)))
(libcall_1 (LibCall.TruncF64) a))
(rule (lower (has_type (use_sse41 $true) (trunc a @ (value_type $F32X4))))
(x64_roundps a (RoundImm.RoundZero)))
(rule (lower (has_type (use_sse41 $true) (trunc a @ (value_type $F64X2))))
(x64_roundpd a (RoundImm.RoundZero)))
;; Rules for `stack_addr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (stack_addr stack_slot offset))
@@ -3624,6 +3642,7 @@
;; 0 chooses the corresponding 16-bit lane from `a` and a bit of 1 chooses the
;; corresponding 16-bit lane from `b`.
(rule 14 (lower (shuffle a b (pblendw_imm n)))
(if-let $true (use_sse41))
(x64_pblendw a b n))
(decl pblendw_imm (u8) Immediate)
(extern extractor pblendw_imm pblendw_imm)

View File

@@ -225,7 +225,7 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> {
}
#[inline]
fn use_sse41(&mut self, _: Type) -> bool {
fn use_sse41(&mut self) -> bool {
self.backend.x64_flags.use_sse41()
}