x64: Add non-SSE4.1 lowering for pmulld (#6259)
Adds an SSE2-only lowering for `i32x4` multiplication, since the `pmulld` instruction it otherwise relies on only became available in SSE4.1.
This commit is contained in:
@@ -916,8 +916,23 @@
|
||||
(x64_pmullw x y))
|
||||
|
||||
;; `i32x4` multiplication when SSE4.1 is enabled: lower directly to `pmulld`.
(rule (lower (has_type (multi_lane 32 4)
                       (imul lhs rhs)))
      (if-let $true (use_sse41))
      (x64_pmulld lhs rhs))
|
||||
|
||||
;; Fallback for `i32x4` multiplication when `pmulld` is not available. The
;; `pmuludq` instruction is used instead; it performs 32-bit multiplication and
;; stores the full 64-bit result, reading its inputs from lanes 0 and 2. It is
;; therefore issued twice: once on the operands as-is (lanes 0 and 2) and once
;; on copies shuffled so that lanes 1 and 3 sit in positions 0 and 2. The low
;; 32 bits of every 64-bit product are then gathered and the two partial
;; results are interleaved back into the original lane order.
(rule -1 (lower (has_type (multi_lane 32 4) (imul x y)))
      (let (
          (lhs Xmm x)
          (rhs Xmm y)
          ;; Bring lanes 1 and 3 into positions 0 and 2 for the second multiply.
          (lhs_odd Xmm (x64_pshufd lhs 0b00_11_00_01))
          (rhs_odd Xmm (x64_pshufd rhs 0b00_11_00_01))
          ;; Multiply, then pack the low 32 bits of each 64-bit product into
          ;; lanes 0 and 1 (the upper two lanes are don't-care).
          (even_prod Xmm (x64_pshufd (x64_pmuludq lhs rhs) 0b00_00_10_00))
          (odd_prod Xmm (x64_pshufd (x64_pmuludq lhs_odd rhs_odd) 0b00_00_10_00))
        )
        ;; Interleave the low halves: [even0, odd0, even1, odd1], i.e. the
        ;; products for lanes 0, 1, 2, 3.
        (x64_punpckldq even_prod odd_prod)))
|
||||
|
||||
;; With AVX-512 we can implement `i64x2` multiplication with a single
|
||||
;; instruction.
|
||||
(rule 3 (lower (has_type (and (avx512vl_enabled $true)
|
||||
|
||||
@@ -2,6 +2,7 @@ test interpret
|
||||
test run
|
||||
target aarch64
|
||||
target s390x
|
||||
target x86_64 has_sse41=false
|
||||
set enable_simd
|
||||
target x86_64
|
||||
target x86_64 skylake
|
||||
|
||||
Reference in New Issue
Block a user