x64: Add non-SSE4.1 lowering for pmulld (#6259)

Adds an SSE2-compatible lowering for i32x4 multiplication, since the
`pmulld` instruction it otherwise relies on only became available in SSE4.1.
This commit is contained in:
Alex Crichton
2023-04-24 12:24:19 -05:00
committed by GitHub
parent c9a9c2c191
commit d2bb4aa13b
2 changed files with 16 additions and 0 deletions

View File

@@ -916,8 +916,23 @@
(x64_pmullw x y))
;; i32x4 multiplication maps directly onto `pmulld`, which requires SSE4.1.
(rule (lower (has_type (multi_lane 32 4) (imul lhs rhs)))
      (if-let $true (use_sse41))
      (x64_pmulld lhs rhs))
;; Fallback for targets without `pmulld`, built from `pmuludq` instead.
;; `pmuludq` multiplies the 32-bit values held in lanes 0 and 2 of its
;; operands and yields two 64-bit products, so it is issued twice: once on
;; the inputs as-is (covering lanes 0 and 2) and once on copies that have
;; lanes 1 and 3 shuffled into positions 0 and 2. The low 32 bits of every
;; product are then gathered and interleaved back into lane order. The
;; priority of -1 keeps this rule below the `pmulld` rule.
(rule -1 (lower (has_type (multi_lane 32 4) (imul x y)))
      (let (
            (lhs Xmm x)
            (rhs Xmm y)

            ;; Lane order [1, 0, 3, 0]: the odd lanes land where `pmuludq`
            ;; reads its inputs.
            (lhs_odd Xmm (x64_pshufd lhs 0b00_11_00_01))
            (rhs_odd Xmm (x64_pshufd rhs 0b00_11_00_01))

            ;; 64-bit products of lanes 0 and 2; the shuffle then packs the
            ;; low 32 bits of each product into lanes 0 and 1.
            (even_wide Xmm (x64_pmuludq lhs rhs))
            (even_lo32 Xmm (x64_pshufd even_wide 0b00_00_10_00))

            ;; Same again for the products of the original lanes 1 and 3.
            (odd_wide Xmm (x64_pmuludq lhs_odd rhs_odd))
            (odd_lo32 Xmm (x64_pshufd odd_wide 0b00_00_10_00))
           )
        ;; Interleave the low dwords: [even0, odd0, even1, odd1], i.e. the
        ;; truncated products of lanes 0, 1, 2, 3 in order.
        (x64_punpckldq even_lo32 odd_lo32)))
;; With AVX-512 we can implement `i64x2` multiplication with a single
;; instruction.
(rule 3 (lower (has_type (and (avx512vl_enabled $true)

View File

@@ -2,6 +2,7 @@ test interpret
test run
target aarch64
target s390x
target x86_64 has_sse41=false
set enable_simd
target x86_64
target x86_64 skylake