From d2bb4aa13b7f018854147ed41b272958a3106791 Mon Sep 17 00:00:00 2001
From: Alex Crichton <alex@alexcrichton.com>
Date: Mon, 24 Apr 2023 12:24:19 -0500
Subject: [PATCH] x64: Add non-SSE4.1 lowering for pmulld (#6259)

Adds an SSE2-only lowering for i32x4 multiplication, since the `pmulld`
instruction it otherwise relies on only became available in SSE4.1.
---
 cranelift/codegen/src/isa/x64/lower.isle          | 15 +++++++++++++++
 .../filetests/runtests/simd-arithmetic.clif       |  1 +
 2 files changed, 16 insertions(+)

diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle
index ec1e230963..7a2b76ceb9 100644
--- a/cranelift/codegen/src/isa/x64/lower.isle
+++ b/cranelift/codegen/src/isa/x64/lower.isle
@@ -916,8 +916,23 @@
       (x64_pmullw x y))
 
 (rule (lower (has_type (multi_lane 32 4) (imul x y)))
+      (if-let $true (use_sse41)) ;; `pmulld` first became available in SSE4.1
       (x64_pmulld x y))
 
+;; Without `pmulld` the `pmuludq` instruction is used instead: it multiplies
+;; 32-bit lanes 0 and 2 into two 64-bit results. It runs once on the even and
+;; once on the odd lanes; the low 32 bits of each product are then interleaved.
+(rule -1 (lower (has_type (multi_lane 32 4) (imul x y)))
+         (let (
+            (x Xmm x)
+            (y Xmm y)
+            (x_hi Xmm (x64_pshufd x 0b00_11_00_01)) ;; lanes [1,0,3,0]: odd lanes into 0 and 2
+            (y_hi Xmm (x64_pshufd y 0b00_11_00_01)) ;; same shuffle for the other operand
+            (mul_lo Xmm (x64_pshufd (x64_pmuludq x y)       0b00_00_10_00)) ;; low dwords of products 0,2 -> lanes 0,1
+            (mul_hi Xmm (x64_pshufd (x64_pmuludq x_hi y_hi) 0b00_00_10_00)) ;; low dwords of products 1,3 -> lanes 0,1
+          )
+          (x64_punpckldq mul_lo mul_hi))) ;; interleave low halves -> products in lane order 0,1,2,3
+
 ;; With AVX-512 we can implement `i64x2` multiplication with a single
 ;; instruction.
 (rule 3 (lower (has_type (and (avx512vl_enabled $true)
diff --git a/cranelift/filetests/filetests/runtests/simd-arithmetic.clif b/cranelift/filetests/filetests/runtests/simd-arithmetic.clif
index ca620aa843..59f80c9602 100644
--- a/cranelift/filetests/filetests/runtests/simd-arithmetic.clif
+++ b/cranelift/filetests/filetests/runtests/simd-arithmetic.clif
@@ -2,6 +2,7 @@ test interpret
 test run
 target aarch64
 target s390x
+target x86_64 has_sse41=false
 set enable_simd
 target x86_64
 target x86_64 skylake