From d2bb4aa13b7f018854147ed41b272958a3106791 Mon Sep 17 00:00:00 2001
From: Alex Crichton
Date: Mon, 24 Apr 2023 12:24:19 -0500
Subject: [PATCH] x64: Add non-SSE4.1 lowering for pmulld (#6259)

Adds an SSE2-only lowering for i32x4 multiplication. The single-instruction
form, `pmulld`, only became available with SSE4.1.
---
 cranelift/codegen/src/isa/x64/lower.isle    | 15 +++++++++++++++
 .../filetests/runtests/simd-arithmetic.clif |  1 +
 2 files changed, 16 insertions(+)

diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle
index ec1e230963..7a2b76ceb9 100644
--- a/cranelift/codegen/src/isa/x64/lower.isle
+++ b/cranelift/codegen/src/isa/x64/lower.isle
@@ -916,8 +916,23 @@
       (x64_pmullw x y))
 
 (rule (lower (has_type (multi_lane 32 4) (imul x y)))
+      (if-let $true (use_sse41))
       (x64_pmulld x y))
 
+;; Without `pmulld` the `pmuludq` instruction is used instead which performs
+;; 32-bit multiplication storing the 64-bit result. The 64-bit result is
+;; truncated to 32-bits and everything else is woven into place.
+(rule -1 (lower (has_type (multi_lane 32 4) (imul x y)))
+      (let (
+          (x Xmm x)
+          (y Xmm y)
+          (x_hi Xmm (x64_pshufd x 0b00_11_00_01))
+          (y_hi Xmm (x64_pshufd y 0b00_11_00_01))
+          (mul_lo Xmm (x64_pshufd (x64_pmuludq x y) 0b00_00_10_00))
+          (mul_hi Xmm (x64_pshufd (x64_pmuludq x_hi y_hi) 0b00_00_10_00))
+        )
+        (x64_punpckldq mul_lo mul_hi)))
+
 ;; With AVX-512 we can implement `i64x2` multiplication with a single
 ;; instruction.
 (rule 3 (lower (has_type (and (avx512vl_enabled $true)
diff --git a/cranelift/filetests/filetests/runtests/simd-arithmetic.clif b/cranelift/filetests/filetests/runtests/simd-arithmetic.clif
index ca620aa843..59f80c9602 100644
--- a/cranelift/filetests/filetests/runtests/simd-arithmetic.clif
+++ b/cranelift/filetests/filetests/runtests/simd-arithmetic.clif
@@ -2,6 +2,7 @@ test interpret
 test run
 target aarch64
 target s390x
+target x86_64 has_sse41=false
 set enable_simd
 target x86_64
 target x86_64 skylake