From a2a38edd8a457ec17e96e0260a87690697aef473 Mon Sep 17 00:00:00 2001
From: Alex Crichton
Date: Thu, 20 Apr 2023 16:56:59 -0500
Subject: [PATCH] x64: Add non-SSE 4.1 lowerings for `v{all,any}_true` (#6232)

This commit adds lowerings to the x64 backend for two more CLIF
instructions that currently require SSE 4.1. These lowerings are
inspired by LLVM's lowerings and avoid the use of SSE 4.1 instructions.
---
 cranelift/codegen/src/isa/x64/inst.isle       | 14 +++++++-
 cranelift/codegen/src/isa/x64/lower.isle      | 35 +++++++++++++++----
 .../filetests/runtests/simd-valltrue.clif     |  5 +--
 .../filetests/runtests/simd-vanytrue.clif     |  5 +--
 4 files changed, 48 insertions(+), 11 deletions(-)

diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle
index 08d5c45846..b816b163a0 100644
--- a/cranelift/codegen/src/isa/x64/inst.isle
+++ b/cranelift/codegen/src/isa/x64/inst.isle
@@ -4196,7 +4196,19 @@
 (rule (x64_pcmpeq $I8X16 x y) (x64_pcmpeqb x y))
 (rule (x64_pcmpeq $I16X8 x y) (x64_pcmpeqw x y))
 (rule (x64_pcmpeq $I32X4 x y) (x64_pcmpeqd x y))
-(rule (x64_pcmpeq $I64X2 x y) (x64_pcmpeqq x y))
+(rule (x64_pcmpeq $I64X2 x y)
+      (if-let $true (use_sse41))
+      (x64_pcmpeqq x y))
+
+;; Without SSE 4.1 there's no access to `pcmpeqq`, so it's emulated by comparing
+;; 32-bit lanes instead. The upper and lower halves of the 32-bit comparison are
+;; swapped and then these two results are and'd together. This way only if both
+;; 32-bit values were equal is the result all ones, otherwise the result is
+;; all zeros if either 32-bit comparison was zero.
+(rule -1 (x64_pcmpeq $I64X2 x y)
+      (let ((cmp32 Xmm (x64_pcmpeqd x y))
+            (cmp32_swapped Xmm (x64_pshufd cmp32 0b10_11_00_01)))
+        (x64_pand cmp32 cmp32_swapped)))
 
 (decl x64_pcmpeqb (Xmm XmmMem) Xmm)
 (rule 0 (x64_pcmpeqb x y) (xmm_rm_r (SseOpcode.Pcmpeqb) x y))
diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle
index 99a58d1e75..ec1e230963 100644
--- a/cranelift/codegen/src/isa/x64/lower.isle
+++ b/cranelift/codegen/src/isa/x64/lower.isle
@@ -3949,17 +3949,40 @@
 ;; Rules for `vany_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
+(rule 1 (lower (vany_true val))
+      (if-let $true (use_sse41))
+      (let ((val Xmm val))
+        (with_flags (x64_ptest val val) (x64_setcc (CC.NZ)))))
+
+;; Any nonzero byte in `val` means that any lane is true. Compare `val` with a
+;; zeroed register and extract the high bits to a gpr mask. If the mask is
+;; 0xffff then every byte was equal to zero, so test if the comparison is
+;; not-equal or NZ.
 (rule (lower (vany_true val))
-      (let ((val Xmm val))
-        (with_flags (x64_ptest val val) (x64_setcc (CC.NZ)))))
+      (let (
+        (any_byte_zero Xmm (x64_pcmpeqb val (xmm_zero $I8X16)))
+        (mask Gpr (x64_pmovmskb (OperandSize.Size32) any_byte_zero))
+      )
+      (with_flags (x64_cmp (OperandSize.Size32) (RegMemImm.Imm 0xffff) mask)
+                  (x64_setcc (CC.NZ)))))
 
 ;; Rules for `vall_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
+(rule 1 (lower (vall_true val @ (value_type ty)))
+      (if-let $true (use_sse41))
+      (let ((src Xmm val)
+            (zeros Xmm (xmm_zero ty))
+            (cmp Xmm (x64_pcmpeq (vec_int_type ty) src zeros)))
+        (with_flags (x64_ptest cmp cmp) (x64_setcc (CC.Z)))))
+
+;; Perform an appropriately-sized lane-wise comparison with zero. If the
+;; result is all 0s then all of them are true because nothing was equal to
+;; zero.
 (rule (lower (vall_true val @ (value_type ty)))
-      (let ((src Xmm val)
-            (zeros Xmm (xmm_zero ty))
-            (cmp Xmm (x64_pcmpeq (vec_int_type ty) src zeros)))
-        (with_flags (x64_ptest cmp cmp) (x64_setcc (CC.Z)))))
+      (let ((lanes_with_zero Xmm (x64_pcmpeq (vec_int_type ty) val (xmm_zero ty)))
+            (mask Gpr (x64_pmovmskb (OperandSize.Size32) lanes_with_zero)))
+        (with_flags (x64_test (OperandSize.Size32) mask mask)
+                    (x64_setcc (CC.Z)))))
 
 ;; Rules for `vhigh_bits` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
diff --git a/cranelift/filetests/filetests/runtests/simd-valltrue.clif b/cranelift/filetests/filetests/runtests/simd-valltrue.clif
index 98fbe1757d..a9e2ffb343 100644
--- a/cranelift/filetests/filetests/runtests/simd-valltrue.clif
+++ b/cranelift/filetests/filetests/runtests/simd-valltrue.clif
@@ -1,10 +1,11 @@
 test interpret
 test run
+target x86_64 has_sse41=false
 set enable_simd
 target aarch64
 target s390x
-target x86_64
-target x86_64 has_avx
+target x86_64 sse41
+target x86_64 sse41 has_avx
 
 function %vall_true_i8x16(i8x16) -> i8 {
 block0(v0: i8x16):
diff --git a/cranelift/filetests/filetests/runtests/simd-vanytrue.clif b/cranelift/filetests/filetests/runtests/simd-vanytrue.clif
index 8c6aa8cdc0..74404f0ea1 100644
--- a/cranelift/filetests/filetests/runtests/simd-vanytrue.clif
+++ b/cranelift/filetests/filetests/runtests/simd-vanytrue.clif
@@ -1,10 +1,11 @@
 test interpret
 test run
+target x86_64 has_sse41=false
 set enable_simd
 target aarch64
 target s390x
-target x86_64
-target x86_64 has_avx
+target x86_64 sse41
+target x86_64 sse41 has_avx
 
 function %vany_true_i8x16(i8x16) -> i8 {
 block0(v0: i8x16):
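As an illustration of the `pcmpeqq` emulation described in the inst.isle comment above, here is a rough standalone equivalent using Rust's `std::arch::x86_64` intrinsics. This is only a sketch of the same instruction sequence (`pcmpeqd`, `pshufd`, `pand`); it is not part of the patch, and the function name is invented for the example.

#[cfg(target_arch = "x86_64")]
fn pcmpeqq_without_sse41(
    a: std::arch::x86_64::__m128i,
    b: std::arch::x86_64::__m128i,
) -> std::arch::x86_64::__m128i {
    use std::arch::x86_64::*;
    unsafe {
        // pcmpeqd: each 32-bit lane becomes all ones if equal, all zeros otherwise.
        let cmp32 = _mm_cmpeq_epi32(a, b);
        // pshufd with 0b10_11_00_01 swaps the two 32-bit halves of each 64-bit lane.
        let cmp32_swapped = _mm_shuffle_epi32::<0b10_11_00_01>(cmp32);
        // pand: a 64-bit lane stays all ones only if both of its halves compared equal.
        _mm_and_si128(cmp32, cmp32_swapped)
    }
}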
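The mask-based `vany_true` and `vall_true` lowerings can be read the same way. Below is a standalone sketch, again with Rust's `std::arch::x86_64` intrinsics and invented function names, showing the i8x16 case only; the patch itself handles every integer lane size through `vec_int_type`, and this code is not part of the patch.

#[cfg(target_arch = "x86_64")]
fn vany_true_without_sse41(v: std::arch::x86_64::__m128i) -> bool {
    use std::arch::x86_64::*;
    unsafe {
        // pcmpeqb against zero: bytes of `v` that are zero become 0xff.
        let zero_bytes = _mm_cmpeq_epi8(v, _mm_setzero_si128());
        // pmovmskb: bit i of the mask is set iff byte i of `v` was zero, so a
        // mask of 0xffff means every byte was zero and no lane is true.
        _mm_movemask_epi8(zero_bytes) != 0xffff
    }
}

#[cfg(target_arch = "x86_64")]
fn vall_true_i8x16_without_sse41(v: std::arch::x86_64::__m128i) -> bool {
    use std::arch::x86_64::*;
    unsafe {
        // Lane-wise compare with zero; a set mask bit marks a lane that is zero.
        let lanes_with_zero = _mm_cmpeq_epi8(v, _mm_setzero_si128());
        // All lanes are true exactly when no lane compared equal to zero.
        _mm_movemask_epi8(lanes_with_zero) == 0
    }
}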