x64: Add non-SSE 4.1 lowerings for v{all,any}_true (#6232)

This commit adds lowerings to the x64 backend for two more CLIF
instructions that currently require SSE 4.1. These lowerings are
inspired by LLVM's lowerings and avoid the use of SSE 4.1 instructions.
This commit is contained in:
Alex Crichton
2023-04-20 16:56:59 -05:00
committed by GitHub
parent 60e4a00413
commit a2a38edd8a
4 changed files with 48 additions and 11 deletions

View File

@@ -4196,7 +4196,19 @@
;; Type-directed dispatch for `x64_pcmpeq`: each vector type below is forwarded
;; to the `pcmpeq*` helper whose suffix matches that type's lane width.
(rule (x64_pcmpeq $I8X16 x y) (x64_pcmpeqb x y))
(rule (x64_pcmpeq $I16X8 x y) (x64_pcmpeqw x y))
(rule (x64_pcmpeq $I32X4 x y) (x64_pcmpeqd x y))
(rule (x64_pcmpeq $I64X2 x y) (x64_pcmpeqq x y))
;; 64-bit lanes are compared with `pcmpeqq`, which is only selected when
;; SSE 4.1 is enabled.
(rule (x64_pcmpeq $I64X2 x y)
(if-let $true (use_sse41))
(x64_pcmpeqq x y))
;; `pcmpeqq` is unavailable without SSE 4.1, so equality of 64-bit lanes is
;; assembled from a 32-bit comparison instead: compare the 32-bit lanes, swap
;; the two 32-bit results inside each 64-bit lane, and AND the swapped copy
;; with the original. A 64-bit lane therefore ends up all ones only when both
;; of its 32-bit halves compared equal; if either half differed the whole
;; 64-bit lane is all zeros.
(rule -1 (x64_pcmpeq $I64X2 x y)
      (let (
          ;; Equality of `x` and `y` evaluated per 32-bit lane.
          (eq_halves Xmm (x64_pcmpeqd x y))
          ;; The same result with 32-bit lanes reordered as 1, 0, 3, 2, i.e.
          ;; each pair of halves exchanged.
          (eq_halves_swapped Xmm (x64_pshufd eq_halves 0b10_11_00_01))
        )
        (x64_pand eq_halves eq_halves_swapped)))
(decl x64_pcmpeqb (Xmm XmmMem) Xmm)
(rule 0 (x64_pcmpeqb x y) (xmm_rm_r (SseOpcode.Pcmpeqb) x y))

View File

@@ -3949,17 +3949,40 @@
;; Rules for `vany_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; With SSE 4.1 available, `ptest` the input against itself and materialize the
;; NZ condition, which holds when at least one bit of the input is set.
(rule 1 (lower (vany_true val))
      (if-let $true (use_sse41))
      (let ((input Xmm val))
        (with_flags (x64_ptest input input)
                    (x64_setcc (CC.NZ)))))
;; Fallback used when SSE 4.1 is not available. Any nonzero byte in `val` means
;; that some lane is true. Compare every byte of `val` for equality with a
;; zeroed register and extract the high bits of that comparison into a gpr
;; mask. A mask of 0xffff means every byte of `val` was equal to zero, so the
;; result is true exactly when the mask differs from 0xffff, i.e. when the
;; `cmp` against 0xffff yields the not-equal (NZ) condition.
;;
;; Note: each byte of `any_byte_zero` is set where the corresponding byte of
;; `val` is zero, so it is a per-byte "is zero" mask.
(rule (lower (vany_true val))
(let ((val Xmm val))
(with_flags (x64_ptest val val) (x64_setcc (CC.NZ)))))
(let (
(any_byte_zero Xmm (x64_pcmpeqb val (xmm_zero $I8X16)))
(mask Gpr (x64_pmovmskb (OperandSize.Size32) any_byte_zero))
)
(with_flags (x64_cmp (OperandSize.Size32) (RegMemImm.Imm 0xffff) mask)
(x64_setcc (CC.NZ)))))
;; Rules for `vall_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; With SSE 4.1 available: compare each lane of the input with zero using the
;; integer vector type matching `ty`, then `ptest` that comparison result
;; against itself. The Z condition holds when no lane compared equal to zero,
;; which means every lane is true.
(rule 1 (lower (vall_true val @ (value_type ty)))
      (if-let $true (use_sse41))
      (let ((input Xmm val)
            (zero_vec Xmm (xmm_zero ty))
            (eq_zero Xmm (x64_pcmpeq (vec_int_type ty) input zero_vec)))
        (with_flags (x64_ptest eq_zero eq_zero)
                    (x64_setcc (CC.Z)))))
;; Fallback used when SSE 4.1 is not available. Perform an appropriately-sized
;; lane-wise comparison of `val` with zero and extract the high bits of the
;; comparison result into a gpr mask. If the mask is all 0s then no lane was
;; equal to zero, so every lane is true; `test mask, mask` followed by the Z
;; condition checks exactly that.
(rule (lower (vall_true val @ (value_type ty)))
(let ((src Xmm val)
(zeros Xmm (xmm_zero ty))
(cmp Xmm (x64_pcmpeq (vec_int_type ty) src zeros)))
(with_flags (x64_ptest cmp cmp) (x64_setcc (CC.Z)))))
(let ((lanes_with_zero Xmm (x64_pcmpeq (vec_int_type ty) val (xmm_zero ty)))
(mask Gpr (x64_pmovmskb (OperandSize.Size32) lanes_with_zero)))
(with_flags (x64_test (OperandSize.Size32) mask mask)
(x64_setcc (CC.Z)))))
;; Rules for `vhigh_bits` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

View File

@@ -1,10 +1,11 @@
test interpret
test run
target x86_64 has_sse41=false
set enable_simd
target aarch64
target s390x
target x86_64
target x86_64 has_avx
target x86_64 sse41
target x86_64 sse41 has_avx
function %vall_true_i8x16(i8x16) -> i8 {
block0(v0: i8x16):

View File

@@ -1,10 +1,11 @@
test interpret
test run
target x86_64 has_sse41=false
set enable_simd
target aarch64
target s390x
target x86_64
target x86_64 has_avx
target x86_64 sse41
target x86_64 sse41 has_avx
function %vany_true_i8x16(i8x16) -> i8 {
block0(v0: i8x16):