x64: Add non-SSE 4.1 lowerings for v{all,any}_true (#6232)
This commit adds lowerings to the x64 backend for two more CLIF instructions that currently require SSE 4.1. These lowerings are inspired by LLVM's lowerings and avoid the use of SSE 4.1 instructions.
This commit is contained in:
@@ -4196,7 +4196,19 @@
|
||||
(rule (x64_pcmpeq $I8X16 x y) (x64_pcmpeqb x y))
|
||||
(rule (x64_pcmpeq $I16X8 x y) (x64_pcmpeqw x y))
|
||||
(rule (x64_pcmpeq $I32X4 x y) (x64_pcmpeqd x y))
|
||||
(rule (x64_pcmpeq $I64X2 x y) (x64_pcmpeqq x y))
|
||||
(rule (x64_pcmpeq $I64X2 x y)
|
||||
(if-let $true (use_sse41))
|
||||
(x64_pcmpeqq x y))
|
||||
|
||||
;; Without SSE 4.1 there's no access to `pcmpeqq`, so it's emulated by comparing
|
||||
;; 32-bit lanes instead. Within each 64-bit lane the upper and lower 32-bit
|
||||
;; comparison results are swapped and the swapped copy is and'd with the
|
||||
;; original. A 64-bit lane is therefore all ones only if both of its 32-bit
|
||||
;; halves compared equal; if either half differed the lane is all zeros.
|
||||
(rule -1 (x64_pcmpeq $I64X2 x y)
|
||||
(let ((cmp32 Xmm (x64_pcmpeqd x y))
|
||||
(cmp32_swapped Xmm (x64_pshufd cmp32 0b10_11_00_01)))
|
||||
(x64_pand cmp32 cmp32_swapped)))
|
||||
|
||||
(decl x64_pcmpeqb (Xmm XmmMem) Xmm)
|
||||
(rule 0 (x64_pcmpeqb x y) (xmm_rm_r (SseOpcode.Pcmpeqb) x y))
|
||||
|
||||
@@ -3949,17 +3949,40 @@
|
||||
|
||||
;; Rules for `vany_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
(rule 1 (lower (vany_true val))
|
||||
(if-let $true (use_sse41))
|
||||
(let ((val Xmm val))
|
||||
(with_flags (x64_ptest val val) (x64_setcc (CC.NZ)))))
|
||||
|
||||
;; Any nonzero byte in `val` means that any lane is true. Compare `val` with a
|
||||
;; zeroed register and extract the high bits to a gpr mask. If the mask is
|
||||
;; 0xffff then every byte was equal to zero, so the result is true exactly when
|
||||
;; the comparison of the mask against 0xffff is not-equal (NZ).
|
||||
(rule (lower (vany_true val))
|
||||
(let ((val Xmm val))
|
||||
(with_flags (x64_ptest val val) (x64_setcc (CC.NZ)))))
|
||||
(let (
|
||||
(any_byte_zero Xmm (x64_pcmpeqb val (xmm_zero $I8X16)))
|
||||
(mask Gpr (x64_pmovmskb (OperandSize.Size32) any_byte_zero))
|
||||
)
|
||||
(with_flags (x64_cmp (OperandSize.Size32) (RegMemImm.Imm 0xffff) mask)
|
||||
(x64_setcc (CC.NZ)))))
|
||||
|
||||
;; Rules for `vall_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
(rule 1 (lower (vall_true val @ (value_type ty)))
|
||||
(if-let $true (use_sse41))
|
||||
(let ((src Xmm val)
|
||||
(zeros Xmm (xmm_zero ty))
|
||||
(cmp Xmm (x64_pcmpeq (vec_int_type ty) src zeros)))
|
||||
(with_flags (x64_ptest cmp cmp) (x64_setcc (CC.Z)))))
|
||||
|
||||
;; Perform an appropriately-sized lane-wise comparison with zero. If the
|
||||
;; result is all 0s then all lanes are true because no lane was equal to
|
||||
;; zero.
|
||||
(rule (lower (vall_true val @ (value_type ty)))
|
||||
(let ((src Xmm val)
|
||||
(zeros Xmm (xmm_zero ty))
|
||||
(cmp Xmm (x64_pcmpeq (vec_int_type ty) src zeros)))
|
||||
(with_flags (x64_ptest cmp cmp) (x64_setcc (CC.Z)))))
|
||||
(let ((lanes_with_zero Xmm (x64_pcmpeq (vec_int_type ty) val (xmm_zero ty)))
|
||||
(mask Gpr (x64_pmovmskb (OperandSize.Size32) lanes_with_zero)))
|
||||
(with_flags (x64_test (OperandSize.Size32) mask mask)
|
||||
(x64_setcc (CC.Z)))))
|
||||
|
||||
;; Rules for `vhigh_bits` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
|
||||
@@ -1,10 +1,11 @@
|
||||
test interpret
|
||||
test run
|
||||
target x86_64 has_sse41=false
|
||||
set enable_simd
|
||||
target aarch64
|
||||
target s390x
|
||||
target x86_64
|
||||
target x86_64 has_avx
|
||||
target x86_64 sse41
|
||||
target x86_64 sse41 has_avx
|
||||
|
||||
function %vall_true_i8x16(i8x16) -> i8 {
|
||||
block0(v0: i8x16):
|
||||
|
||||
@@ -1,10 +1,11 @@
|
||||
test interpret
|
||||
test run
|
||||
target x86_64 has_sse41=false
|
||||
set enable_simd
|
||||
target aarch64
|
||||
target s390x
|
||||
target x86_64
|
||||
target x86_64 has_avx
|
||||
target x86_64 sse41
|
||||
target x86_64 sse41 has_avx
|
||||
|
||||
function %vany_true_i8x16(i8x16) -> i8 {
|
||||
block0(v0: i8x16):
|
||||
|
||||
Reference in New Issue
Block a user