;; x64: lower `x64_pcmpeq $I64X2`, `vany_true` and `vall_true` without SSE 4.1.
;;
;; - inst.isle: `x64_pcmpeq $I64X2` uses `pcmpeqq` only when `use_sse41` holds;
;;   a priority -1 rule builds the result from `pcmpeqd`, `pshufd` and `pand`.
;; - lower.isle: the `ptest`-based `vany_true`/`vall_true` rules become
;;   priority-1 rules guarded by `use_sse41`; the unguarded rules now use
;;   `pmovmskb` followed by a gpr `cmp`/`test`.
;; - runtests: simd-valltrue.clif and simd-vanytrue.clif add an
;;   `x86_64 has_sse41=false` target and give the other x86_64 targets `sse41`.
;;
;; NOTE(review): the `has_sse41=false` target line precedes `set enable_simd`;
;; presumably that ordering is deliberate -- confirm before reordering.
;; NOTE(review): leading indentation inside the hunks was reconstructed; check
;; the `-` lines against the target files before applying.

diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle
index 08d5c45846..b816b163a0 100644
--- a/cranelift/codegen/src/isa/x64/inst.isle
+++ b/cranelift/codegen/src/isa/x64/inst.isle
@@ -4196,7 +4196,19 @@
 (rule (x64_pcmpeq $I8X16 x y) (x64_pcmpeqb x y))
 (rule (x64_pcmpeq $I16X8 x y) (x64_pcmpeqw x y))
 (rule (x64_pcmpeq $I32X4 x y) (x64_pcmpeqd x y))
-(rule (x64_pcmpeq $I64X2 x y) (x64_pcmpeqq x y))
+(rule (x64_pcmpeq $I64X2 x y)
+      (if-let $true (use_sse41))
+      (x64_pcmpeqq x y))
+
+;; Without SSE 4.1 there is no `pcmpeqq`, so a 64-bit lane compare is emulated
+;; with `pcmpeqd`. The `pshufd` immediate 0b10_11_00_01 swaps the two 32-bit
+;; results within each 64-bit lane, and the swapped and unswapped results are
+;; and'd together. A 64-bit lane is all ones only if both of its 32-bit halves
+;; compared equal; otherwise it is all zeros.
+(rule -1 (x64_pcmpeq $I64X2 x y)
+      (let ((cmp32 Xmm (x64_pcmpeqd x y))
+            (cmp32_swapped Xmm (x64_pshufd cmp32 0b10_11_00_01)))
+        (x64_pand cmp32 cmp32_swapped)))
 
 (decl x64_pcmpeqb (Xmm XmmMem) Xmm)
 (rule 0 (x64_pcmpeqb x y) (xmm_rm_r (SseOpcode.Pcmpeqb) x y))
diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle
index 99a58d1e75..ec1e230963 100644
--- a/cranelift/codegen/src/isa/x64/lower.isle
+++ b/cranelift/codegen/src/isa/x64/lower.isle
@@ -3949,17 +3949,40 @@
 
 ;; Rules for `vany_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
+(rule 1 (lower (vany_true val))
+      (if-let $true (use_sse41))
+      (let ((val Xmm val))
+        (with_flags (x64_ptest val val) (x64_setcc (CC.NZ)))))
+
+;; Without SSE 4.1 (no `ptest`): any nonzero byte in `val` means some lane is
+;; true. Compare `val` bytewise with zero and move the per-byte results into a
+;; 16-bit gpr mask. A mask of 0xffff means every byte was zero, so the result
+;; is true exactly when the mask compares not-equal (NZ) to 0xffff.
 (rule (lower (vany_true val))
-      (let ((val Xmm val))
-        (with_flags (x64_ptest val val) (x64_setcc (CC.NZ)))))
+      (let (
+          (any_byte_zero Xmm (x64_pcmpeqb val (xmm_zero $I8X16)))
+          (mask Gpr (x64_pmovmskb (OperandSize.Size32) any_byte_zero))
+        )
+        (with_flags (x64_cmp (OperandSize.Size32) (RegMemImm.Imm 0xffff) mask)
+                    (x64_setcc (CC.NZ)))))
 
 ;; Rules for `vall_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
+(rule 1 (lower (vall_true val @ (value_type ty)))
+      (if-let $true (use_sse41))
+      (let ((src Xmm val)
+            (zeros Xmm (xmm_zero ty))
+            (cmp Xmm (x64_pcmpeq (vec_int_type ty) src zeros)))
+        (with_flags (x64_ptest cmp cmp) (x64_setcc (CC.Z)))))
+
+;; Without SSE 4.1: compare each lane (at the type's lane width) with zero and
+;; collect the results into a gpr mask. A mask of zero means no lane was equal
+;; to zero, i.e. every lane is true.
 (rule (lower (vall_true val @ (value_type ty)))
-      (let ((src Xmm val)
-            (zeros Xmm (xmm_zero ty))
-            (cmp Xmm (x64_pcmpeq (vec_int_type ty) src zeros)))
-        (with_flags (x64_ptest cmp cmp) (x64_setcc (CC.Z)))))
+      (let ((lanes_with_zero Xmm (x64_pcmpeq (vec_int_type ty) val (xmm_zero ty)))
+            (mask Gpr (x64_pmovmskb (OperandSize.Size32) lanes_with_zero)))
+        (with_flags (x64_test (OperandSize.Size32) mask mask)
+                    (x64_setcc (CC.Z)))))
 
 ;; Rules for `vhigh_bits` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
diff --git a/cranelift/filetests/filetests/runtests/simd-valltrue.clif b/cranelift/filetests/filetests/runtests/simd-valltrue.clif
index 98fbe1757d..a9e2ffb343 100644
--- a/cranelift/filetests/filetests/runtests/simd-valltrue.clif
+++ b/cranelift/filetests/filetests/runtests/simd-valltrue.clif
@@ -1,10 +1,11 @@
 test interpret
 test run
+target x86_64 has_sse41=false
 set enable_simd
 target aarch64
 target s390x
-target x86_64
-target x86_64 has_avx
+target x86_64 sse41
+target x86_64 sse41 has_avx
 
 function %vall_true_i8x16(i8x16) -> i8 {
 block0(v0: i8x16):
diff --git a/cranelift/filetests/filetests/runtests/simd-vanytrue.clif b/cranelift/filetests/filetests/runtests/simd-vanytrue.clif
index 8c6aa8cdc0..74404f0ea1 100644
--- a/cranelift/filetests/filetests/runtests/simd-vanytrue.clif
+++ b/cranelift/filetests/filetests/runtests/simd-vanytrue.clif
@@ -1,10 +1,11 @@
 test interpret
 test run
+target x86_64 has_sse41=false
 set enable_simd
 target aarch64
 target s390x
-target x86_64
-target x86_64 has_avx
+target x86_64 sse41
+target x86_64 sse41 has_avx
 
 function %vany_true_i8x16(i8x16) -> i8 {
 block0(v0: i8x16):