x64: Lower SIMD requirement to SSE4.1 from SSE4.2 (#6206)
Cranelift's SIMD support has only one instruction that depends on SSE4.2, so this commit adds a lowering rule for `pcmpgtq` which doesn't use SSE4.2, enabling the baseline requirement for SIMD support to be lowered from SSE4.2 to SSE4.1. The `has_sse42` setting is no longer enabled by default for Cranelift, and `enable_simd` no longer requires `has_sse42` on x64. Finally, the fuzz generator for Wasmtime codegen settings now flips the `has_sse42` setting instead of unconditionally setting it to `true`. The specific lowering for `pcmpgtq` is copied from LLVM's lowering of this instruction.
@@ -1642,6 +1642,9 @@
(decl use_sse41 (bool) Type)
(extern extractor infallible use_sse41 use_sse41)

(decl pure use_sse42 () bool)
(extern constructor use_sse42 use_sse42)

(decl pure use_avx_simd () bool)
(extern constructor use_avx_simd use_avx_simd)
@@ -4214,7 +4217,51 @@
(rule (x64_pcmpgt $I8X16 x y) (x64_pcmpgtb x y))
(rule (x64_pcmpgt $I16X8 x y) (x64_pcmpgtw x y))
(rule (x64_pcmpgt $I32X4 x y) (x64_pcmpgtd x y))
(rule (x64_pcmpgt $I64X2 x y) (x64_pcmpgtq x y))

;; SSE4.2 provides a single instruction for this lowering, but prior to that
;; it's a bit more complicated.
(rule 1 (x64_pcmpgt $I64X2 x y)
      (if-let $true (use_sse42))
      (x64_pcmpgtq x y))

;; Without SSE4.2 a 64-bit comparison is expanded to a number of instructions.
;; The basic idea is to delegate to a 32-bit comparison and work with the
;; results from there. The comparison to execute is:
;;
;;      [ xhi ][ xlo ] > [ yhi ][ ylo ]
;;
;; If xhi != yhi, then the result is whatever the result of that comparison is.
;; If xhi == yhi, then the result is the unsigned comparison of xlo/ylo, since
;; the low 32 bits of a 64-bit number are an unsigned quantity. To achieve this
;; as part of the same comparison, the upper bit of `xlo` and `ylo` is flipped
;; to change the sign when compared as a 32-bit signed number. The result here
;; is then:
;;
;; * if xlo and ylo had the same upper bit, then the unsigned comparison should
;;   be the same as comparing the flipped versions as signed.
;; * if xlo had an upper bit of 0 and ylo had an upper bit of 1, then xlo > ylo
;;   is false. When flipping the bits xlo becomes negative and ylo becomes
;;   positive when compared as 32-bits, so the result is the same.
;; * if xlo had an upper bit of 1 and ylo had an upper bit of 0, then xlo > ylo
;;   is true. When flipping the bits xlo becomes positive and ylo becomes
;;   negative when compared as 32-bits, so the result is the same.
;;
;; Given all that, the sequence here is to flip the upper bit of xlo and ylo,
;; then compare the masked results for equality and for gt. If the upper
;; 32 bits are not equal then the gt result for the upper bits is used. If the
;; upper 32 bits are equal then the lower 32-bit comparison is used instead.
(rule 0 (x64_pcmpgt $I64X2 x y)
      (let (
          (mask Xmm (x64_movdqu_load (emit_u128_le_const 0x00000000_80000000_00000000_80000000)))
          (x_masked Xmm (x64_pxor mask x))
          (y_masked Xmm (x64_pxor mask y))
          (cmp32 Xmm (x64_pcmpgtd x_masked y_masked))
          (low_halves_gt Xmm (x64_pshufd cmp32 0xa0))
          (high_halves_gt Xmm (x64_pshufd cmp32 0xf5))
          (cmp_eq Xmm (x64_pcmpeqd x_masked y_masked))
          (high_halves_eq Xmm (x64_pshufd cmp_eq 0xf5))
          (low_gt_and_high_eq Xmm (x64_pand low_halves_gt high_halves_eq))
        )
        (x64_por low_gt_and_high_eq high_halves_gt)))

(decl x64_pcmpgtb (Xmm XmmMem) Xmm)
(rule 0 (x64_pcmpgtb x y) (xmm_rm_r (SseOpcode.Pcmpgtb) x y))
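The sign-bit-flip trick in the comment above can be checked with a scalar model of one 64-bit lane. This is a minimal sketch in Python, not Cranelift code; the helper names (`pcmpgtq_sse41`, `to_signed32`, `to_signed64`) are illustrative only:

```python
# Scalar model of the SSE4.1 lowering of `pcmpgtq`: a signed 64-bit
# "greater than" built from 32-bit compares, as in the rule 0 above.

MASK32 = 0xFFFFFFFF
SIGN32 = 0x80000000

def to_signed32(v):
    """Reinterpret a 32-bit pattern as a signed integer."""
    return v - (1 << 32) if v & SIGN32 else v

def to_signed64(v):
    """Reinterpret a 64-bit pattern as a signed integer."""
    return v - (1 << 64) if v & (1 << 63) else v

def pcmpgtq_sse41(x, y):
    """Emulate one lane: True iff x > y when both are viewed as signed i64."""
    xhi, xlo = (x >> 32) & MASK32, x & MASK32
    yhi, ylo = (y >> 32) & MASK32, y & MASK32
    # Flip the sign bit of the low halves (the pxor-with-mask step) so the
    # unsigned low-half compare can be done with the signed `pcmpgtd`.
    low_gt = to_signed32(xlo ^ SIGN32) > to_signed32(ylo ^ SIGN32)
    # High halves already hold the 64-bit sign, so compare them signed.
    high_gt = to_signed32(xhi) > to_signed32(yhi)
    high_eq = xhi == yhi
    # por(pand(low_halves_gt, high_halves_eq), high_halves_gt)
    return (low_gt and high_eq) or high_gt

# Check against the direct signed comparison on boundary values.
edges = [0, 1, 0x7FFFFFFF, 0x80000000, 0xFFFFFFFF,
         0x7FFFFFFF_FFFFFFFF, 0x80000000_00000000, 0xFFFFFFFF_FFFFFFFF]
for x in edges:
    for y in edges:
        assert pcmpgtq_sse41(x, y) == (to_signed64(x) > to_signed64(y))
```

The model mirrors the ISLE rule step for step, except that the two `pshufd` shuffles (which broadcast each 32-bit compare result across its 64-bit lane) are unnecessary for a single scalar lane.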