x64: Lower shuffle and swizzle in ISLE (#4772)

Lower `shuffle` and `swizzle` in ISLE. This PR surfaced a bug in the lowering of `shuffle` when avx512vl and avx512vbmi are enabled: we use `vpermi2b` as the implementation, but panic if the immediate shuffle mask contains any out-of-bounds indices (indices outside the 0-31 range that addresses the 32 bytes of the two concatenated inputs). When the avx512 extensions are not present, lanes selected by out-of-bounds indices are set to `0` in the result. I've resolved this by detecting when the shuffle immediate has out-of-bounds indices in the avx512-enabled lowering, and generating an additional mask to zero out the lanes where those indices occur. This brings the avx512 case in line with the semantics of the `shuffle` op: 94bcbe8446/cranelift/codegen/meta/src/shared/instructions.rs (L1495-L1498)
2022-08-24 14:49:51 -07:00
parent b4c25ef63e
commit b8b6f2781e
12 changed files with 295 additions and 190 deletions
--- a/cranelift/filetests/filetests/isa/x64/shuffle-avx512.clif
+++ b/cranelift/filetests/filetests/isa/x64/shuffle-avx512.clif
@@ -0,0 +1,58 @@
+test compile precise-output
+set enable_simd
+target x86_64 has_avx512vl has_avx512vbmi
+
+function %shuffle_in_bounds(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    ;; pick the second lane of v1, the rest use the first lane of v0
+    v2 = shuffle v0, v1, 0x11000000000000000000000000000000
+    return v2
+}
+
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   movdqa  %xmm0, %xmm9
+;   load_const VCodeConstant(0), %xmm0
+;   vpermi2b %xmm1, %xmm0, %xmm9
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+
+function %shuffle_out_of_bounds(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    ;; pick zero for the first lane, the rest use first lane of v0
+    ;; This should introduce two constants, one for the permutation and one to
+    ;; mask the non-zero values for lanes 1-15
+    v2 = shuffle v0, v1, 0x80000000000000000000000000000000
+    return v2
+}
+
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   movdqa  %xmm0, %xmm12
+;   load_const VCodeConstant(1), %xmm0
+;   load_const VCodeConstant(0), %xmm7
+;   vpermi2b %xmm1, %xmm7, %xmm12
+;   andps   %xmm0, %xmm7, %xmm0
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+
+function %f3(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = shuffle v0, v1, [3 0 31 26 4 6 12 11 23 13 24 4 2 15 17 5]
+    return v2
+}
+
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   movdqa  %xmm0, %xmm9
+;   load_const VCodeConstant(0), %xmm0
+;   vpermi2b %xmm1, %xmm0, %xmm9
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+
--- a/cranelift/filetests/filetests/isa/x64/simd-lane-access-compile.clif
+++ b/cranelift/filetests/filetests/isa/x64/simd-lane-access-compile.clif
@@ -15,13 +15,13 @@ block0:
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
-;   load_const VCodeConstant(3), %xmm6
-;   load_const VCodeConstant(2), %xmm0
-;   load_const VCodeConstant(0), %xmm7
-;   pshufb  %xmm6, %xmm7, %xmm6
-;   load_const VCodeConstant(1), %xmm10
-;   pshufb  %xmm0, %xmm10, %xmm0
-;   orps    %xmm0, %xmm6, %xmm0
+;   load_const VCodeConstant(3), %xmm0
+;   load_const VCodeConstant(2), %xmm5
+;   load_const VCodeConstant(0), %xmm3
+;   pshufb  %xmm0, %xmm3, %xmm0
+;   load_const VCodeConstant(1), %xmm7
+;   pshufb  %xmm5, %xmm7, %xmm5
+;   por     %xmm0, %xmm5, %xmm0
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
@@ -37,8 +37,8 @@ block0:
 ;   movq    %rsp, %rbp
 ; block0:
 ;   load_const VCodeConstant(1), %xmm0
-;   load_const VCodeConstant(0), %xmm4
-;   pshufb  %xmm0, %xmm4, %xmm0
+;   load_const VCodeConstant(0), %xmm2
+;   pshufb  %xmm0, %xmm2, %xmm0
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
@@ -55,10 +55,10 @@ block0:
 ;   movq    %rsp, %rbp
 ; block0:
 ;   load_const VCodeConstant(1), %xmm0
-;   load_const VCodeConstant(1), %xmm5
-;   load_const VCodeConstant(0), %xmm6
-;   paddusb %xmm5, %xmm6, %xmm5
-;   pshufb  %xmm0, %xmm5, %xmm0
+;   load_const VCodeConstant(1), %xmm3
+;   load_const VCodeConstant(0), %xmm4
+;   paddusb %xmm3, %xmm4, %xmm3
+;   pshufb  %xmm0, %xmm3, %xmm0
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
--- a/cranelift/filetests/filetests/runtests/simd-shuffle.clif
+++ b/cranelift/filetests/filetests/runtests/simd-shuffle.clif
@@ -4,6 +4,7 @@ target aarch64
 target s390x
 set enable_simd
 target x86_64 has_sse3 has_ssse3 has_sse41
+target x86_64 has_sse3 has_ssse3 has_sse41 has_avx512vl has_avx512vbmi

 function %shuffle_i8x16(i8x16, i8x16) -> i8x16 {
 block0(v0: i8x16, v1: i8x16):
@@ -11,3 +12,10 @@ block0(v0: i8x16, v1: i8x16):
    return v2
 }
 ; run: %shuffle_i8x16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [4 1 32 27 5 7 13 12 24 14 25 5 3 16 18 6]
+
+function %shuffle_zeros(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = shuffle v0, v1, [3 0 32 255 4 6 12 11 23 13 24 4 2 97 17 5]
+    return v2
+}
+; run: %shuffle_zeros([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [4 1 0 0 5 7 13 12 24 14 25 5 3 0 18 6]