x64: Lower shuffle and swizzle in ISLE (#4772)

Lower `shuffle` and `swizzle` in ISLE.

This PR surfaced a bug with the lowering of `shuffle` when avx512vl and avx512vbmi are enabled: we use `vpermi2b` as the implementation, but panic if the immediate shuffle mask contains any out-of-bounds values. The behavior when the avx512 extensions are not present is that out-of-bounds values are turned into `0` in the result.

I've resolved this by detecting when the shuffle immediate has out-of-bounds indices in the avx512-enabled lowering, and generating an additional mask to zero out the lanes where those indices occur. This brings the avx512 case in line with the semantics of the `shuffle` op, as documented in `cranelift/codegen/meta/src/shared/instructions.rs` (lines 1495-1498 at commit 94bcbe8446).
This commit is contained in:
Trevor Elliott
2022-08-24 14:49:51 -07:00
committed by GitHub
parent b4c25ef63e
commit b8b6f2781e
12 changed files with 295 additions and 190 deletions

View File

@@ -0,0 +1,58 @@
test compile precise-output
set enable_simd
target x86_64 has_avx512vl has_avx512vbmi
function %shuffle_in_bounds(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
;; Baseline case for the AVX-512 lowering: every byte of the shuffle mask is
;; an in-range index (0x00 selects the first lane of v0, 0x11 = 17 selects the
;; second lane of v1), so the expected output below loads a single constant
;; and feeds it to `vpermi2b`, with no additional zeroing mask.
;; pick the second lane of v1, the rest use the first lane of v0
v2 = shuffle v0, v1, 0x11000000000000000000000000000000
return v2
}
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm0, %xmm9
; load_const VCodeConstant(0), %xmm0
; vpermi2b %xmm1, %xmm0, %xmm9
; movq %rbp, %rsp
; popq %rbp
; ret
function %shuffle_out_of_bounds(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
;; Regression case for the AVX-512 lowering: the mask byte 0x80 (128) does not
;; name any of the 32 input lanes, so the corresponding result lane must be
;; zero. The expected output below therefore loads two constants and follows
;; `vpermi2b` with an `andps` that clears the out-of-bounds lane.
;; pick zero for the first lane, the rest use first lane of v0
;; This should introduce two constants, one for the permutation and one to
;; mask the non-zero values for lanes 1-15
;; NOTE(review): the hex immediate appears to be written most-significant byte
;; first, which would place 0x80 in lane 15 (leaving lanes 0-14 non-zero)
;; rather than lane 0 -- confirm against the reader's handling of hex masks.
v2 = shuffle v0, v1, 0x80000000000000000000000000000000
return v2
}
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm0, %xmm12
; load_const VCodeConstant(1), %xmm0
; load_const VCodeConstant(0), %xmm7
; vpermi2b %xmm1, %xmm7, %xmm12
; andps %xmm0, %xmm7, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
function %f3(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
;; In-bounds shuffle written in list form, mixing lanes from both inputs.
;; Every index lies in 0..=31, so the expected output below is the plain
;; single-constant `vpermi2b` sequence without the extra `andps` mask.
v2 = shuffle v0, v1, [3 0 31 26 4 6 12 11 23 13 24 4 2 15 17 5]
return v2
}
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm0, %xmm9
; load_const VCodeConstant(0), %xmm0
; vpermi2b %xmm1, %xmm0, %xmm9
; movq %rbp, %rsp
; popq %rbp
; ret

View File

@@ -15,13 +15,13 @@ block0:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; load_const VCodeConstant(3), %xmm6
; load_const VCodeConstant(2), %xmm0
; load_const VCodeConstant(0), %xmm7
; pshufb %xmm6, %xmm7, %xmm6
; load_const VCodeConstant(1), %xmm10
; pshufb %xmm0, %xmm10, %xmm0
; orps %xmm0, %xmm6, %xmm0
; load_const VCodeConstant(3), %xmm0
; load_const VCodeConstant(2), %xmm5
; load_const VCodeConstant(0), %xmm3
; pshufb %xmm0, %xmm3, %xmm0
; load_const VCodeConstant(1), %xmm7
; pshufb %xmm5, %xmm7, %xmm5
; por %xmm0, %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
@@ -37,8 +37,8 @@ block0:
; movq %rsp, %rbp
; block0:
; load_const VCodeConstant(1), %xmm0
; load_const VCodeConstant(0), %xmm4
; pshufb %xmm0, %xmm4, %xmm0
; load_const VCodeConstant(0), %xmm2
; pshufb %xmm0, %xmm2, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
@@ -55,10 +55,10 @@ block0:
; movq %rsp, %rbp
; block0:
; load_const VCodeConstant(1), %xmm0
; load_const VCodeConstant(1), %xmm5
; load_const VCodeConstant(0), %xmm6
; paddusb %xmm5, %xmm6, %xmm5
; pshufb %xmm0, %xmm5, %xmm0
; load_const VCodeConstant(1), %xmm3
; load_const VCodeConstant(0), %xmm4
; paddusb %xmm3, %xmm4, %xmm3
; pshufb %xmm0, %xmm3, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

View File

@@ -4,6 +4,7 @@ target aarch64
target s390x
set enable_simd
target x86_64 has_sse3 has_ssse3 has_sse41
target x86_64 has_sse3 has_ssse3 has_sse41 has_avx512vl has_avx512vbmi
function %shuffle_i8x16(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
@@ -11,3 +12,10 @@ block0(v0: i8x16, v1: i8x16):
return v2
}
; run: %shuffle_i8x16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [4 1 32 27 5 7 13 12 24 14 25 5 3 16 18 6]
function %shuffle_zeros(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
;; Indices 32, 255 and 97 do not name a lane of either input; the expectation
;; that follows this function requires those three result lanes to be 0,
;; while the in-range indices select from v0 (0-15) and v1 (16-31) as usual.
v2 = shuffle v0, v1, [3 0 32 255 4 6 12 11 23 13 24 4 2 97 17 5]
return v2
}
; run: %shuffle_zeros([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [4 1 0 0 5 7 13 12 24 14 25 5 3 0 18 6]