x64: Add a smattering of lowerings for shuffle specializations (#5930)

* x64: Add lowerings for `punpck{h,l}wd`

Add some special cases for `shuffle` for more specialized x86
instructions.

* x64: Add `shuffle` lowerings for `pshufd`

This commit adds special-cased lowerings for the x64 `shuffle`
instruction when the `pshufd` instruction alone is necessary. This is
possible when the shuffle immediate permutes 32-bit values within one of
the vector inputs of the `shuffle` instruction, but not both.

* x64: Add shuffle lowerings for `punpck{h,l}{q,}dq`

This adds specialized lowerings for the x86 instructions which interleave
the high or low halves of vectors of 32- and 64-bit lanes. This
corresponds to the preexisting specialized lowerings for interleaving 8
and 16-bit lanes.

* x64: Add `shuffle` lowerings for `shufps`

This commit adds targeted lowerings for the `shuffle` instruction that
match the pattern that `shufps` supports. The `shufps` instruction
selects two elements from the first vector and two elements from the
second vector which means while it's not generally applicable it should
still be more useful than the catch-all lowering of `shuffle`.

* x64: Add shuffle support for `pshuf{l,h}w`

This commit adds special lowering cases for these instructions which
permute 16-bit values within a 128-bit value either within the upper or
lower half of the 128-bit value.

* x64: Specialize `shuffle` with an all-zeros immediate

Instead of loading the all-zeros immediate from a rip-relative address
at the end of the function, generate a zero with a `pxor`
instruction and then use `pshufb` to do the broadcast.

* Review comments
This commit is contained in:
Alex Crichton
2023-03-09 16:58:19 -06:00
committed by GitHub
parent 8a2bf29444
commit 1c3a1bda6c
10 changed files with 1332 additions and 10 deletions

View File

@@ -0,0 +1,116 @@
test compile precise-output
set enable_simd
target x86_64 has_avx

;; Shuffle immediate interleaves the low 32-bit lanes of the two inputs
;; (bytes 0-7 with bytes 16-23); expected to lower to a single `vpunpckldq`.
function %punpckldq(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [0 1 2 3 16 17 18 19 4 5 6 7 20 21 22 23]
v5 = bitcast.i32x4 little v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vpunpckldq %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vpunpckldq %xmm1, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
;; Shuffle immediate interleaves the high 32-bit lanes of the two inputs
;; (bytes 8-15 with bytes 24-31); expected to lower to a single `vpunpckhdq`.
function %punpckhdq(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [8 9 10 11 24 25 26 27 12 13 14 15 28 29 30 31]
v5 = bitcast.i32x4 little v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vpunpckhdq %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vpunpckhdq %xmm1, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
;; Shuffle immediate concatenates the low 64-bit lane of each input
;; (bytes 0-7 with bytes 16-23); expected to lower to a single `vpunpcklqdq`.
function %punpcklqdq(i64x2, i64x2) -> i64x2 {
block0(v0: i64x2, v1: i64x2):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23]
v5 = bitcast.i64x2 little v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vpunpcklqdq %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vpunpcklqdq %xmm1, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
;; Shuffle immediate concatenates the high 64-bit lane of each input
;; (bytes 8-15 with bytes 24-31); expected to lower to a single `vpunpckhqdq`.
function %punpckhqdq(i64x2, i64x2) -> i64x2 {
block0(v0: i64x2, v1: i64x2):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31]
v5 = bitcast.i64x2 little v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vpunpckhqdq %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vpunpckhqdq %xmm1, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

View File

@@ -52,3 +52,594 @@ block0(v0: i8x16, v1: i8x16):
; popq %rbp
; retq
function %punpcklwd(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [0 1 16 17 2 3 18 19 4 5 20 21 6 7 22 23]
v5 = bitcast.i16x8 little v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; punpcklwd %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; punpcklwd %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %punpckhwd(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [8 9 24 25 10 11 26 27 12 13 28 29 14 15 30 31]
v5 = bitcast.i16x8 little v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; punpckhwd %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; punpckhwd %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %pshufd_0022(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [0 1 2 3 0 1 2 3 8 9 10 11 8 9 10 11]
v5 = bitcast.i32x4 little v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pshufd $160, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pshufd $0xa0, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %pshufd_3120(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [12 13 14 15 4 5 6 7 8 9 10 11 0 1 2 3]
v5 = bitcast.i32x4 little v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pshufd $39, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pshufd $0x27, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %pshufd_7546(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [28 29 30 31 20 21 22 23 16 17 18 19 24 25 26 27]
v5 = bitcast.i32x4 little v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pshufd $135, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pshufd $0x87, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %not_single_pshufd(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23]
v5 = bitcast.i32x4 little v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; shufps $78, %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; shufps $0x4e, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %punpckldq(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [0 1 2 3 16 17 18 19 4 5 6 7 20 21 22 23]
v5 = bitcast.i32x4 little v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; punpckldq %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; punpckldq %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %punpckhdq(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [8 9 10 11 24 25 26 27 12 13 14 15 28 29 30 31]
v5 = bitcast.i32x4 little v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; punpckhdq %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; punpckhdq %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %punpcklqdq(i64x2, i64x2) -> i64x2 {
block0(v0: i64x2, v1: i64x2):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23]
v5 = bitcast.i64x2 little v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; punpcklqdq %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; punpcklqdq %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %punpckhqdq(i64x2, i64x2) -> i64x2 {
block0(v0: i64x2, v1: i64x2):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31]
v5 = bitcast.i64x2 little v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; punpckhqdq %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; punpckhqdq %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %shufps_3277(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [12 13 14 15 8 9 10 11 28 29 30 31 28 29 30 31]
v5 = bitcast.i32x4 little v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; shufps $251, %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; shufps $0xfb, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %shufps_6500(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [24 25 26 27 20 21 22 23 0 1 2 3 0 1 2 3]
v5 = bitcast.i32x4 little v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm0, %xmm4
; movdqa %xmm1, %xmm0
; shufps $6, %xmm0, %xmm4, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movdqa %xmm0, %xmm4
; movdqa %xmm1, %xmm0
; shufps $6, %xmm4, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %pshuflw_3210(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [6 7 4 5 2 3 0 1 8 9 10 11 12 13 14 15]
v5 = bitcast.i16x8 little v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pshuflw $27, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pshuflw $0x1b, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %pshuflw_3131(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [6 7 4 5 6 7 4 5 8 9 10 11 12 13 14 15]
v5 = bitcast.i16x8 little v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pshuflw $187, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pshuflw $0xbb, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %pshuflw_rhs_3210(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [22 23 20 21 18 19 16 17 24 25 26 27 28 29 30 31]
v5 = bitcast.i16x8 little v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pshuflw $27, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pshuflw $0x1b, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %pshuflw_rhs_3131(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [22 23 18 19 22 23 18 19 24 25 26 27 28 29 30 31]
v5 = bitcast.i16x8 little v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pshuflw $119, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pshuflw $0x77, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %pshufhw_3210(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 14 15 12 13 10 11 8 9]
v5 = bitcast.i16x8 little v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pshufhw $27, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pshufhw $0x1b, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %pshufhw_3131(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 14 15 10 11 14 15 10 11]
v5 = bitcast.i16x8 little v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pshufhw $119, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pshufhw $0x77, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %pshufhw_rhs_3210(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [16 17 18 19 20 21 22 23 30 31 28 29 26 27 24 25]
v5 = bitcast.i16x8 little v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pshufhw $27, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pshufhw $0x1b, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %pshufhw_rhs_3131(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [16 17 18 19 20 21 22 23 30 31 26 27 30 31 26 27]
v5 = bitcast.i16x8 little v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pshufhw $119, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pshufhw $0x77, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %shuffle_all_zeros(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
v2 = shuffle v0, v1, [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pxor %xmm3, %xmm3, %xmm3
; pshufb %xmm0, %xmm3, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pxor %xmm3, %xmm3
; pshufb %xmm3, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

View File

@@ -1,9 +1,10 @@
test interpret
;; test interpret ;; FIXME(#5915)
test run
target aarch64
target s390x
set enable_simd
target x86_64 has_sse3 has_ssse3 has_sse41
target x86_64 has_sse3 has_ssse3 has_sse41 has_avx
target x86_64 has_sse3 has_ssse3 has_sse41 has_avx512vl has_avx512vbmi
function %shuffle_i8x16(i8x16, i8x16) -> i8x16 {
@@ -26,3 +27,234 @@ block0(v0: i8x16):
return v1
}
; run: %shuffle1([0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]) == [8 9 10 11 12 13 14 15 0 1 2 3 4 5 6 7]
function %punpcklbw(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
v2 = shuffle v0, v1, [0 16 1 17 2 18 3 19 4 20 5 21 6 22 7 23]
return v2
}
; run: %punpcklbw([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [1 17 2 18 3 19 4 20 5 21 6 22 7 23 8 24]
function %punpckhbw(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
v2 = shuffle v0, v1, [8 24 9 25 10 26 11 27 12 28 13 29 14 30 15 31]
return v2
}
; run: %punpckhbw([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [9 25 10 26 11 27 12 28 13 29 14 30 15 31 16 32]
function %punpcklwd(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [0 1 16 17 2 3 18 19 4 5 20 21 6 7 22 23]
v5 = bitcast.i16x8 little v4
return v5
}
; run: %punpcklwd([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [1 9 2 10 3 11 4 12]
function %punpckhwd(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [8 9 24 25 10 11 26 27 12 13 28 29 14 15 30 31]
v5 = bitcast.i16x8 little v4
return v5
}
; run: %punpckhwd([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [5 13 6 14 7 15 8 16]
function %pshufd_0022(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [0 1 2 3 0 1 2 3 8 9 10 11 8 9 10 11]
v5 = bitcast.i32x4 little v4
return v5
}
; run: %pshufd_0022([1 2 3 4], [5 6 7 8]) == [1 1 3 3]
;; Permute 32-bit lanes [3 1 2 0] of the first input only (a `pshufd`
;; candidate). Run-line name fixed: it previously invoked `%pshufd_0022`.
function %pshufd_3120(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [12 13 14 15 4 5 6 7 8 9 10 11 0 1 2 3]
    v5 = bitcast.i32x4 little v4
    return v5
}
; run: %pshufd_3120([1 2 3 4], [5 6 7 8]) == [4 2 3 1]
;; Permute 32-bit lanes [7 5 4 6] of the second input only (a `pshufd`
;; candidate reading the RHS). Run-line name fixed: it previously invoked
;; `%pshufd_0022`.
function %pshufd_7546(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [28 29 30 31 20 21 22 23 16 17 18 19 24 25 26 27]
    v5 = bitcast.i32x4 little v4
    return v5
}
; run: %pshufd_7546([1 2 3 4], [5 6 7 8]) == [8 6 5 7]
;; Selects 32-bit lanes from both inputs (lanes 2,3 of v0 and 0,1 of v1),
;; so a single `pshufd` cannot match. Run-line name fixed: it previously
;; invoked `%pshufd_0022`.
function %not_pshufd(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23]
    v5 = bitcast.i32x4 little v4
    return v5
}
; run: %not_pshufd([1 2 3 4], [5 6 7 8]) == [3 4 5 6]
function %punpckldq(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [0 1 2 3 16 17 18 19 4 5 6 7 20 21 22 23]
v5 = bitcast.i32x4 little v4
return v5
}
; run: %punpckldq([1 2 3 4], [5 6 7 8]) == [1 5 2 6]
;; Interleaves the high 32-bit lanes of the two inputs (`punpckhdq`
;; pattern). Run-line name fixed: it previously invoked `%punpckldq`.
function %punpckhdq(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [8 9 10 11 24 25 26 27 12 13 14 15 28 29 30 31]
    v5 = bitcast.i32x4 little v4
    return v5
}
; run: %punpckhdq([1 2 3 4], [5 6 7 8]) == [3 7 4 8]
function %punpcklqdq(i64x2, i64x2) -> i64x2 {
block0(v0: i64x2, v1: i64x2):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23]
v5 = bitcast.i64x2 little v4
return v5
}
; run: %punpcklqdq([1 2], [5 6]) == [1 5]
function %punpckhqdq(i64x2, i64x2) -> i64x2 {
block0(v0: i64x2, v1: i64x2):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31]
v5 = bitcast.i64x2 little v4
return v5
}
; run: %punpckhqdq([1 2], [5 6]) == [2 6]
function %shufps_0145(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23]
v5 = bitcast.i32x4 little v4
return v5
}
; run: %shufps_0145([1 2 3 4], [5 6 7 8]) == [1 2 5 6]
;; Two lanes (3,2) from the first input and two lanes (7,7) from the
;; second — the `shufps` pattern. Run-line name fixed: it previously
;; invoked `%shufps_0145`.
function %shufps_3277(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [12 13 14 15 8 9 10 11 28 29 30 31 28 29 30 31]
    v5 = bitcast.i32x4 little v4
    return v5
}
; run: %shufps_3277([1 2 3 4], [5 6 7 8]) == [4 3 8 8]
;; Two lanes (6,5) from the second input and two lanes (0,0) from the
;; first — a `shufps` pattern with the operands swapped. Run-line name
;; fixed: it previously invoked `%shufps_0145`.
function %shufps_6500(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [24 25 26 27 20 21 22 23 0 1 2 3 0 1 2 3]
    v5 = bitcast.i32x4 little v4
    return v5
}
; run: %shufps_6500([1 2 3 4], [5 6 7 8]) == [7 6 1 1]
function %pshuflw_3210(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [6 7 4 5 2 3 0 1 8 9 10 11 12 13 14 15]
v5 = bitcast.i16x8 little v4
return v5
}
; run: %pshuflw_3210([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [4 3 2 1 5 6 7 8]
function %pshuflw_3131(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [6 7 4 5 6 7 4 5 8 9 10 11 12 13 14 15]
v5 = bitcast.i16x8 little v4
return v5
}
; run: %pshuflw_3131([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [4 3 4 3 5 6 7 8]
function %pshuflw_rhs_3210(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [22 23 20 21 18 19 16 17 24 25 26 27 28 29 30 31]
v5 = bitcast.i16x8 little v4
return v5
}
; run: %pshuflw_rhs_3210([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [12 11 10 9 13 14 15 16]
function %pshuflw_rhs_3131(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [22 23 18 19 22 23 18 19 24 25 26 27 28 29 30 31]
v5 = bitcast.i16x8 little v4
return v5
}
; run: %pshuflw_rhs_3131([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [12 10 12 10 13 14 15 16]
function %pshufhw_3210(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 14 15 12 13 10 11 8 9]
v5 = bitcast.i16x8 little v4
return v5
}
; run: %pshufhw_3210([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [1 2 3 4 8 7 6 5]
function %pshufhw_3131(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 14 15 10 11 14 15 10 11]
v5 = bitcast.i16x8 little v4
return v5
}
; run: %pshufhw_3131([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [1 2 3 4 8 6 8 6]
function %pshufhw_rhs_3210(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [16 17 18 19 20 21 22 23 30 31 28 29 26 27 24 25]
v5 = bitcast.i16x8 little v4
return v5
}
; run: %pshufhw_rhs_3210([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [9 10 11 12 16 15 14 13]
function %pshufhw_rhs_3131(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [16 17 18 19 20 21 22 23 30 31 26 27 30 31 26 27]
v5 = bitcast.i16x8 little v4
return v5
}
; run: %pshufhw_rhs_3131([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [9 10 11 12 16 14 16 14]
function %shuffle_all_zeros(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
v2 = shuffle v0, v1, [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
return v2
}
; run: %shuffle_all_zeros([5 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1], [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]) == [5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5]