wasmtime/cranelift/filetests/filetests/runtests/simd-shuffle.clif

;; test interpret ;; FIXME(#5915)
test run
target aarch64
target s390x
set enable_simd
target x86_64 has_sse3 has_ssse3 has_sse41
target x86_64 has_sse3 has_ssse3 has_sse41 has_avx
target x86_64 has_sse3 has_ssse3 has_sse41 has_avx512vl has_avx512vbmi

function %shuffle_i8x16(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
    v2 = shuffle v0, v1, [3 0 31 26 4 6 12 11 23 13 24 4 2 15 17 5]
    return v2
}
; run: %shuffle_i8x16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [4 1 32 27 5 7 13 12 24 14 25 5 3 16 18 6]

function %shuffle1(i8x16) -> i8x16 {
block0(v0: i8x16):
    v1 = shuffle v0, v0, [8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23]
    return v1
}
; run: %shuffle1([0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]) == [8 9 10 11 12 13 14 15 0 1 2 3 4 5 6 7]

function %punpcklbw(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
    v2 = shuffle v0, v1, [0 16 1 17 2 18 3 19 4 20 5 21 6 22 7 23]
    return v2
}
; run: %punpcklbw([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [1 17 2 18 3 19 4 20 5 21 6 22 7 23 8 24]

function %punpckhbw(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
    v2 = shuffle v0, v1, [8 24 9 25 10 26 11 27 12 28 13 29 14 30 15 31]
    return v2
}
; run: %punpckhbw([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [9 25 10 26 11 27 12 28 13 29 14 30 15 31 16 32]

function %punpcklwd(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [0 1 16 17 2 3 18 19 4 5 20 21 6 7 22 23]
    v5 = bitcast.i16x8 little v4
    return v5
}
; run: %punpcklwd([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [1 9 2 10 3 11 4 12]

function %punpckhwd(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [8 9 24 25 10 11 26 27 12 13 28 29 14 15 30 31]
    v5 = bitcast.i16x8 little v4
    return v5
}
; run: %punpckhwd([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [5 13 6 14 7 15 8 16]

function %pshufd_0022(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [0 1 2 3 0 1 2 3 8 9 10 11 8 9 10 11]
    v5 = bitcast.i32x4 little v4
    return v5
}
; run: %pshufd_0022([1 2 3 4], [5 6 7 8]) == [1 1 3 3]

function %pshufd_3120(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [12 13 14 15 4 5 6 7 8 9 10 11 0 1 2 3]
    v5 = bitcast.i32x4 little v4
    return v5
}
; run: %pshufd_3120([1 2 3 4], [5 6 7 8]) == [4 2 3 1]

function %pshufd_7546(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [28 29 30 31 20 21 22 23 16 17 18 19 24 25 26 27]
    v5 = bitcast.i32x4 little v4
    return v5
}
; run: %pshufd_7546([1 2 3 4], [5 6 7 8]) == [8 6 5 7]

function %not_pshufd(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23]
    v5 = bitcast.i32x4 little v4
    return v5
}
; run: %not_pshufd([1 2 3 4], [5 6 7 8]) == [3 4 5 6]

function %not_pshufd2(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [8 9 10 11 12 13 14 15 20 21 22 23 20 21 22 23]
    v5 = bitcast.i32x4 little v4
    return v5
}
; run: %not_pshufd2([1 2 3 4], [5 6 7 8]) == [3 4 6 6]

function %punpckldq(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [0 1 2 3 16 17 18 19 4 5 6 7 20 21 22 23]
    v5 = bitcast.i32x4 little v4
    return v5
}
; run: %punpckldq([1 2 3 4], [5 6 7 8]) == [1 5 2 6]

function %punpckhdq(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [8 9 10 11 24 25 26 27 12 13 14 15 28 29 30 31]
    v5 = bitcast.i32x4 little v4
    return v5
}
; run: %punpckldq([1 2 3 4], [5 6 7 8]) == [3 7 4 8]

function %punpcklqdq(i64x2, i64x2) -> i64x2 {
block0(v0: i64x2, v1: i64x2):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23]
    v5 = bitcast.i64x2 little v4
    return v5
}
; run: %punpcklqdq([1 2], [5 6]) == [1 5]

function %punpckhqdq(i64x2, i64x2) -> i64x2 {
block0(v0: i64x2, v1: i64x2):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31]
    v5 = bitcast.i64x2 little v4
    return v5
}
; run: %punpckhqdq([1 2], [5 6]) == [2 6]

function %shufps_0145(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23]
    v5 = bitcast.i32x4 little v4
    return v5
}
; run: %shufps_0145([1 2 3 4], [5 6 7 8]) == [1 2 5 6]

function %shufps_3277(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [12 13 14 15 8 9 10 11 28 29 30 31 28 29 30 31]
    v5 = bitcast.i32x4 little v4
    return v5
}
; run: %shufps_0145([1 2 3 4], [5 6 7 8]) == [4 3 8 8]

function %shufps_6500(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [24 25 26 27 20 21 22 23 0 1 2 3 0 1 2 3]
    v5 = bitcast.i32x4 little v4
    return v5
}
; run: %shufps_0145([1 2 3 4], [5 6 7 8]) == [7 6 1 1]

function %pshuflw_3210(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [6 7 4 5 2 3 0 1 8 9 10 11 12 13 14 15]
    v5 = bitcast.i16x8 little v4
    return v5
}
; run: %pshuflw_3210([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [4 3 2 1 5 6 7 8]

function %pshuflw_3131(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [6 7 4 5 6 7 4 5 8 9 10 11 12 13 14 15]
    v5 = bitcast.i16x8 little v4
    return v5
}
; run: %pshuflw_3131([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [4 3 4 3 5 6 7 8]

function %pshuflw_rhs_3210(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [22 23 20 21 18 19 16 17 24 25 26 27 28 29 30 31]
    v5 = bitcast.i16x8 little v4
    return v5
}
; run: %pshuflw_rhs_3210([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [12 11 10 9 13 14 15 16]

function %pshuflw_rhs_3131(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [22 23 18 19 22 23 18 19 24 25 26 27 28 29 30 31]
    v5 = bitcast.i16x8 little v4
    return v5
}
; run: %pshuflw_rhs_3131([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [12 10 12 10 13 14 15 16]

function %pshufhw_3210(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 14 15 12 13 10 11 8 9]
    v5 = bitcast.i16x8 little v4
    return v5
}
; run: %pshufhw_3210([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [1 2 3 4 8 7 6 5]

function %pshufhw_3131(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 14 15 10 11 14 15 10 11]
    v5 = bitcast.i16x8 little v4
    return v5
}
; run: %pshufhw_3131([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [1 2 3 4 8 6 8 6]

function %pshufhw_rhs_3210(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [16 17 18 19 20 21 22 23 30 31 28 29 26 27 24 25]
    v5 = bitcast.i16x8 little v4
    return v5
}
; run: %pshufhw_rhs_3210([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [9 10 11 12 16 15 14 13]

function %pshufhw_rhs_3131(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [16 17 18 19 20 21 22 23 30 31 26 27 30 31 26 27]
    v5 = bitcast.i16x8 little v4
    return v5
}
; run: %pshufhw_rhs_3131([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [9 10 11 12 16 14 16 14]

function %shuffle_all_zeros(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
    v2 = shuffle v0, v1, [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
    return v2
}
; run: %shuffle_all_zeros([5 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1], [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]) == [5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5]

function %aarch64_uzp1_i8x16(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
    v2 = shuffle v0, v1, [0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30]
    return v2
}
; run: %aarch64_uzp1_i8x16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31]

function %aarch64_uzp2_i8x16(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
    v2 = shuffle v0, v1, [1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31]
    return v2
}
; run: %aarch64_uzp2_i8x16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [2 4 6 8 10 12 14 16 18 20 22 24 26 28 30 32]

function %aarch64_uzp1_i16x8(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [0 1 4 5 8 9 12 13 16 17 20 21 24 25 28 29]
    v5 = bitcast.i16x8 little v4
    return v5
}
; run: %aarch64_uzp1_i16x8([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [1 3 5 7 9 11 13 15]

function %aarch64_uzp2_i16x8(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [2 3 6 7 10 11 14 15 18 19 22 23 26 27 30 31]
    v5 = bitcast.i16x8 little v4
    return v5
}
; run: %aarch64_uzp2_i16x8([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [2 4 6 8 10 12 14 16]

function %aarch64_uzp1_i32x4(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27]
    v5 = bitcast.i32x4 little v4
    return v5
}
; run: %aarch64_uzp1_i32x4([1 2 3 4], [5 6 7 8]) == [1 3 5 7]

function %aarch64_uzp2_i32x4(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [4 5 6 7 12 13 14 15 20 21 22 23 28 29 30 31]
    v5 = bitcast.i32x4 little v4
    return v5
}
; run: %aarch64_uzp2_i32x4([1 2 3 4], [5 6 7 8]) == [2 4 6 8]

function %aarch64_uzp1_i64x2(i64x2, i64x2) -> i64x2 {
block0(v0: i64x2, v1: i64x2):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23]
    v5 = bitcast.i64x2 little v4
    return v5
}
; run: %aarch64_uzp1_i64x2([1 2], [3 4]) == [1 3]

function %aarch64_uzp2_i64x2(i64x2, i64x2) -> i64x2 {
block0(v0: i64x2, v1: i64x2):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31]
    v5 = bitcast.i64x2 little v4
    return v5
}
; run: %aarch64_uzp1_i64x2([1 2], [3 4]) == [2 4]

function %aarch64_trn1_i8x16(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
    v2 = shuffle v0, v1, [0 16 2 18 4 20 6 22 8 24 10 26 12 28 14 30]
    return v2
}
; run: %aarch64_trn1_i8x16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [1 17 3 19 5 21 7 23 9 25 11 27 13 29 15 31]

function %aarch64_trn2_i8x16(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
    v2 = shuffle v0, v1, [1 17 3 19 5 21 7 23 9 25 11 27 13 29 15 31]
    return v2
}
; run: %aarch64_trn2_i8x16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [2 18 4 20 6 22 8 24 10 26 12 28 14 30 16 32]

function %aarch64_trn1_i16x8(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [0 1 16 17 4 5 20 21 8 9 24 25 12 13 28 29]
    v5 = bitcast.i16x8 little v4
    return v5
}
; run: %aarch64_trn1_i16x8([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [1 9 3 11 5 13 7 15]

function %aarch64_trn2_i16x8(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [2 3 18 19 6 7 22 23 10 11 26 27 14 15 30 31]
    v5 = bitcast.i16x8 little v4
    return v5
}
; run: %aarch64_trn2_i16x8([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [2 10 4 12 6 14 8 16]

function %aarch64_trn1_i32x4(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [0 1 2 3 16 17 18 19 8 9 10 11 24 25 26 27]
    v5 = bitcast.i32x4 little v4
    return v5
}
; run: %aarch64_trn1_i32x4([1 2 3 4], [5 6 7 8]) == [1 5 3 7]

function %aarch64_trn2_i32x4(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [4 5 6 7 20 21 22 23 12 13 14 15 28 29 30 31]
    v5 = bitcast.i32x4 little v4
    return v5
}
; run: %aarch64_trn2_i32x4([1 2 3 4], [5 6 7 8]) == [2 6 4 8]

function %aarch64_trn1_i64x2(i64x2, i64x2) -> i64x2 {
block0(v0: i64x2, v1: i64x2):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23]
    v5 = bitcast.i64x2 little v4
    return v5
}
; run: %aarch64_trn1_i64x2([1 2], [3 4]) == [1 3]

function %aarch64_trn2_i64x2(i64x2, i64x2) -> i64x2 {
block0(v0: i64x2, v1: i64x2):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31]
    v5 = bitcast.i64x2 little v4
    return v5
}
; run: %aarch64_trn2_i64x2([1 2], [3 4]) == [2 4]

function %aarch64_ext_0(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
    v2 = shuffle v0, v1, [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
    return v2
}
; run: %aarch64_ext_0([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]

function %aarch64_ext_1(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
    v2 = shuffle v0, v1, [1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
    return v2
}
; run: %aarch64_ext_1([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17]

function %aarch64_ext_5(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
    v2 = shuffle v0, v1, [5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20]
    return v2
}
; run: %aarch64_ext_5([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21]

function %aarch64_ext_11(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
    v2 = shuffle v0, v1, [11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26]
    return v2
}
; run: %aarch64_ext_11([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27]

function %aarch64_ext_16(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
    v2 = shuffle v0, v1, [16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31]
    return v2
}
; run: %aarch64_ext_16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]

function %aarch64_dup_i8x16(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
    v2 = shuffle v0, v1, [5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5]
    return v2
}
; run: %aarch64_dup_i8x16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6]

function %aarch64_dup_i16x8(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [12 13 12 13 12 13 12 13 12 13 12 13 12 13 12 13]
    v5 = bitcast.i16x8 little v4
    return v5
}
; run: %aarch64_dup_i16x8([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [7 7 7 7 7 7 7 7]

function %aarch64_dup_i32x4_1(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [8 9 10 11 8 9 10 11 8 9 10 11 8 9 10 11]
    v5 = bitcast.i32x4 little v4
    return v5
}
; run: %aarch64_dup_i32x4_1([1 2 3 4], [5 6 7 8]) == [3 3 3 3]

function %aarch64_dup_i32x4_2(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [16 17 18 19 16 17 18 19 16 17 18 19 16 17 18 19]
    v5 = bitcast.i32x4 little v4
    return v5
}
; run: %aarch64_dup_i32x4_2([1 2 3 4], [5 6 7 8]) == [5 5 5 5]

function %aarch64_dup_i64x2(i64x2, i64x2) -> i64x2 {
block0(v0: i64x2, v1: i64x2):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7]
    v5 = bitcast.i64x2 little v4
    return v5
}
; run: %aarch64_dup_i64x2([1 2], [5 6]) == [1 1]

function %aarch64_rev16(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [1 0 3 2 5 4 7 6 9 8 11 10 13 12 15 14]
    v5 = bitcast.i16x8 little v4
    return v5
}
; run: %aarch64_rev16([0x1 0x2 0x3 0x4 0x100 0x200 0x300 0x400], [0 0 0 0 0 0 0 0]) == [0x100 0x200 0x300 0x400 0x1 0x2 0x3 0x4]

function %aarch64_rev32_bytes(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [3 2 1 0 7 6 5 4 11 10 9 8 15 14 13 12]
    v5 = bitcast.i32x4 little v4
    return v5
}
; run: %aarch64_rev32_bytes([1 2 3 4], [0 0 0 0]) == [0x1000000 0x2000000 0x3000000 0x4000000]
; run: %aarch64_rev32_bytes([0xff000000 0x00ff0000 0x0000ff00 0x000000ff], [0 0 0 0]) == [0xff 0xff00 0xff0000 0xff000000]

function %aarch64_rev32_words(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [2 3 0 1 6 7 4 5 10 11 8 9 14 15 12 13]
    v5 = bitcast.i32x4 little v4
    return v5
}
; run: %aarch64_rev32_words([1 2 3 4], [0 0 0 0]) == [0x10000 0x20000 0x30000 0x40000]
; run: %aarch64_rev32_words([0xff000000 0x00ff0000 0x0000ff00 0x000000ff], [0 0 0 0]) == [0xff00 0xff 0xff000000 0xff0000]

function %aarch64_rev64_bytes(i64x2, i64x2) -> i64x2 {
block0(v0: i64x2, v1: i64x2):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [7 6 5 4 3 2 1 0 15 14 13 12 11 10 9 8]
    v5 = bitcast.i64x2 little v4
    return v5
}
; run: %aarch64_rev64_bytes([0x0102030405060708 0x0807060504030201], [0 0]) == [0x0807060504030201 0x0102030405060708]

function %aarch64_rev64_words(i64x2, i64x2) -> i64x2 {
block0(v0: i64x2, v1: i64x2):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [6 7 4 5 2 3 0 1 14 15 12 13 10 11 8 9]
    v5 = bitcast.i64x2 little v4
    return v5
}
; run: %aarch64_rev64_words([0x0102030405060708 0x0807060504030201], [0 0]) == [0x0708050603040102 0x0201040306050807]

function %aarch64_rev64_doublewords(i64x2, i64x2) -> i64x2 {
block0(v0: i64x2, v1: i64x2):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [4 5 6 7 0 1 2 3 12 13 14 15 8 9 10 11]
    v5 = bitcast.i64x2 little v4
    return v5
}
; run: %aarch64_rev64_words([0x0102030405060708 0x0807060504030201], [0 0]) == [0x0506070801020304 0x0403020108070605]

function %pblendw_0b10011001(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
    v2 = bitcast.i8x16 little v0
    v3 = bitcast.i8x16 little v1
    v4 = shuffle v2, v3, [16 17 2 3 4 5 22 23 24 25 10 11 12 13 30 31]
    v5 = bitcast.i16x8 little v4
    return v5
}
; run: %pblendw_0b10011001([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [9 2 3 12 13 6 7 16]