aarch64: Add specialized shuffle lowerings (#5977)

* aarch64: Add `shuffle` lowerings for the `uzp{1,2}` instructions This commit uses the same style of patterns as the x64 backend to start adding specific lowerings of the Cranelift `shuffle` instruction to particular AArch64 instructions. * aarch64: Add `shuffle` lowerings to the `zip{1,2}` instructions These instructions match the `punpck*` family of instructions on x64 and should help provide more efficient lowerings than the current `shuffle` fallback. * aarch64: Add `shuffle` lowerings for `trn{1,2}` Along the lines of prior commits, this adds specific lowering patterns for the individual AArch64 instructions available. * aarch64: Add a `shuffle` lowering for the `ext` instruction This instruction will more-or-less concatenate two 128-bit vector registers to create a 256-bit value, shift it right, and then take the lower 128 bits into the destination. This can be modeled with a `shuffle` of consecutive bytes so this adds a lowering rule to generate this instruction. * aarch64: Add `shuffle` special case for `dup` This commit adds special cases for Cranelift's `shuffle` on AArch64 when the lowering can be represented with a `dup` instruction which broadcasts one vector's lane into all lanes of the destination. * aarch64: Add `shuffle` specializations for `rev` instructions This commit adds shuffle mask specializations for the `rev{16,32,64}` family of instructions on AArch64 which can be used to reverse bytes, 16-bit values, or 32-bit values within larger values. * Fix tests * Add doc-comments in ISLE
2023-03-10 15:37:13 -06:00
parent 5623f7280c
commit 52896e020d
10 changed files with 1305 additions and 11 deletions
--- a/cranelift/filetests/filetests/isa/aarch64/shuffle.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/shuffle.clif
@@ -0,0 +1,747 @@
+test compile precise-output
+set unwind_info=false
+target aarch64
+
+function %shuffle_i8x16(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = shuffle v0, v1, [3 0 31 26 4 6 12 11 23 13 24 4 2 15 17 5] ;; irregular byte mask mixing both inputs; the expected output below is the generic two-register `tbl` with this mask loaded as a 128-bit constant
+    return v2
+}
+
+; VCode:
+; block0:
+;   mov v30.16b, v0.16b
+;   mov v31.16b, v1.16b
+;   ldr q3, pc+8 ; b 20 ; data.f128 0x05110f0204180d170b0c06041a1f0003
+;   tbl v0.16b, { v30.16b, v31.16b }, v3.16b
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   mov v30.16b, v0.16b
+;   mov v31.16b, v1.16b
+;   ldr q3, #0x10
+;   b #0x20
+;   adc w3, w0, wzr
+;   add w4, w16, w12, lsl #1
+;   orr z23.b, p3/m, z23.b, z8.b
+;   mov z2.b, p1/z, #0x78
+;   tbl v0.16b, {v30.16b, v31.16b}, v3.16b
+;   ret
+
+function %aarch64_uzp1_i8x16(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = shuffle v0, v1, [0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30]
+    return v2
+}
+
+; VCode:
+; block0:
+;   uzp1 v0.16b, v0.16b, v1.16b
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   uzp1 v0.16b, v0.16b, v1.16b
+;   ret
+
+function %aarch64_uzp2_i8x16(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = shuffle v0, v1, [1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31]
+    return v2
+}
+
+; VCode:
+; block0:
+;   uzp2 v0.16b, v0.16b, v1.16b
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   uzp2 v0.16b, v0.16b, v1.16b
+;   ret
+
+function %aarch64_uzp1_i16x8(i16x8, i16x8) -> i16x8 {
+block0(v0: i16x8, v1: i16x8):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [0 1 4 5 8 9 12 13 16 17 20 21 24 25 28 29]
+    v5 = bitcast.i16x8 little v4
+    return v5
+}
+
+; VCode:
+; block0:
+;   uzp1 v0.8h, v0.8h, v1.8h
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   uzp1 v0.8h, v0.8h, v1.8h
+;   ret
+
+function %aarch64_uzp2_i16x8(i16x8, i16x8) -> i16x8 {
+block0(v0: i16x8, v1: i16x8):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [2 3 6 7 10 11 14 15 18 19 22 23 26 27 30 31]
+    v5 = bitcast.i16x8 little v4
+    return v5
+}
+
+; VCode:
+; block0:
+;   uzp2 v0.8h, v0.8h, v1.8h
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   uzp2 v0.8h, v0.8h, v1.8h
+;   ret
+
+function %aarch64_uzp1_i32x4(i32x4, i32x4) -> i32x4 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27]
+    v5 = bitcast.i32x4 little v4
+    return v5
+}
+
+; VCode:
+; block0:
+;   uzp1 v0.4s, v0.4s, v1.4s
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   uzp1 v0.4s, v0.4s, v1.4s
+;   ret
+
+function %aarch64_uzp2_i32x4(i32x4, i32x4) -> i32x4 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [4 5 6 7 12 13 14 15 20 21 22 23 28 29 30 31]
+    v5 = bitcast.i32x4 little v4
+    return v5
+}
+
+; VCode:
+; block0:
+;   uzp2 v0.4s, v0.4s, v1.4s
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   uzp2 v0.4s, v0.4s, v1.4s
+;   ret
+
+function %aarch64_uzp1_i64x2(i64x2, i64x2) -> i64x2 {
+block0(v0: i64x2, v1: i64x2):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23]
+    v5 = bitcast.i64x2 little v4
+    return v5
+}
+
+; VCode:
+; block0:
+;   uzp1 v0.2d, v0.2d, v1.2d
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   uzp1 v0.2d, v0.2d, v1.2d
+;   ret
+
+function %aarch64_uzp2_i64x2(i64x2, i64x2) -> i64x2 {
+block0(v0: i64x2, v1: i64x2):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31]
+    v5 = bitcast.i64x2 little v4
+    return v5
+}
+
+; VCode:
+; block0:
+;   uzp2 v0.2d, v0.2d, v1.2d
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   uzp2 v0.2d, v0.2d, v1.2d
+;   ret
+
+function %punpcklbw(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = shuffle v0, v1, [0 16 1 17 2 18 3 19 4 20 5 21 6 22 7 23]
+    return v2
+}
+
+; VCode:
+; block0:
+;   zip1 v0.16b, v0.16b, v1.16b
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   zip1 v0.16b, v0.16b, v1.16b
+;   ret
+
+function %punpckhbw(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = shuffle v0, v1, [8 24 9 25 10 26 11 27 12 28 13 29 14 30 15 31]
+    return v2
+}
+
+; VCode:
+; block0:
+;   zip2 v0.16b, v0.16b, v1.16b
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   zip2 v0.16b, v0.16b, v1.16b
+;   ret
+
+function %punpcklwd(i16x8, i16x8) -> i16x8 {
+block0(v0: i16x8, v1: i16x8):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [0 1 16 17 2 3 18 19 4 5 20 21 6 7 22 23]
+    v5 = bitcast.i16x8 little v4
+    return v5
+}
+
+; VCode:
+; block0:
+;   zip1 v0.8h, v0.8h, v1.8h
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   zip1 v0.8h, v0.8h, v1.8h
+;   ret
+
+function %punpckhwd(i16x8, i16x8) -> i16x8 {
+block0(v0: i16x8, v1: i16x8):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [8 9 24 25 10 11 26 27 12 13 28 29 14 15 30 31]
+    v5 = bitcast.i16x8 little v4
+    return v5
+}
+
+; VCode:
+; block0:
+;   zip2 v0.8h, v0.8h, v1.8h
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   zip2 v0.8h, v0.8h, v1.8h
+;   ret
+
+function %punpckldq(i32x4, i32x4) -> i32x4 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [0 1 2 3 16 17 18 19 4 5 6 7 20 21 22 23]
+    v5 = bitcast.i32x4 little v4
+    return v5
+}
+
+; VCode:
+; block0:
+;   zip1 v0.4s, v0.4s, v1.4s
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   zip1 v0.4s, v0.4s, v1.4s
+;   ret
+
+function %punpckhdq(i32x4, i32x4) -> i32x4 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [8 9 10 11 24 25 26 27 12 13 14 15 28 29 30 31]
+    v5 = bitcast.i32x4 little v4
+    return v5
+}
+
+; VCode:
+; block0:
+;   zip2 v0.4s, v0.4s, v1.4s
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   zip2 v0.4s, v0.4s, v1.4s
+;   ret
+
+function %punpcklqdq(i64x2, i64x2) -> i64x2 {
+block0(v0: i64x2, v1: i64x2):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23] ;; same byte mask as %aarch64_uzp1_i64x2 earlier in this file, so the expected instruction below is uzp1 rather than zip1
+    v5 = bitcast.i64x2 little v4
+    return v5
+}
+
+; VCode:
+; block0:
+;   uzp1 v0.2d, v0.2d, v1.2d
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   uzp1 v0.2d, v0.2d, v1.2d
+;   ret
+
+function %punpckhqdq(i64x2, i64x2) -> i64x2 {
+block0(v0: i64x2, v1: i64x2):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31] ;; same byte mask as %aarch64_uzp2_i64x2 earlier in this file, so the expected instruction below is uzp2 rather than zip2
+    v5 = bitcast.i64x2 little v4
+    return v5
+}
+
+; VCode:
+; block0:
+;   uzp2 v0.2d, v0.2d, v1.2d
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   uzp2 v0.2d, v0.2d, v1.2d
+;   ret
+
+function %aarch64_trn1_i8x16(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = shuffle v0, v1, [0 16 2 18 4 20 6 22 8 24 10 26 12 28 14 30]
+    return v2
+}
+
+; VCode:
+; block0:
+;   trn1 v0.16b, v0.16b, v1.16b
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   trn1 v0.16b, v0.16b, v1.16b
+;   ret
+
+function %aarch64_trn2_i8x16(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = shuffle v0, v1, [1 17 3 19 5 21 7 23 9 25 11 27 13 29 15 31]
+    return v2
+}
+
+; VCode:
+; block0:
+;   trn2 v0.16b, v0.16b, v1.16b
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   trn2 v0.16b, v0.16b, v1.16b
+;   ret
+
+function %aarch64_trn1_i16x8(i16x8, i16x8) -> i16x8 {
+block0(v0: i16x8, v1: i16x8):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [0 1 16 17 4 5 20 21 8 9 24 25 12 13 28 29]
+    v5 = bitcast.i16x8 little v4
+    return v5
+}
+
+; VCode:
+; block0:
+;   trn1 v0.8h, v0.8h, v1.8h
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   trn1 v0.8h, v0.8h, v1.8h
+;   ret
+
+function %aarch64_trn2_i16x8(i16x8, i16x8) -> i16x8 {
+block0(v0: i16x8, v1: i16x8):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [2 3 18 19 6 7 22 23 10 11 26 27 14 15 30 31]
+    v5 = bitcast.i16x8 little v4
+    return v5
+}
+
+; VCode:
+; block0:
+;   trn2 v0.8h, v0.8h, v1.8h
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   trn2 v0.8h, v0.8h, v1.8h
+;   ret
+
+function %aarch64_trn1_i32x4(i32x4, i32x4) -> i32x4 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [0 1 2 3 16 17 18 19 8 9 10 11 24 25 26 27]
+    v5 = bitcast.i32x4 little v4
+    return v5
+}
+
+; VCode:
+; block0:
+;   trn1 v0.4s, v0.4s, v1.4s
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   trn1 v0.4s, v0.4s, v1.4s
+;   ret
+
+function %aarch64_trn2_i32x4(i32x4, i32x4) -> i32x4 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [4 5 6 7 20 21 22 23 12 13 14 15 28 29 30 31]
+    v5 = bitcast.i32x4 little v4
+    return v5
+}
+
+; VCode:
+; block0:
+;   trn2 v0.4s, v0.4s, v1.4s
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   trn2 v0.4s, v0.4s, v1.4s
+;   ret
+
+function %aarch64_trn1_i64x2(i64x2, i64x2) -> i64x2 {
+block0(v0: i64x2, v1: i64x2):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23] ;; same byte mask as %aarch64_uzp1_i64x2 earlier in this file, so the expected instruction below is uzp1 rather than trn1
+    v5 = bitcast.i64x2 little v4
+    return v5
+}
+
+; VCode:
+; block0:
+;   uzp1 v0.2d, v0.2d, v1.2d
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   uzp1 v0.2d, v0.2d, v1.2d
+;   ret
+
+function %aarch64_trn2_i64x2(i64x2, i64x2) -> i64x2 {
+block0(v0: i64x2, v1: i64x2):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31] ;; same byte mask as %aarch64_uzp2_i64x2 earlier in this file, so the expected instruction below is uzp2 rather than trn2
+    v5 = bitcast.i64x2 little v4
+    return v5
+}
+
+; VCode:
+; block0:
+;   uzp2 v0.2d, v0.2d, v1.2d
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   uzp2 v0.2d, v0.2d, v1.2d
+;   ret
+
+function %aarch64_ext_0(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = shuffle v0, v1, [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
+    return v2
+}
+
+; VCode:
+; block0:
+;   ext v0.16b, v0.16b, v1.16b, #0
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   ext v0.16b, v0.16b, v1.16b, #0
+;   ret
+
+function %aarch64_ext_1(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = shuffle v0, v1, [1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
+    return v2
+}
+
+; VCode:
+; block0:
+;   ext v0.16b, v0.16b, v1.16b, #1
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   ext v0.16b, v0.16b, v1.16b, #1
+;   ret
+
+function %aarch64_ext_5(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = shuffle v0, v1, [5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20]
+    return v2
+}
+
+; VCode:
+; block0:
+;   ext v0.16b, v0.16b, v1.16b, #5
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   ext v0.16b, v0.16b, v1.16b, #5
+;   ret
+
+function %aarch64_ext_11(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = shuffle v0, v1, [11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26]
+    return v2
+}
+
+; VCode:
+; block0:
+;   ext v0.16b, v0.16b, v1.16b, #11
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   ext v0.16b, v0.16b, v1.16b, #0xb
+;   ret
+
+function %aarch64_ext_16(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = shuffle v0, v1, [16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31] ;; selects bytes 16..31, i.e. all of v1; the expected output below is the generic tbl sequence, not an ext -- NOTE(review): presumably an offset of 16 is outside what the ext lowering rule accepts; confirm against the rule
+    return v2
+}
+
+; VCode:
+; block0:
+;   mov v30.16b, v0.16b
+;   mov v31.16b, v1.16b
+;   ldr q3, pc+8 ; b 20 ; data.f128 0x1f1e1d1c1b1a19181716151413121110
+;   tbl v0.16b, { v30.16b, v31.16b }, v3.16b
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   mov v30.16b, v0.16b
+;   mov v31.16b, v1.16b
+;   ldr q3, #0x10
+;   b #0x20
+;   sbfiz w16, w8, #0xe, #5
+;   b #0xfffffffffc585464
+;   madd w24, w8, w26, w6
+;   fmadd s28, s8, s30, s7
+;   tbl v0.16b, {v30.16b, v31.16b}, v3.16b
+;   ret
+
+function %aarch64_dup_i8x16(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = shuffle v0, v1, [5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5]
+    return v2
+}
+
+; VCode:
+; block0:
+;   dup v0.16b, v0.b[5]
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   dup v0.16b, v0.b[5]
+;   ret
+
+function %aarch64_dup_i16x8(i16x8, i16x8) -> i16x8 {
+block0(v0: i16x8, v1: i16x8):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [12 13 12 13 12 13 12 13 12 13 12 13 12 13 12 13]
+    v5 = bitcast.i16x8 little v4
+    return v5
+}
+
+; VCode:
+; block0:
+;   dup v0.8h, v0.h[6]
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   dup v0.8h, v0.h[6]
+;   ret
+
+function %aarch64_dup_i32x4(i32x4, i32x4) -> i32x4 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [12 13 14 15 12 13 14 15 12 13 14 15 12 13 14 15]
+    v5 = bitcast.i32x4 little v4
+    return v5
+}
+
+; VCode:
+; block0:
+;   dup v0.4s, v0.s[3]
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   dup v0.4s, v0.s[3]
+;   ret
+
+function %aarch64_dup_i64x2(i64x2, i64x2) -> i64x2 {
+block0(v0: i64x2, v1: i64x2):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7]
+    v5 = bitcast.i64x2 little v4
+    return v5
+}
+
+; VCode:
+; block0:
+;   dup v0.2d, v0.d[0]
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   dup v0.2d, v0.d[0]
+;   ret
+
+function %aarch64_rev16(i16x8, i16x8) -> i16x8 {
+block0(v0: i16x8, v1: i16x8):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [1 0 3 2 5 4 7 6 9 8 11 10 13 12 15 14]
+    v5 = bitcast.i16x8 little v4
+    return v5
+}
+
+; VCode:
+; block0:
+;   rev16 v0.16b, v0.16b
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   rev16 v0.16b, v0.16b
+;   ret
+
+function %aarch64_rev32_bytes(i32x4, i32x4) -> i32x4 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [3 2 1 0 7 6 5 4 11 10 9 8 15 14 13 12]
+    v5 = bitcast.i32x4 little v4
+    return v5
+}
+
+; VCode:
+; block0:
+;   rev32 v0.16b, v0.16b
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   rev32 v0.16b, v0.16b
+;   ret
+
+function %aarch64_rev32_words(i32x4, i32x4) -> i32x4 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [2 3 0 1 6 7 4 5 10 11 8 9 14 15 12 13]
+    v5 = bitcast.i32x4 little v4
+    return v5
+}
+
+; VCode:
+; block0:
+;   rev32 v0.8h, v0.8h
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   rev32 v0.8h, v0.8h
+;   ret
+
+function %aarch64_rev64_bytes(i64x2, i64x2) -> i64x2 {
+block0(v0: i64x2, v1: i64x2):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [7 6 5 4 3 2 1 0 15 14 13 12 11 10 9 8]
+    v5 = bitcast.i64x2 little v4
+    return v5
+}
+
+; VCode:
+; block0:
+;   rev64 v0.16b, v0.16b
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   rev64 v0.16b, v0.16b
+;   ret
+
+function %aarch64_rev64_words(i64x2, i64x2) -> i64x2 {
+block0(v0: i64x2, v1: i64x2):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [6 7 4 5 2 3 0 1 14 15 12 13 10 11 8 9]
+    v5 = bitcast.i64x2 little v4
+    return v5
+}
+
+; VCode:
+; block0:
+;   rev64 v0.8h, v0.8h
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   rev64 v0.8h, v0.8h
+;   ret
+
+function %aarch64_rev64_doublewords(i64x2, i64x2) -> i64x2 {
+block0(v0: i64x2, v1: i64x2):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [4 5 6 7 0 1 2 3 12 13 14 15 8 9 10 11]
+    v5 = bitcast.i64x2 little v4
+    return v5
+}
+
+; VCode:
+; block0:
+;   rev64 v0.4s, v0.4s
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   rev64 v0.4s, v0.4s
+;   ret
+
--- a/cranelift/filetests/filetests/runtests/simd-shuffle.clif
+++ b/cranelift/filetests/filetests/runtests/simd-shuffle.clif
@@ -258,3 +258,295 @@ block0(v0: i8x16, v1: i8x16):
    return v2
 }
 ; run: %shuffle_all_zeros([5 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1], [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]) == [5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5]
+
+function %aarch64_uzp1_i8x16(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = shuffle v0, v1, [0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30]
+    return v2
+}
+; run: %aarch64_uzp1_i8x16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31]
+
+function %aarch64_uzp2_i8x16(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = shuffle v0, v1, [1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31]
+    return v2
+}
+; run: %aarch64_uzp2_i8x16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [2 4 6 8 10 12 14 16 18 20 22 24 26 28 30 32]
+
+function %aarch64_uzp1_i16x8(i16x8, i16x8) -> i16x8 {
+block0(v0: i16x8, v1: i16x8):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [0 1 4 5 8 9 12 13 16 17 20 21 24 25 28 29]
+    v5 = bitcast.i16x8 little v4
+    return v5
+}
+; run: %aarch64_uzp1_i16x8([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [1 3 5 7 9 11 13 15]
+
+function %aarch64_uzp2_i16x8(i16x8, i16x8) -> i16x8 {
+block0(v0: i16x8, v1: i16x8):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [2 3 6 7 10 11 14 15 18 19 22 23 26 27 30 31]
+    v5 = bitcast.i16x8 little v4
+    return v5
+}
+; run: %aarch64_uzp2_i16x8([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [2 4 6 8 10 12 14 16]
+
+function %aarch64_uzp1_i32x4(i32x4, i32x4) -> i32x4 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27]
+    v5 = bitcast.i32x4 little v4
+    return v5
+}
+; run: %aarch64_uzp1_i32x4([1 2 3 4], [5 6 7 8]) == [1 3 5 7]
+
+function %aarch64_uzp2_i32x4(i32x4, i32x4) -> i32x4 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [4 5 6 7 12 13 14 15 20 21 22 23 28 29 30 31]
+    v5 = bitcast.i32x4 little v4
+    return v5
+}
+; run: %aarch64_uzp2_i32x4([1 2 3 4], [5 6 7 8]) == [2 4 6 8]
+
+function %aarch64_uzp1_i64x2(i64x2, i64x2) -> i64x2 {
+block0(v0: i64x2, v1: i64x2):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23]
+    v5 = bitcast.i64x2 little v4
+    return v5
+}
+; run: %aarch64_uzp1_i64x2([1 2], [3 4]) == [1 3]
+
+function %aarch64_uzp2_i64x2(i64x2, i64x2) -> i64x2 {
+block0(v0: i64x2, v1: i64x2):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31] ;; bytes 8..15 of each input: the high 64-bit lane of v0 followed by the high 64-bit lane of v1
+    v5 = bitcast.i64x2 little v4
+    return v5
+}
+; run: %aarch64_uzp2_i64x2([1 2], [3 4]) == [2 4]
+
+function %aarch64_trn1_i8x16(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = shuffle v0, v1, [0 16 2 18 4 20 6 22 8 24 10 26 12 28 14 30]
+    return v2
+}
+; run: %aarch64_trn1_i8x16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [1 17 3 19 5 21 7 23 9 25 11 27 13 29 15 31]
+
+function %aarch64_trn2_i8x16(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = shuffle v0, v1, [1 17 3 19 5 21 7 23 9 25 11 27 13 29 15 31]
+    return v2
+}
+; run: %aarch64_trn2_i8x16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [2 18 4 20 6 22 8 24 10 26 12 28 14 30 16 32]
+
+function %aarch64_trn1_i16x8(i16x8, i16x8) -> i16x8 {
+block0(v0: i16x8, v1: i16x8):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [0 1 16 17 4 5 20 21 8 9 24 25 12 13 28 29]
+    v5 = bitcast.i16x8 little v4
+    return v5
+}
+; run: %aarch64_trn1_i16x8([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [1 9 3 11 5 13 7 15]
+
+function %aarch64_trn2_i16x8(i16x8, i16x8) -> i16x8 {
+block0(v0: i16x8, v1: i16x8):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [2 3 18 19 6 7 22 23 10 11 26 27 14 15 30 31]
+    v5 = bitcast.i16x8 little v4
+    return v5
+}
+; run: %aarch64_trn2_i16x8([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [2 10 4 12 6 14 8 16]
+
+function %aarch64_trn1_i32x4(i32x4, i32x4) -> i32x4 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [0 1 2 3 16 17 18 19 8 9 10 11 24 25 26 27]
+    v5 = bitcast.i32x4 little v4
+    return v5
+}
+; run: %aarch64_trn1_i32x4([1 2 3 4], [5 6 7 8]) == [1 5 3 7]
+
+function %aarch64_trn2_i32x4(i32x4, i32x4) -> i32x4 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [4 5 6 7 20 21 22 23 12 13 14 15 28 29 30 31]
+    v5 = bitcast.i32x4 little v4
+    return v5
+}
+; run: %aarch64_trn2_i32x4([1 2 3 4], [5 6 7 8]) == [2 6 4 8]
+
+function %aarch64_trn1_i64x2(i64x2, i64x2) -> i64x2 {
+block0(v0: i64x2, v1: i64x2):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23]
+    v5 = bitcast.i64x2 little v4
+    return v5
+}
+; run: %aarch64_trn1_i64x2([1 2], [3 4]) == [1 3]
+
+function %aarch64_trn2_i64x2(i64x2, i64x2) -> i64x2 {
+block0(v0: i64x2, v1: i64x2):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31]
+    v5 = bitcast.i64x2 little v4
+    return v5
+}
+; run: %aarch64_trn2_i64x2([1 2], [3 4]) == [2 4]
+
+function %aarch64_ext_0(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = shuffle v0, v1, [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
+    return v2
+}
+; run: %aarch64_ext_0([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
+
+function %aarch64_ext_1(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = shuffle v0, v1, [1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
+    return v2
+}
+; run: %aarch64_ext_1([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17]
+
+function %aarch64_ext_5(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = shuffle v0, v1, [5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20]
+    return v2
+}
+; run: %aarch64_ext_5([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21]
+
+function %aarch64_ext_11(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = shuffle v0, v1, [11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26]
+    return v2
+}
+; run: %aarch64_ext_11([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27]
+
+function %aarch64_ext_16(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = shuffle v0, v1, [16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31]
+    return v2
+}
+; run: %aarch64_ext_16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]
+
+function %aarch64_dup_i8x16(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = shuffle v0, v1, [5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5]
+    return v2
+}
+; run: %aarch64_dup_i8x16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6]
+
+function %aarch64_dup_i16x8(i16x8, i16x8) -> i16x8 {
+block0(v0: i16x8, v1: i16x8):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [12 13 12 13 12 13 12 13 12 13 12 13 12 13 12 13]
+    v5 = bitcast.i16x8 little v4
+    return v5
+}
+; run: %aarch64_dup_i16x8([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [7 7 7 7 7 7 7 7]
+
+function %aarch64_dup_i32x4_1(i32x4, i32x4) -> i32x4 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [8 9 10 11 8 9 10 11 8 9 10 11 8 9 10 11]
+    v5 = bitcast.i32x4 little v4
+    return v5
+}
+; run: %aarch64_dup_i32x4_1([1 2 3 4], [5 6 7 8]) == [3 3 3 3]
+
+function %aarch64_dup_i32x4_2(i32x4, i32x4) -> i32x4 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [16 17 18 19 16 17 18 19 16 17 18 19 16 17 18 19]
+    v5 = bitcast.i32x4 little v4
+    return v5
+}
+; run: %aarch64_dup_i32x4_2([1 2 3 4], [5 6 7 8]) == [5 5 5 5]
+
+function %aarch64_dup_i64x2(i64x2, i64x2) -> i64x2 {
+block0(v0: i64x2, v1: i64x2):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7]
+    v5 = bitcast.i64x2 little v4
+    return v5
+}
+; run: %aarch64_dup_i64x2([1 2], [5 6]) == [1 1]
+
+function %aarch64_rev16(i16x8, i16x8) -> i16x8 {
+block0(v0: i16x8, v1: i16x8):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [1 0 3 2 5 4 7 6 9 8 11 10 13 12 15 14]
+    v5 = bitcast.i16x8 little v4
+    return v5
+}
+; run: %aarch64_rev16([0x1 0x2 0x3 0x4 0x100 0x200 0x300 0x400], [0 0 0 0 0 0 0 0]) == [0x100 0x200 0x300 0x400 0x1 0x2 0x3 0x4]
+
+function %aarch64_rev32_bytes(i32x4, i32x4) -> i32x4 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [3 2 1 0 7 6 5 4 11 10 9 8 15 14 13 12]
+    v5 = bitcast.i32x4 little v4
+    return v5
+}
+; run: %aarch64_rev32_bytes([1 2 3 4], [0 0 0 0]) == [0x1000000 0x2000000 0x3000000 0x4000000]
+; run: %aarch64_rev32_bytes([0xff000000 0x00ff0000 0x0000ff00 0x000000ff], [0 0 0 0]) == [0xff 0xff00 0xff0000 0xff000000]
+
+function %aarch64_rev32_words(i32x4, i32x4) -> i32x4 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [2 3 0 1 6 7 4 5 10 11 8 9 14 15 12 13]
+    v5 = bitcast.i32x4 little v4
+    return v5
+}
+; run: %aarch64_rev32_words([1 2 3 4], [0 0 0 0]) == [0x10000 0x20000 0x30000 0x40000]
+; run: %aarch64_rev32_words([0xff000000 0x00ff0000 0x0000ff00 0x000000ff], [0 0 0 0]) == [0xff00 0xff 0xff000000 0xff0000]
+
+function %aarch64_rev64_bytes(i64x2, i64x2) -> i64x2 {
+block0(v0: i64x2, v1: i64x2):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [7 6 5 4 3 2 1 0 15 14 13 12 11 10 9 8]
+    v5 = bitcast.i64x2 little v4
+    return v5
+}
+; run: %aarch64_rev64_bytes([0x0102030405060708 0x0807060504030201], [0 0]) == [0x0807060504030201 0x0102030405060708]
+
+function %aarch64_rev64_words(i64x2, i64x2) -> i64x2 {
+block0(v0: i64x2, v1: i64x2):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [6 7 4 5 2 3 0 1 14 15 12 13 10 11 8 9]
+    v5 = bitcast.i64x2 little v4
+    return v5
+}
+; run: %aarch64_rev64_words([0x0102030405060708 0x0807060504030201], [0 0]) == [0x0708050603040102 0x0201040306050807]
+
+function %aarch64_rev64_doublewords(i64x2, i64x2) -> i64x2 {
+block0(v0: i64x2, v1: i64x2):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [4 5 6 7 0 1 2 3 12 13 14 15 8 9 10 11] ;; swaps the two 32-bit halves inside each 64-bit lane of v0; every index is below 16, so v1 is never read
+    v5 = bitcast.i64x2 little v4
+    return v5
+}
+; run: %aarch64_rev64_doublewords([0x0102030405060708 0x0807060504030201], [0 0]) == [0x0506070801020304 0x0403020108070605]