From 1c3a1bda6cd1bf8435f13dc4868cd4f435bcfb5f Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Thu, 9 Mar 2023 16:58:19 -0600 Subject: [PATCH] x64: Add a smattering of lowerings for `shuffle` specializations (#5930) * x64: Add lowerings for `punpck{h,l}wd` Add some special cases for `shuffle` for more specialized x86 instructions. * x64: Add `shuffle` lowerings for `pshufd` This commit adds special-cased lowerings for the x64 `shuffle` instruction when the `pshufd` instruction alone is necessary. This is possible when the shuffle immediate permutes 32-bit values within one of the vector inputs of the `shuffle` instruction, but not both. * x64: Add shuffle lowerings for `punpck{h,l}{q,}dq` This adds specific permutations for some x86 instructions which specifically interleave high/low bytes for 32 and 64-bit values. This corresponds to the preexisting specific lowerings for interleaving 8 and 16-bit values. * x64: Add `shuffle` lowerings for `shufps` This commit adds targeted lowerings for the `shuffle` instruction that match the pattern that `shufps` supports. The `shufps` instruction selects two elements from the first vector and two elements from the second vector which means while it's not generally applicable it should still be more useful than the catch-all lowering of `shuffle`. * x64: Add shuffle support for `pshuf{l,h}w` This commit adds special lowering cases for these instructions which permute 16-bit values within a 128-bit value either within the upper or lower half of the 128-bit value. * x64: Specialize `shuffle` with an all-zeros immediate Instead of loading the all-zeros immediate from a rip-relative address at the end of the function instead generate a zero with a `pxor` instruction and then use `pshufb` to do the broadcast. * Review comments --- cranelift/codegen/src/isa/x64/inst.isle | 60 ++ cranelift/codegen/src/isa/x64/inst/args.rs | 28 +- cranelift/codegen/src/isa/x64/inst/emit.rs | 12 + cranelift/codegen/src/isa/x64/lower.isle | 96 ++- cranelift/codegen/src/isa/x64/lower/isle.rs | 118 ++++ cranelift/codegen/src/machinst/isle.rs | 77 +++ cranelift/codegen/src/prelude_lower.isle | 10 + .../filetests/isa/x64/shuffle-avx.clif | 116 ++++ .../filetests/filetests/isa/x64/shuffle.clif | 591 ++++++++++++++++++ .../filetests/runtests/simd-shuffle.clif | 234 ++++++- 10 files changed, 1332 insertions(+), 10 deletions(-) create mode 100644 cranelift/filetests/filetests/isa/x64/shuffle-avx.clif diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle index 260796cc5e..a03e380839 100644 --- a/cranelift/codegen/src/isa/x64/inst.isle +++ b/cranelift/codegen/src/isa/x64/inst.isle @@ -863,6 +863,12 @@ Xorpd Phaddw Phaddd + Punpckhdq + Punpckldq + Punpckhqdq + Punpcklqdq + Pshuflw + Pshufhw )) (type CmpOpcode extern @@ -1347,6 +1353,12 @@ Vcvttps2dq Vphaddw Vphaddd + Vpunpckhdq + Vpunpckldq + Vpunpckhqdq + Vpunpcklqdq + Vpshuflw + Vpshufhw )) (type Avx512Opcode extern @@ -2729,6 +2741,38 @@ (if-let $true (has_avx)) (xmm_rmir_vex (AvxOpcode.Vpunpcklwd) src1 src2)) +;; Helper for creating `punpckldq` instructions. +(decl x64_punpckldq (Xmm XmmMem) Xmm) +(rule 0 (x64_punpckldq src1 src2) + (xmm_rm_r (SseOpcode.Punpckldq) src1 src2)) +(rule 1 (x64_punpckldq src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpunpckldq) src1 src2)) + +;; Helper for creating `punpckhdq` instructions. 
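+;; `punpckhdq dst, src` interleaves the upper two 32-bit lanes of its operands,
+;; producing `[dst[2], src[2], dst[3], src[3]]`; the `punpck{l,h}qdq` helpers
+;; below do the analogous interleave on 64-bit lanes.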
+(decl x64_punpckhdq (Xmm XmmMem) Xmm) +(rule 0 (x64_punpckhdq src1 src2) + (xmm_rm_r (SseOpcode.Punpckhdq) src1 src2)) +(rule 1 (x64_punpckhdq src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpunpckhdq) src1 src2)) + +;; Helper for creating `punpcklqdq` instructions. +(decl x64_punpcklqdq (Xmm XmmMem) Xmm) +(rule 0 (x64_punpcklqdq src1 src2) + (xmm_rm_r (SseOpcode.Punpcklqdq) src1 src2)) +(rule 1 (x64_punpcklqdq src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpunpcklqdq) src1 src2)) + +;; Helper for creating `punpckhqdq` instructions. +(decl x64_punpckhqdq (Xmm XmmMem) Xmm) +(rule 0 (x64_punpckhqdq src1 src2) + (xmm_rm_r (SseOpcode.Punpckhqdq) src1 src2)) +(rule 1 (x64_punpckhqdq src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpunpckhqdq) src1 src2)) + ;; Helper for creating `unpcklps` instructions. (decl x64_unpcklps (Xmm XmmMem) Xmm) (rule 0 (x64_unpcklps src1 src2) @@ -3284,6 +3328,22 @@ (if-let $true (has_avx)) (xmm_rmir_vex (AvxOpcode.Vpshufb) src1 src2)) +;; Helper for creating `pshuflw` instructions. +(decl x64_pshuflw (XmmMem u8) Xmm) +(rule (x64_pshuflw src imm) + (xmm_unary_rm_r_imm (SseOpcode.Pshuflw) src imm)) +(rule 1 (x64_pshuflw src imm) + (if-let $true (has_avx)) + (xmm_unary_rm_r_imm_vex (AvxOpcode.Vpshuflw) src imm)) + +;; Helper for creating `pshufhw` instructions. +(decl x64_pshufhw (XmmMem u8) Xmm) +(rule (x64_pshufhw src imm) + (xmm_unary_rm_r_imm (SseOpcode.Pshufhw) src imm)) +(rule 1 (x64_pshufhw src imm) + (if-let $true (has_avx)) + (xmm_unary_rm_r_imm_vex (AvxOpcode.Vpshufhw) src imm)) + ;; Helper for creating `shufps` instructions. (decl x64_shufps (Xmm XmmMem u8) Xmm) (rule 0 (x64_shufps src1 src2 byte) diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs index ca6e40ce55..01ee044ab3 100644 --- a/cranelift/codegen/src/isa/x64/inst/args.rs +++ b/cranelift/codegen/src/isa/x64/inst/args.rs @@ -1117,6 +1117,12 @@ pub enum SseOpcode { Xorpd, Phaddw, Phaddd, + Punpckhdq, + Punpckldq, + Punpckhqdq, + Punpcklqdq, + Pshuflw, + Pshufhw, } impl SseOpcode { @@ -1256,7 +1262,13 @@ impl SseOpcode { | SseOpcode::Subpd | SseOpcode::Subsd | SseOpcode::Ucomisd - | SseOpcode::Xorpd => SSE2, + | SseOpcode::Xorpd + | SseOpcode::Punpckldq + | SseOpcode::Punpckhdq + | SseOpcode::Punpcklqdq + | SseOpcode::Punpckhqdq + | SseOpcode::Pshuflw + | SseOpcode::Pshufhw => SSE2, SseOpcode::Pabsb | SseOpcode::Pabsw @@ -1501,6 +1513,12 @@ impl fmt::Debug for SseOpcode { SseOpcode::Xorpd => "xorpd", SseOpcode::Phaddw => "phaddw", SseOpcode::Phaddd => "phaddd", + SseOpcode::Punpckldq => "punpckldq", + SseOpcode::Punpckhdq => "punpckhdq", + SseOpcode::Punpcklqdq => "punpcklqdq", + SseOpcode::Punpckhqdq => "punpckhqdq", + SseOpcode::Pshuflw => "pshuflw", + SseOpcode::Pshufhw => "pshufhw", }; write!(fmt, "{}", name) } @@ -1669,7 +1687,13 @@ impl AvxOpcode { | AvxOpcode::Vcvttpd2dq | AvxOpcode::Vcvttps2dq | AvxOpcode::Vphaddw - | AvxOpcode::Vphaddd => { + | AvxOpcode::Vphaddd + | AvxOpcode::Vpunpckldq + | AvxOpcode::Vpunpckhdq + | AvxOpcode::Vpunpcklqdq + | AvxOpcode::Vpunpckhqdq + | AvxOpcode::Vpshuflw + | AvxOpcode::Vpshufhw => { smallvec![InstructionSet::AVX] } } diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index b92e1ecd9b..6d86bffd05 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -1789,6 +1789,8 @@ pub(crate) fn emit( SseOpcode::Roundpd => (LegacyPrefixes::_66, 0x0F3A09, 3), 
SseOpcode::Roundsd => (LegacyPrefixes::_66, 0x0F3A0B, 3), SseOpcode::Pshufd => (LegacyPrefixes::_66, 0x0F70, 2), + SseOpcode::Pshuflw => (LegacyPrefixes::_F2, 0x0F70, 2), + SseOpcode::Pshufhw => (LegacyPrefixes::_F3, 0x0F70, 2), _ => unimplemented!("Opcode {:?} not implemented", op), }; match src { @@ -1946,6 +1948,10 @@ pub(crate) fn emit( SseOpcode::Punpckhwd => (LegacyPrefixes::_66, 0x0F69, 2), SseOpcode::Punpcklbw => (LegacyPrefixes::_66, 0x0F60, 2), SseOpcode::Punpcklwd => (LegacyPrefixes::_66, 0x0F61, 2), + SseOpcode::Punpckldq => (LegacyPrefixes::_66, 0x0F62, 2), + SseOpcode::Punpcklqdq => (LegacyPrefixes::_66, 0x0F6C, 2), + SseOpcode::Punpckhdq => (LegacyPrefixes::_66, 0x0F6A, 2), + SseOpcode::Punpckhqdq => (LegacyPrefixes::_66, 0x0F6D, 2), SseOpcode::Pxor => (LegacyPrefixes::_66, 0x0FEF, 2), SseOpcode::Subps => (LegacyPrefixes::None, 0x0F5C, 2), SseOpcode::Subpd => (LegacyPrefixes::_66, 0x0F5C, 2), @@ -2171,6 +2177,10 @@ pub(crate) fn emit( AvxOpcode::Vmaxsd => (LP::_F2, OM::_0F, 0x5F), AvxOpcode::Vphaddw => (LP::_66, OM::_0F38, 0x01), AvxOpcode::Vphaddd => (LP::_66, OM::_0F38, 0x02), + AvxOpcode::Vpunpckldq => (LP::_66, OM::_0F, 0x62), + AvxOpcode::Vpunpckhdq => (LP::_66, OM::_0F, 0x6A), + AvxOpcode::Vpunpcklqdq => (LP::_66, OM::_0F, 0x6C), + AvxOpcode::Vpunpckhqdq => (LP::_66, OM::_0F, 0x6D), _ => panic!("unexpected rmir vex opcode {op:?}"), }; VexInstruction::new() @@ -2400,6 +2410,8 @@ pub(crate) fn emit( let (prefix, map, opcode) = match op { AvxOpcode::Vroundps => (LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x08), AvxOpcode::Vroundpd => (LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x09), + AvxOpcode::Vpshuflw => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x70), + AvxOpcode::Vpshufhw => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x70), _ => panic!("unexpected rmr_imm_vex opcode {op:?}"), }; diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index ac27aae4d6..e33c5ee784 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -3529,16 +3529,98 @@ ;; Rules for `shuffle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; Special case for the `punpckhbw` instruction which interleaves the upper -;; lanes of the two input registers. -(rule 4 (lower (shuffle a b (u128_from_immediate 0x1f0f_1e0e_1d0d_1c0c_1b0b_1a0a_1909_1808))) - (x64_punpckhbw a b)) +;; Special case the `pshuf{l,h}w` instruction which shuffles four 16-bit +;; integers within one value, preserving the other four 16-bit integers in that +;; value (either the high or low half). The complicated logic is in the +;; extractors here implemented in Rust and note that there's two cases for each +;; instruction here to match when either the first or second shuffle operand is +;; used. +(rule 12 (lower (shuffle x y (pshuflw_lhs_imm imm))) + (x64_pshuflw x imm)) +(rule 11 (lower (shuffle x y (pshuflw_rhs_imm imm))) + (x64_pshuflw y imm)) +(rule 10 (lower (shuffle x y (pshufhw_lhs_imm imm))) + (x64_pshufhw x imm)) +(rule 9 (lower (shuffle x y (pshufhw_rhs_imm imm))) + (x64_pshufhw y imm)) -;; Special case for the `punpcklbw` instruction which interleaves the lower -;; lanes of the two input registers. 
-(rule 4 (lower (shuffle a b (u128_from_immediate 0x1707_1606_1505_1404_1303_1202_1101_1000))) +(decl pshuflw_lhs_imm (u8) Immediate) +(extern extractor pshuflw_lhs_imm pshuflw_lhs_imm) +(decl pshuflw_rhs_imm (u8) Immediate) +(extern extractor pshuflw_rhs_imm pshuflw_rhs_imm) +(decl pshufhw_lhs_imm (u8) Immediate) +(extern extractor pshufhw_lhs_imm pshufhw_lhs_imm) +(decl pshufhw_rhs_imm (u8) Immediate) +(extern extractor pshufhw_rhs_imm pshufhw_rhs_imm) + +;; Special case for the `pshufd` instruction which will permute 32-bit values +;; within a single register. This is only applicable if the `imm` specified +;; selects 32-bit values from either `x` or `y`, but not both. This means +;; there's one rule for selecting from `x` and another rule for selecting from +;; `y`. +(rule 8 (lower (shuffle x y (pshufd_lhs_imm imm))) + (x64_pshufd x imm)) +(rule 7 (lower (shuffle x y (pshufd_rhs_imm imm))) + (x64_pshufd y imm)) + +(decl pshufd_lhs_imm (u8) Immediate) +(extern extractor pshufd_lhs_imm pshufd_lhs_imm) +(decl pshufd_rhs_imm (u8) Immediate) +(extern extractor pshufd_rhs_imm pshufd_rhs_imm) + +;; Special case for i8-level interleaving of upper/low bytes. +(rule 6 (lower (shuffle a b (u128_from_immediate 0x1f0f_1e0e_1d0d_1c0c_1b0b_1a0a_1909_1808))) + (x64_punpckhbw a b)) +(rule 6 (lower (shuffle a b (u128_from_immediate 0x1707_1606_1505_1404_1303_1202_1101_1000))) (x64_punpcklbw a b)) +;; Special case for i16-level interleaving of upper/low bytes. +(rule 6 (lower (shuffle a b (u128_from_immediate 0x1f1e_0f0e_1d1c_0d0c_1b1a_0b0a_1918_0908))) + (x64_punpckhwd a b)) +(rule 6 (lower (shuffle a b (u128_from_immediate 0x1716_0706_1514_0504_1312_0302_1110_0100))) + (x64_punpcklwd a b)) + +;; Special case for i32-level interleaving of upper/low bytes. +(rule 6 (lower (shuffle a b (u128_from_immediate 0x1f1e1d1c_0f0e0d0c_1b1a1918_0b0a0908))) + (x64_punpckhdq a b)) +(rule 6 (lower (shuffle a b (u128_from_immediate 0x17161514_07060504_13121110_03020100))) + (x64_punpckldq a b)) + +;; Special case for i64-level interleaving of upper/low bytes. +(rule 6 (lower (shuffle a b (u128_from_immediate 0x1f1e1d1c1b1a1918_0f0e0d0c0b0a0908))) + (x64_punpckhqdq a b)) +(rule 6 (lower (shuffle a b (u128_from_immediate 0x1716151413121110_0706050403020100))) + (x64_punpcklqdq a b)) + +;; If the vector shift mask is all 0s then that means the first byte of the +;; first operand is broadcast to all bytes. Falling through would load an +;; all-zeros constant from a rip-relative location but it should be slightly +;; more efficient to execute the `pshufb` here-and-now with an xor'd-to-be-zero +;; register. +(rule 6 (lower (shuffle a _ (u128_from_immediate 0))) + (x64_pshufb a (xmm_zero $I8X16))) + +;; Special case for the `shufps` instruction which will select two 32-bit values +;; from the first operand and two 32-bit values from the second operand. Note +;; that there is a second case here as well for when the operands can be +;; swapped. +;; +;; Note that the priority of this instruction is currently lower than the above +;; special cases since `shufps` handles many of them and for now it's +;; hypothesized that the dedicated instructions are better than `shufps`. +;; Someone with more knowledge about x86 timings should perhaps reorder the +;; rules here eventually though. 
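+;;
+;; For reference, `shufps dst, src, imm` picks dst lanes imm[1:0] and imm[3:2]
+;; as the two low 32-bit results and src lanes imm[5:4] and imm[7:6] as the two
+;; high results; the `shufps_imm`/`shufps_rev_imm` extractors below produce
+;; exactly that immediate when the shuffle mask fits this shape.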
+(rule 5 (lower (shuffle x y (shufps_imm imm))) + (x64_shufps x y imm)) +(rule 4 (lower (shuffle x y (shufps_rev_imm imm))) + (x64_shufps y x imm)) + +(decl shufps_imm(u8) Immediate) +(extern extractor shufps_imm shufps_imm) +(decl shufps_rev_imm(u8) Immediate) +(extern extractor shufps_rev_imm shufps_rev_imm) + + ;; If `lhs` and `rhs` are the same we can use a single PSHUFB to shuffle the XMM ;; register. We statically build `constructed_mask` to zero out any unknown lane ;; indices (may not be completely necessary: verification could fail incorrect diff --git a/cranelift/codegen/src/isa/x64/lower/isle.rs b/cranelift/codegen/src/isa/x64/lower/isle.rs index 61be54a005..a5549e0fd5 100644 --- a/cranelift/codegen/src/isa/x64/lower/isle.rs +++ b/cranelift/codegen/src/isa/x64/lower/isle.rs @@ -999,6 +999,124 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> { }, } } + + fn pshufd_lhs_imm(&mut self, imm: Immediate) -> Option { + let (a, b, c, d) = self.shuffle32_from_imm(imm)?; + if a < 4 && b < 4 && c < 4 && d < 4 { + Some(a | (b << 2) | (c << 4) | (d << 6)) + } else { + None + } + } + + fn pshufd_rhs_imm(&mut self, imm: Immediate) -> Option { + let (a, b, c, d) = self.shuffle32_from_imm(imm)?; + // When selecting from the right-hand-side, subtract these all by 4 + // which will bail out if anything is less than 4. Afterwards the check + // is the same as `pshufd_lhs_imm` above. + let a = a.checked_sub(4)?; + let b = b.checked_sub(4)?; + let c = c.checked_sub(4)?; + let d = d.checked_sub(4)?; + if a < 4 && b < 4 && c < 4 && d < 4 { + Some(a | (b << 2) | (c << 4) | (d << 6)) + } else { + None + } + } + + fn shufps_imm(&mut self, imm: Immediate) -> Option { + // The `shufps` instruction selects the first two elements from the + // first vector and the second two elements from the second vector, so + // offset the third/fourth selectors by 4 and then make sure everything + // fits in 32-bits. + let (a, b, c, d) = self.shuffle32_from_imm(imm)?; + let c = c.checked_sub(4)?; + let d = d.checked_sub(4)?; + if a < 4 && b < 4 && c < 4 && d < 4 { + Some(a | (b << 2) | (c << 4) | (d << 6)) + } else { + None + } + } + + fn shufps_rev_imm(&mut self, imm: Immediate) -> Option { + // This is almost the same as `shufps_imm` except the elements that are + // subtracted are reversed. This handles the case that `shufps` + // instruction can be emitted if the order of the operands are swapped. + let (a, b, c, d) = self.shuffle32_from_imm(imm)?; + let a = a.checked_sub(4)?; + let b = b.checked_sub(4)?; + if a < 4 && b < 4 && c < 4 && d < 4 { + Some(a | (b << 2) | (c << 4) | (d << 6)) + } else { + None + } + } + + fn pshuflw_lhs_imm(&mut self, imm: Immediate) -> Option { + // Similar to `shufps` except this operates over 16-bit values so four + // of them must be fixed and the other four must be in-range to encode + // in the immediate. 
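+ // Concretely: lanes 0..4 may be any selection of the first operand's low
+ // four 16-bit values, while lanes 4..8 must be exactly [4, 5, 6, 7] so the
+ // high half of the first operand passes through unchanged.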
+ let (a, b, c, d, e, f, g, h) = self.shuffle16_from_imm(imm)?; + if a < 4 && b < 4 && c < 4 && d < 4 && [e, f, g, h] == [4, 5, 6, 7] { + Some(a | (b << 2) | (c << 4) | (d << 6)) + } else { + None + } + } + + fn pshuflw_rhs_imm(&mut self, imm: Immediate) -> Option { + let (a, b, c, d, e, f, g, h) = self.shuffle16_from_imm(imm)?; + let a = a.checked_sub(8)?; + let b = b.checked_sub(8)?; + let c = c.checked_sub(8)?; + let d = d.checked_sub(8)?; + let e = e.checked_sub(8)?; + let f = f.checked_sub(8)?; + let g = g.checked_sub(8)?; + let h = h.checked_sub(8)?; + if a < 4 && b < 4 && c < 4 && d < 4 && [e, f, g, h] == [4, 5, 6, 7] { + Some(a | (b << 2) | (c << 4) | (d << 6)) + } else { + None + } + } + + fn pshufhw_lhs_imm(&mut self, imm: Immediate) -> Option { + // Similar to `pshuflw` except that the first four operands must be + // fixed and the second four are offset by an extra 4 and tested to + // make sure they're all in the range [4, 8). + let (a, b, c, d, e, f, g, h) = self.shuffle16_from_imm(imm)?; + let e = e.checked_sub(4)?; + let f = f.checked_sub(4)?; + let g = g.checked_sub(4)?; + let h = h.checked_sub(4)?; + if e < 4 && f < 4 && g < 4 && h < 4 && [a, b, c, d] == [0, 1, 2, 3] { + Some(e | (f << 2) | (g << 4) | (h << 6)) + } else { + None + } + } + + fn pshufhw_rhs_imm(&mut self, imm: Immediate) -> Option { + // Note that everything here is offset by at least 8 and the upper + // bits are offset by 12 to test they're in the range of [12, 16). + let (a, b, c, d, e, f, g, h) = self.shuffle16_from_imm(imm)?; + let a = a.checked_sub(8)?; + let b = b.checked_sub(8)?; + let c = c.checked_sub(8)?; + let d = d.checked_sub(8)?; + let e = e.checked_sub(12)?; + let f = f.checked_sub(12)?; + let g = g.checked_sub(12)?; + let h = h.checked_sub(12)?; + if e < 4 && f < 4 && g < 4 && h < 4 && [a, b, c, d] == [0, 1, 2, 3] { + Some(e | (f << 2) | (g << 4) | (h << 6)) + } else { + None + } + } } impl IsleContext<'_, '_, MInst, X64Backend> { diff --git a/cranelift/codegen/src/machinst/isle.rs b/cranelift/codegen/src/machinst/isle.rs index 70bcb7d12e..3eb0db7ea5 100644 --- a/cranelift/codegen/src/machinst/isle.rs +++ b/cranelift/codegen/src/machinst/isle.rs @@ -585,9 +585,86 @@ macro_rules! isle_lower_prelude_methods { .collect(); self.lower_ctx.gen_return(rets); } + + /// Attempts to interpret the shuffle immediate `imm` as a shuffle of + /// 32-bit lanes, returning four integers, each of which is less than 8, + /// which represents a permutation of 32-bit lanes as specified by + /// `imm`. + /// + /// For example the shuffle immediate + /// + /// `0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27` + /// + /// would return `Some((0, 2, 4, 6))`. + fn shuffle32_from_imm(&mut self, imm: Immediate) -> Option<(u8, u8, u8, u8)> { + use crate::machinst::isle::shuffle_imm_as_le_lane_idx; + + let bytes = self.lower_ctx.get_immediate_data(imm).as_slice(); + Some(( + shuffle_imm_as_le_lane_idx(4, &bytes[0..4])?, + shuffle_imm_as_le_lane_idx(4, &bytes[4..8])?, + shuffle_imm_as_le_lane_idx(4, &bytes[8..12])?, + shuffle_imm_as_le_lane_idx(4, &bytes[12..16])?, + )) + } + + /// Same as `shuffle32_from_imm`, but for 16-bit lane shuffles. 
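+ ///
+ /// Returns eight lane indices, each less than 16, when the byte-level mask
+ /// can be read as a shuffle of whole 16-bit lanes, and `None` otherwise.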
+ fn shuffle16_from_imm( + &mut self, + imm: Immediate, + ) -> Option<(u8, u8, u8, u8, u8, u8, u8, u8)> { + use crate::machinst::isle::shuffle_imm_as_le_lane_idx; + let bytes = self.lower_ctx.get_immediate_data(imm).as_slice(); + Some(( + shuffle_imm_as_le_lane_idx(2, &bytes[0..2])?, + shuffle_imm_as_le_lane_idx(2, &bytes[2..4])?, + shuffle_imm_as_le_lane_idx(2, &bytes[4..6])?, + shuffle_imm_as_le_lane_idx(2, &bytes[6..8])?, + shuffle_imm_as_le_lane_idx(2, &bytes[8..10])?, + shuffle_imm_as_le_lane_idx(2, &bytes[10..12])?, + shuffle_imm_as_le_lane_idx(2, &bytes[12..14])?, + shuffle_imm_as_le_lane_idx(2, &bytes[14..16])?, + )) + } }; } +/// Returns the `size`-byte lane referred to by the shuffle immediate specified +/// in `bytes`. +/// +/// This helper is used by `shuffleNN_from_imm` above and is used to interpret a +/// byte-based shuffle as a higher-level shuffle of bigger lanes. This will see +/// if the `bytes` specified, which must have `size` length, specifies a lane in +/// vectors aligned to a `size`-byte boundary. +/// +/// Returns `None` if `bytes` doesn't specify a `size`-byte lane aligned +/// appropriately, or returns `Some(n)` where `n` is the index of the lane being +/// shuffled. +pub fn shuffle_imm_as_le_lane_idx(size: u8, bytes: &[u8]) -> Option { + assert_eq!(bytes.len(), usize::from(size)); + + // The first index in `bytes` must be aligned to a `size` boundary for the + // bytes to be a valid specifier for a lane of `size` bytes. + if bytes[0] % size != 0 { + return None; + } + + // Afterwards the bytes must all be one larger than the prior to specify a + // contiguous sequence of bytes that's being shuffled. Basically `bytes` + // must refer to the entire `size`-byte lane, in little-endian order. + for i in 0..size - 1 { + let idx = usize::from(i); + if bytes[idx] + 1 != bytes[idx + 1] { + return None; + } + } + + // All of the `bytes` are in-order, meaning that this is a valid shuffle + // immediate to specify a lane of `size` bytes. The index, when viewed as + // `size`-byte immediates, will be the first byte divided by the byte size. + Some(bytes[0] / size) +} + /// Helpers specifically for machines that use ABICaller. #[macro_export] #[doc(hidden)] diff --git a/cranelift/codegen/src/prelude_lower.isle b/cranelift/codegen/src/prelude_lower.isle index 51e15cb2a1..a7e59d5908 100644 --- a/cranelift/codegen/src/prelude_lower.isle +++ b/cranelift/codegen/src/prelude_lower.isle @@ -592,6 +592,16 @@ (decl u64_from_constant (u64) Constant) (extern extractor u64_from_constant u64_from_constant) +;; Extracts lane indices, represented as u8's, if the immediate for a +;; `shuffle` instruction represents shuffling N-bit values. The u8 values +;; returned will be in the range of 0 to (256/N)-1, inclusive, and index the +;; N-bit chunks of two concatenated 128-bit vectors starting from the +;; least-significant bits. 
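+;;
+;; For example the byte-level mask [0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27]
+;; matches `(shuffle32_from_imm 0 2 4 6)`.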
+(decl shuffle32_from_imm (u8 u8 u8 u8) Immediate) +(extern extractor shuffle32_from_imm shuffle32_from_imm) +(decl shuffle16_from_imm (u8 u8 u8 u8 u8 u8 u8 u8) Immediate) +(extern extractor shuffle16_from_imm shuffle16_from_imm) + ;;;; Helpers for generating returns ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Extractor to check for the special case that a `WritableValueRegs` diff --git a/cranelift/filetests/filetests/isa/x64/shuffle-avx.clif b/cranelift/filetests/filetests/isa/x64/shuffle-avx.clif new file mode 100644 index 0000000000..30cf9721e1 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/shuffle-avx.clif @@ -0,0 +1,116 @@ +test compile precise-output +set enable_simd +target x86_64 has_avx + +function %punpckldq(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 16 17 18 19 4 5 6 7 20 21 22 23] + v5 = bitcast.i32x4 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpunpckldq %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpunpckldq %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %punpckhdq(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [8 9 10 11 24 25 26 27 12 13 14 15 28 29 30 31] + v5 = bitcast.i32x4 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpunpckhdq %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpunpckhdq %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %punpcklqdq(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23] + v5 = bitcast.i64x2 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpunpcklqdq %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpunpcklqdq %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %punpckhqdq(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31] + v5 = bitcast.i64x2 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpunpckhqdq %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpunpckhqdq %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + diff --git a/cranelift/filetests/filetests/isa/x64/shuffle.clif b/cranelift/filetests/filetests/isa/x64/shuffle.clif index 529b95cc5d..b056d9f168 100644 --- a/cranelift/filetests/filetests/isa/x64/shuffle.clif +++ b/cranelift/filetests/filetests/isa/x64/shuffle.clif @@ -52,3 +52,594 @@ block0(v0: i8x16, v1: i8x16): ; popq %rbp ; retq +function %punpcklwd(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 16 17 2 3 18 19 4 5 20 21 6 7 22 23] + v5 = bitcast.i16x8 little v4 + return v5 +} + +; 
VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; punpcklwd %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; punpcklwd %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %punpckhwd(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [8 9 24 25 10 11 26 27 12 13 28 29 14 15 30 31] + v5 = bitcast.i16x8 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; punpckhwd %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; punpckhwd %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %pshufd_0022(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 0 1 2 3 8 9 10 11 8 9 10 11] + v5 = bitcast.i32x4 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pshufd $160, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pshufd $0xa0, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %pshufd_3120(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [12 13 14 15 4 5 6 7 8 9 10 11 0 1 2 3] + v5 = bitcast.i32x4 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pshufd $39, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pshufd $0x27, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %pshufd_7546(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [28 29 30 31 20 21 22 23 16 17 18 19 24 25 26 27] + v5 = bitcast.i32x4 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pshufd $135, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pshufd $0x87, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %not_single_pshufd(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23] + v5 = bitcast.i32x4 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; shufps $78, %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; shufps $0x4e, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %punpckldq(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 16 17 18 19 4 5 6 7 20 21 22 23] + v5 = bitcast.i32x4 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; punpckldq %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; punpckldq 
%xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %punpckhdq(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [8 9 10 11 24 25 26 27 12 13 14 15 28 29 30 31] + v5 = bitcast.i32x4 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; punpckhdq %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; punpckhdq %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %punpcklqdq(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23] + v5 = bitcast.i64x2 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; punpcklqdq %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; punpcklqdq %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %punpckhqdq(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31] + v5 = bitcast.i64x2 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; punpckhqdq %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; punpckhqdq %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %shufps_3277(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [12 13 14 15 8 9 10 11 28 29 30 31 28 29 30 31] + v5 = bitcast.i32x4 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; shufps $251, %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; shufps $0xfb, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %shufps_6500(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [24 25 26 27 20 21 22 23 0 1 2 3 0 1 2 3] + v5 = bitcast.i32x4 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movdqa %xmm0, %xmm4 +; movdqa %xmm1, %xmm0 +; shufps $6, %xmm0, %xmm4, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movdqa %xmm0, %xmm4 +; movdqa %xmm1, %xmm0 +; shufps $6, %xmm4, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %pshuflw_3210(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [6 7 4 5 2 3 0 1 8 9 10 11 12 13 14 15] + v5 = bitcast.i16x8 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pshuflw $27, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pshuflw $0x1b, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %pshuflw_3131(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = 
bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [6 7 4 5 6 7 4 5 8 9 10 11 12 13 14 15] + v5 = bitcast.i16x8 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pshuflw $187, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pshuflw $0xbb, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %pshuflw_rhs_3210(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [22 23 20 21 18 19 16 17 24 25 26 27 28 29 30 31] + v5 = bitcast.i16x8 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pshuflw $27, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pshuflw $0x1b, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %pshuflw_rhs_3131(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [22 23 18 19 22 23 18 19 24 25 26 27 28 29 30 31] + v5 = bitcast.i16x8 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pshuflw $119, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pshuflw $0x77, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %pshufhw_3210(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 14 15 12 13 10 11 8 9] + v5 = bitcast.i16x8 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pshufhw $27, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pshufhw $0x1b, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %pshufhw_3131(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 14 15 10 11 14 15 10 11] + v5 = bitcast.i16x8 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pshufhw $119, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pshufhw $0x77, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %pshufhw_rhs_3210(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [16 17 18 19 20 21 22 23 30 31 28 29 26 27 24 25] + v5 = bitcast.i16x8 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pshufhw $27, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pshufhw $0x1b, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %pshufhw_rhs_3131(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [16 17 18 19 20 21 22 23 30 31 26 27 30 31 26 27] + v5 = bitcast.i16x8 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq 
%rsp, %rbp +; block0: +; pshufhw $119, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pshufhw $0x77, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %shuffle_all_zeros(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle v0, v1, [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pxor %xmm3, %xmm3, %xmm3 +; pshufb %xmm0, %xmm3, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pxor %xmm3, %xmm3 +; pshufb %xmm3, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + diff --git a/cranelift/filetests/filetests/runtests/simd-shuffle.clif b/cranelift/filetests/filetests/runtests/simd-shuffle.clif index 621eebda62..60b515628d 100644 --- a/cranelift/filetests/filetests/runtests/simd-shuffle.clif +++ b/cranelift/filetests/filetests/runtests/simd-shuffle.clif @@ -1,9 +1,10 @@ -test interpret +;; test interpret ;; FIXME(#5915) test run target aarch64 target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 +target x86_64 has_sse3 has_ssse3 has_sse41 has_avx target x86_64 has_sse3 has_ssse3 has_sse41 has_avx512vl has_avx512vbmi function %shuffle_i8x16(i8x16, i8x16) -> i8x16 { @@ -26,3 +27,234 @@ block0(v0: i8x16): return v1 } ; run: %shuffle1([0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]) == [8 9 10 11 12 13 14 15 0 1 2 3 4 5 6 7] + +function %punpcklbw(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle v0, v1, [0 16 1 17 2 18 3 19 4 20 5 21 6 22 7 23] + return v2 +} +; run: %punpcklbw([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [1 17 2 18 3 19 4 20 5 21 6 22 7 23 8 24] + +function %punpckhbw(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle v0, v1, [8 24 9 25 10 26 11 27 12 28 13 29 14 30 15 31] + return v2 +} +; run: %punpckhbw([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [9 25 10 26 11 27 12 28 13 29 14 30 15 31 16 32] + +function %punpcklwd(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 16 17 2 3 18 19 4 5 20 21 6 7 22 23] + v5 = bitcast.i16x8 little v4 + return v5 +} +; run: %punpcklwd([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [1 9 2 10 3 11 4 12] + +function %punpckhwd(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [8 9 24 25 10 11 26 27 12 13 28 29 14 15 30 31] + v5 = bitcast.i16x8 little v4 + return v5 +} +; run: %punpckhwd([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [5 13 6 14 7 15 8 16] + +function %pshufd_0022(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 0 1 2 3 8 9 10 11 8 9 10 11] + v5 = bitcast.i32x4 little v4 + return v5 +} +; run: %pshufd_0022([1 2 3 4], [5 6 7 8]) == [1 1 3 3] + +function %pshufd_3120(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [12 13 14 15 4 5 6 7 8 9 10 11 0 1 2 3] + v5 = bitcast.i32x4 little v4 + return v5 +} +; run: %pshufd_0022([1 2 3 4], [5 6 7 8]) == [4 2 3 1] + +function %pshufd_7546(i32x4, i32x4) -> i32x4 { +block0(v0: 
i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [28 29 30 31 20 21 22 23 16 17 18 19 24 25 26 27] + v5 = bitcast.i32x4 little v4 + return v5 +} +; run: %pshufd_0022([1 2 3 4], [5 6 7 8]) == [8 6 5 7] + +function %not_pshufd(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23] + v5 = bitcast.i32x4 little v4 + return v5 +} +; run: %pshufd_0022([1 2 3 4], [5 6 7 8]) == [3 4 5 6] + +function %punpckldq(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 16 17 18 19 4 5 6 7 20 21 22 23] + v5 = bitcast.i32x4 little v4 + return v5 +} +; run: %punpckldq([1 2 3 4], [5 6 7 8]) == [1 5 2 6] + +function %punpckhdq(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [8 9 10 11 24 25 26 27 12 13 14 15 28 29 30 31] + v5 = bitcast.i32x4 little v4 + return v5 +} +; run: %punpckldq([1 2 3 4], [5 6 7 8]) == [3 7 4 8] + +function %punpcklqdq(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23] + v5 = bitcast.i64x2 little v4 + return v5 +} +; run: %punpcklqdq([1 2], [5 6]) == [1 5] + +function %punpckhqdq(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31] + v5 = bitcast.i64x2 little v4 + return v5 +} +; run: %punpckhqdq([1 2], [5 6]) == [2 6] + +function %shufps_0145(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23] + v5 = bitcast.i32x4 little v4 + return v5 +} +; run: %shufps_0145([1 2 3 4], [5 6 7 8]) == [1 2 5 6] + +function %shufps_3277(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [12 13 14 15 8 9 10 11 28 29 30 31 28 29 30 31] + v5 = bitcast.i32x4 little v4 + return v5 +} +; run: %shufps_0145([1 2 3 4], [5 6 7 8]) == [4 3 8 8] + +function %shufps_6500(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [24 25 26 27 20 21 22 23 0 1 2 3 0 1 2 3] + v5 = bitcast.i32x4 little v4 + return v5 +} +; run: %shufps_0145([1 2 3 4], [5 6 7 8]) == [7 6 1 1] + +function %pshuflw_3210(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [6 7 4 5 2 3 0 1 8 9 10 11 12 13 14 15] + v5 = bitcast.i16x8 little v4 + return v5 +} +; run: %pshuflw_3210([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [4 3 2 1 5 6 7 8] + +function %pshuflw_3131(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [6 7 4 5 6 7 4 5 8 9 10 11 12 13 14 15] + v5 = bitcast.i16x8 little v4 + return v5 +} +; run: %pshuflw_3131([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [4 3 4 3 5 6 7 8] + +function %pshuflw_rhs_3210(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + 
v4 = shuffle v2, v3, [22 23 20 21 18 19 16 17 24 25 26 27 28 29 30 31] + v5 = bitcast.i16x8 little v4 + return v5 +} +; run: %pshuflw_rhs_3210([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [12 11 10 9 13 14 15 16] + +function %pshuflw_rhs_3131(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [22 23 18 19 22 23 18 19 24 25 26 27 28 29 30 31] + v5 = bitcast.i16x8 little v4 + return v5 +} +; run: %pshuflw_rhs_3131([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [12 10 12 10 13 14 15 16] + +function %pshufhw_3210(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 14 15 12 13 10 11 8 9] + v5 = bitcast.i16x8 little v4 + return v5 +} +; run: %pshufhw_3210([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [1 2 3 4 8 7 6 5] + +function %pshufhw_3131(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 14 15 10 11 14 15 10 11] + v5 = bitcast.i16x8 little v4 + return v5 +} +; run: %pshufhw_3131([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [1 2 3 4 8 6 8 6] + +function %pshufhw_rhs_3210(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [16 17 18 19 20 21 22 23 30 31 28 29 26 27 24 25] + v5 = bitcast.i16x8 little v4 + return v5 +} +; run: %pshufhw_rhs_3210([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [9 10 11 12 16 15 14 13] + +function %pshufhw_rhs_3131(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [16 17 18 19 20 21 22 23 30 31 26 27 30 31 26 27] + v5 = bitcast.i16x8 little v4 + return v5 +} +; run: %pshufhw_rhs_3131([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [9 10 11 12 16 14 16 14] + +function %shuffle_all_zeros(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle v0, v1, [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] + return v2 +} +; run: %shuffle_all_zeros([5 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1], [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]) == [5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5]