x64: Add a smattering of lowerings for shuffle specializations (#5930)

* x64: Add lowerings for `punpck{h,l}wd`

Add some special cases for `shuffle` for more specialized x86
instructions.

* x64: Add `shuffle` lowerings for `pshufd`

This commit adds special-cased lowerings for the x64 `shuffle`
instruction when the `pshufd` instruction alone is necessary. This is
possible when the shuffle immediate permutes 32-bit values within one of
the vector inputs of the `shuffle` instruction, but not both.

* x64: Add shuffle lowerings for `punpck{h,l}{q,}dq`

This adds specific permutations for some x86 instructions which
specifically interleave high/low bytes for 32 and 64-bit values. This
corresponds to the preexisting specific lowerings for interleaving 8 and
16-bit values.

* x64: Add `shuffle` lowerings for `shufps`

This commit adds targeted lowerings for the `shuffle` instruction that
match the pattern that `shufps` supports. The `shufps` instruction
selects two elements from the first vector and two elements from the
second vector which means while it's not generally applicable it should
still be more useful than the catch-all lowering of `shuffle`.

* x64: Add shuffle support for `pshuf{l,h}w`

This commit adds special lowering cases for these instructions which
permute 16-bit values within a 128-bit value either within the upper or
lower half of the 128-bit value.

* x64: Specialize `shuffle` with an all-zeros immediate

Instead of loading the all-zeros immediate from a rip-relative address
at the end of the function instead generate a zero with a `pxor`
instruction and then use `pshufb` to do the broadcast.

* Review comments
This commit is contained in:
Alex Crichton
2023-03-09 16:58:19 -06:00
committed by GitHub
parent 8a2bf29444
commit 1c3a1bda6c
10 changed files with 1332 additions and 10 deletions

View File

@@ -863,6 +863,12 @@
Xorpd Xorpd
Phaddw Phaddw
Phaddd Phaddd
Punpckhdq
Punpckldq
Punpckhqdq
Punpcklqdq
Pshuflw
Pshufhw
)) ))
(type CmpOpcode extern (type CmpOpcode extern
@@ -1347,6 +1353,12 @@
Vcvttps2dq Vcvttps2dq
Vphaddw Vphaddw
Vphaddd Vphaddd
Vpunpckhdq
Vpunpckldq
Vpunpckhqdq
Vpunpcklqdq
Vpshuflw
Vpshufhw
)) ))
(type Avx512Opcode extern (type Avx512Opcode extern
@@ -2729,6 +2741,38 @@
(if-let $true (has_avx)) (if-let $true (has_avx))
(xmm_rmir_vex (AvxOpcode.Vpunpcklwd) src1 src2)) (xmm_rmir_vex (AvxOpcode.Vpunpcklwd) src1 src2))
;; Helper for creating `punpckldq` instructions (interleave the low 32-bit
;; lanes of the two operands).
(decl x64_punpckldq (Xmm XmmMem) Xmm)
(rule 0 (x64_punpckldq src1 src2)
      (xmm_rm_r (SseOpcode.Punpckldq) src1 src2))
;; Higher-priority rule: prefer the VEX encoding when AVX is available.
(rule 1 (x64_punpckldq src1 src2)
      (if-let $true (has_avx))
      (xmm_rmir_vex (AvxOpcode.Vpunpckldq) src1 src2))
;; Helper for creating `punpckhdq` instructions (interleave the high 32-bit
;; lanes of the two operands).
(decl x64_punpckhdq (Xmm XmmMem) Xmm)
(rule 0 (x64_punpckhdq src1 src2)
      (xmm_rm_r (SseOpcode.Punpckhdq) src1 src2))
;; Higher-priority rule: prefer the VEX encoding when AVX is available.
(rule 1 (x64_punpckhdq src1 src2)
      (if-let $true (has_avx))
      (xmm_rmir_vex (AvxOpcode.Vpunpckhdq) src1 src2))
;; Helper for creating `punpcklqdq` instructions (interleave the low 64-bit
;; lanes of the two operands).
(decl x64_punpcklqdq (Xmm XmmMem) Xmm)
(rule 0 (x64_punpcklqdq src1 src2)
      (xmm_rm_r (SseOpcode.Punpcklqdq) src1 src2))
;; Higher-priority rule: prefer the VEX encoding when AVX is available.
(rule 1 (x64_punpcklqdq src1 src2)
      (if-let $true (has_avx))
      (xmm_rmir_vex (AvxOpcode.Vpunpcklqdq) src1 src2))
;; Helper for creating `punpckhqdq` instructions (interleave the high 64-bit
;; lanes of the two operands).
(decl x64_punpckhqdq (Xmm XmmMem) Xmm)
(rule 0 (x64_punpckhqdq src1 src2)
      (xmm_rm_r (SseOpcode.Punpckhqdq) src1 src2))
;; Higher-priority rule: prefer the VEX encoding when AVX is available.
(rule 1 (x64_punpckhqdq src1 src2)
      (if-let $true (has_avx))
      (xmm_rmir_vex (AvxOpcode.Vpunpckhqdq) src1 src2))
;; Helper for creating `unpcklps` instructions. ;; Helper for creating `unpcklps` instructions.
(decl x64_unpcklps (Xmm XmmMem) Xmm) (decl x64_unpcklps (Xmm XmmMem) Xmm)
(rule 0 (x64_unpcklps src1 src2) (rule 0 (x64_unpcklps src1 src2)
@@ -3284,6 +3328,22 @@
(if-let $true (has_avx)) (if-let $true (has_avx))
(xmm_rmir_vex (AvxOpcode.Vpshufb) src1 src2)) (xmm_rmir_vex (AvxOpcode.Vpshufb) src1 src2))
;; Helper for creating `pshuflw` instructions (permute the low four 16-bit
;; lanes of `src` per `imm`, preserving the high four).
(decl x64_pshuflw (XmmMem u8) Xmm)
(rule (x64_pshuflw src imm)
      (xmm_unary_rm_r_imm (SseOpcode.Pshuflw) src imm))
;; Higher-priority rule: prefer the VEX encoding when AVX is available.
(rule 1 (x64_pshuflw src imm)
      (if-let $true (has_avx))
      (xmm_unary_rm_r_imm_vex (AvxOpcode.Vpshuflw) src imm))
;; Helper for creating `pshufhw` instructions (permute the high four 16-bit
;; lanes of `src` per `imm`, preserving the low four).
(decl x64_pshufhw (XmmMem u8) Xmm)
(rule (x64_pshufhw src imm)
      (xmm_unary_rm_r_imm (SseOpcode.Pshufhw) src imm))
;; Higher-priority rule: prefer the VEX encoding when AVX is available.
(rule 1 (x64_pshufhw src imm)
      (if-let $true (has_avx))
      (xmm_unary_rm_r_imm_vex (AvxOpcode.Vpshufhw) src imm))
;; Helper for creating `shufps` instructions. ;; Helper for creating `shufps` instructions.
(decl x64_shufps (Xmm XmmMem u8) Xmm) (decl x64_shufps (Xmm XmmMem u8) Xmm)
(rule 0 (x64_shufps src1 src2 byte) (rule 0 (x64_shufps src1 src2 byte)

View File

@@ -1117,6 +1117,12 @@ pub enum SseOpcode {
Xorpd, Xorpd,
Phaddw, Phaddw,
Phaddd, Phaddd,
Punpckhdq,
Punpckldq,
Punpckhqdq,
Punpcklqdq,
Pshuflw,
Pshufhw,
} }
impl SseOpcode { impl SseOpcode {
@@ -1256,7 +1262,13 @@ impl SseOpcode {
| SseOpcode::Subpd | SseOpcode::Subpd
| SseOpcode::Subsd | SseOpcode::Subsd
| SseOpcode::Ucomisd | SseOpcode::Ucomisd
| SseOpcode::Xorpd => SSE2, | SseOpcode::Xorpd
| SseOpcode::Punpckldq
| SseOpcode::Punpckhdq
| SseOpcode::Punpcklqdq
| SseOpcode::Punpckhqdq
| SseOpcode::Pshuflw
| SseOpcode::Pshufhw => SSE2,
SseOpcode::Pabsb SseOpcode::Pabsb
| SseOpcode::Pabsw | SseOpcode::Pabsw
@@ -1501,6 +1513,12 @@ impl fmt::Debug for SseOpcode {
SseOpcode::Xorpd => "xorpd", SseOpcode::Xorpd => "xorpd",
SseOpcode::Phaddw => "phaddw", SseOpcode::Phaddw => "phaddw",
SseOpcode::Phaddd => "phaddd", SseOpcode::Phaddd => "phaddd",
SseOpcode::Punpckldq => "punpckldq",
SseOpcode::Punpckhdq => "punpckhdq",
SseOpcode::Punpcklqdq => "punpcklqdq",
SseOpcode::Punpckhqdq => "punpckhqdq",
SseOpcode::Pshuflw => "pshuflw",
SseOpcode::Pshufhw => "pshufhw",
}; };
write!(fmt, "{}", name) write!(fmt, "{}", name)
} }
@@ -1669,7 +1687,13 @@ impl AvxOpcode {
| AvxOpcode::Vcvttpd2dq | AvxOpcode::Vcvttpd2dq
| AvxOpcode::Vcvttps2dq | AvxOpcode::Vcvttps2dq
| AvxOpcode::Vphaddw | AvxOpcode::Vphaddw
| AvxOpcode::Vphaddd => { | AvxOpcode::Vphaddd
| AvxOpcode::Vpunpckldq
| AvxOpcode::Vpunpckhdq
| AvxOpcode::Vpunpcklqdq
| AvxOpcode::Vpunpckhqdq
| AvxOpcode::Vpshuflw
| AvxOpcode::Vpshufhw => {
smallvec![InstructionSet::AVX] smallvec![InstructionSet::AVX]
} }
} }

View File

@@ -1789,6 +1789,8 @@ pub(crate) fn emit(
SseOpcode::Roundpd => (LegacyPrefixes::_66, 0x0F3A09, 3), SseOpcode::Roundpd => (LegacyPrefixes::_66, 0x0F3A09, 3),
SseOpcode::Roundsd => (LegacyPrefixes::_66, 0x0F3A0B, 3), SseOpcode::Roundsd => (LegacyPrefixes::_66, 0x0F3A0B, 3),
SseOpcode::Pshufd => (LegacyPrefixes::_66, 0x0F70, 2), SseOpcode::Pshufd => (LegacyPrefixes::_66, 0x0F70, 2),
SseOpcode::Pshuflw => (LegacyPrefixes::_F2, 0x0F70, 2),
SseOpcode::Pshufhw => (LegacyPrefixes::_F3, 0x0F70, 2),
_ => unimplemented!("Opcode {:?} not implemented", op), _ => unimplemented!("Opcode {:?} not implemented", op),
}; };
match src { match src {
@@ -1946,6 +1948,10 @@ pub(crate) fn emit(
SseOpcode::Punpckhwd => (LegacyPrefixes::_66, 0x0F69, 2), SseOpcode::Punpckhwd => (LegacyPrefixes::_66, 0x0F69, 2),
SseOpcode::Punpcklbw => (LegacyPrefixes::_66, 0x0F60, 2), SseOpcode::Punpcklbw => (LegacyPrefixes::_66, 0x0F60, 2),
SseOpcode::Punpcklwd => (LegacyPrefixes::_66, 0x0F61, 2), SseOpcode::Punpcklwd => (LegacyPrefixes::_66, 0x0F61, 2),
SseOpcode::Punpckldq => (LegacyPrefixes::_66, 0x0F62, 2),
SseOpcode::Punpcklqdq => (LegacyPrefixes::_66, 0x0F6C, 2),
SseOpcode::Punpckhdq => (LegacyPrefixes::_66, 0x0F6A, 2),
SseOpcode::Punpckhqdq => (LegacyPrefixes::_66, 0x0F6D, 2),
SseOpcode::Pxor => (LegacyPrefixes::_66, 0x0FEF, 2), SseOpcode::Pxor => (LegacyPrefixes::_66, 0x0FEF, 2),
SseOpcode::Subps => (LegacyPrefixes::None, 0x0F5C, 2), SseOpcode::Subps => (LegacyPrefixes::None, 0x0F5C, 2),
SseOpcode::Subpd => (LegacyPrefixes::_66, 0x0F5C, 2), SseOpcode::Subpd => (LegacyPrefixes::_66, 0x0F5C, 2),
@@ -2171,6 +2177,10 @@ pub(crate) fn emit(
AvxOpcode::Vmaxsd => (LP::_F2, OM::_0F, 0x5F), AvxOpcode::Vmaxsd => (LP::_F2, OM::_0F, 0x5F),
AvxOpcode::Vphaddw => (LP::_66, OM::_0F38, 0x01), AvxOpcode::Vphaddw => (LP::_66, OM::_0F38, 0x01),
AvxOpcode::Vphaddd => (LP::_66, OM::_0F38, 0x02), AvxOpcode::Vphaddd => (LP::_66, OM::_0F38, 0x02),
AvxOpcode::Vpunpckldq => (LP::_66, OM::_0F, 0x62),
AvxOpcode::Vpunpckhdq => (LP::_66, OM::_0F, 0x6A),
AvxOpcode::Vpunpcklqdq => (LP::_66, OM::_0F, 0x6C),
AvxOpcode::Vpunpckhqdq => (LP::_66, OM::_0F, 0x6D),
_ => panic!("unexpected rmir vex opcode {op:?}"), _ => panic!("unexpected rmir vex opcode {op:?}"),
}; };
VexInstruction::new() VexInstruction::new()
@@ -2400,6 +2410,8 @@ pub(crate) fn emit(
let (prefix, map, opcode) = match op { let (prefix, map, opcode) = match op {
AvxOpcode::Vroundps => (LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x08), AvxOpcode::Vroundps => (LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x08),
AvxOpcode::Vroundpd => (LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x09), AvxOpcode::Vroundpd => (LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x09),
AvxOpcode::Vpshuflw => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x70),
AvxOpcode::Vpshufhw => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x70),
_ => panic!("unexpected rmr_imm_vex opcode {op:?}"), _ => panic!("unexpected rmr_imm_vex opcode {op:?}"),
}; };

View File

@@ -3529,16 +3529,98 @@
;; Rules for `shuffle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Rules for `shuffle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Special case for the `punpckhbw` instruction which interleaves the upper ;; Special case the `pshuf{l,h}w` instruction which shuffles four 16-bit
;; lanes of the two input registers. ;; integers within one value, preserving the other four 16-bit integers in that
(rule 4 (lower (shuffle a b (u128_from_immediate 0x1f0f_1e0e_1d0d_1c0c_1b0b_1a0a_1909_1808))) ;; value (either the high or low half). The complicated logic is in the
(x64_punpckhbw a b)) ;; extractors here implemented in Rust and note that there's two cases for each
;; instruction here to match when either the first or second shuffle operand is
;; used.
(rule 12 (lower (shuffle x y (pshuflw_lhs_imm imm)))
(x64_pshuflw x imm))
(rule 11 (lower (shuffle x y (pshuflw_rhs_imm imm)))
(x64_pshuflw y imm))
(rule 10 (lower (shuffle x y (pshufhw_lhs_imm imm)))
(x64_pshufhw x imm))
(rule 9 (lower (shuffle x y (pshufhw_rhs_imm imm)))
(x64_pshufhw y imm))
;; Special case for the `punpcklbw` instruction which interleaves the lower (decl pshuflw_lhs_imm (u8) Immediate)
;; lanes of the two input registers. (extern extractor pshuflw_lhs_imm pshuflw_lhs_imm)
(rule 4 (lower (shuffle a b (u128_from_immediate 0x1707_1606_1505_1404_1303_1202_1101_1000))) (decl pshuflw_rhs_imm (u8) Immediate)
(extern extractor pshuflw_rhs_imm pshuflw_rhs_imm)
(decl pshufhw_lhs_imm (u8) Immediate)
(extern extractor pshufhw_lhs_imm pshufhw_lhs_imm)
(decl pshufhw_rhs_imm (u8) Immediate)
(extern extractor pshufhw_rhs_imm pshufhw_rhs_imm)
;; Special case for the `pshufd` instruction which will permute 32-bit values
;; within a single register. This is only applicable if the `imm` specified
;; selects 32-bit values from either `x` or `y`, but not both. This means
;; there's one rule for selecting from `x` and another rule for selecting from
;; `y`.
(rule 8 (lower (shuffle x y (pshufd_lhs_imm imm)))
(x64_pshufd x imm))
(rule 7 (lower (shuffle x y (pshufd_rhs_imm imm)))
(x64_pshufd y imm))
(decl pshufd_lhs_imm (u8) Immediate)
(extern extractor pshufd_lhs_imm pshufd_lhs_imm)
(decl pshufd_rhs_imm (u8) Immediate)
(extern extractor pshufd_rhs_imm pshufd_rhs_imm)
;; Special case for i8-level interleaving of upper/low bytes.
(rule 6 (lower (shuffle a b (u128_from_immediate 0x1f0f_1e0e_1d0d_1c0c_1b0b_1a0a_1909_1808)))
(x64_punpckhbw a b))
(rule 6 (lower (shuffle a b (u128_from_immediate 0x1707_1606_1505_1404_1303_1202_1101_1000)))
(x64_punpcklbw a b)) (x64_punpcklbw a b))
;; Special case for i16-level interleaving of upper/low bytes.
(rule 6 (lower (shuffle a b (u128_from_immediate 0x1f1e_0f0e_1d1c_0d0c_1b1a_0b0a_1918_0908)))
(x64_punpckhwd a b))
(rule 6 (lower (shuffle a b (u128_from_immediate 0x1716_0706_1514_0504_1312_0302_1110_0100)))
(x64_punpcklwd a b))
;; Special case for i32-level interleaving of upper/low bytes.
(rule 6 (lower (shuffle a b (u128_from_immediate 0x1f1e1d1c_0f0e0d0c_1b1a1918_0b0a0908)))
(x64_punpckhdq a b))
(rule 6 (lower (shuffle a b (u128_from_immediate 0x17161514_07060504_13121110_03020100)))
(x64_punpckldq a b))
;; Special case for i64-level interleaving of upper/low bytes.
(rule 6 (lower (shuffle a b (u128_from_immediate 0x1f1e1d1c1b1a1918_0f0e0d0c0b0a0908)))
(x64_punpckhqdq a b))
(rule 6 (lower (shuffle a b (u128_from_immediate 0x1716151413121110_0706050403020100)))
(x64_punpcklqdq a b))
;; If the vector shuffle mask is all 0s then that means the first byte of the
;; first operand is broadcast to all bytes. Falling through would load an
;; all-zeros constant from a rip-relative location but it should be slightly
;; more efficient to execute the `pshufb` here-and-now with an xor'd-to-be-zero
;; register.
(rule 6 (lower (shuffle a _ (u128_from_immediate 0)))
(x64_pshufb a (xmm_zero $I8X16)))
;; Special case for the `shufps` instruction which will select two 32-bit values
;; from the first operand and two 32-bit values from the second operand. Note
;; that there is a second case here as well for when the operands can be
;; swapped.
;;
;; Note that the priority of this instruction is currently lower than the above
;; special cases since `shufps` handles many of them and for now it's
;; hypothesized that the dedicated instructions are better than `shufps`.
;; Someone with more knowledge about x86 timings should perhaps reorder the
;; rules here eventually though.
(rule 5 (lower (shuffle x y (shufps_imm imm)))
(x64_shufps x y imm))
(rule 4 (lower (shuffle x y (shufps_rev_imm imm)))
(x64_shufps y x imm))
(decl shufps_imm(u8) Immediate)
(extern extractor shufps_imm shufps_imm)
(decl shufps_rev_imm(u8) Immediate)
(extern extractor shufps_rev_imm shufps_rev_imm)
;; If `lhs` and `rhs` are the same we can use a single PSHUFB to shuffle the XMM ;; If `lhs` and `rhs` are the same we can use a single PSHUFB to shuffle the XMM
;; register. We statically build `constructed_mask` to zero out any unknown lane ;; register. We statically build `constructed_mask` to zero out any unknown lane
;; indices (may not be completely necessary: verification could fail incorrect ;; indices (may not be completely necessary: verification could fail incorrect

View File

@@ -999,6 +999,124 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> {
}, },
} }
} }
fn pshufd_lhs_imm(&mut self, imm: Immediate) -> Option<u8> {
    // A lone `pshufd` on the left operand can only name lanes 0-3; any
    // selector reaching into the second vector disqualifies this lowering.
    let (a, b, c, d) = self.shuffle32_from_imm(imm)?;
    if [a, b, c, d].iter().any(|&lane| lane >= 4) {
        return None;
    }
    // Pack the four 2-bit lane selectors into the `pshufd` immediate.
    Some(a | (b << 2) | (c << 4) | (d << 6))
}
fn pshufd_rhs_imm(&mut self, imm: Immediate) -> Option<u8> {
    // Selecting from the right-hand operand means every lane index must be
    // in 4..8. Rebase each index down by 4 (`checked_sub` bails out if any
    // index points at the first vector) and then validate the upper bound
    // exactly as `pshufd_lhs_imm` does.
    let (a, b, c, d) = self.shuffle32_from_imm(imm)?;
    let rebased = [
        a.checked_sub(4)?,
        b.checked_sub(4)?,
        c.checked_sub(4)?,
        d.checked_sub(4)?,
    ];
    match rebased {
        [a, b, c, d] if a < 4 && b < 4 && c < 4 && d < 4 => {
            Some(a | (b << 2) | (c << 4) | (d << 6))
        }
        _ => None,
    }
}
fn shufps_imm(&mut self, imm: Immediate) -> Option<u8> {
    // `shufps` fills the low two destination lanes from the first operand
    // and the high two from the second, so the third/fourth selectors must
    // name lanes of the second vector (index >= 4). Rebase those by 4 and
    // then require each selector to fit in two bits.
    let (a, b, c, d) = self.shuffle32_from_imm(imm)?;
    let c = c.checked_sub(4)?;
    let d = d.checked_sub(4)?;
    let in_range = a < 4 && b < 4 && c < 4 && d < 4;
    in_range.then(|| a | (b << 2) | (c << 4) | (d << 6))
}
fn shufps_rev_imm(&mut self, imm: Immediate) -> Option<u8> {
    // Mirror image of `shufps_imm`: with the operands swapped at the
    // emission site it's the first two selectors that must name lanes of
    // the second vector, so those are the ones rebased by 4 here.
    let (a, b, c, d) = self.shuffle32_from_imm(imm)?;
    let a = a.checked_sub(4)?;
    let b = b.checked_sub(4)?;
    let in_range = a < 4 && b < 4 && c < 4 && d < 4;
    in_range.then(|| a | (b << 2) | (c << 4) | (d << 6))
}
fn pshuflw_lhs_imm(&mut self, imm: Immediate) -> Option<u8> {
    // `pshuflw` permutes the low four 16-bit lanes of one register and
    // passes the high four through unchanged: lanes 4-7 of the shuffle
    // must be the identity, and lanes 0-3 must stay within the low half.
    let (a, b, c, d, e, f, g, h) = self.shuffle16_from_imm(imm)?;
    if [e, f, g, h] != [4, 5, 6, 7] {
        return None;
    }
    if a >= 4 || b >= 4 || c >= 4 || d >= 4 {
        return None;
    }
    Some(a | (b << 2) | (c << 4) | (d << 6))
}
fn pshuflw_rhs_imm(&mut self, imm: Immediate) -> Option<u8> {
    // Same as `pshuflw_lhs_imm` but for the second shuffle operand, whose
    // lanes are numbered 8-15: rebase everything down by 8 first (bailing
    // out if any selector points at the first vector).
    let (a, b, c, d, e, f, g, h) = self.shuffle16_from_imm(imm)?;
    let low = [
        a.checked_sub(8)?,
        b.checked_sub(8)?,
        c.checked_sub(8)?,
        d.checked_sub(8)?,
    ];
    let high = [
        e.checked_sub(8)?,
        f.checked_sub(8)?,
        g.checked_sub(8)?,
        h.checked_sub(8)?,
    ];
    if high != [4, 5, 6, 7] || low.iter().any(|&lane| lane >= 4) {
        return None;
    }
    let [a, b, c, d] = low;
    Some(a | (b << 2) | (c << 4) | (d << 6))
}
fn pshufhw_lhs_imm(&mut self, imm: Immediate) -> Option<u8> {
    // `pshufhw` permutes the high four 16-bit lanes and leaves the low
    // four untouched: lanes 0-3 must be the identity, and lanes 4-7
    // (rebased by 4) must stay within the high half.
    let (a, b, c, d, e, f, g, h) = self.shuffle16_from_imm(imm)?;
    let e = e.checked_sub(4)?;
    let f = f.checked_sub(4)?;
    let g = g.checked_sub(4)?;
    let h = h.checked_sub(4)?;
    if [a, b, c, d] != [0, 1, 2, 3] {
        return None;
    }
    let in_range = e < 4 && f < 4 && g < 4 && h < 4;
    in_range.then(|| e | (f << 2) | (g << 4) | (h << 6))
}
fn pshufhw_rhs_imm(&mut self, imm: Immediate) -> Option<u8> {
    // Right-operand variant of `pshufhw_lhs_imm`: the second vector's
    // lanes are numbered 8-15, so the preserved low half must be exactly
    // lanes 8-11 (rebased by 8 to 0-3) and the shuffled high half must
    // land in lanes 12-15 (rebased by 12 to 0-3).
    let (a, b, c, d, e, f, g, h) = self.shuffle16_from_imm(imm)?;
    let low = [
        a.checked_sub(8)?,
        b.checked_sub(8)?,
        c.checked_sub(8)?,
        d.checked_sub(8)?,
    ];
    let e = e.checked_sub(12)?;
    let f = f.checked_sub(12)?;
    let g = g.checked_sub(12)?;
    let h = h.checked_sub(12)?;
    if low != [0, 1, 2, 3] {
        return None;
    }
    let in_range = e < 4 && f < 4 && g < 4 && h < 4;
    in_range.then(|| e | (f << 2) | (g << 4) | (h << 6))
}
} }
impl IsleContext<'_, '_, MInst, X64Backend> { impl IsleContext<'_, '_, MInst, X64Backend> {

View File

@@ -585,9 +585,86 @@ macro_rules! isle_lower_prelude_methods {
.collect(); .collect();
self.lower_ctx.gen_return(rets); self.lower_ctx.gen_return(rets);
} }
/// Attempts to interpret the shuffle immediate `imm` as a shuffle of
/// 32-bit lanes, returning four integers, each of which is less than 8,
/// which represents a permutation of 32-bit lanes as specified by
/// `imm`.
///
/// For example the shuffle immediate
///
/// `0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27`
///
/// would return `Some((0, 2, 4, 6))`.
fn shuffle32_from_imm(&mut self, imm: Immediate) -> Option<(u8, u8, u8, u8)> {
    use crate::machinst::isle::shuffle_imm_as_le_lane_idx;
    // Materialize the 16-byte shuffle immediate and try to reinterpret each
    // aligned group of four bytes as a single 32-bit lane selector. Any
    // group that isn't a contiguous, aligned run of byte indices makes the
    // whole interpretation fail via `?`.
    let bytes = self.lower_ctx.get_immediate_data(imm).as_slice();
    Some((
        shuffle_imm_as_le_lane_idx(4, &bytes[0..4])?,
        shuffle_imm_as_le_lane_idx(4, &bytes[4..8])?,
        shuffle_imm_as_le_lane_idx(4, &bytes[8..12])?,
        shuffle_imm_as_le_lane_idx(4, &bytes[12..16])?,
    ))
}
/// Same as `shuffle32_from_imm`, but for 16-bit lane shuffles.
fn shuffle16_from_imm(
    &mut self,
    imm: Immediate,
) -> Option<(u8, u8, u8, u8, u8, u8, u8, u8)> {
    use crate::machinst::isle::shuffle_imm_as_le_lane_idx;
    // Like `shuffle32_from_imm`, but each aligned pair of bytes must select
    // a whole 16-bit lane, yielding eight lane indices (0-15 for in-range
    // shuffle immediates).
    let bytes = self.lower_ctx.get_immediate_data(imm).as_slice();
    Some((
        shuffle_imm_as_le_lane_idx(2, &bytes[0..2])?,
        shuffle_imm_as_le_lane_idx(2, &bytes[2..4])?,
        shuffle_imm_as_le_lane_idx(2, &bytes[4..6])?,
        shuffle_imm_as_le_lane_idx(2, &bytes[6..8])?,
        shuffle_imm_as_le_lane_idx(2, &bytes[8..10])?,
        shuffle_imm_as_le_lane_idx(2, &bytes[10..12])?,
        shuffle_imm_as_le_lane_idx(2, &bytes[12..14])?,
        shuffle_imm_as_le_lane_idx(2, &bytes[14..16])?,
    ))
}
}; };
} }
/// Returns the `size`-byte lane referred to by the shuffle immediate specified
/// in `bytes`.
///
/// This helper is used by `shuffleNN_from_imm` above and is used to interpret a
/// byte-based shuffle as a higher-level shuffle of bigger lanes. This will see
/// if the `bytes` specified, which must have `size` length, specifies a lane in
/// vectors aligned to a `size`-byte boundary.
///
/// Returns `None` if `bytes` doesn't specify a `size`-byte lane aligned
/// appropriately, or returns `Some(n)` where `n` is the index of the lane being
/// shuffled.
/// Returns the `size`-byte lane referred to by the shuffle immediate bytes in
/// `bytes`.
///
/// Used by the `shuffleNN_from_imm` helpers above to reinterpret a byte-level
/// shuffle as a shuffle of wider lanes: `bytes` (which must be exactly `size`
/// bytes long) names a valid `size`-byte lane only when it starts on a
/// `size`-aligned byte index and covers the lane's bytes contiguously in
/// little-endian order.
///
/// Returns `Some(n)` with the lane index when `bytes` matches that pattern,
/// and `None` otherwise.
pub fn shuffle_imm_as_le_lane_idx(size: u8, bytes: &[u8]) -> Option<u8> {
    assert_eq!(bytes.len(), usize::from(size));

    // The lane must begin on a `size`-aligned byte boundary...
    let first = bytes[0];
    if first % size != 0 {
        return None;
    }

    // ...and every subsequent byte must be exactly one past its predecessor,
    // i.e. the selector covers the whole lane in little-endian order.
    if !bytes.windows(2).all(|pair| pair[0] + 1 == pair[1]) {
        return None;
    }

    // Viewed at `size`-byte granularity, the lane index is simply the first
    // byte index scaled down.
    Some(first / size)
}
/// Helpers specifically for machines that use ABICaller. /// Helpers specifically for machines that use ABICaller.
#[macro_export] #[macro_export]
#[doc(hidden)] #[doc(hidden)]

View File

@@ -592,6 +592,16 @@
(decl u64_from_constant (u64) Constant) (decl u64_from_constant (u64) Constant)
(extern extractor u64_from_constant u64_from_constant) (extern extractor u64_from_constant u64_from_constant)
;; Extracts lane indices, represented as u8's, if the immediate for a
;; `shuffle` instruction represents shuffling N-bit values. The u8 values
;; returned will be in the range of 0 to (256/N)-1, inclusive, and index the
;; N-bit chunks of two concatenated 128-bit vectors starting from the
;; least-significant bits.
(decl shuffle32_from_imm (u8 u8 u8 u8) Immediate)
(extern extractor shuffle32_from_imm shuffle32_from_imm)
(decl shuffle16_from_imm (u8 u8 u8 u8 u8 u8 u8 u8) Immediate)
(extern extractor shuffle16_from_imm shuffle16_from_imm)
;;;; Helpers for generating returns ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;; Helpers for generating returns ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Extractor to check for the special case that a `WritableValueRegs` ;; Extractor to check for the special case that a `WritableValueRegs`

View File

@@ -0,0 +1,116 @@
test compile precise-output
set enable_simd
target x86_64 has_avx
function %punpckldq(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [0 1 2 3 16 17 18 19 4 5 6 7 20 21 22 23]
v5 = bitcast.i32x4 little v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vpunpckldq %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vpunpckldq %xmm1, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %punpckhdq(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [8 9 10 11 24 25 26 27 12 13 14 15 28 29 30 31]
v5 = bitcast.i32x4 little v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vpunpckhdq %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vpunpckhdq %xmm1, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %punpcklqdq(i64x2, i64x2) -> i64x2 {
block0(v0: i64x2, v1: i64x2):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23]
v5 = bitcast.i64x2 little v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vpunpcklqdq %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vpunpcklqdq %xmm1, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %punpckhqdq(i64x2, i64x2) -> i64x2 {
block0(v0: i64x2, v1: i64x2):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31]
v5 = bitcast.i64x2 little v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vpunpckhqdq %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vpunpckhqdq %xmm1, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

View File

@@ -52,3 +52,594 @@ block0(v0: i8x16, v1: i8x16):
; popq %rbp ; popq %rbp
; retq ; retq
function %punpcklwd(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [0 1 16 17 2 3 18 19 4 5 20 21 6 7 22 23]
v5 = bitcast.i16x8 little v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; punpcklwd %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; punpcklwd %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %punpckhwd(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [8 9 24 25 10 11 26 27 12 13 28 29 14 15 30 31]
v5 = bitcast.i16x8 little v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; punpckhwd %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; punpckhwd %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %pshufd_0022(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [0 1 2 3 0 1 2 3 8 9 10 11 8 9 10 11]
v5 = bitcast.i32x4 little v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pshufd $160, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pshufd $0xa0, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %pshufd_3120(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [12 13 14 15 4 5 6 7 8 9 10 11 0 1 2 3]
v5 = bitcast.i32x4 little v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pshufd $39, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pshufd $0x27, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %pshufd_7546(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [28 29 30 31 20 21 22 23 16 17 18 19 24 25 26 27]
v5 = bitcast.i32x4 little v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pshufd $135, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pshufd $0x87, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %not_single_pshufd(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23]
v5 = bitcast.i32x4 little v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; shufps $78, %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; shufps $0x4e, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %punpckldq(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [0 1 2 3 16 17 18 19 4 5 6 7 20 21 22 23]
v5 = bitcast.i32x4 little v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; punpckldq %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; punpckldq %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %punpckhdq(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [8 9 10 11 24 25 26 27 12 13 14 15 28 29 30 31]
v5 = bitcast.i32x4 little v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; punpckhdq %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; punpckhdq %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %punpcklqdq(i64x2, i64x2) -> i64x2 {
block0(v0: i64x2, v1: i64x2):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23]
v5 = bitcast.i64x2 little v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; punpcklqdq %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; punpcklqdq %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %punpckhqdq(i64x2, i64x2) -> i64x2 {
block0(v0: i64x2, v1: i64x2):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31]
v5 = bitcast.i64x2 little v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; punpckhqdq %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; punpckhqdq %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %shufps_3277(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [12 13 14 15 8 9 10 11 28 29 30 31 28 29 30 31]
v5 = bitcast.i32x4 little v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; shufps $251, %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; shufps $0xfb, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %shufps_6500(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [24 25 26 27 20 21 22 23 0 1 2 3 0 1 2 3]
v5 = bitcast.i32x4 little v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm0, %xmm4
; movdqa %xmm1, %xmm0
; shufps $6, %xmm0, %xmm4, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movdqa %xmm0, %xmm4
; movdqa %xmm1, %xmm0
; shufps $6, %xmm4, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %pshuflw_3210(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [6 7 4 5 2 3 0 1 8 9 10 11 12 13 14 15]
v5 = bitcast.i16x8 little v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pshuflw $27, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pshuflw $0x1b, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %pshuflw_3131(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [6 7 4 5 6 7 4 5 8 9 10 11 12 13 14 15]
v5 = bitcast.i16x8 little v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pshuflw $187, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pshuflw $0xbb, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %pshuflw_rhs_3210(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [22 23 20 21 18 19 16 17 24 25 26 27 28 29 30 31]
v5 = bitcast.i16x8 little v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pshuflw $27, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pshuflw $0x1b, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %pshuflw_rhs_3131(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [22 23 18 19 22 23 18 19 24 25 26 27 28 29 30 31]
v5 = bitcast.i16x8 little v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pshuflw $119, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pshuflw $0x77, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %pshufhw_3210(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 14 15 12 13 10 11 8 9]
v5 = bitcast.i16x8 little v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pshufhw $27, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pshufhw $0x1b, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %pshufhw_3131(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 14 15 10 11 14 15 10 11]
v5 = bitcast.i16x8 little v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pshufhw $119, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pshufhw $0x77, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %pshufhw_rhs_3210(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [16 17 18 19 20 21 22 23 30 31 28 29 26 27 24 25]
v5 = bitcast.i16x8 little v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pshufhw $27, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pshufhw $0x1b, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %pshufhw_rhs_3131(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [16 17 18 19 20 21 22 23 30 31 26 27 30 31 26 27]
v5 = bitcast.i16x8 little v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pshufhw $119, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pshufhw $0x77, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %shuffle_all_zeros(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
v2 = shuffle v0, v1, [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pxor %xmm3, %xmm3, %xmm3
; pshufb %xmm0, %xmm3, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pxor %xmm3, %xmm3
; pshufb %xmm3, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

View File

@@ -1,9 +1,10 @@
test interpret ;; FIXME(#5915)
test run
target aarch64
target s390x
set enable_simd
target x86_64 has_sse3 has_ssse3 has_sse41
target x86_64 has_sse3 has_ssse3 has_sse41 has_avx
target x86_64 has_sse3 has_ssse3 has_sse41 has_avx512vl has_avx512vbmi
function %shuffle_i8x16(i8x16, i8x16) -> i8x16 {
@@ -26,3 +27,234 @@ block0(v0: i8x16):
    return v1
}
; run: %shuffle1([0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]) == [8 9 10 11 12 13 14 15 0 1 2 3 4 5 6 7]
function %punpcklbw(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
v2 = shuffle v0, v1, [0 16 1 17 2 18 3 19 4 20 5 21 6 22 7 23]
return v2
}
; run: %punpcklbw([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [1 17 2 18 3 19 4 20 5 21 6 22 7 23 8 24]
function %punpckhbw(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
v2 = shuffle v0, v1, [8 24 9 25 10 26 11 27 12 28 13 29 14 30 15 31]
return v2
}
; run: %punpckhbw([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [9 25 10 26 11 27 12 28 13 29 14 30 15 31 16 32]
function %punpcklwd(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [0 1 16 17 2 3 18 19 4 5 20 21 6 7 22 23]
v5 = bitcast.i16x8 little v4
return v5
}
; run: %punpcklwd([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [1 9 2 10 3 11 4 12]
function %punpckhwd(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [8 9 24 25 10 11 26 27 12 13 28 29 14 15 30 31]
v5 = bitcast.i16x8 little v4
return v5
}
; run: %punpckhwd([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [5 13 6 14 7 15 8 16]
function %pshufd_0022(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [0 1 2 3 0 1 2 3 8 9 10 11 8 9 10 11]
v5 = bitcast.i32x4 little v4
return v5
}
; run: %pshufd_0022([1 2 3 4], [5 6 7 8]) == [1 1 3 3]
function %pshufd_3120(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [12 13 14 15 4 5 6 7 8 9 10 11 0 1 2 3]
v5 = bitcast.i32x4 little v4
return v5
}
; run: %pshufd_3120([1 2 3 4], [5 6 7 8]) == [4 2 3 1]
function %pshufd_7546(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [28 29 30 31 20 21 22 23 16 17 18 19 24 25 26 27]
v5 = bitcast.i32x4 little v4
return v5
}
; run: %pshufd_7546([1 2 3 4], [5 6 7 8]) == [8 6 5 7]
function %not_pshufd(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23]
v5 = bitcast.i32x4 little v4
return v5
}
; run: %not_pshufd([1 2 3 4], [5 6 7 8]) == [3 4 5 6]
function %punpckldq(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [0 1 2 3 16 17 18 19 4 5 6 7 20 21 22 23]
v5 = bitcast.i32x4 little v4
return v5
}
; run: %punpckldq([1 2 3 4], [5 6 7 8]) == [1 5 2 6]
function %punpckhdq(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [8 9 10 11 24 25 26 27 12 13 14 15 28 29 30 31]
v5 = bitcast.i32x4 little v4
return v5
}
; run: %punpckhdq([1 2 3 4], [5 6 7 8]) == [3 7 4 8]
function %punpcklqdq(i64x2, i64x2) -> i64x2 {
block0(v0: i64x2, v1: i64x2):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23]
v5 = bitcast.i64x2 little v4
return v5
}
; run: %punpcklqdq([1 2], [5 6]) == [1 5]
function %punpckhqdq(i64x2, i64x2) -> i64x2 {
block0(v0: i64x2, v1: i64x2):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31]
v5 = bitcast.i64x2 little v4
return v5
}
; run: %punpckhqdq([1 2], [5 6]) == [2 6]
function %shufps_0145(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23]
v5 = bitcast.i32x4 little v4
return v5
}
; run: %shufps_0145([1 2 3 4], [5 6 7 8]) == [1 2 5 6]
function %shufps_3277(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [12 13 14 15 8 9 10 11 28 29 30 31 28 29 30 31]
v5 = bitcast.i32x4 little v4
return v5
}
; run: %shufps_3277([1 2 3 4], [5 6 7 8]) == [4 3 8 8]
function %shufps_6500(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [24 25 26 27 20 21 22 23 0 1 2 3 0 1 2 3]
v5 = bitcast.i32x4 little v4
return v5
}
; run: %shufps_6500([1 2 3 4], [5 6 7 8]) == [7 6 1 1]
function %pshuflw_3210(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [6 7 4 5 2 3 0 1 8 9 10 11 12 13 14 15]
v5 = bitcast.i16x8 little v4
return v5
}
; run: %pshuflw_3210([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [4 3 2 1 5 6 7 8]
function %pshuflw_3131(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [6 7 4 5 6 7 4 5 8 9 10 11 12 13 14 15]
v5 = bitcast.i16x8 little v4
return v5
}
; run: %pshuflw_3131([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [4 3 4 3 5 6 7 8]
function %pshuflw_rhs_3210(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [22 23 20 21 18 19 16 17 24 25 26 27 28 29 30 31]
v5 = bitcast.i16x8 little v4
return v5
}
; run: %pshuflw_rhs_3210([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [12 11 10 9 13 14 15 16]
function %pshuflw_rhs_3131(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [22 23 18 19 22 23 18 19 24 25 26 27 28 29 30 31]
v5 = bitcast.i16x8 little v4
return v5
}
; run: %pshuflw_rhs_3131([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [12 10 12 10 13 14 15 16]
function %pshufhw_3210(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 14 15 12 13 10 11 8 9]
v5 = bitcast.i16x8 little v4
return v5
}
; run: %pshufhw_3210([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [1 2 3 4 8 7 6 5]
function %pshufhw_3131(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 14 15 10 11 14 15 10 11]
v5 = bitcast.i16x8 little v4
return v5
}
; run: %pshufhw_3131([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [1 2 3 4 8 6 8 6]
function %pshufhw_rhs_3210(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [16 17 18 19 20 21 22 23 30 31 28 29 26 27 24 25]
v5 = bitcast.i16x8 little v4
return v5
}
; run: %pshufhw_rhs_3210([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [9 10 11 12 16 15 14 13]
function %pshufhw_rhs_3131(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [16 17 18 19 20 21 22 23 30 31 26 27 30 31 26 27]
v5 = bitcast.i16x8 little v4
return v5
}
; run: %pshufhw_rhs_3131([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [9 10 11 12 16 14 16 14]
function %shuffle_all_zeros(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
v2 = shuffle v0, v1, [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
return v2
}
; run: %shuffle_all_zeros([5 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1], [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]) == [5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5]