cranelift: port sshr to ISLE on x64 (#3681)

Nick Fitzgerald
2022-01-12 07:13:58 -08:00
committed by GitHub
parent 1ef0abb12c
commit 7454f1f3af
13 changed files with 1003 additions and 563 deletions

View File

@@ -1,4 +1,4 @@
src/clif.isle f176ef3bba99365
src/prelude.isle babc931e5dc5b4cf
src/prelude.isle d95510fad2e2473c
src/isa/aarch64/inst.isle 5fa80451697b084f
src/isa/aarch64/lower.isle 2d2e1e076a0c8a23

View File

@@ -20,6 +20,8 @@ pub trait Context {
fn pack_value_array_2(&mut self, arg0: Value, arg1: Value) -> ValueArray2;
fn unpack_value_array_3(&mut self, arg0: &ValueArray3) -> (Value, Value, Value);
fn pack_value_array_3(&mut self, arg0: Value, arg1: Value, arg2: Value) -> ValueArray3;
fn u32_add(&mut self, arg0: u32, arg1: u32) -> u32;
fn u8_and(&mut self, arg0: u8, arg1: u8) -> u8;
fn value_reg(&mut self, arg0: Reg) -> ValueRegs;
fn value_regs(&mut self, arg0: Reg, arg1: Reg) -> ValueRegs;
fn temp_writable_reg(&mut self, arg0: Type) -> WritableReg;
@@ -32,6 +34,7 @@ pub trait Context {
fn u32_as_u64(&mut self, arg0: u32) -> u64;
fn ty_bits(&mut self, arg0: Type) -> u8;
fn ty_bits_u16(&mut self, arg0: Type) -> u16;
fn lane_type(&mut self, arg0: Type) -> Type;
fn fits_in_16(&mut self, arg0: Type) -> Option<Type>;
fn fits_in_32(&mut self, arg0: Type) -> Option<Type>;
fn fits_in_64(&mut self, arg0: Type) -> Option<Type>;
@@ -52,7 +55,6 @@ pub trait Context {
fn first_result(&mut self, arg0: Inst) -> Option<Value>;
fn inst_data(&mut self, arg0: Inst) -> InstructionData;
fn value_type(&mut self, arg0: Value) -> Type;
fn ty_bits_mask(&mut self, arg0: Type) -> u64;
fn multi_lane(&mut self, arg0: Type) -> Option<(u8, u16)>;
fn def_inst(&mut self, arg0: Value) -> Option<Inst>;
fn trap_code_division_by_zero(&mut self) -> TrapCode;
@@ -89,13 +91,13 @@ pub trait Context {
fn rotr_opposite_amount(&mut self, arg0: Type, arg1: ImmShift) -> ImmShift;
}
/// Internal type ProducesFlags: defined at src/prelude.isle line 263.
/// Internal type ProducesFlags: defined at src/prelude.isle line 273.
#[derive(Clone, Debug)]
pub enum ProducesFlags {
ProducesFlags { inst: MInst, result: Reg },
}
/// Internal type ConsumesFlags: defined at src/prelude.isle line 266.
/// Internal type ConsumesFlags: defined at src/prelude.isle line 276.
#[derive(Clone, Debug)]
pub enum ConsumesFlags {
ConsumesFlags { inst: MInst, result: Reg },
@@ -975,7 +977,7 @@ pub enum AtomicRMWOp {
// Generated as internal constructor for term temp_reg.
pub fn constructor_temp_reg<C: Context>(ctx: &mut C, arg0: Type) -> Option<Reg> {
let pattern0_0 = arg0;
// Rule at src/prelude.isle line 60.
// Rule at src/prelude.isle line 66.
let expr0_0 = C::temp_writable_reg(ctx, pattern0_0);
let expr1_0 = C::writable_reg_to_reg(ctx, expr0_0);
return Some(expr1_0);
@@ -984,7 +986,7 @@ pub fn constructor_temp_reg<C: Context>(ctx: &mut C, arg0: Type) -> Option<Reg>
// Generated as internal constructor for term lo_reg.
pub fn constructor_lo_reg<C: Context>(ctx: &mut C, arg0: Value) -> Option<Reg> {
let pattern0_0 = arg0;
// Rule at src/prelude.isle line 95.
// Rule at src/prelude.isle line 101.
let expr0_0 = C::put_in_regs(ctx, pattern0_0);
let expr1_0: usize = 0;
let expr2_0 = C::value_regs_get(ctx, expr0_0, expr1_0);
@@ -1009,7 +1011,7 @@ pub fn constructor_with_flags<C: Context>(
result: pattern3_1,
} = pattern2_0
{
// Rule at src/prelude.isle line 276.
// Rule at src/prelude.isle line 286.
let expr0_0 = C::emit(ctx, &pattern1_0);
let expr1_0 = C::emit(ctx, &pattern3_0);
let expr2_0 = C::value_regs(ctx, pattern1_1, pattern3_1);
@@ -1037,7 +1039,7 @@ pub fn constructor_with_flags_1<C: Context>(
result: pattern3_1,
} = pattern2_0
{
// Rule at src/prelude.isle line 284.
// Rule at src/prelude.isle line 294.
let expr0_0 = C::emit(ctx, &pattern1_0);
let expr1_0 = C::emit(ctx, &pattern3_0);
return Some(pattern3_1);
@@ -1071,7 +1073,7 @@ pub fn constructor_with_flags_2<C: Context>(
result: pattern5_1,
} = pattern4_0
{
// Rule at src/prelude.isle line 294.
// Rule at src/prelude.isle line 304.
let expr0_0 = C::emit(ctx, &pattern1_0);
let expr1_0 = C::emit(ctx, &pattern5_0);
let expr2_0 = C::emit(ctx, &pattern3_0);

View File

@@ -35,6 +35,7 @@
(dst WritableReg)
(imm u8)
(size OperandSize))
(XmmUninitializedValue (dst WritableReg))
(CmpRmiR (size OperandSize)
(opcode CmpOpcode)
(src RegMemImm)
@@ -292,6 +293,15 @@
(Mem (addr SyntheticAmode))
(Imm (simm32 u32))))
;; Put the given clif value into a `RegMemImm` operand.
;;
;; Asserts that the value fits into a single register, and doesn't require
;; multiple registers for its representation (like `i128` for example).
;;
;; As a side effect, this marks the value as used.
(decl put_in_reg_mem_imm (Value) RegMemImm)
(extern constructor put_in_reg_mem_imm put_in_reg_mem_imm)
(type RegMem extern
(enum
(Reg (reg Reg))
@@ -319,6 +329,18 @@
(enum (Imm8 (imm u8))
(Reg (reg Reg))))
;; Put the given clif value into an `Imm8Reg` operand, masked to the bit width of
;; the given type.
;;
;; Asserts that the value fits into a single register, and doesn't require
;; multiple registers for its representation (like `i128` for example).
;;
;; As a side effect, this marks the value as used.
;;
;; This is used when lowering various shifts and rotates.
(decl put_masked_in_imm8_reg (Value Type) Imm8Reg)
(extern constructor put_masked_in_imm8_reg put_masked_in_imm8_reg)
(type CC extern
(enum O
NO
@@ -383,9 +405,12 @@
(decl imm8_from_value (Imm8Reg) Value)
(extern extractor imm8_from_value imm8_from_value)
;; Mask an `Imm8Reg.Imm8`.
(decl mask_imm8_const (Imm8Reg u64) Imm8Reg)
(extern constructor mask_imm8_const mask_imm8_const)
;; Mask a constant to the bit-width of the given type and package it into an
;; `Imm8Reg.Imm8`. This is used for shifts and rotates, so that we don't try to
;; shift/rotate more bits than the type has available, per Cranelift's
;; semantics.
(decl const_to_type_masked_imm8 (u64 Type) Imm8Reg)
(extern constructor const_to_type_masked_imm8 const_to_type_masked_imm8)
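Both `put_masked_in_imm8_reg` and `const_to_type_masked_imm8` rely on the same masking computation, shown in the Rust implementations later in this diff. A minimal standalone sketch (the free-function form and the name `masked_imm8` are illustrative, not part of the backend):

```rust
/// Mirrors the masking done by `const_to_type_masked_imm8`: build an
/// all-ones mask as wide as the type, apply it to the constant, then
/// truncate to the 8-bit immediate field of `Imm8Reg.Imm8`.
fn masked_imm8(c: u64, ty_bits: u32) -> u8 {
    // `checked_shl` returns `None` when `ty_bits` is 64, in which case the
    // mask is all ones.
    let mask = 1_u64.checked_shl(ty_bits).map_or(u64::MAX, |x| x - 1);
    (c & mask) as u8
}

// For example, masked_imm8(0x1_23, 8) == 0x23.
```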
;; Extract a constant `RegMemImm.Imm` from a value operand.
(decl simm32_from_value (RegMemImm) Value)
@@ -494,6 +519,37 @@
wr))))
r))
;; Helper for creating an SSE register holding an `i64x2` from two `i64` values.
(decl make_i64x2_from_lanes (RegMem RegMem) Reg)
(rule (make_i64x2_from_lanes lo hi)
(let ((dst_w WritableReg (temp_writable_reg $I64X2))
(dst_r Reg (writable_reg_to_reg dst_w))
(_0 Unit (emit (MInst.XmmUninitializedValue dst_w)))
(_1 Unit (emit (MInst.XmmRmRImm (SseOpcode.Pinsrd)
dst_r
lo
dst_w
0
(OperandSize.Size64))))
(_2 Unit (emit (MInst.XmmRmRImm (SseOpcode.Pinsrd)
dst_r
hi
dst_w
1
(OperandSize.Size64)))))
dst_r))
;; Move a `RegMemImm.Reg` operand to an XMM register, if necessary.
(decl reg_mem_imm_to_xmm (RegMemImm) RegMemImm)
(rule (reg_mem_imm_to_xmm rmi @ (RegMemImm.Mem _)) rmi)
(rule (reg_mem_imm_to_xmm rmi @ (RegMemImm.Imm _)) rmi)
(rule (reg_mem_imm_to_xmm (RegMemImm.Reg r))
(RegMemImm.Reg (gpr_to_xmm $I8X16
(SseOpcode.Movd)
(RegMem.Reg r)
(OperandSize.Size32))))
;;;; Instruction Constructors ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
;; These constructors create SSA-style `MInst`s. It is their responsibility to
@@ -1058,6 +1114,21 @@
(rule (pminud src1 src2)
(xmm_rm_r $I8X16 (SseOpcode.Pminud) src1 src2))
;; Helper for creating `punpcklbw` instructions.
(decl punpcklbw (Reg RegMem) Reg)
(rule (punpcklbw src1 src2)
(xmm_rm_r $I8X16 (SseOpcode.Punpcklbw) src1 src2))
;; Helper for creating `punpckhbw` instructions.
(decl punpckhbw (Reg RegMem) Reg)
(rule (punpckhbw src1 src2)
(xmm_rm_r $I8X16 (SseOpcode.Punpckhbw) src1 src2))
;; Helper for creating `packsswb` instructions.
(decl packsswb (Reg RegMem) Reg)
(rule (packsswb src1 src2)
(xmm_rm_r $I8X16 (SseOpcode.Packsswb) src1 src2))
;; Helper for creating `MInst.XmmRmRImm` instructions.
(decl xmm_rm_r_imm (SseOpcode Reg RegMem u8 OperandSize) Reg)
(rule (xmm_rm_r_imm op src1 src2 imm size)
@@ -1180,6 +1251,16 @@
(rule (psrlq src1 src2)
(xmm_rmi_reg (SseOpcode.Psrlq) src1 src2))
;; Helper for creating `psraw` instructions.
(decl psraw (Reg RegMemImm) Reg)
(rule (psraw src1 src2)
(xmm_rmi_reg (SseOpcode.Psraw) src1 src2))
;; Helper for creating `psrad` instructions.
(decl psrad (Reg RegMemImm) Reg)
(rule (psrad src1 src2)
(xmm_rmi_reg (SseOpcode.Psrad) src1 src2))
;; Helper for creating `MInst.MulHi` instructions.
;;
;; Returns the (lo, hi) register halves of the multiplication.
@@ -1252,6 +1333,19 @@
(rule (insertps src1 src2 lane)
(xmm_rm_r_imm (SseOpcode.Insertps) src1 src2 lane (OperandSize.Size32)))
;; Helper for creating `pextrd` instructions.
(decl pextrd (Type Reg u8) Reg)
(rule (pextrd ty src lane)
(let ((w_dst WritableReg (temp_writable_reg ty))
(r_dst Reg (writable_reg_to_reg w_dst))
(_ Unit (emit (MInst.XmmRmRImm (SseOpcode.Pextrd)
r_dst
(RegMem.Reg src)
w_dst
lane
(operand_size_of_type_32_64 (lane_type ty))))))
r_dst))
;; Helper for creating `not` instructions.
(decl not (Type Reg) Reg)
(rule (not ty src)

View File

@@ -537,13 +537,7 @@
;; `i64` and smaller.
(rule (lower (has_type (fits_in_64 ty) (ishl src amt)))
;; NB: Only the low bits of `amt` matter since we logically mask the shift
;; amount to the value's bit width.
(let ((amt_ Reg (lo_reg amt)))
(value_reg (shl ty (put_in_reg src) (Imm8Reg.Reg amt_)))))
(rule (lower (has_type (fits_in_64 ty) (ishl src (imm8_from_value amt))))
(value_reg (shl ty (put_in_reg src) amt)))
(value_reg (shl ty (put_in_reg src) (put_masked_in_imm8_reg amt ty))))
;; `i128`.
@@ -582,15 +576,8 @@
;; `i64` and smaller.
(rule (lower (has_type (fits_in_64 ty) (ushr src amt)))
(let ((src_ Reg (extend_to_reg src ty (ExtendKind.Zero)))
;; NB: Only the low bits of `amt` matter since we logically mask the
;; shift amount to the value's bit width.
(amt_ Reg (lo_reg amt)))
(value_reg (shr ty src_ (Imm8Reg.Reg amt_)))))
(rule (lower (has_type (fits_in_64 ty) (ushr src (imm8_from_value amt))))
(let ((src_ Reg (extend_to_reg src ty (ExtendKind.Zero))))
(value_reg (shr ty src_ amt))))
(value_reg (shr ty src_ (put_masked_in_imm8_reg amt ty)))))
;; `i128`.
@@ -623,6 +610,109 @@
(let ((amt_ Reg (lo_reg amt)))
(shr_i128 (put_in_regs src) amt_)))
;;;; Rules for `sshr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; `i64` and smaller.
(rule (lower (has_type (fits_in_64 ty) (sshr src amt)))
(let ((src_ Reg (extend_to_reg src ty (ExtendKind.Sign))))
(value_reg (sar ty src_ (put_masked_in_imm8_reg amt ty)))))
;; `i128`.
(decl sar_i128 (ValueRegs Reg) ValueRegs)
(rule (sar_i128 src amt)
;; Unpack the low/high halves of `src`.
(let ((src_lo Reg (value_regs_get src 0))
(src_hi Reg (value_regs_get src 1))
;; Do a shift of each half. NB: the low half uses an unsigned shift
;; because its MSB is not a sign bit.
(lo_shifted Reg (shr $I64 src_lo (Imm8Reg.Reg amt)))
(hi_shifted Reg (sar $I64 src_hi (Imm8Reg.Reg amt)))
;; `src_hi << (64 - amt)` are the bits to carry over from the low
;; half to the high half.
(carry Reg (shl $I64 src_hi (Imm8Reg.Reg (sub $I64 (imm $I64 64) (RegMemImm.Reg amt)))))
;; Nullify the carry if we are shifting by a multiple of 128.
(carry_ Reg (with_flags_1 (test (OperandSize.Size64) (RegMemImm.Imm 127) amt)
(cmove $I64 (CC.Z) (RegMem.Reg (imm $I64 0)) carry)))
;; Add the carry into the low half.
(lo_shifted_ Reg (or $I64 lo_shifted (RegMemImm.Reg carry_)))
;; Get all sign bits.
(sign_bits Reg (sar $I64 src_hi (Imm8Reg.Imm8 63))))
;; Combine the two shifted halves. However, if we are shifting by >= 64
;; (modulo 128), then the hi bits are all sign bits and the lo bits are
;; what would otherwise be our hi bits.
(with_flags_2 (test (OperandSize.Size64) (RegMemImm.Imm 64) amt)
(cmove $I64 (CC.Z) (RegMem.Reg lo_shifted_) hi_shifted)
(cmove $I64 (CC.Z) (RegMem.Reg hi_shifted) sign_bits))))
(rule (lower (has_type $I128 (sshr src amt)))
;; NB: Only the low bits of `amt` matter since we logically mask the shift
;; amount to the value's bit width.
(let ((amt_ Reg (lo_reg amt)))
(sar_i128 (put_in_regs src) amt_)))
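To make the carry and select logic above easier to follow, here is a scalar Rust model of the `sar_i128` helper (the name `sar_i128_model` is illustrative only; `& 63` reproduces x86's masking of 64-bit shift counts, and the final branch plays the role of the `test`/`cmove` pair):

```rust
fn sar_i128_model(src: i128, amt: u64) -> i128 {
    let lo = src as u64;
    let hi = (src >> 64) as u64;
    // shrq: logical shift of the low half, whose MSB is not a sign bit.
    let lo_shifted = lo >> (amt & 63);
    // sarq: arithmetic shift of the high half.
    let hi_shifted = ((hi as i64) >> (amt & 63)) as u64;
    // Bits carried from the high half into the low half: `hi << (64 - amt)`.
    let mut carry = hi << (64u64.wrapping_sub(amt) & 63);
    if (amt & 127) == 0 {
        // Shifting by a multiple of 128 carries nothing (test $127 / cmovz).
        carry = 0;
    }
    let lo_shifted = lo_shifted | carry;
    // All sign bits of the original high half.
    let sign_bits = ((hi as i64) >> 63) as u64;
    // Shifts of 64..=127 move the shifted high half into the low half and
    // fill the high half with sign bits (test $64 / cmovz pair).
    let (res_lo, res_hi) = if (amt & 64) == 0 {
        (lo_shifted, hi_shifted)
    } else {
        (hi_shifted, sign_bits)
    };
    (((res_hi as u128) << 64) | res_lo as u128) as i128
}
```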
;; SSE.
;; Since the x86 instruction set does not have an 8x16 shift instruction and the
;; approach used for `ishl` and `ushr` cannot be easily used (the masks do not
;; preserve the sign), we use a different approach here: separate the low and
;; high lanes, shift them separately, and merge them into the final result.
;;
;; Visually, this looks like the following, where `src.i8x16 = [s0, s1, ...,
;; s15]`:
;;
;; lo.i16x8 = [(s0, s0), (s1, s1), ..., (s7, s7)]
;; shifted_lo.i16x8 = shift each lane of `lo`
;; hi.i16x8 = [(s8, s8), (s9, s9), ..., (s15, s15)]
;; shifted_hi.i16x8 = shift each lane of `hi`
;; result = [s0'', s1'', ..., s15'']
(rule (lower (has_type $I8X16 (sshr src amt @ (value_type amt_ty))))
(let ((src_ Reg (put_in_reg src))
;; In order for `packsswb` later to only use the high byte of each
;; 16x8 lane, we shift right an extra 8 bits, relying on `psraw` to
;; fill in the upper bits appropriately.
(lo Reg (punpcklbw src_ (RegMem.Reg src_)))
(hi Reg (punpckhbw src_ (RegMem.Reg src_)))
(amt_ RegMemImm (sshr_i8x16_bigger_shift amt_ty (put_in_reg_mem_imm amt)))
(shifted_lo Reg (psraw lo amt_))
(shifted_hi Reg (psraw hi amt_)))
(value_reg (packsswb shifted_lo (RegMem.Reg shifted_hi)))))
(decl sshr_i8x16_bigger_shift (Type RegMemImm) RegMemImm)
(rule (sshr_i8x16_bigger_shift _ty (RegMemImm.Imm i))
(RegMemImm.Imm (u32_add i 8)))
(rule (sshr_i8x16_bigger_shift ty (RegMemImm.Reg r))
(reg_mem_imm_to_xmm (RegMemImm.Reg (add ty r (RegMemImm.Imm 8)))))
(rule (sshr_i8x16_bigger_shift ty rmi @ (RegMemImm.Mem _m))
(reg_mem_imm_to_xmm (RegMemImm.Reg (add ty (imm ty 8) rmi))))
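A scalar Rust model of this lane-splitting scheme (illustrative only; the name `sshr_i8x16_model` is hypothetical, the shift amount is assumed to already be reduced to 0..=7, and `punpck{l,h}bw`/`psraw`/`packsswb` are modeled one lane at a time):

```rust
fn sshr_i8x16_model(src: [i8; 16], amt: u32) -> [i8; 16] {
    let mut result = [0i8; 16];
    for (i, &lane) in src.iter().enumerate() {
        // punpck{l,h}bw duplicates each byte into both halves of a 16-bit
        // lane, so the high byte already carries the sign information.
        let widened = ((lane as u8 as u16) << 8) | (lane as u8 as u16);
        // psraw by `amt + 8` leaves the arithmetically shifted byte in the
        // low 8 bits of the lane.
        let shifted = (widened as i16) >> (amt + 8);
        // packsswb saturates to i8, but the value already fits, so this is
        // exactly the per-lane arithmetic shift.
        result[i] = shifted as i8;
    }
    result
}
```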
;; `sshr.{i16x8,i32x4}` can be a simple `psra{w,d}`; we just have to make sure
;; that if the shift amount is in a register, it is in an XMM register.
(rule (lower (has_type $I16X8 (sshr src amt)))
(value_reg (psraw (put_in_reg src)
(reg_mem_imm_to_xmm (put_in_reg_mem_imm amt)))))
(rule (lower (has_type $I32X4 (sshr src amt)))
(value_reg (psrad (put_in_reg src)
(reg_mem_imm_to_xmm (put_in_reg_mem_imm amt)))))
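For comparison, a per-lane model of the `i16x8` case (illustrative; `psraw` applies one count to every lane, and counts of 16 or more fill each lane with its sign bit, which `min(15)` reproduces):

```rust
fn sshr_i16x8_model(src: [i16; 8], amt: u32) -> [i16; 8] {
    let amt = amt.min(15);
    src.map(|lane| lane >> amt)
}
```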
;; The `sshr.i64x2` CLIF instruction has no single-instruction x86 lowering in older
;; feature sets. Newer ones like AVX512VL + AVX512F include `vpsraq`, a 128-bit
;; instruction that would fit here, but this backend does not currently have
;; support for EVEX encodings. To remedy this, we extract each 64-bit lane to a
;; GPR, shift each using a scalar instruction, and insert the shifted values
;; back in the `dst` XMM register.
;;
;; (TODO: when EVEX support is available, add an alternate lowering here).
(rule (lower (has_type $I64X2 (sshr src amt)))
(let ((src_ Reg (put_in_reg src))
(lo Reg (pextrd $I64 src_ 0))
(hi Reg (pextrd $I64 src_ 1))
(amt_ Imm8Reg (put_masked_in_imm8_reg amt $I64))
(shifted_lo Reg (sar $I64 lo amt_))
(shifted_hi Reg (sar $I64 hi amt_)))
(value_reg (make_i64x2_from_lanes (RegMem.Reg shifted_lo)
(RegMem.Reg shifted_hi)))))
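A scalar model of the `i64x2` lowering above (illustrative; each lane becomes an ordinary 64-bit arithmetic shift, and the scalar `sar` masks the count to six bits, so the shift is effectively modulo 64):

```rust
fn sshr_i64x2_model(src: [i64; 2], amt: u32) -> [i64; 2] {
    let amt = amt & 63;
    src.map(|lane| lane >> amt)
}
```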
;;;; Rules for `rotl` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; `i16` and `i8`: we need to extend the shift amount, or mask the
@@ -632,8 +722,11 @@
(let ((amt_ Reg (extend_to_reg amt $I32 (ExtendKind.Zero))))
(value_reg (m_rotl ty (put_in_reg src) (Imm8Reg.Reg amt_)))))
(rule (lower (has_type (ty_8_or_16 ty) (rotl src (imm8_from_value amt))))
(value_reg (m_rotl ty (put_in_reg src) (mask_imm8_const amt (ty_bits_mask ty)))))
(rule (lower (has_type (ty_8_or_16 ty)
(rotl src (u64_from_iconst amt))))
(value_reg (m_rotl ty
(put_in_reg src)
(const_to_type_masked_imm8 amt ty))))
;; `i64` and `i32`: we can rely on x86's rotate-amount masking since
;; we operate on the whole register.
@@ -644,8 +737,11 @@
(let ((amt_ Reg (lo_reg amt)))
(value_reg (m_rotl ty (put_in_reg src) (Imm8Reg.Reg amt_)))))
(rule (lower (has_type (ty_32_or_64 ty) (rotl src (imm8_from_value amt))))
(value_reg (m_rotl ty (put_in_reg src) amt)))
(rule (lower (has_type (ty_32_or_64 ty)
(rotl src (u64_from_iconst amt))))
(value_reg (m_rotl ty
(put_in_reg src)
(const_to_type_masked_imm8 amt ty))))
;; `i128`.

View File

@@ -1538,13 +1538,18 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
| Opcode::Umin
| Opcode::Bnot
| Opcode::Bitselect
| Opcode::Vselect => implemented_in_isle(ctx),
| Opcode::Vselect
| Opcode::Sshr => implemented_in_isle(ctx),
Opcode::Ishl | Opcode::Ushr | Opcode::Sshr | Opcode::Rotl | Opcode::Rotr => {
Opcode::Ishl | Opcode::Ushr | Opcode::Rotl | Opcode::Rotr => {
let dst_ty = ctx.output_ty(insn, 0);
debug_assert_eq!(ctx.input_ty(insn, 0), dst_ty);
if !dst_ty.is_vector() && dst_ty.bits() <= 64 {
if op != Opcode::Rotr {
implemented_in_isle(ctx);
}
// Scalar shifts on x86 have various encodings:
// - shift by one bit, e.g. `SAL r/m8, 1` (not used here)
// - shift by an immediate amount, e.g. `SAL r/m8, imm8`
@@ -1557,10 +1562,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
OperandSize::Size32,
extend_input_to_reg(ctx, inputs[0], ExtSpec::ZeroExtendTo32),
),
Opcode::Sshr => (
OperandSize::Size32,
extend_input_to_reg(ctx, inputs[0], ExtSpec::SignExtendTo32),
),
Opcode::Rotl | Opcode::Rotr => (
OperandSize::from_ty(dst_ty),
put_input_in_reg(ctx, inputs[0]),
@@ -1590,7 +1591,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
let shift_kind = match op {
Opcode::Ishl => ShiftKind::ShiftLeft,
Opcode::Ushr => ShiftKind::ShiftRightLogical,
Opcode::Sshr => ShiftKind::ShiftRightArithmetic,
Opcode::Rotl => ShiftKind::RotateLeft,
Opcode::Rotr => ShiftKind::RotateRight,
_ => unreachable!(),
@@ -1608,50 +1608,8 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
let dst = get_output_reg(ctx, outputs[0]);
match op {
Opcode::Ishl => {
emit_shl_i128(ctx, src, dst, amt_src);
}
Opcode::Ushr => {
emit_shr_i128(ctx, src, dst, amt_src, /* is_signed = */ false);
}
Opcode::Sshr => {
emit_shr_i128(ctx, src, dst, amt_src, /* is_signed = */ true);
}
Opcode::Rotl => {
// (mov tmp, src)
// (shl.i128 tmp, amt)
// (mov dst, src)
// (ushr.i128 dst, 128-amt)
// (or dst, tmp)
let tmp = ctx.alloc_tmp(types::I128);
emit_shl_i128(ctx, src, tmp, amt_src);
let inv_amt = ctx.alloc_tmp(types::I64).only_reg().unwrap();
ctx.emit(Inst::imm(OperandSize::Size64, 128, inv_amt));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Sub,
RegMemImm::reg(amt_src),
inv_amt,
));
emit_shr_i128(
ctx,
src,
dst,
inv_amt.to_reg(),
/* is_signed = */ false,
);
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Or,
RegMemImm::reg(tmp.regs()[0].to_reg()),
dst.regs()[0],
));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Or,
RegMemImm::reg(tmp.regs()[1].to_reg()),
dst.regs()[1],
));
Opcode::Ishl | Opcode::Ushr | Opcode::Rotl => {
implemented_in_isle(ctx);
}
Opcode::Rotr => {
// (mov tmp, src)
@@ -1808,127 +1766,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
_ => SseOpcode::Pand,
};
ctx.emit(Inst::xmm_rm_r(sse_op, RegMem::from(mask_value), dst));
} else if dst_ty == types::I8X16 && op == Opcode::Sshr {
// Since the x86 instruction set does not have an 8x16 shift instruction and the approach used for
// `ishl` and `ushr` cannot be easily used (the masks do not preserve the sign), we use a different
// approach here: separate the low and high lanes, shift them separately, and merge them into the final
// result. Visually, this looks like the following, where `src.i8x16 = [s0, s1, ..., s15]:
// low.i16x8 = [(s0, s0), (s1, s1), ..., (s7, s7)]
// shifted_low.i16x8 = shift each lane of `low`
// high.i16x8 = [(s8, s8), (s9, s9), ..., (s15, s15)]
// shifted_high.i16x8 = shift each lane of `high`
// dst.i8x16 = [s0'', s1'', ..., s15'']
let src = put_input_in_reg(ctx, inputs[0]);
let shift_by = input_to_reg_mem_imm(ctx, inputs[1]);
let shift_by_ty = ctx.input_ty(insn, 1);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
// In order for PACKSSWB later to only use the high byte of each 16x8 lane, we shift right an extra 8
// bits, relying on PSRAW to fill in the upper bits appropriately.
let bigger_shift_by = match shift_by {
// When we know the shift amount at compile time, we add the extra shift amount statically.
RegMemImm::Imm { simm32 } => RegMemImm::imm(simm32 + 8),
// Otherwise we add instructions to add the extra shift amount and move the value into an XMM
// register.
RegMemImm::Reg { reg } => {
let bigger_shift_by_gpr = ctx.alloc_tmp(shift_by_ty).only_reg().unwrap();
ctx.emit(Inst::mov_r_r(OperandSize::Size64, reg, bigger_shift_by_gpr));
let size = if shift_by_ty == types::I64 {
OperandSize::Size64
} else {
OperandSize::Size32
};
let imm = RegMemImm::imm(8);
ctx.emit(Inst::alu_rmi_r(
size,
AluRmiROpcode::Add,
imm,
bigger_shift_by_gpr,
));
let bigger_shift_by_xmm = ctx.alloc_tmp(dst_ty).only_reg().unwrap();
ctx.emit(Inst::gpr_to_xmm(
SseOpcode::Movd,
RegMem::from(bigger_shift_by_gpr),
OperandSize::Size32,
bigger_shift_by_xmm,
));
RegMemImm::reg(bigger_shift_by_xmm.to_reg())
}
RegMemImm::Mem { .. } => unimplemented!("load shift amount to XMM register"),
};
// Unpack and shift the lower lanes of `src` into the `dst` register.
ctx.emit(Inst::gen_move(dst, src, dst_ty));
ctx.emit(Inst::xmm_rm_r(SseOpcode::Punpcklbw, RegMem::from(dst), dst));
ctx.emit(Inst::xmm_rmi_reg(
SseOpcode::Psraw,
bigger_shift_by.clone(),
dst,
));
// Unpack and shift the upper lanes of `src` into a temporary register, `upper_lanes`.
let upper_lanes = ctx.alloc_tmp(dst_ty).only_reg().unwrap();
ctx.emit(Inst::gen_move(upper_lanes, src, dst_ty));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Punpckhbw,
RegMem::from(upper_lanes),
upper_lanes,
));
ctx.emit(Inst::xmm_rmi_reg(
SseOpcode::Psraw,
bigger_shift_by,
upper_lanes,
));
// Merge the upper and lower shifted lanes into `dst`.
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Packsswb,
RegMem::from(upper_lanes),
dst,
));
} else if dst_ty == types::I64X2 && op == Opcode::Sshr {
// The `sshr.i8x16` CLIF instruction has no single x86 instruction in the older feature sets; newer ones
// like AVX512VL + AVX512F include VPSRAQ, a 128-bit instruction that would fit here, but this backend
// does not currently have support for EVEX encodings (TODO when EVEX support is available, add an
// alternate lowering here). To remedy this, we extract each 64-bit lane to a GPR, shift each using a
// scalar instruction, and insert the shifted values back in the `dst` XMM register.
let src = put_input_in_reg(ctx, inputs[0]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
ctx.emit(Inst::gen_move(dst, src, dst_ty));
// Extract the upper and lower lanes into temporary GPRs.
let lower_lane = ctx.alloc_tmp(types::I64).only_reg().unwrap();
emit_extract_lane(ctx, src, lower_lane, 0, types::I64);
let upper_lane = ctx.alloc_tmp(types::I64).only_reg().unwrap();
emit_extract_lane(ctx, src, upper_lane, 1, types::I64);
// Shift each value.
let mut shift = |reg: Writable<Reg>| {
let kind = ShiftKind::ShiftRightArithmetic;
if let Some(shift_by) = ctx.get_input_as_source_or_const(insn, 1).constant {
// Mask the shift amount according to Cranelift's semantics.
let shift_by = (shift_by as u8) & (types::I64.bits() as u8 - 1);
ctx.emit(Inst::shift_r(
OperandSize::Size64,
kind,
Some(shift_by),
reg,
));
} else {
let dynamic_shift_by = put_input_in_reg(ctx, inputs[1]);
let w_rcx = Writable::from_reg(regs::rcx());
ctx.emit(Inst::mov_r_r(OperandSize::Size64, dynamic_shift_by, w_rcx));
ctx.emit(Inst::shift_r(OperandSize::Size64, kind, None, reg));
};
};
shift(lower_lane);
shift(upper_lane);
// Insert the scalar values back into the `dst` vector.
emit_insert_lane(ctx, RegMem::from(lower_lane), dst, 0, types::I64);
emit_insert_lane(ctx, RegMem::from(upper_lane), dst, 1, types::I64);
} else {
// For the remaining packed shifts not covered above, x86 has implementations that can either:
// - shift using an immediate
@@ -1940,13 +1777,11 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
types::I16X8 => match op {
Opcode::Ishl => SseOpcode::Psllw,
Opcode::Ushr => SseOpcode::Psrlw,
Opcode::Sshr => SseOpcode::Psraw,
_ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
},
types::I32X4 => match op {
Opcode::Ishl => SseOpcode::Pslld,
Opcode::Ushr => SseOpcode::Psrld,
Opcode::Sshr => SseOpcode::Psrad,
_ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
},
types::I64X2 => match op {

View File

@@ -69,6 +69,31 @@ where
OperandSize::from_ty(ty)
}
fn put_in_reg_mem_imm(&mut self, val: Value) -> RegMemImm {
let inputs = self.lower_ctx.get_value_as_source_or_const(val);
if let Some(c) = inputs.constant {
if let Some(imm) = to_simm32(c as i64) {
return imm;
}
// Generate constants fresh at each use to minimize long-range
// register pressure.
let ty = self.value_type(val);
return RegMemImm::reg(generated_code::constructor_imm(self, ty, c).unwrap());
}
if let Some((src_insn, 0)) = inputs.inst {
if let Some((addr_input, offset)) = is_mergeable_load(self.lower_ctx, src_insn) {
self.lower_ctx.sink_inst(src_insn);
let amode = lower_to_amode(self.lower_ctx, addr_input, offset);
return RegMemImm::mem(amode);
}
}
RegMemImm::reg(self.put_in_reg(val))
}
fn put_in_reg_mem(&mut self, val: Value) -> RegMem {
let inputs = self.lower_ctx.get_value_as_source_or_const(val);
@@ -90,6 +115,23 @@ where
RegMem::reg(self.put_in_reg(val))
}
fn put_masked_in_imm8_reg(&mut self, val: Value, ty: Type) -> Imm8Reg {
let inputs = self.lower_ctx.get_value_as_source_or_const(val);
if let Some(c) = inputs.constant {
let mask = 1_u64
.checked_shl(ty.bits() as u32)
.map_or(u64::MAX, |x| x - 1);
return Imm8Reg::Imm8 {
imm: (c & mask) as u8,
};
}
Imm8Reg::Reg {
reg: self.put_in_regs(val).regs()[0],
}
}
#[inline]
fn encode_fcmp_imm(&mut self, imm: &FcmpImm) -> u8 {
imm.encode()
@@ -131,12 +173,12 @@ where
}
#[inline]
fn mask_imm8_const(&mut self, imm8: &Imm8Reg, mask: u64) -> Imm8Reg {
match imm8 {
&Imm8Reg::Reg { reg } => Imm8Reg::Reg { reg },
&Imm8Reg::Imm8 { imm } => Imm8Reg::Imm8 {
imm: imm & (mask as u8),
},
fn const_to_type_masked_imm8(&mut self, c: u64, ty: Type) -> Imm8Reg {
let mask = 1_u64
.checked_shl(ty.bits() as u32)
.map_or(u64::MAX, |x| x - 1);
Imm8Reg::Imm8 {
imm: (c & mask) as u8,
}
}

View File

@@ -1,4 +1,4 @@
src/clif.isle f176ef3bba99365
src/prelude.isle babc931e5dc5b4cf
src/isa/x64/inst.isle bc5fc626492752c8
src/isa/x64/lower.isle 33e94300f4c08455
src/prelude.isle d95510fad2e2473c
src/isa/x64/inst.isle c16462cc359dd466
src/isa/x64/lower.isle 9f761598e3949e8e

File diff suppressed because it is too large

View File

@@ -95,11 +95,6 @@ macro_rules! isle_prelude_methods {
ty.bits().try_into().unwrap()
}
#[inline]
fn ty_bits_mask(&mut self, ty: Type) -> u64 {
(1 << (self.ty_bits(ty) as u64)) - 1
}
#[inline]
fn ty_bits_u16(&mut self, ty: Type) -> u16 {
ty.bits()
@@ -260,6 +255,21 @@ macro_rules! isle_prelude_methods {
n => Some(n as u64),
}
}
#[inline]
fn u32_add(&mut self, a: u32, b: u32) -> u32 {
a.wrapping_add(b)
}
#[inline]
fn u8_and(&mut self, a: u8, b: u8) -> u8 {
a & b
}
#[inline]
fn lane_type(&mut self, ty: Type) -> Type {
ty.lane_type()
}
};
}

View File

@@ -38,6 +38,12 @@
(type ValueList (primitive ValueList))
(type ValueRegs (primitive ValueRegs))
(decl u32_add (u32 u32) u32)
(extern constructor u32_add u32_add)
(decl u8_and (u8 u8) u8)
(extern constructor u8_and u8_and)
;;;; Registers ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(type Reg (primitive Reg))
@@ -146,6 +152,10 @@
(decl ty_bits_u16 (Type) u16)
(extern constructor ty_bits_u16 ty_bits_u16)
;; Get the type of each lane in the given type.
(decl lane_type (Type) Type)
(extern constructor lane_type lane_type)
;;;; Helper Clif Extractors ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; An extractor that only matches types that can fit in 16 bits.
@@ -242,11 +252,6 @@
(and (result_type ty)
inst))
;; Return a bitmask that will mask off a count to be within `ty`'s
;; bit-width. Used for shifts/rotates.
(decl ty_bits_mask (Type) u64)
(extern constructor ty_bits_mask ty_bits_mask)
;; Match a multi-lane type, extracting (# bits per lane, # lanes) from the given
;; type. Will only match when there is more than one lane.
(decl multi_lane (u8 u16) Type)
@@ -256,6 +261,11 @@
(decl def_inst (Inst) Value)
(extern extractor def_inst def_inst)
;; Extract a constant `u64` from a value defined by an `iconst`.
(decl u64_from_iconst (u64) Value)
(extractor (u64_from_iconst x)
(def_inst (iconst (u64_from_imm64 x))))
;;;; Helpers for Working with Flags ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Newtype wrapper around `MInst` for instructions that are used for their

View File

@@ -1173,35 +1173,32 @@ block0(v0: i128, v1: i128):
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 28)
; (instruction range: 0 .. 25)
; Inst 0: pushq %rbp
; Inst 1: movq %rsp, %rbp
; Inst 2: movq %rdi, %r8
; Inst 3: movq %rsi, %rdi
; Inst 4: movq %rdi, %rsi
; Inst 5: movq %rdx, %rcx
; Inst 6: sarq %cl, %rsi
; Inst 7: movq %rdx, %rcx
; Inst 8: shrq %cl, %r8
; Inst 9: movl $64, %ecx
; Inst 10: subq %rdx, %rcx
; Inst 11: movq %rdi, %rax
; Inst 12: shlq %cl, %rax
; Inst 13: xorq %rcx, %rcx
; Inst 14: testq $127, %rdx
; Inst 15: cmovzq %rcx, %rax
; Inst 16: orq %r8, %rax
; Inst 17: sarq $63, %rdi
; Inst 18: xorq %rcx, %rcx
; Inst 19: andq $64, %rdx
; Inst 20: cmovzq %rsi, %rdi
; Inst 21: cmovzq %rax, %rcx
; Inst 22: cmovnzq %rsi, %rcx
; Inst 23: movq %rcx, %rax
; Inst 24: movq %rdi, %rdx
; Inst 25: movq %rbp, %rsp
; Inst 26: popq %rbp
; Inst 27: ret
; Inst 2: movq %rdi, %rax
; Inst 3: movq %rdx, %rcx
; Inst 4: shrq %cl, %rax
; Inst 5: movq %rsi, %rdi
; Inst 6: movq %rdx, %rcx
; Inst 7: sarq %cl, %rdi
; Inst 8: movl $64, %ecx
; Inst 9: subq %rdx, %rcx
; Inst 10: movq %rsi, %r8
; Inst 11: shlq %cl, %r8
; Inst 12: xorq %rcx, %rcx
; Inst 13: testq $127, %rdx
; Inst 14: cmovzq %rcx, %r8
; Inst 15: orq %r8, %rax
; Inst 16: sarq $63, %rsi
; Inst 17: testq $64, %rdx
; Inst 18: cmovzq %rdi, %rsi
; Inst 19: cmovzq %rax, %rdi
; Inst 20: movq %rdi, %rax
; Inst 21: movq %rsi, %rdx
; Inst 22: movq %rbp, %rsp
; Inst 23: popq %rbp
; Inst 24: ret
; }}
function %f33(i128, i128) -> i128 {

View File

@@ -324,16 +324,16 @@ block0(v0: i32):
; (instruction range: 0 .. 15)
; Inst 0: pushq %rbp
; Inst 1: movq %rsp, %rbp
; Inst 2: load_const VCodeConstant(0), %xmm0
; Inst 3: addl $8, %edi
; Inst 4: movd %edi, %xmm2
; Inst 5: movdqa %xmm0, %xmm1
; Inst 6: punpcklbw %xmm1, %xmm1
; Inst 7: psraw %xmm2, %xmm1
; Inst 8: punpckhbw %xmm0, %xmm0
; Inst 2: load_const VCodeConstant(0), %xmm2
; Inst 3: movdqa %xmm2, %xmm0
; Inst 4: punpcklbw %xmm2, %xmm0
; Inst 5: movdqa %xmm2, %xmm1
; Inst 6: punpckhbw %xmm2, %xmm1
; Inst 7: addl $8, %edi
; Inst 8: movd %edi, %xmm2
; Inst 9: psraw %xmm2, %xmm0
; Inst 10: packsswb %xmm0, %xmm1
; Inst 11: movdqa %xmm1, %xmm0
; Inst 10: psraw %xmm2, %xmm1
; Inst 11: packsswb %xmm1, %xmm0
; Inst 12: movq %rbp, %rsp
; Inst 13: popq %rbp
; Inst 14: ret
@@ -349,19 +349,20 @@ block0(v0: i8x16, v1: i32):
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 12)
; (instruction range: 0 .. 13)
; Inst 0: pushq %rbp
; Inst 1: movq %rsp, %rbp
; Inst 2: movdqa %xmm0, %xmm1
; Inst 3: movdqa %xmm1, %xmm0
; Inst 4: punpcklbw %xmm0, %xmm0
; Inst 5: psraw $11, %xmm0
; Inst 6: punpckhbw %xmm1, %xmm1
; Inst 7: psraw $11, %xmm1
; Inst 8: packsswb %xmm1, %xmm0
; Inst 9: movq %rbp, %rsp
; Inst 10: popq %rbp
; Inst 11: ret
; Inst 3: punpcklbw %xmm0, %xmm1
; Inst 4: movdqa %xmm0, %xmm2
; Inst 5: punpckhbw %xmm0, %xmm2
; Inst 6: psraw $11, %xmm1
; Inst 7: psraw $11, %xmm2
; Inst 8: packsswb %xmm2, %xmm1
; Inst 9: movdqa %xmm1, %xmm0
; Inst 10: movq %rbp, %rsp
; Inst 11: popq %rbp
; Inst 12: ret
; }}
function %sshr_i64x2(i64x2, i32) -> i64x2 {
@@ -374,21 +375,20 @@ block0(v0: i64x2, v1: i32):
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 15)
; (instruction range: 0 .. 14)
; Inst 0: pushq %rbp
; Inst 1: movq %rsp, %rbp
; Inst 2: movdqa %xmm0, %xmm1
; Inst 3: pextrd.w $0, %xmm0, %rsi
; Inst 4: pextrd.w $1, %xmm0, %rax
; Inst 5: movq %rdi, %rcx
; Inst 6: sarq %cl, %rsi
; Inst 7: movq %rdi, %rcx
; Inst 8: sarq %cl, %rax
; Inst 9: pinsrd.w $0, %rsi, %xmm1
; Inst 10: pinsrd.w $1, %rax, %xmm1
; Inst 11: movdqa %xmm1, %xmm0
; Inst 12: movq %rbp, %rsp
; Inst 13: popq %rbp
; Inst 14: ret
; Inst 2: pextrd.w $0, %xmm0, %rsi
; Inst 3: pextrd.w $1, %xmm0, %rax
; Inst 4: movq %rdi, %rcx
; Inst 5: sarq %cl, %rsi
; Inst 6: movq %rdi, %rcx
; Inst 7: sarq %cl, %rax
; Inst 8: uninit %xmm0
; Inst 9: pinsrd.w $0, %rsi, %xmm0
; Inst 10: pinsrd.w $1, %rax, %xmm0
; Inst 11: movq %rbp, %rsp
; Inst 12: popq %rbp
; Inst 13: ret
; }}

View File

@@ -13,7 +13,7 @@ use std::path::{Path, PathBuf};
use std::time;
/// Timeout in seconds when we're not making progress.
const TIMEOUT_PANIC: usize = 10;
const TIMEOUT_PANIC: usize = 60;
/// Timeout for reporting slow tests without panicking.
const TIMEOUT_SLOW: usize = 3;