cranelift: port sshr to ISLE on x64 (#3681)
@@ -1,4 +1,4 @@
 src/clif.isle f176ef3bba99365
-src/prelude.isle babc931e5dc5b4cf
+src/prelude.isle d95510fad2e2473c
 src/isa/aarch64/inst.isle 5fa80451697b084f
 src/isa/aarch64/lower.isle 2d2e1e076a0c8a23
@@ -20,6 +20,8 @@ pub trait Context {
     fn pack_value_array_2(&mut self, arg0: Value, arg1: Value) -> ValueArray2;
     fn unpack_value_array_3(&mut self, arg0: &ValueArray3) -> (Value, Value, Value);
     fn pack_value_array_3(&mut self, arg0: Value, arg1: Value, arg2: Value) -> ValueArray3;
+    fn u32_add(&mut self, arg0: u32, arg1: u32) -> u32;
+    fn u8_and(&mut self, arg0: u8, arg1: u8) -> u8;
     fn value_reg(&mut self, arg0: Reg) -> ValueRegs;
     fn value_regs(&mut self, arg0: Reg, arg1: Reg) -> ValueRegs;
     fn temp_writable_reg(&mut self, arg0: Type) -> WritableReg;
@@ -32,6 +34,7 @@ pub trait Context {
     fn u32_as_u64(&mut self, arg0: u32) -> u64;
     fn ty_bits(&mut self, arg0: Type) -> u8;
     fn ty_bits_u16(&mut self, arg0: Type) -> u16;
+    fn lane_type(&mut self, arg0: Type) -> Type;
     fn fits_in_16(&mut self, arg0: Type) -> Option<Type>;
     fn fits_in_32(&mut self, arg0: Type) -> Option<Type>;
     fn fits_in_64(&mut self, arg0: Type) -> Option<Type>;
@@ -52,7 +55,6 @@ pub trait Context {
     fn first_result(&mut self, arg0: Inst) -> Option<Value>;
     fn inst_data(&mut self, arg0: Inst) -> InstructionData;
     fn value_type(&mut self, arg0: Value) -> Type;
-    fn ty_bits_mask(&mut self, arg0: Type) -> u64;
     fn multi_lane(&mut self, arg0: Type) -> Option<(u8, u16)>;
     fn def_inst(&mut self, arg0: Value) -> Option<Inst>;
    fn trap_code_division_by_zero(&mut self) -> TrapCode;
@@ -89,13 +91,13 @@ pub trait Context {
     fn rotr_opposite_amount(&mut self, arg0: Type, arg1: ImmShift) -> ImmShift;
 }
 
-/// Internal type ProducesFlags: defined at src/prelude.isle line 263.
+/// Internal type ProducesFlags: defined at src/prelude.isle line 273.
 #[derive(Clone, Debug)]
 pub enum ProducesFlags {
     ProducesFlags { inst: MInst, result: Reg },
 }
 
-/// Internal type ConsumesFlags: defined at src/prelude.isle line 266.
+/// Internal type ConsumesFlags: defined at src/prelude.isle line 276.
 #[derive(Clone, Debug)]
 pub enum ConsumesFlags {
     ConsumesFlags { inst: MInst, result: Reg },
@@ -975,7 +977,7 @@ pub enum AtomicRMWOp {
 // Generated as internal constructor for term temp_reg.
 pub fn constructor_temp_reg<C: Context>(ctx: &mut C, arg0: Type) -> Option<Reg> {
     let pattern0_0 = arg0;
-    // Rule at src/prelude.isle line 60.
+    // Rule at src/prelude.isle line 66.
     let expr0_0 = C::temp_writable_reg(ctx, pattern0_0);
     let expr1_0 = C::writable_reg_to_reg(ctx, expr0_0);
     return Some(expr1_0);
@@ -984,7 +986,7 @@ pub fn constructor_temp_reg<C: Context>(ctx: &mut C, arg0: Type) -> Option<Reg>
 // Generated as internal constructor for term lo_reg.
 pub fn constructor_lo_reg<C: Context>(ctx: &mut C, arg0: Value) -> Option<Reg> {
     let pattern0_0 = arg0;
-    // Rule at src/prelude.isle line 95.
+    // Rule at src/prelude.isle line 101.
     let expr0_0 = C::put_in_regs(ctx, pattern0_0);
     let expr1_0: usize = 0;
     let expr2_0 = C::value_regs_get(ctx, expr0_0, expr1_0);
@@ -1009,7 +1011,7 @@ pub fn constructor_with_flags<C: Context>(
             result: pattern3_1,
         } = pattern2_0
     {
-        // Rule at src/prelude.isle line 276.
+        // Rule at src/prelude.isle line 286.
         let expr0_0 = C::emit(ctx, &pattern1_0);
         let expr1_0 = C::emit(ctx, &pattern3_0);
         let expr2_0 = C::value_regs(ctx, pattern1_1, pattern3_1);
@@ -1037,7 +1039,7 @@ pub fn constructor_with_flags_1<C: Context>(
             result: pattern3_1,
         } = pattern2_0
     {
-        // Rule at src/prelude.isle line 284.
+        // Rule at src/prelude.isle line 294.
         let expr0_0 = C::emit(ctx, &pattern1_0);
         let expr1_0 = C::emit(ctx, &pattern3_0);
         return Some(pattern3_1);
@@ -1071,7 +1073,7 @@ pub fn constructor_with_flags_2<C: Context>(
             result: pattern5_1,
         } = pattern4_0
     {
-        // Rule at src/prelude.isle line 294.
+        // Rule at src/prelude.isle line 304.
         let expr0_0 = C::emit(ctx, &pattern1_0);
         let expr1_0 = C::emit(ctx, &pattern5_0);
         let expr2_0 = C::emit(ctx, &pattern3_0);
@@ -35,6 +35,7 @@
       (dst WritableReg)
       (imm u8)
       (size OperandSize))
+    (XmmUninitializedValue (dst WritableReg))
     (CmpRmiR (size OperandSize)
              (opcode CmpOpcode)
              (src RegMemImm)
@@ -292,6 +293,15 @@
       (Mem (addr SyntheticAmode))
       (Imm (simm32 u32))))
 
+;; Put the given clif value into a `RegMemImm` operand.
+;;
+;; Asserts that the value fits into a single register, and doesn't require
+;; multiple registers for its representation (like `i128` for example).
+;;
+;; As a side effect, this marks the value as used.
+(decl put_in_reg_mem_imm (Value) RegMemImm)
+(extern constructor put_in_reg_mem_imm put_in_reg_mem_imm)
+
 (type RegMem extern
   (enum
     (Reg (reg Reg))
@@ -319,6 +329,18 @@
   (enum (Imm8 (imm u8))
        (Reg (reg Reg))))
 
+;; Put the given clif value into a `Imm8Reg` operand, masked to the bit width of
+;; the given type.
+;;
+;; Asserts that the value fits into a single register, and doesn't require
+;; multiple registers for its representation (like `i128` for example).
+;;
+;; As a side effect, this marks the value as used.
+;;
+;; This is used when lowering various shifts and rotates.
+(decl put_masked_in_imm8_reg (Value Type) Imm8Reg)
+(extern constructor put_masked_in_imm8_reg put_masked_in_imm8_reg)
+
 (type CC extern
   (enum O
         NO
@@ -383,9 +405,12 @@
 (decl imm8_from_value (Imm8Reg) Value)
 (extern extractor imm8_from_value imm8_from_value)
 
-;; Mask an `Imm8Reg.Imm8`.
-(decl mask_imm8_const (Imm8Reg u64) Imm8Reg)
-(extern constructor mask_imm8_const mask_imm8_const)
+;; Mask a constant to the bit-width of the given type and package it into an
+;; `Imm8Reg.Imm8`. This is used for shifts and rotates, so that we don't try and
+;; shift/rotate more bits than the type has available, per Cranelift's
+;; semantics.
+(decl const_to_type_masked_imm8 (u64 Type) Imm8Reg)
+(extern constructor const_to_type_masked_imm8 const_to_type_masked_imm8)
 
 ;; Extract a constant `RegMemImm.Imm` from a value operand.
 (decl simm32_from_value (RegMemImm) Value)
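A sketch of what the new `const_to_type_masked_imm8` term computes, useful when reading the shift and rotate rules below. This is not the backend's code; `Imm8` here is a hypothetical stand-in for `Imm8Reg.Imm8`:

```rust
/// Hypothetical stand-in for the backend's `Imm8Reg.Imm8` case.
#[derive(Debug, PartialEq)]
struct Imm8(u8);

/// Mask a constant to the bit width of the type, per Cranelift's shift and
/// rotate semantics: shifting an iN by `c` behaves like shifting by `c % N`.
fn const_to_type_masked_imm8(c: u64, ty_bits: u32) -> Imm8 {
    let mask = 1u64.checked_shl(ty_bits).map_or(u64::MAX, |x| x - 1);
    Imm8((c & mask) as u8)
}

fn main() {
    // Shifting an i16 by 18 is the same as shifting it by 2.
    assert_eq!(const_to_type_masked_imm8(18, 16), Imm8(2));
    // For a 64-bit type, 1 << 64 overflows, so the mask falls back to
    // u64::MAX; the hardware then reduces the 8-bit count mod 64 itself.
    assert_eq!(const_to_type_masked_imm8(200, 64), Imm8(200));
}
```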
@@ -494,6 +519,37 @@
         wr))))
     r))
 
+;; Helper for creating an SSE register holding an `i64x2` from two `i64` values.
+(decl make_i64x2_from_lanes (RegMem RegMem) Reg)
+(rule (make_i64x2_from_lanes lo hi)
+      (let ((dst_w WritableReg (temp_writable_reg $I64X2))
+            (dst_r Reg (writable_reg_to_reg dst_w))
+            (_0 Unit (emit (MInst.XmmUninitializedValue dst_w)))
+            (_1 Unit (emit (MInst.XmmRmRImm (SseOpcode.Pinsrd)
+                                            dst_r
+                                            lo
+                                            dst_w
+                                            0
+                                            (OperandSize.Size64))))
+            (_2 Unit (emit (MInst.XmmRmRImm (SseOpcode.Pinsrd)
+                                            dst_r
+                                            hi
+                                            dst_w
+                                            1
+                                            (OperandSize.Size64)))))
+        dst_r))
+
+;; Move a `RegMemImm.Reg` operand to an XMM register, if necessary.
+(decl reg_mem_imm_to_xmm (RegMemImm) RegMemImm)
+(rule (reg_mem_imm_to_xmm rmi @ (RegMemImm.Mem _)) rmi)
+(rule (reg_mem_imm_to_xmm rmi @ (RegMemImm.Imm _)) rmi)
+(rule (reg_mem_imm_to_xmm (RegMemImm.Reg r))
+      (RegMemImm.Reg (gpr_to_xmm $I8X16
+                                 (SseOpcode.Movd)
+                                 (RegMem.Reg r)
+                                 (OperandSize.Size32))))
+
+
 ;;;; Instruction Constructors ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;
 ;; These constructors create SSA-style `MInst`s. It is their responsibility to
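For intuition about `make_i64x2_from_lanes`: with `OperandSize.Size64`, `pinsrd` is effectively `pinsrq`, and each emission overwrites one 64-bit lane of the destination. A scalar Rust model of that lane insertion (the names here are illustrative, not the backend's):

```rust
/// Model an XMM register as two 64-bit lanes.
type V128 = [u64; 2];

/// Scalar model of `pinsrq dst, src, lane`: replace one 64-bit lane and
/// leave the other untouched.
fn pinsrq(mut dst: V128, src: u64, lane: usize) -> V128 {
    dst[lane] = src;
    dst
}

fn main() {
    // Both lanes get written, which is presumably why the helper can start
    // from `XmmUninitializedValue` rather than zeroing or copying a source.
    let uninit: V128 = [0xdead_beef, 0xdead_beef];
    assert_eq!(pinsrq(pinsrq(uninit, 1, 0), 2, 1), [1, 2]);
}
```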
@@ -1058,6 +1114,21 @@
 (rule (pminud src1 src2)
       (xmm_rm_r $I8X16 (SseOpcode.Pminud) src1 src2))
 
+;; Helper for creating `punpcklbw` instructions.
+(decl punpcklbw (Reg RegMem) Reg)
+(rule (punpcklbw src1 src2)
+      (xmm_rm_r $I8X16 (SseOpcode.Punpcklbw) src1 src2))
+
+;; Helper for creating `punpckhbw` instructions.
+(decl punpckhbw (Reg RegMem) Reg)
+(rule (punpckhbw src1 src2)
+      (xmm_rm_r $I8X16 (SseOpcode.Punpckhbw) src1 src2))
+
+;; Helper for creating `packsswb` instructions.
+(decl packsswb (Reg RegMem) Reg)
+(rule (packsswb src1 src2)
+      (xmm_rm_r $I8X16 (SseOpcode.Packsswb) src1 src2))
+
 ;; Helper for creating `MInst.XmmRmRImm` instructions.
 (decl xmm_rm_r_imm (SseOpcode Reg RegMem u8 OperandSize) Reg)
 (rule (xmm_rm_r_imm op src1 src2 imm size)
@@ -1180,6 +1251,16 @@
 (rule (psrlq src1 src2)
       (xmm_rmi_reg (SseOpcode.Psrlq) src1 src2))
 
+;; Helper for creating `psraw` instructions.
+(decl psraw (Reg RegMemImm) Reg)
+(rule (psraw src1 src2)
+      (xmm_rmi_reg (SseOpcode.Psraw) src1 src2))
+
+;; Helper for creating `psrad` instructions.
+(decl psrad (Reg RegMemImm) Reg)
+(rule (psrad src1 src2)
+      (xmm_rmi_reg (SseOpcode.Psrad) src1 src2))
+
 ;; Helper for creating `MInst.MulHi` instructions.
 ;;
 ;; Returns the (lo, hi) register halves of the multiplication.
@@ -1252,6 +1333,19 @@
 (rule (insertps src1 src2 lane)
       (xmm_rm_r_imm (SseOpcode.Insertps) src1 src2 lane (OperandSize.Size32)))
 
+;; Helper for creating `pextrd` instructions.
+(decl pextrd (Type Reg u8) Reg)
+(rule (pextrd ty src lane)
+      (let ((w_dst WritableReg (temp_writable_reg ty))
+            (r_dst Reg (writable_reg_to_reg w_dst))
+            (_ Unit (emit (MInst.XmmRmRImm (SseOpcode.Pextrd)
+                                           r_dst
+                                           (RegMem.Reg src)
+                                           w_dst
+                                           lane
+                                           (operand_size_of_type_32_64 (lane_type ty))))))
+        r_dst))
+
 ;; Helper for creating `not` instructions.
 (decl not (Type Reg) Reg)
 (rule (not ty src)
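One behavior of `psraw`/`psrad` worth keeping in mind for the `sshr` rules that use these helpers: unlike scalar `sar`, the vector shift count is not reduced modulo the lane width; any count at or above it fills the lane with sign bits. A one-lane Rust model of my reading of the SSE2 semantics:

```rust
/// Scalar model of one lane of `psraw`: an arithmetic right shift whose
/// count saturates instead of wrapping, leaving only sign bits.
fn psraw_lane(lane: i16, count: u64) -> i16 {
    if count >= 16 {
        if lane < 0 { -1 } else { 0 }
    } else {
        lane >> count
    }
}

fn main() {
    assert_eq!(psraw_lane(-32768, 3), -4096);
    // The count is not taken modulo 16:
    assert_eq!(psraw_lane(-32768, 100), -1);
    assert_eq!(psraw_lane(12345, 100), 0);
}
```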
@@ -537,13 +537,7 @@
 ;; `i64` and smaller.
 
 (rule (lower (has_type (fits_in_64 ty) (ishl src amt)))
-      ;; NB: Only the low bits of `amt` matter since we logically mask the shift
-      ;; amount to the value's bit width.
-      (let ((amt_ Reg (lo_reg amt)))
-        (value_reg (shl ty (put_in_reg src) (Imm8Reg.Reg amt_)))))
-
-(rule (lower (has_type (fits_in_64 ty) (ishl src (imm8_from_value amt))))
-      (value_reg (shl ty (put_in_reg src) amt)))
+      (value_reg (shl ty (put_in_reg src) (put_masked_in_imm8_reg amt ty))))
 
 ;; `i128`.
 
@@ -582,15 +576,8 @@
 ;; `i64` and smaller.
 
 (rule (lower (has_type (fits_in_64 ty) (ushr src amt)))
-      (let ((src_ Reg (extend_to_reg src ty (ExtendKind.Zero)))
-            ;; NB: Only the low bits of `amt` matter since we logically mask the
-            ;; shift amount to the value's bit width.
-            (amt_ Reg (lo_reg amt)))
-        (value_reg (shr ty src_ (Imm8Reg.Reg amt_)))))
-
-(rule (lower (has_type (fits_in_64 ty) (ushr src (imm8_from_value amt))))
       (let ((src_ Reg (extend_to_reg src ty (ExtendKind.Zero))))
-        (value_reg (shr ty src_ amt))))
+        (value_reg (shr ty src_ (put_masked_in_imm8_reg amt ty)))))
 
 ;; `i128`.
 
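Both right-shift rules extend the source before shifting because the shift itself happens in a register wider than `i8`/`i16`: the bits above the type's width must already match the type's interpretation. A small Rust illustration of the two extensions:

```rust
fn main() {
    let x: i8 = -2; // 0xFE
    // Logical shift: the value must be zero-extended first, otherwise the
    // high bits of the wider register would leak into the result.
    let ushr = ((x as u8 as u32) >> 1) as u8;
    assert_eq!(ushr, 0x7F);
    // Arithmetic shift: the value must be sign-extended so that copies of
    // the sign bit shift in from above.
    let sshr = ((x as i32) >> 1) as i8;
    assert_eq!(sshr, -1);
}
```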
@@ -623,6 +610,109 @@
       (let ((amt_ Reg (lo_reg amt)))
         (shr_i128 (put_in_regs src) amt_)))
 
+;;;; Rules for `sshr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; `i64` and smaller.
+
+(rule (lower (has_type (fits_in_64 ty) (sshr src amt)))
+      (let ((src_ Reg (extend_to_reg src ty (ExtendKind.Sign))))
+        (value_reg (sar ty src_ (put_masked_in_imm8_reg amt ty)))))
+
+;; `i128`.
+
+(decl sar_i128 (ValueRegs Reg) ValueRegs)
+(rule (sar_i128 src amt)
+      ;; Unpack the low/high halves of `src`.
+      (let ((src_lo Reg (value_regs_get src 0))
+            (src_hi Reg (value_regs_get src 1))
+            ;; Do a shift of each half. NB: the low half uses an unsigned shift
+            ;; because its MSB is not a sign bit.
+            (lo_shifted Reg (shr $I64 src_lo (Imm8Reg.Reg amt)))
+            (hi_shifted Reg (sar $I64 src_hi (Imm8Reg.Reg amt)))
+            ;; `src_hi << (64 - amt)` are the bits to carry over from the high
+            ;; half to the low half.
+            (carry Reg (shl $I64 src_hi (Imm8Reg.Reg (sub $I64 (imm $I64 64) (RegMemImm.Reg amt)))))
+            ;; Nullify the carry if we are shifting by a multiple of 128.
+            (carry_ Reg (with_flags_1 (test (OperandSize.Size64) (RegMemImm.Imm 127) amt)
+                                      (cmove $I64 (CC.Z) (RegMem.Reg (imm $I64 0)) carry)))
+            ;; Add the carry into the low half.
+            (lo_shifted_ Reg (or $I64 lo_shifted (RegMemImm.Reg carry_)))
+            ;; Get all sign bits.
+            (sign_bits Reg (sar $I64 src_hi (Imm8Reg.Imm8 63))))
+        ;; Combine the two shifted halves. However, if we are shifting by >= 64
+        ;; (modulo 128), then the hi bits are all sign bits and the lo bits are
+        ;; what would otherwise be our hi bits.
+        (with_flags_2 (test (OperandSize.Size64) (RegMemImm.Imm 64) amt)
+                      (cmove $I64 (CC.Z) (RegMem.Reg lo_shifted_) hi_shifted)
+                      (cmove $I64 (CC.Z) (RegMem.Reg hi_shifted) sign_bits))))
+
+(rule (lower (has_type $I128 (sshr src amt)))
+      ;; NB: Only the low bits of `amt` matter since we logically mask the shift
+      ;; amount to the value's bit width.
+      (let ((amt_ Reg (lo_reg amt)))
+        (sar_i128 (put_in_regs src) amt_)))
+
+;; SSE.
+
+;; Since the x86 instruction set does not have an 8x16 shift instruction and the
+;; approach used for `ishl` and `ushr` cannot be easily used (the masks do not
+;; preserve the sign), we use a different approach here: separate the low and
+;; high lanes, shift them separately, and merge them into the final result.
+;;
+;; Visually, this looks like the following, where `src.i8x16 = [s0, s1, ...,
+;; s15]:
+;;
+;; lo.i16x8 = [(s0, s0), (s1, s1), ..., (s7, s7)]
+;; shifted_lo.i16x8 = shift each lane of `low`
+;; hi.i16x8 = [(s8, s8), (s9, s9), ..., (s15, s15)]
+;; shifted_hi.i16x8 = shift each lane of `high`
+;; result = [s0'', s1'', ..., s15'']
+(rule (lower (has_type $I8X16 (sshr src amt @ (value_type amt_ty))))
+      (let ((src_ Reg (put_in_reg src))
+            ;; In order for `packsswb` later to only use the high byte of each
+            ;; 16x8 lane, we shift right an extra 8 bits, relying on `psraw` to
+            ;; fill in the upper bits appropriately.
+            (lo Reg (punpcklbw src_ (RegMem.Reg src_)))
+            (hi Reg (punpckhbw src_ (RegMem.Reg src_)))
+            (amt_ RegMemImm (sshr_i8x16_bigger_shift amt_ty (put_in_reg_mem_imm amt)))
+            (shifted_lo Reg (psraw lo amt_))
+            (shifted_hi Reg (psraw hi amt_)))
+        (value_reg (packsswb shifted_lo (RegMem.Reg shifted_hi)))))
+
+(decl sshr_i8x16_bigger_shift (Type RegMemImm) RegMemImm)
+(rule (sshr_i8x16_bigger_shift _ty (RegMemImm.Imm i))
+      (RegMemImm.Imm (u32_add i 8)))
+(rule (sshr_i8x16_bigger_shift ty (RegMemImm.Reg r))
+      (reg_mem_imm_to_xmm (RegMemImm.Reg (add ty r (RegMemImm.Imm 8)))))
+(rule (sshr_i8x16_bigger_shift ty rmi @ (RegMemImm.Mem _m))
+      (reg_mem_imm_to_xmm (RegMemImm.Reg (add ty (imm ty 8) rmi))))
+
+;; `sshr.{i16x8,i32x4}` can be a simple `psra{w,d}`, we just have to make sure
+;; that if the shift amount is in a register, it is in an XMM register.
+(rule (lower (has_type $I16X8 (sshr src amt)))
+      (value_reg (psraw (put_in_reg src)
+                        (reg_mem_imm_to_xmm (put_in_reg_mem_imm amt)))))
+(rule (lower (has_type $I32X4 (sshr src amt)))
+      (value_reg (psrad (put_in_reg src)
+                        (reg_mem_imm_to_xmm (put_in_reg_mem_imm amt)))))
+
+;; The `sshr.i64x2` CLIF instruction has no single x86 instruction in the older
+;; feature sets. Newer ones like AVX512VL + AVX512F include `vpsraq`, a 128-bit
+;; instruction that would fit here, but this backend does not currently have
+;; support for EVEX encodings. To remedy this, we extract each 64-bit lane to a
+;; GPR, shift each using a scalar instruction, and insert the shifted values
+;; back in the `dst` XMM register.
+;;
+;; (TODO: when EVEX support is available, add an alternate lowering here).
+(rule (lower (has_type $I64X2 (sshr src amt)))
+      (let ((src_ Reg (put_in_reg src))
+            (lo Reg (pextrd $I64 src_ 0))
+            (hi Reg (pextrd $I64 src_ 1))
+            (amt_ Imm8Reg (put_masked_in_imm8_reg amt $I64))
+            (shifted_lo Reg (sar $I64 lo amt_))
+            (shifted_hi Reg (sar $I64 hi amt_)))
+        (value_reg (make_i64x2_from_lanes (RegMem.Reg shifted_lo)
+                                          (RegMem.Reg shifted_hi)))))
+
 ;;;; Rules for `rotl` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 ;; `i16` and `i8`: we need to extend the shift amount, or mask the
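A plain-Rust model of the `sar_i128` decomposition above can make the carry and `cmov` logic easier to check. This is a sketch of the same algorithm, not the lowering itself; `amt` is assumed already reduced to 0..=127, matching the `lo_reg` plus `test $127` handling:

```rust
/// Portable model of the `sar_i128` lowering: arithmetic right shift of a
/// 128-bit value held as two 64-bit halves.
fn sar_i128(lo: u64, hi: u64, amt: u32) -> (u64, u64) {
    // x86 `shr/sar/shl %cl` reduce the count mod 64 in hardware.
    let lo_shifted = lo >> (amt & 63);
    let hi_shifted = ((hi as i64) >> (amt & 63)) as u64;
    // Bits that cross from the high half into the low half. For amt == 0
    // the hardware count (64 - 0) & 63 == 0 would leak all of `hi`, so the
    // lowering nullifies the carry (the `test $127` + `cmovz` pair).
    let carry = if amt & 127 == 0 {
        0
    } else {
        hi << ((64 - (amt & 63)) & 63)
    };
    let lo_shifted_ = lo_shifted | carry;
    let sign_bits = ((hi as i64) >> 63) as u64;
    // Shifts of 64..=127 move the sign-extended high half into the low
    // half: the `test $64` + `cmovz` pair.
    if amt & 64 == 0 {
        (lo_shifted_, hi_shifted)
    } else {
        (hi_shifted, sign_bits)
    }
}

fn main() {
    let x: i128 = -0x1234_5678_9abc_def0_i128 << 32;
    for amt in [0u32, 1, 31, 63, 64, 65, 127] {
        let expect = x >> amt;
        let (lo, hi) = sar_i128(x as u64, (x >> 64) as u64, amt);
        assert_eq!((((hi as u128) << 64) | lo as u128) as i128, expect);
    }
    println!("model matches i128 >> for all tested amounts");
}
```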
@@ -632,8 +722,11 @@
       (let ((amt_ Reg (extend_to_reg amt $I32 (ExtendKind.Zero))))
         (value_reg (m_rotl ty (put_in_reg src) (Imm8Reg.Reg amt_)))))
 
-(rule (lower (has_type (ty_8_or_16 ty) (rotl src (imm8_from_value amt))))
-      (value_reg (m_rotl ty (put_in_reg src) (mask_imm8_const amt (ty_bits_mask ty)))))
+(rule (lower (has_type (ty_8_or_16 ty)
+                       (rotl src (u64_from_iconst amt))))
+      (value_reg (m_rotl ty
+                         (put_in_reg src)
+                         (const_to_type_masked_imm8 amt ty))))
 
 ;; `i64` and `i32`: we can rely on x86's rotate-amount masking since
 ;; we operate on the whole register.
@@ -644,8 +737,11 @@
       (let ((amt_ Reg (lo_reg amt)))
         (value_reg (m_rotl ty (put_in_reg src) (Imm8Reg.Reg amt_)))))
 
-(rule (lower (has_type (ty_32_or_64 ty) (rotl src (imm8_from_value amt))))
-      (value_reg (m_rotl ty (put_in_reg src) amt)))
+(rule (lower (has_type (ty_32_or_64 ty)
+                       (rotl src (u64_from_iconst amt))))
+      (value_reg (m_rotl ty
+                         (put_in_reg src)
+                         (const_to_type_masked_imm8 amt ty))))
 
 ;; `i128`.
 
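On the two constant `rotl` rules above: the amount extracted by `u64_from_iconst` may be wider than the rotated type, so it is reduced to the type's bit width first. In Rust terms the intended semantics are just `rotate_left` with a masked amount:

```rust
fn main() {
    // Cranelift masks the rotate amount to the type's bit width, so an i8
    // rotate by 11 behaves like a rotate by 3.
    let x: u8 = 0b1000_0001;
    let amt: u64 = 11; // constant from `u64_from_iconst`, wider than i8
    let masked = (amt & 0x7) as u32; // what const_to_type_masked_imm8 computes
    assert_eq!(x.rotate_left(masked), x.rotate_left(11 % 8));
    assert_eq!(x.rotate_left(masked), 0b0000_1100);
}
```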
@@ -1538,13 +1538,18 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         | Opcode::Umin
         | Opcode::Bnot
         | Opcode::Bitselect
-        | Opcode::Vselect => implemented_in_isle(ctx),
+        | Opcode::Vselect
+        | Opcode::Sshr => implemented_in_isle(ctx),
 
-        Opcode::Ishl | Opcode::Ushr | Opcode::Sshr | Opcode::Rotl | Opcode::Rotr => {
+        Opcode::Ishl | Opcode::Ushr | Opcode::Rotl | Opcode::Rotr => {
             let dst_ty = ctx.output_ty(insn, 0);
             debug_assert_eq!(ctx.input_ty(insn, 0), dst_ty);
 
             if !dst_ty.is_vector() && dst_ty.bits() <= 64 {
+                if op != Opcode::Rotr {
+                    implemented_in_isle(ctx);
+                }
+
                 // Scalar shifts on x86 have various encodings:
                 // - shift by one bit, e.g. `SAL r/m8, 1` (not used here)
                 // - shift by an immediate amount, e.g. `SAL r/m8, imm8`
@@ -1557,10 +1562,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                         OperandSize::Size32,
                         extend_input_to_reg(ctx, inputs[0], ExtSpec::ZeroExtendTo32),
                     ),
-                    Opcode::Sshr => (
-                        OperandSize::Size32,
-                        extend_input_to_reg(ctx, inputs[0], ExtSpec::SignExtendTo32),
-                    ),
                     Opcode::Rotl | Opcode::Rotr => (
                         OperandSize::from_ty(dst_ty),
                         put_input_in_reg(ctx, inputs[0]),
@@ -1590,7 +1591,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 let shift_kind = match op {
                     Opcode::Ishl => ShiftKind::ShiftLeft,
                     Opcode::Ushr => ShiftKind::ShiftRightLogical,
-                    Opcode::Sshr => ShiftKind::ShiftRightArithmetic,
                     Opcode::Rotl => ShiftKind::RotateLeft,
                     Opcode::Rotr => ShiftKind::RotateRight,
                     _ => unreachable!(),
@@ -1608,50 +1608,8 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 let dst = get_output_reg(ctx, outputs[0]);
 
                 match op {
-                    Opcode::Ishl => {
-                        emit_shl_i128(ctx, src, dst, amt_src);
-                    }
-                    Opcode::Ushr => {
-                        emit_shr_i128(ctx, src, dst, amt_src, /* is_signed = */ false);
-                    }
-                    Opcode::Sshr => {
-                        emit_shr_i128(ctx, src, dst, amt_src, /* is_signed = */ true);
-                    }
-                    Opcode::Rotl => {
-                        // (mov tmp, src)
-                        // (shl.i128 tmp, amt)
-                        // (mov dst, src)
-                        // (ushr.i128 dst, 128-amt)
-                        // (or dst, tmp)
-                        let tmp = ctx.alloc_tmp(types::I128);
-                        emit_shl_i128(ctx, src, tmp, amt_src);
-                        let inv_amt = ctx.alloc_tmp(types::I64).only_reg().unwrap();
-                        ctx.emit(Inst::imm(OperandSize::Size64, 128, inv_amt));
-                        ctx.emit(Inst::alu_rmi_r(
-                            OperandSize::Size64,
-                            AluRmiROpcode::Sub,
-                            RegMemImm::reg(amt_src),
-                            inv_amt,
-                        ));
-                        emit_shr_i128(
-                            ctx,
-                            src,
-                            dst,
-                            inv_amt.to_reg(),
-                            /* is_signed = */ false,
-                        );
-                        ctx.emit(Inst::alu_rmi_r(
-                            OperandSize::Size64,
-                            AluRmiROpcode::Or,
-                            RegMemImm::reg(tmp.regs()[0].to_reg()),
-                            dst.regs()[0],
-                        ));
-                        ctx.emit(Inst::alu_rmi_r(
-                            OperandSize::Size64,
-                            AluRmiROpcode::Or,
-                            RegMemImm::reg(tmp.regs()[1].to_reg()),
-                            dst.regs()[1],
-                        ));
-                    }
+                    Opcode::Ishl | Opcode::Ushr | Opcode::Rotl => {
+                        implemented_in_isle(ctx);
+                    }
                     Opcode::Rotr => {
                         // (mov tmp, src)
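For reference, the deleted `Rotl` arm built the rotate out of shifts using the identity `rotl(x, n) = (x << n) | (x >> (128 - n))`. A scalar sketch of the same identity, with the `n % 128 == 0` corner handled explicitly (the emitted machine code handled it through count masking and the carry-nullifying `cmov` inside the shift helpers):

```rust
/// rotl(x, n) == (x << n) | (x >> (128 - n)), the identity the removed
/// code implemented with emit_shl_i128, emit_shr_i128, and `or`.
fn rotl128(x: u128, n: u32) -> u128 {
    let n = n & 127;
    if n == 0 {
        x
    } else {
        (x << n) | (x >> (128 - n))
    }
}

fn main() {
    let x = 0x0123_4567_89ab_cdef_0f1e_2d3c_4b5a_6978_u128;
    for n in [0u32, 1, 64, 127, 200] {
        assert_eq!(rotl128(x, n), x.rotate_left(n));
    }
}
```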
@@ -1808,127 +1766,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     _ => SseOpcode::Pand,
                 };
                 ctx.emit(Inst::xmm_rm_r(sse_op, RegMem::from(mask_value), dst));
-            } else if dst_ty == types::I8X16 && op == Opcode::Sshr {
-                // Since the x86 instruction set does not have an 8x16 shift instruction and the approach used for
-                // `ishl` and `ushr` cannot be easily used (the masks do not preserve the sign), we use a different
-                // approach here: separate the low and high lanes, shift them separately, and merge them into the final
-                // result. Visually, this looks like the following, where `src.i8x16 = [s0, s1, ..., s15]:
-                // low.i16x8 = [(s0, s0), (s1, s1), ..., (s7, s7)]
-                // shifted_low.i16x8 = shift each lane of `low`
-                // high.i16x8 = [(s8, s8), (s9, s9), ..., (s15, s15)]
-                // shifted_high.i16x8 = shift each lane of `high`
-                // dst.i8x16 = [s0'', s1'', ..., s15'']
-                let src = put_input_in_reg(ctx, inputs[0]);
-                let shift_by = input_to_reg_mem_imm(ctx, inputs[1]);
-                let shift_by_ty = ctx.input_ty(insn, 1);
-                let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-
-                // In order for PACKSSWB later to only use the high byte of each 16x8 lane, we shift right an extra 8
-                // bits, relying on PSRAW to fill in the upper bits appropriately.
-                let bigger_shift_by = match shift_by {
-                    // When we know the shift amount at compile time, we add the extra shift amount statically.
-                    RegMemImm::Imm { simm32 } => RegMemImm::imm(simm32 + 8),
-                    // Otherwise we add instructions to add the extra shift amount and move the value into an XMM
-                    // register.
-                    RegMemImm::Reg { reg } => {
-                        let bigger_shift_by_gpr = ctx.alloc_tmp(shift_by_ty).only_reg().unwrap();
-                        ctx.emit(Inst::mov_r_r(OperandSize::Size64, reg, bigger_shift_by_gpr));
-
-                        let size = if shift_by_ty == types::I64 {
-                            OperandSize::Size64
-                        } else {
-                            OperandSize::Size32
-                        };
-                        let imm = RegMemImm::imm(8);
-                        ctx.emit(Inst::alu_rmi_r(
-                            size,
-                            AluRmiROpcode::Add,
-                            imm,
-                            bigger_shift_by_gpr,
-                        ));
-
-                        let bigger_shift_by_xmm = ctx.alloc_tmp(dst_ty).only_reg().unwrap();
-                        ctx.emit(Inst::gpr_to_xmm(
-                            SseOpcode::Movd,
-                            RegMem::from(bigger_shift_by_gpr),
-                            OperandSize::Size32,
-                            bigger_shift_by_xmm,
-                        ));
-                        RegMemImm::reg(bigger_shift_by_xmm.to_reg())
-                    }
-                    RegMemImm::Mem { .. } => unimplemented!("load shift amount to XMM register"),
-                };
-
-                // Unpack and shift the lower lanes of `src` into the `dst` register.
-                ctx.emit(Inst::gen_move(dst, src, dst_ty));
-                ctx.emit(Inst::xmm_rm_r(SseOpcode::Punpcklbw, RegMem::from(dst), dst));
-                ctx.emit(Inst::xmm_rmi_reg(
-                    SseOpcode::Psraw,
-                    bigger_shift_by.clone(),
-                    dst,
-                ));
-
-                // Unpack and shift the upper lanes of `src` into a temporary register, `upper_lanes`.
-                let upper_lanes = ctx.alloc_tmp(dst_ty).only_reg().unwrap();
-                ctx.emit(Inst::gen_move(upper_lanes, src, dst_ty));
-                ctx.emit(Inst::xmm_rm_r(
-                    SseOpcode::Punpckhbw,
-                    RegMem::from(upper_lanes),
-                    upper_lanes,
-                ));
-                ctx.emit(Inst::xmm_rmi_reg(
-                    SseOpcode::Psraw,
-                    bigger_shift_by,
-                    upper_lanes,
-                ));
-
-                // Merge the upper and lower shifted lanes into `dst`.
-                ctx.emit(Inst::xmm_rm_r(
-                    SseOpcode::Packsswb,
-                    RegMem::from(upper_lanes),
-                    dst,
-                ));
-            } else if dst_ty == types::I64X2 && op == Opcode::Sshr {
-                // The `sshr.i8x16` CLIF instruction has no single x86 instruction in the older feature sets; newer ones
-                // like AVX512VL + AVX512F include VPSRAQ, a 128-bit instruction that would fit here, but this backend
-                // does not currently have support for EVEX encodings (TODO when EVEX support is available, add an
-                // alternate lowering here). To remedy this, we extract each 64-bit lane to a GPR, shift each using a
-                // scalar instruction, and insert the shifted values back in the `dst` XMM register.
-                let src = put_input_in_reg(ctx, inputs[0]);
-                let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-                ctx.emit(Inst::gen_move(dst, src, dst_ty));
-
-                // Extract the upper and lower lanes into temporary GPRs.
-                let lower_lane = ctx.alloc_tmp(types::I64).only_reg().unwrap();
-                emit_extract_lane(ctx, src, lower_lane, 0, types::I64);
-                let upper_lane = ctx.alloc_tmp(types::I64).only_reg().unwrap();
-                emit_extract_lane(ctx, src, upper_lane, 1, types::I64);
-
-                // Shift each value.
-                let mut shift = |reg: Writable<Reg>| {
-                    let kind = ShiftKind::ShiftRightArithmetic;
-                    if let Some(shift_by) = ctx.get_input_as_source_or_const(insn, 1).constant {
-                        // Mask the shift amount according to Cranelift's semantics.
-                        let shift_by = (shift_by as u8) & (types::I64.bits() as u8 - 1);
-                        ctx.emit(Inst::shift_r(
-                            OperandSize::Size64,
-                            kind,
-                            Some(shift_by),
-                            reg,
-                        ));
-                    } else {
-                        let dynamic_shift_by = put_input_in_reg(ctx, inputs[1]);
-                        let w_rcx = Writable::from_reg(regs::rcx());
-                        ctx.emit(Inst::mov_r_r(OperandSize::Size64, dynamic_shift_by, w_rcx));
-                        ctx.emit(Inst::shift_r(OperandSize::Size64, kind, None, reg));
-                    };
-                };
-                shift(lower_lane);
-                shift(upper_lane);
-
-                // Insert the scalar values back into the `dst` vector.
-                emit_insert_lane(ctx, RegMem::from(lower_lane), dst, 0, types::I64);
-                emit_insert_lane(ctx, RegMem::from(upper_lane), dst, 1, types::I64);
             } else {
                 // For the remaining packed shifts not covered above, x86 has implementations that can either:
                 // - shift using an immediate
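The block deleted above and the new ISLE rules implement the same `i8x16` trick. A scalar model of a single byte shows why shifting by `amt + 8` works: `punpcklbw x, x` duplicates each byte into a 16-bit lane, the extra 8 bits of arithmetic shift move the sign-correct result down into the lane's low byte, and the value then always fits in `i8`, so `packsswb`'s saturation is exact:

```rust
/// Scalar model of one byte lane going through
/// punpcklbw/punpckhbw -> psraw (amt + 8) -> packsswb.
fn sshr_i8_via_i16(s: i8, amt: u32) -> i8 {
    // punpcklbw s, s: the byte is duplicated, so the 16-bit lane has `s`
    // in both its low and high byte (the high byte supplies the sign).
    let lane = (((s as u8 as u16) << 8) | (s as u8 as u16)) as i16;
    // psraw by amt + 8: the wanted, sign-correct result lands in the low
    // byte of the lane (psraw fills from the sign).
    let shifted = lane >> (amt + 8);
    // packsswb: saturating narrow; the value already fits in i8, so the
    // saturation never changes it here.
    shifted.clamp(i8::MIN as i16, i8::MAX as i16) as i8
}

fn main() {
    for s in i8::MIN..=i8::MAX {
        for amt in 0..8u32 {
            assert_eq!(sshr_i8_via_i16(s, amt), s >> amt);
        }
    }
    println!("i8x16 sshr trick matches scalar `>>` for all inputs");
}
```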
@@ -1940,13 +1777,11 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     types::I16X8 => match op {
                         Opcode::Ishl => SseOpcode::Psllw,
                         Opcode::Ushr => SseOpcode::Psrlw,
-                        Opcode::Sshr => SseOpcode::Psraw,
                         _ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
                     },
                     types::I32X4 => match op {
                         Opcode::Ishl => SseOpcode::Pslld,
                         Opcode::Ushr => SseOpcode::Psrld,
-                        Opcode::Sshr => SseOpcode::Psrad,
                         _ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
                     },
                     types::I64X2 => match op {
@@ -69,6 +69,31 @@ where
         OperandSize::from_ty(ty)
     }
 
+    fn put_in_reg_mem_imm(&mut self, val: Value) -> RegMemImm {
+        let inputs = self.lower_ctx.get_value_as_source_or_const(val);
+
+        if let Some(c) = inputs.constant {
+            if let Some(imm) = to_simm32(c as i64) {
+                return imm;
+            }
+
+            // Generate constants fresh at each use to minimize long-range
+            // register pressure.
+            let ty = self.value_type(val);
+            return RegMemImm::reg(generated_code::constructor_imm(self, ty, c).unwrap());
+        }
+
+        if let Some((src_insn, 0)) = inputs.inst {
+            if let Some((addr_input, offset)) = is_mergeable_load(self.lower_ctx, src_insn) {
+                self.lower_ctx.sink_inst(src_insn);
+                let amode = lower_to_amode(self.lower_ctx, addr_input, offset);
+                return RegMemImm::mem(amode);
+            }
+        }
+
+        RegMemImm::reg(self.put_in_reg(val))
+    }
+
     fn put_in_reg_mem(&mut self, val: Value) -> RegMem {
         let inputs = self.lower_ctx.get_value_as_source_or_const(val);
 
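`put_in_reg_mem_imm` folds a constant into the instruction only when `to_simm32` accepts it. I am assuming the usual x64 rule here, that a 64-bit constant is encodable as an imm32 only if it round-trips through sign extension; a sketch of that check (`fits_in_simm32` is my name for it, not the helper's):

```rust
/// Assumed semantics of the `to_simm32` check: a 64-bit constant can be an
/// x64 32-bit immediate only if sign-extending its low 32 bits reproduces
/// the full value.
fn fits_in_simm32(v: i64) -> bool {
    v == (v as i32) as i64
}

fn main() {
    assert!(fits_in_simm32(-1));
    assert!(fits_in_simm32(0x7fff_ffff));
    assert!(!fits_in_simm32(0x8000_0000)); // would sign-extend to a negative
    assert!(!fits_in_simm32(0x1_0000_0000));
}
```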
@@ -90,6 +115,23 @@ where
         RegMem::reg(self.put_in_reg(val))
     }
 
+    fn put_masked_in_imm8_reg(&mut self, val: Value, ty: Type) -> Imm8Reg {
+        let inputs = self.lower_ctx.get_value_as_source_or_const(val);
+
+        if let Some(c) = inputs.constant {
+            let mask = 1_u64
+                .checked_shl(ty.bits() as u32)
+                .map_or(u64::MAX, |x| x - 1);
+            return Imm8Reg::Imm8 {
+                imm: (c & mask) as u8,
+            };
+        }
+
+        Imm8Reg::Reg {
+            reg: self.put_in_regs(val).regs()[0],
+        }
+    }
+
     #[inline]
     fn encode_fcmp_imm(&mut self, imm: &FcmpImm) -> u8 {
         imm.encode()
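Note the register fallback in `put_masked_in_imm8_reg`: for a multi-register (up to `i128`) amount it takes only `regs()[0]`, the low 64 bits. That is sound because every consumer masks the amount to a bit width of at most 128, and reducing mod any power of two up to 128 only ever inspects low bits; a quick Rust check:

```rust
fn main() {
    // A shift amount held in an i128: garbage in the high half, 9 below.
    let amt: u128 = (5u128 << 64) | 9;
    // Masking to any bit width <= 128 depends only on low bits, so the
    // low 64-bit register alone determines the effective amount.
    for bits in [8u32, 16, 32, 64, 128] {
        assert_eq!((amt % bits as u128) as u32, (amt as u64 % bits as u64) as u32);
    }
}
```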
@@ -131,12 +173,12 @@ where
     }
 
     #[inline]
-    fn mask_imm8_const(&mut self, imm8: &Imm8Reg, mask: u64) -> Imm8Reg {
-        match imm8 {
-            &Imm8Reg::Reg { reg } => Imm8Reg::Reg { reg },
-            &Imm8Reg::Imm8 { imm } => Imm8Reg::Imm8 {
-                imm: imm & (mask as u8),
-            },
-        }
-    }
+    fn const_to_type_masked_imm8(&mut self, c: u64, ty: Type) -> Imm8Reg {
+        let mask = 1_u64
+            .checked_shl(ty.bits() as u32)
+            .map_or(u64::MAX, |x| x - 1);
+        Imm8Reg::Imm8 {
+            imm: (c & mask) as u8,
+        }
+    }
 
@@ -1,4 +1,4 @@
 src/clif.isle f176ef3bba99365
-src/prelude.isle babc931e5dc5b4cf
-src/isa/x64/inst.isle bc5fc626492752c8
-src/isa/x64/lower.isle 33e94300f4c08455
+src/prelude.isle d95510fad2e2473c
+src/isa/x64/inst.isle c16462cc359dd466
+src/isa/x64/lower.isle 9f761598e3949e8e
File diff suppressed because it is too large
@@ -95,11 +95,6 @@ macro_rules! isle_prelude_methods {
             ty.bits().try_into().unwrap()
         }
 
-        #[inline]
-        fn ty_bits_mask(&mut self, ty: Type) -> u64 {
-            (1 << (self.ty_bits(ty) as u64)) - 1
-        }
-
         #[inline]
         fn ty_bits_u16(&mut self, ty: Type) -> u16 {
             ty.bits()
@@ -260,6 +255,21 @@ macro_rules! isle_prelude_methods {
                 n => Some(n as u64),
             }
         }
+
+        #[inline]
+        fn u32_add(&mut self, a: u32, b: u32) -> u32 {
+            a.wrapping_add(b)
+        }
+
+        #[inline]
+        fn u8_and(&mut self, a: u8, b: u8) -> u8 {
+            a & b
+        }
+
+        #[inline]
+        fn lane_type(&mut self, ty: Type) -> Type {
+            ty.lane_type()
+        }
     };
 }
 
@@ -38,6 +38,12 @@
 (type ValueList (primitive ValueList))
 (type ValueRegs (primitive ValueRegs))
 
+(decl u32_add (u32 u32) u32)
+(extern constructor u32_add u32_add)
+
+(decl u8_and (u8 u8) u8)
+(extern constructor u8_and u8_and)
+
 ;;;; Registers ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (type Reg (primitive Reg))
@@ -146,6 +152,10 @@
 (decl ty_bits_u16 (Type) u16)
 (extern constructor ty_bits_u16 ty_bits_u16)
 
+;; Get the type of each lane in the given type.
+(decl lane_type (Type) Type)
+(extern constructor lane_type lane_type)
+
 ;;;; Helper Clif Extractors ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 ;; An extractor that only matches types that can fit in 16 bits.
@@ -242,11 +252,6 @@
   (and (result_type ty)
        inst))
 
-;; Return a bitmask that will mask off a count to be within `ty`'s
-;; bit-width. Used for shifts/rotates.
-(decl ty_bits_mask (Type) u64)
-(extern constructor ty_bits_mask ty_bits_mask)
-
 ;; Match a multi-lane type, extracting (# bits per lane, # lanes) from the given
 ;; type. Will only match when there is more than one lane.
 (decl multi_lane (u8 u16) Type)
@@ -256,6 +261,11 @@
 (decl def_inst (Inst) Value)
 (extern extractor def_inst def_inst)
 
+;; Extract a constant `u64` from a value defined by an `iconst`.
+(decl u64_from_iconst (u64) Value)
+(extractor (u64_from_iconst x)
+           (def_inst (iconst (u64_from_imm64 x))))
+
 ;;;; Helpers for Working with Flags ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 ;; Newtype wrapper around `MInst` for instructions that are used for their
@@ -1173,35 +1173,32 @@ block0(v0: i128, v1: i128):
 ; Entry block: 0
 ; Block 0:
 ; (original IR block: block0)
-; (instruction range: 0 .. 28)
+; (instruction range: 0 .. 25)
 ; Inst 0: pushq %rbp
 ; Inst 1: movq %rsp, %rbp
-; Inst 2: movq %rdi, %r8
-; Inst 3: movq %rsi, %rdi
-; Inst 4: movq %rdi, %rsi
-; Inst 5: movq %rdx, %rcx
-; Inst 6: sarq %cl, %rsi
-; Inst 7: movq %rdx, %rcx
-; Inst 8: shrq %cl, %r8
-; Inst 9: movl $64, %ecx
-; Inst 10: subq %rdx, %rcx
-; Inst 11: movq %rdi, %rax
-; Inst 12: shlq %cl, %rax
-; Inst 13: xorq %rcx, %rcx
-; Inst 14: testq $127, %rdx
-; Inst 15: cmovzq %rcx, %rax
-; Inst 16: orq %r8, %rax
-; Inst 17: sarq $63, %rdi
-; Inst 18: xorq %rcx, %rcx
-; Inst 19: andq $64, %rdx
-; Inst 20: cmovzq %rsi, %rdi
-; Inst 21: cmovzq %rax, %rcx
-; Inst 22: cmovnzq %rsi, %rcx
-; Inst 23: movq %rcx, %rax
-; Inst 24: movq %rdi, %rdx
-; Inst 25: movq %rbp, %rsp
-; Inst 26: popq %rbp
-; Inst 27: ret
+; Inst 2: movq %rdi, %rax
+; Inst 3: movq %rdx, %rcx
+; Inst 4: shrq %cl, %rax
+; Inst 5: movq %rsi, %rdi
+; Inst 6: movq %rdx, %rcx
+; Inst 7: sarq %cl, %rdi
+; Inst 8: movl $64, %ecx
+; Inst 9: subq %rdx, %rcx
+; Inst 10: movq %rsi, %r8
+; Inst 11: shlq %cl, %r8
+; Inst 12: xorq %rcx, %rcx
+; Inst 13: testq $127, %rdx
+; Inst 14: cmovzq %rcx, %r8
+; Inst 15: orq %r8, %rax
+; Inst 16: sarq $63, %rsi
+; Inst 17: testq $64, %rdx
+; Inst 18: cmovzq %rdi, %rsi
+; Inst 19: cmovzq %rax, %rdi
+; Inst 20: movq %rdi, %rax
+; Inst 21: movq %rsi, %rdx
+; Inst 22: movq %rbp, %rsp
+; Inst 23: popq %rbp
+; Inst 24: ret
 ; }}
 
 function %f33(i128, i128) -> i128 {
@@ -324,16 +324,16 @@ block0(v0: i32):
 ; (instruction range: 0 .. 15)
 ; Inst 0: pushq %rbp
 ; Inst 1: movq %rsp, %rbp
-; Inst 2: load_const VCodeConstant(0), %xmm0
-; Inst 3: addl $8, %edi
-; Inst 4: movd %edi, %xmm2
-; Inst 5: movdqa %xmm0, %xmm1
-; Inst 6: punpcklbw %xmm1, %xmm1
-; Inst 7: psraw %xmm2, %xmm1
-; Inst 8: punpckhbw %xmm0, %xmm0
+; Inst 2: load_const VCodeConstant(0), %xmm2
+; Inst 3: movdqa %xmm2, %xmm0
+; Inst 4: punpcklbw %xmm2, %xmm0
+; Inst 5: movdqa %xmm2, %xmm1
+; Inst 6: punpckhbw %xmm2, %xmm1
+; Inst 7: addl $8, %edi
+; Inst 8: movd %edi, %xmm2
 ; Inst 9: psraw %xmm2, %xmm0
-; Inst 10: packsswb %xmm0, %xmm1
-; Inst 11: movdqa %xmm1, %xmm0
+; Inst 10: psraw %xmm2, %xmm1
+; Inst 11: packsswb %xmm1, %xmm0
 ; Inst 12: movq %rbp, %rsp
 ; Inst 13: popq %rbp
 ; Inst 14: ret
@@ -349,19 +349,20 @@ block0(v0: i8x16, v1: i32):
 ; Entry block: 0
 ; Block 0:
 ; (original IR block: block0)
-; (instruction range: 0 .. 12)
+; (instruction range: 0 .. 13)
 ; Inst 0: pushq %rbp
 ; Inst 1: movq %rsp, %rbp
 ; Inst 2: movdqa %xmm0, %xmm1
-; Inst 3: movdqa %xmm1, %xmm0
-; Inst 4: punpcklbw %xmm0, %xmm0
-; Inst 5: psraw $11, %xmm0
-; Inst 6: punpckhbw %xmm1, %xmm1
-; Inst 7: psraw $11, %xmm1
-; Inst 8: packsswb %xmm1, %xmm0
-; Inst 9: movq %rbp, %rsp
-; Inst 10: popq %rbp
-; Inst 11: ret
+; Inst 3: punpcklbw %xmm0, %xmm1
+; Inst 4: movdqa %xmm0, %xmm2
+; Inst 5: punpckhbw %xmm0, %xmm2
+; Inst 6: psraw $11, %xmm1
+; Inst 7: psraw $11, %xmm2
+; Inst 8: packsswb %xmm2, %xmm1
+; Inst 9: movdqa %xmm1, %xmm0
+; Inst 10: movq %rbp, %rsp
+; Inst 11: popq %rbp
+; Inst 12: ret
 ; }}
 
 function %sshr_i64x2(i64x2, i32) -> i64x2 {
@@ -374,21 +375,20 @@ block0(v0: i64x2, v1: i32):
 ; Entry block: 0
 ; Block 0:
 ; (original IR block: block0)
-; (instruction range: 0 .. 15)
+; (instruction range: 0 .. 14)
 ; Inst 0: pushq %rbp
 ; Inst 1: movq %rsp, %rbp
-; Inst 2: movdqa %xmm0, %xmm1
-; Inst 3: pextrd.w $0, %xmm0, %rsi
-; Inst 4: pextrd.w $1, %xmm0, %rax
-; Inst 5: movq %rdi, %rcx
-; Inst 6: sarq %cl, %rsi
-; Inst 7: movq %rdi, %rcx
-; Inst 8: sarq %cl, %rax
-; Inst 9: pinsrd.w $0, %rsi, %xmm1
-; Inst 10: pinsrd.w $1, %rax, %xmm1
-; Inst 11: movdqa %xmm1, %xmm0
-; Inst 12: movq %rbp, %rsp
-; Inst 13: popq %rbp
-; Inst 14: ret
+; Inst 2: pextrd.w $0, %xmm0, %rsi
+; Inst 3: pextrd.w $1, %xmm0, %rax
+; Inst 4: movq %rdi, %rcx
+; Inst 5: sarq %cl, %rsi
+; Inst 6: movq %rdi, %rcx
+; Inst 7: sarq %cl, %rax
+; Inst 8: uninit %xmm0
+; Inst 9: pinsrd.w $0, %rsi, %xmm0
+; Inst 10: pinsrd.w $1, %rax, %xmm0
+; Inst 11: movq %rbp, %rsp
+; Inst 12: popq %rbp
+; Inst 13: ret
 ; }}
 
@@ -13,7 +13,7 @@ use std::path::{Path, PathBuf};
 use std::time;
 
 /// Timeout in seconds when we're not making progress.
-const TIMEOUT_PANIC: usize = 10;
+const TIMEOUT_PANIC: usize = 60;
 
 /// Timeout for reporting slow tests without panicking.
 const TIMEOUT_SLOW: usize = 3;