From bd93e69eb4bbd5d75c41a7cf33630cfa35f1f462 Mon Sep 17 00:00:00 2001
From: Andrew Brown
Date: Wed, 11 Nov 2020 15:45:32 -0800
Subject: [PATCH] [machinst x64]: implement packed shifts

---
 cranelift/codegen/src/isa/x64/lower.rs        | 446 +++++++++++++++---
 .../isa/x64/simd-bitwise-compile.clif         |  75 +++
 .../filetests/isa/x64/simd-bitwise-run.clif   | 108 +++++
 3 files changed, 559 insertions(+), 70 deletions(-)

diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs
index f2681c8bb7..8f158a8ea3 100644
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -258,6 +258,58 @@ fn emit_insert_lane<C: LowerCtx<I = Inst>>(
     }
 }
 
+/// Emit an instruction to extract a lane of `src` into `dst`.
+fn emit_extract_lane<C: LowerCtx<I = Inst>>(
+    ctx: &mut C,
+    src: Reg,
+    dst: Writable<Reg>,
+    lane: u8,
+    ty: Type,
+) {
+    if !ty.is_float() {
+        let (sse_op, is64) = match ty.lane_bits() {
+            8 => (SseOpcode::Pextrb, false),
+            16 => (SseOpcode::Pextrw, false),
+            32 => (SseOpcode::Pextrd, false),
+            64 => (SseOpcode::Pextrd, true),
+            _ => panic!("Unable to extractlane for lane size: {}", ty.lane_bits()),
+        };
+        let src = RegMem::reg(src);
+        ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, is64));
+    } else if ty == types::F32 || ty == types::F64 {
+        if lane == 0 {
+            // Remove the extractlane instruction, leaving the float where it is. The upper
+            // bits will remain unchanged; for correctness, this relies on Cranelift type
+            // checking to avoid using those bits.
+            ctx.emit(Inst::gen_move(dst, src, ty));
+        } else {
+            // Otherwise, shuffle the bits in `lane` to the lowest lane.
+            let sse_op = SseOpcode::Pshufd;
+            let mask = match ty {
+                // Move the value at `lane` to lane 0, copying existing value at lane 0 to
+                // other lanes. Again, this relies on Cranelift type checking to avoid
+                // using those bits.
+                types::F32 => {
+                    assert!(lane > 0 && lane < 4);
+                    0b00_00_00_00 | lane
+                }
+                // Move the value at `lane` 1 (we know it must be 1 because of the `if`
+                // statement above) to lane 0 and leave lane 1 unchanged. The Cranelift type
+                // checking assumption also applies here.
+                types::F64 => {
+                    assert!(lane == 1);
+                    0b11_10_11_10
+                }
+                _ => unreachable!(),
+            };
+            let src = RegMem::reg(src);
+            ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, mask, false));
+        }
+    } else {
+        panic!("unable to emit extractlane for type: {}", ty)
+    }
+}
+
 /// Emits an int comparison instruction.
 ///
 /// Note: make sure that there are no instructions modifying the flags between a call to this
@@ -930,51 +982,338 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             let dst_ty = ctx.output_ty(insn, 0);
             debug_assert_eq!(ctx.input_ty(insn, 0), dst_ty);
 
-            let (size, lhs) = match dst_ty {
-                types::I8 | types::I16 => match op {
-                    Opcode::Ishl => (4, put_input_in_reg(ctx, inputs[0])),
-                    Opcode::Ushr => (
-                        4,
-                        extend_input_to_reg(ctx, inputs[0], ExtSpec::ZeroExtendTo32),
-                    ),
-                    Opcode::Sshr => (
-                        4,
-                        extend_input_to_reg(ctx, inputs[0], ExtSpec::SignExtendTo32),
-                    ),
-                    Opcode::Rotl | Opcode::Rotr => {
+            if !dst_ty.is_vector() {
+                // Scalar shifts on x86 have various encodings:
+                // - shift by one bit, e.g. `SAL r/m8, 1` (not used here)
+                // - shift by an immediate amount, e.g. `SAL r/m8, imm8`
+                // - shift by a dynamic amount but only from the CL register, e.g. `SAL r/m8, CL`.
+                // This implementation uses the last two encoding methods.
+ let (size, lhs) = match dst_ty { + types::I8 | types::I16 => match op { + Opcode::Ishl => (4, put_input_in_reg(ctx, inputs[0])), + Opcode::Ushr => ( + 4, + extend_input_to_reg(ctx, inputs[0], ExtSpec::ZeroExtendTo32), + ), + Opcode::Sshr => ( + 4, + extend_input_to_reg(ctx, inputs[0], ExtSpec::SignExtendTo32), + ), + Opcode::Rotl | Opcode::Rotr => { + (dst_ty.bytes() as u8, put_input_in_reg(ctx, inputs[0])) + } + _ => unreachable!(), + }, + types::I32 | types::I64 => { (dst_ty.bytes() as u8, put_input_in_reg(ctx, inputs[0])) } + _ => unreachable!("unhandled output type for shift/rotates: {}", dst_ty), + }; + + let (count, rhs) = if let Some(cst) = ctx.get_input(insn, 1).constant { + // Mask count, according to Cranelift's semantics. + let cst = (cst as u8) & (dst_ty.bits() as u8 - 1); + (Some(cst), None) + } else { + (None, Some(put_input_in_reg(ctx, inputs[1]))) + }; + + let dst = get_output_reg(ctx, outputs[0]); + + let shift_kind = match op { + Opcode::Ishl => ShiftKind::ShiftLeft, + Opcode::Ushr => ShiftKind::ShiftRightLogical, + Opcode::Sshr => ShiftKind::ShiftRightArithmetic, + Opcode::Rotl => ShiftKind::RotateLeft, + Opcode::Rotr => ShiftKind::RotateRight, _ => unreachable!(), - }, - types::I32 | types::I64 => (dst_ty.bytes() as u8, put_input_in_reg(ctx, inputs[0])), - _ => unreachable!("unhandled output type for shift/rotates: {}", dst_ty), - }; + }; - let (count, rhs) = if let Some(cst) = ctx.get_input(insn, 1).constant { - // Mask count, according to Cranelift's semantics. - let cst = (cst as u8) & (dst_ty.bits() as u8 - 1); - (Some(cst), None) + let w_rcx = Writable::from_reg(regs::rcx()); + ctx.emit(Inst::mov_r_r(true, lhs, dst)); + if count.is_none() { + ctx.emit(Inst::mov_r_r(true, rhs.unwrap(), w_rcx)); + } + ctx.emit(Inst::shift_r(size, shift_kind, count, dst)); + } else if dst_ty == types::I8X16 && (op == Opcode::Ishl || op == Opcode::Ushr) { + // Since the x86 instruction set does not have any 8x16 shift instructions (even in higher feature sets + // like AVX), we lower the `ishl.i8x16` and `ushr.i8x16` to a sequence of instructions. The basic idea, + // whether the `shift_by` amount is an immediate or not, is to use a 16x8 shift and then mask off the + // incorrect bits to 0s (see below for handling signs in `sshr.i8x16`). + let src = put_input_in_reg(ctx, inputs[0]); + let shift_by = input_to_reg_mem_imm(ctx, inputs[1]); + let dst = get_output_reg(ctx, outputs[0]); + + // If necessary, move the shift index into the lowest bits of a vector register. + let shift_by_moved = match &shift_by { + RegMemImm::Imm { .. } => shift_by.clone(), + RegMemImm::Reg { reg } => { + let tmp_shift_by = ctx.alloc_tmp(RegClass::V128, dst_ty); + ctx.emit(Inst::gpr_to_xmm( + SseOpcode::Movd, + RegMem::reg(*reg), + OperandSize::Size32, + tmp_shift_by, + )); + RegMemImm::reg(tmp_shift_by.to_reg()) + } + RegMemImm::Mem { .. } => unimplemented!("load shift amount to XMM register"), + }; + + // Shift `src` using 16x8. Unfortunately, a 16x8 shift will only be correct for half of the lanes; + // the others must be fixed up with the mask below. + let shift_opcode = match op { + Opcode::Ishl => SseOpcode::Psllw, + Opcode::Ushr => SseOpcode::Psrlw, + _ => unimplemented!("{} is not implemented for type {}", op, dst_ty), + }; + ctx.emit(Inst::gen_move(dst, src, dst_ty)); + ctx.emit(Inst::xmm_rmi_reg(shift_opcode, shift_by_moved, dst)); + + // Choose which mask to use to fixup the shifted lanes. 
Since we must use a 16x8 shift, we need to fix + // up the bits that migrate from one half of the lane to the other. Each 16-byte mask (which rustfmt + // forces to multiple lines) is indexed by the shift amount: e.g. if we shift right by 0 (no movement), + // we want to retain all the bits so we mask with `0xff`; if we shift right by 1, we want to retain all + // bits except the MSB so we mask with `0x7f`; etc. + const USHR_MASKS: [u8; 128] = [ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, + 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x1f, 0x1f, 0x1f, 0x1f, + 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x0f, + 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, + 0x0f, 0x0f, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, + 0x07, 0x07, 0x07, 0x07, 0x07, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, + 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + ]; + const SHL_MASKS: [u8; 128] = [ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, + 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xf8, 0xf8, 0xf8, 0xf8, + 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf0, + 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, + 0xf0, 0xf0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, + 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, + 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + ]; + let mask = match op { + Opcode::Ishl => &SHL_MASKS, + Opcode::Ushr => &USHR_MASKS, + _ => unimplemented!("{} is not implemented for type {}", op, dst_ty), + }; + + // Figure out the address of the shift mask. + let mask_address = match shift_by { + RegMemImm::Imm { simm32 } => { + // When the shift amount is known, we can statically (i.e. at compile time) determine the mask to + // use and only emit that. + debug_assert!(simm32 < 8); + let mask_offset = simm32 as usize * 16; + let mask_constant = ctx.use_constant(VCodeConstantData::WellKnown( + &mask[mask_offset..mask_offset + 16], + )); + SyntheticAmode::ConstantOffset(mask_constant) + } + RegMemImm::Reg { reg } => { + // Otherwise, we must emit the entire mask table and dynamically (i.e. at run time) find the correct + // mask offset in the table. 
We do this using LEA to find the base address of the mask table and then
+                        // complex addressing to offset to the right mask: `base_address + shift_by * 16` (each mask is 16 bytes).
+                        let base_mask_address = ctx.alloc_tmp(RegClass::I64, types::I64);
+                        let mask_offset = ctx.alloc_tmp(RegClass::I64, types::I64);
+                        let mask_constant = ctx.use_constant(VCodeConstantData::WellKnown(mask));
+                        ctx.emit(Inst::lea(
+                            SyntheticAmode::ConstantOffset(mask_constant),
+                            base_mask_address,
+                        ));
+                        ctx.emit(Inst::gen_move(mask_offset, reg, types::I64));
+                        ctx.emit(Inst::shift_r(8, ShiftKind::ShiftLeft, Some(4), mask_offset));
+                        Amode::imm_reg_reg_shift(
+                            0,
+                            base_mask_address.to_reg(),
+                            mask_offset.to_reg(),
+                            0,
+                        )
+                        .into()
+                    }
+                    RegMemImm::Mem { addr: _ } => unimplemented!("load mask address"),
+                };
+
+                // Load the mask into a temporary register, `mask_value`.
+                let mask_value = ctx.alloc_tmp(RegClass::V128, dst_ty);
+                ctx.emit(Inst::load(dst_ty, mask_address, mask_value, ExtKind::None));
+
+                // Remove the bits that would have disappeared in a true 8x16 shift. TODO in the future,
+                // this AND instruction could be coalesced with the load above.
+                let sse_op = match dst_ty {
+                    types::F32X4 => SseOpcode::Andps,
+                    types::F64X2 => SseOpcode::Andpd,
+                    _ => SseOpcode::Pand,
+                };
+                ctx.emit(Inst::xmm_rm_r(sse_op, RegMem::from(mask_value), dst));
+            } else if dst_ty == types::I8X16 && op == Opcode::Sshr {
+                // Since the x86 instruction set does not have an 8x16 shift instruction and the approach used for
+                // `ishl` and `ushr` cannot be easily used (the masks do not preserve the sign), we use a different
+                // approach here: separate the low and high lanes, shift them separately, and merge them into the final
+                // result. Visually, this looks like the following, where `src.i8x16 = [s0, s1, ..., s15]`:
+                //   low.i16x8 = [(s0, s0), (s1, s1), ..., (s7, s7)]
+                //   shifted_low.i16x8 = shift each lane of `low`
+                //   high.i16x8 = [(s8, s8), (s9, s9), ..., (s15, s15)]
+                //   shifted_high.i16x8 = shift each lane of `high`
+                //   dst.i8x16 = [s0'', s1'', ..., s15'']
+                let src = put_input_in_reg(ctx, inputs[0]);
+                let shift_by = input_to_reg_mem_imm(ctx, inputs[1]);
+                let shift_by_ty = ctx.input_ty(insn, 1);
+                let dst = get_output_reg(ctx, outputs[0]);
+
+                // In order for PACKSSWB later to only use the high byte of each 16x8 lane, we shift right an extra 8
+                // bits, relying on PSRAW to fill in the upper bits appropriately.
+                let bigger_shift_by = match shift_by {
+                    // When we know the shift amount at compile time, we add the extra shift amount statically.
+                    RegMemImm::Imm { simm32 } => RegMemImm::imm(simm32 + 8),
+                    // Otherwise we add instructions to add the extra shift amount and move the value into an XMM
+                    // register.
+                    RegMemImm::Reg { reg } => {
+                        let bigger_shift_by_gpr = ctx.alloc_tmp(RegClass::I64, shift_by_ty);
+                        ctx.emit(Inst::mov_r_r(true, reg, bigger_shift_by_gpr));
+
+                        let is_64 = shift_by_ty == types::I64;
+                        let imm = RegMemImm::imm(8);
+                        ctx.emit(Inst::alu_rmi_r(
+                            is_64,
+                            AluRmiROpcode::Add,
+                            imm,
+                            bigger_shift_by_gpr,
+                        ));
+
+                        let bigger_shift_by_xmm = ctx.alloc_tmp(RegClass::V128, dst_ty);
+                        ctx.emit(Inst::gpr_to_xmm(
+                            SseOpcode::Movd,
+                            RegMem::from(bigger_shift_by_gpr),
+                            OperandSize::Size32,
+                            bigger_shift_by_xmm,
+                        ));
+                        RegMemImm::reg(bigger_shift_by_xmm.to_reg())
+                    }
+                    RegMemImm::Mem { .. } => unimplemented!("load shift amount to XMM register"),
+                };
+
+                // Unpack and shift the lower lanes of `src` into the `dst` register.
+                ctx.emit(Inst::gen_move(dst, src, dst_ty));
+                ctx.emit(Inst::xmm_rm_r(SseOpcode::Punpcklbw, RegMem::from(dst), dst));
+                ctx.emit(Inst::xmm_rmi_reg(
+                    SseOpcode::Psraw,
+                    bigger_shift_by.clone(),
+                    dst,
+                ));
+
+                // Unpack and shift the upper lanes of `src` into a temporary register, `upper_lanes`.
+                let upper_lanes = ctx.alloc_tmp(RegClass::V128, dst_ty);
+                ctx.emit(Inst::gen_move(upper_lanes, src, dst_ty));
+                ctx.emit(Inst::xmm_rm_r(
+                    SseOpcode::Punpckhbw,
+                    RegMem::from(upper_lanes),
+                    upper_lanes,
+                ));
+                ctx.emit(Inst::xmm_rmi_reg(
+                    SseOpcode::Psraw,
+                    bigger_shift_by,
+                    upper_lanes,
+                ));
+
+                // Merge the upper and lower shifted lanes into `dst`.
+                ctx.emit(Inst::xmm_rm_r(
+                    SseOpcode::Packsswb,
+                    RegMem::from(upper_lanes),
+                    dst,
+                ));
+            } else if dst_ty == types::I64X2 && op == Opcode::Sshr {
+                // The `sshr.i64x2` CLIF instruction has no single x86 instruction in the older feature sets; newer ones
+                // like AVX512VL and AVX512F include VPSRAQ, a 128-bit instruction that would fit here, but this backend
+                // does not currently have support for EVEX encodings (TODO when EVEX support is available, add an
+                // alternate lowering here). To remedy this, we extract each 64-bit lane to a GPR, shift each using a
+                // scalar instruction, and insert the shifted values back in the `dst` XMM register.
+                let src = put_input_in_reg(ctx, inputs[0]);
+                let dst = get_output_reg(ctx, outputs[0]);
+                ctx.emit(Inst::gen_move(dst, src, dst_ty));
+
+                // Extract the upper and lower lanes into temporary GPRs.
+                let lower_lane = ctx.alloc_tmp(RegClass::I64, types::I64);
+                emit_extract_lane(ctx, src, lower_lane, 0, types::I64);
+                let upper_lane = ctx.alloc_tmp(RegClass::I64, types::I64);
+                emit_extract_lane(ctx, src, upper_lane, 1, types::I64);
+
+                // Shift each value.
+                let mut shift = |reg: Writable<Reg>| {
+                    let kind = ShiftKind::ShiftRightArithmetic;
+                    if let Some(shift_by) = ctx.get_input(insn, 1).constant {
+                        // Mask the shift amount according to Cranelift's semantics.
+                        let shift_by = (shift_by as u8) & (types::I64.bits() as u8 - 1);
+                        ctx.emit(Inst::shift_r(8, kind, Some(shift_by), reg));
+                    } else {
+                        let dynamic_shift_by = put_input_in_reg(ctx, inputs[1]);
+                        let w_rcx = Writable::from_reg(regs::rcx());
+                        ctx.emit(Inst::mov_r_r(true, dynamic_shift_by, w_rcx));
+                        ctx.emit(Inst::shift_r(8, kind, None, reg));
+                    };
+                };
+                shift(lower_lane);
+                shift(upper_lane);
+
+                // Insert the scalar values back into the `dst` vector.
+                emit_insert_lane(ctx, RegMem::from(lower_lane), dst, 0, types::I64);
+                emit_insert_lane(ctx, RegMem::from(upper_lane), dst, 1, types::I64);
             } else {
-                (None, Some(put_input_in_reg(ctx, inputs[1])))
-            };
+                // For the remaining packed shifts not covered above, x86 has implementations that can either:
+                // - shift using an immediate
+                // - shift using a dynamic value given in the lower bits of another XMM register.
+                let src = put_input_in_reg(ctx, inputs[0]);
+                let shift_by = input_to_reg_mem_imm(ctx, inputs[1]);
+                let dst = get_output_reg(ctx, outputs[0]);
+                let sse_op = match dst_ty {
+                    types::I16X8 => match op {
+                        Opcode::Ishl => SseOpcode::Psllw,
+                        Opcode::Ushr => SseOpcode::Psrlw,
+                        Opcode::Sshr => SseOpcode::Psraw,
+                        _ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
+                    },
+                    types::I32X4 => match op {
+                        Opcode::Ishl => SseOpcode::Pslld,
+                        Opcode::Ushr => SseOpcode::Psrld,
+                        Opcode::Sshr => SseOpcode::Psrad,
+                        _ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
+                    },
+                    types::I64X2 => match op {
+                        Opcode::Ishl => SseOpcode::Psllq,
+                        Opcode::Ushr => SseOpcode::Psrlq,
+                        _ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
+                    },
+                    _ => unreachable!(),
+                };
 
-            let dst = get_output_reg(ctx, outputs[0]);
+                // If necessary, move the shift index into the lowest bits of a vector register.
+                let shift_by = match shift_by {
+                    RegMemImm::Imm { .. } => shift_by,
+                    RegMemImm::Reg { reg } => {
+                        let tmp_shift_by = ctx.alloc_tmp(RegClass::V128, dst_ty);
+                        ctx.emit(Inst::gpr_to_xmm(
+                            SseOpcode::Movd,
+                            RegMem::reg(reg),
+                            OperandSize::Size32,
+                            tmp_shift_by,
+                        ));
+                        RegMemImm::reg(tmp_shift_by.to_reg())
+                    }
+                    RegMemImm::Mem { .. } => unimplemented!("load shift amount to XMM register"),
+                };
 
-            let shift_kind = match op {
-                Opcode::Ishl => ShiftKind::ShiftLeft,
-                Opcode::Ushr => ShiftKind::ShiftRightLogical,
-                Opcode::Sshr => ShiftKind::ShiftRightArithmetic,
-                Opcode::Rotl => ShiftKind::RotateLeft,
-                Opcode::Rotr => ShiftKind::RotateRight,
-                _ => unreachable!(),
-            };
+                // Move the `src` to the same register as `dst`.
+                ctx.emit(Inst::gen_move(dst, src, dst_ty));
 
-            let w_rcx = Writable::from_reg(regs::rcx());
-            ctx.emit(Inst::mov_r_r(true, lhs, dst));
-            if count.is_none() {
-                ctx.emit(Inst::mov_r_r(true, rhs.unwrap(), w_rcx));
+                ctx.emit(Inst::xmm_rmi_reg(sse_op, shift_by, dst));
             }
-            ctx.emit(Inst::shift_r(size, shift_kind, count, dst));
         }
 
         Opcode::Ineg => {
@@ -3329,40 +3668,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         };
         debug_assert!(lane < src_ty.lane_count() as u8);
 
-            if !ty.is_float() {
-                let (sse_op, w_bit) = match ty.lane_bits() {
-                    8 => (SseOpcode::Pextrb, false),
-                    16 => (SseOpcode::Pextrw, false),
-                    32 => (SseOpcode::Pextrd, false),
-                    64 => (SseOpcode::Pextrd, true),
-                    _ => panic!("Unable to extractlane for lane size: {}", ty.lane_bits()),
-                };
-                let src = RegMem::reg(src);
-                ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, w_bit));
-            } else {
-                if lane == 0 {
-                    // Remove the extractlane instruction, leaving the float where it is. The upper
-                    // bits will remain unchanged; for correctness, this relies on Cranelift type
-                    // checking to avoid using those bits.
-                    ctx.emit(Inst::gen_move(dst, src, ty));
-                } else {
-                    // Otherwise, shuffle the bits in `lane` to the lowest lane.
-                    let sse_op = SseOpcode::Pshufd;
-                    let mask = match src_ty {
-                        // Move the value at `lane` to lane 0, copying existing value at lane 0 to
-                        // other lanes. Again, this relies on Cranelift type checking to avoid
-                        // using those bits.
-                        types::F32X4 => 0b00_00_00_00 | lane,
-                        // Move the value at `lane` 1 (we know it must be 1 because of the `if`
-                        // statement above) to lane 0 and leave lane 1 unchanged. The Cranelift type
-                        // checking assumption also applies here.
-                        types::F64X2 => 0b11_10_11_10,
-                        _ => unreachable!(),
-                    };
-                    let src = RegMem::reg(src);
-                    ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, mask, false));
-                }
-            }
+            emit_extract_lane(ctx, src, dst, lane, ty);
         }
 
         Opcode::Splat | Opcode::LoadSplat => {
diff --git a/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif b/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif
index fe94b84548..fe52e3c503 100644
--- a/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif
+++ b/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif
@@ -15,3 +15,78 @@ block0:
 ; nextln: pandn %xmm2, %xmm0
 ; nextln: por %xmm1, %xmm0
 ; not: movdqa
+
+
+
+; 8x16 shifts: these lower to complex sequences of instructions
+
+function %ishl_i8x16(i32) -> i8x16 {
+block0(v0: i32):
+    v1 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
+    v2 = ishl v1, v0
+    return v2
+}
+; check: movd %edi, %xmm1
+; nextln: psllw %xmm1, %xmm0
+; nextln: lea const(VCodeConstant(0)), %r12
+; nextln: shlq $$4, %rdi
+; nextln: movdqu 0(%r12,%rdi,1), %xmm1
+; nextln: pand %xmm1, %xmm0
+
+function %ushr_i8x16_imm() -> i8x16 {
+block0:
+    v0 = iconst.i32 1
+    v1 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
+    v2 = ushr v1, v0
+    return v2
+}
+; check: load_const VCodeConstant(1), %xmm0
+; nextln: psrlw $$1, %xmm0
+; nextln: movdqu const(VCodeConstant(0)), %xmm1
+; nextln: pand %xmm1, %xmm0
+
+function %sshr_i8x16(i32) -> i8x16 {
+block0(v0: i32):
+    v1 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
+    v2 = sshr v1, v0
+    return v2
+}
+; check: addl $$8, %edi
+; nextln: movd %edi, %xmm2
+; nextln: movdqa %xmm0, %xmm1
+; nextln: punpcklbw %xmm1, %xmm1
+; nextln: psraw %xmm2, %xmm1
+; nextln: punpckhbw %xmm0, %xmm0
+; nextln: psraw %xmm2, %xmm0
+
+function %sshr_i8x16_imm(i8x16, i32) -> i8x16 {
+block0(v0: i8x16, v1: i32):
+    v2 = sshr_imm v0, 3
+    return v2
+}
+; check: movdqa %xmm0, %xmm1
+; nextln: movdqa %xmm1, %xmm0
+; nextln: punpcklbw %xmm0, %xmm0
+; nextln: psraw $$11, %xmm0
+; nextln: punpckhbw %xmm1, %xmm1
+; nextln: psraw $$11, %xmm1
+; nextln: packsswb %xmm1, %xmm0
+
+
+
+; i64x2 arithmetic shifts: x86 does not have an instruction for this
+
+function %sshr_i64x2(i64x2, i32) -> i64x2 {
+block0(v0: i64x2, v1: i32):
+    v2 = sshr v0, v1
+    return v2
+}
+; check: pextrd.w $$0, %xmm0, %r12
+; nextln: pextrd.w $$1, %xmm0, %r13
+; nextln: movq %rdi, %rcx
+; nextln: sarq %cl, %r12
+; nextln: movq %rdi, %rcx
+; nextln: sarq %cl, %r13
+; nextln: pinsrd.w $$0, %r12, %xmm1
+; nextln: pinsrd.w $$1, %r13, %xmm1
+; nextln: movdqa %xmm1, %xmm0
diff --git a/cranelift/filetests/filetests/isa/x64/simd-bitwise-run.clif b/cranelift/filetests/filetests/isa/x64/simd-bitwise-run.clif
index 3f1c814a2c..8ab624d6c2 100644
--- a/cranelift/filetests/filetests/isa/x64/simd-bitwise-run.clif
+++ b/cranelift/filetests/filetests/isa/x64/simd-bitwise-run.clif
@@ -10,3 +10,111 @@ block0(v0: i8x16, v1: i8x16, v2: i8x16):
 }
 ; Remember that bitselect accepts: 1) the selector vector, 2) the "if true" vector, and 3) the "if false" vector.
 ; run: %bitselect_i8x16([0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 255], [127 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42], [42 0 0 0 0 0 0 0 0 0 0 0 0 0 0 127]) == [42 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42]
+
+
+
+; shift left
+
+function %ishl_i8x16(i8x16, i32) -> i8x16 {
+block0(v0: i8x16, v1: i32):
+    v2 = ishl v0, v1
+    return v2
+}
+; run: %ishl_i8x16([0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15], 4) == [0x00 0x10 0x20 0x30 0x40 0x50 0x60 0x70 0x80 0x90 0xa0 0xb0 0xc0 0xd0 0xe0 0xf0]
+
+function %ishl_i16x8(i16x8, i32) -> i16x8 {
+block0(v0: i16x8, v1: i32):
+    v2 = ishl v0, v1
+    return v2
+}
+; run: %ishl_i16x8([1 2 4 8 16 32 64 128], 17) == [0 0 0 0 0 0 0 0]
+
+function %ishl_i32x4(i32x4, i32) -> i32x4 {
+block0(v0: i32x4, v1: i32):
+    v2 = ishl v0, v1
+    return v2
+}
+; run: %ishl_i32x4([1 2 4 8], 1) == [2 4 8 16]
+
+function %ishl_imm_i64x2(i64x2) -> i64x2 {
+block0(v0: i64x2):
+    v2 = ishl_imm v0, 1
+    return v2
+}
+; run: %ishl_imm_i64x2([1 0]) == [2 0]
+
+
+
+; shift right (logical)
+
+function %ushr_i8x16(i8x16, i32) -> i8x16 {
+block0(v0: i8x16, v1: i32):
+    v2 = ushr v0, v1
+    return v2
+}
+; run: %ushr_i8x16([0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15], 1) == [0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7]
+
+function %ushr_i32x4(i32x4, i32) -> i32x4 {
+block0(v0: i32x4, v1: i32):
+    v2 = ushr v0, v1
+    return v2
+}
+; run: %ushr_i32x4([1 2 4 8], 33) == [0 0 0 0]
+
+function %ushr_i64x2(i64x2, i32) -> i64x2 {
+block0(v0: i64x2, v1: i32):
+    v2 = ushr v0, v1
+    return v2
+}
+; run: %ushr_i64x2([1 2], 1) == [0 1]
+
+
+
+; shift right (arithmetic)
+
+function %sshr_i8x16(i8x16, i32) -> i8x16 {
+block0(v0: i8x16, v1: i32):
+    v2 = sshr v0, v1
+    return v2
+}
+; run: %sshr_i8x16([0 0xff 2 0xfd 4 0xfb 6 0xf9 8 0xf7 10 0xf5 12 0xf3 14 0xf1], 1) == [0 0xff 1 0xfe 2 0xfd 3 0xfc 4 0xfb 5 0xfa 6 0xf9 7 0xf8]
+
+function %sshr_i16x8(i16x8, i32) -> i16x8 {
+block0(v0: i16x8, v1: i32):
+    v2 = sshr v0, v1
+    return v2
+}
+; note: because of the shifted-in sign-bit, lane 0 remains -1 == 0xffff, whereas lane 4 has been shifted to -8 == 0xfff8
+; run: %sshr_i16x8([-1 2 4 8 -16 32 64 128], 1) == [-1 1 2 4 -8 16 32 64]
+
+function %sshr_i32x4(i32x4, i32) -> i32x4 {
+block0(v0: i32x4, v1: i32):
+    v2 = sshr v0, v1
+    return v2
+}
+; note: shifting in the sign-bit repeatedly in lane 3 fills the result with 1s (-1 == 0xffff_ffff)
+; run: %sshr_i32x4([1 2 4 -8], 33) == [0 0 0 0xffff_ffff]
+
+function %sshr_i64x2(i64x2, i32) -> i64x2 {
+block0(v0: i64x2, v1: i32):
+    v2 = sshr v0, v1
+    return v2
+}
+; run: %sshr_i64x2([1 -1], 0) == [1 -1]
+; run: %sshr_i64x2([1 -1], 1) == [0 -1] ; note the -1 shift result
+; run: %sshr_i64x2([2 -2], 1) == [1 -1]
+; run: %sshr_i64x2([0x80000000_00000000 0x7FFFFFFF_FFFFFFFF], 63) == [0xFFFFFFFF_FFFFFFFF 0]
+
+function %sshr_imm_i32x4(i32x4) -> i32x4 {
+block0(v0: i32x4):
+    v1 = sshr_imm v0, 1
+    return v1
+}
+; run: %sshr_imm_i32x4([1 2 4 -8]) == [0 1 2 -4]
+
+function %sshr_imm_i16x8(i16x8) -> i16x8 {
+block0(v0: i16x8):
+    v1 = sshr_imm v0, 1
+    return v1
+}
+; run: %sshr_imm_i16x8([1 2 4 -8 0 0 0 0]) == [0 1 2 -4 0 0 0 0]
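
The mask-table trick used above for `ushr.i8x16` can be hard to follow from the emitted assembly alone. The following standalone Rust sketch is illustrative only: it is not part of the patch, the function name is invented for this note, and it assumes the shift amount is masked to the lane width exactly as Cranelift's semantics require. It models the lowering at the value level (shift in 16-bit lanes, then AND with the per-amount byte mask that the USHR_MASKS table encodes) and checks that it agrees with a plain per-byte shift.

// Scalar model of the `ushr.i8x16` lowering: a 16-bit-lane logical shift
// followed by an AND with the per-shift-amount byte mask, which clears the
// bits that leak across from the neighbouring byte.
fn ushr_i8x16_via_i16x8(src: [u8; 16], amt: u32) -> [u8; 16] {
    let amt = amt & 7; // Cranelift masks the amount to the lane width (8 bits).
    let mask = 0xffu8 >> amt; // One byte of the 16-byte USHR_MASKS entry for `amt`.
    let mut out = [0u8; 16];
    for i in (0..16).step_by(2) {
        // Emulate one 16-bit lane of PSRLW over the little-endian byte pair.
        let lane = u16::from_le_bytes([src[i], src[i + 1]]) >> amt;
        let [lo, hi] = lane.to_le_bytes();
        // PAND with the mask removes the bits shifted in from the high byte.
        out[i] = lo & mask;
        out[i + 1] = hi & mask;
    }
    out
}

fn main() {
    let v: [u8; 16] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
    // Matches the `ushr_i8x16` run test above: shift right by 1.
    assert_eq!(
        ushr_i8x16_via_i16x8(v, 1),
        [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7]
    );
    // The model agrees with a plain per-byte shift for every amount.
    for amt in 0..8 {
        let expected: Vec<u8> = v.iter().map(|&b| b >> amt).collect();
        assert_eq!(ushr_i8x16_via_i16x8(v, amt).to_vec(), expected);
    }
    println!("mask-based ushr.i8x16 model agrees with the per-byte reference");
}

The `sshr.i8x16` case cannot reuse these masks because they would discard the sign bits, which is why the patch instead widens each byte with PUNPCKLBW/PUNPCKHBW, shifts by `amt + 8` with PSRAW, and re-packs the high bytes with PACKSSWB.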