diff --git a/cranelift/codegen/meta/src/isa/x86/encodings.rs b/cranelift/codegen/meta/src/isa/x86/encodings.rs
index c34c13d089..32c1e2f957 100644
--- a/cranelift/codegen/meta/src/isa/x86/encodings.rs
+++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs
@@ -250,6 +250,17 @@ impl PerCpuModeEncodings {
             self.enc64(inst.clone().bind(I64).bind_any(), template);
         }
     }
+
+    /// Add the same encoding to both X86_32 and X86_64 by forwarding `inst`, `template` and the `isap` setting predicate to `enc32_isap` and `enc64_isap`; assumes configuration (e.g. REX, operand binding) has already happened
+    fn enc_32_64_isap(
+        &mut self,
+        inst: BoundInstruction,
+        template: Template,
+        isap: SettingPredicateNumber,
+    ) {
+        self.enc32_isap(inst.clone(), template.clone(), isap);
+        self.enc64_isap(inst, template, isap);
+    }
 }
 
 // Definitions.
@@ -379,6 +390,8 @@ pub fn define(
     let x86_fmax = x86.by_name("x86_fmax");
     let x86_fmin = x86.by_name("x86_fmin");
     let x86_pop = x86.by_name("x86_pop");
+    let x86_pshufd = x86.by_name("x86_pshufd");
+    let x86_pshufb = x86.by_name("x86_pshufb");
     let x86_push = x86.by_name("x86_push");
     let x86_sdivmodx = x86.by_name("x86_sdivmodx");
     let x86_smulx = x86.by_name("x86_smulx");
@@ -462,6 +475,7 @@ pub fn define(
     let rec_pushq = r.template("pushq");
     let rec_ret = r.template("ret");
     let rec_r_ib = r.template("r_ib");
+    let rec_r_ib_unsigned = r.template("r_ib_unsigned");
     let rec_r_id = r.template("r_id");
     let rec_rcmp = r.template("rcmp");
     let rec_rcmp_ib = r.template("rcmp_ib");
@@ -519,6 +533,7 @@ pub fn define(
     let use_lzcnt = settings.predicate_by_name("use_lzcnt");
     let use_bmi1 = settings.predicate_by_name("use_bmi1");
     let use_sse2 = settings.predicate_by_name("use_sse2");
+    let use_ssse3 = settings.predicate_by_name("use_ssse3");
     let use_sse41 = settings.predicate_by_name("use_sse41");
 
     // Definitions.
@@ -1575,6 +1590,28 @@ pub fn define(
     e.enc_both(ffcmp.bind(F32), rec_fcmp.opcodes(vec![0x0f, 0x2e]));
     e.enc_both(ffcmp.bind(F64), rec_fcmp.opcodes(vec![0x66, 0x0f, 0x2e]));
 
+    // SIMD splat: before x86 can use vector data, it must be moved to XMM registers; see
+    // legalize.rs for how this is done; once there, x86_pshuf* (below) is used for broadcasting the
+    // value across the register
+
+    // PSHUFB, 8-bit shuffle using two XMM registers; bound to each 128-bit vector type with 8-bit lanes under `use_ssse3`
+    for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 8) {
+        let number_of_lanes = 128 / ty.lane_bits();
+        let instruction = x86_pshufb.bind_vector(ty, number_of_lanes);
+        let template = rec_fa.nonrex().opcodes(vec![0x66, 0x0f, 0x38, 0x00]);
+        // The same template is used in 32- and 64-bit mode, so register both through the helper.
+        e.enc_32_64_isap(instruction, template, use_ssse3);
+    }
+
+    // PSHUFD, 32-bit shuffle using one XMM register and a u8 immediate; bound to each 128-bit vector type with 32-bit lanes under `use_sse2`
+    for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 32) {
+        let number_of_lanes = 128 / ty.lane_bits();
+        let instruction = x86_pshufd.bind_vector(ty, number_of_lanes);
+        let template = rec_r_ib_unsigned.nonrex().opcodes(vec![0x66, 0x0f, 0x70]);
+        // The same template is used in 32- and 64-bit mode, so register both through the helper.
+        e.enc_32_64_isap(instruction, template, use_sse2);
+    }
+
     // SIMD scalar_to_vector; this uses MOV to copy the scalar value to an XMM register; according
     // to the Intel manual: "When the destination operand is an XMM register, the source operand is
     // written to the low doubleword of the register and the regiser is zero-extended to 128 bits."
diff --git a/cranelift/codegen/meta/src/isa/x86/instructions.rs b/cranelift/codegen/meta/src/isa/x86/instructions.rs
index 9aa7363e1a..6464261938 100644
--- a/cranelift/codegen/meta/src/isa/x86/instructions.rs
+++ b/cranelift/codegen/meta/src/isa/x86/instructions.rs
@@ -7,7 +7,7 @@ use crate::cdsl::instructions::{
 use crate::cdsl::operands::{create_operand as operand, create_operand_doc as operand_doc};
 use crate::cdsl::types::ValueType;
 use crate::cdsl::typevar::{Interval, TypeSetBuilder, TypeVar};
-use crate::shared::types;
+use crate::shared::{immediates, types, OperandKinds};
 
 pub fn define(
     mut all_instructions: &mut AllInstructions,
@@ -249,5 +249,46 @@ pub fn define(
         .operands_out(vec![y, rflags]),
     );
 
+    let immediates = OperandKinds::from(immediates::define());
+    let uimm8 = immediates.by_name("uimm8");
+    let TxN = &TypeVar::new(
+        "TxN",
+        "A SIMD vector type",
+        TypeSetBuilder::new()
+            .ints(Interval::All)
+            .floats(Interval::All)
+            .bools(Interval::All)
+            .simd_lanes(Interval::All)
+            .includes_scalars(false)
+            .build(),
+    );
+    let a = &operand_doc("a", TxN, "A vector value (i.e. held in an XMM register)");
+    let b = &operand_doc("b", TxN, "A vector value (i.e. held in an XMM register)");
+    let i = &operand_doc("i", uimm8, "An ordering operand controlling the copying of data from the source to the destination; see PSHUFD in Intel manual for details");
+
+    ig.push(
+        Inst::new(
+            "x86_pshufd",
+            r#"
+    Packed Shuffle Doublewords -- copies data from either memory or lanes in an extended
+    register and re-orders the data according to the passed immediate byte.
+    "#,
+        )
+        .operands_in(vec![a, i]) // TODO allow copying from memory here (need more permissive type than TxN)
+        .operands_out(vec![a]),
+    );
+
+    ig.push(
+        Inst::new(
+            "x86_pshufb",
+            r#"
+    Packed Shuffle Bytes -- re-orders data in an extended register using a shuffle
+    mask from either memory or another extended register
+    "#,
+        )
+        .operands_in(vec![a, b]) // TODO allow re-ordering from memory here (need more permissive type than TxN)
+        .operands_out(vec![a]),
+    );
+
     ig.build()
 }
diff --git a/cranelift/codegen/meta/src/isa/x86/recipes.rs b/cranelift/codegen/meta/src/isa/x86/recipes.rs
index 45441cf67a..f2353d3a66 100644
--- a/cranelift/codegen/meta/src/isa/x86/recipes.rs
+++ b/cranelift/codegen/meta/src/isa/x86/recipes.rs
@@ -367,6 +367,7 @@ pub fn define<'shared>(
     let f_call = formats.by_name("Call");
     let f_call_indirect = formats.by_name("CallIndirect");
     let f_copy_special = formats.by_name("CopySpecial");
+    let f_extract_lane = formats.by_name("ExtractLane"); // TODO this would preferably retrieve a BinaryImm8 format but because formats are compared structurally and ExtractLane has the same structure this is impossible--if we rename ExtractLane, it may even impact parsing
     let f_float_compare = formats.by_name("FloatCompare");
     let f_float_cond = formats.by_name("FloatCond");
     let f_float_cond_trap = formats.by_name("FloatCondTrap");
@@ -794,6 +795,27 @@ pub fn define<'shared>(
         );
     }
 
+    // XX /r ib with 8-bit unsigned immediate (e.g. for pshufd)
+    {
+        let format = formats.get(f_extract_lane);
+        recipes.add_template_recipe(
+            EncodingRecipeBuilder::new("r_ib_unsigned", f_extract_lane, 2)
+                .operands_in(vec![fpr])
+                .operands_out(vec![fpr])
+                .inst_predicate(InstructionPredicate::new_is_unsigned_int(
+                    format, "lane", 8, 0,
+                )) // TODO if the format name is changed then "lane" should be renamed to something more appropriate--ordering mask? broadcast immediate?
+                .emit(
+                    r#"
+                    {{PUT_OP}}(bits, rex2(in_reg0, out_reg0), sink);
+                    modrm_rr(in_reg0, out_reg0, sink);
+                    let imm:i64 = lane.into();
+                    sink.put1(imm as u8);
+                "#,
+                ),
+        );
+    }
+
     {
         // XX /n id with 32-bit immediate sign-extended. UnaryImm version.
         let format = formats.get(f_unary_imm);
diff --git a/cranelift/filetests/filetests/isa/x86/pshufb.clif b/cranelift/filetests/filetests/isa/x86/pshufb.clif
new file mode 100644
index 0000000000..7c23c5ab61
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/x86/pshufb.clif
@@ -0,0 +1,13 @@
+test binemit
+set enable_simd
+target x86_64 has_sse2=true has_ssse3=true
+
+function %test_pshufb() {
+ebb0:
+[-, %rax]   v0 = iconst.i8 42
+[-, %xmm0]  v1 = scalar_to_vector.i8x16 v0   ; bin: 66 40 0f 6e c0
+[-, %rbx]   v2 = iconst.i8 43
+[-, %xmm4]  v3 = scalar_to_vector.i8x16 v2   ; bin: 66 40 0f 6e e3
+[-, %xmm0]  v4 = x86_pshufb v1, v3               ; bin: 66 0f 38 00 c4
+            return
+}
diff --git a/cranelift/filetests/filetests/isa/x86/pshufd.clif b/cranelift/filetests/filetests/isa/x86/pshufd.clif
new file mode 100644
index 0000000000..183af4fc0e
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/x86/pshufd.clif
@@ -0,0 +1,11 @@
+test binemit
+set enable_simd
+target x86_64 has_sse2=true
+
+function %test_pshuf() {
+ebb0:
+[-, %rax]   v0 = iconst.i32 42
+[-, %xmm0]  v1 = scalar_to_vector.i32x4 v0  ; bin: 66 40 0f 6e c0
+[-, %xmm0]  v2 = x86_pshufd v1, 0                ; bin: 66 0f 70 c0 00
+            return
+}