Add x86 implementation of shuffle

2019-08-26 14:50:05 -07:00
parent 9e088e4164
commit af1499ce99
18 changed files with 336 additions and 44 deletions
--- a/cranelift/codegen/meta/src/isa/x86/encodings.rs
+++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs
@@ -1785,7 +1785,7 @@ pub(crate) fn define(
    let allowed_simd_type = |t: &LaneType| t.lane_bits() >= 8 && t.lane_bits() < 128;

    // PSHUFB, 8-bit shuffle using two XMM registers.
-    for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 8) {
+    for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
        let instruction = x86_pshufb.bind_vector_from_lane(ty, sse_vector_size);
        let template = rec_fa.nonrex().opcodes(vec![0x66, 0x0f, 0x38, 00]);
        e.enc32_isap(instruction.clone(), template.clone(), use_ssse3_simd);
@@ -1804,7 +1804,7 @@ pub(crate) fn define(

    // SIMD scalar_to_vector; this uses MOV to copy the scalar value to an XMM register; according
    // to the Intel manual: "When the destination operand is an XMM register, the source operand is
-    // written to the low doubleword of the register and the regiser is zero-extended to 128 bits."
+    // written to the low doubleword of the register and the register is zero-extended to 128 bits."
    for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
        let instruction = scalar_to_vector.bind_vector_from_lane(ty, sse_vector_size);
        if ty.is_float() {
@@ -1929,6 +1929,13 @@ pub(crate) fn define(
        e.enc_32_64_maybe_isap(instruction, template, None); // from SSE
    }

+    // SIMD bor using ORPS
+    for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
+        let instruction = bor.bind_vector_from_lane(ty, sse_vector_size);
+        let template = rec_fa.nonrex().opcodes(vec![0x0f, 0x56]);
+        e.enc_32_64_maybe_isap(instruction, template, None); // from SSE
+    }
+
    // Reference type instructions

    // Null references implemented as iconst 0.
--- a/cranelift/codegen/meta/src/isa/x86/legalize.rs
+++ b/cranelift/codegen/meta/src/isa/x86/legalize.rs
@@ -45,6 +45,7 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
    let selectif = insts.by_name("selectif");
    let smulhi = insts.by_name("smulhi");
    let splat = insts.by_name("splat");
+    let shuffle = insts.by_name("shuffle");
    let srem = insts.by_name("srem");
    let udiv = insts.by_name("udiv");
    let umulhi = insts.by_name("umulhi");
@@ -380,6 +381,7 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
        );
    }

+    narrow.custom_legalize(shuffle, "convert_shuffle");
    narrow.custom_legalize(extractlane, "convert_extractlane");
    narrow.custom_legalize(insertlane, "convert_insertlane");

--- a/cranelift/codegen/meta/src/isa/x86/recipes.rs
+++ b/cranelift/codegen/meta/src/isa/x86/recipes.rs
@@ -396,11 +396,11 @@ pub(crate) fn define<'shared>(
    let f_trap = formats.by_name("Trap");
    let f_unary = formats.by_name("Unary");
    let f_unary_bool = formats.by_name("UnaryBool");
+    let f_unary_const = formats.by_name("UnaryConst");
    let f_unary_global_value = formats.by_name("UnaryGlobalValue");
    let f_unary_ieee32 = formats.by_name("UnaryIeee32");
    let f_unary_ieee64 = formats.by_name("UnaryIeee64");
    let f_unary_imm = formats.by_name("UnaryImm");
-    let f_unary_imm128 = formats.by_name("UnaryImm128");

    // Predicates shorthands.
    let use_sse41 = settings.predicate_by_name("use_sse41");
@@ -2437,14 +2437,14 @@ pub(crate) fn define<'shared>(
    );

    recipes.add_template_recipe(
-        EncodingRecipeBuilder::new("vconst", f_unary_imm128, 5)
+        EncodingRecipeBuilder::new("vconst", f_unary_const, 5)
            .operands_out(vec![fpr])
            .clobbers_flags(false)
            .emit(
                r#"
                    {{PUT_OP}}(bits, rex2(0, out_reg0), sink);
                    modrm_riprel(out_reg0, sink);
-                    const_disp4(imm, func, sink);
+                    const_disp4(constant_handle, func, sink);
                "#,
            ),
    );
--- a/cranelift/codegen/meta/src/shared/formats.rs
+++ b/cranelift/codegen/meta/src/shared/formats.rs
@@ -6,10 +6,10 @@ pub(crate) fn define(imm: &Immediates, entities: &EntityRefs) -> FormatRegistry

    registry.insert(Builder::new("Unary").value());
    registry.insert(Builder::new("UnaryImm").imm(&imm.imm64));
-    registry.insert(Builder::new("UnaryImm128").imm(&imm.uimm128));
    registry.insert(Builder::new("UnaryIeee32").imm(&imm.ieee32));
    registry.insert(Builder::new("UnaryIeee64").imm(&imm.ieee64));
    registry.insert(Builder::new("UnaryBool").imm(&imm.boolean));
+    registry.insert(Builder::new("UnaryConst").imm(&imm.pool_constant));
    registry.insert(Builder::new("UnaryGlobalValue").imm(&entities.global_value));

    registry.insert(Builder::new("Binary").value().value());
@@ -43,6 +43,12 @@ pub(crate) fn define(imm: &Immediates, entities: &EntityRefs) -> FormatRegistry
            .value()
            .imm_with_name("lane", &imm.uimm8),
    );
+    registry.insert(
+        Builder::new("Shuffle")
+            .value()
+            .value()
+            .imm_with_name("mask", &imm.uimm128),
+    );

    registry.insert(Builder::new("IntCompare").imm(&imm.intcc).value().value());
    registry.insert(
--- a/cranelift/codegen/meta/src/shared/immediates.rs
+++ b/cranelift/codegen/meta/src/shared/immediates.rs
@@ -23,6 +23,12 @@ pub(crate) struct Immediates {
    /// const.
    pub uimm128: OperandKind,

+    /// A constant stored in the constant pool.
+    ///
+    /// This operand is used to pass constants to instructions like vconst while storing the
+    /// actual bytes in the constant pool.
+    pub pool_constant: OperandKind,
+
    /// A 32-bit immediate signed offset.
    ///
    /// This is used to represent an immediate address offset in load/store instructions.
@@ -84,6 +90,12 @@ impl Immediates {

            uimm128: Builder::new_imm("uimm128")
                .doc("A 128-bit immediate unsigned integer.")
+                .rust_type("ir::Immediate")
+                .build(),
+
+            pool_constant: Builder::new_imm("poolConstant")
+                .doc("A constant stored in the constant pool.")
+                .default_member("constant_handle")
                .rust_type("ir::Constant")
                .build(),

--- a/cranelift/codegen/meta/src/shared/instructions.rs
+++ b/cranelift/codegen/meta/src/shared/instructions.rs
@@ -1090,7 +1090,7 @@ pub(crate) fn define(

    let N = &operand_doc(
        "N",
-        &imm.uimm128,
+        &imm.pool_constant,
        "The 16 immediate bytes of a 128-bit vector",
    );
    let a = &operand_doc("a", TxN, "A constant vector value");
@@ -1108,6 +1108,41 @@ pub(crate) fn define(
        .operands_out(vec![a]),
    );

+    let mask = &operand_doc(
+        "mask",
+        &imm.uimm128,
+        "The 16 immediate bytes used for selecting the elements to shuffle",
+    );
+    let Tx16 = &TypeVar::new(
+        "Tx16",
+        "A SIMD vector with exactly 16 lanes of 8-bit values; eventually this may support other \
+         lane counts and widths",
+        TypeSetBuilder::new()
+            .ints(8..8)
+            .bools(8..8)
+            .simd_lanes(16..16)
+            .includes_scalars(false)
+            .build(),
+    );
+    let a = &operand_doc("a", Tx16, "A vector value");
+    let b = &operand_doc("b", Tx16, "A vector value");
+
+    ig.push(
+        Inst::new(
+            "shuffle",
+            r#"
+        SIMD vector shuffle.
+        
+        Shuffle two vectors using the given immediate bytes. For each of the 16 bytes of the
+        immediate, a value i of 0-15 selects the i-th element of the first vector and a value i of 
+        16-31 selects the (i-16)th element of the second vector. Immediate values outside of the 
+        0-31 range place a 0 in the resulting vector lane.
+        "#,
+        )
+        .operands_in(vec![a, b, mask])
+        .operands_out(vec![a]),
+    );
+
    let a = &operand_doc("a", Ref, "A constant reference null value");

    ig.push(
--- a/cranelift/codegen/src/ir/dfg.rs
+++ b/cranelift/codegen/src/ir/dfg.rs
@@ -5,7 +5,7 @@ use crate::ir;
 use crate::ir::builder::ReplaceBuilder;
 use crate::ir::extfunc::ExtFuncData;
 use crate::ir::instructions::{BranchInfo, CallInfo, InstructionData};
-use crate::ir::{types, ConstantPool};
+use crate::ir::{types, ConstantPool, Immediate};
 use crate::ir::{
    Ebb, FuncRef, Inst, SigRef, Signature, Type, Value, ValueLabelAssignments, ValueList,
    ValueListPool,
@@ -19,6 +19,7 @@ use core::mem;
 use core::ops::{Index, IndexMut};
 use core::u16;
 use std::collections::HashMap;
+use std::vec::Vec;

 /// A data flow graph defines all instructions and extended basic blocks in a function as well as
 /// the data flow dependencies between them. The DFG also tracks values which can be either
@@ -70,6 +71,9 @@ pub struct DataFlowGraph {

    /// Constants used within the function
    pub constants: ConstantPool,
+
+    /// Stores large immediates that would not otherwise fit in `InstructionData`.
+    pub immediates: PrimaryMap<Immediate, Vec<u8>>,
 }

 impl DataFlowGraph {
@@ -85,6 +89,7 @@ impl DataFlowGraph {
            ext_funcs: PrimaryMap::new(),
            values_labels: None,
            constants: ConstantPool::new(),
+            immediates: PrimaryMap::new(),
        }
    }

@@ -98,7 +103,8 @@ impl DataFlowGraph {
        self.signatures.clear();
        self.ext_funcs.clear();
        self.values_labels = None;
-        self.constants.clear()
+        self.constants.clear();
+        self.immediates.clear();
    }

    /// Get the total number of instructions created in this function, whether they are currently
--- a/cranelift/codegen/src/ir/entities.rs
+++ b/cranelift/codegen/src/ir/entities.rs
@@ -181,6 +181,29 @@ impl Constant {
    }
 }

+/// An opaque reference to an immediate that is stored out-of-line.
+///
+/// Some immediates (e.g. SIMD shuffle masks) are too large to store in the
+/// [`InstructionData`](super::instructions::InstructionData) struct and therefore must be
+/// tracked separately in [`DataFlowGraph::immediates`](super::dfg::DataFlowGraph). `Immediate`
+/// provides a way to reference values stored there.
+#[derive(Copy, Clone, PartialEq, Eq, Hash)]
+pub struct Immediate(u32);
+entity_impl!(Immediate, "imm");

+impl Immediate {
+    /// Create an immediate reference from its number, or `None` if `n` is `u32::MAX`.
+    ///
+    /// This method is for use by the parser.
+    pub fn with_number(n: u32) -> Option<Self> {
+        if n < u32::MAX {
+            Some(Immediate(n))
+        } else {
+            None
+        }
+    }
+}
+
 /// An opaque reference to a [jump table](https://en.wikipedia.org/wiki/Branch_table).
 ///
 /// `JumpTable`s are used for indirect branching and are specialized for dense,
--- a/cranelift/codegen/src/ir/mod.rs
+++ b/cranelift/codegen/src/ir/mod.rs
@@ -31,7 +31,8 @@ pub use crate::ir::builder::{InsertBuilder, InstBuilder, InstBuilderBase, InstIn
 pub use crate::ir::constant::{ConstantData, ConstantOffset, ConstantPool};
 pub use crate::ir::dfg::{DataFlowGraph, ValueDef};
 pub use crate::ir::entities::{
-    Constant, Ebb, FuncRef, GlobalValue, Heap, Inst, JumpTable, SigRef, StackSlot, Table, Value,
+    Constant, Ebb, FuncRef, GlobalValue, Heap, Immediate, Inst, JumpTable, SigRef, StackSlot,
+    Table, Value,
 };
 pub use crate::ir::extfunc::{
    AbiParam, ArgumentExtension, ArgumentPurpose, ExtFuncData, Signature,
--- a/cranelift/codegen/src/isa/x86/enc_tables.rs
+++ b/cranelift/codegen/src/isa/x86/enc_tables.rs
@@ -899,6 +899,80 @@ fn expand_fcvt_to_uint_sat(
    cfg.recompute_ebb(pos.func, done);
 }

+/// Legalize `shuffle` into `x86_pshufb` on `vconst` masks, plus a `bor` when the inputs differ.
+fn convert_shuffle(
+    inst: ir::Inst,
+    func: &mut ir::Function,
+    _cfg: &mut ControlFlowGraph,
+    _isa: &dyn TargetIsa,
+) {
+    let mut pos = FuncCursor::new(func).at_inst(inst);
+    pos.use_srcloc(inst);

+    if let ir::InstructionData::Shuffle { args, mask, .. } = pos.func.dfg[inst] {
+        // Mask-building helper: a PSHUFB control byte of 0-15 names the lane to read, while a set
+        // most significant bit zeroes the output lane; hence every index above 15 becomes 0x80.
+        let zero_unknown_lane_index = |b: u8| if b > 15 { 0b10000000 } else { b };

+        // Resolve aliases so `a == b` compares the underlying values; only aliasing matters here
+        // because copies will be introduced later (in regalloc).
+        let a = pos.func.dfg.resolve_aliases(args[0]);
+        let b = pos.func.dfg.resolve_aliases(args[1]);
+        let mask = pos
+            .func
+            .dfg
+            .immediates
+            .get(mask)
+            .expect("The shuffle immediate should have been recorded before this point")
+            .clone();
+        if a == b {
+            // Both inputs are the same value, so a single PSHUFB of the first argument suffices.
+            let constructed_mask = mask
+                .iter()
+                // An index above 15 may still name a lane of b (16-31), so rebase it by 16.
+                .map(|&b| if b > 15 { b.wrapping_sub(16) } else { b })
+                .map(zero_unknown_lane_index)
+                .collect();
+            let handle = pos.func.dfg.constants.insert(constructed_mask);
+            // Materialize the built mask as a vector constant of the same type as the input.
+            let a_type = pos.func.dfg.value_type(a);
+            let mask_value = pos.ins().vconst(a_type, handle);
+            // Replace the shuffle in place with a PSHUFB of the single incoming argument.
+            pos.func.dfg.replace(inst).x86_pshufb(a, mask_value);
+        } else {
+            // PSHUFB the first argument, zeroing every lane whose index is not in 0-15.
+            let constructed_mask = mask.iter().cloned().map(zero_unknown_lane_index).collect();
+            let handle = pos.func.dfg.constants.insert(constructed_mask);
+            // Materialize the built mask as a vector constant of the same type as the input.
+            let a_type = pos.func.dfg.value_type(a);
+            let mask_value = pos.ins().vconst(a_type, handle);
+            // Shuffle the first argument.
+            let shuffled_first_arg = pos.ins().x86_pshufb(a, mask_value);

+            // PSHUFB the second argument: subtract 16 so 16-31 become 0-15 and all else is zeroed.
+            let constructed_mask = mask
+                .iter()
+                .map(|b| b.wrapping_sub(16))
+                .map(zero_unknown_lane_index)
+                .collect();
+            let handle = pos.func.dfg.constants.insert(constructed_mask);
+            // Materialize the built mask as a vector constant of the same type as the input.
+            let b_type = pos.func.dfg.value_type(b);
+            let mask_value = pos.ins().vconst(b_type, handle);
+            // Shuffle the second argument.
+            let shuffled_second_arg = pos.ins().x86_pshufb(b, mask_value);

+            // Each lane is zero in at least one of the two results, so OR-ing them merges both.
+            pos.func
+                .dfg
+                .replace(inst)
+                .bor(shuffled_first_arg, shuffled_second_arg);

+            // TODO when AVX512 is enabled we should replace this sequence with a single VPERMB
+        };
+    }
+}
+
 /// Because floats already exist in XMM registers, we can keep them there when executing a CLIF
 /// extractlane instruction
 fn convert_extractlane(
--- a/cranelift/codegen/src/verifier/mod.rs
+++ b/cranelift/codegen/src/verifier/mod.rs
@@ -706,7 +706,6 @@ impl<'a> Verifier<'a> {
            // Exhaustive list so we can't forget to add new formats
            Unary { .. }
            | UnaryImm { .. }
-            | UnaryImm128 { .. }
            | UnaryIeee32 { .. }
            | UnaryIeee64 { .. }
            | UnaryBool { .. }
@@ -715,6 +714,8 @@ impl<'a> Verifier<'a> {
            | Ternary { .. }
            | InsertLane { .. }
            | ExtractLane { .. }
+            | UnaryConst { .. }
+            | Shuffle { .. }
            | IntCompare { .. }
            | IntCompareImm { .. }
            | IntCond { .. }
--- a/cranelift/codegen/src/write.rs
+++ b/cranelift/codegen/src/write.rs
@@ -488,11 +488,6 @@ pub fn write_operands(
    match dfg[inst] {
        Unary { arg, .. } => write!(w, " {}", arg),
        UnaryImm { imm, .. } => write!(w, " {}", imm),
-        UnaryImm128 { imm, .. } => {
-            let data = dfg.constants.get(imm);
-            let uimm128 = Uimm128::from(&data[..]);
-            write!(w, " {}", uimm128)
-        }
        UnaryIeee32 { imm, .. } => write!(w, " {}", imm),
        UnaryIeee64 { imm, .. } => write!(w, " {}", imm),
        UnaryBool { imm, .. } => write!(w, " {}", imm),
@@ -510,6 +505,20 @@ pub fn write_operands(
        NullAry { .. } => write!(w, " "),
        InsertLane { lane, args, .. } => write!(w, " {}, {}, {}", args[0], lane, args[1]),
        ExtractLane { lane, arg, .. } => write!(w, " {}, {}", arg, lane),
+        UnaryConst {
+            constant_handle, ..
+        } => {
+            let data = dfg.constants.get(constant_handle);
+            let uimm128 = Uimm128::from(&data[..]);
+            write!(w, " {}", uimm128)
+        }
+        Shuffle { mask, args, .. } => {
+            let data = dfg.immediates.get(mask).expect(
+                "Expected the shuffle mask to already be inserted into the immediates table",
+            );
+            let uimm128 = Uimm128::from(&data[..]);
+            write!(w, " {}, {}, {}", args[0], args[1], uimm128)
+        }
        IntCompare { cond, args, .. } => write!(w, " {} {}, {}", cond, args[0], args[1]),
        IntCompareImm { cond, arg, imm, .. } => write!(w, " {} {}, {}", cond, arg, imm),
        IntCond { cond, arg, .. } => write!(w, " {} {}", cond, arg),
--- a/cranelift/filetests/filetests/isa/x86/shuffle-legalize.clif
+++ b/cranelift/filetests/filetests/isa/x86/shuffle-legalize.clif
@@ -0,0 +1,31 @@
+test legalizer
+set enable_simd
+target x86_64 skylake
+
+function %test_shuffle_different_ssa_values() -> i8x16 {
+ebb0:
+    v0 = vconst.i8x16 0x00
+    v1 = vconst.i8x16 0x01
+    v2 = shuffle v0, v1, 0x11000000000000000000000000000000     ; pick the second lane of v1, the rest use the first lane of v0
+    return v2
+}
+
+; check:  v1 = vconst.i8x16 0x01
+; nextln: v3 = vconst.i8x16 0x80000000000000000000000000000000
+; nextln: v4 = x86_pshufb v0, v3
+; nextln: v5 = vconst.i8x16 0x01808080808080808080808080808080
+; nextln: v6 = x86_pshufb v1, v5
+; nextln: v2 = bor v4, v6
+
+
+
+function %test_shuffle_same_ssa_value() -> i8x16 {
+ebb0:
+    v1 = vconst.i8x16 0x01
+    v2 = shuffle v1, v1, 0x13000000000000000000000000000000     ; pick the fourth lane of v1 and the rest from the first lane of v1
+    return v2
+}
+
+; check:  v1 = vconst.i8x16 0x01
+; nextln: v3 = vconst.i8x16 0x03000000000000000000000000000000
+; nextln: v2 = x86_pshufb v1, v3
--- a/cranelift/filetests/filetests/isa/x86/shuffle-run.clif
+++ b/cranelift/filetests/filetests/isa/x86/shuffle-run.clif
@@ -0,0 +1,44 @@
+test run
+set enable_simd
+
+function %test_shuffle_different_ssa_values() -> b1 {
+ebb0:
+    v0 = vconst.i8x16 0x00
+    v1 = vconst.i8x16 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42]
+    v2 = shuffle v0, v1, [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 31]     ; use the first lane of v0 throughout except use the last lane of v1
+    v3 = extractlane.i8x16 v2, 15
+    v4 = iconst.i8 42
+    v5 = icmp eq v3, v4
+    return v5
+}
+
+; run
+
+function %test_shuffle_same_ssa_value() -> b1 {
+ebb0:
+    v0 = vconst.i8x16 0x01000000_00000000_00000000_00000000     ; note where lane 15 is when written with hexadecimal syntax
+    v1 = shuffle v0, v0, 0x0f0f0f0f_0f0f0f0f_0f0f0f0f_0f0f0f0f  ; use the last lane of v0 to fill all lanes
+    v2 = extractlane.i8x16 v1, 4
+    v3 = iconst.i8 0x01
+    v4 = icmp eq v2, v3
+    return v4
+}
+
+; run
+
+function %compare_shuffle() -> b1 {
+ebb0:
+    v1 = vconst.i32x4 [0 1 2 3]
+    v2 = raw_bitcast.i8x16  v1 ; we have to cast because shuffle is type-limited to Tx16
+    ; keep each lane in place from the first vector
+    v3 = shuffle v2, v2, [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
+    v4 = raw_bitcast.i32x4 v3
+    v5 = extractlane.i32x4 v4, 3
+    v6 = icmp_imm eq v5, 3
+    v7 = extractlane.i32x4 v4, 0
+    v8 = icmp_imm eq v7, 0
+    v9 = band v6, v8
+    return v9
+}
+
+; run
--- a/cranelift/filetests/filetests/isa/x86/vconst-rodata.clif
+++ b/cranelift/filetests/filetests/isa/x86/vconst-rodata.clif
@@ -1,6 +1,5 @@
 test rodata
 set enable_simd=true
-set probestack_enabled=false
 target x86_64 haswell

 function %test_vconst_i32() -> i32x4 {
--- a/cranelift/reader/src/parser.rs
+++ b/cranelift/reader/src/parser.rs
@@ -2243,23 +2243,6 @@ impl<'a> Parser<'a> {
                opcode,
                imm: self.match_imm64("expected immediate integer operand")?,
            },
-            InstructionFormat::UnaryImm128 => match explicit_control_type {
-                None => {
-                    return err!(
-                        self.loc,
-                        "Expected {:?} to have a controlling type variable, e.g. inst.i32x4",
-                        opcode
-                    )
-                }
-                Some(ty) => {
-                    let uimm128 = self.match_uimm128_or_literals(ty)?;
-                    let constant_handle = ctx.function.dfg.constants.insert(uimm128.0.to_vec());
-                    InstructionData::UnaryImm128 {
-                        opcode,
-                        imm: constant_handle,
-                    }
-                }
-            },
            InstructionFormat::UnaryIeee32 => InstructionData::UnaryIeee32 {
                opcode,
                imm: self.match_ieee32("expected immediate 32-bit float operand")?,
@@ -2442,6 +2425,36 @@ impl<'a> Parser<'a> {
                let lane = self.match_uimm8("expected lane number")?;
                InstructionData::ExtractLane { opcode, lane, arg }
            }
+            InstructionFormat::UnaryConst => match explicit_control_type {
+                None => {
+                    return err!(
+                        self.loc,
+                        "Expected {:?} to have a controlling type variable, e.g. inst.i32x4",
+                        opcode
+                    )
+                }
+                Some(controlling_type) => {
+                    let uimm128 = self.match_uimm128_or_literals(controlling_type)?;
+                    let constant_handle = ctx.function.dfg.constants.insert(uimm128.to_vec());
+                    InstructionData::UnaryConst {
+                        opcode,
+                        constant_handle,
+                    }
+                }
+            },
+            InstructionFormat::Shuffle => {
+                let a = self.match_value("expected SSA value first operand")?;
+                self.match_token(Token::Comma, "expected ',' between operands")?;
+                let b = self.match_value("expected SSA value second operand")?;
+                self.match_token(Token::Comma, "expected ',' between operands")?;
+                let uimm128 = self.match_uimm128_or_literals(I8X16)?;
+                let mask = ctx.function.dfg.immediates.push(uimm128.to_vec());
+                InstructionData::Shuffle {
+                    opcode,
+                    mask,
+                    args: [a, b],
+                }
+            }
            InstructionFormat::IntCompare => {
                let cond = self.match_enum("expected intcc condition code")?;
                let lhs = self.match_value("expected SSA value first operand")?;
--- a/cranelift/serde/src/serde_clif_json.rs
+++ b/cranelift/serde/src/serde_clif_json.rs
@@ -1,4 +1,3 @@
-use cranelift_codegen::ir::immediates::Uimm128;
 use cranelift_codegen::ir::{Ebb, Function, Inst, InstructionData, Signature};
 use serde_derive::{Deserialize, Serialize};

@@ -59,6 +58,11 @@ pub enum SerInstData {
        arg: String,
        lane: String,
    },
+    Shuffle {
+        opcode: String,
+        args: [String; 2],
+        mask: String,
+    },
    IntCompare {
        opcode: String,
        args: [String; 2],
@@ -262,14 +266,6 @@ pub fn get_inst_data(inst_index: Inst, func: &Function) -> SerInstData {
            opcode: opcode.to_string(),
            imm: imm.to_string(),
        },
-        InstructionData::UnaryImm128 { opcode, imm } => {
-            let data = func.dfg.constants.get(imm);
-            let uimm128 = Uimm128::from(&data[..]);
-            SerInstData::UnaryImm {
-                opcode: opcode.to_string(),
-                imm: uimm128.to_string(),
-            }
-        }
        InstructionData::UnaryIeee32 { opcode, imm } => SerInstData::UnaryIeee32 {
            opcode: opcode.to_string(),
            imm: imm.to_string(),
@@ -340,6 +336,28 @@ pub fn get_inst_data(inst_index: Inst, func: &Function) -> SerInstData {
            arg: arg.to_string(),
            lane: lane.to_string(),
        },
+        InstructionData::UnaryConst {
+            opcode,
+            constant_handle,
+        } => {
+            let constant = func.dfg.constants.get(constant_handle);
+            SerInstData::UnaryImm {
+                opcode: opcode.to_string(),
+                imm: format!("{:?}", constant),
+            }
+        }
+        InstructionData::Shuffle { opcode, args, mask } => {
+            let mask = func
+                .dfg
+                .immediates
+                .get(mask)
+                .expect("Expected shuffle mask to already be inserted in immediate mapping");
+            SerInstData::Shuffle {
+                opcode: opcode.to_string(),
+                args: [args[0].to_string(), args[1].to_string()],
+                mask: format!("{:?}", mask),
+            }
+        }
        InstructionData::IntCompare { opcode, args, cond } => {
            let hold_args = [args[0].to_string(), args[1].to_string()];
            SerInstData::IntCompare {
--- a/cranelift/wasm/src/code_translator.rs
+++ b/cranelift/wasm/src/code_translator.rs
@@ -974,9 +974,20 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
                builder,
            ))
        }
+        Operator::V8x16Shuffle { lanes, .. } => {
+            let (vector_a, vector_b) = state.pop2();
+            let a = optionally_bitcast_vector(vector_a, I8X16, builder);
+            let b = optionally_bitcast_vector(vector_b, I8X16, builder);
+            let mask = builder.func.dfg.immediates.push(lanes.to_vec());
+            let shuffled = builder.ins().shuffle(a, b, mask);
+            state.push1(shuffled)
+            // At this point the original types of a and b are lost; users of this value (i.e. this
+            // WASM-to-CLIF translator) may need to raw_bitcast for type-correctness. This is due
+            // to WASM using the less specific v128 type for certain operations and more specific
+            // types (e.g. i8x16) for others.
+        }
        Operator::V128Load { .. }
        | Operator::V128Store { .. }
-        | Operator::V8x16Shuffle { .. }
        | Operator::I8x16Eq
        | Operator::I8x16Ne
        | Operator::I8x16LtS