Merge pull request #2440 from jlb6740/remaining_simd_conversions

Adds support for i32x4.trunc_sat_f32x4_u
2020-11-30 22:53:59 -08:00
parent 26509cb080 09f3d4e331
commit 4bf2c15014
5 changed files with 401 additions and 3 deletions
--- a/cranelift/codegen/src/isa/x64/inst/args.rs
+++ b/cranelift/codegen/src/isa/x64/inst/args.rs
@@ -467,7 +467,10 @@ pub enum SseOpcode {
    Pabsb,
    Pabsw,
    Pabsd,
+    Packssdw,
    Packsswb,
+    Packusdw,
+    Packuswb,
    Paddb,
    Paddd,
    Paddq,
@@ -476,6 +479,7 @@ pub enum SseOpcode {
    Paddsw,
    Paddusb,
    Paddusw,
+    Palignr,
    Pand,
    Pandn,
    Pavgb,
@@ -507,6 +511,18 @@ pub enum SseOpcode {
    Pminuw,
    Pminud,
    Pmovmskb,
+    Pmovsxbd,
+    Pmovsxbw,
+    Pmovsxbq,
+    Pmovsxwd,
+    Pmovsxwq,
+    Pmovsxdq,
+    Pmovzxbd,
+    Pmovzxbw,
+    Pmovzxbq,
+    Pmovzxwd,
+    Pmovzxwq,
+    Pmovzxdq,
    Pmulld,
    Pmullw,
    Pmuludq,
@@ -620,7 +636,9 @@ impl SseOpcode {
            | SseOpcode::Mulpd
            | SseOpcode::Mulsd
            | SseOpcode::Orpd
+            | SseOpcode::Packssdw
            | SseOpcode::Packsswb
+            | SseOpcode::Packuswb
            | SseOpcode::Paddb
            | SseOpcode::Paddd
            | SseOpcode::Paddq
@@ -676,9 +694,14 @@ impl SseOpcode {
            | SseOpcode::Ucomisd
            | SseOpcode::Xorpd => SSE2,

-            SseOpcode::Pabsb | SseOpcode::Pabsw | SseOpcode::Pabsd | SseOpcode::Pshufb => SSSE3,
+            SseOpcode::Pabsb
+            | SseOpcode::Pabsw
+            | SseOpcode::Pabsd
+            | SseOpcode::Palignr
+            | SseOpcode::Pshufb => SSSE3,

            SseOpcode::Insertps
+            | SseOpcode::Packusdw
            | SseOpcode::Pcmpeqq
            | SseOpcode::Pextrb
            | SseOpcode::Pextrd
@@ -692,6 +715,18 @@ impl SseOpcode {
            | SseOpcode::Pminsd
            | SseOpcode::Pminuw
            | SseOpcode::Pminud
+            | SseOpcode::Pmovsxbd
+            | SseOpcode::Pmovsxbw
+            | SseOpcode::Pmovsxbq
+            | SseOpcode::Pmovsxwd
+            | SseOpcode::Pmovsxwq
+            | SseOpcode::Pmovsxdq
+            | SseOpcode::Pmovzxbd
+            | SseOpcode::Pmovzxbw
+            | SseOpcode::Pmovzxbq
+            | SseOpcode::Pmovzxwd
+            | SseOpcode::Pmovzxwq
+            | SseOpcode::Pmovzxdq
            | SseOpcode::Pmulld
            | SseOpcode::Ptest
            | SseOpcode::Roundss
@@ -772,7 +807,10 @@ impl fmt::Debug for SseOpcode {
            SseOpcode::Pabsb => "pabsb",
            SseOpcode::Pabsw => "pabsw",
            SseOpcode::Pabsd => "pabsd",
+            SseOpcode::Packssdw => "packssdw",
            SseOpcode::Packsswb => "packsswb",
+            SseOpcode::Packusdw => "packusdw",
+            SseOpcode::Packuswb => "packuswb",
            SseOpcode::Paddb => "paddb",
            SseOpcode::Paddd => "paddd",
            SseOpcode::Paddq => "paddq",
@@ -781,6 +819,7 @@ impl fmt::Debug for SseOpcode {
            SseOpcode::Paddsw => "paddsw",
            SseOpcode::Paddusb => "paddusb",
            SseOpcode::Paddusw => "paddusw",
+            SseOpcode::Palignr => "palignr",
            SseOpcode::Pand => "pand",
            SseOpcode::Pandn => "pandn",
            SseOpcode::Pavgb => "pavgb",
@@ -812,6 +851,18 @@ impl fmt::Debug for SseOpcode {
            SseOpcode::Pminuw => "pminuw",
            SseOpcode::Pminud => "pminud",
            SseOpcode::Pmovmskb => "pmovmskb",
+            SseOpcode::Pmovsxbd => "pmovsxbd",
+            SseOpcode::Pmovsxbw => "pmovsxbw",
+            SseOpcode::Pmovsxbq => "pmovsxbq",
+            SseOpcode::Pmovsxwd => "pmovsxwd",
+            SseOpcode::Pmovsxwq => "pmovsxwq",
+            SseOpcode::Pmovsxdq => "pmovsxdq",
+            SseOpcode::Pmovzxbd => "pmovzxbd",
+            SseOpcode::Pmovzxbw => "pmovzxbw",
+            SseOpcode::Pmovzxbq => "pmovzxbq",
+            SseOpcode::Pmovzxwd => "pmovzxwd",
+            SseOpcode::Pmovzxwq => "pmovzxwq",
+            SseOpcode::Pmovzxdq => "pmovzxdq",
            SseOpcode::Pmulld => "pmulld",
            SseOpcode::Pmullw => "pmullw",
            SseOpcode::Pmuludq => "pmuludq",
--- a/cranelift/codegen/src/isa/x64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -1781,7 +1781,10 @@ pub(crate) fn emit(
                SseOpcode::Mulsd => (LegacyPrefixes::_F2, 0x0F59, 2),
                SseOpcode::Orpd => (LegacyPrefixes::_66, 0x0F56, 2),
                SseOpcode::Orps => (LegacyPrefixes::None, 0x0F56, 2),
+                SseOpcode::Packssdw => (LegacyPrefixes::_66, 0x0F6B, 2),
                SseOpcode::Packsswb => (LegacyPrefixes::_66, 0x0F63, 2),
+                SseOpcode::Packusdw => (LegacyPrefixes::_66, 0x0F382B, 3),
+                SseOpcode::Packuswb => (LegacyPrefixes::_66, 0x0F67, 2),
                SseOpcode::Paddb => (LegacyPrefixes::_66, 0x0FFC, 2),
                SseOpcode::Paddd => (LegacyPrefixes::_66, 0x0FFE, 2),
                SseOpcode::Paddq => (LegacyPrefixes::_66, 0x0FD4, 2),
@@ -1802,6 +1805,18 @@ pub(crate) fn emit(
                SseOpcode::Pcmpgtw => (LegacyPrefixes::_66, 0x0F65, 2),
                SseOpcode::Pcmpgtd => (LegacyPrefixes::_66, 0x0F66, 2),
                SseOpcode::Pcmpgtq => (LegacyPrefixes::_66, 0x0F3837, 3),
+                SseOpcode::Pmovsxbd => (LegacyPrefixes::_66, 0x0F3821, 3),
+                SseOpcode::Pmovsxbw => (LegacyPrefixes::_66, 0x0F3820, 3),
+                SseOpcode::Pmovsxbq => (LegacyPrefixes::_66, 0x0F3822, 3),
+                SseOpcode::Pmovsxwd => (LegacyPrefixes::_66, 0x0F3823, 3),
+                SseOpcode::Pmovsxwq => (LegacyPrefixes::_66, 0x0F3824, 3),
+                SseOpcode::Pmovsxdq => (LegacyPrefixes::_66, 0x0F3825, 3),
+                SseOpcode::Pmovzxbd => (LegacyPrefixes::_66, 0x0F3831, 3),
+                SseOpcode::Pmovzxbw => (LegacyPrefixes::_66, 0x0F3830, 3),
+                SseOpcode::Pmovzxbq => (LegacyPrefixes::_66, 0x0F3832, 3),
+                SseOpcode::Pmovzxwd => (LegacyPrefixes::_66, 0x0F3833, 3),
+                SseOpcode::Pmovzxwq => (LegacyPrefixes::_66, 0x0F3834, 3),
+                SseOpcode::Pmovzxdq => (LegacyPrefixes::_66, 0x0F3835, 3),
                SseOpcode::Pmaxsb => (LegacyPrefixes::_66, 0x0F383C, 3),
                SseOpcode::Pmaxsw => (LegacyPrefixes::_66, 0x0FEE, 2),
                SseOpcode::Pmaxsd => (LegacyPrefixes::_66, 0x0F383D, 3),
@@ -1958,6 +1973,7 @@ pub(crate) fn emit(
                SseOpcode::Cmpss => (LegacyPrefixes::_F3, 0x0FC2, 2),
                SseOpcode::Cmpsd => (LegacyPrefixes::_F2, 0x0FC2, 2),
                SseOpcode::Insertps => (LegacyPrefixes::_66, 0x0F3A21, 3),
+                SseOpcode::Palignr => (LegacyPrefixes::_66, 0x0F3A0F, 3),
                SseOpcode::Pinsrb => (LegacyPrefixes::_66, 0x0F3A20, 3),
                SseOpcode::Pinsrw => (LegacyPrefixes::_66, 0x0FC4, 2),
                SseOpcode::Pinsrd => (LegacyPrefixes::_66, 0x0F3A22, 3),
--- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
@@ -3151,12 +3151,30 @@ fn test_x64_emit() {
        "pshufb  %xmm11, %xmm2",
    ));

+    insns.push((
+        Inst::xmm_rm_r(SseOpcode::Packssdw, RegMem::reg(xmm11), w_xmm12),
+        "66450F6BE3",
+        "packssdw %xmm11, %xmm12",
+    ));
+
    insns.push((
        Inst::xmm_rm_r(SseOpcode::Packsswb, RegMem::reg(xmm11), w_xmm2),
        "66410F63D3",
        "packsswb %xmm11, %xmm2",
    ));

+    insns.push((
+        Inst::xmm_rm_r(SseOpcode::Packusdw, RegMem::reg(xmm13), w_xmm6),
+        "66410F382BF5",
+        "packusdw %xmm13, %xmm6",
+    ));
+
+    insns.push((
+        Inst::xmm_rm_r(SseOpcode::Packuswb, RegMem::reg(xmm9), w_xmm4),
+        "66410F67E1",
+        "packuswb %xmm9, %xmm4",
+    ));
+
    insns.push((
        Inst::xmm_rm_r(SseOpcode::Punpckhbw, RegMem::reg(xmm3), w_xmm2),
        "660F68D3",
@@ -3183,6 +3201,81 @@ fn test_x64_emit() {
        "cvttps2dq %xmm9, %xmm8",
    ));

+    // ========================================================
+    // XMM_RM_R: Packed Move
+
+    insns.push((
+        Inst::xmm_rm_r(SseOpcode::Pmovsxbd, RegMem::reg(xmm6), w_xmm8),
+        "66440F3821C6",
+        "pmovsxbd %xmm6, %xmm8",
+    ));
+
+    insns.push((
+        Inst::xmm_rm_r(SseOpcode::Pmovsxbw, RegMem::reg(xmm9), w_xmm10),
+        "66450F3820D1",
+        "pmovsxbw %xmm9, %xmm10",
+    ));
+
+    insns.push((
+        Inst::xmm_rm_r(SseOpcode::Pmovsxbq, RegMem::reg(xmm1), w_xmm1),
+        "660F3822C9",
+        "pmovsxbq %xmm1, %xmm1",
+    ));
+
+    insns.push((
+        Inst::xmm_rm_r(SseOpcode::Pmovsxwd, RegMem::reg(xmm13), w_xmm10),
+        "66450F3823D5",
+        "pmovsxwd %xmm13, %xmm10",
+    ));
+
+    insns.push((
+        Inst::xmm_rm_r(SseOpcode::Pmovsxwq, RegMem::reg(xmm12), w_xmm12),
+        "66450F3824E4",
+        "pmovsxwq %xmm12, %xmm12",
+    ));
+
+    insns.push((
+        Inst::xmm_rm_r(SseOpcode::Pmovsxdq, RegMem::reg(xmm10), w_xmm8),
+        "66450F3825C2",
+        "pmovsxdq %xmm10, %xmm8",
+    ));
+
+    insns.push((
+        Inst::xmm_rm_r(SseOpcode::Pmovzxbd, RegMem::reg(xmm5), w_xmm6),
+        "660F3831F5",
+        "pmovzxbd %xmm5, %xmm6",
+    ));
+
+    insns.push((
+        Inst::xmm_rm_r(SseOpcode::Pmovzxbw, RegMem::reg(xmm5), w_xmm13),
+        "66440F3830ED",
+        "pmovzxbw %xmm5, %xmm13",
+    ));
+
+    insns.push((
+        Inst::xmm_rm_r(SseOpcode::Pmovzxbq, RegMem::reg(xmm10), w_xmm11),
+        "66450F3832DA",
+        "pmovzxbq %xmm10, %xmm11",
+    ));
+
+    insns.push((
+        Inst::xmm_rm_r(SseOpcode::Pmovzxwd, RegMem::reg(xmm2), w_xmm10),
+        "66440F3833D2",
+        "pmovzxwd %xmm2, %xmm10",
+    ));
+
+    insns.push((
+        Inst::xmm_rm_r(SseOpcode::Pmovzxwq, RegMem::reg(xmm7), w_xmm4),
+        "660F3834E7",
+        "pmovzxwq %xmm7, %xmm4",
+    ));
+
+    insns.push((
+        Inst::xmm_rm_r(SseOpcode::Pmovzxdq, RegMem::reg(xmm3), w_xmm4),
+        "660F3835E3",
+        "pmovzxdq %xmm3, %xmm4",
+    ));
+
    // XMM_Mov_R_M: float stores
    insns.push((
        Inst::xmm_mov_r_m(SseOpcode::Movss, xmm15, Amode::imm_reg(128, r12)),
@@ -3406,6 +3499,11 @@ fn test_x64_emit() {
        "410FC2FF00",
        "cmpps   $0, %xmm15, %xmm7",
    ));
+    insns.push((
+        Inst::xmm_rm_r_imm(SseOpcode::Palignr, RegMem::reg(xmm1), w_xmm9, 3, false),
+        "66440F3A0FC903",
+        "palignr $3, %xmm1, %xmm9",
+    ));

    // ========================================================
    // Pertaining to atomics.
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -2722,6 +2722,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
            } else {
                if op == Opcode::FcvtToSintSat {
                    // Sets destination to zero if float is NaN
+                    assert_eq!(types::F32X4, ctx.input_ty(insn, 0));
                    let tmp = ctx.alloc_tmp(RegClass::V128, types::I32X4);
                    ctx.emit(Inst::xmm_unary_rm_r(
                        SseOpcode::Movapd,
@@ -2776,7 +2777,118 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                        dst,
                    ));
                } else if op == Opcode::FcvtToUintSat {
-                    unimplemented!("f32x4.convert_i32x4_u");
+                    // The algorithm for converting floats to unsigned ints is a little tricky. The
+                    // complication arises because we are converting from a signed 32-bit int with a positive
+                    // integer range from 1..INT_MAX (0x1..0x7FFFFFFF) to an unsigned integer with an extended
+                    // range from (INT_MAX+1)..UINT_MAX. It's this range from (INT_MAX+1)..UINT_MAX
+                    // (0x80000000..0xFFFFFFFF) that needs to be accounted for as a special case since our
+                    // conversion instruction (cvttps2dq) only converts as high as INT_MAX (0x7FFFFFFF), but
+                    // which conveniently sets underflows and overflows (smaller than MIN_INT or larger than
+                    // MAX_INT) to be INT_MAX+1 (0x80000000). Noting that the range (INT_MAX+1)..UINT_MAX includes
+                    // precisely INT_MAX values, we can correctly account for and convert every value in this range
+                    // if we simply subtract INT_MAX+1 before doing the cvttps2dq conversion. After the subtraction
+                    // every value originally (INT_MAX+1)..UINT_MAX is now in the range (0..INT_MAX).
+                    // After the conversion we add INT_MAX+1 back to this converted value, noting again that
+                    // values we are trying to account for were already set to INT_MAX+1 during the original conversion.
+                    // We simply have to create a mask and make sure we are adding together only the lanes that need
+                    // to be accounted for. Digesting it all, the steps then are:
+                    //
+                    // Step 1 - Account for NaN and negative floats by setting these src values to zero.
+                    // Step 2 - Make a copy (tmp1) of the src value since we need to convert twice for
+                    //          reasons described above.
+                    // Step 3 - Convert the original src values. This will convert properly all floats up to INT_MAX
+                    // Step 4 - Subtract INT_MAX from the copy set (tmp1). Note, all zero and negative values are those
+                    //          values that were originally in the range (0..INT_MAX). This will come in handy during
+                    //          step 7 when we zero negative lanes.
+                    // Step 5 - Create a bit mask for tmp1 that will correspond to all lanes originally less than
+                    //          UINT_MAX that are now less than INT_MAX thanks to the subtraction.
+                    // Step 6 - Convert the second set of values (tmp1)
+                    // Step 7 - Prep the converted second set by zeroing out negative lanes (these have already been
+                    //          converted correctly with the first set) and by setting overflow lanes to 0x7FFFFFFF
+                    //          as this will allow us to properly saturate overflow lanes when adding to 0x80000000
+                    // Step 8 - Add the original converted src and the converted tmp1 where float values originally less
+                    //          than and equal to INT_MAX will be unchanged, float values originally between INT_MAX+1 and
+                    //          UINT_MAX will add together (INT_MAX) + (SRC - INT_MAX), and float values originally
+                    //          greater than UINT_MAX will be saturated to UINT_MAX (0xFFFFFFFF) after adding (0x80000000 + 0x7FFFFFFF).
+                    //
+                    //
+                    // The table below illustrates the result after each step where it matters for the converted set.
+                    // Note the original value range (original src set) is the final dst in Step 8:
+                    //
+                    // Original src set:
+                    // | Original Value Range |    Step 1    |         Step 3         |          Step 8           |
+                    // |  -FLT_MIN..FLT_MAX   | 0.0..FLT_MAX | 0..INT_MAX(w/overflow) | 0..UINT_MAX(w/saturation) |
+                    //
+                    // Copied src set (tmp1):
+                    // |    Step 2    |                  Step 4                  |
+                    // | 0.0..FLT_MAX | (0.0-(INT_MAX+1))..(FLT_MAX-(INT_MAX+1)) |
+                    //
+                    // |                       Step 6                        |                 Step 7                 |
+                    // | (0-(INT_MAX+1))..(UINT_MAX-(INT_MAX+1))(w/overflow) | ((INT_MAX+1)-(INT_MAX+1))..(INT_MAX+1) |
+
+                    // Create temporaries
+                    assert_eq!(types::F32X4, ctx.input_ty(insn, 0));
+                    let tmp1 = ctx.alloc_tmp(RegClass::V128, types::I32X4);
+                    let tmp2 = ctx.alloc_tmp(RegClass::V128, types::I32X4);
+
+                    // We are converting to unsigned int, so if a float src lane is negative
+                    // or NaN, first set it to zero.
+                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp2), tmp2));
+                    ctx.emit(Inst::gen_move(dst, src, input_ty));
+                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Maxps, RegMem::from(tmp2), dst));
+
+                    // Set tmp2 to INT_MAX+1. It is important to note here that although it looks
+                    // like we are only converting INT_MAX (0x7FFFFFFF), single precision IEEE-754
+                    // floats can only accurately represent contiguous integers up to 2^24; outside
+                    // of this range a value rounds to the closest integer that can be represented.
+                    // In the case of INT_MAX, this value gets represented as 0x4f000000, which is
+                    // the integer value (INT_MAX+1).
+
+                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Pcmpeqd, RegMem::from(tmp2), tmp2));
+                    ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrld, RegMemImm::imm(1), tmp2));
+                    ctx.emit(Inst::xmm_rm_r(
+                        SseOpcode::Cvtdq2ps,
+                        RegMem::from(tmp2),
+                        tmp2,
+                    ));
+
+                    // Make a copy of these lanes and then do the first conversion.
+                    // Overflow lanes greater than the maximum allowed signed value will
+                    // be set to 0x80000000. Negative and NaN lanes will be 0x0.
+                    ctx.emit(Inst::xmm_mov(SseOpcode::Movaps, RegMem::from(dst), tmp1));
+                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Cvttps2dq, RegMem::from(dst), dst));
+
+                    // Set lanes to src - max_signed_int
+                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Subps, RegMem::from(tmp2), tmp1));
+
+                    // Create mask for all positive lanes to saturate (i.e. greater than
+                    // or equal to the maximum allowable unsigned int).
+                    let cond = FcmpImm::from(FloatCC::LessThanOrEqual);
+                    ctx.emit(Inst::xmm_rm_r_imm(
+                        SseOpcode::Cmpps,
+                        RegMem::from(tmp1),
+                        tmp2,
+                        cond.encode(),
+                        false,
+                    ));
+
+                    // Convert the set of lanes that have the max_signed_int factored out.
+                    ctx.emit(Inst::xmm_rm_r(
+                        SseOpcode::Cvttps2dq,
+                        RegMem::from(tmp1),
+                        tmp1,
+                    ));
+
+                    // Prepare converted lanes by zeroing negative lanes and prepping lanes
+                    // that have positive overflow (based on the mask) by setting these lanes
+                    // to 0x7FFFFFFF
+                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp2), tmp1));
+                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp2), tmp2));
+                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmaxsd, RegMem::from(tmp2), tmp1));
+
+                    // Add this second set of converted lanes to the original to properly handle
+                    // values greater than max signed int.
+                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Paddd, RegMem::from(tmp1), dst));
                } else {
                    // Since this branch is also guarded by a check for vector types
                    // neither Opcode::FcvtToUint nor Opcode::FcvtToSint can reach here
@@ -2786,7 +2898,127 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                }
            }
        }
-
+        Opcode::UwidenHigh | Opcode::UwidenLow | Opcode::SwidenHigh | Opcode::SwidenLow => {
+            let input_ty = ctx.input_ty(insn, 0);
+            let output_ty = ctx.output_ty(insn, 0);
+            let src = put_input_in_reg(ctx, inputs[0]);
+            let dst = get_output_reg(ctx, outputs[0]);
+            if output_ty.is_vector() {
+                match op {
+                    Opcode::SwidenLow => match (input_ty, output_ty) {
+                        (types::I8X16, types::I16X8) => {
+                            ctx.emit(Inst::gen_move(dst, src, output_ty));
+                            ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmovsxbw, RegMem::from(dst), dst));
+                        }
+                        (types::I16X8, types::I32X4) => {
+                            ctx.emit(Inst::gen_move(dst, src, output_ty));
+                            ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmovsxwd, RegMem::from(dst), dst));
+                        }
+                        _ => unreachable!(),
+                    },
+                    Opcode::SwidenHigh => match (input_ty, output_ty) {
+                        (types::I8X16, types::I16X8) => {
+                            ctx.emit(Inst::gen_move(dst, src, output_ty));
+                            ctx.emit(Inst::xmm_rm_r_imm(
+                                SseOpcode::Palignr,
+                                RegMem::reg(src),
+                                dst,
+                                8,
+                                false,
+                            ));
+                            ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmovsxbw, RegMem::from(dst), dst));
+                        }
+                        (types::I16X8, types::I32X4) => {
+                            ctx.emit(Inst::gen_move(dst, src, output_ty));
+                            ctx.emit(Inst::xmm_rm_r_imm(
+                                SseOpcode::Palignr,
+                                RegMem::reg(src),
+                                dst,
+                                8,
+                                false,
+                            ));
+                            ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmovsxwd, RegMem::from(dst), dst));
+                        }
+                        _ => unreachable!(),
+                    },
+                    Opcode::UwidenLow => match (input_ty, output_ty) {
+                        (types::I8X16, types::I16X8) => {
+                            ctx.emit(Inst::gen_move(dst, src, output_ty));
+                            ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmovzxbw, RegMem::from(dst), dst));
+                        }
+                        (types::I16X8, types::I32X4) => {
+                            ctx.emit(Inst::gen_move(dst, src, output_ty));
+                            ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmovzxwd, RegMem::from(dst), dst));
+                        }
+                        _ => unreachable!(),
+                    },
+                    Opcode::UwidenHigh => match (input_ty, output_ty) {
+                        (types::I8X16, types::I16X8) => {
+                            ctx.emit(Inst::gen_move(dst, src, output_ty));
+                            ctx.emit(Inst::xmm_rm_r_imm(
+                                SseOpcode::Palignr,
+                                RegMem::reg(src),
+                                dst,
+                                8,
+                                false,
+                            ));
+                            ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmovzxbw, RegMem::from(dst), dst));
+                        }
+                        (types::I16X8, types::I32X4) => {
+                            ctx.emit(Inst::gen_move(dst, src, output_ty));
+                            ctx.emit(Inst::xmm_rm_r_imm(
+                                SseOpcode::Palignr,
+                                RegMem::reg(src),
+                                dst,
+                                8,
+                                false,
+                            ));
+                            ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmovzxwd, RegMem::from(dst), dst));
+                        }
+                        _ => unreachable!(),
+                    },
+                    _ => unreachable!(),
+                }
+            } else {
+                panic!("Unsupported non-vector type for widen instruction {:?}", ty);
+            }
+        }
+        Opcode::Snarrow | Opcode::Unarrow => {
+            let input_ty = ctx.input_ty(insn, 0);
+            let output_ty = ctx.output_ty(insn, 0);
+            let src1 = put_input_in_reg(ctx, inputs[0]);
+            let src2 = put_input_in_reg(ctx, inputs[1]);
+            let dst = get_output_reg(ctx, outputs[0]);
+            if output_ty.is_vector() {
+                match op {
+                    Opcode::Snarrow => match (input_ty, output_ty) {
+                        (types::I16X8, types::I8X16) => {
+                            ctx.emit(Inst::gen_move(dst, src1, input_ty));
+                            ctx.emit(Inst::xmm_rm_r(SseOpcode::Packsswb, RegMem::reg(src2), dst));
+                        }
+                        (types::I32X4, types::I16X8) => {
+                            ctx.emit(Inst::gen_move(dst, src1, input_ty));
+                            ctx.emit(Inst::xmm_rm_r(SseOpcode::Packssdw, RegMem::reg(src2), dst));
+                        }
+                        _ => unreachable!(),
+                    },
+                    Opcode::Unarrow => match (input_ty, output_ty) {
+                        (types::I16X8, types::I8X16) => {
+                            ctx.emit(Inst::gen_move(dst, src1, input_ty));
+                            ctx.emit(Inst::xmm_rm_r(SseOpcode::Packuswb, RegMem::reg(src2), dst));
+                        }
+                        (types::I32X4, types::I16X8) => {
+                            ctx.emit(Inst::gen_move(dst, src1, input_ty));
+                            ctx.emit(Inst::xmm_rm_r(SseOpcode::Packusdw, RegMem::reg(src2), dst));
+                        }
+                        _ => unreachable!(),
+                    },
+                    _ => unreachable!(),
+                }
+            } else {
+                panic!("Unsupported non-vector type for widen instruction {:?}", ty);
+            }
+        }
        Opcode::Bitcast => {
            let input_ty = ctx.input_ty(insn, 0);
            let output_ty = ctx.output_ty(insn, 0);