diff --git a/build.rs b/build.rs index da9822e711..c4fcc1a961 100644 --- a/build.rs +++ b/build.rs @@ -210,6 +210,7 @@ fn experimental_x64_should_panic(testsuite: &str, testname: &str, strategy: &str ("simd", "simd_load_splat") => return false, ("simd", "simd_splat") => return false, ("simd", "simd_store") => return false, + ("simd", "simd_conversions") => return false, ("simd", _) => return true, _ => {} } diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs index e992288560..4542f3386d 100644 --- a/cranelift/codegen/src/isa/x64/inst/args.rs +++ b/cranelift/codegen/src/isa/x64/inst/args.rs @@ -467,7 +467,10 @@ pub enum SseOpcode { Pabsb, Pabsw, Pabsd, + Packssdw, Packsswb, + Packusdw, + Packuswb, Paddb, Paddd, Paddq, @@ -476,6 +479,7 @@ pub enum SseOpcode { Paddsw, Paddusb, Paddusw, + Palignr, Pand, Pandn, Pavgb, @@ -507,6 +511,18 @@ pub enum SseOpcode { Pminuw, Pminud, Pmovmskb, + Pmovsxbd, + Pmovsxbw, + Pmovsxbq, + Pmovsxwd, + Pmovsxwq, + Pmovsxdq, + Pmovzxbd, + Pmovzxbw, + Pmovzxbq, + Pmovzxwd, + Pmovzxwq, + Pmovzxdq, Pmulld, Pmullw, Pmuludq, @@ -620,7 +636,9 @@ impl SseOpcode { | SseOpcode::Mulpd | SseOpcode::Mulsd | SseOpcode::Orpd + | SseOpcode::Packssdw | SseOpcode::Packsswb + | SseOpcode::Packuswb | SseOpcode::Paddb | SseOpcode::Paddd | SseOpcode::Paddq @@ -676,9 +694,14 @@ impl SseOpcode { | SseOpcode::Ucomisd | SseOpcode::Xorpd => SSE2, - SseOpcode::Pabsb | SseOpcode::Pabsw | SseOpcode::Pabsd | SseOpcode::Pshufb => SSSE3, + SseOpcode::Pabsb + | SseOpcode::Pabsw + | SseOpcode::Pabsd + | SseOpcode::Palignr + | SseOpcode::Pshufb => SSSE3, SseOpcode::Insertps + | SseOpcode::Packusdw | SseOpcode::Pcmpeqq | SseOpcode::Pextrb | SseOpcode::Pextrd @@ -692,6 +715,18 @@ impl SseOpcode { | SseOpcode::Pminsd | SseOpcode::Pminuw | SseOpcode::Pminud + | SseOpcode::Pmovsxbd + | SseOpcode::Pmovsxbw + | SseOpcode::Pmovsxbq + | SseOpcode::Pmovsxwd + | SseOpcode::Pmovsxwq + | SseOpcode::Pmovsxdq + | SseOpcode::Pmovzxbd + | SseOpcode::Pmovzxbw + | SseOpcode::Pmovzxbq + | SseOpcode::Pmovzxwd + | SseOpcode::Pmovzxwq + | SseOpcode::Pmovzxdq | SseOpcode::Pmulld | SseOpcode::Ptest | SseOpcode::Roundss @@ -772,7 +807,10 @@ impl fmt::Debug for SseOpcode { SseOpcode::Pabsb => "pabsb", SseOpcode::Pabsw => "pabsw", SseOpcode::Pabsd => "pabsd", + SseOpcode::Packssdw => "packssdw", SseOpcode::Packsswb => "packsswb", + SseOpcode::Packusdw => "packusdw", + SseOpcode::Packuswb => "packuswb", SseOpcode::Paddb => "paddb", SseOpcode::Paddd => "paddd", SseOpcode::Paddq => "paddq", @@ -781,6 +819,7 @@ impl fmt::Debug for SseOpcode { SseOpcode::Paddsw => "paddsw", SseOpcode::Paddusb => "paddusb", SseOpcode::Paddusw => "paddusw", + SseOpcode::Palignr => "palignr", SseOpcode::Pand => "pand", SseOpcode::Pandn => "pandn", SseOpcode::Pavgb => "pavgb", @@ -812,6 +851,18 @@ impl fmt::Debug for SseOpcode { SseOpcode::Pminuw => "pminuw", SseOpcode::Pminud => "pminud", SseOpcode::Pmovmskb => "pmovmskb", + SseOpcode::Pmovsxbd => "pmovsxbd", + SseOpcode::Pmovsxbw => "pmovsxbw", + SseOpcode::Pmovsxbq => "pmovsxbq", + SseOpcode::Pmovsxwd => "pmovsxwd", + SseOpcode::Pmovsxwq => "pmovsxwq", + SseOpcode::Pmovsxdq => "pmovsxdq", + SseOpcode::Pmovzxbd => "pmovzxbd", + SseOpcode::Pmovzxbw => "pmovzxbw", + SseOpcode::Pmovzxbq => "pmovzxbq", + SseOpcode::Pmovzxwd => "pmovzxwd", + SseOpcode::Pmovzxwq => "pmovzxwq", + SseOpcode::Pmovzxdq => "pmovzxdq", SseOpcode::Pmulld => "pmulld", SseOpcode::Pmullw => "pmullw", SseOpcode::Pmuludq => "pmuludq", diff --git 
a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index 7d15063ad4..56ecc0e843 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -1781,7 +1781,10 @@ pub(crate) fn emit( SseOpcode::Mulsd => (LegacyPrefixes::_F2, 0x0F59, 2), SseOpcode::Orpd => (LegacyPrefixes::_66, 0x0F56, 2), SseOpcode::Orps => (LegacyPrefixes::None, 0x0F56, 2), + SseOpcode::Packssdw => (LegacyPrefixes::_66, 0x0F6B, 2), SseOpcode::Packsswb => (LegacyPrefixes::_66, 0x0F63, 2), + SseOpcode::Packusdw => (LegacyPrefixes::_66, 0x0F382B, 3), + SseOpcode::Packuswb => (LegacyPrefixes::_66, 0x0F67, 2), SseOpcode::Paddb => (LegacyPrefixes::_66, 0x0FFC, 2), SseOpcode::Paddd => (LegacyPrefixes::_66, 0x0FFE, 2), SseOpcode::Paddq => (LegacyPrefixes::_66, 0x0FD4, 2), @@ -1802,6 +1805,18 @@ pub(crate) fn emit( SseOpcode::Pcmpgtw => (LegacyPrefixes::_66, 0x0F65, 2), SseOpcode::Pcmpgtd => (LegacyPrefixes::_66, 0x0F66, 2), SseOpcode::Pcmpgtq => (LegacyPrefixes::_66, 0x0F3837, 3), + SseOpcode::Pmovsxbd => (LegacyPrefixes::_66, 0x0F3821, 3), + SseOpcode::Pmovsxbw => (LegacyPrefixes::_66, 0x0F3820, 3), + SseOpcode::Pmovsxbq => (LegacyPrefixes::_66, 0x0F3822, 3), + SseOpcode::Pmovsxwd => (LegacyPrefixes::_66, 0x0F3823, 3), + SseOpcode::Pmovsxwq => (LegacyPrefixes::_66, 0x0F3824, 3), + SseOpcode::Pmovsxdq => (LegacyPrefixes::_66, 0x0F3825, 3), + SseOpcode::Pmovzxbd => (LegacyPrefixes::_66, 0x0F3831, 3), + SseOpcode::Pmovzxbw => (LegacyPrefixes::_66, 0x0F3830, 3), + SseOpcode::Pmovzxbq => (LegacyPrefixes::_66, 0x0F3832, 3), + SseOpcode::Pmovzxwd => (LegacyPrefixes::_66, 0x0F3833, 3), + SseOpcode::Pmovzxwq => (LegacyPrefixes::_66, 0x0F3834, 3), + SseOpcode::Pmovzxdq => (LegacyPrefixes::_66, 0x0F3835, 3), SseOpcode::Pmaxsb => (LegacyPrefixes::_66, 0x0F383C, 3), SseOpcode::Pmaxsw => (LegacyPrefixes::_66, 0x0FEE, 2), SseOpcode::Pmaxsd => (LegacyPrefixes::_66, 0x0F383D, 3), @@ -1958,6 +1973,7 @@ pub(crate) fn emit( SseOpcode::Cmpss => (LegacyPrefixes::_F3, 0x0FC2, 2), SseOpcode::Cmpsd => (LegacyPrefixes::_F2, 0x0FC2, 2), SseOpcode::Insertps => (LegacyPrefixes::_66, 0x0F3A21, 3), + SseOpcode::Palignr => (LegacyPrefixes::_66, 0x0F3A0F, 3), SseOpcode::Pinsrb => (LegacyPrefixes::_66, 0x0F3A20, 3), SseOpcode::Pinsrw => (LegacyPrefixes::_66, 0x0FC4, 2), SseOpcode::Pinsrd => (LegacyPrefixes::_66, 0x0F3A22, 3), diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs index 6c2fe6f2d4..fb9f0c1c07 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs @@ -3151,12 +3151,30 @@ fn test_x64_emit() { "pshufb %xmm11, %xmm2", )); + insns.push(( + Inst::xmm_rm_r(SseOpcode::Packssdw, RegMem::reg(xmm11), w_xmm12), + "66450F6BE3", + "packssdw %xmm11, %xmm12", + )); + insns.push(( Inst::xmm_rm_r(SseOpcode::Packsswb, RegMem::reg(xmm11), w_xmm2), "66410F63D3", "packsswb %xmm11, %xmm2", )); + insns.push(( + Inst::xmm_rm_r(SseOpcode::Packusdw, RegMem::reg(xmm13), w_xmm6), + "66410F382BF5", + "packusdw %xmm13, %xmm6", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Packuswb, RegMem::reg(xmm9), w_xmm4), + "66410F67E1", + "packuswb %xmm9, %xmm4", + )); + insns.push(( Inst::xmm_rm_r(SseOpcode::Punpckhbw, RegMem::reg(xmm3), w_xmm2), "660F68D3", @@ -3183,6 +3201,81 @@ fn test_x64_emit() { "cvttps2dq %xmm9, %xmm8", )); + // ======================================================== + // XMM_RM_R: Packed Move + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pmovsxbd, 
RegMem::reg(xmm6), w_xmm8), + "66440F3821C6", + "pmovsxbd %xmm6, %xmm8", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pmovsxbw, RegMem::reg(xmm9), w_xmm10), + "66450F3820D1", + "pmovsxbw %xmm9, %xmm10", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pmovsxbq, RegMem::reg(xmm1), w_xmm1), + "660F3822C9", + "pmovsxbq %xmm1, %xmm1", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pmovsxwd, RegMem::reg(xmm13), w_xmm10), + "66450F3823D5", + "pmovsxwd %xmm13, %xmm10", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pmovsxwq, RegMem::reg(xmm12), w_xmm12), + "66450F3824E4", + "pmovsxwq %xmm12, %xmm12", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pmovsxdq, RegMem::reg(xmm10), w_xmm8), + "66450F3825C2", + "pmovsxdq %xmm10, %xmm8", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pmovzxbd, RegMem::reg(xmm5), w_xmm6), + "660F3831F5", + "pmovzxbd %xmm5, %xmm6", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pmovzxbw, RegMem::reg(xmm5), w_xmm13), + "66440F3830ED", + "pmovzxbw %xmm5, %xmm13", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pmovzxbq, RegMem::reg(xmm10), w_xmm11), + "66450F3832DA", + "pmovzxbq %xmm10, %xmm11", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pmovzxwd, RegMem::reg(xmm2), w_xmm10), + "66440F3833D2", + "pmovzxwd %xmm2, %xmm10", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pmovzxwq, RegMem::reg(xmm7), w_xmm4), + "660F3834E7", + "pmovzxwq %xmm7, %xmm4", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pmovzxdq, RegMem::reg(xmm3), w_xmm4), + "660F3835E3", + "pmovzxdq %xmm3, %xmm4", + )); + // XMM_Mov_R_M: float stores insns.push(( Inst::xmm_mov_r_m(SseOpcode::Movss, xmm15, Amode::imm_reg(128, r12)), @@ -3406,6 +3499,11 @@ fn test_x64_emit() { "410FC2FF00", "cmpps $0, %xmm15, %xmm7", )); + insns.push(( + Inst::xmm_rm_r_imm(SseOpcode::Palignr, RegMem::reg(xmm1), w_xmm9, 3, false), + "66440F3A0FC903", + "palignr $3, %xmm1, %xmm9", + )); // ======================================================== // Pertaining to atomics. diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index bbe886c24b..09ded3c948 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -2722,6 +2722,7 @@ fn lower_insn_to_regs>( } else { if op == Opcode::FcvtToSintSat { // Sets destination to zero if float is NaN + assert_eq!(types::F32X4, ctx.input_ty(insn, 0)); let tmp = ctx.alloc_tmp(RegClass::V128, types::I32X4); ctx.emit(Inst::xmm_unary_rm_r( SseOpcode::Movapd, @@ -2776,7 +2777,118 @@ fn lower_insn_to_regs>( dst, )); } else if op == Opcode::FcvtToUintSat { - unimplemented!("f32x4.convert_i32x4_u"); + // The algorithm for converting floats to unsigned ints is a little tricky. The + // complication arises because we are converting via a signed 32-bit int with a positive + // integer range from 0..INT_MAX (0x0..0x7FFFFFFF) to an unsigned integer with an extended + // range up to UINT_MAX. It's this range from (INT_MAX+1)..UINT_MAX + // (0x80000000..0xFFFFFFFF) that needs to be accounted for as a special case, since our + // conversion instruction (cvttps2dq) only converts as high as INT_MAX (0x7FFFFFFF), but + // conveniently sets underflows and overflows (smaller than MIN_INT or larger than + // MAX_INT) to INT_MAX+1 (0x80000000). Noting that the range (INT_MAX+1)..UINT_MAX spans + // precisely INT_MAX+1 values, we can correctly account for and convert every value in this range + // if we simply subtract INT_MAX+1 before doing the cvttps2dq conversion.
After the subtraction + // every value originally in (INT_MAX+1)..UINT_MAX is now in the range (0..INT_MAX). + // After the conversion we add INT_MAX+1 back to the converted value, noting again that + // the values we are trying to account for were already set to INT_MAX+1 during the original conversion. + // We simply have to create a mask and make sure we are adding together only the lanes that need + // to be accounted for. Digesting it all, the steps then are: + // + // Step 1 - Account for NaN and negative floats by setting these src values to zero. + // Step 2 - Make a copy (tmp1) of the src value since we need to convert twice for + // reasons described above. + // Step 3 - Convert the original src values. This will properly convert all floats up to INT_MAX. + // Step 4 - Subtract INT_MAX+1 from the copy set (tmp1). Note that all zero and negative values after + // the subtraction are those that were originally in the range (0..INT_MAX). This will come in + // handy during step 7 when we zero negative lanes. + // Step 5 - Create a bit mask for tmp1 marking the lanes that are still at or above INT_MAX+1 even after + // the subtraction (i.e. were originally greater than UINT_MAX); these are the lanes that must saturate. + // Step 6 - Convert the second set of values (tmp1). + // Step 7 - Prep the converted second set by zeroing out negative lanes (these have already been + // converted correctly with the first set) and by setting overflow lanes to 0x7FFFFFFF, + // as this will allow us to properly saturate overflow lanes when adding to 0x80000000. + // Step 8 - Add the original converted src and the converted tmp1, where float values originally less + // than or equal to INT_MAX will be unchanged, float values originally between INT_MAX+1 and + // UINT_MAX will add together (INT_MAX+1) + (SRC - (INT_MAX+1)), and float values originally + // greater than UINT_MAX will be saturated to UINT_MAX (0xFFFFFFFF) after adding (0x80000000 + 0x7FFFFFFF). + // + // + // The table below illustrates the result after each step where it matters for the converted set. + // Note the original value range (original src set) is the final dst in Step 8: + // + // Original src set: + // | Original Value Range | Step 1 | Step 3 | Step 8 | + // | -FLT_MAX..FLT_MAX | 0.0..FLT_MAX | 0..INT_MAX(w/overflow) | 0..UINT_MAX(w/saturation) | + // + // Copied src set (tmp1): + // | Step 2 | Step 4 | + // | 0.0..FLT_MAX | (0.0-(INT_MAX+1))..(FLT_MAX-(INT_MAX+1)) | + // + // | Step 6 | Step 7 | + // | (0-(INT_MAX+1))..(UINT_MAX-(INT_MAX+1))(w/overflow) | ((INT_MAX+1)-(INT_MAX+1))..INT_MAX | + + // Create temporaries + assert_eq!(types::F32X4, ctx.input_ty(insn, 0)); + let tmp1 = ctx.alloc_tmp(RegClass::V128, types::I32X4); + let tmp2 = ctx.alloc_tmp(RegClass::V128, types::I32X4); + + // Converting to unsigned int, so if the float src is negative or NaN + // we first set it to zero. + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp2), tmp2)); + ctx.emit(Inst::gen_move(dst, src, input_ty)); + ctx.emit(Inst::xmm_rm_r(SseOpcode::Maxps, RegMem::from(tmp2), dst)); + + // Set tmp2 to INT_MAX+1. It is important to note here that it looks + // like we are only converting INT_MAX (0x7FFFFFFF), but in fact, because + // single-precision IEEE-754 floats can only accurately represent contiguous + // integers up to 2^23, values outside of this range round to the closest + // integer that can be represented. In the case of INT_MAX, this value gets + // represented as 0x4f000000, which is the floating-point encoding of (INT_MAX+1).
+ + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pcmpeqd, RegMem::from(tmp2), tmp2)); + ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrld, RegMemImm::imm(1), tmp2)); + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Cvtdq2ps, + RegMem::from(tmp2), + tmp2, + )); + + // Make a copy of these lanes and then do the first conversion. + // Overflow lanes greater than the maximum allowed signed value will + // be set to 0x80000000. Negative and NaN lanes will be 0x0. + ctx.emit(Inst::xmm_mov(SseOpcode::Movaps, RegMem::from(dst), tmp1)); + ctx.emit(Inst::xmm_rm_r(SseOpcode::Cvttps2dq, RegMem::from(dst), dst)); + + // Set lanes to src - max_signed_int + ctx.emit(Inst::xmm_rm_r(SseOpcode::Subps, RegMem::from(tmp2), tmp1)); + + // Create mask for all positive lanes that need to saturate (i.e. greater than + // or equal to the maximum allowable unsigned int). + let cond = FcmpImm::from(FloatCC::LessThanOrEqual); + ctx.emit(Inst::xmm_rm_r_imm( + SseOpcode::Cmpps, + RegMem::from(tmp1), + tmp2, + cond.encode(), + false, + )); + + // Convert the set of lanes that have max_signed_int factored out. + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Cvttps2dq, + RegMem::from(tmp1), + tmp1, + )); + + // Prepare the converted lanes by zeroing negative lanes and prepping lanes + // that have positive overflow (based on the mask) by setting these lanes + // to 0x7FFFFFFF. + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp2), tmp1)); + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp2), tmp2)); + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmaxsd, RegMem::from(tmp2), tmp1)); + + // Add this second set of converted lanes to the original to properly handle + // values greater than max signed int. + ctx.emit(Inst::xmm_rm_r(SseOpcode::Paddd, RegMem::from(tmp1), dst)); } else { // Since this branch is also guarded by a check for vector types // neither Opcode::FcvtToUint nor Opcode::FcvtToSint can reach here @@ -2786,7 +2898,127 @@ fn lower_insn_to_regs>( } } } - + Opcode::UwidenHigh | Opcode::UwidenLow | Opcode::SwidenHigh | Opcode::SwidenLow => { + let input_ty = ctx.input_ty(insn, 0); + let output_ty = ctx.output_ty(insn, 0); + let src = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + if output_ty.is_vector() { + match op { + Opcode::SwidenLow => match (input_ty, output_ty) { + (types::I8X16, types::I16X8) => { + ctx.emit(Inst::gen_move(dst, src, output_ty)); + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmovsxbw, RegMem::from(dst), dst)); + } + (types::I16X8, types::I32X4) => { + ctx.emit(Inst::gen_move(dst, src, output_ty)); + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmovsxwd, RegMem::from(dst), dst)); + } + _ => unreachable!(), + }, + Opcode::SwidenHigh => match (input_ty, output_ty) { + (types::I8X16, types::I16X8) => { + ctx.emit(Inst::gen_move(dst, src, output_ty)); + ctx.emit(Inst::xmm_rm_r_imm( + SseOpcode::Palignr, + RegMem::reg(src), + dst, + 8, + false, + )); + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmovsxbw, RegMem::from(dst), dst)); + } + (types::I16X8, types::I32X4) => { + ctx.emit(Inst::gen_move(dst, src, output_ty)); + ctx.emit(Inst::xmm_rm_r_imm( + SseOpcode::Palignr, + RegMem::reg(src), + dst, + 8, + false, + )); + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmovsxwd, RegMem::from(dst), dst)); + } + _ => unreachable!(), + }, + Opcode::UwidenLow => match (input_ty, output_ty) { + (types::I8X16, types::I16X8) => { + ctx.emit(Inst::gen_move(dst, src, output_ty)); + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmovzxbw, RegMem::from(dst), dst)); + } + (types::I16X8, types::I32X4) => { + ctx.emit(Inst::gen_move(dst, src, output_ty)); +
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmovzxwd, RegMem::from(dst), dst)); + } + _ => unreachable!(), + }, + Opcode::UwidenHigh => match (input_ty, output_ty) { + (types::I8X16, types::I16X8) => { + ctx.emit(Inst::gen_move(dst, src, output_ty)); + ctx.emit(Inst::xmm_rm_r_imm( + SseOpcode::Palignr, + RegMem::reg(src), + dst, + 8, + false, + )); + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmovzxbw, RegMem::from(dst), dst)); + } + (types::I16X8, types::I32X4) => { + ctx.emit(Inst::gen_move(dst, src, output_ty)); + ctx.emit(Inst::xmm_rm_r_imm( + SseOpcode::Palignr, + RegMem::reg(src), + dst, + 8, + false, + )); + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmovzxwd, RegMem::from(dst), dst)); + } + _ => unreachable!(), + }, + _ => unreachable!(), + } + } else { + panic!("Unsupported non-vector type for widen instruction {:?}", ty); + } + } + Opcode::Snarrow | Opcode::Unarrow => { + let input_ty = ctx.input_ty(insn, 0); + let output_ty = ctx.output_ty(insn, 0); + let src1 = put_input_in_reg(ctx, inputs[0]); + let src2 = put_input_in_reg(ctx, inputs[1]); + let dst = get_output_reg(ctx, outputs[0]); + if output_ty.is_vector() { + match op { + Opcode::Snarrow => match (input_ty, output_ty) { + (types::I16X8, types::I8X16) => { + ctx.emit(Inst::gen_move(dst, src1, input_ty)); + ctx.emit(Inst::xmm_rm_r(SseOpcode::Packsswb, RegMem::reg(src2), dst)); + } + (types::I32X4, types::I16X8) => { + ctx.emit(Inst::gen_move(dst, src1, input_ty)); + ctx.emit(Inst::xmm_rm_r(SseOpcode::Packssdw, RegMem::reg(src2), dst)); + } + _ => unreachable!(), + }, + Opcode::Unarrow => match (input_ty, output_ty) { + (types::I16X8, types::I8X16) => { + ctx.emit(Inst::gen_move(dst, src1, input_ty)); + ctx.emit(Inst::xmm_rm_r(SseOpcode::Packuswb, RegMem::reg(src2), dst)); + } + (types::I32X4, types::I16X8) => { + ctx.emit(Inst::gen_move(dst, src1, input_ty)); + ctx.emit(Inst::xmm_rm_r(SseOpcode::Packusdw, RegMem::reg(src2), dst)); + } + _ => unreachable!(), + }, + _ => unreachable!(), + } + } else { + panic!("Unsupported non-vector type for narrow instruction {:?}", ty); + } + } Opcode::Bitcast => { let input_ty = ctx.input_ty(insn, 0); let output_ty = ctx.output_ty(insn, 0);
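For reference, the FcvtToUintSat lowering above is easier to follow against a scalar model. The sketch below is not part of the patch; the function name and structure are purely illustrative, and it assumes the cvttps2dq overflow behavior described in the comments (out-of-range lanes become 0x80000000). Each step mirrors one of the emitted instructions per lane:

// Per-lane model of the FcvtToUintSat (f32 -> u32 with saturation) lowering above.
// Purely illustrative: the real code operates on whole XMM registers with
// maxps/cvttps2dq/subps/cmpps/pxor/pmaxsd/paddd.
fn fcvt_to_uint_sat_lane(src: f32) -> u32 {
    // cvtdq2ps(0x7FFF_FFFF) rounds up to 2^31, i.e. INT_MAX+1 (bit pattern 0x4F00_0000).
    const INT_MAX_PLUS_1: f32 = 2147483648.0;

    // Step 1 (pxor + maxps): NaN and negative lanes become zero.
    let clamped = if src.is_nan() || src < 0.0 { 0.0 } else { src };

    // Step 3 (cvttps2dq): lanes at or above 2^31 overflow to 0x8000_0000.
    let lo: u32 = if clamped >= INT_MAX_PLUS_1 { 0x8000_0000 } else { clamped as i32 as u32 };

    // Steps 2 and 4 (movaps + subps): factor INT_MAX+1 out of a copy of the lane.
    let shifted = clamped - INT_MAX_PLUS_1;

    // Step 5 (cmpps LessThanOrEqual): all-ones mask for lanes that still overflow,
    // i.e. whose original value was greater than UINT_MAX.
    let overflow_mask: u32 = if INT_MAX_PLUS_1 <= shifted { 0xFFFF_FFFF } else { 0 };

    // Step 6 (cvttps2dq): convert the shifted copy; >= 2^31 again overflows to 0x8000_0000.
    let hi: u32 = if shifted >= INT_MAX_PLUS_1 { 0x8000_0000 } else { shifted as i32 as u32 };

    // Step 7 (pxor + pmaxsd): overflowed lanes flip 0x8000_0000 -> 0x7FFF_FFFF,
    // lanes whose original value was below 2^31 clamp to zero.
    let hi = ((hi ^ overflow_mask) as i32).max(0) as u32;

    // Step 8 (paddd): 0x8000_0000 + (src - 2^31) reassembles values in the upper half
    // of the unsigned range; 0x8000_0000 + 0x7FFF_FFFF saturates to u32::MAX.
    lo.wrapping_add(hi)
}

Lanes originally below 2^31 come entirely from the first conversion (the second contributes zero after the pmaxsd), while lanes in the upper half of the unsigned range are reassembled by the final add.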
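The SwidenHigh/UwidenHigh lowerings lean on palignr with an immediate of 8 to rotate the upper eight bytes of the source into the low half before the pmovsx/pmovzx extension; the *_Low variants extend the low bytes directly. A rough per-lane model of the I8X16-to-I16X8 case (illustrative only, not part of the patch):

// Models swiden_high.i8x16: palignr $8 leaves src bytes 8..15 in the low half of dst,
// then pmovsxbw sign-extends each of those bytes to a 16-bit lane.
fn swiden_high_i8x16(src: [i8; 16]) -> [i16; 8] {
    let mut out = [0i16; 8];
    for lane in 0..8 {
        // uwiden_high would zero-extend a [u8; 16] here instead.
        out[lane] = src[8 + lane] as i16;
    }
    out
}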
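Snarrow and Unarrow map onto the pack instructions, which narrow two full vectors into one with saturation: the destination operand supplies the low output lanes and the second operand the high lanes, which is why the lowering moves src1 into dst first. A sketch of the Snarrow I16X8-to-I8X16 case under those assumptions (Unarrow would clamp to the unsigned range via packuswb/packusdw, so negative lanes become 0):

// Models snarrow.i16x8: packsswb writes the saturated lanes of the first operand (dst)
// into bytes 0..7 and the saturated lanes of the second operand into bytes 8..15.
fn snarrow_i16x8(a: [i16; 8], b: [i16; 8]) -> [i8; 16] {
    let sat = |x: i16| x.clamp(i8::MIN as i16, i8::MAX as i16) as i8;
    let mut out = [0i8; 16];
    for i in 0..8 {
        out[i] = sat(a[i]);
        out[i + 8] = sat(b[i]);
    }
    out
}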