diff --git a/cranelift/codegen/meta/src/isa/x86/encodings.rs b/cranelift/codegen/meta/src/isa/x86/encodings.rs index 3f4bb8154b..7aee35bdb3 100644 --- a/cranelift/codegen/meta/src/isa/x86/encodings.rs +++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs @@ -1619,6 +1619,7 @@ fn define_simd( let x86_insertps = x86.by_name("x86_insertps"); let x86_movlhps = x86.by_name("x86_movlhps"); let x86_movsd = x86.by_name("x86_movsd"); + let x86_packss = x86.by_name("x86_packss"); let x86_pextr = x86.by_name("x86_pextr"); let x86_pinsr = x86.by_name("x86_pinsr"); let x86_pmaxs = x86.by_name("x86_pmaxs"); @@ -1804,6 +1805,10 @@ fn define_simd( rec_fa.opcodes(low), ); } + for (ty, opcodes) in &[(I16, &PACKSSWB), (I32, &PACKSSDW)] { + let x86_packss = x86_packss.bind(vector(*ty, sse_vector_size)); + e.enc_both_inferred(x86_packss, rec_fa.opcodes(*opcodes)); + } // SIMD bitcast all 128-bit vectors to each other (for legalizing splat.x16x8). for from_type in ValueType::all_lane_types().filter(allowed_simd_type) { diff --git a/cranelift/codegen/meta/src/isa/x86/instructions.rs b/cranelift/codegen/meta/src/isa/x86/instructions.rs index 9a04f34b7c..2675937353 100644 --- a/cranelift/codegen/meta/src/isa/x86/instructions.rs +++ b/cranelift/codegen/meta/src/isa/x86/instructions.rs @@ -6,7 +6,6 @@ use crate::cdsl::instructions::{ use crate::cdsl::operands::Operand; use crate::cdsl::types::ValueType; use crate::cdsl::typevar::{Interval, TypeSetBuilder, TypeVar}; - use crate::shared::entities::EntityRefs; use crate::shared::formats::Formats; use crate::shared::immediates::Immediates; @@ -275,7 +274,7 @@ pub(crate) fn define( ); let a = &Operand::new("a", TxN).with_doc("A vector value (i.e. held in an XMM register)"); let b = &Operand::new("b", TxN).with_doc("A vector value (i.e. 
held in an XMM register)"); - let i = &Operand::new("i", uimm8,).with_doc( "An ordering operand controlling the copying of data from the source to the destination; see PSHUFD in Intel manual for details"); + let i = &Operand::new("i", uimm8).with_doc("An ordering operand controlling the copying of data from the source to the destination; see PSHUFD in Intel manual for details"); ig.push( Inst::new( @@ -410,6 +409,35 @@ pub(crate) fn define( .operands_out(vec![a]), ); + let I16xN = &TypeVar::new( + "I16xN", + "A SIMD vector type containing integers 16-bits wide and up", + TypeSetBuilder::new() + .ints(16..32) + .simd_lanes(4..8) + .includes_scalars(false) + .build(), + ); + + let x = &Operand::new("x", I16xN); + let y = &Operand::new("y", I16xN); + let a = &Operand::new("a", &I16xN.split_lanes()); + + ig.push( + Inst::new( + "x86_packss", + r#" + Convert packed signed integers the lanes of ``x`` and ``y`` into half-width integers, using + signed saturation to handle overflows. For example, with notional i16x2 vectors, where + ``x = [x1, x0]`` and ``y = [y1, y0]``, this operation would result in + ``a = [y1', y0', x1', x0']`` (using the Intel manual's right-to-left lane ordering). + "#, + &formats.binary, + ) + .operands_in(vec![x, y]) + .operands_out(vec![a]), + ); + let x = &Operand::new("x", FxN); let y = &Operand::new("y", FxN); let a = &Operand::new("a", FxN); diff --git a/cranelift/codegen/meta/src/isa/x86/opcodes.rs b/cranelift/codegen/meta/src/isa/x86/opcodes.rs index 7008c9a91e..d34761d246 100644 --- a/cranelift/codegen/meta/src/isa/x86/opcodes.rs +++ b/cranelift/codegen/meta/src/isa/x86/opcodes.rs @@ -291,6 +291,14 @@ pub static OR_IMM8_SIGN_EXTEND: [u8; 1] = [0x83]; /// Return the bitwise logical OR of packed single-precision values in xmm and x/m (SSE). pub static ORPS: [u8; 2] = [0x0f, 0x56]; +/// Converts 8 packed signed word integers from xmm1 and from xmm2/m128 into 16 packed signed byte +/// integers in xmm1 using signed saturation (SSE2). 
+pub static PACKSSWB: [u8; 3] = [0x66, 0x0f, 0x63]; + +/// Converts 4 packed signed doubleword integers from xmm1 and from xmm2/m128 into 8 packed signed +/// word integers in xmm1 using signed saturation (SSE2). +pub static PACKSSDW: [u8; 3] = [0x66, 0x0f, 0x6b]; + /// Add packed byte integers from xmm2/m128 and xmm1 (SSE2). pub static PADDB: [u8; 3] = [0x66, 0x0f, 0xfc]; diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs index cca49aef7e..0a6b4908ef 100644 --- a/cranelift/codegen/src/isa/aarch64/lower.rs +++ b/cranelift/codegen/src/isa/aarch64/lower.rs @@ -2375,6 +2375,7 @@ fn lower_insn_to_regs>(ctx: &mut C, insn: IRInst) { | Opcode::X86Pmaxu | Opcode::X86Pmins | Opcode::X86Pminu + | Opcode::X86Packss | Opcode::X86Punpckh | Opcode::X86Punpckl | Opcode::X86ElfTlsGetAddr diff --git a/cranelift/filetests/filetests/isa/x86/simd-lane-access-binemit.clif b/cranelift/filetests/filetests/isa/x86/simd-lane-access-binemit.clif index be0a7beb35..a1ffac1822 100644 --- a/cranelift/filetests/filetests/isa/x86/simd-lane-access-binemit.clif +++ b/cranelift/filetests/filetests/isa/x86/simd-lane-access-binemit.clif @@ -109,3 +109,9 @@ block0(v0: i32x4 [%xmm7], v1: i32x4 [%xmm6]): [-, %xmm7] v2 = x86_punpckl v0, v1 ; bin: 66 0f 62 fe return } + +function %packss_i16x8(i16x8, i16x8) { +block0(v0: i16x8 [%xmm7], v1: i16x8 [%xmm8]): +[-, %xmm7] v2 = x86_packss v0, v1 ; bin: 66 41 0f 63 f8 + return +} diff --git a/cranelift/filetests/filetests/isa/x86/simd-lane-access-run.clif b/cranelift/filetests/filetests/isa/x86/simd-lane-access-run.clif index 3b54057989..115a0be7cb 100644 --- a/cranelift/filetests/filetests/isa/x86/simd-lane-access-run.clif +++ b/cranelift/filetests/filetests/isa/x86/simd-lane-access-run.clif @@ -205,3 +205,16 @@ block0: return v5 } ; run + +function %pack() -> b1 { +block0: + v0 = vconst.i32x4 [0 1 -1 0x0001ffff] + v1 = vconst.i32x4 [4 5 -6 0xffffffff] + v2 = x86_packss v0, v1 + + v3 = vconst.i16x8 [0 1 -1 0x7fff 
4 5 -6 0xffff] + v4 = icmp eq v2, v3 + v5 = vall_true v4 + return v5 +} +; run