diff --git a/cranelift/codegen/meta/src/isa/x86/encodings.rs b/cranelift/codegen/meta/src/isa/x86/encodings.rs index e226117149..d507abc0f5 100644 --- a/cranelift/codegen/meta/src/isa/x86/encodings.rs +++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs @@ -1639,6 +1639,7 @@ fn define_simd( let x86_movlhps = x86.by_name("x86_movlhps"); let x86_movsd = x86.by_name("x86_movsd"); let x86_packss = x86.by_name("x86_packss"); + let x86_pblendw = x86.by_name("x86_pblendw"); let x86_pextr = x86.by_name("x86_pextr"); let x86_pinsr = x86.by_name("x86_pinsr"); let x86_pmaxs = x86.by_name("x86_pmaxs"); @@ -1744,6 +1745,13 @@ fn define_simd( e.enc_both_inferred_maybe_isap(instruction, template, Some(use_sse41_simd)); } + // PBLENDW: blend 16-bit lanes, choosing each lane by one bit of a u8 immediate (SSE4.1). + for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 16) { + let instruction = x86_pblendw.bind(vector(ty, sse_vector_size)); + let template = rec_fa_ib.opcodes(&PBLENDW); + e.enc_both_inferred_maybe_isap(instruction, template, Some(use_sse41_simd)); + } + // SIMD scalar_to_vector; this uses MOV to copy the scalar value to an XMM register; according // to the Intel manual: "When the destination operand is an XMM register, the source operand is // written to the low doubleword of the register and the register is zero-extended to 128 bits." diff --git a/cranelift/codegen/meta/src/isa/x86/instructions.rs b/cranelift/codegen/meta/src/isa/x86/instructions.rs index 5e9c80e6ad..4afbc88747 100644 --- a/cranelift/codegen/meta/src/isa/x86/instructions.rs +++ b/cranelift/codegen/meta/src/isa/x86/instructions.rs @@ -333,6 +333,20 @@ pub(crate) fn define( .operands_out(vec![a]), ); + let mask = &Operand::new("mask", uimm8).with_doc("mask to select lanes from b"); + ig.push( + Inst::new( + "x86_pblendw", + r#" + Blend packed words using an immediate mask. Each bit of the 8-bit immediate corresponds to a + lane in ``b``: if the bit is set, the lane is copied into ``a``. 
+ "#, + &formats.ternary_imm8, + ) + .operands_in(vec![a, b, mask]) + .operands_out(vec![a]), + ); + let Idx = &Operand::new("Idx", uimm8).with_doc("Lane index"); let x = &Operand::new("x", TxN); let a = &Operand::new("a", &TxN.lane_of()); diff --git a/cranelift/codegen/meta/src/isa/x86/opcodes.rs b/cranelift/codegen/meta/src/isa/x86/opcodes.rs index 23efc620d2..d2391fe2ee 100644 --- a/cranelift/codegen/meta/src/isa/x86/opcodes.rs +++ b/cranelift/codegen/meta/src/isa/x86/opcodes.rs @@ -347,6 +347,10 @@ pub static PAVGW: [u8; 3] = [0x66, 0x0f, 0xE3]; /// in XMM0 and store the values into xmm1 (SSE4.1). pub static PBLENDVB: [u8; 4] = [0x66, 0x0f, 0x38, 0x10]; +/// Select words from xmm1 and xmm2/m128 using the mask specified in imm8 and store the values into +/// xmm1 (SSE4.1). +pub static PBLENDW: [u8; 4] = [0x66, 0x0f, 0x3a, 0x0e]; + /// Compare packed data for equal (SSE2). pub static PCMPEQB: [u8; 3] = [0x66, 0x0f, 0x74]; diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs index 2e17cd7b0e..5805ab63c4 100644 --- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs +++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs @@ -2046,6 +2046,7 @@ pub(crate) fn lower_insn_to_regs>( | Opcode::X86Pop | Opcode::X86Bsr | Opcode::X86Bsf + | Opcode::X86Pblendw | Opcode::X86Pshufd | Opcode::X86Pshufb | Opcode::X86Pextr diff --git a/cranelift/filetests/filetests/isa/x86/simd-lane-access-binemit.clif b/cranelift/filetests/filetests/isa/x86/simd-lane-access-binemit.clif index e5eea1f637..24bc8cfa24 100644 --- a/cranelift/filetests/filetests/isa/x86/simd-lane-access-binemit.clif +++ b/cranelift/filetests/filetests/isa/x86/simd-lane-access-binemit.clif @@ -96,6 +96,14 @@ block0: return } +;; blend + +function %pblendw(b16x8, b16x8) { +block0(v0: b16x8 [%xmm10], v1: b16x8 [%xmm2]): +[-, %xmm10] v2 = x86_pblendw v0, v1, 0x55 ; bin: 66 44 0f 3a 0e d2 55 + return +} + ;; pack/unpack function %unpack_high_i8x16(i8x16, i8x16) {