Add x86_pblendw instruction

This instruction is necessary for lowering `fcvt_from_uint`.
2020-05-26 14:12:25 -07:00
parent 546fc9ddf1
commit 772ce73f7f
5 changed files with 35 additions and 0 deletions
--- a/cranelift/codegen/meta/src/isa/x86/encodings.rs
+++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs
@@ -1639,6 +1639,7 @@ fn define_simd(
    let x86_movlhps = x86.by_name("x86_movlhps");
    let x86_movsd = x86.by_name("x86_movsd");
    let x86_packss = x86.by_name("x86_packss");
+    let x86_pblendw = x86.by_name("x86_pblendw");
    let x86_pextr = x86.by_name("x86_pextr");
    let x86_pinsr = x86.by_name("x86_pinsr");
    let x86_pmaxs = x86.by_name("x86_pmaxs");
@@ -1744,6 +1745,13 @@ fn define_simd(
        e.enc_both_inferred_maybe_isap(instruction, template, Some(use_sse41_simd));
    }

+    // PBLENDW, blend 16-bit lanes; each bit of a u8 immediate selects the source of one lane.
+    for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 16) {
+        let instruction = x86_pblendw.bind(vector(ty, sse_vector_size));
+        let template = rec_fa_ib.opcodes(&PBLENDW);
+        e.enc_both_inferred_maybe_isap(instruction, template, Some(use_sse41_simd));
+    }
+
    // SIMD scalar_to_vector; this uses MOV to copy the scalar value to an XMM register; according
    // to the Intel manual: "When the destination operand is an XMM register, the source operand is
    // written to the low doubleword of the register and the register is zero-extended to 128 bits."
--- a/cranelift/codegen/meta/src/isa/x86/instructions.rs
+++ b/cranelift/codegen/meta/src/isa/x86/instructions.rs
@@ -333,6 +333,20 @@ pub(crate) fn define(
        .operands_out(vec![a]),
    );

+    let mask = &Operand::new("mask", uimm8).with_doc("mask to select lanes from b");
+    ig.push(
+        Inst::new(
+            "x86_pblendw",
+            r#"
+    Blend packed words using an immediate mask. Each bit of the 8-bit immediate corresponds to a 
+    lane in ``b``: if the bit is set, the lane is copied into ``a``.
+    "#,
+            &formats.ternary_imm8,
+        )
+        .operands_in(vec![a, b, mask])
+        .operands_out(vec![a]),
+    );
+
    let Idx = &Operand::new("Idx", uimm8).with_doc("Lane index");
    let x = &Operand::new("x", TxN);
    let a = &Operand::new("a", &TxN.lane_of());
--- a/cranelift/codegen/meta/src/isa/x86/opcodes.rs
+++ b/cranelift/codegen/meta/src/isa/x86/opcodes.rs
@@ -347,6 +347,10 @@ pub static PAVGW: [u8; 3] = [0x66, 0x0f, 0xE3];
 /// in XMM0 and store the values into xmm1 (SSE4.1).
 pub static PBLENDVB: [u8; 4] = [0x66, 0x0f, 0x38, 0x10];

+/// Select words from xmm1 and xmm2/m128 according to the mask specified in imm8 and store the
+/// values into xmm1 (SSE4.1).
+pub static PBLENDW: [u8; 4] = [0x66, 0x0f, 0x3a, 0x0e];
+
 /// Compare packed data for equal (SSE2).
 pub static PCMPEQB: [u8; 3] = [0x66, 0x0f, 0x74];

--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -2046,6 +2046,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
        | Opcode::X86Pop
        | Opcode::X86Bsr
        | Opcode::X86Bsf
+        | Opcode::X86Pblendw
        | Opcode::X86Pshufd
        | Opcode::X86Pshufb
        | Opcode::X86Pextr
--- a/cranelift/filetests/filetests/isa/x86/simd-lane-access-binemit.clif
+++ b/cranelift/filetests/filetests/isa/x86/simd-lane-access-binemit.clif
@@ -96,6 +96,14 @@ block0:
            return
 }

+;; blend
+
+function %pblendw(b16x8, b16x8) {
+block0(v0: b16x8 [%xmm10], v1: b16x8 [%xmm2]):
+[-, %xmm10] v2 = x86_pblendw v0, v1, 0x55   ; bin: 66 44 0f 3a 0e d2 55
+            return
+}
+
 ;; pack/unpack

 function %unpack_high_i8x16(i8x16, i8x16) {