Add x86 unpack instructions

Andrew Brown
2020-03-24 17:20:56 -07:00
parent 18c31403e8
commit f5fc09f64a
6 changed files with 109 additions and 0 deletions

@@ -1631,6 +1631,8 @@ fn define_simd(
let x86_psra = x86.by_name("x86_psra");
let x86_psrl = x86.by_name("x86_psrl");
let x86_ptest = x86.by_name("x86_ptest");
let x86_punpckh = x86.by_name("x86_punpckh");
let x86_punpckl = x86.by_name("x86_punpckl");
// Shorthands for recipes.
let rec_evex_reg_vvvv_rm_128 = r.template("evex_reg_vvvv_rm_128");
@@ -1783,6 +1785,26 @@ fn define_simd(
}
}
// SIMD packing/unpacking
for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
let (high, low) = match ty.lane_bits() {
8 => (&PUNPCKHBW, &PUNPCKLBW),
16 => (&PUNPCKHWD, &PUNPCKLWD),
32 => (&PUNPCKHDQ, &PUNPCKLDQ),
64 => (&PUNPCKHQDQ, &PUNPCKLQDQ),
_ => panic!("invalid size for SIMD packing/unpacking"),
};
e.enc_both_inferred(
x86_punpckh.bind(vector(ty, sse_vector_size)),
rec_fa.opcodes(high),
);
e.enc_both_inferred(
x86_punpckl.bind(vector(ty, sse_vector_size)),
rec_fa.opcodes(low),
);
}
// SIMD bitcast all 128-bit vectors to each other (for legalizing splat.x16x8).
for from_type in ValueType::all_lane_types().filter(allowed_simd_type) {
for to_type in

@@ -376,6 +376,40 @@ pub(crate) fn define(
.operands_out(vec![a]),
);
let x = &Operand::new("x", TxN);
let y = &Operand::new("y", TxN);
let a = &Operand::new("a", TxN);
ig.push(
Inst::new(
"x86_punpckh",
r#"
Unpack the high-order lanes of ``x`` and ``y`` and interleave into ``a``. With notional
i8x4 vectors, where ``x = [x3, x2, x1, x0]`` and ``y = [y3, y2, y1, y0]``, this operation
would result in ``a = [y3, x3, y2, x2]`` (using the Intel manual's right-to-left lane
ordering).
"#,
&formats.binary,
)
.operands_in(vec![x, y])
.operands_out(vec![a]),
);
ig.push(
Inst::new(
"x86_punpckl",
r#"
Unpack the low-order lanes of ``x`` and ``y`` and interleave into ``a``. With notional
i8x4 vectors, where ``x = [x3, x2, x1, x0]`` and ``y = [y3, y2, y1, y0]``, this operation
would result in ``a = [y1, x1, y0, x0]`` (using the Intel manual's right-to-left lane
ordering).
"#,
&formats.binary,
)
.operands_in(vec![x, y])
.operands_out(vec![a]),
);
let x = &Operand::new("x", FxN);
let y = &Operand::new("y", FxN);
let a = &Operand::new("a", FxN);
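For clarity, here is a minimal scalar model of the interleaving described in the doc comments above (plain Rust, not part of this commit; the helper names punpckl4/punpckh4 are hypothetical), using the same notional 4-lane vectors:

// Hypothetical helpers, not Cranelift code: model x86_punpckl/x86_punpckh on
// 4-lane vectors. Index 0 is the lowest-order lane, so the arrays below read
// left-to-right from lane 0, the reverse of the Intel manual's notation.
fn punpckl4(x: [u8; 4], y: [u8; 4]) -> [u8; 4] {
    // Low-order unpack: a = [y1, x1, y0, x0] in right-to-left notation.
    [x[0], y[0], x[1], y[1]]
}

fn punpckh4(x: [u8; 4], y: [u8; 4]) -> [u8; 4] {
    // High-order unpack: a = [y3, x3, y2, x2] in right-to-left notation.
    [x[2], y[2], x[3], y[3]]
}

fn main() {
    let x = [0u8, 1, 2, 3]; // lanes x0..x3
    let y = [10u8, 11, 12, 13]; // lanes y0..y3
    assert_eq!(punpckl4(x, y), [0, 10, 1, 11]);
    assert_eq!(punpckh4(x, y), [2, 12, 3, 13]);
}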

@@ -537,6 +537,30 @@ pub static PSUBUSW: [u8; 3] = [0x66, 0x0f, 0xd9];
/// 0s (SSE4.1).
pub static PTEST: [u8; 4] = [0x66, 0x0f, 0x38, 0x17];
/// Unpack and interleave high-order bytes from xmm1 and xmm2/m128 into xmm1 (SSE2).
pub static PUNPCKHBW: [u8; 3] = [0x66, 0x0f, 0x68];
/// Unpack and interleave high-order words from xmm1 and xmm2/m128 into xmm1 (SSE2).
pub static PUNPCKHWD: [u8; 3] = [0x66, 0x0f, 0x69];
/// Unpack and interleave high-order doublewords from xmm1 and xmm2/m128 into xmm1 (SSE2).
pub static PUNPCKHDQ: [u8; 3] = [0x66, 0x0f, 0x6A];
/// Unpack and interleave high-order quadwords from xmm1 and xmm2/m128 into xmm1 (SSE2).
pub static PUNPCKHQDQ: [u8; 3] = [0x66, 0x0f, 0x6D];
/// Unpack and interleave low-order bytes from xmm1 and xmm2/m128 into xmm1 (SSE2).
pub static PUNPCKLBW: [u8; 3] = [0x66, 0x0f, 0x60];
/// Unpack and interleave low-order words from xmm1 and xmm2/m128 into xmm1 (SSE2).
pub static PUNPCKLWD: [u8; 3] = [0x66, 0x0f, 0x61];
/// Unpack and interleave low-order doublewords from xmm1 and xmm2/m128 into xmm1 (SSE2).
pub static PUNPCKLDQ: [u8; 3] = [0x66, 0x0f, 0x62];
/// Unpack and interleave low-order quadwords from xmm1 and xmm2/m128 into xmm1 (SSE2).
pub static PUNPCKLQDQ: [u8; 3] = [0x66, 0x0f, 0x6C];
/// Push r{16,32,64}.
pub static PUSH_REG: [u8; 1] = [0x50];
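As a sanity check on these tables, a minimal sketch of the byte layout (illustrative only, not how the fa recipe emits code; the modrm_rr helper is hypothetical): the arrays above hold the prefix, escape, and opcode bytes, and the register-register form appends a ModRM byte with mod=0b11.

// Illustrative only: assemble punpckhbw xmm0, xmm1 by hand from the opcode
// bytes above plus a ModRM byte (mod=0b11, reg=xmm0, rm=xmm1).
fn modrm_rr(reg: u8, rm: u8) -> u8 {
    0b1100_0000 | ((reg & 0b111) << 3) | (rm & 0b111)
}

fn main() {
    const PUNPCKHBW: [u8; 3] = [0x66, 0x0f, 0x68];
    let mut bytes = PUNPCKHBW.to_vec();
    bytes.push(modrm_rr(0 /* xmm0 */, 1 /* xmm1 */));
    assert_eq!(bytes, vec![0x66, 0x0f, 0x68, 0xc1]);
}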