Add x86 unpack instructions

Andrew Brown
2020-03-24 17:20:56 -07:00
parent 18c31403e8
commit f5fc09f64a
6 changed files with 109 additions and 0 deletions


@@ -1631,6 +1631,8 @@ fn define_simd(
    let x86_psra = x86.by_name("x86_psra");
    let x86_psrl = x86.by_name("x86_psrl");
    let x86_ptest = x86.by_name("x86_ptest");
    let x86_punpckh = x86.by_name("x86_punpckh");
    let x86_punpckl = x86.by_name("x86_punpckl");

    // Shorthands for recipes.
    let rec_evex_reg_vvvv_rm_128 = r.template("evex_reg_vvvv_rm_128");
@@ -1783,6 +1785,26 @@ fn define_simd(
        }
    }

    // SIMD packing/unpacking
    for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
        let (high, low) = match ty.lane_bits() {
            8 => (&PUNPCKHBW, &PUNPCKLBW),
            16 => (&PUNPCKHWD, &PUNPCKLWD),
            32 => (&PUNPCKHDQ, &PUNPCKLDQ),
            64 => (&PUNPCKHQDQ, &PUNPCKLQDQ),
            _ => panic!("invalid size for SIMD packing/unpacking"),
        };
        e.enc_both_inferred(
            x86_punpckh.bind(vector(ty, sse_vector_size)),
            rec_fa.opcodes(high),
        );
        e.enc_both_inferred(
            x86_punpckl.bind(vector(ty, sse_vector_size)),
            rec_fa.opcodes(low),
        );
    }

    // SIMD bitcast all 128-bit vectors to each other (for legalizing splat.x16x8).
    for from_type in ValueType::all_lane_types().filter(allowed_simd_type) {
        for to_type in


@@ -376,6 +376,40 @@ pub(crate) fn define(
        .operands_out(vec![a]),
    );

    let x = &Operand::new("x", TxN);
    let y = &Operand::new("y", TxN);
    let a = &Operand::new("a", TxN);
    ig.push(
        Inst::new(
            "x86_punpckh",
            r#"
        Unpack the high-order lanes of ``x`` and ``y`` and interleave into ``a``. With notional
        i8x4 vectors, where ``x = [x3, x2, x1, x0]`` and ``y = [y3, y2, y1, y0]``, this operation
        would result in ``a = [y3, x3, y2, x2]`` (using the Intel manual's right-to-left lane
        ordering).
        "#,
            &formats.binary,
        )
        .operands_in(vec![x, y])
        .operands_out(vec![a]),
    );

    ig.push(
        Inst::new(
            "x86_punpckl",
            r#"
        Unpack the low-order lanes of ``x`` and ``y`` and interleave into ``a``. With notional
        i8x4 vectors, where ``x = [x3, x2, x1, x0]`` and ``y = [y3, y2, y1, y0]``, this operation
        would result in ``a = [y1, x1, y0, x0]`` (using the Intel manual's right-to-left lane
        ordering).
        "#,
            &formats.binary,
        )
        .operands_in(vec![x, y])
        .operands_out(vec![a]),
    );

    let x = &Operand::new("x", FxN);
    let y = &Operand::new("y", FxN);
    let a = &Operand::new("a", FxN);


@@ -537,6 +537,30 @@ pub static PSUBUSW: [u8; 3] = [0x66, 0x0f, 0xd9];
/// 0s (SSE4.1).
pub static PTEST: [u8; 4] = [0x66, 0x0f, 0x38, 0x17];

/// Unpack and interleave high-order bytes from xmm1 and xmm2/m128 into xmm1 (SSE2).
pub static PUNPCKHBW: [u8; 3] = [0x66, 0x0f, 0x68];

/// Unpack and interleave high-order words from xmm1 and xmm2/m128 into xmm1 (SSE2).
pub static PUNPCKHWD: [u8; 3] = [0x66, 0x0f, 0x69];

/// Unpack and interleave high-order doublewords from xmm1 and xmm2/m128 into xmm1 (SSE2).
pub static PUNPCKHDQ: [u8; 3] = [0x66, 0x0f, 0x6A];

/// Unpack and interleave high-order quadwords from xmm1 and xmm2/m128 into xmm1 (SSE2).
pub static PUNPCKHQDQ: [u8; 3] = [0x66, 0x0f, 0x6D];

/// Unpack and interleave low-order bytes from xmm1 and xmm2/m128 into xmm1 (SSE2).
pub static PUNPCKLBW: [u8; 3] = [0x66, 0x0f, 0x60];

/// Unpack and interleave low-order words from xmm1 and xmm2/m128 into xmm1 (SSE2).
pub static PUNPCKLWD: [u8; 3] = [0x66, 0x0f, 0x61];

/// Unpack and interleave low-order doublewords from xmm1 and xmm2/m128 into xmm1 (SSE2).
pub static PUNPCKLDQ: [u8; 3] = [0x66, 0x0f, 0x62];

/// Unpack and interleave low-order quadwords from xmm1 and xmm2/m128 into xmm1 (SSE2).
pub static PUNPCKLQDQ: [u8; 3] = [0x66, 0x0f, 0x6C];

/// Push r{16,32,64}.
pub static PUSH_REG: [u8; 1] = [0x50];


@@ -2375,6 +2375,8 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRInst) {
        | Opcode::X86Pmaxu
        | Opcode::X86Pmins
        | Opcode::X86Pminu
        | Opcode::X86Punpckh
        | Opcode::X86Punpckl
        | Opcode::X86ElfTlsGetAddr
        | Opcode::X86MachoTlsGetAddr => {
            panic!("x86-specific opcode in supposedly arch-neutral IR!");


@@ -95,3 +95,17 @@ block0:
[-, %xmm0] v4 = x86_pshufb v1, v3 ; bin: 66 41 0f 38 00 c4
    return
}

;; pack/unpack
function %unpack_high_i8x16(i8x16, i8x16) {
block0(v0: i8x16 [%xmm0], v1: i8x16 [%xmm12]):
[-, %xmm0] v2 = x86_punpckh v0, v1 ; bin: 66 41 0f 68 c4
    return
}

function %unpack_low_i32x4(i32x4, i32x4) {
block0(v0: i32x4 [%xmm7], v1: i32x4 [%xmm6]):
[-, %xmm7] v2 = x86_punpckl v0, v1 ; bin: 66 0f 62 fe
    return
}
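
As a rough sketch of where the `bin:` bytes above come from (this is not Cranelift's actual recipe/binemit code): the PUNPCK* opcode arrays defined earlier are emitted with an optional REX prefix inserted between the 0x66 legacy prefix and the 0x0f escape byte, followed by a register-direct ModRM byte. The helper and parameter names below are hypothetical; `reg` is the destination XMM register number and `rm` the source.

// Hypothetical helper, for illustration only: assemble the register-direct
// form of a PUNPCK* instruction from its three opcode bytes.
fn encode_punpck(opcode: &[u8; 3], reg: u8, rm: u8) -> Vec<u8> {
    let mut bytes = vec![opcode[0]]; // 0x66 mandatory prefix comes first
    if reg >= 8 || rm >= 8 {
        // REX prefix (REX.R extends `reg`, REX.B extends `rm`) sits between
        // the legacy prefix and the 0x0f escape byte.
        bytes.push(0x40 | (((reg >= 8) as u8) << 2) | ((rm >= 8) as u8));
    }
    bytes.extend_from_slice(&opcode[1..]); // 0x0f escape + opcode byte
    bytes.push(0xc0 | ((reg & 7) << 3) | (rm & 7)); // register-direct ModRM
    bytes
}

fn main() {
    // x86_punpckh.i8x16 with xmm0, xmm12 -> 66 41 0f 68 c4 (first test above)
    assert_eq!(encode_punpck(&[0x66, 0x0f, 0x68], 0, 12), vec![0x66, 0x41, 0x0f, 0x68, 0xc4]);
    // x86_punpckl.i32x4 with xmm7, xmm6 -> 66 0f 62 fe (second test above)
    assert_eq!(encode_punpck(&[0x66, 0x0f, 0x62], 7, 6), vec![0x66, 0x0f, 0x62, 0xfe]);
}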


@@ -192,3 +192,16 @@ block0:
    return v5
}
; run

function %unpack_low() -> b1 {
block0:
    v0 = vconst.i32x4 [0 1 2 3]
    v1 = vconst.i32x4 [4 5 6 7]
    v2 = x86_punpckl v0, v1
    v3 = vconst.i32x4 [0 4 1 5]
    v4 = icmp eq v2, v3
    v5 = vall_true v4
    return v5
}
; run