From f5fc09f64ae136c88d23ecca32c06b2fb6ae36ff Mon Sep 17 00:00:00 2001 From: Andrew Brown Date: Tue, 24 Mar 2020 17:20:56 -0700 Subject: [PATCH] Add x86 unpack instructions --- .../codegen/meta/src/isa/x86/encodings.rs | 22 ++++++++++++ .../codegen/meta/src/isa/x86/instructions.rs | 34 +++++++++++++++++++ cranelift/codegen/meta/src/isa/x86/opcodes.rs | 24 +++++++++++++ cranelift/codegen/src/isa/aarch64/lower.rs | 2 ++ .../isa/x86/simd-lane-access-binemit.clif | 14 ++++++++ .../isa/x86/simd-lane-access-run.clif | 13 +++++++ 6 files changed, 109 insertions(+) diff --git a/cranelift/codegen/meta/src/isa/x86/encodings.rs b/cranelift/codegen/meta/src/isa/x86/encodings.rs index 12dd4de0e4..3f4bb8154b 100644 --- a/cranelift/codegen/meta/src/isa/x86/encodings.rs +++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs @@ -1631,6 +1631,8 @@ fn define_simd( let x86_psra = x86.by_name("x86_psra"); let x86_psrl = x86.by_name("x86_psrl"); let x86_ptest = x86.by_name("x86_ptest"); + let x86_punpckh = x86.by_name("x86_punpckh"); + let x86_punpckl = x86.by_name("x86_punpckl"); // Shorthands for recipes. let rec_evex_reg_vvvv_rm_128 = r.template("evex_reg_vvvv_rm_128"); @@ -1783,6 +1785,26 @@ fn define_simd( } } + // SIMD packing/unpacking + for ty in ValueType::all_lane_types().filter(allowed_simd_type) { + let (high, low) = match ty.lane_bits() { + 8 => (&PUNPCKHBW, &PUNPCKLBW), + 16 => (&PUNPCKHWD, &PUNPCKLWD), + 32 => (&PUNPCKHDQ, &PUNPCKLDQ), + 64 => (&PUNPCKHQDQ, &PUNPCKLQDQ), + _ => panic!("invalid size for SIMD packing/unpacking"), + }; + + e.enc_both_inferred( + x86_punpckh.bind(vector(ty, sse_vector_size)), + rec_fa.opcodes(high), + ); + e.enc_both_inferred( + x86_punpckl.bind(vector(ty, sse_vector_size)), + rec_fa.opcodes(low), + ); + } + // SIMD bitcast all 128-bit vectors to each other (for legalizing splat.x16x8). 
for from_type in ValueType::all_lane_types().filter(allowed_simd_type) { for to_type in diff --git a/cranelift/codegen/meta/src/isa/x86/instructions.rs b/cranelift/codegen/meta/src/isa/x86/instructions.rs index 44ac3eeab9..9a04f34b7c 100644 --- a/cranelift/codegen/meta/src/isa/x86/instructions.rs +++ b/cranelift/codegen/meta/src/isa/x86/instructions.rs @@ -376,6 +376,40 @@ pub(crate) fn define( .operands_out(vec![a]), ); + let x = &Operand::new("x", TxN); + let y = &Operand::new("y", TxN); + let a = &Operand::new("a", TxN); + + ig.push( + Inst::new( + "x86_punpckh", + r#" + Unpack the high-order lanes of ``x`` and ``y`` and interleave into ``a``. With notional + i8x4 vectors, where ``x = [x3, x2, x1, x0]`` and ``y = [y3, y2, y1, y0]``, this operation + would result in ``a = [y3, x3, y2, x2]`` (using the Intel manual's right-to-left lane + ordering). + "#, + &formats.binary, + ) + .operands_in(vec![x, y]) + .operands_out(vec![a]), + ); + + ig.push( + Inst::new( + "x86_punpckl", + r#" + Unpack the low-order lanes of ``x`` and ``y`` and interleave into ``a``. With notional + i8x4 vectors, where ``x = [x3, x2, x1, x0]`` and ``y = [y3, y2, y1, y0]``, this operation + would result in ``a = [y1, x1, y0, x0]`` (using the Intel manual's right-to-left lane + ordering). + "#, + &formats.binary, + ) + .operands_in(vec![x, y]) + .operands_out(vec![a]), + ); + let x = &Operand::new("x", FxN); let y = &Operand::new("y", FxN); let a = &Operand::new("a", FxN); diff --git a/cranelift/codegen/meta/src/isa/x86/opcodes.rs b/cranelift/codegen/meta/src/isa/x86/opcodes.rs index bbfd05a5d8..7008c9a91e 100644 --- a/cranelift/codegen/meta/src/isa/x86/opcodes.rs +++ b/cranelift/codegen/meta/src/isa/x86/opcodes.rs @@ -537,6 +537,30 @@ pub static PSUBUSW: [u8; 3] = [0x66, 0x0f, 0xd9]; /// 0s (SSE4.1). pub static PTEST: [u8; 4] = [0x66, 0x0f, 0x38, 0x17]; +/// Unpack and interleave high-order bytes from xmm1 and xmm2/m128 into xmm1 (SSE2). 
+pub static PUNPCKHBW: [u8; 3] = [0x66, 0x0f, 0x68]; + +/// Unpack and interleave high-order words from xmm1 and xmm2/m128 into xmm1 (SSE2). +pub static PUNPCKHWD: [u8; 3] = [0x66, 0x0f, 0x69]; + +/// Unpack and interleave high-order doublewords from xmm1 and xmm2/m128 into xmm1 (SSE2). +pub static PUNPCKHDQ: [u8; 3] = [0x66, 0x0f, 0x6A]; + +/// Unpack and interleave high-order quadwords from xmm1 and xmm2/m128 into xmm1 (SSE2). +pub static PUNPCKHQDQ: [u8; 3] = [0x66, 0x0f, 0x6D]; + +/// Unpack and interleave low-order bytes from xmm1 and xmm2/m128 into xmm1 (SSE2). +pub static PUNPCKLBW: [u8; 3] = [0x66, 0x0f, 0x60]; + +/// Unpack and interleave low-order words from xmm1 and xmm2/m128 into xmm1 (SSE2). +pub static PUNPCKLWD: [u8; 3] = [0x66, 0x0f, 0x61]; + +/// Unpack and interleave low-order doublewords from xmm1 and xmm2/m128 into xmm1 (SSE2). +pub static PUNPCKLDQ: [u8; 3] = [0x66, 0x0f, 0x62]; + +/// Unpack and interleave low-order quadwords from xmm1 and xmm2/m128 into xmm1 (SSE2). +pub static PUNPCKLQDQ: [u8; 3] = [0x66, 0x0f, 0x6C]; + /// Push r{16,32,64}. 
pub static PUSH_REG: [u8; 1] = [0x50]; diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs index 4ccba9f6ab..cca49aef7e 100644 --- a/cranelift/codegen/src/isa/aarch64/lower.rs +++ b/cranelift/codegen/src/isa/aarch64/lower.rs @@ -2375,6 +2375,8 @@ fn lower_insn_to_regs<C: LowerCtx<Inst>>(ctx: &mut C, insn: IRInst) { | Opcode::X86Pmaxu | Opcode::X86Pmins | Opcode::X86Pminu + | Opcode::X86Punpckh + | Opcode::X86Punpckl | Opcode::X86ElfTlsGetAddr | Opcode::X86MachoTlsGetAddr => { panic!("x86-specific opcode in supposedly arch-neutral IR!"); diff --git a/cranelift/filetests/filetests/isa/x86/simd-lane-access-binemit.clif b/cranelift/filetests/filetests/isa/x86/simd-lane-access-binemit.clif index bc2f873fe6..be0a7beb35 100644 --- a/cranelift/filetests/filetests/isa/x86/simd-lane-access-binemit.clif +++ b/cranelift/filetests/filetests/isa/x86/simd-lane-access-binemit.clif @@ -95,3 +95,17 @@ block0: [-, %xmm0] v4 = x86_pshufb v1, v3 ; bin: 66 41 0f 38 00 c4 return } + +;; pack/unpack + +function %unpack_high_i8x16(i8x16, i8x16) { +block0(v0: i8x16 [%xmm0], v1: i8x16 [%xmm12]): +[-, %xmm0] v2 = x86_punpckh v0, v1 ; bin: 66 41 0f 68 c4 + return +} + +function %unpack_low_i32x4(i32x4, i32x4) { +block0(v0: i32x4 [%xmm7], v1: i32x4 [%xmm6]): +[-, %xmm7] v2 = x86_punpckl v0, v1 ; bin: 66 0f 62 fe + return +} diff --git a/cranelift/filetests/filetests/isa/x86/simd-lane-access-run.clif b/cranelift/filetests/filetests/isa/x86/simd-lane-access-run.clif index 564b6b87be..3b54057989 100644 --- a/cranelift/filetests/filetests/isa/x86/simd-lane-access-run.clif +++ b/cranelift/filetests/filetests/isa/x86/simd-lane-access-run.clif @@ -192,3 +192,16 @@ block0: return v5 } ; run + +function %unpack_low() -> b1 { +block0: + v0 = vconst.i32x4 [0 1 2 3] + v1 = vconst.i32x4 [4 5 6 7] + v2 = x86_punpckl v0, v1 + + v3 = vconst.i32x4 [0 4 1 5] + v4 = icmp eq v2, v3 + v5 = vall_true v4 + return v5 +} +; run