Add x86 SIMD ishl

Only the shifts with applicable SSE2 instructions (i.e. 16-64 bit width) are implemented here.
2019-10-02 13:38:54 -07:00
parent 67733bd2fc
commit 6460fe705f
7 changed files with 152 additions and 1 deletions
--- a/cranelift/codegen/meta/src/isa/x86/encodings.rs
+++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs
@@ -521,6 +521,7 @@ pub(crate) fn define<'defs>(
    let x86_pinsr = x86.by_name("x86_pinsr");
    let x86_pshufd = x86.by_name("x86_pshufd");
    let x86_pshufb = x86.by_name("x86_pshufb");
    let x86_psll = x86.by_name("x86_psll");
    let x86_push = x86.by_name("x86_push");
    let x86_sdivmodx = x86.by_name("x86_sdivmodx");
    let x86_smulx = x86.by_name("x86_smulx");
@@ -1988,6 +1989,26 @@ pub(crate) fn define<'defs>(
        e.enc_32_64(bxor, rec_fa.opcodes(&PXOR));
    }
    // SIMD bitcast from I32/I64 to the low bits of a vector (e.g. I64x2); this register movement
    // allows SIMD shifts to be legalized more easily. TODO ideally this would be typed as an
    // I128x1 but restrictions on the type builder prevent this; the general idea here is that
    // the upper bits are all zeroed and do not form parts of any separate lane. See
    // https://github.com/CraneStation/cranelift/issues/1146.
    // I32 source: registered through `enc_both` with the MOVD_LOAD_XMM opcode.
    e.enc_both(
        bitcast.bind(vector(I64, sse_vector_size)).bind(I32),
        rec_frurm.opcodes(&MOVD_LOAD_XMM),
    );
    // I64 source: registered through `enc64` only, using the same opcode with `.rex().w()`
    // applied to the recipe.
    e.enc64(
        bitcast.bind(vector(I64, sse_vector_size)).bind(I64),
        rec_frurm.opcodes(&MOVD_LOAD_XMM).rex().w(),
    );
    // SIMD shift left: bind `x86_psll` to a vector of each listed lane type and encode it with
    // the matching opcode (I16 -> PSLLW, I32 -> PSLLD, I64 -> PSLLQ). No 8-bit lane entry is
    // listed here.
    for (ty, opcodes) in &[(I16, &PSLLW), (I32, &PSLLD), (I64, &PSLLQ)] {
        // Shadows the unbound instruction with its vector-typed binding for this iteration.
        let x86_psll = x86_psll.bind(vector(*ty, sse_vector_size));
        e.enc_32_64(x86_psll, rec_fa.opcodes(*opcodes));
    }
    // SIMD icmp using PCMPEQ*
    for ty in ValueType::all_lane_types().filter(|t| t.is_int() && allowed_simd_type(t)) {
        let (opcodes, isa_predicate): (&[_], _) = match ty.lane_bits() {
--- a/cranelift/codegen/meta/src/isa/x86/instructions.rs
+++ b/cranelift/codegen/meta/src/isa/x86/instructions.rs
@@ -387,5 +387,40 @@ pub(crate) fn define(
        .operands_out(vec![a]),
    );
    // Type variable used for the vector operand and the result of `x86_psll`: integer lanes
    // of any width and any SIMD lane count, with scalar types excluded.
    let IxN = &TypeVar::new(
        "IxN",
        "A SIMD vector type containing integers",
        TypeSetBuilder::new()
            .ints(Interval::All)
            .simd_lanes(Interval::All)
            .includes_scalars(false)
            .build(),
    );
    // Type variable used for the shift-amount operand of `x86_psll`. Both the lane-width and
    // the lane-count ranges are pinned to a single value (64 and 2), and scalars are excluded.
    // NOTE(review): this assumes `TypeSetBuilder` treats the `64..64` / `2..2` bounds as
    // inclusive; confirm, since a plain Rust half-open range of this form would be empty.
    let I64x2 = &TypeVar::new(
        "I64x2",
        "A SIMD vector type containing one large integer (the upper lane is concatenated with \
         the lower lane to form the integer)",
        TypeSetBuilder::new()
            .ints(64..64)
            .simd_lanes(2..2)
            .includes_scalars(false)
            .build(),
    );
    // Operands for `x86_psll`: the input vector and the result share the `IxN` type variable,
    // while the shift amount is typed separately as `I64x2`.
    let x = &operand_doc("x", IxN, "Vector value to shift");
    let y = &operand_doc("y", I64x2, "Number of bits to shift");
    let a = &operand("a", IxN);
    // Register the x86-specific packed shift-left instruction: `x` is the vector value to
    // shift, `y` carries the number of bits to shift, and `a` is the shifted result.
    ig.push(
        Inst::new(
            "x86_psll",
            r#"
        Shift Packed Data Left Logical -- This implements the behavior of the shared instruction 
        ``ishl`` but alters the shift operand to live in an XMM register as expected by the PSLL*
        family of instructions.
        "#,
        )
        .operands_in(vec![x, y])
        .operands_out(vec![a]),
    );
    ig.build()
 }
--- a/cranelift/codegen/meta/src/isa/x86/legalize.rs
+++ b/cranelift/codegen/meta/src/isa/x86/legalize.rs
@@ -3,7 +3,7 @@ use crate::cdsl::instructions::{vector, Bindable, InstructionGroup};
 use crate::cdsl::types::{LaneType, ValueType};
 use crate::cdsl::xform::TransformGroupBuilder;
 use crate::shared::types::Float::F64;
-use crate::shared::types::Int::{I32, I64};
+use crate::shared::types::Int::{I16, I32, I64};
 use crate::shared::Definitions as SharedDefinitions;
 pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGroup) {
@@ -20,6 +20,7 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
    // List of instructions.
    let insts = &shared.instructions;
    let band = insts.by_name("band");
    let bitcast = insts.by_name("bitcast");
    let bor = insts.by_name("bor");
    let bnot = insts.by_name("bnot");
    let bxor = insts.by_name("bxor");
@@ -40,6 +41,7 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
    let imul = insts.by_name("imul");
    let ineg = insts.by_name("ineg");
    let insertlane = insts.by_name("insertlane");
    let ishl = insts.by_name("ishl");
    let isub = insts.by_name("isub");
    let popcnt = insts.by_name("popcnt");
    let raw_bitcast = insts.by_name("raw_bitcast");
@@ -60,6 +62,7 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
    let x86_bsr = x86_instructions.by_name("x86_bsr");
    let x86_pshufb = x86_instructions.by_name("x86_pshufb");
    let x86_pshufd = x86_instructions.by_name("x86_pshufd");
    let x86_psll = x86_instructions.by_name("x86_psll");
    let x86_umulx = x86_instructions.by_name("x86_umulx");
    let x86_smulx = x86_instructions.by_name("x86_smulx");
@@ -394,6 +397,16 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
        );
    }
    // SIMD shift left: for I16, I32 and I64 lane vectors, rewrite `ishl` into a `bitcast` of
    // the shift amount to an I64 vector followed by `x86_psll` on that vector.
    for ty in &[I16, I32, I64] {
        // Shadow the generic instructions with type-bound versions for this iteration.
        let ishl = ishl.bind(vector(*ty, sse_vector_size));
        // The shift amount is always cast to an I64 vector, whatever the lane type of `x`.
        let bitcast = bitcast.bind(vector(I64, sse_vector_size));
        narrow.legalize(
            def!(a = ishl(x, y)),
            vec![def!(b = bitcast(y)), def!(a = x86_psll(x, b))],
        );
    }
    narrow.custom_legalize(shuffle, "convert_shuffle");
    narrow.custom_legalize(extractlane, "convert_extractlane");
    narrow.custom_legalize(insertlane, "convert_insertlane");
--- a/cranelift/codegen/meta/src/isa/x86/opcodes.rs
+++ b/cranelift/codegen/meta/src/isa/x86/opcodes.rs
@@ -320,6 +320,15 @@ pub static PSHUFB: [u8; 4] = [0x66, 0x0f, 0x38, 0x00];
 /// store the result in xmm1 (SSE2).
 pub static PSHUFD: [u8; 3] = [0x66, 0x0f, 0x70];
 /// Shift words in xmm1 left by xmm2/m128 while shifting in 0s (SSE2).
 /// Used to encode `x86_psll` on vectors with I16 lanes.
 pub static PSLLW: [u8; 3] = [0x66, 0x0f, 0xf1];
 /// Shift doublewords in xmm1 left by xmm2/m128 while shifting in 0s (SSE2).
 /// Used to encode `x86_psll` on vectors with I32 lanes.
 pub static PSLLD: [u8; 3] = [0x66, 0x0f, 0xf2];
 /// Shift quadwords in xmm1 left by xmm2/m128 while shifting in 0s (SSE2).
 /// Used to encode `x86_psll` on vectors with I64 lanes.
 pub static PSLLQ: [u8; 3] = [0x66, 0x0f, 0xf3];
 /// Subtract packed byte integers in xmm2/m128 from packed byte integers in xmm1 (SSE2).
 pub static PSUBB: [u8; 3] = [0x66, 0x0f, 0xf8];
--- a/cranelift/filetests/filetests/isa/x86/simd-bitwise-binemit.clif
+++ b/cranelift/filetests/filetests/isa/x86/simd-bitwise-binemit.clif
@@ -0,0 +1,21 @@
 test binemit
 set enable_simd
 target x86_64 skylake
 ; x86_psll on i16x8 with operands in %xmm2 and %xmm1: the expected bytes begin with the
 ; PSLLW opcode (66 0f f1); the final byte is presumably the ModRM byte for the two registers.
 function %ishl_i16x8(i16x8, i64x2) -> i16x8 {
 ebb0(v0: i16x8 [%xmm2], v1: i64x2 [%xmm1]):
 [-, %xmm2]  v2 = x86_psll v0, v1     ; bin: 66 0f f1 d1
            return v2
 }
 ; x86_psll on i32x4 with operands in %xmm4 and %xmm0: the expected bytes begin with the
 ; PSLLD opcode (66 0f f2); the final byte is presumably the ModRM byte for the two registers.
 function %ishl_i32x4(i32x4, i64x2) -> i32x4 {
 ebb0(v0: i32x4 [%xmm4], v1: i64x2 [%xmm0]):
 [-, %xmm4]  v2 = x86_psll v0, v1      ; bin: 66 0f f2 e0
            return v2
 }
 ; x86_psll on i64x2 with operands in %xmm6 and %xmm3: the expected bytes begin with the
 ; PSLLQ opcode (66 0f f3); the final byte is presumably the ModRM byte for the two registers.
 function %ishl_i64x2(i64x2, i64x2) -> i64x2 {
 ebb0(v0: i64x2 [%xmm6], v1: i64x2 [%xmm3]):
 [-, %xmm6]  v2 = x86_psll v0, v1      ; bin: 66 0f f3 f3
            return v2
 }
--- a/cranelift/filetests/filetests/isa/x86/simd-bitwise-legalize.clif
+++ b/cranelift/filetests/filetests/isa/x86/simd-bitwise-legalize.clif
@@ -0,0 +1,13 @@
 test legalizer
 set enable_simd
 target x86_64 skylake
 ; Legalizing ishl on an i32x4 with an i32 shift amount is expected to first bitcast the
 ; amount to i64x2 and then emit x86_psll using the bitcast value.
 function %ishl_i32x4() -> i32x4 {
 ebb0:
    v0 = iconst.i32 1
    v1 = vconst.i32x4 [1 2 4 8]
    v2 = ishl v1, v0
    ; check: v3 = bitcast.i64x2 v0
    ; nextln: v2 = x86_psll v1, v3
    return v2
 }
--- a/cranelift/filetests/filetests/isa/x86/simd-bitwise-run.clif
+++ b/cranelift/filetests/filetests/isa/x86/simd-bitwise-run.clif
@@ -0,0 +1,39 @@
 test run
 set enable_simd
 target x86_64 skylake
 ; TODO: once available, replace all lane extraction with `icmp + all_ones`
 ; Shifts each lane of [1 2 4 8] left by 1 and returns true when lane 0 equals 2 and
 ; lane 3 equals 16.
 function %ishl_i32x4() -> b1 {
 ebb0:
    v0 = iconst.i32 1
    v1 = vconst.i32x4 [1 2 4 8]
    v2 = ishl v1, v0
    v3 = extractlane v2, 0
    v4 = icmp_imm eq v3, 2
    v5 = extractlane v2, 3
    v6 = icmp_imm eq v5, 16
    v7 = band v4, v6
    return v7
 }
 ; run
 ; Shifts each i16 lane left by 17, which exceeds the 16-bit lane width, and returns true
 ; when lanes 0 and 3 are both 0.
 function %ishl_too_large_i16x8() -> b1 {
 ebb0:
    v0 = iconst.i32 17 ; note that this will shift off the end of each lane
    v1 = vconst.i16x8 [1 2 4 8 16 32 64 128]
    v2 = ishl v1, v0
    v3 = extractlane v2, 0
    v4 = icmp_imm eq v3, 0
    v5 = extractlane v2, 3
    v6 = icmp_imm eq v5, 0
    v7 = band v4, v6
    return v7
 }
 ; run