Add x86 implementation of shuffle

2019-08-26 14:50:05 -07:00
parent 9e088e4164
commit af1499ce99
18 changed files with 336 additions and 44 deletions
--- a/cranelift/codegen/meta/src/isa/x86/encodings.rs
+++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs
@@ -1785,7 +1785,7 @@ pub(crate) fn define(
    let allowed_simd_type = |t: &LaneType| t.lane_bits() >= 8 && t.lane_bits() < 128;

    // PSHUFB, 8-bit shuffle using two XMM registers.
-    for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 8) {
+    for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
        let instruction = x86_pshufb.bind_vector_from_lane(ty, sse_vector_size);
        let template = rec_fa.nonrex().opcodes(vec![0x66, 0x0f, 0x38, 00]);
        e.enc32_isap(instruction.clone(), template.clone(), use_ssse3_simd);
@@ -1804,7 +1804,7 @@ pub(crate) fn define(

    // SIMD scalar_to_vector; this uses MOV to copy the scalar value to an XMM register; according
    // to the Intel manual: "When the destination operand is an XMM register, the source operand is
-    // written to the low doubleword of the register and the regiser is zero-extended to 128 bits."
+    // written to the low doubleword of the register and the register is zero-extended to 128 bits."
    for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
        let instruction = scalar_to_vector.bind_vector_from_lane(ty, sse_vector_size);
        if ty.is_float() {
@@ -1929,6 +1929,13 @@ pub(crate) fn define(
        e.enc_32_64_maybe_isap(instruction, template, None); // from SSE
    }

+    // SIMD bor using ORPS
+    for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
+        let instruction = bor.bind_vector_from_lane(ty, sse_vector_size);
+        let template = rec_fa.nonrex().opcodes(vec![0x0f, 0x56]);
+        e.enc_32_64_maybe_isap(instruction, template, None); // from SSE
+    }
+
    // Reference type instructions

    // Null references implemented as iconst 0.
--- a/cranelift/codegen/meta/src/isa/x86/legalize.rs
+++ b/cranelift/codegen/meta/src/isa/x86/legalize.rs
@@ -45,6 +45,7 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
    let selectif = insts.by_name("selectif");
    let smulhi = insts.by_name("smulhi");
    let splat = insts.by_name("splat");
+    let shuffle = insts.by_name("shuffle");
    let srem = insts.by_name("srem");
    let udiv = insts.by_name("udiv");
    let umulhi = insts.by_name("umulhi");
@@ -380,6 +381,7 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
        );
    }

+    narrow.custom_legalize(shuffle, "convert_shuffle");
    narrow.custom_legalize(extractlane, "convert_extractlane");
    narrow.custom_legalize(insertlane, "convert_insertlane");

--- a/cranelift/codegen/meta/src/isa/x86/recipes.rs
+++ b/cranelift/codegen/meta/src/isa/x86/recipes.rs
@@ -396,11 +396,11 @@ pub(crate) fn define<'shared>(
    let f_trap = formats.by_name("Trap");
    let f_unary = formats.by_name("Unary");
    let f_unary_bool = formats.by_name("UnaryBool");
+    let f_unary_const = formats.by_name("UnaryConst");
    let f_unary_global_value = formats.by_name("UnaryGlobalValue");
    let f_unary_ieee32 = formats.by_name("UnaryIeee32");
    let f_unary_ieee64 = formats.by_name("UnaryIeee64");
    let f_unary_imm = formats.by_name("UnaryImm");
-    let f_unary_imm128 = formats.by_name("UnaryImm128");

    // Predicates shorthands.
    let use_sse41 = settings.predicate_by_name("use_sse41");
@@ -2437,14 +2437,14 @@ pub(crate) fn define<'shared>(
    );

    recipes.add_template_recipe(
-        EncodingRecipeBuilder::new("vconst", f_unary_imm128, 5)
+        EncodingRecipeBuilder::new("vconst", f_unary_const, 5)
            .operands_out(vec![fpr])
            .clobbers_flags(false)
            .emit(
                r#"
                    {{PUT_OP}}(bits, rex2(0, out_reg0), sink);
                    modrm_riprel(out_reg0, sink);
-                    const_disp4(imm, func, sink);
+                    const_disp4(constant_handle, func, sink);
                "#,
            ),
    );
--- a/cranelift/codegen/meta/src/shared/formats.rs
+++ b/cranelift/codegen/meta/src/shared/formats.rs
@@ -6,10 +6,10 @@ pub(crate) fn define(imm: &Immediates, entities: &EntityRefs) -> FormatRegistry

    registry.insert(Builder::new("Unary").value());
    registry.insert(Builder::new("UnaryImm").imm(&imm.imm64));
-    registry.insert(Builder::new("UnaryImm128").imm(&imm.uimm128));
    registry.insert(Builder::new("UnaryIeee32").imm(&imm.ieee32));
    registry.insert(Builder::new("UnaryIeee64").imm(&imm.ieee64));
    registry.insert(Builder::new("UnaryBool").imm(&imm.boolean));
+    registry.insert(Builder::new("UnaryConst").imm(&imm.pool_constant));
    registry.insert(Builder::new("UnaryGlobalValue").imm(&entities.global_value));

    registry.insert(Builder::new("Binary").value().value());
@@ -43,6 +43,12 @@ pub(crate) fn define(imm: &Immediates, entities: &EntityRefs) -> FormatRegistry
            .value()
            .imm_with_name("lane", &imm.uimm8),
    );
+    registry.insert(
+        Builder::new("Shuffle")
+            .value()
+            .value()
+            .imm_with_name("mask", &imm.uimm128),
+    );

    registry.insert(Builder::new("IntCompare").imm(&imm.intcc).value().value());
    registry.insert(
--- a/cranelift/codegen/meta/src/shared/immediates.rs
+++ b/cranelift/codegen/meta/src/shared/immediates.rs
@@ -23,6 +23,12 @@ pub(crate) struct Immediates {
    /// const.
    pub uimm128: OperandKind,

+    /// A constant stored in the constant pool.
+    ///
+    /// This operand is used to pass constants to instructions like vconst while storing the
+    /// actual bytes in the constant pool.
+    pub pool_constant: OperandKind,
+
    /// A 32-bit immediate signed offset.
    ///
    /// This is used to represent an immediate address offset in load/store instructions.
@@ -84,6 +90,12 @@ impl Immediates {

            uimm128: Builder::new_imm("uimm128")
                .doc("A 128-bit immediate unsigned integer.")
+                .rust_type("ir::Immediate")
+                .build(),
+
+            pool_constant: Builder::new_imm("poolConstant")
+                .doc("A constant stored in the constant pool.")
+                .default_member("constant_handle")
                .rust_type("ir::Constant")
                .build(),

--- a/cranelift/codegen/meta/src/shared/instructions.rs
+++ b/cranelift/codegen/meta/src/shared/instructions.rs
@@ -1090,7 +1090,7 @@ pub(crate) fn define(

    let N = &operand_doc(
        "N",
-        &imm.uimm128,
+        &imm.pool_constant,
        "The 16 immediate bytes of a 128-bit vector",
    );
    let a = &operand_doc("a", TxN, "A constant vector value");
@@ -1108,6 +1108,41 @@ pub(crate) fn define(
        .operands_out(vec![a]),
    );

+    let mask = &operand_doc(
+        "mask",
+        &imm.uimm128,
+        "The 16 immediate bytes used for selecting the elements to shuffle",
+    );
+    let Tx16 = &TypeVar::new(
+        "Tx16",
+        "A SIMD vector with exactly 16 lanes of 8-bit values; eventually this may support other \
+         lane counts and widths",
+        TypeSetBuilder::new()
+            .ints(8..8)
+            .bools(8..8)
+            .simd_lanes(16..16)
+            .includes_scalars(false)
+            .build(),
+    );
+    let a = &operand_doc("a", Tx16, "A vector value");
+    let b = &operand_doc("b", Tx16, "A vector value");
+
+    ig.push(
+        Inst::new(
+            "shuffle",
+            r#"
+        SIMD vector shuffle.
+        
+        Shuffle two vectors using the given immediate bytes. For each of the 16 bytes of the
+        immediate, a value i of 0-15 selects the i-th element of the first vector and a value i of
+        16-31 selects the (i-16)th element of the second vector. Immediate values outside of the
+        0-31 range place a 0 in the resulting vector lane.
+        "#,
+        )
+        .operands_in(vec![a, b, mask])
+        .operands_out(vec![a]),
+    );
+
    let a = &operand_doc("a", Ref, "A constant reference null value");

    ig.push(