Add x86 implementation of splat instruction

2019-07-11 13:21:36 -07:00
parent 3b36a1d1d8
commit 084e279def
4 changed files with 173 additions and 2 deletions
--- a/cranelift/codegen/meta/src/isa/x86/legalize.rs
+++ b/cranelift/codegen/meta/src/isa/x86/legalize.rs
@@ -1,7 +1,8 @@
 use crate::cdsl::ast::{var, ExprBuilder, Literal};
 use crate::cdsl::instructions::InstructionGroup;
+use crate::cdsl::types::ValueType;
 use crate::cdsl::xform::TransformGroupBuilder;
-
+use crate::shared::types::Float::F64;
 use crate::shared::types::Int::{I32, I64};
 use crate::shared::Definitions as SharedDefinitions;

@@ -19,9 +20,11 @@ pub fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGrou
    // List of instructions.
    let insts = &shared.instructions;
    let band = insts.by_name("band");
+    let bitcast = insts.by_name("bitcast");
    let bor = insts.by_name("bor");
    let clz = insts.by_name("clz");
    let ctz = insts.by_name("ctz");
+    let f64const = insts.by_name("f64const");
    let fcmp = insts.by_name("fcmp");
    let fcvt_from_uint = insts.by_name("fcvt_from_uint");
    let fcvt_to_sint = insts.by_name("fcvt_to_sint");
@@ -33,11 +36,15 @@ pub fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGrou
    let iadd = insts.by_name("iadd");
    let iconst = insts.by_name("iconst");
    let imul = insts.by_name("imul");
+    let insertlane = insts.by_name("insertlane");
    let isub = insts.by_name("isub");
    let popcnt = insts.by_name("popcnt");
+    let raw_bitcast = insts.by_name("raw_bitcast");
+    let scalar_to_vector = insts.by_name("scalar_to_vector");
    let sdiv = insts.by_name("sdiv");
    let selectif = insts.by_name("selectif");
    let smulhi = insts.by_name("smulhi");
+    let splat = insts.by_name("splat");
    let srem = insts.by_name("srem");
    let udiv = insts.by_name("udiv");
    let umulhi = insts.by_name("umulhi");
@@ -46,6 +53,8 @@ pub fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGrou

    let x86_bsf = x86_instructions.by_name("x86_bsf");
    let x86_bsr = x86_instructions.by_name("x86_bsr");
+    let x86_pshufb = x86_instructions.by_name("x86_pshufb");
+    let x86_pshufd = x86_instructions.by_name("x86_pshufd");
    let x86_umulx = x86_instructions.by_name("x86_umulx");
    let x86_smulx = x86_instructions.by_name("x86_smulx");

@@ -53,6 +62,8 @@ pub fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGrou
    let floatcc = shared.operand_kinds.by_name("floatcc");
    let imm64 = shared.operand_kinds.by_name("imm64");
    let intcc = shared.operand_kinds.by_name("intcc");
+    let uimm8 = shared.operand_kinds.by_name("uimm8");
+    let ieee64 = shared.operand_kinds.by_name("ieee64");

    // Division and remainder.
    //
@@ -290,4 +301,84 @@ pub fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGrou
    );

    group.build_and_add_to(&mut shared.transform_groups);
+
+    let mut narrow = TransformGroupBuilder::new(
+        "x86_narrow",
+        r#"
+    Legalize instructions by narrowing.
+
+    Use x86-specific instructions if needed."#,
+    )
+    .isa("x86")
+    .chain_with(shared.transform_groups.by_name("narrow").id);
+
+    // SIMD
+    let uimm8_zero = Literal::constant(uimm8, 0x00);
+    let uimm8_one = Literal::constant(uimm8, 0x01);
+    let ieee64_zero = Literal::constant(ieee64, 0x00);
+    let b = var("b");
+    let c = var("c");
+    let d = var("d");
+
+    // SIMD splat: 8-bits
+    for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 8) {
+        let splat_x8x16 = splat.bind_vector(ty, 128 / ty.lane_bits());
+        let bitcast_f64_to_any8x16 = bitcast.bind_vector(ty, 128 / ty.lane_bits()).bind(F64);
+        narrow.legalize(
+            def!(y = splat_x8x16(x)),
+            vec![
+                def!(a = scalar_to_vector(x)), // move into the lowest 8 bits of an XMM register
+                def!(b = f64const(ieee64_zero)), // zero out a different XMM register; the shuffle mask for moving the lowest byte to all other byte lanes is 0x0
+                def!(c = bitcast_f64_to_any8x16(b)), // no instruction emitted; informs the SSA that the 0 in b can be used as a vector of this type
+                def!(y = x86_pshufb(a, c)), // PSHUFB takes two XMM operands, one of which is a shuffle mask (i.e. b)
+            ],
+        );
+    }
+
+    // SIMD splat: 16-bits
+    for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 16) {
+        let splat_x16x8 = splat.bind_vector(ty, 128 / ty.lane_bits());
+        let raw_bitcast_any16x8_to_i32x4 = raw_bitcast
+            .bind_vector(I32, 4)
+            .bind_vector(ty, 128 / ty.lane_bits());
+        let raw_bitcast_i32x4_to_any16x8 = raw_bitcast
+            .bind_vector(ty, 128 / ty.lane_bits())
+            .bind_vector(I32, 4);
+        narrow.legalize(
+            def!(y = splat_x16x8(x)),
+            vec![
+                def!(a = scalar_to_vector(x)), // move into the lowest 16 bits of an XMM register
+                def!(b = insertlane(a, uimm8_one, x)), // insert the value again but in the next lowest 16 bits
+                def!(c = raw_bitcast_any16x8_to_i32x4(b)), // no instruction emitted; pretend this is an I32x4 so we can use PSHUFD
+                def!(d = x86_pshufd(c, uimm8_zero)), // broadcast the bytes in the XMM register with PSHUFD
+                def!(y = raw_bitcast_i32x4_to_any16x8(d)), // no instruction emitted; pretend this is an X16x8 again
+            ],
+        );
+    }
+
+    // SIMD splat: 32-bits
+    for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 32) {
+        let splat_any32x4 = splat.bind_vector(ty, 128 / ty.lane_bits());
+        narrow.legalize(
+            def!(y = splat_any32x4(x)),
+            vec![
+                def!(a = scalar_to_vector(x)), // translate to an x86 MOV to get the value in an XMM register
+                def!(y = x86_pshufd(a, uimm8_zero)), // broadcast the bytes in the XMM register with PSHUF
+            ],
+        );
+    }
+
+    // SIMD splat: 64-bits
+    for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 64) {
+        let splat_any64x2 = splat.bind_vector(ty, 128 / ty.lane_bits());
+        narrow.legalize(
+            def!(y = splat_any64x2(x)),
+            vec![
+                def!(a = scalar_to_vector(x)), // move into the lowest 64 bits of an XMM register
+                def!(y = insertlane(a, uimm8_one, x)), // move into the highest 64 bits of the same XMM register
+            ],
+        );
+    }
+
+    narrow.build_and_add_to(&mut shared.transform_groups);
 }
--- a/cranelift/codegen/meta/src/isa/x86/mod.rs
+++ b/cranelift/codegen/meta/src/isa/x86/mod.rs
@@ -30,6 +30,7 @@ pub fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa {
    let expand_flags = shared_defs.transform_groups.by_name("expand_flags");
    let narrow = shared_defs.transform_groups.by_name("narrow");
    let widen = shared_defs.transform_groups.by_name("widen");
+    let x86_narrow = shared_defs.transform_groups.by_name("x86_narrow");
    let x86_expand = shared_defs.transform_groups.by_name("x86_expand");

    x86_32.legalize_monomorphic(expand_flags);
@@ -42,7 +43,7 @@ pub fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa {
    x86_32.legalize_type(F64, x86_expand);

    x86_64.legalize_monomorphic(expand_flags);
-    x86_64.legalize_default(narrow);
+    x86_64.legalize_default(x86_narrow);
    x86_64.legalize_type(B1, expand_flags);
    x86_64.legalize_type(I8, widen);
    x86_64.legalize_type(I16, widen);