Add x86 implementation of splat instruction

@@ -1,7 +1,8 @@
use crate::cdsl::ast::{var, ExprBuilder, Literal};
use crate::cdsl::instructions::InstructionGroup;
use crate::cdsl::types::ValueType;
use crate::cdsl::xform::TransformGroupBuilder;

use crate::shared::types::Float::F64;
use crate::shared::types::Int::{I32, I64};
use crate::shared::Definitions as SharedDefinitions;
@@ -19,9 +20,11 @@ pub fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGrou
// List of instructions.
let insts = &shared.instructions;
let band = insts.by_name("band");
let bitcast = insts.by_name("bitcast");
let bor = insts.by_name("bor");
let clz = insts.by_name("clz");
let ctz = insts.by_name("ctz");
let f64const = insts.by_name("f64const");
let fcmp = insts.by_name("fcmp");
let fcvt_from_uint = insts.by_name("fcvt_from_uint");
let fcvt_to_sint = insts.by_name("fcvt_to_sint");
@@ -33,11 +36,15 @@ pub fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGrou
let iadd = insts.by_name("iadd");
let iconst = insts.by_name("iconst");
let imul = insts.by_name("imul");
let insertlane = insts.by_name("insertlane");
let isub = insts.by_name("isub");
let popcnt = insts.by_name("popcnt");
let raw_bitcast = insts.by_name("raw_bitcast");
let scalar_to_vector = insts.by_name("scalar_to_vector");
let sdiv = insts.by_name("sdiv");
let selectif = insts.by_name("selectif");
let smulhi = insts.by_name("smulhi");
let splat = insts.by_name("splat");
let srem = insts.by_name("srem");
let udiv = insts.by_name("udiv");
let umulhi = insts.by_name("umulhi");
@@ -46,6 +53,8 @@ pub fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGrou

let x86_bsf = x86_instructions.by_name("x86_bsf");
let x86_bsr = x86_instructions.by_name("x86_bsr");
let x86_pshufb = x86_instructions.by_name("x86_pshufb");
let x86_pshufd = x86_instructions.by_name("x86_pshufd");
let x86_umulx = x86_instructions.by_name("x86_umulx");
let x86_smulx = x86_instructions.by_name("x86_smulx");

@@ -53,6 +62,8 @@ pub fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGrou
let floatcc = shared.operand_kinds.by_name("floatcc");
let imm64 = shared.operand_kinds.by_name("imm64");
let intcc = shared.operand_kinds.by_name("intcc");
let uimm8 = shared.operand_kinds.by_name("uimm8");
let ieee64 = shared.operand_kinds.by_name("ieee64");

// Division and remainder.
//
@@ -290,4 +301,84 @@ pub fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGrou
);

group.build_and_add_to(&mut shared.transform_groups);

let mut narrow = TransformGroupBuilder::new(
"x86_narrow",
r#"
Legalize instructions by narrowing.

Use x86-specific instructions if needed."#,
)
.isa("x86")
.chain_with(shared.transform_groups.by_name("narrow").id);

// SIMD
let uimm8_zero = Literal::constant(uimm8, 0x00);
let uimm8_one = Literal::constant(uimm8, 0x01);
let ieee64_zero = Literal::constant(ieee64, 0x00);
let b = var("b");
let c = var("c");
let d = var("d");

// SIMD splat: 8-bits
for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 8) {
let splat_x8x16 = splat.bind_vector(ty, 128 / ty.lane_bits());
let bitcast_f64_to_any8x16 = bitcast.bind_vector(ty, 128 / ty.lane_bits()).bind(F64);
narrow.legalize(
def!(y = splat_x8x16(x)),
vec![
def!(a = scalar_to_vector(x)), // move into the lowest 8 bits of an XMM register
def!(b = f64const(ieee64_zero)), // zero out a different XMM register; the shuffle mask for moving the lowest byte to all other byte lanes is 0x0
def!(c = bitcast_f64_to_any8x16(b)), // no instruction emitted; informs the SSA that the 0 in b can be used as a vector of this type
def!(y = x86_pshufb(a, c)), // PSHUFB takes two XMM operands, one of which is the shuffle mask (i.e. c, the zero from b)
],
);
}
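
// Illustrative sketch (standalone reference model, not generated by this legalization;
// assumes little-endian lane order): PSHUFB uses each byte of its mask operand as an
// index into the source register, so an all-zero mask broadcasts lane 0 into every
// byte lane.
fn _splat_u8x16_model(x: u8) -> [u8; 16] {
    let mut v = [0u8; 16];
    v[0] = x; // scalar_to_vector: x in the lowest byte lane
    let mask = [0u8; 16]; // the f64const 0.0, viewed as an all-zero shuffle mask
    let mut out = [0u8; 16];
    for (i, &m) in mask.iter().enumerate() {
        out[i] = v[(m & 0x0f) as usize]; // x86_pshufb: out[i] = v[mask[i] & 0xf] when the mask's high bit is clear
    }
    out
}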

// SIMD splat: 16-bits
for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 16) {
let splat_x16x8 = splat.bind_vector(ty, 128 / ty.lane_bits());
let raw_bitcast_any16x8_to_i32x4 = raw_bitcast
.bind_vector(I32, 4)
.bind_vector(ty, 128 / ty.lane_bits());
let raw_bitcast_i32x4_to_any16x8 = raw_bitcast
.bind_vector(ty, 128 / ty.lane_bits())
.bind_vector(I32, 4);
narrow.legalize(
def!(y = splat_x16x8(x)),
vec![
def!(a = scalar_to_vector(x)), // move into the lowest 16 bits of an XMM register
def!(b = insertlane(a, uimm8_one, x)), // insert the value again, but in the next-lowest 16 bits
def!(c = raw_bitcast_any16x8_to_i32x4(b)), // no instruction emitted; pretend this is an I32x4 so we can use PSHUFD
def!(d = x86_pshufd(c, uimm8_zero)), // broadcast the lowest 32-bit lane to all lanes with PSHUFD
def!(y = raw_bitcast_i32x4_to_any16x8(d)), // no instruction emitted; pretend this is the 16-lane vector type again
],
);
}
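
// Illustrative sketch (standalone reference model, not the generated code; assumes
// little-endian lane order): the 16-bit strategy places the scalar in the two lowest
// 16-bit lanes so the lowest 32-bit lane holds it twice, then broadcasts that 32-bit
// lane PSHUFD-style.
fn _splat_u16x8_model(x: u16) -> [u16; 8] {
    // scalar_to_vector + insertlane 1: the low 32 bits now contain x twice.
    let low32 = u32::from(x) | (u32::from(x) << 16);
    // raw_bitcast to i32x4 + x86_pshufd with mask 0: broadcast the lowest 32-bit lane.
    let as_i32x4 = [low32; 4];
    // raw_bitcast back to eight 16-bit lanes.
    let mut out = [0u16; 8];
    for (i, &lane) in as_i32x4.iter().enumerate() {
        out[2 * i] = (lane & 0xffff) as u16;
        out[2 * i + 1] = (lane >> 16) as u16;
    }
    out
}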

// SIMD splat: 32-bits
for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 32) {
let splat_any32x4 = splat.bind_vector(ty, 128 / ty.lane_bits());
narrow.legalize(
def!(y = splat_any32x4(x)),
vec![
def!(a = scalar_to_vector(x)), // translate to an x86 MOV to get the value in an XMM register
def!(y = x86_pshufd(a, uimm8_zero)), // broadcast the lowest 32-bit lane to all lanes with PSHUFD
],
);
}
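
// Illustrative sketch (standalone reference model, not the generated code): with
// 32-bit lanes PSHUFD broadcasts directly, since each 2-bit field of its immediate
// selects one of the four source lanes; an immediate of 0 selects lane 0 everywhere.
fn _splat_u32x4_model(x: u32) -> [u32; 4] {
    let mut v = [0u32; 4];
    v[0] = x; // scalar_to_vector: x in the lowest 32-bit lane
    [v[0]; 4] // x86_pshufd with immediate 0: lane 0 copied to every lane
}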

// SIMD splat: 64-bits
for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 64) {
let splat_any64x2 = splat.bind_vector(ty, 128 / ty.lane_bits());
narrow.legalize(
def!(y = splat_any64x2(x)),
vec![
def!(a = scalar_to_vector(x)), // move into the lowest 64 bits of an XMM register
def!(y = insertlane(a, uimm8_one, x)), // move into the highest 64 bits of the same XMM register
],
);
}
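
// Illustrative sketch (standalone reference model, not the generated code): with
// only two 64-bit lanes there is nothing left to shuffle; the scalar is written
// into both halves of the register directly.
fn _splat_u64x2_model(x: u64) -> [u64; 2] {
    let mut v = [0u64; 2];
    v[0] = x; // scalar_to_vector: x in the low 64 bits
    v[1] = x; // insertlane 1: x in the high 64 bits
    v
}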

narrow.build_and_add_to(&mut shared.transform_groups);
}

@@ -30,6 +30,7 @@ pub fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa {
let expand_flags = shared_defs.transform_groups.by_name("expand_flags");
let narrow = shared_defs.transform_groups.by_name("narrow");
let widen = shared_defs.transform_groups.by_name("widen");
let x86_narrow = shared_defs.transform_groups.by_name("x86_narrow");
let x86_expand = shared_defs.transform_groups.by_name("x86_expand");

x86_32.legalize_monomorphic(expand_flags);
@@ -42,7 +43,7 @@ pub fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa {
x86_32.legalize_type(F64, x86_expand);

x86_64.legalize_monomorphic(expand_flags);
x86_64.legalize_default(narrow);
x86_64.legalize_default(x86_narrow);
x86_64.legalize_type(B1, expand_flags);
x86_64.legalize_type(I8, widen);
x86_64.legalize_type(I16, widen);

@@ -738,6 +738,12 @@ impl From<f64> for Ieee64 {
}
}

impl From<u64> for Ieee64 {
fn from(x: u64) -> Self {
Ieee64::with_float(f64::from_bits(x))
}
}
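
// Illustrative sketch (standalone example, not part of this hunk): the new
// conversion treats the u64 as a raw IEEE-754 bit pattern rather than a numeric
// value, which is what the legalizer above relies on when it materializes the
// all-zero f64 constant used as a shuffle mask.
fn _ieee64_bit_pattern_example() {
    let bits: u64 = 0x00;
    let value = f64::from_bits(bits); // the all-zero bit pattern is +0.0
    assert_eq!(value, 0.0);
    assert_eq!(value.to_bits(), bits); // the pattern round-trips exactly
}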

#[cfg(test)]
mod tests {
use super::*;

cranelift/filetests/filetests/isa/x86/legalize-splat.clif (new file, 73 lines)
@@ -0,0 +1,73 @@
test compile
set enable_simd=true
set probestack_enabled=false
target x86_64 haswell

; use baldrdash calling convention here for simplicity (avoids prologue, epilogue)
function %test_splat_i32() -> i32x4 baldrdash {
ebb0:
v0 = iconst.i32 42
v1 = splat.i32x4 v0
return v1
}

; sameln: function %test_splat_i32() -> i32x4 [%xmm0] baldrdash {
; nextln: ss0 = incoming_arg 0, offset 0
; nextln:
; nextln: ebb0:
; nextln: v0 = iconst.i32 42
; nextln: v2 = scalar_to_vector.i32x4 v0
; nextln: v1 = x86_pshufd v2, 0
; nextln: return v1
; nextln: }

function %test_splat_i64() -> i64x2 baldrdash {
ebb0:
v0 = iconst.i64 42
v1 = splat.i64x2 v0
return v1
}

; check: ebb0:
; nextln: v0 = iconst.i64 42
; nextln: v2 = scalar_to_vector.i64x2 v0
; nextln: v1 = insertlane v2, 1, v0
; nextln: return v1

function %test_splat_b16() -> b16x8 baldrdash {
ebb0:
v0 = bconst.b16 true
v1 = splat.b16x8 v0
return v1
}

; check: ebb0:
; nextln: v0 = bconst.b16 true
; nextln: v2 = scalar_to_vector.b16x8 v0
; nextln: v3 = insertlane v2, 1, v0
; nextln: v4 = raw_bitcast.i32x4 v3
; nextln: v5 = x86_pshufd v4, 0
; nextln: v1 = raw_bitcast.b16x8 v5
; nextln: return v1

function %test_splat_i8() -> i8x16 baldrdash {
ebb0:
v0 = iconst.i8 42
v1 = splat.i8x16 v0
return v1
}

; check: ebb0:
; nextln: v2 = iconst.i32 42
; nextln: v0 = ireduce.i8 v2
; nextln: v3 = scalar_to_vector.i8x16 v0
; nextln: v4 = f64const 0.0
; nextln: v5 = bitcast.i8x16 v4
; nextln: v1 = x86_pshufb v3, v5
; nextln: return v1