Avoid extra register movement when lowering an x86 insertlane to a float vector

Andrew Brown
2019-08-23 11:38:29 -07:00
parent 3dfc68afb1
commit 295b2ef614
11 changed files with 334 additions and 55 deletions

View File

@@ -396,7 +396,6 @@ pub(crate) fn define(
let ifcmp_sp = shared.by_name("ifcmp_sp");
let imul = shared.by_name("imul");
let indirect_jump_table_br = shared.by_name("indirect_jump_table_br");
let insertlane = shared.by_name("insertlane");
let ireduce = shared.by_name("ireduce");
let ishl = shared.by_name("ishl");
let ishl_imm = shared.by_name("ishl_imm");
@@ -469,8 +468,12 @@ pub(crate) fn define(
let x86_cvtt2si = x86.by_name("x86_cvtt2si");
let x86_fmax = x86.by_name("x86_fmax");
let x86_fmin = x86.by_name("x86_fmin");
let x86_insertps = x86.by_name("x86_insertps");
let x86_movlhps = x86.by_name("x86_movlhps");
let x86_movsd = x86.by_name("x86_movsd");
let x86_pop = x86.by_name("x86_pop");
let x86_pextr = x86.by_name("x86_pextr");
let x86_pinsr = x86.by_name("x86_pinsr");
let x86_pshufd = x86.by_name("x86_pshufd");
let x86_pshufb = x86.by_name("x86_pshufb");
let x86_push = x86.by_name("x86_push");
@@ -501,6 +504,7 @@ pub(crate) fn define(
let rec_f64imm_z = r.template("f64imm_z");
let rec_fa = r.template("fa");
let rec_fax = r.template("fax");
let rec_fa_ib = r.template("fa_ib");
let rec_fcmp = r.template("fcmp");
let rec_fcscc = r.template("fcscc");
let rec_ffillnull = r.recipe("ffillnull");
@@ -1785,16 +1789,16 @@ pub(crate) fn define(
}
// SIMD insertlane
- let mut insertlane_mapping: HashMap<u64, (Vec<u8>, Option<SettingPredicateNumber>)> =
+ let mut x86_pinsr_mapping: HashMap<u64, (Vec<u8>, Option<SettingPredicateNumber>)> =
HashMap::new();
- insertlane_mapping.insert(8, (vec![0x66, 0x0f, 0x3a, 0x20], Some(use_sse41_simd))); // PINSRB
- insertlane_mapping.insert(16, (vec![0x66, 0x0f, 0xc4], None)); // PINSRW from SSE2
- insertlane_mapping.insert(32, (vec![0x66, 0x0f, 0x3a, 0x22], Some(use_sse41_simd))); // PINSRD
- insertlane_mapping.insert(64, (vec![0x66, 0x0f, 0x3a, 0x22], Some(use_sse41_simd))); // PINSRQ, only x86_64
+ x86_pinsr_mapping.insert(8, (vec![0x66, 0x0f, 0x3a, 0x20], Some(use_sse41_simd))); // PINSRB
+ x86_pinsr_mapping.insert(16, (vec![0x66, 0x0f, 0xc4], None)); // PINSRW from SSE2
+ x86_pinsr_mapping.insert(32, (vec![0x66, 0x0f, 0x3a, 0x22], Some(use_sse41_simd))); // PINSRD
+ x86_pinsr_mapping.insert(64, (vec![0x66, 0x0f, 0x3a, 0x22], Some(use_sse41_simd))); // PINSRQ, only x86_64
for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
- if let Some((opcode, isap)) = insertlane_mapping.get(&ty.lane_bits()) {
- let instruction = insertlane.bind_vector_from_lane(ty, sse_vector_size);
+ if let Some((opcode, isap)) = x86_pinsr_mapping.get(&ty.lane_bits()) {
+ let instruction = x86_pinsr.bind_vector_from_lane(ty, sse_vector_size);
let template = rec_r_ib_unsigned_r.opcodes(opcode.clone());
if ty.lane_bits() < 64 {
e.enc_32_64_maybe_isap(instruction, template.nonrex(), isap.clone());
@@ -1805,13 +1809,34 @@ pub(crate) fn define(
}
}
// for legalizing insertlane with floats, INSERTPS from SSE4.1
{
let instruction = x86_insertps.bind_vector_from_lane(F32, sse_vector_size);
let template = rec_fa_ib.nonrex().opcodes(vec![0x66, 0x0f, 0x3a, 0x21]);
e.enc_32_64_maybe_isap(instruction, template, Some(use_sse41_simd));
}
// for legalizing insertlane with floats, MOVSD from SSE2
{
let instruction = x86_movsd.bind_vector_from_lane(F64, sse_vector_size);
let template = rec_fa.nonrex().opcodes(vec![0xf2, 0x0f, 0x10]);
e.enc_32_64_maybe_isap(instruction, template, None); // from SSE2
}
// for legalizing insertlane with floats, MOVLHPS from SSE
{
let instruction = x86_movlhps.bind_vector_from_lane(F64, sse_vector_size);
let template = rec_fa.nonrex().opcodes(vec![0x0f, 0x16]);
e.enc_32_64_maybe_isap(instruction, template, None); // from SSE
}
// SIMD extractlane
let mut x86_pextr_mapping: HashMap<u64, (Vec<u8>, Option<SettingPredicateNumber>)> =
HashMap::new();
- x86_pextr_mapping.insert(8, (vec![0x66, 0x0f, 0x3a, 0x14], Some(use_sse41))); // PEXTRB
- x86_pextr_mapping.insert(16, (vec![0x66, 0x0f, 0xc5], None)); // PEXTRW from SSE2, SSE4.1 has a PEXTRW that can move to reg/m16 but the opcode is four bytes
- x86_pextr_mapping.insert(32, (vec![0x66, 0x0f, 0x3a, 0x16], Some(use_sse41))); // PEXTRD
- x86_pextr_mapping.insert(64, (vec![0x66, 0x0f, 0x3a, 0x16], Some(use_sse41))); // PEXTRQ, only x86_64
+ x86_pextr_mapping.insert(8, (vec![0x66, 0x0f, 0x3a, 0x14], Some(use_sse41_simd))); // PEXTRB
+ x86_pextr_mapping.insert(16, (vec![0x66, 0x0f, 0xc5], None)); // PEXTRW from SSE2, SSE4.1 has a PEXTRW that can move to reg/m16 but the opcode is four bytes
+ x86_pextr_mapping.insert(32, (vec![0x66, 0x0f, 0x3a, 0x16], Some(use_sse41_simd))); // PEXTRD
+ x86_pextr_mapping.insert(64, (vec![0x66, 0x0f, 0x3a, 0x16], Some(use_sse41_simd))); // PEXTRQ, only x86_64
for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
if let Some((opcode, isap)) = x86_pextr_mapping.get(&ty.lane_bits()) {
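
Taken together, the three float encodings added above (INSERTPS, MOVSD, MOVLHPS) give each float insertlane a single-instruction lowering. A minimal sketch of the selection logic, using illustrative names that are not from the commit:

// Which SSE instruction the float insertlane path selects (illustrative only).
enum FloatInsert {
    Insertps { imm: u8 }, // SSE4.1: f32x4, any lane 0-3
    Movsd,                // SSE2: f64x2 lane 0, replaces the low quadword
    Movlhps,              // SSE: f64x2 lane 1, src low quadword -> dest high quadword
}

fn select_float_insert(lane_bits: u8, lane: u8) -> FloatInsert {
    match (lane_bits, lane) {
        (32, l) if l <= 3 => FloatInsert::Insertps { imm: l << 4 },
        (64, 0) => FloatInsert::Movsd,
        (64, 1) => FloatInsert::Movlhps,
        _ => unreachable!("not a valid float vector lane"),
    }
}

fn main() {
    // f32 lane 2 -> INSERTPS with the destination field (bits 5:4) set to 2.
    assert!(matches!(select_float_insert(32, 2), FloatInsert::Insertps { imm: 0x20 }));
}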

View File

@@ -308,5 +308,84 @@ pub(crate) fn define(
.operands_out(vec![a]),
);
let IBxN = &TypeVar::new(
"IBxN",
"A SIMD vector type containing only booleans and integers",
TypeSetBuilder::new()
.ints(Interval::All)
.bools(Interval::All)
.simd_lanes(Interval::All)
.includes_scalars(false)
.build(),
);
let x = &operand("x", IBxN);
let y = &operand_doc("y", &IBxN.lane_of(), "New lane value");
let a = &operand("a", IBxN);
ig.push(
Inst::new(
"x86_pinsr",
r#"
Insert ``y`` into ``x`` at lane ``Idx``.
The lane index, ``Idx``, is an immediate value, not an SSA value. It
must indicate a valid lane index for the type of ``x``.
"#,
)
.operands_in(vec![x, Idx, y])
.operands_out(vec![a]),
);
let FxN = &TypeVar::new(
"FxN",
"A SIMD vector type containing floats",
TypeSetBuilder::new()
.floats(Interval::All)
.simd_lanes(Interval::All)
.includes_scalars(false)
.build(),
);
let x = &operand("x", FxN);
let y = &operand_doc("y", &FxN.lane_of(), "New lane value");
let a = &operand("a", FxN);
ig.push(
Inst::new(
"x86_insertps",
r#"
Insert a lane of ``y`` into ``x``, using ``Idx`` to encode both the lane the value is
extracted from and the lane it is inserted into. This is similar to x86_pinsr but inserts
floats, which are already stored in an XMM register.
"#,
)
.operands_in(vec![x, Idx, y])
.operands_out(vec![a]),
);
let x = &operand("x", FxN);
let y = &operand("y", FxN);
let a = &operand("a", FxN);
ig.push(
Inst::new(
"x86_movsd",
r#"
Move the low 64 bits of the float vector ``y`` to the low 64 bits of the float vector ``x``.
"#,
)
.operands_in(vec![x, y])
.operands_out(vec![a]),
);
ig.push(
Inst::new(
"x86_movlhps",
r#"
Move the low 64 bits of the float vector ``y`` to the high 64 bits of the float vector ``x``.
"#,
)
.operands_in(vec![x, y])
.operands_out(vec![a]),
);
ig.build()
}
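
For x86_insertps, the single ``Idx`` byte packs three fields. A small sketch of that layout, assuming the standard INSERTPS immediate format (bits 7:6 source lane, bits 5:4 destination lane, bits 3:0 zero mask); the helper name is hypothetical:

/// Pack an INSERTPS-style immediate (illustrative helper, not part of this commit).
fn insertps_imm(src_lane: u8, dst_lane: u8, zero_mask: u8) -> u8 {
    assert!(src_lane <= 3 && dst_lane <= 3 && zero_mask <= 0xf);
    (src_lane << 6) | (dst_lane << 4) | zero_mask
}

fn main() {
    // The legalization later in this commit always reads lane 0 of the
    // replacement, so only the destination field varies: 0b00_10_0000 = 0x20.
    assert_eq!(insertps_imm(0, 2, 0), 0x20);
}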

View File

@@ -381,6 +381,7 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
}
narrow.custom_legalize(extractlane, "convert_extractlane");
narrow.custom_legalize(insertlane, "convert_insertlane");
narrow.build_and_add_to(&mut shared.transform_groups);
}

View File

@@ -566,6 +566,27 @@ pub(crate) fn define<'shared>(
),
);
// XX /r with FPR ins and outs. A form with a byte immediate.
{
let format = formats.get(f_insert_lane);
recipes.add_template_recipe(
EncodingRecipeBuilder::new("fa_ib", f_insert_lane, 2)
.operands_in(vec![fpr, fpr])
.operands_out(vec![0])
.inst_predicate(InstructionPredicate::new_is_unsigned_int(
format, "lane", 8, 0,
))
.emit(
r#"
{{PUT_OP}}(bits, rex2(in_reg1, in_reg0), sink);
modrm_rr(in_reg1, in_reg0, sink);
let imm: i64 = lane.into();
sink.put1(imm as u8);
"#,
),
);
}
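// Annotation (not part of the commit): a hand-worked example of what the fa_ib
// recipe should emit for INSERTPS with in_reg0 = %xmm1 (tied in/out), in_reg1 =
// %xmm2, and lane immediate 0x20, assuming the usual ModRM layout:
//   66 0f 3a 21   opcode bytes from the template
//   ca            ModRM: mod=11, reg=xmm1 (in_reg0), rm=xmm2 (in_reg1)
//   20            the one-byte lane immediate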
// XX /n for a unary operation with extension bits.
recipes.add_template_recipe(
EncodingRecipeBuilder::new("ur", f_unary, 1)

View File

@@ -950,3 +950,65 @@ fn convert_extractlane(
}
}
}
/// Because floats already live in XMM registers, we can keep them there when lowering a CLIF
/// insertlane instruction.
fn convert_insertlane(
inst: ir::Inst,
func: &mut ir::Function,
_cfg: &mut ControlFlowGraph,
_isa: &dyn TargetIsa,
) {
let mut pos = FuncCursor::new(func).at_inst(inst);
pos.use_srcloc(inst);
if let ir::InstructionData::InsertLane {
opcode: ir::Opcode::Insertlane,
args: [vector, replacement],
lane,
} = pos.func.dfg[inst]
{
let value_type = pos.func.dfg.value_type(vector);
if value_type.lane_type().is_float() {
// Floats are already in XMM registers and can stay there.
match value_type {
F32X4 => {
assert!(lane <= 3);
let immediate = 0b00_00_00_00 | lane << 4;
// Insert 32 bits from lane 0 of replacement (source lane in bits 7:6 of the
// immediate) into the lane of vector selected by bits 5:4; the zero mask
// (bits 3:0) is left clear.
pos.func
.dfg
.replace(inst)
.x86_insertps(vector, immediate, replacement)
}
F64X2 => {
let replacement_as_vector = pos.ins().raw_bitcast(F64X2, replacement); // view the f64 as a vector; only needed to satisfy the type system
if lane == 0 {
// Move the lowest quadword in replacement to vector without changing
// the upper bits.
pos.func
.dfg
.replace(inst)
.x86_movsd(vector, replacement_as_vector)
} else {
assert_eq!(lane, 1);
// Move the low 64 bits of replacement vector to the high 64 bits of the
// vector.
pos.func
.dfg
.replace(inst)
.x86_movlhps(vector, replacement_as_vector)
}
}
_ => unreachable!(),
};
} else {
// For non-floats, lower with the usual PINSR* instruction.
pos.func
.dfg
.replace(inst)
.x86_pinsr(vector, lane, replacement);
}
}
}
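
A plain-Rust model of the two f64x2 cases above, treating each vector as two quadwords; this is a sketch for intuition only (note that the register form of MOVSD merges into the destination rather than zeroing the high bits):

// Lane-level semantics of the two vector-to-vector moves (illustrative model).
fn movsd(x: [u64; 2], y: [u64; 2]) -> [u64; 2] {
    [y[0], x[1]] // low quadword from y, high quadword preserved: lane 0 insert
}

fn movlhps(x: [u64; 2], y: [u64; 2]) -> [u64; 2] {
    [x[0], y[0]] // y's low quadword lands in the high lane: lane 1 insert
}

fn main() {
    assert_eq!(movsd([1, 2], [9, 0]), [9, 2]);
    assert_eq!(movlhps([1, 2], [9, 0]), [1, 9]);
}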

View File

@@ -107,8 +107,10 @@ impl<'a> LocationVerifier<'a> {
fatal!(
errors,
inst,
"{} constraints not satisfied",
self.encinfo.display(enc)
"{} constraints not satisfied in: {}\n{}",
self.encinfo.display(enc),
self.func.dfg.display_inst(inst, self.isa),
self.func.display(self.isa)
)
}

View File

@@ -28,3 +28,41 @@ ebb0:
return v3
}
; run
function %test_extractlane_i32_with_vector_reuse() -> b1 {
ebb0:
v0 = iconst.i32 42
v1 = iconst.i32 99
v2 = splat.i32x4 v0
v3 = insertlane v2, 2, v1
v4 = extractlane v3, 3
v5 = icmp eq v4, v0
v6 = extractlane v3, 2
v7 = icmp eq v6, v1
v8 = band v5, v7
return v8
}
; run
function %test_extractlane_f32_with_vector_reuse() -> b1 {
ebb0:
v0 = f32const 0x42.42
v1 = f32const 0x99.99
v2 = splat.f32x4 v0
v3 = insertlane v2, 2, v1
v4 = extractlane v3, 3
v5 = fcmp eq v4, v0
v6 = extractlane v3, 2
v7 = fcmp eq v6, v1
v8 = band v5, v7
return v8
}
; run

View File

@@ -0,0 +1,42 @@
test binemit
set enable_simd
target x86_64 haswell
; for insertlane, floats are legalized differently from integers and booleans; integers and booleans
; use x86_pinsr, which is placed in the IR manually here so that it can be binemit-tested
function %test_insertlane_b8() {
ebb0:
[-, %rax] v0 = bconst.b8 true
[-, %rbx] v1 = bconst.b8 false
[-, %xmm0] v2 = splat.b8x16 v0
[-, %xmm0] v3 = x86_pinsr v2, 10, v1 ; bin: 66 0f 3a 20 c3 0a
return
}
function %test_insertlane_i16() {
ebb0:
[-, %rax] v0 = iconst.i16 4
[-, %rbx] v1 = iconst.i16 5
[-, %xmm1] v2 = splat.i16x8 v0
[-, %xmm1] v3 = x86_pinsr v2, 4, v1 ; bin: 66 0f c4 cb 04
return
}
function %test_insertlane_i32() {
ebb0:
[-, %rax] v0 = iconst.i32 42
[-, %rbx] v1 = iconst.i32 99
[-, %xmm4] v2 = splat.i32x4 v0
[-, %xmm4] v3 = x86_pinsr v2, 2, v1 ; bin: 66 0f 3a 22 e3 02
return
}
function %test_insertlane_b64() {
ebb0:
[-, %rax] v0 = bconst.b64 true
[-, %rbx] v1 = bconst.b64 false
[-, %xmm2] v2 = splat.b64x2 v0
[-, %xmm2] v3 = x86_pinsr v2, 1, v1 ; bin: 66 48 0f 3a 22 d3 01
return
}
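; annotation (not part of the test): the PINSRQ check above decodes byte by byte as
;   66        mandatory operand-size prefix for the PINSR* family
;   48        REX.W, selecting the 64-bit PINSRQ form of the shared opcode
;   0f 3a 22  escape bytes plus the opcode shared with PINSRD
;   d3        ModRM: mod=11, reg=xmm2, rm=rbx
;   01        the lane-index immediate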

View File

@@ -0,0 +1,48 @@
test run
set enable_simd
; TODO once SIMD vector comparison is implemented, remove use of extractlane below
function %test_insertlane_b8() -> b8 {
ebb0:
v1 = bconst.b8 true
v2 = vconst.b8x16 [false false false false false false false false false false false false false false false false]
v3 = insertlane v2, 10, v1
v4 = extractlane v3, 10
return v4
}
; run
function %test_insertlane_f32() -> b1 {
ebb0:
v0 = f32const 0x42.42
v1 = vconst.f32x4 0x00
v2 = insertlane v1, 1, v0
v3 = extractlane v2, 1
v4 = fcmp eq v3, v0
return v4
}
; run
function %test_insertlane_f64_lane1() -> b1 {
ebb0:
v0 = f64const 0x42.42
v1 = vconst.f64x2 0x00
v2 = insertlane v1, 1, v0
v3 = extractlane v2, 1
v4 = fcmp eq v3, v0
return v4
}
; run
function %test_insertlane_f64_lane0() -> b1 {
ebb0:
v0 = f64const 0x42.42
v1 = vconst.f64x2 0x00
v2 = insertlane v1, 0, v0
v3 = extractlane v2, 0
v4 = fcmp eq v3, v0
return v4
}
; run

View File

@@ -1,39 +0,0 @@
test binemit
set enable_simd
target x86_64 haswell
function %test_insertlane_b8() {
ebb0:
[-, %rax] v0 = bconst.b8 true
[-, %rbx] v1 = bconst.b8 false
[-, %xmm0] v2 = splat.b8x16 v0
[-, %xmm0] v3 = insertlane v2, 10, v1 ; bin: 66 0f 3a 20 c3 0a
return
}
function %test_insertlane_i16() {
ebb0:
[-, %rax] v0 = iconst.i16 4
[-, %rbx] v1 = iconst.i16 5
[-, %xmm1] v2 = splat.i16x8 v0
[-, %xmm1] v3 = insertlane v2, 4, v1 ; bin: 66 0f c4 cb 04
return
}
function %test_insertlane_i32() {
ebb0:
[-, %rax] v0 = iconst.i32 42
[-, %rbx] v1 = iconst.i32 99
[-, %xmm4] v2 = splat.i32x4 v0
[-, %xmm4] v3 = insertlane v2, 2, v1 ; bin: 66 0f 3a 22 e3 02
return
}
function %test_insertlane_f64() {
ebb0:
[-, %rax] v0 = f64const 0x0.0
[-, %rbx] v1 = f64const 0x4.2
[-, %xmm2] v2 = splat.f64x2 v0
[-, %xmm2] v3 = insertlane v2, 1, v1 ; bin: 66 48 0f 3a 22 d3 01
return
}

View File

@@ -33,7 +33,7 @@ ebb0:
; check: ebb0:
; nextln: v0 = iconst.i64 42
; nextln: v2 = scalar_to_vector.i64x2 v0
- ; nextln: v1 = insertlane v2, 1, v0
+ ; nextln: v1 = x86_pinsr v2, 1, v0
; nextln: return v1
@@ -48,7 +48,7 @@ ebb0:
; check: ebb0:
; nextln: v0 = bconst.b16 true
; nextln: v2 = scalar_to_vector.b16x8 v0
- ; nextln: v3 = insertlane v2, 1, v0
+ ; nextln: v3 = x86_pinsr v2, 1, v0
; nextln: v4 = raw_bitcast.i32x4 v3
; nextln: v5 = x86_pshufd v4, 0
; nextln: v1 = raw_bitcast.b16x8 v5