[machinst x64]: add insertlane implementation

2020-09-22 12:39:50 -07:00
parent a8a6e4e69d
commit 29fa894790
5 changed files with 116 additions and 24 deletions
--- a/cranelift/codegen/src/isa/x64/inst/args.rs
+++ b/cranelift/codegen/src/isa/x64/inst/args.rs
@@ -383,6 +383,7 @@ pub enum SseOpcode {
    Movd,
    Movdqa,
    Movdqu,
+    Movlhps,
    Movq,
    Movss,
    Movsd,
@@ -403,6 +404,9 @@ pub enum SseOpcode {
    Paddw,
    Pavgb,
    Pavgw,
+    Pinsrb,
+    Pinsrw,
+    Pinsrd,
    Pmaxsb,
    Pmaxsw,
    Pmaxsd,
@@ -471,6 +475,7 @@ impl SseOpcode {
            | SseOpcode::Minps
            | SseOpcode::Minss
            | SseOpcode::Movaps
+            | SseOpcode::Movlhps
            | SseOpcode::Movss
            | SseOpcode::Movups
            | SseOpcode::Mulps
@@ -519,6 +524,7 @@ impl SseOpcode {
            | SseOpcode::Paddw
            | SseOpcode::Pavgb
            | SseOpcode::Pavgw
+            | SseOpcode::Pinsrw
            | SseOpcode::Pmaxsw
            | SseOpcode::Pmaxub
            | SseOpcode::Pminsw
@@ -548,6 +554,8 @@ impl SseOpcode {
            SseOpcode::Pabsb | SseOpcode::Pabsw | SseOpcode::Pabsd => SSSE3,

            SseOpcode::Insertps
+            | SseOpcode::Pinsrb
+            | SseOpcode::Pinsrd
            | SseOpcode::Pmaxsb
            | SseOpcode::Pmaxsd
            | SseOpcode::Pmaxuw
@@ -614,6 +622,7 @@ impl fmt::Debug for SseOpcode {
            SseOpcode::Movd => "movd",
            SseOpcode::Movdqa => "movdqa",
            SseOpcode::Movdqu => "movdqu",
+            SseOpcode::Movlhps => "movlhps",
            SseOpcode::Movq => "movq",
            SseOpcode::Movss => "movss",
            SseOpcode::Movsd => "movsd",
@@ -634,6 +643,9 @@ impl fmt::Debug for SseOpcode {
            SseOpcode::Paddw => "paddw",
            SseOpcode::Pavgb => "pavgb",
            SseOpcode::Pavgw => "pavgw",
+            SseOpcode::Pinsrb => "pinsrb",
+            SseOpcode::Pinsrw => "pinsrw",
+            SseOpcode::Pinsrd => "pinsrd",
            SseOpcode::Pmaxsb => "pmaxsb",
            SseOpcode::Pmaxsw => "pmaxsw",
            SseOpcode::Pmaxsd => "pmaxsd",
--- a/cranelift/codegen/src/isa/x64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -1760,14 +1760,16 @@ pub(crate) fn emit(
                SseOpcode::Divpd => (LegacyPrefixes::_66, 0x0F5E, 2),
                SseOpcode::Divss => (LegacyPrefixes::_F3, 0x0F5E, 2),
                SseOpcode::Divsd => (LegacyPrefixes::_F2, 0x0F5E, 2),
-                SseOpcode::Minps => (LegacyPrefixes::None, 0x0F5D, 2),
-                SseOpcode::Minpd => (LegacyPrefixes::_66, 0x0F5D, 2),
-                SseOpcode::Minss => (LegacyPrefixes::_F3, 0x0F5D, 2),
-                SseOpcode::Minsd => (LegacyPrefixes::_F2, 0x0F5D, 2),
                SseOpcode::Maxps => (LegacyPrefixes::None, 0x0F5F, 2),
                SseOpcode::Maxpd => (LegacyPrefixes::_66, 0x0F5F, 2),
                SseOpcode::Maxss => (LegacyPrefixes::_F3, 0x0F5F, 2),
                SseOpcode::Maxsd => (LegacyPrefixes::_F2, 0x0F5F, 2),
+                SseOpcode::Minps => (LegacyPrefixes::None, 0x0F5D, 2),
+                SseOpcode::Minpd => (LegacyPrefixes::_66, 0x0F5D, 2),
+                SseOpcode::Minss => (LegacyPrefixes::_F3, 0x0F5D, 2),
+                SseOpcode::Minsd => (LegacyPrefixes::_F2, 0x0F5D, 2),
+                SseOpcode::Movlhps => (LegacyPrefixes::None, 0x0F16, 2),
+                SseOpcode::Movsd => (LegacyPrefixes::_F2, 0x0F10, 2),
                SseOpcode::Mulps => (LegacyPrefixes::None, 0x0F59, 2),
                SseOpcode::Mulpd => (LegacyPrefixes::_66, 0x0F59, 2),
                SseOpcode::Mulss => (LegacyPrefixes::_F3, 0x0F59, 2),
@@ -1906,23 +1908,36 @@ pub(crate) fn emit(
            sink.bind_label(done);
        }

-        Inst::XmmRmRImm { op, src, dst, imm } => {
-            let prefix = match op {
-                SseOpcode::Cmpps => LegacyPrefixes::None,
-                SseOpcode::Cmppd => LegacyPrefixes::_66,
-                SseOpcode::Cmpss => LegacyPrefixes::_F3,
-                SseOpcode::Cmpsd => LegacyPrefixes::_F2,
+        Inst::XmmRmRImm {
+            op,
+            src,
+            dst,
+            imm,
+            is64: w,
+        } => {
+            let (prefix, opcode, num_opcodes) = match op {
+                SseOpcode::Cmpps => (LegacyPrefixes::None, 0x0FC2, 2),
+                SseOpcode::Cmppd => (LegacyPrefixes::_66, 0x0FC2, 2),
+                SseOpcode::Cmpss => (LegacyPrefixes::_F3, 0x0FC2, 2),
+                SseOpcode::Cmpsd => (LegacyPrefixes::_F2, 0x0FC2, 2),
+                SseOpcode::Insertps => (LegacyPrefixes::_66, 0x0F3A21, 3),
+                SseOpcode::Pinsrb => (LegacyPrefixes::_66, 0x0F3A20, 3),
+                SseOpcode::Pinsrw => (LegacyPrefixes::_66, 0x0FC4, 2),
+                SseOpcode::Pinsrd => (LegacyPrefixes::_66, 0x0F3A22, 3),
                _ => unimplemented!("Opcode {:?} not implemented", op),
            };
-            let opcode = 0x0FC2;
-            let rex = RexFlags::clear_w();
+            let rex = if *w {
+                RexFlags::set_w()
+            } else {
+                RexFlags::clear_w()
+            };
            match src {
                RegMem::Reg { reg } => {
-                    emit_std_reg_reg(sink, prefix, opcode, 2, dst.to_reg(), *reg, rex);
+                    emit_std_reg_reg(sink, prefix, opcode, num_opcodes, dst.to_reg(), *reg, rex);
                }
                RegMem::Mem { addr } => {
                    let addr = &addr.finalize(state);
-                    emit_std_reg_mem(sink, prefix, opcode, 2, dst.to_reg(), addr, rex);
+                    emit_std_reg_mem(sink, prefix, opcode, num_opcodes, dst.to_reg(), addr, rex);
                }
            }
            sink.put1(*imm)
--- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
@@ -3441,12 +3441,12 @@ fn test_x64_emit() {
    // ========================================================
    // XmmRmRImm
    insns.push((
-        Inst::xmm_rm_r_imm(SseOpcode::Cmppd, RegMem::reg(xmm5), w_xmm1, 2),
+        Inst::xmm_rm_r_imm(SseOpcode::Cmppd, RegMem::reg(xmm5), w_xmm1, 2, false),
        "660FC2CD02",
        "cmppd   $2, %xmm5, %xmm1",
    ));
    insns.push((
-        Inst::xmm_rm_r_imm(SseOpcode::Cmpps, RegMem::reg(xmm15), w_xmm7, 0),
+        Inst::xmm_rm_r_imm(SseOpcode::Cmpps, RegMem::reg(xmm15), w_xmm7, 0, false),
        "410FC2FF00",
        "cmpps   $0, %xmm15, %xmm7",
    ));
--- a/cranelift/codegen/src/isa/x64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/x64/inst/mod.rs
@@ -333,12 +333,13 @@ pub enum Inst {
        dst: Reg,
    },

-    /// A binary XMM instruction with an 8-bit immediate: cmp (ps pd) imm (reg addr) reg
+    /// A binary XMM instruction with an 8-bit immediate: e.g. cmp (ps pd) imm (reg addr) reg
    XmmRmRImm {
        op: SseOpcode,
        src: RegMem,
        dst: Writable<Reg>,
        imm: u8,
+        is64: bool,
    },

    // =====================================
@@ -780,11 +781,22 @@ impl Inst {
        }
    }

-    pub(crate) fn xmm_rm_r_imm(op: SseOpcode, src: RegMem, dst: Writable<Reg>, imm: u8) -> Inst {
-        src.assert_regclass_is(RegClass::V128);
+    pub(crate) fn xmm_rm_r_imm(
+        op: SseOpcode,
+        src: RegMem,
+        dst: Writable<Reg>,
+        imm: u8,
+        w: bool,
+    ) -> Inst {
        debug_assert!(dst.to_reg().get_class() == RegClass::V128);
-        debug_assert!(imm < 8);
-        Inst::XmmRmRImm { op, src, dst, imm }
+        // `imm` is not range-checked: its meaning (predicate, lane, control byte) depends on `op`.
+        Inst::XmmRmRImm {
+            op,
+            src,
+            dst,
+            imm,
+            is64: w,
+        }
    }

    pub(crate) fn movzx_rm_r(
@@ -1118,7 +1130,9 @@ impl Inst {
                        || *op == SseOpcode::Pxor)
            }

-            Self::XmmRmRImm { op, src, dst, imm } => {
+            Self::XmmRmRImm {
+                op, src, dst, imm, ..
+            } => {
                src.to_reg() == Some(dst.to_reg())
                    && (*op == SseOpcode::Cmppd || *op == SseOpcode::Cmpps)
                    && *imm == FcmpImm::Equal.encode()
@@ -1300,9 +1314,9 @@ impl ShowWithRRU for Inst {
                show_ireg_sized(rhs_dst.to_reg(), mb_rru, 8),
            ),

-            Inst::XmmRmRImm { op, src, dst, imm } => format!(
+            Inst::XmmRmRImm { op, src, dst, imm, is64 } => format!(
                "{} ${}, {}, {}",
-                ljustify(op.to_string()),
+                ljustify(format!("{}{}", op.to_string(), if *is64 { ".w" } else { "" })),
                imm,
                src.show_rru(mb_rru),
                dst.show_rru(mb_rru),
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -1394,7 +1394,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                ctx.emit(Inst::gen_move(dst, lhs, input_ty));

                // Emit the comparison.
-                ctx.emit(Inst::xmm_rm_r_imm(op, rhs, dst, imm.encode()));
+                ctx.emit(Inst::xmm_rm_r_imm(op, rhs, dst, imm.encode(), false));
            }
        }

@@ -1859,6 +1859,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                        RegMem::reg(tmp.to_reg()),
                        tmp,
                        cond.encode(),
+                        false,
                    );
                    ctx.emit(cmpps);

@@ -2639,6 +2640,56 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
            ctx.emit(Inst::gen_move(dst, src, ty));
        }

+        Opcode::Insertlane => {
+            // The instruction format maps to variables like: %dst = insertlane %in_vec, %src, %lane
+            let ty = ty.unwrap();
+            let dst = get_output_reg(ctx, outputs[0]);
+            let in_vec = put_input_in_reg(ctx, inputs[0]);
+            let src_ty = ctx.input_ty(insn, 1);
+            debug_assert!(!src_ty.is_vector());
+            let src = input_to_reg_mem(ctx, inputs[1]);
+            let lane = if let InstructionData::TernaryImm8 { imm, .. } = ctx.data(insn) {
+                *imm
+            } else {
+                unreachable!();
+            };
+            debug_assert!(lane < ty.lane_count() as u8);
+
+            ctx.emit(Inst::gen_move(dst, in_vec, ty));
+            if !src_ty.is_float() {
+                let (sse_op, w_bit) = match ty.lane_bits() {
+                    8 => (SseOpcode::Pinsrb, false),
+                    16 => (SseOpcode::Pinsrw, false),
+                    32 => (SseOpcode::Pinsrd, false),
+                    64 => (SseOpcode::Pinsrd, true),
+                    _ => panic!("Unable to insertlane for lane size: {}", ty.lane_bits()),
+                };
+                ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, w_bit));
+            } else if src_ty == types::F32 {
+                let sse_op = SseOpcode::Insertps;
+                // Insert 32 bits from the replacement (source index 0 in imm8 bits 7:6) into the
+                // vector at `lane` (destination index in imm8 bits 5:4); zero mask (3:0) stays 0.
+                let lane = 0b00_00_00_00 | lane << 4;
+                ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, false));
+            } else if src_ty == types::F64 {
+                let sse_op = match lane {
+                    // Move the lowest quadword in replacement to vector, keeping the upper bits.
+                    // NOTE(review): `movsd` from memory zeroes them; confirm `src` is a register.
+                    0 => SseOpcode::Movsd,
+                    // Move the low 64 bits of replacement vector to the high 64 bits of the
+                    // vector.
+                    1 => SseOpcode::Movlhps,
+                    _ => unreachable!(),
+                };
+                // Here we use the `xmm_rm_r` encoding because it correctly tells the register
+                // allocator how we are using `dst`: we are using `dst` as a `mod` whereas other
+                // encoding formats like `xmm_unary_rm_r` treat it as a `def`.
+                ctx.emit(Inst::xmm_rm_r(sse_op, src, dst));
+            } else {
+                panic!("Unable to insertlane for type: {}", ty);
+            }
+        }
+
        Opcode::IaddImm
        | Opcode::ImulImm
        | Opcode::UdivImm