Avoid extra register movement when lowering the x86 extractlane of a float vector

This commit is based on the assumption that floats are already stored in XMM registers in x86. When extracting a lane, cranelift was moving the float to a regular register and back to an XMM register; this change avoids this by shuffling the float value to the lowest bits of the XMM register. It also assumes that the upper bits can be left as is (instead of zeroing them out).
2019-08-21 16:56:11 -07:00
parent f1363168a9
commit 00bedca274
7 changed files with 154 additions and 43 deletions
--- a/cranelift/codegen/src/isa/x86/enc_tables.rs
+++ b/cranelift/codegen/src/isa/x86/enc_tables.rs
@@ -5,6 +5,7 @@ use crate::bitset::BitSet;
 use crate::cursor::{Cursor, FuncCursor};
 use crate::flowgraph::ControlFlowGraph;
 use crate::ir::condcodes::{FloatCC, IntCC};
+use crate::ir::types::*;
 use crate::ir::{self, Function, Inst, InstBuilder};
 use crate::isa::constraints::*;
 use crate::isa::enc_tables::*;
@@ -893,3 +894,59 @@ fn expand_fcvt_to_uint_sat(
    cfg.recompute_ebb(pos.func, uint_large_ebb);
    cfg.recompute_ebb(pos.func, done);
 }
+
+/// Because floats already exist in XMM registers, we can keep them there when executing a CLIF
+/// extractlane instruction
+fn convert_extractlane(
+    inst: ir::Inst,
+    func: &mut ir::Function,
+    _cfg: &mut ControlFlowGraph,
+    _isa: &dyn TargetIsa,
+) {
+    let mut pos = FuncCursor::new(func).at_inst(inst);
+    pos.use_srcloc(inst);
+
+    if let ir::InstructionData::ExtractLane {
+        opcode: ir::Opcode::Extractlane,
+        arg,
+        lane,
+    } = pos.func.dfg[inst]
+    {
+        // NOTE: the following legalization assumes that the upper bits of the XMM register do
+        // not need to be zeroed during extractlane.
+        let value_type = pos.func.dfg.value_type(arg);
+        if value_type.lane_type().is_float() {
+            // Floats are already in XMM registers and can stay there.
+            let shuffled = if lane != 0 {
+                // Replace the extractlane with a PSHUFD to get the float in the right place.
+                match value_type {
+                    F32X4 => {
+                        // Move the selected lane to the 0 lane.
+                        let shuffle_mask: u8 = 0b00_00_00_00 | lane;
+                        pos.ins().x86_pshufd(arg, shuffle_mask)
+                    }
+                    F64X2 => {
+                        assert_eq!(lane, 1);
+                        // Because we know the lane == 1, we move the upper 64 bits to the lower
+                        // 64 bits, leaving the top 64 bits as-is.
+                        let shuffle_mask = 0b11_10_11_10;
+                        let bitcast = pos.ins().raw_bitcast(F32X4, arg);
+                        pos.ins().x86_pshufd(bitcast, shuffle_mask)
+                    }
+                    _ => unreachable!(),
+                }
+            } else {
+                // Remove the extractlane instruction, leaving the float where it is.
+                arg
+            };
+            // Then we must bitcast to the right type.
+            pos.func
+                .dfg
+                .replace(inst)
+                .raw_bitcast(value_type.lane_type(), shuffled);
+        } else {
+            // For non-floats, lower with the usual PEXTR* instruction.
+            pos.func.dfg.replace(inst).x86_pextr(arg, lane);
+        }
+    }
+}