x64: avoid load-coalescing SIMD operations with non-aligned loads

Fixes #2943, though not as optimally as might be desired. Many x64 SIMD
instructions require their memory operand to be aligned -- this change adds
that check. There are cases, however, where we can do better -- see #3106.
This commit is contained in:
Andrew Brown
2021-07-21 11:19:40 -07:00
parent a2cfddff9c
commit 6b86984c41
2 changed files with 21 additions and 0 deletions

View File

@@ -24,6 +24,8 @@ use crate::data_value::DataValue;
use crate::entity;
use ir::condcodes::{FloatCC, IntCC};
use super::MemFlags;
/// Some instructions use an external list of argument values because there is not enough space in
/// the 16-byte `InstructionData` struct. These value lists are stored in a memory pool in
/// `dfg.value_lists`.
@@ -395,6 +397,19 @@ impl InstructionData {
}
}
/// If this is a load/store instruction, return its memory flags.
pub fn memflags(&self) -> Option<MemFlags> {
match self {
&InstructionData::Load { flags, .. }
| &InstructionData::LoadComplex { flags, .. }
| &InstructionData::LoadNoOffset { flags, .. }
| &InstructionData::Store { flags, .. }
| &InstructionData::StoreComplex { flags, .. }
| &InstructionData::StoreNoOffset { flags, .. } => Some(flags),
_ => None,
}
}
/// Return information about a call instruction.
///
/// Any instruction that can call another function reveals its call signature here.

View File

@@ -153,6 +153,12 @@ fn is_mergeable_load<C: LowerCtx<I = Inst>>(
return None;
}
// SIMD instructions can only be load-coalesced when the loaded value comes
// from an aligned address.
if load_ty.is_vector() && !insn_data.memflags().map_or(false, |f| f.aligned()) {
return None;
}
// Just testing the opcode is enough, because the width will always match if
// the type does (and the type should match if the CLIF is properly
// constructed).