egraphs: a few miscellaneous compile-time optimizations. (#5072)

* egraphs: a few miscellaneous compile-time optimizations.

These optimizations together are worth about a 2% compile-time
reduction, as measured on one core with spidermonkey.wasm as an input,
using `hyperfine` on `wasmtime compile`.

The changes included are:
- Better pre-allocation of the vecs that hold the concatenated blockparam and
  side-effect lists;
- Avoid the indirection of storing a list of result types for every `Pure` and
  `Inst` node, since almost all nodes produce only a single result; instead,
  store the arity and the single result type (if there is one) inline, and let
  result-projection nodes fill in the types otherwise (see the first sketch
  after this list);
- Pack the `MemoryState` enum into one `u32` (together with the removal of the
  type slice above, this shrinks `Node` from 48 bytes to 32 bytes; see the
  second sketch after this list);
- Always-inline an accessor (`entry` on `CtxHash`) that wasn't being inlined
  (`#[inline(always)]` appears to be load-bearing, rather than just
  `#[inline]`; see the last sketch after this list);
- Split the update-analysis path into two hot paths, one for the union case and
  one for the new-node case (the former can avoid recomputing the analysis for
  the contained node when replacing a node with a node-and-child eclass entry).
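
The three sketches below are illustrations only, written against stand-in
types rather than the real cranelift definitions. First, why trading the
result-type slice for an inline `(ty, arity)` pair shrinks the node and
removes a pointer chase; the struct and field names here are hypothetical:

```rust
// Stand-ins for illustration; the real `Type` and `Node` live in cranelift-codegen.
#[derive(Clone, Copy, PartialEq, Eq, Hash)]
struct Type(u16);

// Old shape: every node carries a handle to a separately stored slice of
// result types, even though almost all nodes have exactly one result.
// (The real code used a `BumpSlice<Type>` handle into a shared arena;
// a boxed slice stands in for it here.)
struct NodeWithTypeSlice {
    args: [u32; 2],
    types: Box<[Type]>, // indirection: extra storage plus a cache miss to read
}

// New shape: store the single result type inline, plus the result count.
// Multi-result nodes leave `ty` unused and let result-projection nodes
// supply the per-result types during elaboration.
struct NodeWithInlineType {
    args: [u32; 2],
    ty: Type,
    arity: u16,
}

fn main() {
    // On a typical 64-bit target: 24 bytes vs. 12 bytes for these stand-ins.
    println!("{}", std::mem::size_of::<NodeWithTypeSlice>());
    println!("{}", std::mem::size_of::<NodeWithInlineType>());
    assert!(std::mem::size_of::<NodeWithInlineType>() < std::mem::size_of::<NodeWithTypeSlice>());
}
```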
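Second, a self-contained sketch of the `MemoryState` packing: a 2-bit tag plus
a 30-bit instruction index fit in a single `u32`. The real version (visible in
the diff below) uses the `Inst` entity and its index; a bare `u32` index stands
in here:

```rust
/// Stand-in for the `Inst` entity index.
type InstIndex = u32;

#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum MemoryState {
    Entry,
    Store(InstIndex),
    BeforeInst(InstIndex),
    AfterInst(InstIndex),
}

/// Memory state packed into one u32: the low 2 bits are the tag, the
/// remaining 30 bits hold the instruction index (if any).
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
struct PackedMemoryState(u32);

impl From<MemoryState> for PackedMemoryState {
    fn from(state: MemoryState) -> Self {
        match state {
            MemoryState::Entry => Self(0),
            MemoryState::Store(i) => Self(1 | (i << 2)),
            MemoryState::BeforeInst(i) => Self(2 | (i << 2)),
            MemoryState::AfterInst(i) => Self(3 | (i << 2)),
        }
    }
}

impl PackedMemoryState {
    /// Does this memory state refer to a specific store instruction?
    fn as_store(self) -> Option<InstIndex> {
        if self.0 & 3 == 1 {
            Some(self.0 >> 2)
        } else {
            None
        }
    }
}

fn main() {
    // Round-trip check: the tag and index survive packing.
    let packed = PackedMemoryState::from(MemoryState::Store(42));
    assert_eq!(packed.as_store(), Some(42));
    assert_eq!(PackedMemoryState::from(MemoryState::Entry).as_store(), None);
    assert_eq!(std::mem::size_of::<PackedMemoryState>(), 4);
}
```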
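Last, the inlining tweak. The type and method below are hypothetical stand-ins
for the `entry` accessor mentioned above: `#[inline]` is only a hint (it also
makes the body available for cross-crate inlining), while `#[inline(always)]`
requests inlining much more strongly, which appears to be load-bearing on this
hot path:

```rust
// Hypothetical hot accessor standing in for the real `entry` method.
pub struct Table {
    slots: Vec<u32>,
}

impl Table {
    // Plain `#[inline]` was not enough here; `#[inline(always)]` was.
    #[inline(always)]
    pub fn entry(&mut self, idx: usize) -> &mut u32 {
        &mut self.slots[idx]
    }
}

fn main() {
    let mut t = Table { slots: vec![0; 4] };
    *t.entry(1) += 1;
    assert_eq!(t.slots[1], 1);
}
```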

* Review feedback.

* Fix test build.

* Fix lowering when an unused output with an invalid type is present.
Author: Chris Fallin
Date:   2022-10-19 11:05:00 -07:00 (committed by GitHub)
Parent: 0667a412d7
Commit: c392e461a3
9 changed files with 206 additions and 139 deletions


@@ -66,7 +66,11 @@ enum ElabStackEntry {
},
/// Waiting for a result to return one projected value of a
/// multi-value result.
-PendingProjection { canonical: Id, index: usize },
+PendingProjection {
+canonical: Id,
+index: usize,
+ty: Type,
+},
}
#[derive(Clone, Debug)]
@@ -189,15 +193,15 @@ impl<'a> Elaborator<'a> {
}
fn add_node(&mut self, node: &Node, args: &[Value], to_block: Block) -> ValueList {
-let (instdata, result_tys) = match node {
-Node::Pure { op, types, .. } | Node::Inst { op, types, .. } => (
+let (instdata, result_ty, arity) = match node {
+Node::Pure { op, ty, arity, .. } | Node::Inst { op, ty, arity, .. } => (
op.with_args(args, &mut self.func.dfg.value_lists),
-types.as_slice(&self.node_ctx.types),
-),
-Node::Load { op, ty, .. } => (
-op.with_args(args, &mut self.func.dfg.value_lists),
-std::slice::from_ref(ty),
+*ty,
+*arity,
),
+Node::Load { op, ty, .. } => {
+(op.with_args(args, &mut self.func.dfg.value_lists), *ty, 1)
+}
_ => panic!("Cannot `add_node()` on block param or projection"),
};
let srcloc = match node {
@@ -237,8 +241,12 @@ impl<'a> Elaborator<'a> {
let inst = self.func.dfg.make_inst(instdata);
self.func.srclocs[inst] = srcloc;
-for &ty in result_tys {
-self.func.dfg.append_result(inst, ty);
+if arity == 1 {
+self.func.dfg.append_result(inst, result_ty);
+} else {
+for _ in 0..arity {
+self.func.dfg.append_result(inst, crate::ir::types::INVALID);
+}
}
if is_terminator_group_inst {
@@ -371,11 +379,15 @@ impl<'a> Elaborator<'a> {
// the value we are projecting a part of, then
// eventually return here (saving state with a
// PendingProjection).
-if let Node::Result { value, result, .. } = node {
+if let Node::Result {
+value, result, ty, ..
+} = node
+{
trace!(" -> result; pushing arg value {}", value);
self.elab_stack.push(ElabStackEntry::PendingProjection {
index: *result,
canonical,
+ty: *ty,
});
self.elab_stack.push(ElabStackEntry::Start { id: *value });
continue;
@@ -493,7 +505,11 @@ impl<'a> Elaborator<'a> {
// Push onto the elab-results stack.
self.elab_result_stack.push(result)
}
-&ElabStackEntry::PendingProjection { index, canonical } => {
+&ElabStackEntry::PendingProjection {
+ty,
+index,
+canonical,
+} => {
self.elab_stack.pop();
// Grab the input from the elab-result stack.
@@ -511,10 +527,12 @@ impl<'a> Elaborator<'a> {
}
};
let values = values.as_slice(&self.func.dfg.value_lists);
+let value = values[index];
+self.func.dfg.fill_in_value_type(value, ty);
let value = IdValue::Value {
depth,
block,
-value: values[index],
+value,
};
self.id_to_value.insert_if_absent(canonical, value.clone());


@@ -1,9 +1,9 @@
//! Node definition for EGraph representation.
-use super::MemoryState;
-use crate::ir::{Block, DataFlowGraph, Inst, InstructionImms, Opcode, RelSourceLoc, Type};
+use super::PackedMemoryState;
+use crate::ir::{Block, DataFlowGraph, InstructionImms, Opcode, RelSourceLoc, Type};
use crate::loop_analysis::LoopLevel;
-use cranelift_egraph::{BumpArena, BumpSlice, CtxEq, CtxHash, Id, Language, UnionFind};
+use cranelift_egraph::{CtxEq, CtxHash, Id, Language, UnionFind};
use cranelift_entity::{EntityList, ListPool};
use std::hash::{Hash, Hasher};
@@ -31,8 +31,10 @@ pub enum Node {
op: InstructionImms,
/// eclass arguments to the operator.
args: EntityList<Id>,
-/// Types of results.
-types: BumpSlice<Type>,
+/// Type of result, if one.
+ty: Type,
+/// Number of results.
+arity: u16,
},
/// A CLIF instruction that has side-effects or is otherwise not
/// representable by `Pure`.
@@ -41,15 +43,10 @@ pub enum Node {
op: InstructionImms,
/// eclass arguments to the operator.
args: EntityList<Id>,
-/// Types of results.
-types: BumpSlice<Type>,
-/// The index of the original instruction. We include this so
-/// that the `Inst`s are not deduplicated: every instance is a
-/// logically separate and unique side-effect. However,
-/// because we clear the DataFlowGraph before elaboration,
-/// this `Inst` is *not* valid to fetch any details from the
-/// original instruction.
-inst: Inst,
+/// Type of result, if one.
+ty: Type,
+/// Number of results.
+arity: u16,
/// The source location to preserve.
srcloc: RelSourceLoc,
/// The loop level of this Inst.
@@ -83,14 +80,9 @@ pub enum Node {
/// the key).
addr: Id,
/// The abstract memory state that this load accesses.
-mem_state: MemoryState,
+mem_state: PackedMemoryState,
// -- not included in dedup key:
-/// The `Inst` we will use for a trap location for this
-/// load. Excluded from Eq/Hash so that loads that are
-/// identical except for the specific instance will dedup on
-/// top of each other.
-inst: Inst,
/// Source location, for traps. Not included in Eq/Hash.
srcloc: RelSourceLoc,
},
@@ -107,18 +99,14 @@ impl Node {
/// Shared pools for type and id lists in nodes.
pub struct NodeCtx {
-/// Arena for result-type arrays.
-pub types: BumpArena<Type>,
/// Arena for arg eclass-ID lists.
pub args: ListPool<Id>,
}
impl NodeCtx {
pub(crate) fn with_capacity_for_dfg(dfg: &DataFlowGraph) -> Self {
-let n_types = dfg.num_values();
let n_args = dfg.value_lists.capacity();
Self {
-types: BumpArena::arena_with_capacity(n_types),
args: ListPool::with_capacity(n_args),
}
}
@@ -168,26 +156,23 @@ impl CtxEq<Node, Node> for NodeCtx {
&Node::Pure {
ref op,
ref args,
-ref types,
+ty,
+arity: _,
},
&Node::Pure {
op: ref other_op,
args: ref other_args,
-types: ref other_types,
+ty: other_ty,
+arity: _,
},
-) => {
-*op == *other_op
-&& self.ids_eq(args, other_args, uf)
-&& types.as_slice(&self.types) == other_types.as_slice(&self.types)
-}
+) => *op == *other_op && self.ids_eq(args, other_args, uf) && ty == other_ty,
(
-&Node::Inst { inst, ref args, .. },
+&Node::Inst { ref args, .. },
&Node::Inst {
-inst: other_inst,
args: ref other_args,
..
},
-) => inst == other_inst && self.ids_eq(args, other_args, uf),
+) => self.ids_eq(args, other_args, uf),
(
&Node::Load {
ref op,
@@ -249,16 +234,14 @@ impl CtxHash<Node> for NodeCtx {
&Node::Pure {
ref op,
ref args,
-types: _,
+ty,
+arity: _,
} => {
op.hash(&mut state);
self.hash_ids(args, &mut state, uf);
-// Don't hash `types`: it requires an indirection
-// (hence cache misses), and result type *should* be
-// fully determined by op and args.
+ty.hash(&mut state);
}
-&Node::Inst { inst, ref args, .. } => {
-inst.hash(&mut state);
+&Node::Inst { ref args, .. } => {
self.hash_ids(args, &mut state, uf);
}
&Node::Load {
@@ -370,3 +353,14 @@ impl Language for NodeCtx {
}
}
}
+#[cfg(test)]
+mod test {
+#[test]
+#[cfg(target_pointer_width = "64")]
+fn node_size() {
+use super::*;
+assert_eq!(std::mem::size_of::<InstructionImms>(), 16);
+assert_eq!(std::mem::size_of::<Node>(), 32);
+}
+}


@@ -62,7 +62,7 @@ use crate::fx::{FxHashMap, FxHashSet};
use crate::inst_predicates::has_memory_fence_semantics;
use crate::ir::{Block, Function, Inst, InstructionData, MemFlags, Opcode};
use crate::trace;
-use cranelift_entity::SecondaryMap;
+use cranelift_entity::{EntityRef, SecondaryMap};
use smallvec::{smallvec, SmallVec};
/// For a given program point, the vector of last-store instruction
@@ -97,6 +97,32 @@ pub enum MemoryState {
AfterInst(Inst),
}
+/// Memory state index, packed into a u32.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub struct PackedMemoryState(u32);
+impl From<MemoryState> for PackedMemoryState {
+fn from(state: MemoryState) -> Self {
+match state {
+MemoryState::Entry => Self(0),
+MemoryState::Store(i) => Self(1 | (i.index() as u32) << 2),
+MemoryState::BeforeInst(i) => Self(2 | (i.index() as u32) << 2),
+MemoryState::AfterInst(i) => Self(3 | (i.index() as u32) << 2),
+}
+}
+}
+impl PackedMemoryState {
+/// Does this memory state refer to a specific store instruction?
+pub fn as_store(&self) -> Option<Inst> {
+if self.0 & 3 == 1 {
+Some(Inst::from_bits(self.0 >> 2))
+} else {
+None
+}
+}
+}
impl LastStores {
fn update(&mut self, func: &Function, inst: Inst) {
let opcode = func.dfg[inst].opcode();
@@ -148,7 +174,7 @@ impl LastStores {
pub struct AliasAnalysis {
/// Last-store instruction (or none) for a given load. Use a hash map
/// instead of a `SecondaryMap` because this is sparse.
-load_mem_state: FxHashMap<Inst, MemoryState>,
+load_mem_state: FxHashMap<Inst, PackedMemoryState>,
}
impl AliasAnalysis {
@@ -165,7 +191,7 @@ impl AliasAnalysis {
cfg: &ControlFlowGraph,
) -> SecondaryMap<Block, Option<LastStores>> {
let mut block_input = SecondaryMap::with_capacity(func.dfg.num_blocks());
-let mut worklist: SmallVec<[Block; 8]> = smallvec![];
+let mut worklist: SmallVec<[Block; 16]> = smallvec![];
let mut worklist_set = FxHashSet::default();
let entry = func.layout.entry_block().unwrap();
worklist.push(entry);
@@ -210,8 +236,9 @@ impl AliasAnalysis {
fn compute_load_last_stores(
func: &Function,
block_input: SecondaryMap<Block, Option<LastStores>>,
-) -> FxHashMap<Inst, MemoryState> {
+) -> FxHashMap<Inst, PackedMemoryState> {
let mut load_mem_state = FxHashMap::default();
+load_mem_state.reserve(func.dfg.num_insts() / 8);
for block in func.layout.blocks() {
let mut state = block_input[block].clone().unwrap();
@@ -249,7 +276,7 @@ impl AliasAnalysis {
mem_state,
);
-load_mem_state.insert(inst, mem_state);
+load_mem_state.insert(inst, mem_state.into());
}
state.update(func, inst);
@@ -260,7 +287,7 @@ impl AliasAnalysis {
}
/// Get the state seen by a load, if any.
-pub fn get_state_for_load(&self, inst: Inst) -> Option<MemoryState> {
+pub fn get_state_for_load(&self, inst: Inst) -> Option<PackedMemoryState> {
self.load_mem_state.get(&inst).copied()
}
}