egraphs: a few miscellaneous compile-time optimizations. (#5072)

* egraphs: a few miscellaneous compile-time optimizations.

These optimizations together are worth about a 2% compile-time
reduction, as measured on one core with spidermonkey.wasm as an input,
using `hyperfine` on `wasmtime compile`.

The changes included are:
- Better pre-allocation of the vecs that hold the concatenated blockparam and
  side-effect lists;
- Avoid the indirection of storing a list of result types for every `Pure` and
  `Inst` node, since almost all nodes produce only a single result; instead,
  store the arity and the single result type (if there is one) inline, and let
  result-projection nodes fill in the types otherwise (see the first sketch
  after this list);
- Pack the `MemoryState` enum into one `u32` (together with the removal of the
  type slice above, this shrinks `Node` from 48 bytes to 32 bytes; see the
  second sketch after this list);
- Always-inline an accessor (`entry` on `CtxHash`) that wasn't being inlined
  (`#[inline(always)]` appears to be load-bearing, rather than just
  `#[inline]`; see the last sketch after this list);
- Split the update-analysis path into two hot paths, one for the union case and
  one for the new-node case (the former can avoid recomputing the analysis for
  the contained node when replacing a node with a node-and-child eclass entry).
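
The three sketches below are illustrations only, written against stand-in
types rather than the real cranelift definitions. First, why trading the
result-type slice for an inline `(ty, arity)` pair shrinks the node and
removes a pointer chase; the struct and field names here are hypothetical:

```rust
// Stand-ins for illustration; the real `Type` and `Node` live in cranelift-codegen.
#[derive(Clone, Copy, PartialEq, Eq, Hash)]
struct Type(u16);

// Old shape: every node carries a handle to a separately stored slice of
// result types, even though almost all nodes have exactly one result.
// (The real code used a `BumpSlice<Type>` handle into a shared arena;
// a boxed slice stands in for it here.)
struct NodeWithTypeSlice {
    args: [u32; 2],
    types: Box<[Type]>, // indirection: extra storage plus a cache miss to read
}

// New shape: store the single result type inline, plus the result count.
// Multi-result nodes leave `ty` unused and let result-projection nodes
// supply the per-result types during elaboration.
struct NodeWithInlineType {
    args: [u32; 2],
    ty: Type,
    arity: u16,
}

fn main() {
    // On a typical 64-bit target: 24 bytes vs. 12 bytes for these stand-ins.
    println!("{}", std::mem::size_of::<NodeWithTypeSlice>());
    println!("{}", std::mem::size_of::<NodeWithInlineType>());
    assert!(std::mem::size_of::<NodeWithInlineType>() < std::mem::size_of::<NodeWithTypeSlice>());
}
```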
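Second, a self-contained sketch of the `MemoryState` packing: a 2-bit tag plus
a 30-bit instruction index fit in a single `u32`. The real version (visible in
the diff below) uses the `Inst` entity and its index; a bare `u32` index stands
in here:

```rust
/// Stand-in for the `Inst` entity index.
type InstIndex = u32;

#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum MemoryState {
    Entry,
    Store(InstIndex),
    BeforeInst(InstIndex),
    AfterInst(InstIndex),
}

/// Memory state packed into one u32: the low 2 bits are the tag, the
/// remaining 30 bits hold the instruction index (if any).
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
struct PackedMemoryState(u32);

impl From<MemoryState> for PackedMemoryState {
    fn from(state: MemoryState) -> Self {
        match state {
            MemoryState::Entry => Self(0),
            MemoryState::Store(i) => Self(1 | (i << 2)),
            MemoryState::BeforeInst(i) => Self(2 | (i << 2)),
            MemoryState::AfterInst(i) => Self(3 | (i << 2)),
        }
    }
}

impl PackedMemoryState {
    /// Does this memory state refer to a specific store instruction?
    fn as_store(self) -> Option<InstIndex> {
        if self.0 & 3 == 1 {
            Some(self.0 >> 2)
        } else {
            None
        }
    }
}

fn main() {
    // Round-trip check: the tag and index survive packing.
    let packed = PackedMemoryState::from(MemoryState::Store(42));
    assert_eq!(packed.as_store(), Some(42));
    assert_eq!(PackedMemoryState::from(MemoryState::Entry).as_store(), None);
    assert_eq!(std::mem::size_of::<PackedMemoryState>(), 4);
}
```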
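Last, the inlining tweak. The type and method below are hypothetical stand-ins
for the `entry` accessor mentioned above: `#[inline]` is only a hint (it also
makes the body available for cross-crate inlining), while `#[inline(always)]`
requests inlining much more strongly, which appears to be load-bearing on this
hot path:

```rust
// Hypothetical hot accessor standing in for the real `entry` method.
pub struct Table {
    slots: Vec<u32>,
}

impl Table {
    // Plain `#[inline]` was not enough here; `#[inline(always)]` was.
    #[inline(always)]
    pub fn entry(&mut self, idx: usize) -> &mut u32 {
        &mut self.slots[idx]
    }
}

fn main() {
    let mut t = Table { slots: vec![0; 4] };
    *t.entry(1) += 1;
    assert_eq!(t.slots[1], 1);
}
```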

* Review feedback.

* Fix test build.

* Fix lowering when an unused output with an invalid type is present.
Author: Chris Fallin
Date:   2022-10-19 11:05:00 -07:00 (committed by GitHub)
Parent: 0667a412d7
Commit: c392e461a3
9 changed files with 206 additions and 139 deletions


@@ -66,7 +66,11 @@ enum ElabStackEntry {
},
/// Waiting for a result to return one projected value of a
/// multi-value result.
-PendingProjection { canonical: Id, index: usize },
+PendingProjection {
+canonical: Id,
+index: usize,
+ty: Type,
+},
}
#[derive(Clone, Debug)]
@@ -189,15 +193,15 @@ impl<'a> Elaborator<'a> {
}
fn add_node(&mut self, node: &Node, args: &[Value], to_block: Block) -> ValueList {
-let (instdata, result_tys) = match node {
-Node::Pure { op, types, .. } | Node::Inst { op, types, .. } => (
+let (instdata, result_ty, arity) = match node {
+Node::Pure { op, ty, arity, .. } | Node::Inst { op, ty, arity, .. } => (
op.with_args(args, &mut self.func.dfg.value_lists),
-types.as_slice(&self.node_ctx.types),
-),
-Node::Load { op, ty, .. } => (
-op.with_args(args, &mut self.func.dfg.value_lists),
-std::slice::from_ref(ty),
+*ty,
+*arity,
),
+Node::Load { op, ty, .. } => {
+(op.with_args(args, &mut self.func.dfg.value_lists), *ty, 1)
+}
_ => panic!("Cannot `add_node()` on block param or projection"),
};
let srcloc = match node {
@@ -237,8 +241,12 @@ impl<'a> Elaborator<'a> {
let inst = self.func.dfg.make_inst(instdata);
self.func.srclocs[inst] = srcloc;
-for &ty in result_tys {
-self.func.dfg.append_result(inst, ty);
+if arity == 1 {
+self.func.dfg.append_result(inst, result_ty);
+} else {
+for _ in 0..arity {
+self.func.dfg.append_result(inst, crate::ir::types::INVALID);
+}
}
if is_terminator_group_inst {
@@ -371,11 +379,15 @@ impl<'a> Elaborator<'a> {
// the value we are projecting a part of, then
// eventually return here (saving state with a
// PendingProjection).
-if let Node::Result { value, result, .. } = node {
+if let Node::Result {
+value, result, ty, ..
+} = node
+{
trace!(" -> result; pushing arg value {}", value);
self.elab_stack.push(ElabStackEntry::PendingProjection {
index: *result,
canonical,
+ty: *ty,
});
self.elab_stack.push(ElabStackEntry::Start { id: *value });
continue;
@@ -493,7 +505,11 @@ impl<'a> Elaborator<'a> {
// Push onto the elab-results stack.
self.elab_result_stack.push(result)
}
-&ElabStackEntry::PendingProjection { index, canonical } => {
+&ElabStackEntry::PendingProjection {
+ty,
+index,
+canonical,
+} => {
self.elab_stack.pop();
// Grab the input from the elab-result stack.
@@ -511,10 +527,12 @@ impl<'a> Elaborator<'a> {
}
};
let values = values.as_slice(&self.func.dfg.value_lists);
+let value = values[index];
+self.func.dfg.fill_in_value_type(value, ty);
let value = IdValue::Value {
depth,
block,
-value: values[index],
+value,
};
self.id_to_value.insert_if_absent(canonical, value.clone());


@@ -1,9 +1,9 @@
//! Node definition for EGraph representation.
-use super::MemoryState;
-use crate::ir::{Block, DataFlowGraph, Inst, InstructionImms, Opcode, RelSourceLoc, Type};
+use super::PackedMemoryState;
+use crate::ir::{Block, DataFlowGraph, InstructionImms, Opcode, RelSourceLoc, Type};
use crate::loop_analysis::LoopLevel;
-use cranelift_egraph::{BumpArena, BumpSlice, CtxEq, CtxHash, Id, Language, UnionFind};
+use cranelift_egraph::{CtxEq, CtxHash, Id, Language, UnionFind};
use cranelift_entity::{EntityList, ListPool};
use std::hash::{Hash, Hasher};
@@ -31,8 +31,10 @@ pub enum Node {
op: InstructionImms,
/// eclass arguments to the operator.
args: EntityList<Id>,
-/// Types of results.
-types: BumpSlice<Type>,
+/// Type of result, if one.
+ty: Type,
+/// Number of results.
+arity: u16,
},
/// A CLIF instruction that has side-effects or is otherwise not
/// representable by `Pure`.
@@ -41,15 +43,10 @@ pub enum Node {
op: InstructionImms,
/// eclass arguments to the operator.
args: EntityList<Id>,
-/// Types of results.
-types: BumpSlice<Type>,
-/// The index of the original instruction. We include this so
-/// that the `Inst`s are not deduplicated: every instance is a
-/// logically separate and unique side-effect. However,
-/// because we clear the DataFlowGraph before elaboration,
-/// this `Inst` is *not* valid to fetch any details from the
-/// original instruction.
-inst: Inst,
+/// Type of result, if one.
+ty: Type,
+/// Number of results.
+arity: u16,
/// The source location to preserve.
srcloc: RelSourceLoc,
/// The loop level of this Inst.
@@ -83,14 +80,9 @@ pub enum Node {
/// the key).
addr: Id,
/// The abstract memory state that this load accesses.
-mem_state: MemoryState,
+mem_state: PackedMemoryState,
// -- not included in dedup key:
-/// The `Inst` we will use for a trap location for this
-/// load. Excluded from Eq/Hash so that loads that are
-/// identical except for the specific instance will dedup on
-/// top of each other.
-inst: Inst,
/// Source location, for traps. Not included in Eq/Hash.
srcloc: RelSourceLoc,
},
@@ -107,18 +99,14 @@ impl Node {
/// Shared pools for type and id lists in nodes.
pub struct NodeCtx {
-/// Arena for result-type arrays.
-pub types: BumpArena<Type>,
/// Arena for arg eclass-ID lists.
pub args: ListPool<Id>,
}
impl NodeCtx {
pub(crate) fn with_capacity_for_dfg(dfg: &DataFlowGraph) -> Self {
-let n_types = dfg.num_values();
let n_args = dfg.value_lists.capacity();
Self {
-types: BumpArena::arena_with_capacity(n_types),
args: ListPool::with_capacity(n_args),
}
}
@@ -168,26 +156,23 @@ impl CtxEq<Node, Node> for NodeCtx {
&Node::Pure {
ref op,
ref args,
-ref types,
+ty,
+arity: _,
},
&Node::Pure {
op: ref other_op,
args: ref other_args,
-types: ref other_types,
+ty: other_ty,
+arity: _,
},
-) => {
-*op == *other_op
-&& self.ids_eq(args, other_args, uf)
-&& types.as_slice(&self.types) == other_types.as_slice(&self.types)
-}
+) => *op == *other_op && self.ids_eq(args, other_args, uf) && ty == other_ty,
(
-&Node::Inst { inst, ref args, .. },
+&Node::Inst { ref args, .. },
&Node::Inst {
-inst: other_inst,
args: ref other_args,
..
},
-) => inst == other_inst && self.ids_eq(args, other_args, uf),
+) => self.ids_eq(args, other_args, uf),
(
&Node::Load {
ref op,
@@ -249,16 +234,14 @@ impl CtxHash<Node> for NodeCtx {
&Node::Pure {
ref op,
ref args,
-types: _,
+ty,
+arity: _,
} => {
op.hash(&mut state);
self.hash_ids(args, &mut state, uf);
-// Don't hash `types`: it requires an indirection
-// (hence cache misses), and result type *should* be
-// fully determined by op and args.
+ty.hash(&mut state);
}
-&Node::Inst { inst, ref args, .. } => {
-inst.hash(&mut state);
+&Node::Inst { ref args, .. } => {
self.hash_ids(args, &mut state, uf);
}
&Node::Load {
@@ -370,3 +353,14 @@ impl Language for NodeCtx {
}
}
}
+#[cfg(test)]
+mod test {
+#[test]
+#[cfg(target_pointer_width = "64")]
+fn node_size() {
+use super::*;
+assert_eq!(std::mem::size_of::<InstructionImms>(), 16);
+assert_eq!(std::mem::size_of::<Node>(), 32);
+}
+}


@@ -62,7 +62,7 @@ use crate::fx::{FxHashMap, FxHashSet};
use crate::inst_predicates::has_memory_fence_semantics;
use crate::ir::{Block, Function, Inst, InstructionData, MemFlags, Opcode};
use crate::trace;
-use cranelift_entity::SecondaryMap;
+use cranelift_entity::{EntityRef, SecondaryMap};
use smallvec::{smallvec, SmallVec};
/// For a given program point, the vector of last-store instruction
@@ -97,6 +97,32 @@ pub enum MemoryState {
AfterInst(Inst),
}
+/// Memory state index, packed into a u32.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub struct PackedMemoryState(u32);
+impl From<MemoryState> for PackedMemoryState {
+fn from(state: MemoryState) -> Self {
+match state {
+MemoryState::Entry => Self(0),
+MemoryState::Store(i) => Self(1 | (i.index() as u32) << 2),
+MemoryState::BeforeInst(i) => Self(2 | (i.index() as u32) << 2),
+MemoryState::AfterInst(i) => Self(3 | (i.index() as u32) << 2),
+}
+}
+}
+impl PackedMemoryState {
+/// Does this memory state refer to a specific store instruction?
+pub fn as_store(&self) -> Option<Inst> {
+if self.0 & 3 == 1 {
+Some(Inst::from_bits(self.0 >> 2))
+} else {
+None
+}
+}
+}
impl LastStores {
fn update(&mut self, func: &Function, inst: Inst) {
let opcode = func.dfg[inst].opcode();
@@ -148,7 +174,7 @@ impl LastStores {
pub struct AliasAnalysis {
/// Last-store instruction (or none) for a given load. Use a hash map
/// instead of a `SecondaryMap` because this is sparse.
-load_mem_state: FxHashMap<Inst, MemoryState>,
+load_mem_state: FxHashMap<Inst, PackedMemoryState>,
}
impl AliasAnalysis {
@@ -165,7 +191,7 @@ impl AliasAnalysis {
cfg: &ControlFlowGraph,
) -> SecondaryMap<Block, Option<LastStores>> {
let mut block_input = SecondaryMap::with_capacity(func.dfg.num_blocks());
-let mut worklist: SmallVec<[Block; 8]> = smallvec![];
+let mut worklist: SmallVec<[Block; 16]> = smallvec![];
let mut worklist_set = FxHashSet::default();
let entry = func.layout.entry_block().unwrap();
worklist.push(entry);
@@ -210,8 +236,9 @@ impl AliasAnalysis {
fn compute_load_last_stores(
func: &Function,
block_input: SecondaryMap<Block, Option<LastStores>>,
-) -> FxHashMap<Inst, MemoryState> {
+) -> FxHashMap<Inst, PackedMemoryState> {
let mut load_mem_state = FxHashMap::default();
+load_mem_state.reserve(func.dfg.num_insts() / 8);
for block in func.layout.blocks() {
let mut state = block_input[block].clone().unwrap();
@@ -249,7 +276,7 @@ impl AliasAnalysis {
mem_state,
);
-load_mem_state.insert(inst, mem_state);
+load_mem_state.insert(inst, mem_state.into());
}
state.update(func, inst);
@@ -260,7 +287,7 @@ impl AliasAnalysis {
}
/// Get the state seen by a load, if any.
-pub fn get_state_for_load(&self, inst: Inst) -> Option<MemoryState> {
+pub fn get_state_for_load(&self, inst: Inst) -> Option<PackedMemoryState> {
self.load_mem_state.get(&inst).copied()
}
}