From aec53ec3a9070da8d5754c6969481e8442b4f8c8 Mon Sep 17 00:00:00 2001 From: Jakob Stoklund Olesen Date: Fri, 13 Jan 2017 11:42:26 -0800 Subject: [PATCH] Add a liveness analysis. This code is best tested with larger functions with more EBBs. Perhaps a new file-test category is in order? --- lib/cretonne/src/ir/instructions.rs | 8 +- lib/cretonne/src/regalloc/liveness.rs | 294 ++++++++++++++++++++++++++ lib/cretonne/src/regalloc/mod.rs | 1 + lib/cretonne/src/sparse_map.rs | 5 + 4 files changed, 304 insertions(+), 4 deletions(-) create mode 100644 lib/cretonne/src/regalloc/liveness.rs diff --git a/lib/cretonne/src/ir/instructions.rs b/lib/cretonne/src/ir/instructions.rs index 21f9b92c3c..eb0b1e5bd4 100644 --- a/lib/cretonne/src/ir/instructions.rs +++ b/lib/cretonne/src/ir/instructions.rs @@ -412,8 +412,8 @@ pub struct ReturnData { impl InstructionData { /// Execute a closure once for each argument to this instruction. /// See also the `arguments()` method. - pub fn each_arg(&self, func: F) - where F: Fn(Value) + pub fn each_arg(&self, mut func: F) + where F: FnMut(Value) { for part in &self.arguments() { for &arg in part.iter() { @@ -424,8 +424,8 @@ impl InstructionData { /// Execute a closure with a mutable reference to each argument to this instruction. /// See also the `arguments_mut()` method. - pub fn each_arg_mut(&mut self, func: F) - where F: Fn(&mut Value) + pub fn each_arg_mut(&mut self, mut func: F) + where F: FnMut(&mut Value) { for part in &mut self.arguments_mut() { for arg in part.iter_mut() { diff --git a/lib/cretonne/src/regalloc/liveness.rs b/lib/cretonne/src/regalloc/liveness.rs new file mode 100644 index 0000000000..c7646ed701 --- /dev/null +++ b/lib/cretonne/src/regalloc/liveness.rs @@ -0,0 +1,294 @@ +//! Liveness analysis for SSA values. +//! +//! This module computes the live range of all the SSA values in a function and produces a +//! `LiveRange` instance for each. +//! +//! +//! # Liveness consumers +//! +//! 
The primary consumer of the liveness analysis is the SSA coloring pass which goes through each +//! EBB and assigns a register to the defined values. This algorithm needs to maintain a set of the +//! currently live values as it is iterating down the instructions in the EBB. It asks the following +//! questions: +//! +//! - What is the set of live values at the entry to the EBB? +//! - When moving past a use of a value, is that value still alive in the EBB, or was that the last +//! use? +//! - When moving past a branch, which of the live values are still live below the branch? +//! +//! The set of `LiveRange` instances can answer these questions through their `def_local_end` and +//! `livein_local_end` queries. The coloring algorithm visits EBBs in a topological order of the +//! dominator tree, so it can compute the set of live values at the beginning of an EBB by starting +//! from the set of live values at the dominating branch instruction and filtering it with +//! `livein_local_end`. These sets do not need to be stored in the liveness analysis. +//! +//! The secondary consumer of the liveness analysis is the spilling pass which needs to count the +//! number of live values at every program point and insert spill code until the number of +//! registers needed is small enough. +//! +//! +//! # Alternative algorithms +//! +//! A number of different liveness analysis algorithms exist, so it is worthwhile to look at a few +//! alternatives. +//! +//! ## Dataflow equations +//! +//! The classic *live variables analysis* that you will find in all compiler books from the +//! previous century does not depend on SSA form. It is typically implemented by iteratively +//! solving dataflow equations on bitvectors of variables. The result is a live-out bitvector of +//! variables for every basic block in the program. +//! +//! This algorithm has some disadvantages that make us look elsewhere: +//! +//! - Quadratic memory use.
We need a bit per variable per basic block in the function. +//! - Sparse representation. In practice, the majority of SSA values never leave their basic block, +//! and those that do span basic blocks rarely span a large number of basic blocks. This makes +//! the bitvectors quite sparse. +//! - Traditionally, the dataflow equations were solved for real program *variables* which do not +//! include temporaries used in evaluating expressions. We have an SSA form program which blurs +//! the distinction between temporaries and variables. This makes the quadratic memory problem +//! worse because there are many more SSA values than there were variables in the original +//! program, and we don't know a priori which SSA values leave their basic block. +//! - Missing last-use information. For values that are not live-out of a basic block, we would +//! need to store information about the last use in the block somewhere. LLVM stores this +//! information as a 'kill bit' on the last use in the IR. Maintaining these kill bits has been a +//! source of problems for LLVM's register allocator. +//! +//! Dataflow equations can detect when a variable is used uninitialized, and they can handle +//! multiple definitions of the same variable. We don't need this generality since we already have +//! a program in SSA form. +//! +//! ## LLVM's liveness analysis +//! +//! LLVM's register allocator computes liveness per *virtual register*, where a virtual register is +//! a disjoint union of related SSA values that should be assigned to the same physical register. +//! It uses a compact data structure very similar to our `LiveRange`. The important difference is +//! that Cretonne's `LiveRange` only describes a single SSA value, while LLVM's `LiveInterval` +//! describes the live range of a virtual register *and* which one of the related SSA values is +//! live at any given program point. +//! +//!
LLVM computes the live range of each virtual register independently by using the use-def chains +//! that are baked into its IR. The algorithm for a single virtual register is: +//! +//! 1. Initialize the live range with a single-instruction snippet of liveness at each def, using +//! the def-chain. This does not include any phi-values. +//! 2. Go through the virtual register's use chain and perform the following steps at each use: +//! 3. Perform an exhaustive depth-first traversal up the CFG from the use. Look for basic blocks +//! that already contain some liveness and extend the last live SSA value in the block to be +//! live-out. Also build a list of new basic blocks where the register needs to be live-in. +//! 4. Iteratively propagate live-out SSA values to the new live-in blocks. This may require new +//! PHI values to be created when different SSA values can reach the same block. +//! +//! The iterative SSA form reconstruction can be skipped if the depth-first search only encountered +//! one SSA value. +//! +//! This algorithm has some advantages compared to the dataflow equations: +//! +//! - The live ranges of local virtual registers are computed very quickly without ever traversing +//! the CFG. The memory needed to store these live ranges is independent of the number of basic +//! blocks in the program. +//! - The time to compute the live range of a global virtual register is proportional to the number +//! of basic blocks covered. Many virtual registers only cover a few blocks, even in very large +//! functions. +//! - A single live range can be recomputed after making modifications to the IR. No global +//! algorithm is necessary. This feature depends on having use-def chains for virtual registers +//! which Cretonne doesn't have. +//! +//! Cretonne uses very similar data structures and algorithms to LLVM, with the important +//! difference that live ranges are computed per SSA value instead of per virtual register, and the +//!
uses in Cretonne IR refer to SSA values instead of virtual registers. This means that Cretonne +//! can skip the last step of reconstructing SSA form for the virtual register uses. +//! +//! ## Fast Liveness Checking for SSA-Form Programs +//! +//! A liveness analysis that is often brought up in the context of SSA-based register allocation +//! was presented at CGO 2008: +//! +//! > Boissinot, B., Hack, S., Grund, D., de Dinechin, B. D., & Rastello, F. (2008). *Fast Liveness +//! Checking for SSA-Form Programs.* CGO. +//! +//! This analysis uses a global precomputation that only depends on the CFG of the function. It +//! then allows liveness queries for any (value, program point) pair. Each query traverses the use +//! chain of the value and performs lookups in the precomputed bitvectors. +//! +//! I did not seriously consider this analysis for Cretonne because: +//! +//! - It depends critically on use chains which Cretonne doesn't have. +//! - Popular variables like the `this` pointer in a C++ method can have very large use chains. +//! Traversing such a long use chain on every liveness lookup has the potential for some nasty +//! quadratic behavior in unfortunate cases. +//! - It says "fast" in the title, but the paper only claims to be 16% faster than a dataflow based +//! approach, which isn't that impressive. +//! +//! Nevertheless, the property of only depending on the CFG structure is very useful. If Cretonne +//! gains use chains, this approach would be worth a proper evaluation. +//! +//! +//! # Cretonne's liveness analysis +//! +//! The algorithm implemented in this module is similar to LLVM's with these differences: +//! +//! - The `LiveRange` data structure describes the liveness of a single SSA value, not a virtual +//! register. +//! - Instructions in Cretonne IR contain references to SSA values, not virtual registers. +//! - All live ranges are computed in one traversal of the program. Cretonne doesn't have use +//!
chains, so it is not possible to compute the live range for a single SSA value independently. +//! +//! The liveness computation visits all instructions in the program. The order is not important for +//! the algorithm to be correct. At each instruction, the used values are examined. +//! +//! - The first time a value is encountered, its live range is constructed as a dead live range +//! containing only the defining program point. +//! - The local interval of the value's live range is extended so it reaches the use. This may +//! require creating a new live-in local interval for the EBB. +//! - If the live range became live-in to the EBB, add the EBB to a work-list. +//! - While the work-list is non-empty, pop a live-in EBB and repeat the two steps above, using each +//! of the live-in EBB's CFG predecessor instructions as a 'use'. +//! +//! The effect of this algorithm is to extend the live range of each value to reach uses as they are +//! visited. No data about each value beyond the live range is needed between visiting uses, so +//! nothing is lost by computing the live range of all values simultaneously. +//! +//! ## Cache efficiency of Cretonne vs LLVM +//! +//! Since LLVM computes the complete live range of a virtual register in one go, it can keep the +//! whole `LiveInterval` for the register in L1 cache. Since it is visiting the instructions in use +//! chain order, some cache thrashing can occur as a result of pulling instructions into cache +//! somewhat chaotically. +//! +//! Cretonne uses a transposed algorithm, visiting instructions in order. This means that each +//! instruction is brought into cache only once, and it is likely that the other instructions on +//! the same cache line will be visited before the line is evicted. +//! +//! Cretonne's problem is that the `LiveRange` structs are visited many times and not always +//! regularly. We should strive to make the `LiveRange` struct as small as possible such that +//!
multiple related values can live on the same cache line. +//! +//! - Local values should fit in a 16-byte `LiveRange` struct or smaller. The current +//! implementation contains a 24-byte `Vec` object and a redundant `value` member pushing the +//! size to 32 bytes. +//! - Related values should be stored on the same cache line. The current sparse set implementation +//! does a decent job of that. +//! - For global values, the list of live-in intervals is very likely to fit on a single cache +//! line. These lists are very likely to be found in L2 cache at least. +//! +//! There is some room for improvement. + +use regalloc::liverange::LiveRange; +use ir::{Function, Value, Inst, Ebb, ProgramPoint}; +use ir::dfg::{DataFlowGraph, ValueDef}; +use cfg::ControlFlowGraph; +use sparse_map::SparseMap; + +/// A set of live ranges, indexed by value number. +struct LiveRangeSet(SparseMap<Value, LiveRange>); + +impl LiveRangeSet { + pub fn new() -> LiveRangeSet { + LiveRangeSet(SparseMap::new()) + } + + pub fn clear(&mut self) { + self.0.clear(); + } + + /// Get a mutable reference to the live range for `value`. + /// Create it if necessary. + pub fn get_or_create(&mut self, value: Value, dfg: &DataFlowGraph) -> &mut LiveRange { + // It would be better to use `get_mut()` here, but that leads to borrow checker fighting + // which can probably only be resolved by non-lexical lifetimes. + // https://github.com/rust-lang/rfcs/issues/811 + if self.0.get(value).is_none() { + // Create a live range for value. We need the program point that defines it. + let def: ProgramPoint = match dfg.value_def(value) { + ValueDef::Res(inst, _) => inst.into(), + ValueDef::Arg(ebb, _) => ebb.into(), + }; + self.0.insert(LiveRange::new(value, def)); + } + self.0.get_mut(value).unwrap() + } +} + +/// Liveness analysis for a function. +/// +/// Compute a live range for every SSA value used in the function. +pub struct Liveness { + /// The live ranges that have been computed so far.
+ ranges: LiveRangeSet, + + /// Working space for the `extend_to_use` algorithm. + /// This vector is always empty, except for inside that function. + /// It lives here to avoid repeated allocation of scratch memory. + worklist: Vec<Ebb>, +} + +impl Liveness { + /// Create a new empty liveness analysis. + /// + /// The memory allocated for this analysis can be reused for multiple functions. Use the + /// `compute` method to actually run the analysis for a function. + pub fn new() -> Liveness { + Liveness { + ranges: LiveRangeSet::new(), + worklist: Vec::new(), + } + } + + /// Compute the live ranges of all SSA values used in `func`. + /// This clears out any existing analysis stored in this data structure. + pub fn compute(&mut self, func: &Function, cfg: &ControlFlowGraph) { + self.ranges.clear(); + + // The liveness computation needs to visit all uses, but the order doesn't matter. + // TODO: Perhaps this traversal of the function could be combined with a dead code + // elimination pass if we visit a post-order of the dominator tree? + // TODO: Resolve value aliases while we're visiting instructions? + for ebb in func.layout.ebbs() { + for inst in func.layout.ebb_insts(ebb) { + func.dfg[inst].each_arg(|arg| self.extend_to_use(arg, ebb, inst, func, cfg)); + } + } + } + + /// Extend the live range for `value` so it reaches `to` which must live in `ebb`. + fn extend_to_use(&mut self, + value: Value, + ebb: Ebb, + to: Inst, + func: &Function, + cfg: &ControlFlowGraph) { + // Get the live range, create it as a dead range if necessary. + let lr = self.ranges.get_or_create(value, &func.dfg); + + // This is our scratch working space, and we'll leave it empty when we return. + assert!(self.worklist.is_empty()); + + // Extend the range locally in `ebb`. + // If there already was a live interval in that block, we're done.
+ if lr.extend_in_ebb(ebb, to, &func.layout) { + self.worklist.push(ebb); + } + + // The worklist contains those EBBs where we have learned that the value needs to be + // live-in. + // + // This algorithm becomes a depth-first traversal up the CFG, enumerating all paths through + // the CFG from the existing live range to `ebb`. + // + // Extend the live range as we go. The live range itself also serves as a visited set since + // `extend_in_ebb` will never return true twice for the same EBB. + // + while let Some(livein) = self.worklist.pop() { + // We've learned that the value needs to be live-in to the `livein` EBB. + // Make sure it is also live at all predecessor branches to `livein`. + for &(pred, branch) in cfg.get_predecessors(livein) { + if lr.extend_in_ebb(pred, branch, &func.layout) { + // This predecessor EBB also became live-in. We need to process it later. + self.worklist.push(pred); + } + } + } + } +} diff --git a/lib/cretonne/src/regalloc/mod.rs b/lib/cretonne/src/regalloc/mod.rs index 0d6c9c9574..dff29bc5bc 100644 --- a/lib/cretonne/src/regalloc/mod.rs +++ b/lib/cretonne/src/regalloc/mod.rs @@ -3,3 +3,4 @@ //! This module contains data structures and algorithms used for register allocation. pub mod liverange; +pub mod liveness; diff --git a/lib/cretonne/src/sparse_map.rs b/lib/cretonne/src/sparse_map.rs index 9f0d93a55a..48844c026a 100644 --- a/lib/cretonne/src/sparse_map.rs +++ b/lib/cretonne/src/sparse_map.rs @@ -81,6 +81,11 @@ impl SparseMap self.dense.is_empty() } + /// Remove all elements from the mapping. + pub fn clear(&mut self) { + self.dense.clear(); + } + /// Returns a reference to the value corresponding to the key. pub fn get(&self, key: K) -> Option<&V> { if let Some(idx) = self.sparse.get(key).cloned() {