wasmtime/cranelift/codegen/src/redundant_reload_remover.rs

//! This module implements a late-stage redundant-reload remover, which runs after registers have
//! been allocated and stack slots have been given specific offsets.

use crate::cursor::{Cursor, CursorPosition, EncCursor, FuncCursor};
use crate::entity::EntitySet;
use crate::flowgraph::ControlFlowGraph;
use crate::ir::dfg::DataFlowGraph;
use crate::ir::instructions::BranchInfo;
use crate::ir::stackslot::{StackSlotKind, StackSlots};
use crate::ir::{
    Block, Function, Inst, InstBuilder, InstructionData, Opcode, StackSlotData, Type, Value,
    ValueLoc,
};
use crate::isa::{RegInfo, RegUnit, TargetIsa};
use crate::regalloc::RegDiversions;
use alloc::vec::Vec;
use core::convert::TryInto;
use cranelift_entity::{PrimaryMap, SecondaryMap};

// =============================================================================================
// A description of the redundant-fill-removal algorithm
//
//
// The algorithm works forwards through each Block.  It carries along and updates a table,
// AvailEnv, with which it tracks registers that are known to have the same value as some stack
// slot.  The actions on encountering an instruction depend on the instruction, as follows:
//
// ss1 = spill r0: update the AvailEnv so as to note that slot `ss1` and register `r0`
//                 have the same value.
//
// r1 = fill ss0: look in the AvailEnv.  If it tells us that register `r1` and slot `ss0`
//                have the same value, then delete the instruction by converting it to a
//                `fill_nop`.
//
//                If it tells us that some other register `r2` has the same value as
//                slot `ss0`, convert the instruction into a copy from `r2` to `r1`.
//
// any other insn: remove from the AvailEnv, any bindings associated with registers
//                 written by this instruction, since they will be invalidated by it.
//
// Tracking the effects of `copy` instructions in AvailEnv for the case when both source and
// destination are registers does not cause any more fills to be removed or converted to copies.
// It's not clear why.
//
// There are various other instruction-handling cases in `visit_inst`, which are documented
// in-line, and do not change the core algorithm, so are not described here.
//
// The registers tracked by AvailEnv are the post-diversion registers that are really used by the
// code; they are not the pre-diversion names associated with each SSA `Value`.  The second
// `fill` case above opportunistically copies values from registers that may have been diversion
// targets in some predecessor block, and so are no longer associated with any specific SSA-level
// name at the point the copy is made.  Hence those copies (from `r2` to `r1`) cannot be done
// with an ordinary `copy` instruction.  Instead they have to be done using a new `copy_to_ssa`
// instruction, which copies from an arbitrary register to a register-resident `Value` (that is,
// "back to" SSA-world).
//
// That completes the description of the core algorithm.
//
// In the case where a block `A` jumps to `B` and `A` is the only predecessor of `B`, the
// AvailEnv at the end of `A` will still be valid at the entry to `B`.  In such a case, we can
// profitably transform `B` using the AvailEnv "inherited" from `A`.  In order to take full
// advantage of this, this module partitions the function's CFG into tree-shaped groups of
// blocks, and processes each tree as described above.  So the AvailEnv is only initialised to
// empty at the start of blocks that form the root of each tree; that is, for blocks which have
// two or more predecessors.

// =============================================================================================
// Top level algorithm structure
//
// The overall algorithm, for a function, starts like this:
//
// * (once per function): finds Blocks that have two or more predecessors, since they will be the
//   roots of Block trees.  Also, the entry node for the function is considered to be a root.
//
// It then continues with a loop that first finds a tree of Blocks ("discovery") and then removes
// redundant fills as described above ("processing"):
//
// * (discovery; once per tree): for each root, performs a depth first search to find all the Blocks
//   in the tree, guided by RedundantReloadRemover::discovery_stack.
//
// * (processing; once per tree): the just-discovered tree is then processed as described above,
//   guided by RedundantReloadRemover::processing_stack.
//
// In this way, all Blocks reachable from the function's entry point are eventually processed.  Note
// that each tree is processed as soon as it has been discovered, so the algorithm never creates a
// list of trees for the function.
//
// The running state is stored in `RedundantReloadRemover`.  This is allocated once and can be
// reused for multiple functions so as to minimise heap turnover.  The fields are, roughly:
//
//   num_regunits -- constant for the whole function; used by the tree processing phase
//   num_preds_per_block -- constant for the whole function; used by the tree discovery process
//
//   discovery_stack -- used to guide the tree discovery process
//   nodes_in_tree -- the discovered nodes are recorded here
//
//   processing_stack -- used to guide the tree processing process
//   nodes_already_visited -- used to ensure the tree processing logic terminates in the case
//                            where a tree has a branch back to its root node.
//
// There is further documentation in line below, as appropriate.

// =============================================================================================
// A side note on register choice heuristics

// The core algorithm opportunistically replaces fill instructions when it knows of a register
// that already holds the required value.  How effective this is largely depends on how long
// reloaded values happen to stay alive before the relevant register is overwritten.  And that
// depends on the register allocator's register choice heuristics.  The worst case is, when the
// register allocator reuses registers as soon as possible after they become free.  Unfortunately
// that was indeed the selection scheme, prior to development of this pass.
//
// As part of this work, the register selection scheme has been changed as follows: for registers
// written by any instruction other than a fill, use the lowest numbered available register.  But
// for registers written by a fill instruction, use the highest numbered available register.  The
// aim is to try and keep reload- and non-reload registers disjoint to the extent possible.
// Several other schemes were tried, but this one is simple and can be worth an extra 2% of
// performance in some cases.
//
// The relevant change is more or less a one-line change in the solver.

// =============================================================================================
// Data structures used for discovery of trees

// `ZeroOneOrMany` is used to record the number of predecessors a Block block has.  The `Zero` case
// is included so as to cleanly handle the case where the incoming graph has unreachable Blocks.

#[derive(Clone, PartialEq)]
enum ZeroOneOrMany {
    Zero,
    One,
    Many,
}

// =============================================================================================
// Data structures used for processing of trees

// `SlotInfo` describes a spill slot in the obvious way.  Note that it doesn't indicate which
// register(s) are currently associated with the slot.  That job is done by `AvailEnv` instead.
//
// In the CL framework, stack slots are partitioned into disjoint sets, one for each
// `StackSlotKind`.  The offset and size only give a unique identity within any particular
// `StackSlotKind`.  So, to uniquely identify a stack slot, all three fields are necessary.

#[derive(Clone, Copy)]
struct SlotInfo {
    kind: StackSlotKind,
    offset: i32,
    size: u32,
}

// `AvailEnv` maps each possible register to a stack slot that holds the same value.  The index
// space of `AvailEnv::map` is exactly the set of registers available on the current target.  If
// (as is mostly the case) a register is not known to have the same value as a stack slot, then
// its entry is `None` rather than `Some(..)`.
//
// Invariants for AvailEnv:
//
// AvailEnv may have multiple different registers bound to the same stack slot -- that is, `(kind,
// offset, size)` triple.  That's OK, and reflects the reality that those two registers contain
// the same value.  This could happen, for example, in the case
//
//   ss1 = spill r0
//   ..
//   r2 = fill ss1
//
// Then both `r0` and `r2` will have the same value as `ss1`, provided that ".." doesn't write to
// `r1`.
//
// To say that two different registers may be bound to the same stack slot is the same as saying
// that it is allowed to have two different entries in AvailEnv with the same `(kind, offset,
// size)` triple.  What is *not* allowed is to have partial overlaps.  That is, if two SlotInfos
// have the same `kind` field and have `offset` and `size` fields that overlap, then their
// `offset` and `size` fields must be identical.  This is so as to make the algorithm safe against
// situations where, for example, a 64 bit register is spilled, but then only the bottom 32 bits
// are reloaded from the slot.
//
// Although in such a case it seems likely that the Cranelift IR would be ill-typed, and so this
// case could probably not occur in practice.

#[derive(Clone)]
struct AvailEnv {
    map: Vec<Option<SlotInfo>>,
}

// `ProcessingStackElem` combines AvailEnv with contextual information needed to "navigate" within
// a Block.
//
// A ProcessingStackElem conceptually has the lifetime of exactly one Block: once the current Block is
// completed, the ProcessingStackElem will be abandoned.  In practice the top level state,
// RedundantReloadRemover, caches them, so as to avoid heap turnover.
//
// Note that ProcessingStackElem must contain a CursorPosition.  The CursorPosition, which
// indicates where we are in the current Block, cannot be implicitly maintained by looping over all
// the instructions in a Block in turn, because we may choose to suspend processing the current Block
// at a side exit, continue by processing the subtree reached via the side exit, and only later
// resume the current Block.

struct ProcessingStackElem {
    /// Indicates the AvailEnv at the current point in the Block.
    avail_env: AvailEnv,

    /// Shows where we currently are inside the Block.
    cursor: CursorPosition,

    /// Indicates the currently active register diversions at the current point.
    diversions: RegDiversions,
}

// =============================================================================================
// The top level data structure

// `RedundantReloadRemover` contains data structures for the two passes: discovery of tree shaped
// regions, and processing of them.  These are allocated once and stay alive for the entire
// function, even though they are cleared out for each new tree shaped region.  It also caches
// `num_regunits` and `num_preds_per_block`, which are computed at the start of each function and
// then remain constant.

/// The redundant reload remover's state.
pub struct RedundantReloadRemover {
    /// The total number of RegUnits available on this architecture.  This is unknown when the
    /// RedundantReloadRemover is created.  It becomes known at the beginning of processing of a
    /// function.
    num_regunits: Option<u16>,

    /// This stores, for each Block, a characterisation of the number of predecessors it has.
    num_preds_per_block: PrimaryMap<Block, ZeroOneOrMany>,

    /// The stack used for the first phase (discovery).  There is one element on the discovery
    /// stack for each currently unexplored Block in the tree being searched.
    discovery_stack: Vec<Block>,

    /// The nodes in the discovered tree are inserted here.
    nodes_in_tree: EntitySet<Block>,

    /// The stack used during the second phase (transformation).  There is one element on the
    /// processing stack for each currently-open node in the tree being transformed.
    processing_stack: Vec<ProcessingStackElem>,

    /// Used in the second phase to avoid visiting nodes more than once.
    nodes_already_visited: EntitySet<Block>,
}

// =============================================================================================
// Miscellaneous small helper functions

// Is this a kind of stack slot that is safe to track in AvailEnv?  This is probably overly
// conservative, but tracking only the SpillSlot and IncomingArgument kinds catches almost all
// available redundancy in practice.
fn is_slot_kind_tracked(kind: StackSlotKind) -> bool {
    match kind {
        StackSlotKind::SpillSlot | StackSlotKind::IncomingArg => true,
        _ => false,
    }
}

// Find out if the range `[offset, +size)` overlaps with the range in `si`.
fn overlaps(si: &SlotInfo, offset: i32, size: u32) -> bool {
    let a_offset = si.offset as i64;
    let a_size = si.size as i64;
    let b_offset = offset as i64;
    let b_size = size as i64;
    let no_overlap = a_offset + a_size <= b_offset || b_offset + b_size <= a_offset;
    !no_overlap
}

// Find, in `reginfo`, the register bank that `reg` lives in, and return the lower limit and size
// of the bank.  This is so the caller can conveniently iterate over all RegUnits in the bank that
// `reg` lives in.
fn find_bank_limits(reginfo: &RegInfo, reg: RegUnit) -> (RegUnit, u16) {
    if let Some(bank) = reginfo.bank_containing_regunit(reg) {
        return (bank.first_unit, bank.units);
    }
    // We should never get here, since `reg` must come from *some* RegBank.
    panic!("find_regclass_limits: reg not found");
}

// Returns the register that `v` is allocated to.  Assumes that `v` actually resides in a
// register.
fn reg_of_value(locations: &SecondaryMap<Value, ValueLoc>, v: Value) -> RegUnit {
    match locations[v] {
        ValueLoc::Reg(ru) => ru,
        _ => panic!("reg_of_value: value isn't in a reg"),
    }
}

// Returns the stack slot that `v` is allocated to.  Assumes that `v` actually resides in a stack
// slot.
fn slot_of_value<'s>(
    locations: &SecondaryMap<Value, ValueLoc>,
    stack_slots: &'s StackSlots,
    v: Value,
) -> &'s StackSlotData {
    match locations[v] {
        ValueLoc::Stack(slot) => &stack_slots[slot],
        _ => panic!("slot_of_value: value isn't in a stack slot"),
    }
}

// =============================================================================================
// Top level: discovery of tree shaped regions

impl RedundantReloadRemover {
    // A helper for `add_nodes_to_tree` below.
    fn discovery_stack_push_successors_of(&mut self, cfg: &ControlFlowGraph, node: Block) {
        for successor in cfg.succ_iter(node) {
            self.discovery_stack.push(successor);
        }
    }

    // Visit the tree of Blocks rooted at `starting_point` and add them to `self.nodes_in_tree`.
    // `self.num_preds_per_block` guides the process, ensuring we don't leave the tree-ish region
    // and indirectly ensuring that the process will terminate in the presence of cycles in the
    // graph.  `self.discovery_stack` holds the search state in this function.
    fn add_nodes_to_tree(&mut self, cfg: &ControlFlowGraph, starting_point: Block) {
        // One might well ask why this doesn't loop forever when it encounters cycles in the
        // control flow graph.  The reason is that any cycle in the graph that is reachable from
        // anywhere outside the cycle -- in particular, that is reachable from the function's
        // entry node -- must have at least one node that has two or more predecessors.  So the
        // logic below won't follow into it, because it regards any such node as the root of some
        // other tree.
        debug_assert!(self.discovery_stack.is_empty());
        debug_assert!(self.nodes_in_tree.is_empty());

        self.nodes_in_tree.insert(starting_point);
        self.discovery_stack_push_successors_of(cfg, starting_point);

        while let Some(node) = self.discovery_stack.pop() {
            match self.num_preds_per_block[node] {
                // We arrived at a node with multiple predecessors, so it's a new root.  Ignore it.
                ZeroOneOrMany::Many => {}
                // This node has just one predecessor, so we should incorporate it in the tree and
                // immediately transition into searching from it instead.
                ZeroOneOrMany::One => {
                    self.nodes_in_tree.insert(node);
                    self.discovery_stack_push_successors_of(cfg, node);
                }
                // This is meaningless.  We arrived at a node that doesn't point back at where we
                // came from.
                ZeroOneOrMany::Zero => panic!("add_nodes_to_tree: inconsistent graph"),
            }
        }
    }
}

// =============================================================================================
// Operations relating to `AvailEnv`

impl AvailEnv {
    // Create a new one.
    fn new(size: usize) -> Self {
        let mut env = Self {
            map: Vec::<Option<SlotInfo>>::new(),
        };
        env.map.resize(size, None);
        env
    }

    // Debug only: checks (some of) the required AvailEnv invariants.
    #[cfg(debug_assertions)]
    fn check_invariants(&self) -> bool {
        // Check that any overlapping entries overlap exactly.  This is super lame (quadratic),
        // but it's only used in debug builds.
        for i in 0..self.map.len() {
            if let Some(si) = self.map[i] {
                for j in i + 1..self.map.len() {
                    if let Some(sj) = self.map[j] {
                        // "si and sj overlap, but not exactly"
                        if si.kind == sj.kind
                            && overlaps(&si, sj.offset, sj.size)
                            && !(si.offset == sj.offset && si.size == sj.size)
                        {
                            return false;
                        }
                    }
                }
            }
        }
        true
    }

    // Invalidates the binding associated with `reg`.  Note that by construction of AvailEnv,
    // `reg` can only be associated with one binding at once.
    fn invalidate_by_reg(&mut self, reg: RegUnit) {
        self.map[reg as usize] = None;
    }

    // Invalidates any binding that has any overlap with `(kind, offset, size)`.
    fn invalidate_by_offset(&mut self, kind: StackSlotKind, offset: i32, size: u32) {
        debug_assert!(is_slot_kind_tracked(kind));
        for i in 0..self.map.len() {
            if let Some(si) = &self.map[i] {
                if si.kind == kind && overlaps(&si, offset, size) {
                    self.map[i] = None;
                }
            }
        }
    }

    // Invalidates all bindings.
    fn invalidate_all(&mut self) {
        for i in 0..self.map.len() {
            self.map[i] = None;
        }
    }

    // Updates AvailEnv to track the effect of a `regmove` instruction.
    fn copy_reg(&mut self, src: RegUnit, dst: RegUnit) {
        self.map[dst as usize] = self.map[src as usize];
    }

    // Does `env` have the exact binding characterised by `(reg, kind, offset, size)` ?
    fn has_exact_binding(&self, reg: RegUnit, kind: StackSlotKind, offset: i32, size: u32) -> bool {
        debug_assert!(is_slot_kind_tracked(kind));
        if let Some(si) = &self.map[reg as usize] {
            return si.kind == kind && si.offset == offset && si.size == size;
        }
        // No such binding.
        false
    }

    // Does `env` have a binding characterised by `(kind, offset, size)` but to a register, let's
    // call it `other_reg`, that isn't `reg`?  If so, return `other_reg`.  Note that `other_reg`
    // will have the same bank as `reg`.  It is a checked error to call this function with a
    // binding matching all four of `(reg, kind, offset, size)`.
    fn has_inexact_binding(
        &self,
        reginfo: &RegInfo,
        reg: RegUnit,
        kind: StackSlotKind,
        offset: i32,
        size: u32,
    ) -> Option<RegUnit> {
        debug_assert!(is_slot_kind_tracked(kind));
        // Find the range of RegUnit numbers for the bank that contains `reg`, and use that as our
        // search space.  This is so as to guarantee that any match is restricted to the same bank
        // as `reg`.
        let (first_unit, num_units) = find_bank_limits(reginfo, reg);
        for other_reg in first_unit..first_unit + num_units {
            if let Some(si) = &self.map[other_reg as usize] {
                if si.kind == kind && si.offset == offset && si.size == size {
                    if other_reg == reg {
                        panic!("has_inexact_binding: binding *is* exact!");
                    }
                    return Some(other_reg);
                }
            }
        }
        // No such binding.
        None
    }

    // Create the binding `(reg, kind, offset, size)` in `env`, and throw away any previous
    // binding associated with either `reg` or the `(kind, offset, size)` triple.
    fn bind(&mut self, reg: RegUnit, kind: StackSlotKind, offset: i32, size: u32) {
        debug_assert!(is_slot_kind_tracked(kind));
        self.invalidate_by_offset(kind, offset, size);
        self.map[reg as usize] = Some(SlotInfo { kind, offset, size });
    }
}

// Invalidates in `avail_env`, any binding associated with a regunit that is written by `inst`.
fn invalidate_regs_written_by_inst(
    locations: &SecondaryMap<Value, ValueLoc>,
    diversions: &RegDiversions,
    dfg: &DataFlowGraph,
    avail_env: &mut AvailEnv,
    inst: Inst,
) {
    for v in dfg.inst_results(inst).iter() {
        if let ValueLoc::Reg(ru) = locations[*v] {
            // This must be true.  It would be meaningless for an SSA value to be diverted before
            // the point where it is defined.
            debug_assert!(diversions.reg(*v, locations) == ru);
            avail_env.invalidate_by_reg(ru);
        }
    }
}

// =============================================================================================
// Processing of individual instructions

impl RedundantReloadRemover {
    // Process `inst`, possibly changing it into a different instruction, and possibly changing
    // `self.avail_env` and `func.dfg`.
    fn visit_inst(
        &mut self,
        func: &mut Function,
        reginfo: &RegInfo,
        isa: &dyn TargetIsa,
        inst: Inst,
    ) {
        // Get hold of the top-of-stack work item.  This is the state that we will mutate during
        // processing of this instruction.
        debug_assert!(!self.processing_stack.is_empty());
        let ProcessingStackElem {
            avail_env,
            diversions,
            ..
        } = self.processing_stack.last_mut().unwrap();

        #[cfg(debug_assertions)]
        debug_assert!(
            avail_env.check_invariants(),
            "visit_inst: env invariants not ok"
        );

        let dfg = &mut func.dfg;
        let locations = &func.locations;
        let stack_slots = &func.stack_slots;

        // To avoid difficulties with the borrow checker, do this in two stages.  First, examine
        // the instruction to see if it can be deleted or modified, and park the relevant
        // information in `transform`.  Update `self.avail_env` too.  Later, use `transform` to
        // actually do the transformation if necessary.
        enum Transform {
            NoChange,
            ChangeToNopFill(Value),           // delete this insn entirely
            ChangeToCopyToSSA(Type, RegUnit), // change it into a copy from the specified reg
        }
        let mut transform = Transform::NoChange;

        // In this match { .. } statement, either we must treat the instruction specially, or we
        // must call `invalidate_regs_written_by_inst` on it.
        match &dfg[inst] {
            InstructionData::Unary {
                opcode: Opcode::Spill,
                arg: src_value,
            } => {
                // Extract: (src_reg, kind, offset, size)
                // Invalidate: (kind, offset, size)
                // Add new binding: {src_reg -> (kind, offset, size)}
                // Don't forget that src_value might be diverted, so we have to deref it.
                let slot = slot_of_value(locations, stack_slots, dfg.inst_results(inst)[0]);
                let src_reg = diversions.reg(*src_value, locations);
                let kind = slot.kind;
                if is_slot_kind_tracked(kind) {
                    let offset = slot.offset.expect("visit_inst: spill with no offset");
                    let size = slot.size;
                    avail_env.bind(src_reg, kind, offset, size);
                } else {
                    // We don't expect this insn to write any regs.  But to be consistent with the
                    // rule above, do this anyway.
                    invalidate_regs_written_by_inst(locations, diversions, dfg, avail_env, inst);
                }
            }
            InstructionData::Unary {
                opcode: Opcode::Fill,
                arg: src_value,
            } => {
                // Extract: (dst_reg, kind, offset, size)
                // Invalidate: (kind, offset, size)
                // Add new: {dst_reg -> (dst_value, kind, offset, size)}
                let slot = slot_of_value(locations, stack_slots, *src_value);
                let dst_value = dfg.inst_results(inst)[0];
                let dst_reg = reg_of_value(locations, dst_value);
                // This must be true.  It would be meaningless for an SSA value to be diverted
                // before it was defined.
                debug_assert!(dst_reg == diversions.reg(dst_value, locations));
                let kind = slot.kind;
                if is_slot_kind_tracked(kind) {
                    let offset = slot.offset.expect("visit_inst: fill with no offset");
                    let size = slot.size;
                    if avail_env.has_exact_binding(dst_reg, kind, offset, size) {
                        // This instruction is an exact copy of a fill we saw earlier, and the
                        // loaded value is still valid.  So we'll schedule this instruction for
                        // deletion (below).  No need to make any changes to `avail_env`.
                        transform = Transform::ChangeToNopFill(*src_value);
                    } else if let Some(other_reg) =
                        avail_env.has_inexact_binding(reginfo, dst_reg, kind, offset, size)
                    {
                        // This fill is from the required slot, but into a different register
                        // `other_reg`.  So replace it with a copy from `other_reg` to `dst_reg`
                        // and update `dst_reg`s binding to make it the same as `other_reg`'s, so
                        // as to maximise the chances of future matches after this instruction.
                        debug_assert!(other_reg != dst_reg);
                        transform =
                            Transform::ChangeToCopyToSSA(dfg.value_type(dst_value), other_reg);
                        avail_env.copy_reg(other_reg, dst_reg);
                    } else {
                        // This fill creates some new binding we don't know about.  Update
                        // `avail_env` to track it.
                        avail_env.bind(dst_reg, kind, offset, size);
                    }
                } else {
                    // Else it's "just another instruction that writes a reg", so we'd better
                    // treat it as such, just as we do below for instructions that we don't handle
                    // specially.
                    invalidate_regs_written_by_inst(locations, diversions, dfg, avail_env, inst);
                }
            }
            InstructionData::RegMove { src, dst, .. } => {
                // These happen relatively rarely, but just frequently enough that it's worth
                // tracking the copy (at the machine level, it's really a copy) in `avail_env`.
                avail_env.copy_reg(*src, *dst);
            }
            InstructionData::RegSpill { .. }
            | InstructionData::RegFill { .. }
            | InstructionData::Call { .. }
            | InstructionData::CallIndirect { .. }
            | InstructionData::StackLoad { .. }
            | InstructionData::StackStore { .. }
            | InstructionData::Unary {
                opcode: Opcode::AdjustSpDown,
                ..
            }
            | InstructionData::UnaryImm {
                opcode: Opcode::AdjustSpUpImm,
                ..
            }
            | InstructionData::UnaryImm {
                opcode: Opcode::AdjustSpDownImm,
                ..
            } => {
                // All of these change, or might change, the memory-register bindings tracked in
                // `avail_env` in some way we don't know about, or at least, we might be able to
                // track, but for which the effort-to-benefit ratio seems too low to bother.  So
                // play safe: forget everything we know.
                //
                // For Call/CallIndirect, we could do better when compiling for calling
                // conventions that have callee-saved registers, since bindings for them would
                // remain valid across the call.
                avail_env.invalidate_all();
            }
            _ => {
                // Invalidate: any `avail_env` entry associated with a reg written by `inst`.
                invalidate_regs_written_by_inst(locations, diversions, dfg, avail_env, inst);
            }
        }

        // Actually do the transformation.
        match transform {
            Transform::NoChange => {}
            Transform::ChangeToNopFill(arg) => {
                // Load is completely redundant.  Convert it to a no-op.
                dfg.replace(inst).fill_nop(arg);
                let ok = func.update_encoding(inst, isa).is_ok();
                debug_assert!(
                    ok,
                    "fill_nop encoding missing for this type: `{}`",
                    func.dfg.display_inst(inst, isa)
                );
            }
            Transform::ChangeToCopyToSSA(ty, reg) => {
                // We already have the relevant value in some other register.  Convert the
                // load into a reg-reg copy.
                dfg.replace(inst).copy_to_ssa(ty, reg);
                let ok = func.update_encoding(inst, isa).is_ok();
                debug_assert!(ok, "copy_to_ssa encoding missing for type {}", ty);
            }
        }
    }
}

// =============================================================================================
// Top level: processing of tree shaped regions

impl RedundantReloadRemover {
    // Push a clone of the top-of-stack ProcessingStackElem.  This will be used to process exactly
    // one Block.  The diversions are created new, rather than cloned, to reflect the fact
    // that diversions are local to each Block.
    fn processing_stack_push(&mut self, cursor: CursorPosition) {
        let avail_env = if let Some(stack_top) = self.processing_stack.last() {
            stack_top.avail_env.clone()
        } else {
            AvailEnv::new(
                self.num_regunits
                    .expect("processing_stack_push: num_regunits unknown!")
                    as usize,
            )
        };
        self.processing_stack.push(ProcessingStackElem {
            avail_env,
            cursor,
            diversions: RegDiversions::new(),
        });
    }

    // This pushes the node `dst` onto the processing stack, and sets up the new
    // ProcessingStackElem accordingly.  But it does all that only if `dst` is part of the current
    // tree *and* we haven't yet visited it.
    fn processing_stack_maybe_push(&mut self, dst: Block) {
        if self.nodes_in_tree.contains(dst) && !self.nodes_already_visited.contains(dst) {
            if !self.processing_stack.is_empty() {
                // If this isn't the outermost node in the tree (that is, the root), then it must
                // have exactly one predecessor.  Nodes with no predecessors are dead and not
                // incorporated in any tree.  Nodes with two or more predecessors are the root of
                // some other tree, and visiting them as if they were part of the current tree
                // would be a serious error.
                debug_assert!(self.num_preds_per_block[dst] == ZeroOneOrMany::One);
            }
            self.processing_stack_push(CursorPosition::Before(dst));
            self.nodes_already_visited.insert(dst);
        }
    }

    // Perform redundant-reload removal on the tree shaped region of graph defined by `root` and
    // `self.nodes_in_tree`.  The following state is modified: `self.processing_stack`,
    // `self.nodes_already_visited`, and `func.dfg`.
    fn process_tree(
        &mut self,
        func: &mut Function,
        reginfo: &RegInfo,
        isa: &dyn TargetIsa,
        root: Block,
    ) {
        debug_assert!(self.nodes_in_tree.contains(root));
        debug_assert!(self.processing_stack.is_empty());
        debug_assert!(self.nodes_already_visited.is_empty());

        // Create the initial work item
        self.processing_stack_maybe_push(root);

        while !self.processing_stack.is_empty() {
            // It seems somewhat ridiculous to construct a whole new FuncCursor just so we can do
            // next_inst() on it once, and then copy the resulting position back out.  But use of
            // a function-global FuncCursor, or of the EncCursor in struct Context, leads to
            // borrow checker problems, as does including FuncCursor directly in
            // ProcessingStackElem.  In any case this is not as bad as it looks, since profiling
            // shows that the build-insert-step-extract work is reduced to just 8 machine
            // instructions in an optimised x86_64 build, presumably because rustc can inline and
            // then optimise out almost all the work.
            let tos = self.processing_stack.len() - 1;
            let mut pos = FuncCursor::new(func).at_position(self.processing_stack[tos].cursor);
            let maybe_inst = pos.next_inst();
            self.processing_stack[tos].cursor = pos.position();

            if let Some(inst) = maybe_inst {
                // Deal with this insn, possibly changing it, possibly updating the top item of
                // `self.processing_stack`.
                self.visit_inst(func, reginfo, isa, inst);

                // Update diversions after the insn.
                self.processing_stack[tos].diversions.apply(&func.dfg[inst]);

                // If the insn can branch outside this Block, push work items on the stack for all
                // target Blocks that are part of the same tree and that we haven't yet visited.
                // The next iteration of this instruction-processing loop will immediately start
                // work on the most recently pushed Block, and will eventually continue in this Block
                // when those new items have been removed from the stack.
                match func.dfg.analyze_branch(inst) {
                    BranchInfo::NotABranch => (),
                    BranchInfo::SingleDest(dst, _) => {
                        self.processing_stack_maybe_push(dst);
                    }
                    BranchInfo::Table(jt, default) => {
                        func.jump_tables[jt]
                            .iter()
                            .for_each(|dst| self.processing_stack_maybe_push(*dst));
                        if let Some(dst) = default {
                            self.processing_stack_maybe_push(dst);
                        }
                    }
                }
            } else {
                // We've come to the end of the current work-item (Block).  We'll already have
                // processed the fallthrough/continuation/whatever for it using the logic above.
                // Pop it off the stack and resume work on its parent.
                self.processing_stack.pop();
            }
        }
    }
}

// =============================================================================================
// Top level: perform redundant fill removal for a complete function

impl RedundantReloadRemover {
    /// Create a new remover state.
    pub fn new() -> Self {
        Self {
            num_regunits: None,
            num_preds_per_block: PrimaryMap::<Block, ZeroOneOrMany>::with_capacity(8),
            discovery_stack: Vec::<Block>::with_capacity(16),
            nodes_in_tree: EntitySet::<Block>::new(),
            processing_stack: Vec::<ProcessingStackElem>::with_capacity(8),
            nodes_already_visited: EntitySet::<Block>::new(),
        }
    }

    /// Clear the state of the remover.
    pub fn clear(&mut self) {
        self.clear_for_new_function();
    }

    fn clear_for_new_function(&mut self) {
        self.num_preds_per_block.clear();
        self.clear_for_new_tree();
    }

    fn clear_for_new_tree(&mut self) {
        self.discovery_stack.clear();
        self.nodes_in_tree.clear();
        self.processing_stack.clear();
        self.nodes_already_visited.clear();
    }

    #[inline(never)]
    fn do_redundant_fill_removal_on_function(
        &mut self,
        func: &mut Function,
        reginfo: &RegInfo,
        isa: &dyn TargetIsa,
        cfg: &ControlFlowGraph,
    ) {
        // Fail in an obvious way if there are more than (2^32)-1 Blocks in this function.
        let num_blocks: u32 = func.dfg.num_blocks().try_into().unwrap();

        // Clear out per-tree state.
        self.clear_for_new_function();

        // Create a PrimaryMap that summarises the number of predecessors for each block, as 0, 1
        // or "many", and that also claims the entry block as having "many" predecessors.
        self.num_preds_per_block.clear();
        self.num_preds_per_block.reserve(num_blocks as usize);

        for i in 0..num_blocks {
            let mut pi = cfg.pred_iter(Block::from_u32(i));
            let mut n_pi = ZeroOneOrMany::Zero;
            if pi.next().is_some() {
                n_pi = ZeroOneOrMany::One;
                if pi.next().is_some() {
                    n_pi = ZeroOneOrMany::Many;
                    // We don't care if there are more than two preds, so stop counting now.
                }
            }
            self.num_preds_per_block.push(n_pi);
        }
        debug_assert!(self.num_preds_per_block.len() == num_blocks as usize);

        // The entry block must be the root of some tree, so set up the state to reflect that.
        let entry_block = func
            .layout
            .entry_block()
            .expect("do_redundant_fill_removal_on_function: entry block unknown");
        debug_assert!(self.num_preds_per_block[entry_block] == ZeroOneOrMany::Zero);
        self.num_preds_per_block[entry_block] = ZeroOneOrMany::Many;

        // Now build and process trees.
        for root_ix in 0..self.num_preds_per_block.len() {
            let root = Block::from_u32(root_ix as u32);

            // Build a tree for each node that has two or more preds, and ignore all other nodes.
            if self.num_preds_per_block[root] != ZeroOneOrMany::Many {
                continue;
            }

            // Clear out per-tree state.
            self.clear_for_new_tree();

            // Discovery phase: build the tree, as `root` and `self.nodes_in_tree`.
            self.add_nodes_to_tree(cfg, root);
            debug_assert!(self.nodes_in_tree.cardinality() > 0);
            debug_assert!(self.num_preds_per_block[root] == ZeroOneOrMany::Many);

            // Processing phase: do redundant-reload-removal.
            self.process_tree(func, reginfo, isa, root);
            debug_assert!(
                self.nodes_in_tree.cardinality() == self.nodes_already_visited.cardinality()
            );
        }
    }
}

// =============================================================================================
// Top level: the external interface

struct Context<'a> {
    // Current instruction as well as reference to function and ISA.
    cur: EncCursor<'a>,

    // Cached ISA information.  We save it here to avoid frequent virtual function calls on the
    // `TargetIsa` trait object.
    reginfo: RegInfo,

    // References to contextual data structures we need.
    cfg: &'a ControlFlowGraph,

    // The running state.
    state: &'a mut RedundantReloadRemover,
}

impl RedundantReloadRemover {
    /// Run the remover.
    pub fn run(&mut self, isa: &dyn TargetIsa, func: &mut Function, cfg: &ControlFlowGraph) {
        let ctx = Context {
            cur: EncCursor::new(func, isa),
            reginfo: isa.register_info(),
            cfg,
            state: self,
        };
        let mut total_regunits = 0;
        for rb in isa.register_info().banks {
            total_regunits += rb.units;
        }
        ctx.state.num_regunits = Some(total_regunits);
        ctx.state.do_redundant_fill_removal_on_function(
            ctx.cur.func,
            &ctx.reginfo,
            ctx.cur.isa,
            &ctx.cfg,
        );
    }
}