Cranelift: implement redundant fill removal on tree-shaped CFG regions. Mozilla bug 1570584. (#906)

julian-seward1
2019-08-25 19:37:34 +02:00
committed by GitHub
parent cc57e84cbd
commit b8fb52446c
19 changed files with 1262 additions and 24 deletions

View File

@@ -9,6 +9,7 @@ use crate::cdsl::settings::SettingGroup;
use crate::shared::types::Bool::B1;
use crate::shared::types::Float::{F32, F64};
use crate::shared::types::Int::{I16, I32, I64, I8};
use crate::shared::types::Reference::{R32, R64};
use crate::shared::Definitions as SharedDefinitions;
use super::recipes::RecipeGroup;
@@ -121,7 +122,9 @@ pub fn define<'defs>(
let call_indirect = shared.by_name("call_indirect");
let copy = shared.by_name("copy");
let copy_nop = shared.by_name("copy_nop");
let copy_to_ssa = shared.by_name("copy_to_ssa");
let fill = shared.by_name("fill");
let fill_nop = shared.by_name("fill_nop");
let iadd = shared.by_name("iadd");
let iadd_imm = shared.by_name("iadd_imm");
let iconst = shared.by_name("iconst");
@@ -141,6 +144,8 @@ pub fn define<'defs>(
let return_ = shared.by_name("return");
// Recipes shorthands, prefixed with r_.
let r_copytossa = recipes.by_name("copytossa");
let r_fillnull = recipes.by_name("fillnull");
let r_icall = recipes.by_name("Icall");
let r_icopy = recipes.by_name("Icopy");
let r_ii = recipes.by_name("Ii");
@@ -368,6 +373,14 @@ pub fn define<'defs>(
e.add64(enc(fill.bind(I32), r_gp_fi, load_bits(0b010)));
e.add64(enc(fill.bind(I64), r_gp_fi, load_bits(0b011)));
// No-op fills, created by late-stage redundant-fill removal.
for &ty in &[I64, I32] {
e.add64(enc(fill_nop.bind(ty), r_fillnull, 0));
e.add32(enc(fill_nop.bind(ty), r_fillnull, 0));
}
e.add64(enc(fill_nop.bind(B1), r_fillnull, 0));
e.add32(enc(fill_nop.bind(B1), r_fillnull, 0));
// Register copies.
e.add32(enc(copy.bind(I32), r_icopy, opimm_bits(0b000, 0)));
e.add64(enc(copy.bind(I64), r_icopy, opimm_bits(0b000, 0)));
@@ -394,5 +407,34 @@ pub fn define<'defs>(
e.add64(enc(copy_nop.bind(ty), r_stacknull, 0));
}
// Copy-to-SSA
e.add32(enc(
copy_to_ssa.bind(I32),
r_copytossa,
opimm_bits(0b000, 0),
));
e.add64(enc(
copy_to_ssa.bind(I64),
r_copytossa,
opimm_bits(0b000, 0),
));
e.add64(enc(
copy_to_ssa.bind(I32),
r_copytossa,
opimm32_bits(0b000, 0),
));
e.add32(enc(copy_to_ssa.bind(B1), r_copytossa, opimm_bits(0b000, 0)));
e.add64(enc(copy_to_ssa.bind(B1), r_copytossa, opimm_bits(0b000, 0)));
e.add32(enc(
copy_to_ssa.bind_ref(R32),
r_copytossa,
opimm_bits(0b000, 0),
));
e.add64(enc(
copy_to_ssa.bind_ref(R64),
r_copytossa,
opimm_bits(0b000, 0),
));
e
}

View File

@@ -63,6 +63,7 @@ pub fn define<'formats>(
let f_branch_icmp = formats.by_name("BranchIcmp");
let f_call = formats.by_name("Call");
let f_call_indirect = formats.by_name("CallIndirect");
let f_copy_to_ssa = formats.by_name("CopyToSsa");
let f_int_compare = formats.by_name("IntCompare");
let f_int_compare_imm = formats.by_name("IntCompareImm");
let f_jump = formats.by_name("Jump");
@@ -185,6 +186,14 @@ pub fn define<'formats>(
.emit("put_i(bits, src, 0, dst, sink);"),
);
// Same for copy-to-SSA -- GPR regmove.
recipes.push(
EncodingRecipeBuilder::new("copytossa", f_copy_to_ssa, 4)
// No operands_in to mention, because a source register is specified directly.
.operands_out(vec![gpr])
.emit("put_i(bits, src, 0, out_reg0, sink);"),
);
// U-type instructions have a 20-bit immediate that targets bits 12-31.
let format = formats.get(f_unary_imm);
recipes.push(
@@ -271,5 +280,14 @@ pub fn define<'formats>(
.emit(""),
);
// No-op fills, created by late-stage redundant-fill removal.
recipes.push(
EncodingRecipeBuilder::new("fillnull", f_unary, 0)
.operands_in(vec![Stack::new(gpr)])
.operands_out(vec![gpr])
.clobbers_flags(false)
.emit(""),
);
recipes
}

View File

@@ -340,6 +340,7 @@ pub fn define(
let copy = shared.by_name("copy");
let copy_nop = shared.by_name("copy_nop");
let copy_special = shared.by_name("copy_special");
let copy_to_ssa = shared.by_name("copy_to_ssa");
let ctz = shared.by_name("ctz");
let debugtrap = shared.by_name("debugtrap");
let extractlane = shared.by_name("extractlane");
@@ -352,6 +353,7 @@ pub fn define(
let fdiv = shared.by_name("fdiv");
let ffcmp = shared.by_name("ffcmp");
let fill = shared.by_name("fill");
let fill_nop = shared.by_name("fill_nop");
let floor = shared.by_name("floor");
let fmul = shared.by_name("fmul");
let fpromote = shared.by_name("fpromote");
@@ -468,7 +470,9 @@ pub fn define(
let rec_fax = r.template("fax");
let rec_fcmp = r.template("fcmp");
let rec_fcscc = r.template("fcscc");
let rec_ffillnull = r.recipe("ffillnull");
let rec_ffillSib32 = r.template("ffillSib32");
let rec_fillnull = r.recipe("fillnull");
let rec_fillSib32 = r.template("fillSib32");
let rec_fld = r.template("fld");
let rec_fldDisp32 = r.template("fldDisp32");
@@ -490,6 +494,7 @@ pub fn define(
let rec_fstWithIndexDisp32 = r.template("fstWithIndexDisp32");
let rec_fstWithIndexDisp8 = r.template("fstWithIndexDisp8");
let rec_furm = r.template("furm");
let rec_furm_reg_to_ssa = r.template("furm_reg_to_ssa");
let rec_furmi_rnd = r.template("furmi_rnd");
let rec_got_fnaddr8 = r.template("got_fnaddr8");
let rec_got_gvaddr8 = r.template("got_gvaddr8");
@@ -568,6 +573,7 @@ pub fn define(
let rec_trapff = r.recipe("trapff");
let rec_u_id = r.template("u_id");
let rec_umr = r.template("umr");
let rec_umr_reg_to_ssa = r.template("umr_reg_to_ssa");
let rec_ur = r.template("ur");
let rec_urm = r.template("urm");
let rec_urm_noflags = r.template("urm_noflags");
@@ -921,6 +927,18 @@ pub fn define(
e.enc_r32_r64(fill, rec_fillSib32.opcodes(vec![0x8b]));
e.enc_r32_r64(regfill, rec_regfill32.opcodes(vec![0x8b]));
// No-op fills, created by late-stage redundant-fill removal.
for &ty in &[I64, I32, I16, I8] {
e.enc64_rec(fill_nop.bind(ty), rec_fillnull, 0);
e.enc32_rec(fill_nop.bind(ty), rec_fillnull, 0);
}
e.enc64_rec(fill_nop.bind(B1), rec_fillnull, 0);
e.enc32_rec(fill_nop.bind(B1), rec_fillnull, 0);
for &ty in &[F64, F32] {
e.enc64_rec(fill_nop.bind(ty), rec_ffillnull, 0);
e.enc32_rec(fill_nop.bind(ty), rec_ffillnull, 0);
}
// Load 32 bits from `b1`, `i8` and `i16` spill slots. See `spill.b1` above.
e.enc_both(fill.bind(B1), rec_fillSib32.opcodes(vec![0x8b]));
@@ -943,6 +961,24 @@ pub fn define(
e.enc64(copy_special, rec_copysp.opcodes(vec![0x89]).rex().w());
e.enc32(copy_special, rec_copysp.opcodes(vec![0x89]));
// Copy to SSA
e.enc_i32_i64(copy_to_ssa, rec_umr_reg_to_ssa.opcodes(vec![0x89]));
e.enc_r32_r64(copy_to_ssa, rec_umr_reg_to_ssa.opcodes(vec![0x89]));
e.enc_both(copy_to_ssa.bind(B1), rec_umr_reg_to_ssa.opcodes(vec![0x89]));
e.enc_both(copy_to_ssa.bind(I8), rec_umr_reg_to_ssa.opcodes(vec![0x89]));
e.enc_both(
copy_to_ssa.bind(I16),
rec_umr_reg_to_ssa.opcodes(vec![0x89]),
);
e.enc_both(
copy_to_ssa.bind(F64),
rec_furm_reg_to_ssa.opcodes(vec![0xf2, 0x0f, 0x10]),
);
e.enc_both(
copy_to_ssa.bind(F32),
rec_furm_reg_to_ssa.opcodes(vec![0xf3, 0x0f, 0x10]),
);
// Stack-slot-to-the-same-stack-slot copy, which is guaranteed to turn
// into a no-op.
// The same encoding is generated for both the 64- and 32-bit architectures.

View File

@@ -367,6 +367,7 @@ pub fn define<'shared>(
let f_call = formats.by_name("Call");
let f_call_indirect = formats.by_name("CallIndirect");
let f_copy_special = formats.by_name("CopySpecial");
let f_copy_to_ssa = formats.by_name("CopyToSsa");
let f_extract_lane = formats.by_name("ExtractLane"); // TODO this would preferably retrieve a BinaryImm8 format but because formats are compared structurally and ExtractLane has the same structure this is impossible--if we rename ExtractLane, it may even impact parsing
let f_float_compare = formats.by_name("FloatCompare");
let f_float_cond = formats.by_name("FloatCond");
@@ -426,6 +427,22 @@ pub fn define<'shared>(
.emit(""),
);
// No-op fills, created by late-stage redundant-fill removal.
recipes.add_recipe(
EncodingRecipeBuilder::new("fillnull", f_unary, 0)
.operands_in(vec![stack_gpr32])
.operands_out(vec![gpr])
.clobbers_flags(false)
.emit(""),
);
recipes.add_recipe(
EncodingRecipeBuilder::new("ffillnull", f_unary, 0)
.operands_in(vec![stack_gpr32])
.operands_out(vec![fpr])
.clobbers_flags(false)
.emit(""),
);
recipes
.add_recipe(EncodingRecipeBuilder::new("debugtrap", f_nullary, 1).emit("sink.put1(0xcc);"));
@@ -570,6 +587,20 @@ pub fn define<'shared>(
),
);
// Same as umr, but with the source register specified directly.
recipes.add_template_recipe(
EncodingRecipeBuilder::new("umr_reg_to_ssa", f_copy_to_ssa, 1)
// No operands_in to mention, because a source register is specified directly.
.operands_out(vec![gpr])
.clobbers_flags(false)
.emit(
r#"
{{PUT_OP}}(bits, rex2(out_reg0, src), sink);
modrm_rr(out_reg0, src, sink);
"#,
),
);
// XX /r, but for a unary operator with separate input/output register.
// RM form. Clobbers FLAGS.
recipes.add_template_recipe(
@@ -631,6 +662,20 @@ pub fn define<'shared>(
),
);
// Same as furm, but with the source register specified directly.
recipes.add_template_recipe(
EncodingRecipeBuilder::new("furm_reg_to_ssa", f_copy_to_ssa, 1)
// No operands_in to mention, because a source register is specified directly.
.operands_out(vec![fpr])
.clobbers_flags(false)
.emit(
r#"
{{PUT_OP}}(bits, rex2(src, out_reg0), sink);
modrm_rr(src, out_reg0, sink);
"#,
),
);
// XX /r, RM form, GPR -> FPR.
recipes.add_template_recipe(
EncodingRecipeBuilder::new("frurm", f_unary, 1)

View File

@@ -157,6 +157,7 @@ pub fn define(immediates: &OperandKinds, entities: &OperandKinds) -> FormatRegis
.imm(("src", regunit))
.imm(("dst", regunit)),
);
registry.insert(Builder::new("CopyToSsa").imm(("src", regunit)));
registry.insert(
Builder::new("RegSpill")
.value()

View File

@@ -1194,6 +1194,22 @@ pub fn define(
.can_load(true),
);
ig.push(
Inst::new(
"fill_nop",
r#"
This is identical to `fill`, except it has no encoding, since it is a no-op.
This instruction is created only during late-stage redundant-reload removal, after all
registers and stack slots have been assigned. It is used to replace `fill`s that have
been identified as redundant.
"#,
)
.operands_in(vec![x])
.operands_out(vec![a])
.can_load(true),
);
let src = &operand("src", regunit);
let dst = &operand("dst", regunit);
@@ -1233,6 +1249,23 @@ pub fn define(
.other_side_effects(true),
);
ig.push(
Inst::new(
"copy_to_ssa",
r#"
Copies the contents of ''src'' register to ''a'' SSA name.
This instruction copies the contents of one register, regardless of its SSA name, to
another register, creating a new SSA name. In that sense it is a one-sided version
of ''copy_special''. This instruction is internal and should not be created by
Cranelift users.
"#,
)
.operands_in(vec![src])
.operands_out(vec![a])
.other_side_effects(true),
);
ig.push(
Inst::new(
"copy_nop",

View File

@@ -23,6 +23,7 @@ use crate::licm::do_licm;
use crate::loop_analysis::LoopAnalysis;
use crate::nan_canonicalization::do_nan_canonicalization;
use crate::postopt::do_postopt;
use crate::redundant_reload_remover::RedundantReloadRemover;
use crate::regalloc;
use crate::result::CodegenResult;
use crate::settings::{FlagsOrIsa, OptLevel};
@@ -50,6 +51,9 @@ pub struct Context {
/// Loop analysis of `func`.
pub loop_analysis: LoopAnalysis,
/// Redundant-reload remover context.
pub redundant_reload_remover: RedundantReloadRemover,
}
impl Context {
@@ -72,6 +76,7 @@ impl Context {
domtree: DominatorTree::new(),
regalloc: regalloc::Context::new(),
loop_analysis: LoopAnalysis::new(),
redundant_reload_remover: RedundantReloadRemover::new(),
}
}
@@ -82,6 +87,7 @@ impl Context {
self.domtree.clear();
self.regalloc.clear();
self.loop_analysis.clear();
self.redundant_reload_remover.clear();
}
/// Compile the function, and emit machine code into a `Vec<u8>`.
@@ -149,6 +155,7 @@ impl Context {
self.regalloc(isa)?;
self.prologue_epilogue(isa)?;
if isa.flags().opt_level() == OptLevel::Best {
self.redundant_reload_remover(isa)?;
self.shrink_instructions(isa)?;
}
self.relax_branches(isa)
@@ -322,6 +329,14 @@ impl Context {
Ok(())
}
/// Do redundant-reload removal after allocation of both registers and stack slots.
pub fn redundant_reload_remover(&mut self, isa: &dyn TargetIsa) -> CodegenResult<()> {
self.redundant_reload_remover
.run(isa, &mut self.func, &self.cfg);
self.verify_if(isa)?;
Ok(())
}
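For orientation, this pass is only invoked from `compile` above when the ISA flags request `OptLevel::Best`. A minimal sketch of how a client opts in, assuming the `settings` API of this era (the "opt_level"/"best" strings are an assumption, not something added by this commit):

    use cranelift_codegen::settings::{self, Configurable};

    // Hypothetical helper: build flags with the `Best` optimization level, which is what
    // gates `redundant_reload_remover` in `Context::compile`.
    fn flags_with_fill_removal() -> settings::Flags {
        let mut builder = settings::builder();
        builder
            .set("opt_level", "best")
            .expect("opt_level setting should exist");
        settings::Flags::new(builder)
    }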
/// Run the instruction shrinking pass.
pub fn shrink_instructions(&mut self, isa: &dyn TargetIsa) -> CodegenResult<()> {
shrink_instructions(&mut self.func, isa);

View File

@@ -209,11 +209,6 @@ impl StackSlots {
self.slots.is_valid(ss)
}
/// Set the offset of a stack slot.
pub fn set_offset(&mut self, ss: StackSlot, offset: StackOffset) {
self.slots[ss].offset = Some(offset);
}
/// Get an iterator over all the stack slot keys.
pub fn iter(&self) -> Iter<StackSlot, StackSlotData> {
self.slots.iter()

View File

@@ -95,6 +95,7 @@ mod nan_canonicalization;
mod partition_slice;
mod postopt;
mod predicates;
mod redundant_reload_remover;
mod ref_slice;
mod regalloc;
mod result;

View File

@@ -0,0 +1,904 @@
//! This module implements a late-stage redundant-reload remover, which runs after registers have
//! been allocated and stack slots have been given specific offsets.
use crate::cursor::{Cursor, CursorPosition, EncCursor, FuncCursor};
use crate::entity::EntitySet;
use crate::flowgraph::ControlFlowGraph;
use crate::ir::dfg::DataFlowGraph;
use crate::ir::instructions::BranchInfo;
use crate::ir::stackslot::{StackSlotKind, StackSlots};
use crate::ir::{
Ebb, Function, Inst, InstBuilder, InstructionData, Opcode, StackSlotData, Type, Value, ValueLoc,
};
use crate::isa::{RegInfo, RegUnit, TargetIsa};
use crate::regalloc::RegDiversions;
use core::convert::TryInto;
use cranelift_entity::{PrimaryMap, SecondaryMap};
use std::vec::Vec;
// =============================================================================================
// A description of the redundant-fill-removal algorithm
//
//
// The algorithm works forwards through each Ebb. It carries along and updates a table,
// AvailEnv, with which it tracks registers that are known to have the same value as some stack
// slot. The actions on encountering an instruction depend on the instruction, as follows:
//
// ss1 = spill r0: update the AvailEnv so as to note that slot `ss1` and register `r0`
// have the same value.
//
// r1 = fill ss0: look in the AvailEnv. If it tells us that register `r1` and slot `ss0`
// have the same value, then delete the instruction by converting it to a
// `fill_nop`.
//
// If it tells us that some other register `r2` has the same value as
// slot `ss0`, convert the instruction into a copy from `r2` to `r1`.
//
// any other insn: remove from the AvailEnv any bindings associated with registers
// written by this instruction, since they will be invalidated by it.
//
// Tracking the effects of `copy` instructions in AvailEnv for the case when both source and
// destination are registers does not cause any more fills to be removed or converted to copies.
// It's not clear why.
//
// There are various other instruction-handling cases in `visit_inst`, which are documented
// in-line, and do not change the core algorithm, so are not described here.
//
// The registers tracked by AvailEnv are the post-diversion registers that are really used by the
// code; they are not the pre-diversion names associated with each SSA `Value`. The second
// `fill` case above opportunistically copies values from registers that may have been diversion
// targets in some predecessor block, and so are no longer associated with any specific SSA-level
// name at the point the copy is made. Hence those copies (from `r2` to `r1`) cannot be done
// with an ordinary `copy` instruction. Instead they have to be done using a new `copy_to_ssa`
// instruction, which copies from an arbitrary register to a register-resident `Value` (that is,
// "back to" SSA-world).
//
// That completes the description of the core algorithm.
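As a worked illustration of the three cases above, here is a minimal, self-contained Rust sketch of the bookkeeping. All names in it (MiniEnv, Slot, Action) are illustrative rather than Cranelift's, and it ignores stack-slot kinds, partial overlaps, register banks and diversions, all of which the real AvailEnv below handles.

    #[derive(Clone, Copy, PartialEq)]
    struct Slot { offset: i32, size: u32 }

    enum Action { KeepFill, MakeFillNop, CopyFromReg(usize) }

    // Maps each register number to the slot (if any) known to hold the same value.
    struct MiniEnv { map: Vec<Option<Slot>> }

    impl MiniEnv {
        fn new(num_regs: usize) -> Self { Self { map: vec![None; num_regs] } }

        // `ss = spill r`: note that register `r` and slot `ss` now hold the same value.
        fn spill(&mut self, r: usize, ss: Slot) { self.map[r] = Some(ss); }

        // Any other write to `r` invalidates whatever we knew about it.
        fn write_reg(&mut self, r: usize) { self.map[r] = None; }

        // `r = fill ss`: decide how the fill should be rewritten, and update the environment.
        fn fill(&mut self, r: usize, ss: Slot) -> Action {
            if self.map[r] == Some(ss) {
                // Exact binding: `r` already holds the slot's value, so the fill is redundant.
                Action::MakeFillNop
            } else if let Some(other) = self.map.iter().position(|b| *b == Some(ss)) {
                // Inexact binding: some other register holds the value; rewrite to a copy.
                self.map[r] = Some(ss);
                Action::CopyFromReg(other)
            } else {
                // Nothing known: keep the fill and record the new binding.
                self.map[r] = Some(ss);
                Action::KeepFill
            }
        }
    }

Run over the `spill`/`fill` cases above, this yields exactly the two rewrites the pass performs: `fill_nop` for the exact case and `copy_to_ssa` for the inexact one.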
//
// In the case where a block `A` jumps to `B` and `A` is the only predecessor of `B`, the
// AvailEnv at the end of `A` will still be valid at the entry to `B`. In such a case, we can
// profitably transform `B` using the AvailEnv "inherited" from `A`. In order to take full
// advantage of this, this module partitions the function's CFG into tree-shaped groups of
// blocks, and processes each tree as described above. So the AvailEnv is only initialised to
// empty at the start of blocks that form the root of each tree; that is, for blocks which have
// two or more predecessors.
// =============================================================================================
// Top level algorithm structure
//
// The overall algorithm, for a function, starts like this:
//
// * (once per function): finds Ebbs that have two or more predecessors, since they will be the
// roots of Ebb trees. Also, the entry node for the function is considered to be a root.
//
// It then continues with a loop that first finds a tree of Ebbs ("discovery") and then removes
// redundant fills as described above ("processing"):
//
// * (discovery; once per tree): for each root, performs a depth first search to find all the Ebbs
// in the tree, guided by RedundantReloadRemover::discovery_stack.
//
// * (processing; once per tree): the just-discovered tree is then processed as described above,
// guided by RedundantReloadRemover::processing_stack.
//
// In this way, all Ebbs reachable from the function's entry point are eventually processed. Note
// that each tree is processed as soon as it has been discovered, so the algorithm never creates a
// list of trees for the function.
//
// The running state is stored in `RedundantReloadRemover`. This is allocated once and can be
// reused for multiple functions so as to minimise heap turnover. The fields are, roughly:
//
// num_regunits -- constant for the whole function; used by the tree processing phase
// num_preds_per_ebb -- constant for the whole function; used by the tree discovery process
//
// discovery_stack -- used to guide the tree discovery process
// nodes_in_tree -- the discovered nodes are recorded here
//
// processing_stack -- used to guide the tree processing process
// nodes_already_visited -- used to ensure the tree processing logic terminates in the case
// where a tree has a branch back to its root node.
//
// There is further documentation in line below, as appropriate.
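The same structure can be seen in a toy model of the two phases. Everything here is illustrative (plain indices stand in for Ebbs, and `discover_trees` is a made-up name); the real logic is in `add_nodes_to_tree` and `do_redundant_fill_removal_on_function` below.

    // Given a CFG as adjacency lists of successors, return one (root, nodes) pair per tree.
    fn discover_trees(successors: &[Vec<usize>], entry: usize) -> Vec<(usize, Vec<usize>)> {
        // Count predecessors, capped at 2 ("many"), and force the entry block to be a root.
        let mut preds = vec![0usize; successors.len()];
        for succs in successors {
            for &s in succs {
                preds[s] = (preds[s] + 1).min(2);
            }
        }
        preds[entry] = 2;

        let mut trees = Vec::new();
        for root in 0..successors.len() {
            if preds[root] != 2 {
                continue; // not a root: unreachable (0 preds) or interior to some tree (1 pred)
            }
            // Depth-first search that only follows into single-predecessor blocks.
            let mut in_tree = vec![false; successors.len()];
            in_tree[root] = true;
            let mut nodes = vec![root];
            let mut stack = vec![root];
            while let Some(n) = stack.pop() {
                for &s in &successors[n] {
                    if preds[s] == 1 && !in_tree[s] {
                        in_tree[s] = true;
                        nodes.push(s);
                        stack.push(s);
                    }
                }
            }
            trees.push((root, nodes));
        }
        trees
    }

For a diamond CFG 0 -> {1, 2}, 1 -> 3, 2 -> 3, this yields two trees: {0, 1, 2} rooted at block 0 and {3} rooted at block 3, matching the description above (the real pass processes each tree as soon as it is discovered rather than collecting them into a list).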
// =============================================================================================
// A side note on register choice heuristics
// The core algorithm opportunistically replaces fill instructions when it knows of a register
// that already holds the required value. How effective this is largely depends on how long
// reloaded values happen to stay alive before the relevant register is overwritten. And that
// depends on the register allocator's register choice heuristics. The worst case is when the
// register allocator reuses registers as soon as possible after they become free. Unfortunately,
// that was indeed the selection scheme prior to the development of this pass.
//
// As part of this work, the register selection scheme has been changed as follows: for registers
// written by any instruction other than a fill, use the lowest numbered available register. But
// for registers written by a fill instruction, use the highest numbered available register. The
// aim is to try and keep reload- and non-reload registers disjoint to the extent possible.
// Several other schemes were tried, but this one is simple and can be worth an extra 2% of
// performance in some cases.
//
// The relevant change is more or less a one-line change in the solver.
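As a sketch of that choice, assuming a single register bank whose free registers fit in one 32-bit mask (the real code instead threads an `is_reload` flag through the solver and picks via `RegSetIter::next` versus the new `rnext`, both of which appear later in this commit):

    // Illustrative helper, not Cranelift's API: pick a register from a bitmask of free ones.
    fn pick_register(free: u32, is_reload: bool) -> Option<u32> {
        if free == 0 {
            return None;
        }
        Some(if is_reload {
            // Highest-numbered free register for values produced by fills...
            31 - free.leading_zeros()
        } else {
            // ...lowest-numbered free register for everything else.
            free.trailing_zeros()
        })
    }

With `free = 0b0110`, a fill is given register 2 while any other instruction is given register 1, keeping the two populations apart for as long as possible.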
// =============================================================================================
// Data structures used for discovery of trees
// `ZeroOneOrMany` is used to record the number of predecessors an Ebb block has. The `Zero` case
// is included so as to cleanly handle the case where the incoming graph has unreachable Ebbs.
#[derive(Clone, PartialEq)]
enum ZeroOneOrMany {
Zero,
One,
Many,
}
// =============================================================================================
// Data structures used for processing of trees
// `SlotInfo` describes a spill slot in the obvious way. Note that it doesn't indicate which
// register(s) are currently associated with the slot. That job is done by `AvailEnv` instead.
//
// In the CL framework, stack slots are partitioned into disjoint sets, one for each
// `StackSlotKind`. The offset and size only give a unique identity within any particular
// `StackSlotKind`. So, to uniquely identify a stack slot, all three fields are necessary.
#[derive(Clone, Copy)]
struct SlotInfo {
kind: StackSlotKind,
offset: i32,
size: u32,
}
// `AvailEnv` maps each possible register to a stack slot that holds the same value. The index
// space of `AvailEnv::map` is exactly the set of registers available on the current target. If
// (as is mostly the case) a register is not known to have the same value as a stack slot, then
// its entry is `None` rather than `Some(..)`.
//
// Invariants for AvailEnv:
//
// AvailEnv may have multiple different registers bound to the same stack slot -- that is, `(kind,
// offset, size)` triple. That's OK, and reflects the reality that those two registers contain
// the same value. This could happen, for example, in the case
//
// ss1 = spill r0
// ..
// r2 = fill ss1
//
// Then both `r0` and `r2` will have the same value as `ss1`, provided that ".." doesn't write to
// `r0`.
//
// To say that two different registers may be bound to the same stack slot is the same as saying
// that it is allowed to have two different entries in AvailEnv with the same `(kind, offset,
// size)` triple. What is *not* allowed is to have partial overlaps. That is, if two SlotInfos
// have the same `kind` field and have `offset` and `size` fields that overlap, then their
// `offset` and `size` fields must be identical. This is so as to make the algorithm safe against
// situations where, for example, a 64 bit register is spilled, but then only the bottom 32 bits
// are reloaded from the slot.
//
// In such a case, though, the Cranelift IR would most likely be ill-typed, so this case
// probably cannot occur in practice.
#[derive(Clone)]
struct AvailEnv {
map: Vec<Option<SlotInfo>>,
}
// `ProcessingStackElem` combines AvailEnv with contextual information needed to "navigate" within
// an Ebb.
//
// A ProcessingStackElem conceptually has the lifetime of exactly one Ebb: once the current Ebb is
// completed, the ProcessingStackElem will be abandoned. In practice the top level state,
// RedundantReloadRemover, caches them, so as to avoid heap turnover.
//
// Note that ProcessingStackElem must contain a CursorPosition. The CursorPosition, which
// indicates where we are in the current Ebb, cannot be implicitly maintained by looping over all
// the instructions in an Ebb in turn, because we may choose to suspend processing the current Ebb
// at a side exit, continue by processing the subtree reached via the side exit, and only later
// resume the current Ebb.
struct ProcessingStackElem {
/// Indicates the AvailEnv at the current point in the Ebb.
avail_env: AvailEnv,
/// Shows where we currently are inside the Ebb.
cursor: CursorPosition,
/// Indicates the currently active register diversions at the current point.
diversions: RegDiversions,
}
// =============================================================================================
// The top level data structure
// `RedundantReloadRemover` contains data structures for the two passes: discovery of tree shaped
// regions, and processing of them. These are allocated once and stay alive for the entire
// function, even though they are cleared out for each new tree shaped region. It also caches
// `num_regunits` and `num_preds_per_ebb`, which are computed at the start of each function and
// then remain constant.
/// The redundant reload remover's state.
pub struct RedundantReloadRemover {
/// The total number of RegUnits available on this architecture. This is unknown when the
/// RedundantReloadRemover is created. It becomes known at the beginning of processing of a
/// function.
num_regunits: Option<u16>,
/// This stores, for each Ebb, a characterisation of the number of predecessors it has.
num_preds_per_ebb: PrimaryMap<Ebb, ZeroOneOrMany>,
/// The stack used for the first phase (discovery). There is one element on the discovery
/// stack for each currently unexplored Ebb in the tree being searched.
discovery_stack: Vec<Ebb>,
/// The nodes in the discovered tree are inserted here.
nodes_in_tree: EntitySet<Ebb>,
/// The stack used during the second phase (transformation). There is one element on the
/// processing stack for each currently-open node in the tree being transformed.
processing_stack: Vec<ProcessingStackElem>,
/// Used in the second phase to avoid visiting nodes more than once.
nodes_already_visited: EntitySet<Ebb>,
}
// =============================================================================================
// Miscellaneous small helper functions
// Is this a kind of stack slot that is safe to track in AvailEnv? This is probably overly
// conservative, but tracking only the SpillSlot and IncomingArgument kinds catches almost all
// available redundancy in practice.
fn is_slot_kind_tracked(kind: StackSlotKind) -> bool {
match kind {
StackSlotKind::SpillSlot | StackSlotKind::IncomingArg => true,
_ => false,
}
}
// Find out if the range `[offset, offset + size)` overlaps with the range in `si`.
fn overlaps(si: &SlotInfo, offset: i32, size: u32) -> bool {
let a_offset = si.offset as i64;
let a_size = si.size as i64;
let b_offset = offset as i64;
let b_size = size as i64;
let no_overlap = a_offset + a_size <= b_offset || b_offset + b_size <= a_offset;
!no_overlap
}
// Find, in `reginfo`, the register bank that `reg` lives in, and return the lower limit and size
// of the bank. This is so the caller can conveniently iterate over all RegUnits in the bank that
// `reg` lives in.
fn find_bank_limits(reginfo: &RegInfo, reg: RegUnit) -> (RegUnit, u16) {
if let Some(bank) = reginfo.bank_containing_regunit(reg) {
return (bank.first_unit, bank.units);
}
// We should never get here, since `reg` must come from *some* RegBank.
panic!("find_regclass_limits: reg not found");
}
// Returns the register that `v` is allocated to. Assumes that `v` actually resides in a
// register.
fn reg_of_value(locations: &SecondaryMap<Value, ValueLoc>, v: Value) -> RegUnit {
match locations[v] {
ValueLoc::Reg(ru) => ru,
_ => panic!("reg_of_value: value isn't in a reg"),
}
}
// Returns the stack slot that `v` is allocated to. Assumes that `v` actually resides in a stack
// slot.
fn slot_of_value<'s>(
locations: &SecondaryMap<Value, ValueLoc>,
stack_slots: &'s StackSlots,
v: Value,
) -> &'s StackSlotData {
match locations[v] {
ValueLoc::Stack(slot) => &stack_slots[slot],
_ => panic!("slot_of_value: value isn't in a stack slot"),
}
}
// =============================================================================================
// Top level: discovery of tree shaped regions
impl RedundantReloadRemover {
// A helper for `add_nodes_to_tree` below.
fn discovery_stack_push_successors_of(&mut self, cfg: &ControlFlowGraph, node: Ebb) {
for successor in cfg.succ_iter(node) {
self.discovery_stack.push(successor);
}
}
// Visit the tree of Ebbs rooted at `starting_point` and add them to `self.nodes_in_tree`.
// `self.num_preds_per_ebb` guides the process, ensuring we don't leave the tree-ish region
// and indirectly ensuring that the process will terminate in the presence of cycles in the
// graph. `self.discovery_stack` holds the search state in this function.
fn add_nodes_to_tree(&mut self, cfg: &ControlFlowGraph, starting_point: Ebb) {
// One might well ask why this doesn't loop forever when it encounters cycles in the
// control flow graph. The reason is that any cycle in the graph that is reachable from
// anywhere outside the cycle -- in particular, that is reachable from the function's
// entry node -- must have at least one node that has two or more predecessors. So the
// logic below won't follow into it, because it regards any such node as the root of some
// other tree.
debug_assert!(self.discovery_stack.is_empty());
debug_assert!(self.nodes_in_tree.is_empty());
self.nodes_in_tree.insert(starting_point);
self.discovery_stack_push_successors_of(cfg, starting_point);
while let Some(node) = self.discovery_stack.pop() {
match self.num_preds_per_ebb[node] {
// We arrived at a node with multiple predecessors, so it's a new root. Ignore it.
ZeroOneOrMany::Many => {}
// This node has just one predecessor, so we should incorporate it in the tree and
// immediately transition into searching from it instead.
ZeroOneOrMany::One => {
self.nodes_in_tree.insert(node);
self.discovery_stack_push_successors_of(cfg, node);
}
// This is meaningless. We arrived at a node that doesn't point back at where we
// came from.
ZeroOneOrMany::Zero => panic!("add_nodes_to_tree: inconsistent graph"),
}
}
}
}
// =============================================================================================
// Operations relating to `AvailEnv`
impl AvailEnv {
// Create a new one.
fn new(size: usize) -> Self {
let mut env = AvailEnv {
map: Vec::<Option<SlotInfo>>::new(),
};
env.map.resize(size, None);
env
}
// Debug only: checks (some of) the required AvailEnv invariants.
#[cfg(debug_assertions)]
fn check_invariants(&self) -> bool {
// Check that any overlapping entries overlap exactly. This is super lame (quadratic),
// but it's only used in debug builds.
for i in 0..self.map.len() {
if let Some(si) = self.map[i] {
for j in i + 1..self.map.len() {
if let Some(sj) = self.map[j] {
// "si and sj overlap, but not exactly"
if si.kind == sj.kind
&& overlaps(&si, sj.offset, sj.size)
&& !(si.offset == sj.offset && si.size == sj.size)
{
return false;
}
}
}
}
}
true
}
// Invalidates the binding associated with `reg`. Note that by construction of AvailEnv,
// `reg` can only be associated with one binding at once.
fn invalidate_by_reg(&mut self, reg: RegUnit) {
self.map[reg as usize] = None;
}
// Invalidates any binding that has any overlap with `(kind, offset, size)`.
fn invalidate_by_offset(&mut self, kind: StackSlotKind, offset: i32, size: u32) {
debug_assert!(is_slot_kind_tracked(kind));
for i in 0..self.map.len() {
if let Some(si) = &self.map[i] {
if si.kind == kind && overlaps(&si, offset, size) {
self.map[i] = None;
}
}
}
}
// Invalidates all bindings.
fn invalidate_all(&mut self) {
for i in 0..self.map.len() {
self.map[i] = None;
}
}
// Updates AvailEnv to track the effect of a `regmove` instruction.
fn copy_reg(&mut self, src: RegUnit, dst: RegUnit) {
self.map[dst as usize] = self.map[src as usize];
}
// Does `env` have the exact binding characterised by `(reg, kind, offset, size)` ?
fn has_exact_binding(&self, reg: RegUnit, kind: StackSlotKind, offset: i32, size: u32) -> bool {
debug_assert!(is_slot_kind_tracked(kind));
if let Some(si) = &self.map[reg as usize] {
return si.kind == kind && si.offset == offset && si.size == size;
}
// No such binding.
false
}
// Does `env` have a binding characterised by `(kind, offset, size)` but to a register, let's
// call it `other_reg`, that isn't `reg`? If so, return `other_reg`. Note that `other_reg`
// will have the same bank as `reg`. It is a checked error to call this function with a
// binding matching all four of `(reg, kind, offset, size)`.
fn has_inexact_binding(
&self,
reginfo: &RegInfo,
reg: RegUnit,
kind: StackSlotKind,
offset: i32,
size: u32,
) -> Option<RegUnit> {
debug_assert!(is_slot_kind_tracked(kind));
// Find the range of RegUnit numbers for the bank that contains `reg`, and use that as our
// search space. This is so as to guarantee that any match is restricted to the same bank
// as `reg`.
let (first_unit, num_units) = find_bank_limits(reginfo, reg);
for other_reg in first_unit..first_unit + num_units {
if let Some(si) = &self.map[other_reg as usize] {
if si.kind == kind && si.offset == offset && si.size == size {
if other_reg == reg {
panic!("has_inexact_binding: binding *is* exact!");
}
return Some(other_reg);
}
}
}
// No such binding.
None
}
// Create the binding `(reg, kind, offset, size)` in `env`, and throw away any previous
// binding associated with either `reg` or the `(kind, offset, size)` triple.
fn bind(&mut self, reg: RegUnit, kind: StackSlotKind, offset: i32, size: u32) {
debug_assert!(is_slot_kind_tracked(kind));
self.invalidate_by_offset(kind, offset, size);
self.map[reg as usize] = Some(SlotInfo { kind, offset, size });
}
}
// Invalidates in `avail_env` any binding associated with a regunit that is written by `inst`.
fn invalidate_regs_written_by_inst(
locations: &SecondaryMap<Value, ValueLoc>,
diversions: &RegDiversions,
dfg: &DataFlowGraph,
avail_env: &mut AvailEnv,
inst: Inst,
) {
for v in dfg.inst_results(inst).iter() {
if let ValueLoc::Reg(ru) = locations[*v] {
// This must be true. It would be meaningless for an SSA value to be diverted before
// the point where it is defined.
debug_assert!(diversions.reg(*v, locations) == ru);
avail_env.invalidate_by_reg(ru);
}
}
}
// =============================================================================================
// Processing of individual instructions
impl RedundantReloadRemover {
// Process `inst`, possibly changing it into a different instruction, and possibly changing
// `self.avail_env` and `func.dfg`.
fn visit_inst(
&mut self,
func: &mut Function,
reginfo: &RegInfo,
isa: &dyn TargetIsa,
inst: Inst,
) {
// Get hold of the top-of-stack work item. This is the state that we will mutate during
// processing of this instruction.
debug_assert!(!self.processing_stack.is_empty());
let ProcessingStackElem {
avail_env,
cursor: _,
diversions,
} = &mut self.processing_stack.last_mut().unwrap();
#[cfg(debug_assertions)]
debug_assert!(
avail_env.check_invariants(),
"visit_inst: env invariants not ok"
);
let dfg = &mut func.dfg;
let locations = &func.locations;
let stack_slots = &func.stack_slots;
// To avoid difficulties with the borrow checker, do this in two stages. First, examine
// the instruction to see if it can be deleted or modified, and park the relevant
// information in `transform`. Update `self.avail_env` too. Later, use `transform` to
// actually do the transformation if necessary.
enum Transform {
NoChange,
ChangeToNopFill(Value), // delete this insn entirely
ChangeToCopyToSSA(Type, RegUnit), // change it into a copy from the specified reg
}
let mut transform = Transform::NoChange;
// In this match { .. } statement, either we must treat the instruction specially, or we
// must call `invalidate_regs_written_by_inst` on it.
match &dfg[inst] {
InstructionData::Unary {
opcode: Opcode::Spill,
arg: src_value,
} => {
// Extract: (src_reg, kind, offset, size)
// Invalidate: (kind, offset, size)
// Add new binding: {src_reg -> (kind, offset, size)}
// Don't forget that src_value might be diverted, so we have to deref it.
let slot = slot_of_value(locations, stack_slots, dfg.inst_results(inst)[0]);
let src_reg = diversions.reg(*src_value, locations);
let kind = slot.kind;
if is_slot_kind_tracked(kind) {
let offset = slot.offset.expect("visit_inst: spill with no offset");
let size = slot.size;
avail_env.bind(src_reg, kind, offset, size);
} else {
// We don't expect this insn to write any regs. But to be consistent with the
// rule above, do this anyway.
invalidate_regs_written_by_inst(locations, diversions, dfg, avail_env, inst);
}
}
InstructionData::Unary {
opcode: Opcode::Fill,
arg: src_value,
} => {
// Extract: (dst_reg, kind, offset, size)
// Invalidate: (kind, offset, size)
// Add new binding: {dst_reg -> (kind, offset, size)}
let slot = slot_of_value(locations, stack_slots, *src_value);
let dst_value = dfg.inst_results(inst)[0];
let dst_reg = reg_of_value(locations, dst_value);
// This must be true. It would be meaningless for an SSA value to be diverted
// before it was defined.
debug_assert!(dst_reg == diversions.reg(dst_value, locations));
let kind = slot.kind;
if is_slot_kind_tracked(kind) {
let offset = slot.offset.expect("visit_inst: fill with no offset");
let size = slot.size;
if avail_env.has_exact_binding(dst_reg, kind, offset, size) {
// This instruction is an exact copy of a fill we saw earlier, and the
// loaded value is still valid. So we'll schedule this instruction for
// deletion (below). No need to make any changes to `avail_env`.
transform = Transform::ChangeToNopFill(*src_value);
} else if let Some(other_reg) =
avail_env.has_inexact_binding(reginfo, dst_reg, kind, offset, size)
{
// This fill is from the required slot, but into a different register
// `other_reg`. So replace it with a copy from `other_reg` to `dst_reg`
// and update `dst_reg`s binding to make it the same as `other_reg`'s, so
// as to maximise the chances of future matches after this instruction.
debug_assert!(other_reg != dst_reg);
transform =
Transform::ChangeToCopyToSSA(dfg.value_type(dst_value), other_reg);
avail_env.copy_reg(other_reg, dst_reg);
} else {
// This fill creates some new binding we don't know about. Update
// `avail_env` to track it.
avail_env.bind(dst_reg, kind, offset, size);
}
} else {
// Else it's "just another instruction that writes a reg", so we'd better
// treat it as such, just as we do below for instructions that we don't handle
// specially.
invalidate_regs_written_by_inst(locations, diversions, dfg, avail_env, inst);
}
}
InstructionData::RegMove {
opcode: _,
arg: _,
src,
dst,
} => {
// These happen relatively rarely, but just frequently enough that it's worth
// tracking the copy (at the machine level, it's really a copy) in `avail_env`.
avail_env.copy_reg(*src, *dst);
}
InstructionData::RegSpill { .. }
| InstructionData::RegFill { .. }
| InstructionData::Call { .. }
| InstructionData::CallIndirect { .. }
| InstructionData::StackLoad { .. }
| InstructionData::StackStore { .. }
| InstructionData::Unary {
opcode: Opcode::AdjustSpDown,
..
}
| InstructionData::UnaryImm {
opcode: Opcode::AdjustSpUpImm,
..
}
| InstructionData::UnaryImm {
opcode: Opcode::AdjustSpDownImm,
..
} => {
// All of these change, or might change, the memory-register bindings tracked in
// `avail_env`, either in ways we don't know about, or in ways we could track only at
// an effort-to-benefit ratio that seems too low to bother with. So play safe: forget
// everything we know.
//
// For Call/CallIndirect, we could do better when compiling for calling
// conventions that have callee-saved registers, since bindings for them would
// remain valid across the call.
avail_env.invalidate_all();
}
_ => {
// Invalidate: any `avail_env` entry associated with a reg written by `inst`.
invalidate_regs_written_by_inst(locations, diversions, dfg, avail_env, inst);
}
}
// Actually do the transformation.
match transform {
Transform::NoChange => {}
Transform::ChangeToNopFill(arg) => {
// Load is completely redundant. Convert it to a no-op.
dfg.replace(inst).fill_nop(arg);
let ok = func.update_encoding(inst, isa).is_ok();
debug_assert!(ok, "fill_nop encoding missing for this type");
}
Transform::ChangeToCopyToSSA(ty, reg) => {
// We already have the relevant value in some other register. Convert the
// load into a reg-reg copy.
dfg.replace(inst).copy_to_ssa(ty, reg);
let ok = func.update_encoding(inst, isa).is_ok();
debug_assert!(ok, "copy_to_ssa encoding missing for type {}", ty);
}
}
}
}
// =============================================================================================
// Top level: processing of tree shaped regions
impl RedundantReloadRemover {
// Push a clone of the top-of-stack ProcessingStackElem. This will be used to process exactly
// one Ebb. The diversions are created new, rather than cloned, to reflect the fact
// that diversions are local to each Ebb.
fn processing_stack_push(&mut self, cursor: CursorPosition) {
let avail_env = if let Some(stack_top) = self.processing_stack.last() {
stack_top.avail_env.clone()
} else {
AvailEnv::new(
self.num_regunits
.expect("processing_stack_push: num_regunits unknown!")
as usize,
)
};
self.processing_stack.push(ProcessingStackElem {
avail_env,
cursor,
diversions: RegDiversions::new(),
});
}
// This pushes the node `dst` onto the processing stack, and sets up the new
// ProcessingStackElem accordingly. But it does all that only if `dst` is part of the current
// tree *and* we haven't yet visited it.
fn processing_stack_maybe_push(&mut self, dst: Ebb) {
if self.nodes_in_tree.contains(dst) && !self.nodes_already_visited.contains(dst) {
if !self.processing_stack.is_empty() {
// If this isn't the outermost node in the tree (that is, the root), then it must
// have exactly one predecessor. Nodes with no predecessors are dead and not
// incorporated in any tree. Nodes with two or more predecessors are the root of
// some other tree, and visiting them as if they were part of the current tree
// would be a serious error.
debug_assert!(self.num_preds_per_ebb[dst] == ZeroOneOrMany::One);
}
self.processing_stack_push(CursorPosition::Before(dst));
self.nodes_already_visited.insert(dst);
}
}
// Perform redundant-reload removal on the tree shaped region of graph defined by `root` and
// `self.nodes_in_tree`. The following state is modified: `self.processing_stack`,
// `self.nodes_already_visited`, and `func.dfg`.
fn process_tree(
&mut self,
func: &mut Function,
reginfo: &RegInfo,
isa: &dyn TargetIsa,
root: Ebb,
) {
debug_assert!(self.nodes_in_tree.contains(root));
debug_assert!(self.processing_stack.is_empty());
debug_assert!(self.nodes_already_visited.is_empty());
// Create the initial work item
self.processing_stack_maybe_push(root);
while !self.processing_stack.is_empty() {
// It seems somewhat ridiculous to construct a whole new FuncCursor just so we can do
// next_inst() on it once, and then copy the resulting position back out. But use of
// a function-global FuncCursor, or of the EncCursor in struct Context, leads to
// borrow checker problems, as does including FuncCursor directly in
// ProcessingStackElem. In any case this is not as bad as it looks, since profiling
// shows that the build-insert-step-extract work is reduced to just 8 machine
// instructions in an optimised x86_64 build, presumably because rustc can inline and
// then optimise out almost all the work.
let tos = self.processing_stack.len() - 1;
let mut pos = FuncCursor::new(func).at_position(self.processing_stack[tos].cursor);
let maybe_inst = pos.next_inst();
self.processing_stack[tos].cursor = pos.position();
if let Some(inst) = maybe_inst {
// Deal with this insn, possibly changing it, possibly updating the top item of
// `self.processing_stack`.
self.visit_inst(func, reginfo, isa, inst);
// Update diversions after the insn.
self.processing_stack[tos].diversions.apply(&func.dfg[inst]);
// If the insn can branch outside this Ebb, push work items on the stack for all
// target Ebbs that are part of the same tree and that we haven't yet visited.
// The next iteration of this instruction-processing loop will immediately start
// work on the most recently pushed Ebb, and will eventually continue in this Ebb
// when those new items have been removed from the stack.
match func.dfg.analyze_branch(inst) {
BranchInfo::NotABranch => (),
BranchInfo::SingleDest(dst, _) => {
self.processing_stack_maybe_push(dst);
}
BranchInfo::Table(jt, default) => {
func.jump_tables[jt]
.iter()
.for_each(|dst| self.processing_stack_maybe_push(*dst));
if let Some(dst) = default {
self.processing_stack_maybe_push(dst);
}
}
}
} else {
// We've come to the end of the current work-item (Ebb). We'll already have
// processed the fallthrough/continuation/whatever for it using the logic above.
// Pop it off the stack and resume work on its parent.
self.processing_stack.pop();
}
}
}
}
// =============================================================================================
// Top level: perform redundant fill removal for a complete function
impl RedundantReloadRemover {
/// Create a new remover state.
pub fn new() -> Self {
Self {
num_regunits: None,
num_preds_per_ebb: PrimaryMap::<Ebb, ZeroOneOrMany>::with_capacity(8),
discovery_stack: Vec::<Ebb>::with_capacity(16),
nodes_in_tree: EntitySet::<Ebb>::new(),
processing_stack: Vec::<ProcessingStackElem>::with_capacity(8),
nodes_already_visited: EntitySet::<Ebb>::new(),
}
}
/// Clear the state of the remover.
pub fn clear(&mut self) {
self.clear_for_new_function();
}
fn clear_for_new_function(&mut self) {
self.num_preds_per_ebb.clear();
self.clear_for_new_tree();
}
fn clear_for_new_tree(&mut self) {
self.discovery_stack.clear();
self.nodes_in_tree.clear();
self.processing_stack.clear();
self.nodes_already_visited.clear();
}
#[inline(never)]
fn do_redundant_fill_removal_on_function(
&mut self,
func: &mut Function,
reginfo: &RegInfo,
isa: &dyn TargetIsa,
cfg: &ControlFlowGraph,
) {
// Fail in an obvious way if there are more than (2^32)-1 Ebbs in this function.
let num_ebbs: u32 = func.dfg.num_ebbs().try_into().unwrap();
// Clear out per-tree state.
self.clear_for_new_function();
// Create a PrimaryMap that summarises the number of predecessors for each block, as 0, 1
// or "many", and that also claims the entry block as having "many" predecessors.
self.num_preds_per_ebb.clear();
self.num_preds_per_ebb.reserve(num_ebbs as usize);
for i in 0..num_ebbs {
let mut pi = cfg.pred_iter(Ebb::from_u32(i));
let mut n_pi = ZeroOneOrMany::Zero;
if let Some(_) = pi.next() {
n_pi = ZeroOneOrMany::One;
if let Some(_) = pi.next() {
n_pi = ZeroOneOrMany::Many;
// We don't care if there are more than two preds, so stop counting now.
}
}
self.num_preds_per_ebb.push(n_pi);
}
debug_assert!(self.num_preds_per_ebb.len() == num_ebbs as usize);
// The entry block must be the root of some tree, so set up the state to reflect that.
let entry_ebb = func
.layout
.entry_block()
.expect("do_redundant_fill_removal_on_function: entry ebb unknown");
debug_assert!(self.num_preds_per_ebb[entry_ebb] == ZeroOneOrMany::Zero);
self.num_preds_per_ebb[entry_ebb] = ZeroOneOrMany::Many;
// Now build and process trees.
for root_ix in 0..self.num_preds_per_ebb.len() {
let root = Ebb::from_u32(root_ix as u32);
// Build a tree for each node that has two or more preds, and ignore all other nodes.
if self.num_preds_per_ebb[root] != ZeroOneOrMany::Many {
continue;
}
// Clear out per-tree state.
self.clear_for_new_tree();
// Discovery phase: build the tree, as `root` and `self.nodes_in_tree`.
self.add_nodes_to_tree(cfg, root);
debug_assert!(self.nodes_in_tree.cardinality() > 0);
debug_assert!(self.num_preds_per_ebb[root] == ZeroOneOrMany::Many);
// Processing phase: do redundant-reload-removal.
self.process_tree(func, reginfo, isa, root);
debug_assert!(
self.nodes_in_tree.cardinality() == self.nodes_already_visited.cardinality()
);
}
}
}
// =============================================================================================
// Top level: the external interface
struct Context<'a> {
// Current instruction as well as reference to function and ISA.
cur: EncCursor<'a>,
// Cached ISA information. We save it here to avoid frequent virtual function calls on the
// `TargetIsa` trait object.
reginfo: RegInfo,
// References to contextual data structures we need.
cfg: &'a ControlFlowGraph,
// The running state.
state: &'a mut RedundantReloadRemover,
}
impl RedundantReloadRemover {
/// Run the remover.
pub fn run(&mut self, isa: &dyn TargetIsa, func: &mut Function, cfg: &ControlFlowGraph) {
let ctx = Context {
cur: EncCursor::new(func, isa),
reginfo: isa.register_info(),
cfg,
state: self,
};
let mut total_regunits = 0;
for rb in isa.register_info().banks {
total_regunits += rb.units;
}
ctx.state.num_regunits = Some(total_regunits);
ctx.state.do_redundant_fill_removal_on_function(
ctx.cur.func,
&ctx.reginfo,
ctx.cur.isa,
&ctx.cfg,
);
}
}

View File

@@ -45,7 +45,7 @@
use crate::cursor::{Cursor, EncCursor};
use crate::dominator_tree::DominatorTree;
use crate::ir::{AbiParam, ArgumentLoc, InstBuilder, ValueDef};
use crate::ir::{Ebb, Function, Inst, Layout, SigRef, Value, ValueLoc};
use crate::ir::{Ebb, Function, Inst, InstructionData, Layout, Opcode, SigRef, Value, ValueLoc};
use crate::isa::{regs_overlap, RegClass, RegInfo, RegUnit};
use crate::isa::{ConstraintKind, EncInfo, OperandConstraint, RecipeConstraints, TargetIsa};
use crate::packed_option::PackedOption;
@@ -428,9 +428,25 @@ impl<'a> Context<'a> {
// Finally, we've fully programmed the constraint solver.
// We expect a quick solution in most cases.
let output_regs = self.solver.quick_solve(&regs.global).unwrap_or_else(|_| {
let is_reload = match &self.cur.func.dfg[inst] {
InstructionData::Unary {
opcode: Opcode::Fill,
arg: _,
} => true,
_ => false,
};
let output_regs = self
.solver
.quick_solve(&regs.global, is_reload)
.unwrap_or_else(|_| {
debug!("quick_solve failed for {}", self.solver);
self.iterate_solution(throughs, &regs.global, &mut replace_global_defines)
self.iterate_solution(
throughs,
&regs.global,
&mut replace_global_defines,
is_reload,
)
});
// The solution and/or fixed input constraints may require us to shuffle the set of live
@@ -847,12 +863,13 @@ impl<'a> Context<'a> {
throughs: &[LiveValue],
global_regs: &RegisterSet,
replace_global_defines: &mut bool,
is_reload: bool,
) -> RegisterSet {
// Make sure `try_add_var()` below doesn't create a variable with too loose constraints.
self.program_complete_input_constraints();
loop {
match self.solver.real_solve(global_regs) {
match self.solver.real_solve(global_regs, is_reload) {
Ok(regs) => return regs,
Err(SolverError::Divert(rc)) => {
// Do we have any live-through `rc` registers that are not already variables?

View File

@@ -126,6 +126,7 @@ impl RegisterSet {
}
/// Iterator over available registers in a register class.
#[derive(Clone)]
pub struct RegSetIter {
regs: RegUnitMask,
}
@@ -161,6 +162,31 @@ impl Iterator for RegSetIter {
}
}
impl RegSetIter {
pub fn rnext(&mut self) -> Option<RegUnit> {
let num_words = self.regs.len();
let bits_per_word = 8 * size_of_val(&self.regs[0]);
// Find the last set bit in `self.regs`.
for i in 0..num_words {
let word_ix = num_words - 1 - i;
let word = &mut self.regs[word_ix];
if *word != 0 {
let lzeroes = word.leading_zeros() as usize;
// Clear that highest bit so we won't find it again.
*word &= !(1 << (bits_per_word - 1 - lzeroes));
return Some((word_ix * bits_per_word + bits_per_word - 1 - lzeroes) as RegUnit);
}
}
// All of `self.regs` is 0.
None
}
}
impl ExactSizeIterator for RegSetIter {}
/// Displaying an `RegisterSet` correctly requires the associated `RegInfo` from the target ISA.
@@ -261,6 +287,45 @@ mod tests {
classes: &[],
};
const RSI_1: RegSetIter = RegSetIter {
regs: [0x31415927, 0x27182818, 0x14141356],
};
const RSI_2: RegSetIter = RegSetIter {
regs: [0x00000000, 0x00000000, 0x00000000],
};
const RSI_3: RegSetIter = RegSetIter {
regs: [0xffffffff, 0xffffffff, 0xffffffff],
};
fn reverse_regset_iteration_work(rsi: &RegSetIter) {
// Check the reverse iterator by comparing its output with the forward iterator.
let rsi_f = (*rsi).clone();
let results_f = rsi_f.collect::<Vec<_>>();
let mut rsi_r = (*rsi).clone();
let mut results_r = Vec::<RegUnit>::new();
while let Some(r) = rsi_r.rnext() {
results_r.push(r);
}
let len_f = results_f.len();
let len_r = results_r.len();
assert_eq!(len_f, len_r);
for i in 0..len_f {
assert_eq!(results_f[i], results_r[len_f - 1 - i]);
}
}
#[test]
fn reverse_regset_iteration() {
reverse_regset_iteration_work(&RSI_1);
reverse_regset_iteration_work(&RSI_2);
reverse_regset_iteration_work(&RSI_3);
}
#[test]
fn put_and_take() {
let mut regs = RegisterSet::new();

View File

@@ -852,8 +852,12 @@ impl Solver {
/// always trivial.
///
/// Returns `Ok(regs)` if a solution was found.
pub fn quick_solve(&mut self, global_regs: &RegisterSet) -> Result<RegisterSet, SolverError> {
self.find_solution(global_regs)
pub fn quick_solve(
&mut self,
global_regs: &RegisterSet,
is_reload: bool,
) -> Result<RegisterSet, SolverError> {
self.find_solution(global_regs, is_reload)
}
/// Try harder to find a solution.
@@ -863,7 +867,11 @@ impl Solver {
/// This may return an error with a register class that has run out of registers. If registers
/// can be freed up in the starving class, this method can be called again after adding
/// variables for the freed registers.
pub fn real_solve(&mut self, global_regs: &RegisterSet) -> Result<RegisterSet, SolverError> {
pub fn real_solve(
&mut self,
global_regs: &RegisterSet,
is_reload: bool,
) -> Result<RegisterSet, SolverError> {
// Compute domain sizes for all the variables given the current register sets.
for v in &mut self.vars {
let d = v.iter(&self.regs_in, &self.regs_out, global_regs).len();
@@ -901,7 +909,7 @@ impl Solver {
});
debug!("real_solve for {}", self);
self.find_solution(global_regs)
self.find_solution(global_regs, is_reload)
}
/// Search for a solution with the current list of variables.
@@ -909,7 +917,11 @@ impl Solver {
/// If a solution was found, returns `Ok(regs)` with the set of available registers on the
/// output side after the solution. If no solution could be found, returns `Err(rc)` with the
/// constraint register class that needs more available registers.
fn find_solution(&mut self, global_regs: &RegisterSet) -> Result<RegisterSet, SolverError> {
fn find_solution(
&mut self,
global_regs: &RegisterSet,
is_reload: bool,
) -> Result<RegisterSet, SolverError> {
// Available registers on the input and output sides respectively.
let mut iregs = self.regs_in.clone();
let mut oregs = self.regs_out.clone();
@@ -917,7 +929,20 @@ impl Solver {
for v in &mut self.vars {
let rc = v.constraint;
let reg = match v.iter(&iregs, &oregs, &gregs).next() {
// Decide which register to assign. In order to try and keep registers holding
// reloaded values separate from all other registers to the extent possible, we choose
// the first available register in the normal case, but the last available one in the
// case of a reload. See "A side note on register choice heuristics" in
// src/redundant_reload_remover.rs for further details.
let mut reg_set_iter = v.iter(&iregs, &oregs, &gregs);
let maybe_reg = if is_reload {
reg_set_iter.rnext()
} else {
reg_set_iter.next()
};
let reg = match maybe_reg {
Some(reg) => reg,
None => {
// If `v` must avoid global interference, there is no point in requesting
@@ -1207,7 +1232,7 @@ mod tests {
solver.reset(&regs);
solver.reassign_in(v10, gpr, r1, r0);
solver.inputs_done();
assert!(solver.quick_solve(&gregs).is_ok());
assert!(solver.quick_solve(&gregs, false).is_ok());
assert_eq!(solver.schedule_moves(&regs), 0);
assert_eq!(solver.moves(), &[mov(v10, gpr, r1, r0)]);
@@ -1217,7 +1242,7 @@ mod tests {
solver.reassign_in(v10, gpr, r0, r1);
solver.reassign_in(v11, gpr, r1, r2);
solver.inputs_done();
assert!(solver.quick_solve(&gregs).is_ok());
assert!(solver.quick_solve(&gregs, false).is_ok());
assert_eq!(solver.schedule_moves(&regs), 0);
assert_eq!(
solver.moves(),
@@ -1229,7 +1254,7 @@ mod tests {
solver.reassign_in(v10, gpr, r0, r1);
solver.reassign_in(v11, gpr, r1, r0);
solver.inputs_done();
assert!(solver.quick_solve(&gregs).is_ok());
assert!(solver.quick_solve(&gregs, false).is_ok());
assert_eq!(solver.schedule_moves(&regs), 0);
assert_eq!(
solver.moves(),
@@ -1269,7 +1294,7 @@ mod tests {
solver.reassign_in(v11, s, s2, s0);
solver.reassign_in(v12, s, s3, s1);
solver.inputs_done();
assert!(solver.quick_solve(&gregs).is_ok());
assert!(solver.quick_solve(&gregs, false).is_ok());
assert_eq!(solver.schedule_moves(&regs), 0);
assert_eq!(
solver.moves(),
@@ -1290,7 +1315,7 @@ mod tests {
solver.reassign_in(v12, s, s1, s3);
solver.reassign_in(v10, d, d1, d0);
solver.inputs_done();
assert!(solver.quick_solve(&gregs).is_ok());
assert!(solver.quick_solve(&gregs, false).is_ok());
assert_eq!(solver.schedule_moves(&regs), 0);
assert_eq!(
solver.moves(),
@@ -1335,7 +1360,7 @@ mod tests {
solver.reassign_in(v11, gpr, r1, r2);
solver.reassign_in(v12, gpr, r2, r0);
solver.inputs_done();
assert!(solver.quick_solve(&gregs).is_ok());
assert!(solver.quick_solve(&gregs, false).is_ok());
assert_eq!(solver.schedule_moves(&regs), 1);
assert_eq!(
solver.moves(),
@@ -1359,7 +1384,7 @@ mod tests {
solver.reassign_in(v15, gpr, r5, r3);
solver.inputs_done();
assert!(solver.quick_solve(&gregs).is_ok());
assert!(solver.quick_solve(&gregs, false).is_ok());
// We resolve two cycles with one spill.
assert_eq!(solver.schedule_moves(&regs), 1);
assert_eq!(

View File

@@ -697,6 +697,7 @@ impl<'a> Verifier<'a> {
| Store { .. }
| RegMove { .. }
| CopySpecial { .. }
| CopyToSsa { .. }
| Trap { .. }
| CondTrap { .. }
| IntCondTrap { .. }

View File

@@ -664,6 +664,14 @@ pub fn write_operands(
write!(w, " %{} -> %{}", src, dst)
}
}
CopyToSsa { src, .. } => {
if let Some(isa) = isa {
let regs = isa.register_info();
write!(w, " {}", regs.display_regunit(src))
} else {
write!(w, " %{}", src)
}
}
RegSpill { arg, src, dst, .. } => {
if let Some(isa) = isa {
let regs = isa.register_info();

View File

@@ -45,9 +45,27 @@ where
/// Is this set completely empty?
pub fn is_empty(&self) -> bool {
// Note that this implementation will become incorrect should it ever become possible
// to remove elements from an `EntitySet`.
self.len == 0
}
/// Returns the cardinality of the set. More precisely, it returns the number of calls to
/// `insert` with distinct key values that have happened since the set was most recently
/// `clear`ed or created with `new`.
pub fn cardinality(&self) -> usize {
let mut n: usize = 0;
for byte_ix in 0..self.len / 8 {
n += self.elems[byte_ix].count_ones() as usize;
}
for bit_ix in (self.len / 8) * 8..self.len {
if (self.elems[bit_ix / 8] & (1 << (bit_ix % 8))) != 0 {
n += 1;
}
}
n
}
/// Remove all entries from this set.
pub fn clear(&mut self) {
self.len = 0;

View File

@@ -41,6 +41,7 @@ ebb1:
; nextln: v3 = spill v10
; nextln: brz v2, ebb1
; nextln: v11 = fill v1
; nextln: regmove v11, %r15 -> %rax
; nextln: return v11
; nextln:
; nextln: ebb1:
@@ -48,5 +49,6 @@ ebb1:
; nextln: safepoint v3
; nextln: v4 = call_indirect sig1, v8()
; nextln: v12 = fill.r64 v3
; nextln: regmove v12, %r15 -> %rax
; nextln: return v12
; nextln: }

View File

@@ -2498,6 +2498,10 @@ impl<'a> Parser<'a> {
let dst = self.match_regunit(ctx.unique_isa)?;
InstructionData::CopySpecial { opcode, src, dst }
}
InstructionFormat::CopyToSsa => InstructionData::CopyToSsa {
opcode,
src: self.match_regunit(ctx.unique_isa)?,
},
InstructionFormat::RegSpill => {
let arg = self.match_value("expected SSA value operand")?;
self.match_token(Token::Comma, "expected ',' between operands")?;

View File

@@ -210,6 +210,10 @@ pub enum SerInstData {
src: String,
dst: String,
},
CopyToSsa {
opcode: String,
src: String,
},
RegSpill {
opcode: String,
arg: String,
@@ -651,6 +655,10 @@ pub fn get_inst_data(inst_index: Inst, func: &Function) -> SerInstData {
src: src.to_string(),
dst: dst.to_string(),
},
InstructionData::CopyToSsa { opcode, src } => SerInstData::CopyToSsa {
opcode: opcode.to_string(),
src: src.to_string(),
},
InstructionData::RegSpill {
opcode,
arg,