Cranelift: implement redundant fill removal on tree-shaped CFG regions. Mozilla bug 1570584. (#906)

julian-seward1
2019-08-25 19:37:34 +02:00
committed by GitHub
parent cc57e84cbd
commit b8fb52446c
19 changed files with 1262 additions and 24 deletions

View File

@@ -9,6 +9,7 @@ use crate::cdsl::settings::SettingGroup;
use crate::shared::types::Bool::B1;
use crate::shared::types::Float::{F32, F64};
use crate::shared::types::Int::{I16, I32, I64, I8};
use crate::shared::types::Reference::{R32, R64};
use crate::shared::Definitions as SharedDefinitions;
use super::recipes::RecipeGroup;
@@ -121,7 +122,9 @@ pub fn define<'defs>(
let call_indirect = shared.by_name("call_indirect");
let copy = shared.by_name("copy");
let copy_nop = shared.by_name("copy_nop");
let copy_to_ssa = shared.by_name("copy_to_ssa");
let fill = shared.by_name("fill");
let fill_nop = shared.by_name("fill_nop");
let iadd = shared.by_name("iadd");
let iadd_imm = shared.by_name("iadd_imm");
let iconst = shared.by_name("iconst");
@@ -141,6 +144,8 @@ pub fn define<'defs>(
let return_ = shared.by_name("return");
// Recipes shorthands, prefixed with r_.
let r_copytossa = recipes.by_name("copytossa");
let r_fillnull = recipes.by_name("fillnull");
let r_icall = recipes.by_name("Icall");
let r_icopy = recipes.by_name("Icopy");
let r_ii = recipes.by_name("Ii");
@@ -368,6 +373,14 @@ pub fn define<'defs>(
e.add64(enc(fill.bind(I32), r_gp_fi, load_bits(0b010)));
e.add64(enc(fill.bind(I64), r_gp_fi, load_bits(0b011)));
// No-op fills, created by late-stage redundant-fill removal.
for &ty in &[I64, I32] {
e.add64(enc(fill_nop.bind(ty), r_fillnull, 0));
e.add32(enc(fill_nop.bind(ty), r_fillnull, 0));
}
e.add64(enc(fill_nop.bind(B1), r_fillnull, 0));
e.add32(enc(fill_nop.bind(B1), r_fillnull, 0));
// Register copies.
e.add32(enc(copy.bind(I32), r_icopy, opimm_bits(0b000, 0)));
e.add64(enc(copy.bind(I64), r_icopy, opimm_bits(0b000, 0)));
@@ -394,5 +407,34 @@ pub fn define<'defs>(
e.add64(enc(copy_nop.bind(ty), r_stacknull, 0));
}
// Copy-to-SSA
e.add32(enc(
copy_to_ssa.bind(I32),
r_copytossa,
opimm_bits(0b000, 0),
));
e.add64(enc(
copy_to_ssa.bind(I64),
r_copytossa,
opimm_bits(0b000, 0),
));
e.add64(enc(
copy_to_ssa.bind(I32),
r_copytossa,
opimm32_bits(0b000, 0),
));
e.add32(enc(copy_to_ssa.bind(B1), r_copytossa, opimm_bits(0b000, 0)));
e.add64(enc(copy_to_ssa.bind(B1), r_copytossa, opimm_bits(0b000, 0)));
e.add32(enc(
copy_to_ssa.bind_ref(R32),
r_copytossa,
opimm_bits(0b000, 0),
));
e.add64(enc(
copy_to_ssa.bind_ref(R64),
r_copytossa,
opimm_bits(0b000, 0),
));
e
}

View File

@@ -63,6 +63,7 @@ pub fn define<'formats>(
let f_branch_icmp = formats.by_name("BranchIcmp");
let f_call = formats.by_name("Call");
let f_call_indirect = formats.by_name("CallIndirect");
let f_copy_to_ssa = formats.by_name("CopyToSsa");
let f_int_compare = formats.by_name("IntCompare");
let f_int_compare_imm = formats.by_name("IntCompareImm");
let f_jump = formats.by_name("Jump");
@@ -185,6 +186,14 @@ pub fn define<'formats>(
.emit("put_i(bits, src, 0, dst, sink);"),
);
// Same for copy-to-SSA -- GPR regmove.
recipes.push(
EncodingRecipeBuilder::new("copytossa", f_copy_to_ssa, 4)
// No operands_in to mention, because a source register is specified directly.
.operands_out(vec![gpr])
.emit("put_i(bits, src, 0, out_reg0, sink);"),
);
// U-type instructions have a 20-bit immediate that targets bits 12-31.
let format = formats.get(f_unary_imm);
recipes.push(
@@ -271,5 +280,14 @@ pub fn define<'formats>(
.emit(""),
);
// No-op fills, created by late-stage redundant-fill removal.
recipes.push(
EncodingRecipeBuilder::new("fillnull", f_unary, 0)
.operands_in(vec![Stack::new(gpr)])
.operands_out(vec![gpr])
.clobbers_flags(false)
.emit(""),
);
recipes
}

View File

@@ -340,6 +340,7 @@ pub fn define(
let copy = shared.by_name("copy");
let copy_nop = shared.by_name("copy_nop");
let copy_special = shared.by_name("copy_special");
let copy_to_ssa = shared.by_name("copy_to_ssa");
let ctz = shared.by_name("ctz");
let debugtrap = shared.by_name("debugtrap");
let extractlane = shared.by_name("extractlane");
@@ -352,6 +353,7 @@ pub fn define(
let fdiv = shared.by_name("fdiv");
let ffcmp = shared.by_name("ffcmp");
let fill = shared.by_name("fill");
let fill_nop = shared.by_name("fill_nop");
let floor = shared.by_name("floor");
let fmul = shared.by_name("fmul");
let fpromote = shared.by_name("fpromote");
@@ -468,7 +470,9 @@ pub fn define(
let rec_fax = r.template("fax");
let rec_fcmp = r.template("fcmp");
let rec_fcscc = r.template("fcscc");
let rec_ffillnull = r.recipe("ffillnull");
let rec_ffillSib32 = r.template("ffillSib32");
let rec_fillnull = r.recipe("fillnull");
let rec_fillSib32 = r.template("fillSib32");
let rec_fld = r.template("fld");
let rec_fldDisp32 = r.template("fldDisp32");
@@ -490,6 +494,7 @@ pub fn define(
let rec_fstWithIndexDisp32 = r.template("fstWithIndexDisp32");
let rec_fstWithIndexDisp8 = r.template("fstWithIndexDisp8");
let rec_furm = r.template("furm");
let rec_furm_reg_to_ssa = r.template("furm_reg_to_ssa");
let rec_furmi_rnd = r.template("furmi_rnd");
let rec_got_fnaddr8 = r.template("got_fnaddr8");
let rec_got_gvaddr8 = r.template("got_gvaddr8");
@@ -568,6 +573,7 @@ pub fn define(
let rec_trapff = r.recipe("trapff");
let rec_u_id = r.template("u_id");
let rec_umr = r.template("umr");
let rec_umr_reg_to_ssa = r.template("umr_reg_to_ssa");
let rec_ur = r.template("ur");
let rec_urm = r.template("urm");
let rec_urm_noflags = r.template("urm_noflags");
@@ -921,6 +927,18 @@ pub fn define(
e.enc_r32_r64(fill, rec_fillSib32.opcodes(vec![0x8b]));
e.enc_r32_r64(regfill, rec_regfill32.opcodes(vec![0x8b]));
// No-op fills, created by late-stage redundant-fill removal.
for &ty in &[I64, I32, I16, I8] {
e.enc64_rec(fill_nop.bind(ty), rec_fillnull, 0);
e.enc32_rec(fill_nop.bind(ty), rec_fillnull, 0);
}
e.enc64_rec(fill_nop.bind(B1), rec_fillnull, 0);
e.enc32_rec(fill_nop.bind(B1), rec_fillnull, 0);
for &ty in &[F64, F32] {
e.enc64_rec(fill_nop.bind(ty), rec_ffillnull, 0);
e.enc32_rec(fill_nop.bind(ty), rec_ffillnull, 0);
}
// Load 32 bits from `b1`, `i8` and `i16` spill slots. See `spill.b1` above.
e.enc_both(fill.bind(B1), rec_fillSib32.opcodes(vec![0x8b]));
@@ -943,6 +961,24 @@ pub fn define(
e.enc64(copy_special, rec_copysp.opcodes(vec![0x89]).rex().w());
e.enc32(copy_special, rec_copysp.opcodes(vec![0x89]));
// Copy to SSA
e.enc_i32_i64(copy_to_ssa, rec_umr_reg_to_ssa.opcodes(vec![0x89]));
e.enc_r32_r64(copy_to_ssa, rec_umr_reg_to_ssa.opcodes(vec![0x89]));
e.enc_both(copy_to_ssa.bind(B1), rec_umr_reg_to_ssa.opcodes(vec![0x89]));
e.enc_both(copy_to_ssa.bind(I8), rec_umr_reg_to_ssa.opcodes(vec![0x89]));
e.enc_both(
copy_to_ssa.bind(I16),
rec_umr_reg_to_ssa.opcodes(vec![0x89]),
);
e.enc_both(
copy_to_ssa.bind(F64),
rec_furm_reg_to_ssa.opcodes(vec![0xf2, 0x0f, 0x10]),
);
e.enc_both(
copy_to_ssa.bind(F32),
rec_furm_reg_to_ssa.opcodes(vec![0xf3, 0x0f, 0x10]),
);
// Stack-slot-to-the-same-stack-slot copy, which is guaranteed to turn
// into a no-op.
// The same encoding is generated for both the 64- and 32-bit architectures.

View File

@@ -367,6 +367,7 @@ pub fn define<'shared>(
let f_call = formats.by_name("Call");
let f_call_indirect = formats.by_name("CallIndirect");
let f_copy_special = formats.by_name("CopySpecial");
let f_copy_to_ssa = formats.by_name("CopyToSsa");
let f_extract_lane = formats.by_name("ExtractLane"); // TODO this would preferably retrieve a BinaryImm8 format but because formats are compared structurally and ExtractLane has the same structure this is impossible--if we rename ExtractLane, it may even impact parsing
let f_float_compare = formats.by_name("FloatCompare");
let f_float_cond = formats.by_name("FloatCond");
@@ -426,6 +427,22 @@ pub fn define<'shared>(
.emit(""),
);
// No-op fills, created by late-stage redundant-fill removal.
recipes.add_recipe(
EncodingRecipeBuilder::new("fillnull", f_unary, 0)
.operands_in(vec![stack_gpr32])
.operands_out(vec![gpr])
.clobbers_flags(false)
.emit(""),
);
recipes.add_recipe(
EncodingRecipeBuilder::new("ffillnull", f_unary, 0)
.operands_in(vec![stack_gpr32])
.operands_out(vec![fpr])
.clobbers_flags(false)
.emit(""),
);
recipes
.add_recipe(EncodingRecipeBuilder::new("debugtrap", f_nullary, 1).emit("sink.put1(0xcc);"));
@@ -570,6 +587,20 @@ pub fn define<'shared>(
),
);
// Same as umr, but with the source register specified directly.
recipes.add_template_recipe(
EncodingRecipeBuilder::new("umr_reg_to_ssa", f_copy_to_ssa, 1)
// No operands_in to mention, because a source register is specified directly.
.operands_out(vec![gpr])
.clobbers_flags(false)
.emit(
r#"
{{PUT_OP}}(bits, rex2(out_reg0, src), sink);
modrm_rr(out_reg0, src, sink);
"#,
),
);
// XX /r, but for a unary operator with separate input/output register.
// RM form. Clobbers FLAGS.
recipes.add_template_recipe(
@@ -631,6 +662,20 @@ pub fn define<'shared>(
),
);
// Same as furm, but with the source register specified directly.
recipes.add_template_recipe(
EncodingRecipeBuilder::new("furm_reg_to_ssa", f_copy_to_ssa, 1)
// No operands_in to mention, because a source register is specified directly.
.operands_out(vec![fpr])
.clobbers_flags(false)
.emit(
r#"
{{PUT_OP}}(bits, rex2(src, out_reg0), sink);
modrm_rr(src, out_reg0, sink);
"#,
),
);
// XX /r, RM form, GPR -> FPR.
recipes.add_template_recipe(
EncodingRecipeBuilder::new("frurm", f_unary, 1)

View File

@@ -157,6 +157,7 @@ pub fn define(immediates: &OperandKinds, entities: &OperandKinds) -> FormatRegis
.imm(("src", regunit))
.imm(("dst", regunit)),
);
registry.insert(Builder::new("CopyToSsa").imm(("src", regunit)));
registry.insert(
Builder::new("RegSpill")
.value()

View File

@@ -1194,6 +1194,22 @@ pub fn define(
.can_load(true),
);
ig.push(
Inst::new(
"fill_nop",
r#"
This is identical to `fill`, except it has no encoding, since it is a no-op.
This instruction is created only during late-stage redundant-reload removal, after all
registers and stack slots have been assigned. It is used to replace `fill`s that have
been identified as redundant.
"#,
)
.operands_in(vec![x])
.operands_out(vec![a])
.can_load(true),
);
let src = &operand("src", regunit);
let dst = &operand("dst", regunit);
@@ -1233,6 +1249,23 @@ pub fn define(
.other_side_effects(true),
);
ig.push(
Inst::new(
"copy_to_ssa",
r#"
Copies the contents of ''src'' register to ''a'' SSA name.
This instruction copies the contents of one register, regardless of its SSA name, to
another register, creating a new SSA name. In that sense it is a one-sided version
of ''copy_special''. This instruction is internal and should not be created by
Cranelift users.
"#,
)
.operands_in(vec![src])
.operands_out(vec![a])
.other_side_effects(true),
);
ig.push(
Inst::new(
"copy_nop",

View File

@@ -23,6 +23,7 @@ use crate::licm::do_licm;
use crate::loop_analysis::LoopAnalysis;
use crate::nan_canonicalization::do_nan_canonicalization;
use crate::postopt::do_postopt;
use crate::redundant_reload_remover::RedundantReloadRemover;
use crate::regalloc;
use crate::result::CodegenResult;
use crate::settings::{FlagsOrIsa, OptLevel};
@@ -50,6 +51,9 @@ pub struct Context {
/// Loop analysis of `func`.
pub loop_analysis: LoopAnalysis,
/// Redundant-reload remover context.
pub redundant_reload_remover: RedundantReloadRemover,
}
impl Context {
@@ -72,6 +76,7 @@ impl Context {
domtree: DominatorTree::new(),
regalloc: regalloc::Context::new(),
loop_analysis: LoopAnalysis::new(),
redundant_reload_remover: RedundantReloadRemover::new(),
}
}
@@ -82,6 +87,7 @@ impl Context {
self.domtree.clear();
self.regalloc.clear();
self.loop_analysis.clear();
self.redundant_reload_remover.clear();
}
/// Compile the function, and emit machine code into a `Vec<u8>`.
@@ -149,6 +155,7 @@ impl Context {
self.regalloc(isa)?;
self.prologue_epilogue(isa)?;
if isa.flags().opt_level() == OptLevel::Best {
self.redundant_reload_remover(isa)?;
self.shrink_instructions(isa)?;
}
self.relax_branches(isa)
@@ -322,6 +329,14 @@ impl Context {
Ok(())
}
/// Do redundant-reload removal after allocation of both registers and stack slots.
pub fn redundant_reload_remover(&mut self, isa: &dyn TargetIsa) -> CodegenResult<()> {
self.redundant_reload_remover
.run(isa, &mut self.func, &self.cfg);
self.verify_if(isa)?;
Ok(())
}
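For orientation, this pass is only invoked from `compile` above when the ISA flags request `OptLevel::Best`. A minimal sketch of how a client opts in, assuming the `settings` API of this era (the "opt_level"/"best" strings are an assumption, not something added by this commit):

    use cranelift_codegen::settings::{self, Configurable};

    // Hypothetical helper: build flags with the `Best` optimization level, which is what
    // gates `redundant_reload_remover` in `Context::compile`.
    fn flags_with_fill_removal() -> settings::Flags {
        let mut builder = settings::builder();
        builder
            .set("opt_level", "best")
            .expect("opt_level setting should exist");
        settings::Flags::new(builder)
    }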
/// Run the instruction shrinking pass.
pub fn shrink_instructions(&mut self, isa: &dyn TargetIsa) -> CodegenResult<()> {
shrink_instructions(&mut self.func, isa);

View File

@@ -209,11 +209,6 @@ impl StackSlots {
self.slots.is_valid(ss)
}
/// Set the offset of a stack slot.
pub fn set_offset(&mut self, ss: StackSlot, offset: StackOffset) {
self.slots[ss].offset = Some(offset);
}
/// Get an iterator over all the stack slot keys.
pub fn iter(&self) -> Iter<StackSlot, StackSlotData> {
self.slots.iter()

View File

@@ -95,6 +95,7 @@ mod nan_canonicalization;
mod partition_slice;
mod postopt;
mod predicates;
mod redundant_reload_remover;
mod ref_slice;
mod regalloc;
mod result;

View File

@@ -0,0 +1,904 @@
//! This module implements a late-stage redundant-reload remover, which runs after registers have
//! been allocated and stack slots have been given specific offsets.
use crate::cursor::{Cursor, CursorPosition, EncCursor, FuncCursor};
use crate::entity::EntitySet;
use crate::flowgraph::ControlFlowGraph;
use crate::ir::dfg::DataFlowGraph;
use crate::ir::instructions::BranchInfo;
use crate::ir::stackslot::{StackSlotKind, StackSlots};
use crate::ir::{
Ebb, Function, Inst, InstBuilder, InstructionData, Opcode, StackSlotData, Type, Value, ValueLoc,
};
use crate::isa::{RegInfo, RegUnit, TargetIsa};
use crate::regalloc::RegDiversions;
use core::convert::TryInto;
use cranelift_entity::{PrimaryMap, SecondaryMap};
use std::vec::Vec;
// =============================================================================================
// A description of the redundant-fill-removal algorithm
//
//
// The algorithm works forwards through each Ebb. It carries along and updates a table,
// AvailEnv, with which it tracks registers that are known to have the same value as some stack
// slot. The actions on encountering an instruction depend on the instruction, as follows:
//
// ss1 = spill r0: update the AvailEnv so as to note that slot `ss1` and register `r0`
// have the same value.
//
// r1 = fill ss0: look in the AvailEnv. If it tells us that register `r1` and slot `ss0`
// have the same value, then delete the instruction by converting it to a
// `fill_nop`.
//
// If it tells us that some other register `r2` has the same value as
// slot `ss0`, convert the instruction into a copy from `r2` to `r1`.
//
// any other insn: remove from the AvailEnv any bindings associated with registers
// written by this instruction, since they will be invalidated by it.
//
// Tracking the effects of `copy` instructions in AvailEnv for the case when both source and
// destination are registers does not cause any more fills to be removed or converted to copies.
// It's not clear why.
//
// There are various other instruction-handling cases in `visit_inst`, which are documented
// in-line, and do not change the core algorithm, so are not described here.
//
// The registers tracked by AvailEnv are the post-diversion registers that are really used by the
// code; they are not the pre-diversion names associated with each SSA `Value`. The second
// `fill` case above opportunistically copies values from registers that may have been diversion
// targets in some predecessor block, and so are no longer associated with any specific SSA-level
// name at the point the copy is made. Hence those copies (from `r2` to `r1`) cannot be done
// with an ordinary `copy` instruction. Instead they have to be done using a new `copy_to_ssa`
// instruction, which copies from an arbitrary register to a register-resident `Value` (that is,
// "back to" SSA-world).
//
// That completes the description of the core algorithm.
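As a worked illustration of the three cases above, here is a minimal, self-contained Rust sketch of the bookkeeping. All names in it (MiniEnv, Slot, Action) are illustrative rather than Cranelift's, and it ignores stack-slot kinds, partial overlaps, register banks and diversions, all of which the real AvailEnv below handles.

    #[derive(Clone, Copy, PartialEq)]
    struct Slot { offset: i32, size: u32 }

    enum Action { KeepFill, MakeFillNop, CopyFromReg(usize) }

    // Maps each register number to the slot (if any) known to hold the same value.
    struct MiniEnv { map: Vec<Option<Slot>> }

    impl MiniEnv {
        fn new(num_regs: usize) -> Self { Self { map: vec![None; num_regs] } }

        // `ss = spill r`: note that register `r` and slot `ss` now hold the same value.
        fn spill(&mut self, r: usize, ss: Slot) { self.map[r] = Some(ss); }

        // Any other write to `r` invalidates whatever we knew about it.
        fn write_reg(&mut self, r: usize) { self.map[r] = None; }

        // `r = fill ss`: decide how the fill should be rewritten, and update the environment.
        fn fill(&mut self, r: usize, ss: Slot) -> Action {
            if self.map[r] == Some(ss) {
                // Exact binding: `r` already holds the slot's value, so the fill is redundant.
                Action::MakeFillNop
            } else if let Some(other) = self.map.iter().position(|b| *b == Some(ss)) {
                // Inexact binding: some other register holds the value; rewrite to a copy.
                self.map[r] = Some(ss);
                Action::CopyFromReg(other)
            } else {
                // Nothing known: keep the fill and record the new binding.
                self.map[r] = Some(ss);
                Action::KeepFill
            }
        }
    }

Run over the `spill`/`fill` cases above, this yields exactly the two rewrites the pass performs: `fill_nop` for the exact case and `copy_to_ssa` for the inexact one.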
//
// In the case where a block `A` jumps to `B` and `A` is the only predecessor of `B`, the
// AvailEnv at the end of `A` will still be valid at the entry to `B`. In such a case, we can
// profitably transform `B` using the AvailEnv "inherited" from `A`. In order to take full
// advantage of this, this module partitions the function's CFG into tree-shaped groups of
// blocks, and processes each tree as described above. So the AvailEnv is only initialised to
// empty at the start of blocks that form the root of each tree; that is, for blocks which have
// two or more predecessors.
// =============================================================================================
// Top level algorithm structure
//
// The overall algorithm, for a function, starts like this:
//
// * (once per function): finds Ebbs that have two or more predecessors, since they will be the
// roots of Ebb trees. Also, the entry node for the function is considered to be a root.
//
// It then continues with a loop that first finds a tree of Ebbs ("discovery") and then removes
// redundant fills as described above ("processing"):
//
// * (discovery; once per tree): for each root, performs a depth first search to find all the Ebbs
// in the tree, guided by RedundantReloadRemover::discovery_stack.
//
// * (processing; once per tree): the just-discovered tree is then processed as described above,
// guided by RedundantReloadRemover::processing_stack.
//
// In this way, all Ebbs reachable from the function's entry point are eventually processed. Note
// that each tree is processed as soon as it has been discovered, so the algorithm never creates a
// list of trees for the function.
//
// The running state is stored in `RedundantReloadRemover`. This is allocated once and can be
// reused for multiple functions so as to minimise heap turnover. The fields are, roughly:
//
// num_regunits -- constant for the whole function; used by the tree processing phase
// num_preds_per_ebb -- constant for the whole function; used by the tree discovery process
//
// discovery_stack -- used to guide the tree discovery process
// nodes_in_tree -- the discovered nodes are recorded here
//
// processing_stack -- used to guide the tree processing process
// nodes_already_visited -- used to ensure the tree processing logic terminates in the case
// where a tree has a branch back to its root node.
//
// There is further documentation in line below, as appropriate.
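The same structure can be seen in a toy model of the two phases. Everything here is illustrative (plain indices stand in for Ebbs, and `discover_trees` is a made-up name); the real logic is in `add_nodes_to_tree` and `do_redundant_fill_removal_on_function` below.

    // Given a CFG as adjacency lists of successors, return one (root, nodes) pair per tree.
    fn discover_trees(successors: &[Vec<usize>], entry: usize) -> Vec<(usize, Vec<usize>)> {
        // Count predecessors, capped at 2 ("many"), and force the entry block to be a root.
        let mut preds = vec![0usize; successors.len()];
        for succs in successors {
            for &s in succs {
                preds[s] = (preds[s] + 1).min(2);
            }
        }
        preds[entry] = 2;

        let mut trees = Vec::new();
        for root in 0..successors.len() {
            if preds[root] != 2 {
                continue; // not a root: unreachable (0 preds) or interior to some tree (1 pred)
            }
            // Depth-first search that only follows into single-predecessor blocks.
            let mut in_tree = vec![false; successors.len()];
            in_tree[root] = true;
            let mut nodes = vec![root];
            let mut stack = vec![root];
            while let Some(n) = stack.pop() {
                for &s in &successors[n] {
                    if preds[s] == 1 && !in_tree[s] {
                        in_tree[s] = true;
                        nodes.push(s);
                        stack.push(s);
                    }
                }
            }
            trees.push((root, nodes));
        }
        trees
    }

For a diamond CFG 0 -> {1, 2}, 1 -> 3, 2 -> 3, this yields two trees: {0, 1, 2} rooted at block 0 and {3} rooted at block 3, matching the description above (the real pass processes each tree as soon as it is discovered rather than collecting them into a list).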
// =============================================================================================
// A side note on register choice heuristics
// The core algorithm opportunistically replaces fill instructions when it knows of a register
// that already holds the required value. How effective this is largely depends on how long
// reloaded values happen to stay alive before the relevant register is overwritten. And that
// depends on the register allocator's register choice heuristics. The worst case is when the
// register allocator reuses registers as soon as possible after they become free. Unfortunately,
// that was indeed the selection scheme prior to the development of this pass.
//
// As part of this work, the register selection scheme has been changed as follows: for registers
// written by any instruction other than a fill, use the lowest numbered available register. But
// for registers written by a fill instruction, use the highest numbered available register. The
// aim is to try and keep reload- and non-reload registers disjoint to the extent possible.
// Several other schemes were tried, but this one is simple and can be worth an extra 2% of
// performance in some cases.
//
// The relevant change is more or less a one-line change in the solver.
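As a sketch of that choice, assuming a single register bank whose free registers fit in one 32-bit mask (the real code instead threads an `is_reload` flag through the solver and picks via `RegSetIter::next` versus the new `rnext`, both of which appear later in this commit):

    // Illustrative helper, not Cranelift's API: pick a register from a bitmask of free ones.
    fn pick_register(free: u32, is_reload: bool) -> Option<u32> {
        if free == 0 {
            return None;
        }
        Some(if is_reload {
            // Highest-numbered free register for values produced by fills...
            31 - free.leading_zeros()
        } else {
            // ...lowest-numbered free register for everything else.
            free.trailing_zeros()
        })
    }

With `free = 0b0110`, a fill is given register 2 while any other instruction is given register 1, keeping the two populations apart for as long as possible.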
// =============================================================================================
// Data structures used for discovery of trees
// `ZeroOneOrMany` is used to record the number of predecessors an Ebb block has. The `Zero` case
// is included so as to cleanly handle the case where the incoming graph has unreachable Ebbs.
#[derive(Clone, PartialEq)]
enum ZeroOneOrMany {
Zero,
One,
Many,
}
// =============================================================================================
// Data structures used for processing of trees
// `SlotInfo` describes a spill slot in the obvious way. Note that it doesn't indicate which
// register(s) are currently associated with the slot. That job is done by `AvailEnv` instead.
//
// In the CL framework, stack slots are partitioned into disjoint sets, one for each
// `StackSlotKind`. The offset and size only give a unique identity within any particular
// `StackSlotKind`. So, to uniquely identify a stack slot, all three fields are necessary.
#[derive(Clone, Copy)]
struct SlotInfo {
kind: StackSlotKind,
offset: i32,
size: u32,
}
// `AvailEnv` maps each possible register to a stack slot that holds the same value. The index
// space of `AvailEnv::map` is exactly the set of registers available on the current target. If
// (as is mostly the case) a register is not known to have the same value as a stack slot, then
// its entry is `None` rather than `Some(..)`.
//
// Invariants for AvailEnv:
//
// AvailEnv may have multiple different registers bound to the same stack slot -- that is, `(kind,
// offset, size)` triple. That's OK, and reflects the reality that those two registers contain
// the same value. This could happen, for example, in the case
//
// ss1 = spill r0
// ..
// r2 = fill ss1
//
// Then both `r0` and `r2` will have the same value as `ss1`, provided that ".." doesn't write to
// `r0`.
//
// To say that two different registers may be bound to the same stack slot is the same as saying
// that it is allowed to have two different entries in AvailEnv with the same `(kind, offset,
// size)` triple. What is *not* allowed is to have partial overlaps. That is, if two SlotInfos
// have the same `kind` field and have `offset` and `size` fields that overlap, then their
// `offset` and `size` fields must be identical. This is so as to make the algorithm safe against
// situations where, for example, a 64 bit register is spilled, but then only the bottom 32 bits
// are reloaded from the slot.
//
// In such a case, though, the Cranelift IR would most likely be ill-typed, so this case
// probably cannot occur in practice.
#[derive(Clone)]
struct AvailEnv {
map: Vec<Option<SlotInfo>>,
}
// `ProcessingStackElem` combines AvailEnv with contextual information needed to "navigate" within
// an Ebb.
//
// A ProcessingStackElem conceptually has the lifetime of exactly one Ebb: once the current Ebb is
// completed, the ProcessingStackElem will be abandoned. In practice the top level state,
// RedundantReloadRemover, caches them, so as to avoid heap turnover.
//
// Note that ProcessingStackElem must contain a CursorPosition. The CursorPosition, which
// indicates where we are in the current Ebb, cannot be implicitly maintained by looping over all
// the instructions in an Ebb in turn, because we may choose to suspend processing the current Ebb
// at a side exit, continue by processing the subtree reached via the side exit, and only later
// resume the current Ebb.
struct ProcessingStackElem {
/// Indicates the AvailEnv at the current point in the Ebb.
avail_env: AvailEnv,
/// Shows where we currently are inside the Ebb.
cursor: CursorPosition,
/// Indicates the currently active register diversions at the current point.
diversions: RegDiversions,
}
// =============================================================================================
// The top level data structure
// `RedundantReloadRemover` contains data structures for the two passes: discovery of tree shaped
// regions, and processing of them. These are allocated once and stay alive for the entire
// function, even though they are cleared out for each new tree shaped region. It also caches
// `num_regunits` and `num_preds_per_ebb`, which are computed at the start of each function and
// then remain constant.
/// The redundant reload remover's state.
pub struct RedundantReloadRemover {
/// The total number of RegUnits available on this architecture. This is unknown when the
/// RedundantReloadRemover is created. It becomes known at the beginning of processing of a
/// function.
num_regunits: Option<u16>,
/// This stores, for each Ebb, a characterisation of the number of predecessors it has.
num_preds_per_ebb: PrimaryMap<Ebb, ZeroOneOrMany>,
/// The stack used for the first phase (discovery). There is one element on the discovery
/// stack for each currently unexplored Ebb in the tree being searched.
discovery_stack: Vec<Ebb>,
/// The nodes in the discovered tree are inserted here.
nodes_in_tree: EntitySet<Ebb>,
/// The stack used during the second phase (transformation). There is one element on the
/// processing stack for each currently-open node in the tree being transformed.
processing_stack: Vec<ProcessingStackElem>,
/// Used in the second phase to avoid visiting nodes more than once.
nodes_already_visited: EntitySet<Ebb>,
}
// =============================================================================================
// Miscellaneous small helper functions
// Is this a kind of stack slot that is safe to track in AvailEnv? This is probably overly
// conservative, but tracking only the SpillSlot and IncomingArgument kinds catches almost all
// available redundancy in practice.
fn is_slot_kind_tracked(kind: StackSlotKind) -> bool {
match kind {
StackSlotKind::SpillSlot | StackSlotKind::IncomingArg => true,
_ => false,
}
}
// Find out if the range `[offset, offset + size)` overlaps with the range in `si`.
fn overlaps(si: &SlotInfo, offset: i32, size: u32) -> bool {
let a_offset = si.offset as i64;
let a_size = si.size as i64;
let b_offset = offset as i64;
let b_size = size as i64;
let no_overlap = a_offset + a_size <= b_offset || b_offset + b_size <= a_offset;
!no_overlap
}
// Find, in `reginfo`, the register bank that `reg` lives in, and return the lower limit and size
// of the bank. This is so the caller can conveniently iterate over all RegUnits in the bank that
// `reg` lives in.
fn find_bank_limits(reginfo: &RegInfo, reg: RegUnit) -> (RegUnit, u16) {
if let Some(bank) = reginfo.bank_containing_regunit(reg) {
return (bank.first_unit, bank.units);
}
// We should never get here, since `reg` must come from *some* RegBank.
panic!("find_regclass_limits: reg not found");
}
// Returns the register that `v` is allocated to. Assumes that `v` actually resides in a
// register.
fn reg_of_value(locations: &SecondaryMap<Value, ValueLoc>, v: Value) -> RegUnit {
match locations[v] {
ValueLoc::Reg(ru) => ru,
_ => panic!("reg_of_value: value isn't in a reg"),
}
}
// Returns the stack slot that `v` is allocated to. Assumes that `v` actually resides in a stack
// slot.
fn slot_of_value<'s>(
locations: &SecondaryMap<Value, ValueLoc>,
stack_slots: &'s StackSlots,
v: Value,
) -> &'s StackSlotData {
match locations[v] {
ValueLoc::Stack(slot) => &stack_slots[slot],
_ => panic!("slot_of_value: value isn't in a stack slot"),
}
}
// =============================================================================================
// Top level: discovery of tree shaped regions
impl RedundantReloadRemover {
// A helper for `add_nodes_to_tree` below.
fn discovery_stack_push_successors_of(&mut self, cfg: &ControlFlowGraph, node: Ebb) {
for successor in cfg.succ_iter(node) {
self.discovery_stack.push(successor);
}
}
// Visit the tree of Ebbs rooted at `starting_point` and add them to `self.nodes_in_tree`.
// `self.num_preds_per_ebb` guides the process, ensuring we don't leave the tree-ish region
// and indirectly ensuring that the process will terminate in the presence of cycles in the
// graph. `self.discovery_stack` holds the search state in this function.
fn add_nodes_to_tree(&mut self, cfg: &ControlFlowGraph, starting_point: Ebb) {
// One might well ask why this doesn't loop forever when it encounters cycles in the
// control flow graph. The reason is that any cycle in the graph that is reachable from
// anywhere outside the cycle -- in particular, that is reachable from the function's
// entry node -- must have at least one node that has two or more predecessors. So the
// logic below won't follow into it, because it regards any such node as the root of some
// other tree.
debug_assert!(self.discovery_stack.is_empty());
debug_assert!(self.nodes_in_tree.is_empty());
self.nodes_in_tree.insert(starting_point);
self.discovery_stack_push_successors_of(cfg, starting_point);
while let Some(node) = self.discovery_stack.pop() {
match self.num_preds_per_ebb[node] {
// We arrived at a node with multiple predecessors, so it's a new root. Ignore it.
ZeroOneOrMany::Many => {}
// This node has just one predecessor, so we should incorporate it in the tree and
// immediately transition into searching from it instead.
ZeroOneOrMany::One => {
self.nodes_in_tree.insert(node);
self.discovery_stack_push_successors_of(cfg, node);
}
// This is meaningless. We arrived at a node that doesn't point back at where we
// came from.
ZeroOneOrMany::Zero => panic!("add_nodes_to_tree: inconsistent graph"),
}
}
}
}
// =============================================================================================
// Operations relating to `AvailEnv`
impl AvailEnv {
// Create a new one.
fn new(size: usize) -> Self {
let mut env = AvailEnv {
map: Vec::<Option<SlotInfo>>::new(),
};
env.map.resize(size, None);
env
}
// Debug only: checks (some of) the required AvailEnv invariants.
#[cfg(debug_assertions)]
fn check_invariants(&self) -> bool {
// Check that any overlapping entries overlap exactly. This is super lame (quadratic),
// but it's only used in debug builds.
for i in 0..self.map.len() {
if let Some(si) = self.map[i] {
for j in i + 1..self.map.len() {
if let Some(sj) = self.map[j] {
// "si and sj overlap, but not exactly"
if si.kind == sj.kind
&& overlaps(&si, sj.offset, sj.size)
&& !(si.offset == sj.offset && si.size == sj.size)
{
return false;
}
}
}
}
}
true
}
// Invalidates the binding associated with `reg`. Note that by construction of AvailEnv,
// `reg` can only be associated with one binding at once.
fn invalidate_by_reg(&mut self, reg: RegUnit) {
self.map[reg as usize] = None;
}
// Invalidates any binding that has any overlap with `(kind, offset, size)`.
fn invalidate_by_offset(&mut self, kind: StackSlotKind, offset: i32, size: u32) {
debug_assert!(is_slot_kind_tracked(kind));
for i in 0..self.map.len() {
if let Some(si) = &self.map[i] {
if si.kind == kind && overlaps(&si, offset, size) {
self.map[i] = None;
}
}
}
}
// Invalidates all bindings.
fn invalidate_all(&mut self) {
for i in 0..self.map.len() {
self.map[i] = None;
}
}
// Updates AvailEnv to track the effect of a `regmove` instruction.
fn copy_reg(&mut self, src: RegUnit, dst: RegUnit) {
self.map[dst as usize] = self.map[src as usize];
}
// Does `env` have the exact binding characterised by `(reg, kind, offset, size)` ?
fn has_exact_binding(&self, reg: RegUnit, kind: StackSlotKind, offset: i32, size: u32) -> bool {
debug_assert!(is_slot_kind_tracked(kind));
if let Some(si) = &self.map[reg as usize] {
return si.kind == kind && si.offset == offset && si.size == size;
}
// No such binding.
false
}
// Does `env` have a binding characterised by `(kind, offset, size)` but to a register, let's
// call it `other_reg`, that isn't `reg`? If so, return `other_reg`. Note that `other_reg`
// will have the same bank as `reg`. It is a checked error to call this function with a
// binding matching all four of `(reg, kind, offset, size)`.
fn has_inexact_binding(
&self,
reginfo: &RegInfo,
reg: RegUnit,
kind: StackSlotKind,
offset: i32,
size: u32,
) -> Option<RegUnit> {
debug_assert!(is_slot_kind_tracked(kind));
// Find the range of RegUnit numbers for the bank that contains `reg`, and use that as our
// search space. This is so as to guarantee that any match is restricted to the same bank
// as `reg`.
let (first_unit, num_units) = find_bank_limits(reginfo, reg);
for other_reg in first_unit..first_unit + num_units {
if let Some(si) = &self.map[other_reg as usize] {
if si.kind == kind && si.offset == offset && si.size == size {
if other_reg == reg {
panic!("has_inexact_binding: binding *is* exact!");
}
return Some(other_reg);
}
}
}
// No such binding.
None
}
// Create the binding `(reg, kind, offset, size)` in `env`, and throw away any previous
// binding associated with either `reg` or the `(kind, offset, size)` triple.
fn bind(&mut self, reg: RegUnit, kind: StackSlotKind, offset: i32, size: u32) {
debug_assert!(is_slot_kind_tracked(kind));
self.invalidate_by_offset(kind, offset, size);
self.map[reg as usize] = Some(SlotInfo { kind, offset, size });
}
}
// Invalidates in `avail_env` any binding associated with a regunit that is written by `inst`.
fn invalidate_regs_written_by_inst(
locations: &SecondaryMap<Value, ValueLoc>,
diversions: &RegDiversions,
dfg: &DataFlowGraph,
avail_env: &mut AvailEnv,
inst: Inst,
) {
for v in dfg.inst_results(inst).iter() {
if let ValueLoc::Reg(ru) = locations[*v] {
// This must be true. It would be meaningless for an SSA value to be diverted before
// the point where it is defined.
debug_assert!(diversions.reg(*v, locations) == ru);
avail_env.invalidate_by_reg(ru);
}
}
}
// =============================================================================================
// Processing of individual instructions
impl RedundantReloadRemover {
// Process `inst`, possibly changing it into a different instruction, and possibly changing
// `self.avail_env` and `func.dfg`.
fn visit_inst(
&mut self,
func: &mut Function,
reginfo: &RegInfo,
isa: &dyn TargetIsa,
inst: Inst,
) {
// Get hold of the top-of-stack work item. This is the state that we will mutate during
// processing of this instruction.
debug_assert!(!self.processing_stack.is_empty());
let ProcessingStackElem {
avail_env,
cursor: _,
diversions,
} = &mut self.processing_stack.last_mut().unwrap();
#[cfg(debug_assertions)]
debug_assert!(
avail_env.check_invariants(),
"visit_inst: env invariants not ok"
);
let dfg = &mut func.dfg;
let locations = &func.locations;
let stack_slots = &func.stack_slots;
// To avoid difficulties with the borrow checker, do this in two stages. First, examine
// the instruction to see if it can be deleted or modified, and park the relevant
// information in `transform`. Update `self.avail_env` too. Later, use `transform` to
// actually do the transformation if necessary.
enum Transform {
NoChange,
ChangeToNopFill(Value), // delete this insn entirely
ChangeToCopyToSSA(Type, RegUnit), // change it into a copy from the specified reg
}
let mut transform = Transform::NoChange;
// In this match { .. } statement, either we must treat the instruction specially, or we
// must call `invalidate_regs_written_by_inst` on it.
match &dfg[inst] {
InstructionData::Unary {
opcode: Opcode::Spill,
arg: src_value,
} => {
// Extract: (src_reg, kind, offset, size)
// Invalidate: (kind, offset, size)
// Add new binding: {src_reg -> (kind, offset, size)}
// Don't forget that src_value might be diverted, so we have to deref it.
let slot = slot_of_value(locations, stack_slots, dfg.inst_results(inst)[0]);
let src_reg = diversions.reg(*src_value, locations);
let kind = slot.kind;
if is_slot_kind_tracked(kind) {
let offset = slot.offset.expect("visit_inst: spill with no offset");
let size = slot.size;
avail_env.bind(src_reg, kind, offset, size);
} else {
// We don't expect this insn to write any regs. But to be consistent with the
// rule above, do this anyway.
invalidate_regs_written_by_inst(locations, diversions, dfg, avail_env, inst);
}
}
InstructionData::Unary {
opcode: Opcode::Fill,
arg: src_value,
} => {
// Extract: (dst_reg, kind, offset, size)
// Invalidate: (kind, offset, size)
// Add new binding: {dst_reg -> (kind, offset, size)}
let slot = slot_of_value(locations, stack_slots, *src_value);
let dst_value = dfg.inst_results(inst)[0];
let dst_reg = reg_of_value(locations, dst_value);
// This must be true. It would be meaningless for an SSA value to be diverted
// before it was defined.
debug_assert!(dst_reg == diversions.reg(dst_value, locations));
let kind = slot.kind;
if is_slot_kind_tracked(kind) {
let offset = slot.offset.expect("visit_inst: fill with no offset");
let size = slot.size;
if avail_env.has_exact_binding(dst_reg, kind, offset, size) {
// This instruction is an exact copy of a fill we saw earlier, and the
// loaded value is still valid. So we'll schedule this instruction for
// deletion (below). No need to make any changes to `avail_env`.
transform = Transform::ChangeToNopFill(*src_value);
} else if let Some(other_reg) =
avail_env.has_inexact_binding(reginfo, dst_reg, kind, offset, size)
{
// This fill is from the required slot, but into a different register
// `other_reg`. So replace it with a copy from `other_reg` to `dst_reg`
// and update `dst_reg`s binding to make it the same as `other_reg`'s, so
// as to maximise the chances of future matches after this instruction.
debug_assert!(other_reg != dst_reg);
transform =
Transform::ChangeToCopyToSSA(dfg.value_type(dst_value), other_reg);
avail_env.copy_reg(other_reg, dst_reg);
} else {
// This fill creates some new binding we don't know about. Update
// `avail_env` to track it.
avail_env.bind(dst_reg, kind, offset, size);
}
} else {
// Else it's "just another instruction that writes a reg", so we'd better
// treat it as such, just as we do below for instructions that we don't handle
// specially.
invalidate_regs_written_by_inst(locations, diversions, dfg, avail_env, inst);
}
}
InstructionData::RegMove {
opcode: _,
arg: _,
src,
dst,
} => {
// These happen relatively rarely, but just frequently enough that it's worth
// tracking the copy (at the machine level, it's really a copy) in `avail_env`.
avail_env.copy_reg(*src, *dst);
}
InstructionData::RegSpill { .. }
| InstructionData::RegFill { .. }
| InstructionData::Call { .. }
| InstructionData::CallIndirect { .. }
| InstructionData::StackLoad { .. }
| InstructionData::StackStore { .. }
| InstructionData::Unary {
opcode: Opcode::AdjustSpDown,
..
}
| InstructionData::UnaryImm {
opcode: Opcode::AdjustSpUpImm,
..
}
| InstructionData::UnaryImm {
opcode: Opcode::AdjustSpDownImm,
..
} => {
// All of these change, or might change, the memory-register bindings tracked in
// `avail_env`, either in ways we don't know about, or in ways we could track only at
// an effort-to-benefit ratio that seems too low to bother with. So play safe: forget
// everything we know.
//
// For Call/CallIndirect, we could do better when compiling for calling
// conventions that have callee-saved registers, since bindings for them would
// remain valid across the call.
avail_env.invalidate_all();
}
_ => {
// Invalidate: any `avail_env` entry associated with a reg written by `inst`.
invalidate_regs_written_by_inst(locations, diversions, dfg, avail_env, inst);
}
}
// Actually do the transformation.
match transform {
Transform::NoChange => {}
Transform::ChangeToNopFill(arg) => {
// Load is completely redundant. Convert it to a no-op.
dfg.replace(inst).fill_nop(arg);
let ok = func.update_encoding(inst, isa).is_ok();
debug_assert!(ok, "fill_nop encoding missing for this type");
}
Transform::ChangeToCopyToSSA(ty, reg) => {
// We already have the relevant value in some other register. Convert the
// load into a reg-reg copy.
dfg.replace(inst).copy_to_ssa(ty, reg);
let ok = func.update_encoding(inst, isa).is_ok();
debug_assert!(ok, "copy_to_ssa encoding missing for type {}", ty);
}
}
}
}
// =============================================================================================
// Top level: processing of tree shaped regions
impl RedundantReloadRemover {
// Push a clone of the top-of-stack ProcessingStackElem. This will be used to process exactly
// one Ebb. The diversions are created new, rather than cloned, to reflect the fact
// that diversions are local to each Ebb.
fn processing_stack_push(&mut self, cursor: CursorPosition) {
let avail_env = if let Some(stack_top) = self.processing_stack.last() {
stack_top.avail_env.clone()
} else {
AvailEnv::new(
self.num_regunits
.expect("processing_stack_push: num_regunits unknown!")
as usize,
)
};
self.processing_stack.push(ProcessingStackElem {
avail_env,
cursor,
diversions: RegDiversions::new(),
});
}
// This pushes the node `dst` onto the processing stack, and sets up the new
// ProcessingStackElem accordingly. But it does all that only if `dst` is part of the current
// tree *and* we haven't yet visited it.
fn processing_stack_maybe_push(&mut self, dst: Ebb) {
if self.nodes_in_tree.contains(dst) && !self.nodes_already_visited.contains(dst) {
if !self.processing_stack.is_empty() {
// If this isn't the outermost node in the tree (that is, the root), then it must
// have exactly one predecessor. Nodes with no predecessors are dead and not
// incorporated in any tree. Nodes with two or more predecessors are the root of
// some other tree, and visiting them as if they were part of the current tree
// would be a serious error.
debug_assert!(self.num_preds_per_ebb[dst] == ZeroOneOrMany::One);
}
self.processing_stack_push(CursorPosition::Before(dst));
self.nodes_already_visited.insert(dst);
}
}
// Perform redundant-reload removal on the tree shaped region of graph defined by `root` and
// `self.nodes_in_tree`. The following state is modified: `self.processing_stack`,
// `self.nodes_already_visited`, and `func.dfg`.
fn process_tree(
&mut self,
func: &mut Function,
reginfo: &RegInfo,
isa: &dyn TargetIsa,
root: Ebb,
) {
debug_assert!(self.nodes_in_tree.contains(root));
debug_assert!(self.processing_stack.is_empty());
debug_assert!(self.nodes_already_visited.is_empty());
// Create the initial work item
self.processing_stack_maybe_push(root);
while !self.processing_stack.is_empty() {
// It seems somewhat ridiculous to construct a whole new FuncCursor just so we can do
// next_inst() on it once, and then copy the resulting position back out. But use of
// a function-global FuncCursor, or of the EncCursor in struct Context, leads to
// borrow checker problems, as does including FuncCursor directly in
// ProcessingStackElem. In any case this is not as bad as it looks, since profiling
// shows that the build-insert-step-extract work is reduced to just 8 machine
// instructions in an optimised x86_64 build, presumably because rustc can inline and
// then optimise out almost all the work.
let tos = self.processing_stack.len() - 1;
let mut pos = FuncCursor::new(func).at_position(self.processing_stack[tos].cursor);
let maybe_inst = pos.next_inst();
self.processing_stack[tos].cursor = pos.position();
if let Some(inst) = maybe_inst {
// Deal with this insn, possibly changing it, possibly updating the top item of
// `self.processing_stack`.
self.visit_inst(func, reginfo, isa, inst);
// Update diversions after the insn.
self.processing_stack[tos].diversions.apply(&func.dfg[inst]);
// If the insn can branch outside this Ebb, push work items on the stack for all
// target Ebbs that are part of the same tree and that we haven't yet visited.
// The next iteration of this instruction-processing loop will immediately start
// work on the most recently pushed Ebb, and will eventually continue in this Ebb
// when those new items have been removed from the stack.
match func.dfg.analyze_branch(inst) {
BranchInfo::NotABranch => (),
BranchInfo::SingleDest(dst, _) => {
self.processing_stack_maybe_push(dst);
}
BranchInfo::Table(jt, default) => {
func.jump_tables[jt]
.iter()
.for_each(|dst| self.processing_stack_maybe_push(*dst));
if let Some(dst) = default {
self.processing_stack_maybe_push(dst);
}
}
}
} else {
// We've come to the end of the current work-item (Ebb). We'll already have
// processed the fallthrough/continuation/whatever for it using the logic above.
// Pop it off the stack and resume work on its parent.
self.processing_stack.pop();
}
}
}
}
// =============================================================================================
// Top level: perform redundant fill removal for a complete function
impl RedundantReloadRemover {
/// Create a new remover state.
pub fn new() -> Self {
Self {
num_regunits: None,
num_preds_per_ebb: PrimaryMap::<Ebb, ZeroOneOrMany>::with_capacity(8),
discovery_stack: Vec::<Ebb>::with_capacity(16),
nodes_in_tree: EntitySet::<Ebb>::new(),
processing_stack: Vec::<ProcessingStackElem>::with_capacity(8),
nodes_already_visited: EntitySet::<Ebb>::new(),
}
}
/// Clear the state of the remover.
pub fn clear(&mut self) {
self.clear_for_new_function();
}
fn clear_for_new_function(&mut self) {
self.num_preds_per_ebb.clear();
self.clear_for_new_tree();
}
fn clear_for_new_tree(&mut self) {
self.discovery_stack.clear();
self.nodes_in_tree.clear();
self.processing_stack.clear();
self.nodes_already_visited.clear();
}
#[inline(never)]
fn do_redundant_fill_removal_on_function(
&mut self,
func: &mut Function,
reginfo: &RegInfo,
isa: &dyn TargetIsa,
cfg: &ControlFlowGraph,
) {
// Fail in an obvious way if there are more than (2^32)-1 Ebbs in this function.
let num_ebbs: u32 = func.dfg.num_ebbs().try_into().unwrap();
// Clear out per-tree state.
self.clear_for_new_function();
// Create a PrimaryMap that summarises the number of predecessors for each block, as 0, 1
// or "many", and that also claims the entry block as having "many" predecessors.
self.num_preds_per_ebb.clear();
self.num_preds_per_ebb.reserve(num_ebbs as usize);
for i in 0..num_ebbs {
let mut pi = cfg.pred_iter(Ebb::from_u32(i));
let mut n_pi = ZeroOneOrMany::Zero;
if let Some(_) = pi.next() {
n_pi = ZeroOneOrMany::One;
if let Some(_) = pi.next() {
n_pi = ZeroOneOrMany::Many;
// We don't care if there are more than two preds, so stop counting now.
}
}
self.num_preds_per_ebb.push(n_pi);
}
debug_assert!(self.num_preds_per_ebb.len() == num_ebbs as usize);
// The entry block must be the root of some tree, so set up the state to reflect that.
let entry_ebb = func
.layout
.entry_block()
.expect("do_redundant_fill_removal_on_function: entry ebb unknown");
debug_assert!(self.num_preds_per_ebb[entry_ebb] == ZeroOneOrMany::Zero);
self.num_preds_per_ebb[entry_ebb] = ZeroOneOrMany::Many;
// Now build and process trees.
for root_ix in 0..self.num_preds_per_ebb.len() {
let root = Ebb::from_u32(root_ix as u32);
// Build a tree for each node that has two or more preds, and ignore all other nodes.
if self.num_preds_per_ebb[root] != ZeroOneOrMany::Many {
continue;
}
// Clear out per-tree state.
self.clear_for_new_tree();
// Discovery phase: build the tree, as `root` and `self.nodes_in_tree`.
self.add_nodes_to_tree(cfg, root);
debug_assert!(self.nodes_in_tree.cardinality() > 0);
debug_assert!(self.num_preds_per_ebb[root] == ZeroOneOrMany::Many);
// Processing phase: do redundant-reload-removal.
self.process_tree(func, reginfo, isa, root);
debug_assert!(
self.nodes_in_tree.cardinality() == self.nodes_already_visited.cardinality()
);
}
}
}
// =============================================================================================
// Top level: the external interface
struct Context<'a> {
// Current instruction as well as reference to function and ISA.
cur: EncCursor<'a>,
// Cached ISA information. We save it here to avoid frequent virtual function calls on the
// `TargetIsa` trait object.
reginfo: RegInfo,
// References to contextual data structures we need.
cfg: &'a ControlFlowGraph,
// The running state.
state: &'a mut RedundantReloadRemover,
}
impl RedundantReloadRemover {
/// Run the remover.
pub fn run(&mut self, isa: &dyn TargetIsa, func: &mut Function, cfg: &ControlFlowGraph) {
let ctx = Context {
cur: EncCursor::new(func, isa),
reginfo: isa.register_info(),
cfg,
state: self,
};
let mut total_regunits = 0;
for rb in isa.register_info().banks {
total_regunits += rb.units;
}
ctx.state.num_regunits = Some(total_regunits);
ctx.state.do_redundant_fill_removal_on_function(
ctx.cur.func,
&ctx.reginfo,
ctx.cur.isa,
&ctx.cfg,
);
}
}

View File

@@ -45,7 +45,7 @@
use crate::cursor::{Cursor, EncCursor};
use crate::dominator_tree::DominatorTree;
use crate::ir::{AbiParam, ArgumentLoc, InstBuilder, ValueDef};
use crate::ir::{Ebb, Function, Inst, Layout, SigRef, Value, ValueLoc};
use crate::ir::{Ebb, Function, Inst, InstructionData, Layout, Opcode, SigRef, Value, ValueLoc};
use crate::isa::{regs_overlap, RegClass, RegInfo, RegUnit};
use crate::isa::{ConstraintKind, EncInfo, OperandConstraint, RecipeConstraints, TargetIsa};
use crate::packed_option::PackedOption;
@@ -428,9 +428,25 @@ impl<'a> Context<'a> {
// Finally, we've fully programmed the constraint solver.
// We expect a quick solution in most cases.
let output_regs = self.solver.quick_solve(&regs.global).unwrap_or_else(|_| {
let is_reload = match &self.cur.func.dfg[inst] {
InstructionData::Unary {
opcode: Opcode::Fill,
arg: _,
} => true,
_ => false,
};
let output_regs = self
.solver
.quick_solve(&regs.global, is_reload)
.unwrap_or_else(|_| {
debug!("quick_solve failed for {}", self.solver);
self.iterate_solution(throughs, &regs.global, &mut replace_global_defines)
self.iterate_solution(
throughs,
&regs.global,
&mut replace_global_defines,
is_reload,
)
});
// The solution and/or fixed input constraints may require us to shuffle the set of live
@@ -847,12 +863,13 @@ impl<'a> Context<'a> {
throughs: &[LiveValue],
global_regs: &RegisterSet,
replace_global_defines: &mut bool,
is_reload: bool,
) -> RegisterSet {
// Make sure `try_add_var()` below doesn't create a variable with too loose constraints.
self.program_complete_input_constraints();
loop {
match self.solver.real_solve(global_regs) {
match self.solver.real_solve(global_regs, is_reload) {
Ok(regs) => return regs,
Err(SolverError::Divert(rc)) => {
// Do we have any live-through `rc` registers that are not already variables?

View File

@@ -126,6 +126,7 @@ impl RegisterSet {
}
/// Iterator over available registers in a register class.
#[derive(Clone)]
pub struct RegSetIter {
regs: RegUnitMask,
}
@@ -161,6 +162,31 @@ impl Iterator for RegSetIter {
}
}
impl RegSetIter {
pub fn rnext(&mut self) -> Option<RegUnit> {
let num_words = self.regs.len();
let bits_per_word = 8 * size_of_val(&self.regs[0]);
// Find the last set bit in `self.regs`.
for i in 0..num_words {
let word_ix = num_words - 1 - i;
let word = &mut self.regs[word_ix];
if *word != 0 {
let lzeroes = word.leading_zeros() as usize;
// Clear that highest bit so we won't find it again.
*word &= !(1 << (bits_per_word - 1 - lzeroes));
return Some((word_ix * bits_per_word + bits_per_word - 1 - lzeroes) as RegUnit);
}
}
// All of `self.regs` is 0.
None
}
}
impl ExactSizeIterator for RegSetIter {}
/// Displaying an `RegisterSet` correctly requires the associated `RegInfo` from the target ISA.
@@ -261,6 +287,45 @@ mod tests {
classes: &[],
};
const RSI_1: RegSetIter = RegSetIter {
regs: [0x31415927, 0x27182818, 0x14141356],
};
const RSI_2: RegSetIter = RegSetIter {
regs: [0x00000000, 0x00000000, 0x00000000],
};
const RSI_3: RegSetIter = RegSetIter {
regs: [0xffffffff, 0xffffffff, 0xffffffff],
};
fn reverse_regset_iteration_work(rsi: &RegSetIter) {
// Check the reverse iterator by comparing its output with the forward iterator.
let rsi_f = (*rsi).clone();
let results_f = rsi_f.collect::<Vec<_>>();
let mut rsi_r = (*rsi).clone();
let mut results_r = Vec::<RegUnit>::new();
while let Some(r) = rsi_r.rnext() {
results_r.push(r);
}
let len_f = results_f.len();
let len_r = results_r.len();
assert_eq!(len_f, len_r);
for i in 0..len_f {
assert_eq!(results_f[i], results_r[len_f - 1 - i]);
}
}
#[test]
fn reverse_regset_iteration() {
reverse_regset_iteration_work(&RSI_1);
reverse_regset_iteration_work(&RSI_2);
reverse_regset_iteration_work(&RSI_3);
}
#[test]
fn put_and_take() {
let mut regs = RegisterSet::new();

View File

@@ -852,8 +852,12 @@ impl Solver {
/// always trivial.
///
/// Returns `Ok(regs)` if a solution was found.
pub fn quick_solve(&mut self, global_regs: &RegisterSet) -> Result<RegisterSet, SolverError> {
self.find_solution(global_regs)
pub fn quick_solve(
&mut self,
global_regs: &RegisterSet,
is_reload: bool,
) -> Result<RegisterSet, SolverError> {
self.find_solution(global_regs, is_reload)
}
/// Try harder to find a solution.
@@ -863,7 +867,11 @@ impl Solver {
/// This may return an error with a register class that has run out of registers. If registers
/// can be freed up in the starving class, this method can be called again after adding
/// variables for the freed registers.
pub fn real_solve(&mut self, global_regs: &RegisterSet) -> Result<RegisterSet, SolverError> {
pub fn real_solve(
&mut self,
global_regs: &RegisterSet,
is_reload: bool,
) -> Result<RegisterSet, SolverError> {
// Compute domain sizes for all the variables given the current register sets.
for v in &mut self.vars {
let d = v.iter(&self.regs_in, &self.regs_out, global_regs).len();
@@ -901,7 +909,7 @@ impl Solver {
});
debug!("real_solve for {}", self);
self.find_solution(global_regs)
self.find_solution(global_regs, is_reload)
}
/// Search for a solution with the current list of variables.
@@ -909,7 +917,11 @@ impl Solver {
/// If a solution was found, returns `Ok(regs)` with the set of available registers on the
/// output side after the solution. If no solution could be found, returns `Err(rc)` with the
/// constraint register class that needs more available registers.
fn find_solution(&mut self, global_regs: &RegisterSet) -> Result<RegisterSet, SolverError> {
fn find_solution(
&mut self,
global_regs: &RegisterSet,
is_reload: bool,
) -> Result<RegisterSet, SolverError> {
// Available registers on the input and output sides respectively.
let mut iregs = self.regs_in.clone();
let mut oregs = self.regs_out.clone();
@@ -917,7 +929,20 @@ impl Solver {
for v in &mut self.vars {
let rc = v.constraint;
let reg = match v.iter(&iregs, &oregs, &gregs).next() {
// Decide which register to assign. In order to try and keep registers holding
// reloaded values separate from all other registers to the extent possible, we choose
// the first available register in the normal case, but the last available one in the
// case of a reload. See "A side note on register choice heuristics" in
// src/redundant_reload_remover.rs for further details.
let mut reg_set_iter = v.iter(&iregs, &oregs, &gregs);
let maybe_reg = if is_reload {
reg_set_iter.rnext()
} else {
reg_set_iter.next()
};
let reg = match maybe_reg {
Some(reg) => reg,
None => {
// If `v` must avoid global interference, there is no point in requesting
@@ -1207,7 +1232,7 @@ mod tests {
solver.reset(&regs);
solver.reassign_in(v10, gpr, r1, r0);
solver.inputs_done();
assert!(solver.quick_solve(&gregs).is_ok());
assert!(solver.quick_solve(&gregs, false).is_ok());
assert_eq!(solver.schedule_moves(&regs), 0);
assert_eq!(solver.moves(), &[mov(v10, gpr, r1, r0)]);
@@ -1217,7 +1242,7 @@ mod tests {
solver.reassign_in(v10, gpr, r0, r1);
solver.reassign_in(v11, gpr, r1, r2);
solver.inputs_done();
assert!(solver.quick_solve(&gregs).is_ok());
assert!(solver.quick_solve(&gregs, false).is_ok());
assert_eq!(solver.schedule_moves(&regs), 0);
assert_eq!(
solver.moves(),
@@ -1229,7 +1254,7 @@ mod tests {
solver.reassign_in(v10, gpr, r0, r1);
solver.reassign_in(v11, gpr, r1, r0);
solver.inputs_done();
assert!(solver.quick_solve(&gregs).is_ok());
assert!(solver.quick_solve(&gregs, false).is_ok());
assert_eq!(solver.schedule_moves(&regs), 0);
assert_eq!(
solver.moves(),
@@ -1269,7 +1294,7 @@ mod tests {
solver.reassign_in(v11, s, s2, s0);
solver.reassign_in(v12, s, s3, s1);
solver.inputs_done();
assert!(solver.quick_solve(&gregs).is_ok());
assert!(solver.quick_solve(&gregs, false).is_ok());
assert_eq!(solver.schedule_moves(&regs), 0);
assert_eq!(
solver.moves(),
@@ -1290,7 +1315,7 @@ mod tests {
solver.reassign_in(v12, s, s1, s3);
solver.reassign_in(v10, d, d1, d0);
solver.inputs_done();
assert!(solver.quick_solve(&gregs).is_ok());
assert!(solver.quick_solve(&gregs, false).is_ok());
assert_eq!(solver.schedule_moves(&regs), 0);
assert_eq!(
solver.moves(),
@@ -1335,7 +1360,7 @@ mod tests {
solver.reassign_in(v11, gpr, r1, r2);
solver.reassign_in(v12, gpr, r2, r0);
solver.inputs_done();
assert!(solver.quick_solve(&gregs).is_ok());
assert!(solver.quick_solve(&gregs, false).is_ok());
assert_eq!(solver.schedule_moves(&regs), 1);
assert_eq!(
solver.moves(),
@@ -1359,7 +1384,7 @@ mod tests {
solver.reassign_in(v15, gpr, r5, r3);
solver.inputs_done();
assert!(solver.quick_solve(&gregs).is_ok());
assert!(solver.quick_solve(&gregs, false).is_ok());
// We resolve two cycles with one spill.
assert_eq!(solver.schedule_moves(&regs), 1);
assert_eq!(

View File

@@ -697,6 +697,7 @@ impl<'a> Verifier<'a> {
| Store { .. }
| RegMove { .. }
| CopySpecial { .. }
| CopyToSsa { .. }
| Trap { .. }
| CondTrap { .. }
| IntCondTrap { .. }

View File

@@ -664,6 +664,14 @@ pub fn write_operands(
write!(w, " %{} -> %{}", src, dst)
}
}
CopyToSsa { src, .. } => {
if let Some(isa) = isa {
let regs = isa.register_info();
write!(w, " {}", regs.display_regunit(src))
} else {
write!(w, " %{}", src)
}
}
RegSpill { arg, src, dst, .. } => {
if let Some(isa) = isa {
let regs = isa.register_info();

View File

@@ -45,9 +45,27 @@ where
/// Is this set completely empty?
pub fn is_empty(&self) -> bool {
// Note that this implementation will become incorrect should it ever become possible
// to remove elements from an `EntitySet`.
self.len == 0
}
/// Returns the cardinality of the set. More precisely, it returns the number of calls to
/// `insert` with distinct key values that have happened since the set was most recently
/// `clear`ed or created with `new`.
pub fn cardinality(&self) -> usize {
let mut n: usize = 0;
for byte_ix in 0..self.len / 8 {
n += self.elems[byte_ix].count_ones() as usize;
}
for bit_ix in (self.len / 8) * 8..self.len {
if (self.elems[bit_ix / 8] & (1 << (bit_ix % 8))) != 0 {
n += 1;
}
}
n
}
/// Remove all entries from this set.
pub fn clear(&mut self) {
self.len = 0;

View File

@@ -41,6 +41,7 @@ ebb1:
; nextln: v3 = spill v10
; nextln: brz v2, ebb1
; nextln: v11 = fill v1
; nextln: regmove v11, %r15 -> %rax
; nextln: return v11
; nextln:
; nextln: ebb1:
@@ -48,5 +49,6 @@ ebb1:
; nextln: safepoint v3
; nextln: v4 = call_indirect sig1, v8()
; nextln: v12 = fill.r64 v3
; nextln: regmove v12, %r15 -> %rax
; nextln: return v12
; nextln: }

View File

@@ -2498,6 +2498,10 @@ impl<'a> Parser<'a> {
let dst = self.match_regunit(ctx.unique_isa)?;
InstructionData::CopySpecial { opcode, src, dst }
}
InstructionFormat::CopyToSsa => InstructionData::CopyToSsa {
opcode,
src: self.match_regunit(ctx.unique_isa)?,
},
InstructionFormat::RegSpill => {
let arg = self.match_value("expected SSA value operand")?;
self.match_token(Token::Comma, "expected ',' between operands")?;

View File

@@ -210,6 +210,10 @@ pub enum SerInstData {
src: String,
dst: String,
},
CopyToSsa {
opcode: String,
src: String,
},
RegSpill {
opcode: String,
arg: String,
@@ -651,6 +655,10 @@ pub fn get_inst_data(inst_index: Inst, func: &Function) -> SerInstData {
src: src.to_string(),
dst: dst.to_string(),
},
InstructionData::CopyToSsa { opcode, src } => SerInstData::CopyToSsa {
opcode: opcode.to_string(),
src: src.to_string(),
},
InstructionData::RegSpill {
opcode,
arg,