MachInst lowering logic: allow effectful instructions to merge.

This PR updates the "coloring" scheme that accounts for side-effects in the MachInst lowering logic. As a result, the new backends will now be able to merge effectful operations (such as memory loads) *into* other operations; previously, only the other way (pure ops merged into effectful ops) was possible. This will allow, for example, a load+ALU-op combination, as is common on x86. It should even allow a load + ALU-op + store sequence to merge into one lowered instruction. The scheme arose from many fruitful discussions with @julian-seward1 (thanks!); significant credit is due to him for the insights here. The first insight is that given the right basic conditions, i.e. that the root instruction is the only use of an effectful instruction's result, all we need is that the "color" of the effectful instruction is *one less* than the color of the current instruction. It's easier to think about colors on the program points between instructions: if the color coming *out* of the first (effectful def) instruction and *in* to the second (effectful or effect-free use) instruction are the same, then they can merge. Basically the color denotes a version of global state; if the same, then no other effectful ops happened in the meantime. The second insight is that we can keep state as we scan, tracking the "current color", and *update* this when we sink (merge) an op. Hence when we sink a load into another op, we effectively *re-color* every instruction it moved over; this may allow further sinks. Consider the example (and assume that we consider loads effectful in order to conservatively ensure a strong memory model; otherwise, replace with other effectful value-producing insts): ``` v0 = load x v1 = load y v2 = add v0, 1 v3 = add v1, 1 ``` Scanning from bottom to top, we first see the add producing `v3` and we can sink the load producing `v1` into it, producing a load + ALU-op machine instruction. This is legal because `v1` moves over only `v2`, which is a pure instruction. Consider, though, `v2`: under a simple scheme that has no other context, `v0` could not sink to `v2` because it would move over `v1`, another load. But because we already sunk `v1` down to `v3`, we are free to sink `v0` to `v2`; the update of the "current color" during the scan allows this. This PR also cleans up the `LowerCtx` interface a bit at the same time: whereas previously it always gave some subset of (constant, mergeable inst, register) directly from `LowerCtx::get_input()`, it now returns zero or more of (constant, mergable inst) from `LowerCtx::maybe_get_input_as_source_or_const()`, and returns the register only from `LowerCtx::put_input_in_reg()`. This removes the need to explicitly denote uses of the register, so it's a little safer. Note that this PR does not actually make use of the new ability to merge loads into other ops; that will come in future PRs, especially to optimize the `x64` backend by using direct-memory operands.
2020-11-05 00:01:56 -08:00
parent 8675fa5aa7
commit 3c8cb7b908
6 changed files with 271 additions and 186 deletions
--- a/cranelift/codegen/src/isa/aarch64/lower.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower.rs
@@ -111,7 +111,7 @@ pub(crate) enum ResultRegImmShift {
 /// Lower an instruction input to a 64-bit constant, if possible.
 pub(crate) fn input_to_const<C: LowerCtx<I = Inst>>(ctx: &mut C, input: InsnInput) -> Option<u64> {
-    let input = ctx.get_input(input.insn, input.input);
+    let input = ctx.get_input_as_source_or_const(input.insn, input.input);
    input.constant
 }
@@ -171,7 +171,7 @@ pub(crate) fn put_input_in_reg<C: LowerCtx<I = Inst>>(
    debug!("put_input_in_reg: input {:?}", input);
    let ty = ctx.input_ty(input.insn, input.input);
    let from_bits = ty_bits(ty) as u8;
-    let inputs = ctx.get_input(input.insn, input.input);
+    let inputs = ctx.get_input_as_source_or_const(input.insn, input.input);
    let in_reg = if let Some(c) = inputs.constant {
        // Generate constants fresh at each use to minimize long-range register pressure.
        let masked = if from_bits < 64 {
@@ -189,8 +189,7 @@ pub(crate) fn put_input_in_reg<C: LowerCtx<I = Inst>>(
        }
        to_reg.to_reg()
    } else {
-        ctx.use_input_reg(inputs);
+        ctx.put_input_in_reg(input.insn, input.input)
        inputs.reg
    };
    match (narrow_mode, from_bits) {
@@ -272,7 +271,7 @@ fn put_input_in_rs<C: LowerCtx<I = Inst>>(
    input: InsnInput,
    narrow_mode: NarrowValueMode,
 ) -> ResultRS {
-    let inputs = ctx.get_input(input.insn, input.input);
+    let inputs = ctx.get_input_as_source_or_const(input.insn, input.input);
    if let Some((insn, 0)) = inputs.inst {
        let op = ctx.data(insn).opcode();
@@ -305,7 +304,7 @@ fn put_input_in_rse<C: LowerCtx<I = Inst>>(
    input: InsnInput,
    narrow_mode: NarrowValueMode,
 ) -> ResultRSE {
-    let inputs = ctx.get_input(input.insn, input.input);
+    let inputs = ctx.get_input_as_source_or_const(input.insn, input.input);
    if let Some((insn, 0)) = inputs.inst {
        let op = ctx.data(insn).opcode();
        let out_ty = ctx.output_ty(insn, 0);
@@ -1052,7 +1051,7 @@ pub(crate) fn maybe_input_insn<C: LowerCtx<I = Inst>>(
    input: InsnInput,
    op: Opcode,
 ) -> Option<IRInst> {
-    let inputs = c.get_input(input.insn, input.input);
+    let inputs = c.get_input_as_source_or_const(input.insn, input.input);
    debug!(
        "maybe_input_insn: input {:?} has options {:?}; looking for op {:?}",
        input, inputs, op
@@ -1092,14 +1091,14 @@ pub(crate) fn maybe_input_insn_via_conv<C: LowerCtx<I = Inst>>(
    op: Opcode,
    conv: Opcode,
 ) -> Option<IRInst> {
-    let inputs = c.get_input(input.insn, input.input);
+    let inputs = c.get_input_as_source_or_const(input.insn, input.input);
    if let Some((src_inst, _)) = inputs.inst {
        let data = c.data(src_inst);
        if data.opcode() == op {
            return Some(src_inst);
        }
        if data.opcode() == conv {
-            let inputs = c.get_input(src_inst, 0);
+            let inputs = c.get_input_as_source_or_const(src_inst, 0);
            if let Some((src_inst, _)) = inputs.inst {
                let data = c.data(src_inst);
                if data.opcode() == op {
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -2959,10 +2959,11 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
            // register. We simply use the variant of the add instruction that
            // sets flags (`adds`) here.
-            // Ensure that the second output isn't directly called for: it
+            // Note that the second output (the flags) need not be generated,
-            // should only be used by a flags-consuming op, which will directly
+            // because flags are never materialized into a register; the only
-            // understand this instruction and merge the comparison.
+            // instructions that can use a value of type `iflags` or `fflags`
-            assert!(!ctx.is_reg_needed(insn, ctx.get_output(insn, 1).to_reg()));
+            // will look directly for the flags-producing instruction (which can
            // always be found, by construction) and merge it.
            // Now handle the iadd as above, except use an AddS opcode that sets
            // flags.
--- a/cranelift/codegen/src/isa/arm32/lower.rs
+++ b/cranelift/codegen/src/isa/arm32/lower.rs
@@ -68,7 +68,7 @@ pub(crate) fn input_to_reg<C: LowerCtx<I = Inst>>(
 ) -> Reg {
    let ty = ctx.input_ty(input.insn, input.input);
    let from_bits = ty.bits() as u8;
-    let inputs = ctx.get_input(input.insn, input.input);
+    let inputs = ctx.get_input_as_source_or_const(input.insn, input.input);
    let in_reg = if let Some(c) = inputs.constant {
        let to_reg = ctx.alloc_tmp(Inst::rc_for_type(ty).unwrap(), ty);
        for inst in Inst::gen_constant(to_reg, c, ty, |reg_class, ty| ctx.alloc_tmp(reg_class, ty))
@@ -78,8 +78,7 @@ pub(crate) fn input_to_reg<C: LowerCtx<I = Inst>>(
        }
        to_reg.to_reg()
    } else {
-        ctx.use_input_reg(inputs);
+        ctx.put_input_in_reg(input.insn, input.input)
        inputs.reg
    };
    match (narrow_mode, from_bits) {
--- a/cranelift/codegen/src/isa/arm32/lower_inst.rs
+++ b/cranelift/codegen/src/isa/arm32/lower_inst.rs
@@ -316,7 +316,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
        }
        Opcode::Trueif => {
            let cmp_insn = ctx
-                .get_input(inputs[0].insn, inputs[0].input)
+                .get_input_as_source_or_const(inputs[0].insn, inputs[0].input)
                .inst
                .unwrap()
                .0;
@@ -344,7 +344,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
            } else {
                // Verification ensures that the input is always a single-def ifcmp.
                let cmp_insn = ctx
-                    .get_input(inputs[0].insn, inputs[0].input)
+                    .get_input_as_source_or_const(inputs[0].insn, inputs[0].input)
                    .inst
                    .unwrap()
                    .0;
@@ -471,7 +471,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
        }
        Opcode::Trapif => {
            let cmp_insn = ctx
-                .get_input(inputs[0].insn, inputs[0].input)
+                .get_input_as_source_or_const(inputs[0].insn, inputs[0].input)
                .inst
                .unwrap()
                .0;
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -60,7 +60,7 @@ fn matches_input<C: LowerCtx<I = Inst>>(
    input: InsnInput,
    op: Opcode,
 ) -> Option<IRInst> {
-    let inputs = ctx.get_input(input.insn, input.input);
+    let inputs = ctx.get_input_as_source_or_const(input.insn, input.input);
    inputs.inst.and_then(|(src_inst, _)| {
        let data = ctx.data(src_inst);
        if data.opcode() == op {
@@ -77,7 +77,7 @@ fn matches_input_any<C: LowerCtx<I = Inst>>(
    input: InsnInput,
    ops: &[Opcode],
 ) -> Option<IRInst> {
-    let inputs = ctx.get_input(input.insn, input.input);
+    let inputs = ctx.get_input_as_source_or_const(input.insn, input.input);
    inputs.inst.and_then(|(src_inst, _)| {
        let data = ctx.data(src_inst);
        for &op in ops {
@@ -89,14 +89,9 @@ fn matches_input_any<C: LowerCtx<I = Inst>>(
    })
 }
 fn lowerinput_to_reg(ctx: Ctx, input: LowerInput) -> Reg {
    ctx.use_input_reg(input);
    input.reg
 }
 /// Put the given input into a register, and mark it as used (side-effect).
 fn put_input_in_reg(ctx: Ctx, spec: InsnInput) -> Reg {
-    let input = ctx.get_input(spec.insn, spec.input);
+    let input = ctx.get_input_as_source_or_const(spec.insn, spec.input);
    if let Some(c) = input.constant {
        // Generate constants fresh at each use to minimize long-range register pressure.
@@ -118,7 +113,7 @@ fn put_input_in_reg(ctx: Ctx, spec: InsnInput) -> Reg {
        }
        cst_copy.to_reg()
    } else {
-        lowerinput_to_reg(ctx, input)
+        ctx.put_input_in_reg(spec.insn, spec.input)
    }
 }
@@ -165,21 +160,16 @@ fn extend_input_to_reg(ctx: Ctx, spec: InsnInput, ext_spec: ExtSpec) -> Reg {
    dst.to_reg()
 }
 fn lowerinput_to_reg_mem(ctx: Ctx, input: LowerInput) -> RegMem {
    // TODO handle memory.
    RegMem::reg(lowerinput_to_reg(ctx, input))
 }
 /// Put the given input into a register or a memory operand.
 /// Effectful: may mark the given input as used, when returning the register form.
 fn input_to_reg_mem(ctx: Ctx, spec: InsnInput) -> RegMem {
-    let input = ctx.get_input(spec.insn, spec.input);
+    // TODO handle memory; merge a load directly, if possible.
-    lowerinput_to_reg_mem(ctx, input)
+    RegMem::reg(ctx.put_input_in_reg(spec.insn, spec.input))
 }
 /// Returns whether the given input is an immediate that can be properly sign-extended, without any
 /// possible side-effect.
-fn lowerinput_to_sext_imm(input: LowerInput, input_ty: Type) -> Option<u32> {
+fn non_reg_input_to_sext_imm(input: NonRegInput, input_ty: Type) -> Option<u32> {
    input.constant.and_then(|x| {
        // For i64 instructions (prefixed with REX.W), require that the immediate will sign-extend
        // to 64 bits. For other sizes, it doesn't matter and we can just use the plain
@@ -193,23 +183,24 @@ fn lowerinput_to_sext_imm(input: LowerInput, input_ty: Type) -> Option<u32> {
 }
 fn input_to_sext_imm(ctx: Ctx, spec: InsnInput) -> Option<u32> {
-    let input = ctx.get_input(spec.insn, spec.input);
+    let input = ctx.get_input_as_source_or_const(spec.insn, spec.input);
    let input_ty = ctx.input_ty(spec.insn, spec.input);
-    lowerinput_to_sext_imm(input, input_ty)
+    non_reg_input_to_sext_imm(input, input_ty)
 }
 fn input_to_imm(ctx: Ctx, spec: InsnInput) -> Option<u64> {
-    ctx.get_input(spec.insn, spec.input).constant
+    ctx.get_input_as_source_or_const(spec.insn, spec.input)
        .constant
 }
 /// Put the given input into an immediate, a register or a memory operand.
 /// Effectful: may mark the given input as used, when returning the register form.
 fn input_to_reg_mem_imm(ctx: Ctx, spec: InsnInput) -> RegMemImm {
-    let input = ctx.get_input(spec.insn, spec.input);
+    let input = ctx.get_input_as_source_or_const(spec.insn, spec.input);
    let input_ty = ctx.input_ty(spec.insn, spec.input);
-    match lowerinput_to_sext_imm(input, input_ty) {
+    match non_reg_input_to_sext_imm(input, input_ty) {
        Some(x) => RegMemImm::imm(x),
-        None => match lowerinput_to_reg_mem(ctx, input) {
+        None => match input_to_reg_mem(ctx, spec) {
            RegMem::Reg { reg } => RegMemImm::reg(reg),
            RegMem::Mem { addr } => RegMemImm::mem(addr),
        },
@@ -547,8 +538,6 @@ fn lower_to_amode<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput, offset: i
            )
        } else {
            for i in 0..=1 {
                let input = ctx.get_input(add, i);
                // Try to pierce through uextend.
                if let Some(uextend) = matches_input(
                    ctx,
@@ -558,7 +547,7 @@ fn lower_to_amode<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput, offset: i
                    },
                    Opcode::Uextend,
                ) {
-                    if let Some(cst) = ctx.get_input(uextend, 0).constant {
+                    if let Some(cst) = ctx.get_input_as_source_or_const(uextend, 0).constant {
                        // Zero the upper bits.
                        let input_size = ctx.input_ty(uextend, 0).bits() as u64;
                        let shift: u64 = 64 - input_size;
@@ -573,7 +562,7 @@ fn lower_to_amode<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput, offset: i
                }
                // If it's a constant, add it directly!
-                if let Some(cst) = input.constant {
+                if let Some(cst) = ctx.get_input_as_source_or_const(add, i).constant {
                    let final_offset = (offset as i64).wrapping_add(cst as i64);
                    if low32_will_sign_extend_to_64(final_offset as u64) {
                        let base = put_input_in_reg(ctx, add_inputs[1 - i]);
@@ -1010,7 +999,8 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                    _ => unreachable!("unhandled output type for shift/rotates: {}", dst_ty),
                };
-                let (count, rhs) = if let Some(cst) = ctx.get_input(insn, 1).constant {
+                let (count, rhs) =
                    if let Some(cst) = ctx.get_input_as_source_or_const(insn, 1).constant {
                        // Mask count, according to Cranelift's semantics.
                        let cst = (cst as u8) & (dst_ty.bits() as u8 - 1);
                        (Some(cst), None)
@@ -1248,7 +1238,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                // Shift each value.
                let mut shift = |reg: Writable<Reg>| {
                    let kind = ShiftKind::ShiftRightArithmetic;
-                    if let Some(shift_by) = ctx.get_input(insn, 1).constant {
+                    if let Some(shift_by) = ctx.get_input_as_source_or_const(insn, 1).constant {
                        // Mask the shift amount according to Cranelift's semantics.
                        let shift_by = (shift_by as u8) & (types::I64.bits() as u8 - 1);
                        ctx.emit(Inst::shift_r(8, kind, Some(shift_by), reg));
@@ -3351,7 +3341,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
            // Verification ensures that the input is always a single-def ifcmp.
            let cmp_insn = ctx
-                .get_input(inputs[0].insn, inputs[0].input)
+                .get_input_as_source_or_const(inputs[0].insn, inputs[0].input)
                .inst
                .unwrap()
                .0;
--- a/cranelift/codegen/src/machinst/lower.rs
+++ b/cranelift/codegen/src/machinst/lower.rs
@@ -2,6 +2,9 @@
 //! to machine instructions with virtual registers. This is *almost* the final
 //! machine code, except for register allocation.
 // TODO: separate the IR-query core of `LowerCtx` from the lowering logic built
 // on top of it, e.g. the side-effect/coloring analysis and the scan support.
 use crate::entity::SecondaryMap;
 use crate::fx::{FxHashMap, FxHashSet};
 use crate::inst_predicates::{has_lowering_side_effect, is_constant_64bit};
@@ -39,7 +42,7 @@ use smallvec::SmallVec;
 /// they can be freely permuted (modulo true dataflow dependencies) without
 /// affecting program behavior.
 #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
-pub struct InstColor(u32);
+struct InstColor(u32);
 impl InstColor {
    fn new(n: u32) -> InstColor {
        InstColor(n)
@@ -92,11 +95,6 @@ pub trait LowerCtx {
    fn memflags(&self, ir_inst: Inst) -> Option<MemFlags>;
    /// Get the source location for a given instruction.
    fn srcloc(&self, ir_inst: Inst) -> SourceLoc;
    /// Get the side-effect color of the given instruction (specifically, at the
    /// program point just prior to the instruction). The "color" changes at
    /// every side-effecting op; the backend should not try to merge across
    /// side-effect colors unless the op being merged is known to be pure.
    fn inst_color(&self, ir_inst: Inst) -> InstColor;
    // Instruction input/output queries:
@@ -111,29 +109,25 @@ pub trait LowerCtx {
    /// Get the value of a constant instruction (`iconst`, etc.) as a 64-bit
    /// value, if possible.
    fn get_constant(&self, ir_inst: Inst) -> Option<u64>;
-    /// Get the input in any combination of three forms:
+    /// Get the input as one of two options other than a direct register:
    ///
-    /// - An instruction, if the same color as this instruction or if the
+    /// - An instruction, given that it is effect-free or able to sink its
-    ///   producing instruction has no side effects (thus in both cases
+    ///   effect to the current instruction being lowered, and given it has only
-    ///   mergeable);
+    ///   one output, and if effect-ful, given that this is the only use;
-    /// - A constant, if the value is a constant;
+    /// - A constant, if the value is a constant.
    /// - A register.
    ///
-    /// The instruction input may be available in some or all of these
+    /// The instruction input may be available in either of these forms.  It may
-    /// forms. More than one is possible: e.g., it may be produced by an
+    /// be available in neither form, if the conditions are not met; if so, use
-    /// instruction in the same block, but may also have been forced into a
+    /// `put_input_in_reg()` instead to get it in a register.
    /// register already by an earlier op. It will *always* be available
    /// in a register, at least.
    ///
-    /// If the backend uses the register, rather than one of the other
+    /// If the backend merges the effect of a side-effecting instruction, it
-    /// forms (constant or merging of the producing op), it must call
+    /// must call `sink_inst()`. When this is called, it indicates that the
-    /// `use_input_reg()` to ensure the producing inst is actually lowered
+    /// effect has been sunk to the current scan location. The sunk
-    /// as well. Failing to do so may result in the instruction that generates
+    /// instruction's result(s) must have *no* uses remaining, because it will
-    /// this value never being generated, thus resulting in incorrect execution.
+    /// not be codegen'd (it has been integrated into the current instruction).
-    /// For correctness, backends should thus wrap `get_input()` and
+    fn get_input_as_source_or_const(&self, ir_inst: Inst, idx: usize) -> NonRegInput;
-    /// `use_input_regs()` with helpers that return a register only after
+    /// Put the `idx`th input into a register and return the assigned register.
-    /// ensuring it is marked as used.
+    fn put_input_in_reg(&mut self, ir_inst: Inst, idx: usize) -> Reg;
    fn get_input(&self, ir_inst: Inst, idx: usize) -> LowerInput;
    /// Get the `idx`th output register of the given IR instruction. When
    /// `backend.lower_inst_to_regs(ctx, inst)` is called, it is expected that
    /// the backend will write results to these output register(s).  This
@@ -152,14 +146,11 @@ pub trait LowerCtx {
    fn emit(&mut self, mach_inst: Self::I);
    /// Emit a machine instruction that is a safepoint.
    fn emit_safepoint(&mut self, mach_inst: Self::I);
-    /// Indicate that the given input uses the register returned by
+    /// Indicate that the side-effect of an instruction has been sunk to the
-    /// `get_input()`. Codegen may not happen otherwise for the producing
+    /// current scan location. This can only be done to an instruction with no
-    /// instruction if it has no side effects and no uses.
+    /// uses of its result register(s), because it will cause the instruction
-    fn use_input_reg(&mut self, input: LowerInput);
+    /// not to be codegen'd at its original location.
-    /// Is the given register output needed after the given instruction? Allows
+    fn sink_inst(&mut self, ir_inst: Inst);
    /// instructions with multiple outputs to make fine-grained decisions on
    /// which outputs to actually generate.
    fn is_reg_needed(&self, ir_inst: Inst, reg: Reg) -> bool;
    /// Retrieve constant data given a handle.
    fn get_constant_data(&self, constant_handle: Constant) -> &ConstantData;
    /// Indicate that a constant should be emitted.
@@ -172,17 +163,22 @@ pub trait LowerCtx {
    fn ensure_in_vreg(&mut self, reg: Reg, ty: Type) -> Reg;
 }
-/// A representation of all of the ways in which an instruction input is
+/// A representation of all of the ways in which a value is available, aside
-/// available: as a producing instruction (in the same color-partition), as a
+/// from as a direct register.
-/// constant, and/or in an existing register. See [LowerCtx::get_input] for more
+///
-/// details.
+/// - An instruction, if it would be allowed to occur at the current location
 ///   instead (see [LowerCtx::get_input_as_source_or_const()] for more
 ///   details).
 ///
 /// - A constant, if the value is known to be a constant.
 #[derive(Clone, Copy, Debug)]
-pub struct LowerInput {
+pub struct NonRegInput {
-    /// The value is live in a register. This option is always available. Call
+    /// An instruction produces this value (as the given output), and its
-    /// [LowerCtx::use_input_reg()] if the register is used.
+    /// computation (and side-effect if applicable) could occur at the
-    pub reg: Reg,
+    /// current instruction's location instead.
-    /// An instruction produces this value; the instruction's result index that
+    ///
-    /// produces this value is given.
+    /// If this instruction's operation is merged into the current instruction,
    /// the backend must call [LowerCtx::sink_inst()].
    pub inst: Option<(Inst, usize)>,
    /// The value is a known constant.
    pub constant: Option<u64>,
@@ -242,17 +238,36 @@ pub struct Lower<'func, I: VCodeInst> {
    /// Return-value vregs.
    retval_regs: Vec<Reg>,
-    /// Instruction colors.
+    /// Instruction colors at block exits. From this map, we can recover all
-    inst_colors: SecondaryMap<Inst, InstColor>,
+    /// instruction colors by scanning backward from the block end and
    /// decrementing on any color-changing (side-effecting) instruction.
    block_end_colors: SecondaryMap<Block, InstColor>,
    /// Instruction colors at side-effecting ops. This is the *entry* color,
    /// i.e., the version of global state that exists before an instruction
    /// executes.  For each side-effecting instruction, the *exit* color is its
    /// entry color plus one.
    side_effect_inst_entry_colors: FxHashMap<Inst, InstColor>,
    /// Current color as we scan during lowering. While we are lowering an
    /// instruction, this is equal to the color *at entry to* the instruction.
    cur_scan_entry_color: Option<InstColor>,
    /// Current instruction as we scan during lowering.
    cur_inst: Option<Inst>,
    /// Instruction constant values, if known.
    inst_constants: FxHashMap<Inst, u64>,
-    /// Instruction has a side-effect and must be codegen'd.
+    /// Use-counts per SSA value, as counted in the input IR.
-    inst_needed: SecondaryMap<Inst, bool>,
+    value_uses: SecondaryMap<Value, u32>,
-    /// Value (vreg) is needed and producer must be codegen'd.
+    /// Actual uses of each SSA value so far, incremented while lowering.
-    vreg_needed: Vec<bool>,
+    value_lowered_uses: SecondaryMap<Value, u32>,
    /// Effectful instructions that have been sunk; they are not codegen'd at
    /// their original locations.
    inst_sunk: FxHashSet<Inst>,
    /// Next virtual register number to allocate.
    next_vreg: u32,
@@ -371,20 +386,19 @@ impl<'func, I: VCodeInst> Lower<'func, I> {
        // Compute instruction colors, find constant instructions, and find instructions with
        // side-effects, in one combined pass.
        let mut cur_color = 0;
-        let mut inst_colors = SecondaryMap::with_default(InstColor::new(0));
+        let mut block_end_colors = SecondaryMap::with_default(InstColor::new(0));
        let mut side_effect_inst_entry_colors = FxHashMap::default();
        let mut inst_constants = FxHashMap::default();
-        let mut inst_needed = SecondaryMap::with_default(false);
+        let mut value_uses = SecondaryMap::with_default(0);
        for bb in f.layout.blocks() {
            cur_color += 1;
            for inst in f.layout.block_insts(bb) {
                let side_effect = has_lowering_side_effect(f, inst);
                // Assign colors. A new color is chosen *after* any side-effecting instruction.
                inst_colors[inst] = InstColor::new(cur_color);
                debug!("bb {} inst {} has color {}", bb, inst, cur_color);
                if side_effect {
-                    debug!(" -> side-effecting");
+                    side_effect_inst_entry_colors.insert(inst, InstColor::new(cur_color));
-                    inst_needed[inst] = true;
+                    debug!(" -> side-effecting; incrementing color for next inst");
                    cur_color += 1;
                }
@@ -393,21 +407,31 @@ impl<'func, I: VCodeInst> Lower<'func, I> {
                    debug!(" -> constant: {}", c);
                    inst_constants.insert(inst, c);
                }
                // Count uses of all arguments.
                for arg in f.dfg.inst_args(inst) {
                    let arg = f.dfg.resolve_aliases(*arg);
                    value_uses[arg] += 1;
                }
            }
-        let vreg_needed = std::iter::repeat(false).take(next_vreg as usize).collect();
+            block_end_colors[bb] = InstColor::new(cur_color);
        }
        Ok(Lower {
            f,
            vcode,
            value_regs,
            retval_regs,
-            inst_colors,
+            block_end_colors,
            side_effect_inst_entry_colors,
            inst_constants,
            inst_needed,
            vreg_needed,
            next_vreg,
            value_uses,
            value_lowered_uses: SecondaryMap::default(),
            inst_sunk: FxHashSet::default(),
            cur_scan_entry_color: None,
            cur_inst: None,
            block_insts: vec![],
            block_ranges: vec![],
            bb_insts: vec![],
@@ -465,6 +489,8 @@ impl<'func, I: VCodeInst> Lower<'func, I> {
            return Ok(());
        }
        self.cur_inst = Some(inst);
        // Make up two vectors of info:
        //
        // * one for dsts which are to be assigned constants.  We'll deal with those second, so
@@ -489,15 +515,16 @@ impl<'func, I: VCodeInst> Lower<'func, I> {
            debug_assert!(ty == self.f.dfg.value_type(*dst_val));
            let dst_reg = self.value_regs[*dst_val];
-            let input = self.get_input_for_val(inst, src_val);
+            let input = self.get_value_as_source_or_const(src_val);
-            debug!("jump arg {} is {}, reg {:?}", i, src_val, input.reg);
+            debug!("jump arg {} is {}", i, src_val);
            i += 1;
            if let Some(c) = input.constant {
                debug!(" -> constant {}", c);
                const_bundles.push((ty, Writable::from_reg(dst_reg), c));
            } else {
-                self.use_input_reg(input);
+                let src_reg = self.put_value_in_reg(src_val);
-                let src_reg = input.reg;
+                debug!(" -> reg {:?}", src_reg);
                // Skip self-assignments.  Not only are they pointless, they falsely trigger the
                // overlap-check below and hence can cause a lot of unnecessary copying through
                // temporaries.
@@ -558,11 +585,27 @@ impl<'func, I: VCodeInst> Lower<'func, I> {
        Ok(())
    }
    /// Has this instruction been sunk to a use-site (i.e., away from its
    /// original location)?
    fn is_inst_sunk(&self, inst: Inst) -> bool {
        self.inst_sunk.contains(&inst)
    }
    // Is any result of this instruction needed?
    fn is_any_inst_result_needed(&self, inst: Inst) -> bool {
        self.f
            .dfg
            .inst_results(inst)
            .iter()
            .any(|&result| self.value_lowered_uses[result] > 0)
    }
    fn lower_clif_block<B: LowerBackend<MInst = I>>(
        &mut self,
        backend: &B,
        block: Block,
    ) -> CodegenResult<()> {
        self.cur_scan_entry_color = Some(self.block_end_colors[block]);
        // Lowering loop:
        // - For each non-branch instruction, in reverse order:
        //   - If side-effecting (load, store, branch/call/return, possible trap), or if
@@ -582,27 +625,41 @@ impl<'func, I: VCodeInst> Lower<'func, I> {
        // end of the BB (in the toplevel driver `lower()`).
        for inst in self.f.layout.block_insts(block).rev() {
            let data = &self.f.dfg[inst];
-            let value_needed = self
+            let has_side_effect = has_lowering_side_effect(self.f, inst);
-                .f
+            // If  inst has been sunk to another location, skip it.
-                .dfg
+            if self.is_inst_sunk(inst) {
-                .inst_results(inst)
+                continue;
-                .iter()
+            }
-                .any(|&result| self.vreg_needed[self.value_regs[result].get_index()]);
+            // Are any outputs used at least once?
            let value_needed = self.is_any_inst_result_needed(inst);
            debug!(
-                "lower_clif_block: block {} inst {} ({:?}) is_branch {} inst_needed {} value_needed {}",
+                "lower_clif_block: block {} inst {} ({:?}) is_branch {} side_effect {} value_needed {}",
                block,
                inst,
                data,
                data.opcode().is_branch(),
-                self.inst_needed[inst],
+                has_side_effect,
                value_needed,
            );
            // Skip lowering branches; these are handled separately.
            if self.f.dfg[inst].opcode().is_branch() {
                continue;
            }
-            // Normal instruction: codegen if eager bit is set. (Other instructions may also be
+
-            // codegened if not eager when they are used by another instruction.)
+            // Update scan state to color prior to this inst (as we are scanning
-            if self.inst_needed[inst] || value_needed {
+            // backward).
            self.cur_inst = Some(inst);
            if has_side_effect {
                let entry_color = *self
                    .side_effect_inst_entry_colors
                    .get(&inst)
                    .expect("every side-effecting inst should have a color-map entry");
                self.cur_scan_entry_color = Some(entry_color);
            }
            // Normal instruction: codegen if the instruction is side-effecting
            // or any of its outputs its used.
            if has_side_effect || value_needed {
                debug!("lowering: inst {}: {:?}", inst, self.f.dfg[inst]);
                backend.lower(self, inst)?;
            }
@@ -620,6 +677,7 @@ impl<'func, I: VCodeInst> Lower<'func, I> {
            let loc = self.srcloc(inst);
            self.finish_ir_inst(loc);
        }
        self.cur_scan_entry_color = None;
        Ok(())
    }
@@ -667,6 +725,9 @@ impl<'func, I: VCodeInst> Lower<'func, I> {
            "lower_clif_branches: block {} branches {:?} targets {:?}",
            block, branches, targets,
        );
        // When considering code-motion opportunities, consider the current
        // program point to be the first branch.
        self.cur_inst = Some(branches[0]);
        backend.lower_branch_group(self, branches, targets)?;
        let loc = self.srcloc(branches[0]);
        self.finish_ir_inst(loc);
@@ -786,60 +847,91 @@ impl<'func, I: VCodeInst> Lower<'func, I> {
        Ok((vcode, stack_map_info))
    }
-    /// Get the actual inputs for a value. This is the implementation for
+    fn put_value_in_reg(&mut self, val: Value) -> Reg {
-    /// `get_input()` but starting from the SSA value, which is not exposed to
+        debug!("put_value_in_reg: val {}", val,);
    /// the backend.
    fn get_input_for_val(&self, at_inst: Inst, val: Value) -> LowerInput {
        debug!("get_input_for_val: val {} at inst {}", val, at_inst);
        let mut reg = self.value_regs[val];
        debug!(" -> reg {:?}", reg);
        assert!(reg.is_valid());
-        let mut inst = match self.f.dfg.value_def(val) {
+
-            // OK to merge source instruction if (i) we have a source
+        self.value_lowered_uses[val] += 1;
            // instruction, and either (ii-a) it has no side effects, or (ii-b)
            // it has the same color as this instruction.
            ValueDef::Result(src_inst, result_idx) => {
                debug!(" -> src inst {}", src_inst);
                debug!(
                    " -> has lowering side effect: {}",
                    has_lowering_side_effect(self.f, src_inst)
                );
                debug!(
                    " -> our color is {:?}, src inst is {:?}",
                    self.inst_color(at_inst),
                    self.inst_color(src_inst)
                );
                if !has_lowering_side_effect(self.f, src_inst)
                    || self.inst_color(at_inst) == self.inst_color(src_inst)
                {
                    Some((src_inst, result_idx))
                } else {
                    None
                }
            }
            _ => None,
        };
        let constant = inst.and_then(|(inst, _)| self.get_constant(inst));
        // Pinned-reg hack: if backend specifies a fixed pinned register, use it
        // directly when we encounter a GetPinnedReg op, rather than lowering
        // the actual op, and do not return the source inst to the caller; the
        // value comes "out of the ether" and we will not force generation of
        // the superfluous move.
-        if let Some((i, _)) = inst {
+        if let ValueDef::Result(i, 0) = self.f.dfg.value_def(val) {
            if self.f.dfg[i].opcode() == Opcode::GetPinnedReg {
                if let Some(pr) = self.pinned_reg {
                    reg = pr;
                }
                inst = None;
            }
        }
-        LowerInput {
+        reg
            reg,
            inst,
            constant,
    }
    /// Get the actual inputs for a value. This is the implementation for
    /// `get_input()` but starting from the SSA value, which is not exposed to
    /// the backend.
    fn get_value_as_source_or_const(&self, val: Value) -> NonRegInput {
        debug!(
            "get_input_for_val: val {} at cur_inst {:?} cur_scan_entry_color {:?}",
            val, self.cur_inst, self.cur_scan_entry_color,
        );
        let inst = match self.f.dfg.value_def(val) {
            // OK to merge source instruction if (i) we have a source
            // instruction, and:
            // - It has no side-effects, OR
            // - It has a side-effect, has one output value, that one output has
            //   only one use (this one), and the instruction's color is *one less
            //   than* the current scan color.
            //
            //   This latter set of conditions is testing whether a
            //   side-effecting instruction can sink to the current scan
            //   location; this is possible if the in-color of this inst is
            //   equal to the out-color of the producing inst, so no other
            //   side-effecting ops occur between them (which will only be true
            //   if they are in the same BB, because color increments at each BB
            //   start).
            //
            //   If it is actually sunk, then in `merge_inst()`, we update the
            //   scan color so that as we scan over the range past which the
            //   instruction was sunk, we allow other instructions (that came
            //   prior to the sunk instruction) to sink.
            ValueDef::Result(src_inst, result_idx) => {
                let src_side_effect = has_lowering_side_effect(self.f, src_inst);
                debug!(" -> src inst {}", src_inst);
                debug!(" -> has lowering side effect: {}", src_side_effect);
                if !src_side_effect {
                    // Pure instruction: always possible to sink.
                    Some((src_inst, result_idx))
                } else {
                    // Side-effect: test whether this is the only use of the
                    // only result of the instruction, and whether colors allow
                    // the code-motion.
                    if self.cur_scan_entry_color.is_some()
                        && self.value_uses[val] == 1
                        && self.num_outputs(src_inst) == 1
                        && self
                            .side_effect_inst_entry_colors
                            .get(&src_inst)
                            .unwrap()
                            .get()
                            + 1
                            == self.cur_scan_entry_color.unwrap().get()
                    {
                        Some((src_inst, 0))
                    } else {
                        None
                    }
                }
            }
            _ => None,
        };
        let constant = inst.and_then(|(inst, _)| self.get_constant(inst));
        NonRegInput { inst, constant }
    }
 }
@@ -928,10 +1020,6 @@ impl<'func, I: VCodeInst> LowerCtx for Lower<'func, I> {
        self.f.srclocs[ir_inst]
    }
    fn inst_color(&self, ir_inst: Inst) -> InstColor {
        self.inst_colors[ir_inst]
    }
    fn num_inputs(&self, ir_inst: Inst) -> usize {
        self.f.dfg.inst_args(ir_inst).len()
    }
@@ -954,10 +1042,16 @@ impl<'func, I: VCodeInst> LowerCtx for Lower<'func, I> {
        self.inst_constants.get(&ir_inst).cloned()
    }
-    fn get_input(&self, ir_inst: Inst, idx: usize) -> LowerInput {
+    fn get_input_as_source_or_const(&self, ir_inst: Inst, idx: usize) -> NonRegInput {
        let val = self.f.dfg.inst_args(ir_inst)[idx];
        let val = self.f.dfg.resolve_aliases(val);
-        self.get_input_for_val(ir_inst, val)
+        self.get_value_as_source_or_const(val)
    }
    fn put_input_in_reg(&mut self, ir_inst: Inst, idx: usize) -> Reg {
        let val = self.f.dfg.inst_args(ir_inst)[idx];
        let val = self.f.dfg.resolve_aliases(val);
        self.put_value_in_reg(val)
    }
    fn get_output(&self, ir_inst: Inst, idx: usize) -> Writable<Reg> {
@@ -989,17 +1083,19 @@ impl<'func, I: VCodeInst> LowerCtx for Lower<'func, I> {
        });
    }
-    fn use_input_reg(&mut self, input: LowerInput) {
+    fn sink_inst(&mut self, ir_inst: Inst) {
-        debug!("use_input_reg: vreg {:?} is needed", input.reg);
+        assert!(has_lowering_side_effect(self.f, ir_inst));
-        // We may directly return a real (machine) register when we know that register holds the
+        assert!(self.cur_scan_entry_color.is_some());
        // result of an opcode (e.g. GetPinnedReg).
        if input.reg.is_virtual() {
            self.vreg_needed[input.reg.get_index()] = true;
        }
    }
-    fn is_reg_needed(&self, ir_inst: Inst, reg: Reg) -> bool {
+        let sunk_inst_entry_color = self
-        self.inst_needed[ir_inst] || self.vreg_needed[reg.get_index()]
+            .side_effect_inst_entry_colors
            .get(&ir_inst)
            .cloned()
            .unwrap();
        let sunk_inst_exit_color = InstColor::new(sunk_inst_entry_color.get() + 1);
        assert!(sunk_inst_exit_color == self.cur_scan_entry_color.unwrap());
        self.cur_scan_entry_color = Some(sunk_inst_entry_color);
        self.inst_sunk.insert(ir_inst);
    }
    fn get_constant_data(&self, constant_handle: Constant) -> &ConstantData {