diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs index 2d5ecd406d..aa6727b978 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs @@ -352,6 +352,13 @@ impl MachInstEmit for Inst { type State = EmitState; fn emit(&self, sink: &mut MachBuffer, flags: &settings::Flags, state: &mut EmitState) { + // N.B.: we *must* not exceed the "worst-case size" used to compute + // where to insert islands, except when islands are explicitly triggered + // (with an `EmitIsland`). We check this in debug builds. This is `mut` + // to allow disabling the check for `JTSequence`, which is always + // emitted following an `EmitIsland`. + let mut start_off = sink.cur_offset(); + match self { &Inst::AluRRR { alu_op, rd, rn, rm } => { let top11 = match alu_op { @@ -1307,6 +1314,10 @@ impl MachInstEmit for Inst { LabelUse::PCRel32, ); } + + // Lowering produces an EmitIsland before using a JTSequence, so we can safely + // disable the worst-case-size check in this case. + start_off = sink.cur_offset(); } &Inst::LoadConst64 { rd, const_data } => { let inst = Inst::ULoad64 { @@ -1418,5 +1429,8 @@ impl MachInstEmit for Inst { } } } + + let end_off = sink.cur_offset(); + debug_assert!((end_off - start_off) <= Inst::worst_case_size()); } } diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs index 714ba1eb4d..fd910522c5 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs @@ -657,6 +657,15 @@ pub enum Inst { /// A one-way conditional branch, invisible to the CFG processing; used *only* as part of /// straight-line sequences in code to be emitted. + /// + /// In more detail: + /// - This branch is lowered to a branch at the machine-code level, but does not end a basic + /// block, and does not create edges in the CFG seen by regalloc. + /// - Thus, it is *only* valid to use as part of a single-in, single-out sequence that is + /// lowered from a single CLIF instruction. For example, certain arithmetic operations may + /// use these branches to handle certain conditions, such as overflows, traps, etc. + /// + /// See, e.g., the lowering of `trapif` (conditional trap) for an example. OneWayCondBr { target: BranchTarget, kind: CondBrKind, @@ -678,7 +687,7 @@ pub enum Inst { trap_info: (SourceLoc, TrapCode), }, - /// Load the address (using a PC-relative offset) of a memory location, using the `ADR` + /// Compute the address (using a PC-relative offset) of a memory location, using the `ADR` /// instruction. Note that we take a simple offset, not a `MemLabel`, here, because `Adr` is /// only used for now in fixed lowering sequences with hardcoded offsets. In the future we may /// need full `MemLabel` support. @@ -734,9 +743,26 @@ pub enum Inst { offset: i64, }, - /// Meta-insn, no-op in generated code: emit constant/branch veneer island at this point (with - /// a guard jump around it) if less than the needed space is available before the next branch - /// deadline. + /// Meta-insn, no-op in generated code: emit constant/branch veneer island + /// at this point (with a guard jump around it) if less than the needed + /// space is available before the next branch deadline. See the `MachBuffer` + /// implementation in `machinst/buffer.rs` for the overall algorithm. 
In + /// brief, we retain a set of "pending/unresolved label references" from + /// branches as we scan forward through instructions to emit machine code; + /// if we notice we're about to go out of range on an unresolved reference, + /// we stop, emit a bunch of "veneers" (branches in a form that has a longer + /// range, e.g. a 26-bit-offset unconditional jump), and point the original + /// label references to those. This is an "island" because it comes in the + /// middle of the code. + /// + /// This meta-instruction is a necessary part of the logic that determines + /// where to place islands. Ordinarily, we want to place them between basic + /// blocks, so we compute the worst-case size of each block, and emit the + /// island before starting a block if we would exceed a deadline before the + /// end of the block. However, some sequences (such as an inline jumptable) + /// are variable-length and not accounted for by this logic; so these + /// lowered sequences include an `EmitIsland` to trigger island generation + /// where necessary. EmitIsland { /// The needed space before the next deadline. needed_space: CodeOffset, @@ -1770,6 +1796,18 @@ impl MachInst for Inst { )); ret } else { + // Must be an integer type. + debug_assert!( + ty == B1 + || ty == I8 + || ty == B8 + || ty == I16 + || ty == B16 + || ty == I32 + || ty == B32 + || ty == I64 + || ty == B64 + ); Inst::load_constant(to_reg, value) } } @@ -2601,7 +2639,8 @@ pub enum LabelUse { /// 21-bit offset for ADR (get address of label). PC-rel, offset is not shifted. Immediate is /// 21 signed bits, with high 19 bits in bits 23:5 and low 2 bits in bits 30:29. Adr21, - /// 32-bit PC relative constant offset (from address of constant itself). Used in jump tables. + /// 32-bit PC relative constant offset (from address of constant itself), + /// signed. Used in jump tables. PCRel32, } diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs index d1368a3d97..eb4aafd551 100644 --- a/cranelift/codegen/src/isa/aarch64/lower.rs +++ b/cranelift/codegen/src/isa/aarch64/lower.rs @@ -188,7 +188,7 @@ pub(crate) fn input_to_reg>( let inputs = ctx.get_input(input.insn, input.input); let in_reg = if let Some(c) = inputs.constant { // Generate constants fresh at each use to minimize long-range register pressure. - let to_reg = ctx.tmp(Inst::rc_for_type(ty).unwrap(), ty); + let to_reg = ctx.alloc_tmp(Inst::rc_for_type(ty).unwrap(), ty); for inst in Inst::gen_constant(to_reg, c, ty).into_iter() { ctx.emit(inst); } @@ -201,7 +201,7 @@ pub(crate) fn input_to_reg>( match (narrow_mode, from_bits) { (NarrowValueMode::None, _) => in_reg, (NarrowValueMode::ZeroExtend32, n) if n < 32 => { - let tmp = ctx.tmp(RegClass::I64, I32); + let tmp = ctx.alloc_tmp(RegClass::I64, I32); ctx.emit(Inst::Extend { rd: tmp, rn: in_reg, @@ -212,7 +212,7 @@ pub(crate) fn input_to_reg>( tmp.to_reg() } (NarrowValueMode::SignExtend32, n) if n < 32 => { - let tmp = ctx.tmp(RegClass::I64, I32); + let tmp = ctx.alloc_tmp(RegClass::I64, I32); ctx.emit(Inst::Extend { rd: tmp, rn: in_reg, @@ -229,7 +229,7 @@ pub(crate) fn input_to_reg>( // Constants are zero-extended to full 64-bit width on load already. 
in_reg } else { - let tmp = ctx.tmp(RegClass::I64, I32); + let tmp = ctx.alloc_tmp(RegClass::I64, I32); ctx.emit(Inst::Extend { rd: tmp, rn: in_reg, @@ -241,7 +241,7 @@ pub(crate) fn input_to_reg>( } } (NarrowValueMode::SignExtend64, n) if n < 64 => { - let tmp = ctx.tmp(RegClass::I64, I32); + let tmp = ctx.alloc_tmp(RegClass::I64, I32); ctx.emit(Inst::Extend { rd: tmp, rn: in_reg, @@ -529,7 +529,7 @@ pub(crate) fn lower_address>( } // Otherwise, generate add instructions. - let addr = ctx.tmp(RegClass::I64, I64); + let addr = ctx.alloc_tmp(RegClass::I64, I64); // Get the const into a reg. lower_constant_u64(ctx, addr.clone(), offset as u64); @@ -541,7 +541,7 @@ pub(crate) fn lower_address>( // In an addition, the stack register is the zero register, so divert it to another // register just before doing the actual add. let reg = if reg == stack_reg() { - let tmp = ctx.tmp(RegClass::I64, I64); + let tmp = ctx.alloc_tmp(RegClass::I64, I64); ctx.emit(Inst::Mov { rd: tmp, rm: stack_reg(), diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs index 8692d853de..a92dea7a7b 100644 --- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs +++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs @@ -84,8 +84,8 @@ pub(crate) fn lower_insn_to_regs>( } else { VecALUOp::UQAddScalar }; - let va = ctx.tmp(RegClass::V128, I128); - let vb = ctx.tmp(RegClass::V128, I128); + let va = ctx.alloc_tmp(RegClass::V128, I128); + let vb = ctx.alloc_tmp(RegClass::V128, I128); let ra = input_to_reg(ctx, inputs[0], narrow_mode); let rb = input_to_reg(ctx, inputs[1], narrow_mode); let rd = output_to_reg(ctx, outputs[0]); @@ -115,8 +115,8 @@ pub(crate) fn lower_insn_to_regs>( } else { VecALUOp::UQSubScalar }; - let va = ctx.tmp(RegClass::V128, I128); - let vb = ctx.tmp(RegClass::V128, I128); + let va = ctx.alloc_tmp(RegClass::V128, I128); + let vb = ctx.alloc_tmp(RegClass::V128, I128); let ra = input_to_reg(ctx, inputs[0], narrow_mode); let rb = input_to_reg(ctx, inputs[1], narrow_mode); let rd = output_to_reg(ctx, outputs[0]); @@ -498,7 +498,7 @@ pub(crate) fn lower_insn_to_regs>( // ignored (because of the implicit masking done by the instruction), // so this is equivalent to negating the input. let alu_op = choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64); - let tmp = ctx.tmp(RegClass::I64, ty); + let tmp = ctx.alloc_tmp(RegClass::I64, ty); ctx.emit(Inst::AluRRR { alu_op, rd: tmp, @@ -521,7 +521,7 @@ pub(crate) fn lower_insn_to_regs>( // Really ty_bits_size - rn, but the upper bits of the result are // ignored (because of the implicit masking done by the instruction), // so this is equivalent to negating the input. - let tmp = ctx.tmp(RegClass::I64, I32); + let tmp = ctx.alloc_tmp(RegClass::I64, I32); ctx.emit(Inst::AluRRR { alu_op: ALUOp::Sub32, rd: tmp, @@ -534,7 +534,7 @@ pub(crate) fn lower_insn_to_regs>( }; // Explicitly mask the rotation count. 
- let tmp_masked_rm = ctx.tmp(RegClass::I64, I32); + let tmp_masked_rm = ctx.alloc_tmp(RegClass::I64, I32); ctx.emit(Inst::AluRRImmLogic { alu_op: ALUOp::And32, rd: tmp_masked_rm, @@ -543,8 +543,8 @@ pub(crate) fn lower_insn_to_regs>( }); let tmp_masked_rm = tmp_masked_rm.to_reg(); - let tmp1 = ctx.tmp(RegClass::I64, I32); - let tmp2 = ctx.tmp(RegClass::I64, I32); + let tmp1 = ctx.alloc_tmp(RegClass::I64, I32); + let tmp2 = ctx.alloc_tmp(RegClass::I64, I32); ctx.emit(Inst::AluRRImm12 { alu_op: ALUOp::Sub32, rd: tmp1, @@ -583,7 +583,7 @@ pub(crate) fn lower_insn_to_regs>( } immshift.imm &= ty_bits_size - 1; - let tmp1 = ctx.tmp(RegClass::I64, I32); + let tmp1 = ctx.alloc_tmp(RegClass::I64, I32); ctx.emit(Inst::AluRRImmShift { alu_op: ALUOp::Lsr32, rd: tmp1, @@ -688,7 +688,7 @@ pub(crate) fn lower_insn_to_regs>( // and fix the sequence below to work properly for this. let narrow_mode = NarrowValueMode::ZeroExtend64; let rn = input_to_reg(ctx, inputs[0], narrow_mode); - let tmp = ctx.tmp(RegClass::I64, I64); + let tmp = ctx.alloc_tmp(RegClass::I64, I64); // If this is a 32-bit Popcnt, use Lsr32 to clear the top 32 bits of the register, then // the rest of the code is identical to the 64-bit version. @@ -997,7 +997,7 @@ pub(crate) fn lower_insn_to_regs>( } Opcode::Bitselect => { - let tmp = ctx.tmp(RegClass::I64, I64); + let tmp = ctx.alloc_tmp(RegClass::I64, I64); let rd = output_to_reg(ctx, outputs[0]); let rcond = input_to_reg(ctx, inputs[0], NarrowValueMode::None); let rn = input_to_reg(ctx, inputs[1], NarrowValueMode::None); @@ -1475,8 +1475,8 @@ pub(crate) fn lower_insn_to_regs>( let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); let rm = input_to_reg(ctx, inputs[1], NarrowValueMode::None); let rd = output_to_reg(ctx, outputs[0]); - let tmp1 = ctx.tmp(RegClass::I64, I64); - let tmp2 = ctx.tmp(RegClass::I64, I64); + let tmp1 = ctx.alloc_tmp(RegClass::I64, I64); + let tmp2 = ctx.alloc_tmp(RegClass::I64, I64); ctx.emit(Inst::MovFromVec64 { rd: tmp1, rn: rn }); ctx.emit(Inst::MovFromVec64 { rd: tmp2, rn: rm }); let imml = if bits == 32 { @@ -1546,7 +1546,7 @@ pub(crate) fn lower_insn_to_regs>( let trap_info = (ctx.srcloc(insn), TrapCode::BadConversionToInteger); ctx.emit(Inst::Udf { trap_info }); - let tmp = ctx.tmp(RegClass::V128, I128); + let tmp = ctx.alloc_tmp(RegClass::V128, I128); // Check that the input is in range, with "truncate towards zero" semantics. This means // we allow values that are slightly out of range: @@ -1712,8 +1712,8 @@ pub(crate) fn lower_insn_to_regs>( _ => unreachable!(), }; - let rtmp1 = ctx.tmp(RegClass::V128, in_ty); - let rtmp2 = ctx.tmp(RegClass::V128, in_ty); + let rtmp1 = ctx.alloc_tmp(RegClass::V128, in_ty); + let rtmp2 = ctx.alloc_tmp(RegClass::V128, in_ty); if in_bits == 32 { ctx.emit(Inst::LoadFpuConst32 { @@ -2072,7 +2072,9 @@ pub(crate) fn lower_branch>( Opcode::BrTable => { // Expand `br_table index, default, JT` to: // - // (emit island with guard jump if needed) + // emit_island // this forces an island at this point + // // if the jumptable would push us past + // // the deadline // subs idx, #jt_size // b.hs default // adr vTmp1, PC+16 @@ -2096,8 +2098,8 @@ pub(crate) fn lower_branch>( NarrowValueMode::ZeroExtend32, ); - let rtmp1 = ctx.tmp(RegClass::I64, I32); - let rtmp2 = ctx.tmp(RegClass::I64, I32); + let rtmp1 = ctx.alloc_tmp(RegClass::I64, I32); + let rtmp2 = ctx.alloc_tmp(RegClass::I64, I32); // Bounds-check and branch to default. 
if let Some(imm12) = Imm12::maybe_from_u64(jt_size as u64) { diff --git a/cranelift/codegen/src/machinst/blockorder.rs b/cranelift/codegen/src/machinst/blockorder.rs index 104b2f8c15..1052d83858 100644 --- a/cranelift/codegen/src/machinst/blockorder.rs +++ b/cranelift/codegen/src/machinst/blockorder.rs @@ -3,12 +3,54 @@ //! This module handles the translation from CLIF BBs to VCode BBs. //! //! The basic idea is that we compute a sequence of "lowered blocks" that -//! correspond to subgraphs of the CLIF CFG plus an implicit block on *every* -//! edge (not just critical edges). Conceptually, the lowering pipeline wants to -//! insert moves for phi-nodes on every block-to-block transfer; these blocks -//! always conceptually exist, but may be merged with an "original" CLIF block -//! (and hence not actually exist; this is equivalent to inserting the blocks -//! only on critical edges). +//! correspond to one or more blocks in the graph: (CLIF CFG) `union` (implicit +//! block on *every* edge). Conceptually, the lowering pipeline wants to insert +//! moves for phi-nodes on every block-to-block transfer; these blocks always +//! conceptually exist, but may be merged with an "original" CLIF block (and +//! hence not actually exist; this is equivalent to inserting the blocks only on +//! critical edges). +//! +//! In other words, starting from a CFG like this (where each "CLIF block" and +//! "(edge N->M)" is a separate basic block): +//! +//! ```plain +//! +//! CLIF block 0 +//! / \ +//! (edge 0->1) (edge 0->2) +//! | | +//! CLIF block 1 CLIF block 2 +//! \ / +//! (edge 1->3) (edge 2->3) +//! \ / +//! CLIF block 3 +//! ``` +//! +//! We can produce a CFG of lowered blocks like so: +//! +//! ```plain +//! +--------------+ +//! | CLIF block 0 | +//! +--------------+ +//! / \ +//! +--------------+ +--------------+ +//! | (edge 0->1) | |(edge 0->2) | +//! | CLIF block 1 | | CLIF block 2 | +//! +--------------+ +--------------+ +//! \ / +//! +-----------+ +-----------+ +//! |(edge 1->3)| |(edge 2->3)| +//! +-----------+ +-----------+ +//! \ / +//! +------------+ +//! |CLIF block 3| +//! +------------+ +//! ``` +//! +//! (note that the edges into CLIF blocks 1 and 2 could be merged with those +//! blocks' original bodies, but the out-edges could not because for simplicity +//! in the successor-function definition, we only ever merge an edge onto one +//! side of an original CLIF block.) //! //! Each `LoweredBlock` names just an original CLIF block, an original CLIF //! block prepended or appended with an edge block (never both, though), or just @@ -23,6 +65,9 @@ //! have content, because this computation happens as part of lowering *before* //! regalloc, and regalloc may or may not insert moves/spills/reloads on any //! particular edge. But it works relatively well and is conceptually simple. +//! Furthermore, the [MachBuffer] machine-code sink performs final peephole-like +//! branch editing that in practice elides empty blocks and simplifies some of +//! the other redundancies that this scheme produces. use crate::entity::SecondaryMap; use crate::fx::{FxHashMap, FxHashSet}; diff --git a/cranelift/codegen/src/machinst/buffer.rs b/cranelift/codegen/src/machinst/buffer.rs index b9e3bb3c1e..cb7564f258 100644 --- a/cranelift/codegen/src/machinst/buffer.rs +++ b/cranelift/codegen/src/machinst/buffer.rs @@ -1,12 +1,116 @@ //! In-memory representation of compiled machine code, with labels and fixups to //! refer to those labels. Handles constant-pool island insertion and also //! 
veneer insertion for out-of-range jumps. +//! +//! This code exists to solve three problems: +//! +//! - Branch targets for forward branches are not known until later, when we +//! emit code in a single pass through the instruction structs. +//! +//! - On many architectures, address references or offsets have limited range. +//! For example, on AArch64, conditional branches can only target code +/- 1MB +//! from the branch itself. +//! +//! - The lowering of control flow from the CFG-with-edges produced by +//! [BlockLoweringOrder], combined with many empty edge blocks when the register +//! allocator does not need to insert any spills/reloads/moves in edge blocks, +//! results in many suboptimal branch patterns. The lowering also pays no +//! attention to block order, and so two-target conditional forms (cond-br +//! followed by uncond-br) can often be avoided because one of the targets is +//! the fallthrough. There are several cases here where we can simplify to use +//! fewer branches. +//! +//! This "buffer" implements a single-pass code emission strategy (with a later +//! "fixup" pass, but only through recorded fixups, not all instructions). The +//! basic idea is: +//! +//! - Emit branches as they are, including two-target (cond/uncond) compound +//! forms, but with zero offsets and optimistically assuming the target will be +//! in range. Record the "fixup" for later. Targets are denoted instead by +//! symbolic "labels" that are then bound to certain offsets in the buffer as +//! we emit code. (Nominally, there is a label at the start of every basic +//! block.) +//! +//! - As we do this, track the offset in the buffer at which the first label +//! reference "goes out of range". We call this the "deadline". If we reach the +//! deadline and we still have not bound the label to which an unresolved branch +//! refers, we have a problem! +//! +//! - To solve this problem, we emit "islands" full of "veneers". An island is +//! simply a chunk of code inserted in the middle of the code actually produced +//! by the emitter (e.g., vcode iterating over instruction structs). The emitter +//! has some awareness of this: it either asks for an island between blocks, so +//! it is not accidentally executed, or else it emits a branch around the island +//! when all other options fail (see [Inst::EmitIsland] meta-instruction). +//! +//! - A "veneer" is an instruction (or sequence of instructions) in an "island" +//! that implements a longer-range reference to a label. The idea is that, for +//! example, a branch with a limited range can branch to a "veneer" instead, +//! which is simply a branch in a form that can use a longer-range reference. On +//! AArch64, for example, conditionals have a +/- 1 MB range, but a conditional +//! can branch to an unconditional branch which has a +/- 128 MB range. Hence, a +//! conditional branch's label reference can be fixed up with a "veneer" to +//! achieve a longer range. +//! +//! - To implement all of this, we require the backend to provide a `LabelUse` +//! type that implements a trait. This is nominally an enum that records one of +//! several kinds of references to an offset in code -- basically, a relocation +//! type -- and will usually correspond to different instruction formats. The +//! `LabelUse` implementation specifies the maximum range, how to patch in the +//! actual label location when known, and how to generate a veneer to extend the +//! range. +//! +//! That satisfies label references, but we still may have suboptimal branch +//!
patterns. To clean up the branches, we do a simple "peephole"-style +//! optimization on the fly. To do so, the emitter (e.g., `Inst::emit()`) +//! informs the buffer of branches in the code and, in the case of conditionals, +//! the code that would have been emitted to invert this branch's condition. We +//! track the "latest branches": these are branches that are contiguous up to +//! the current offset. (If any code is emitted after a branch, that branch or +//! run of contiguous branches is no longer "latest".) The latest branches are +//! those that we can edit by simply truncating the buffer and doing something +//! else instead. +//! +//! To optimize branches, we implement several simple rules, and try to apply +//! them to the "latest branches" when possible: +//! +//! - A branch with a label target, when that label is bound to the ending +//! offset of the branch (the fallthrough location), can be removed altogether, +//! because the branch would have no effect. +//! +//! - An unconditional branch that starts at a label location, and branches to +//! another label, results in a "label alias": all references to the label bound +//! *to* this branch instruction are instead resolved to the *target* of the +//! branch instruction. This effectively removes empty blocks that just +//! unconditionally branch to the next block. We call this "branch threading". +//! +//! - A conditional followed by an unconditional, when the conditional branches +//! to the unconditional's fallthrough, results in (i) the truncation of the +//! unconditional, (ii) the inversion of the conditional's condition, and (iii) +//! replacement of the conditional's target (using the original target of the +//! unconditional). This is a fancy way of saying "we can flip a two-target +//! conditional branch's taken/not-taken targets if it works better with our +//! fallthrough". To make this work, the emitter actually gives the buffer +//! *both* forms of every conditional branch: the true form is emitted into the +//! buffer, and the "inverted" machine-code bytes are provided as part of the +//! branch-fixup metadata. +//! +//! - An unconditional B preceded by another unconditional P, when B's label(s) have +//! been redirected to target(B), can be removed entirely. This is an extension +//! of the branch-threading optimization, and is valid because if we know there +//! will be no fallthrough into this branch instruction (the prior instruction +//! is an unconditional jump), and if we know we have successfully redirected +//! all labels, then this branch instruction is unreachable. Note that this +//! works because the redirection happens before the label is ever resolved +//! (fixups happen at island emission time, at which point latest-branches are +//! cleared, or at the end of emission), so we are sure to catch and redirect +//! all possible paths to this instruction. use crate::binemit::{Addend, CodeOffset, CodeSink, Reloc}; use crate::ir::{ExternalName, Opcode, SourceLoc, TrapCode}; use crate::machinst::{BlockIndex, MachInstLabelUse, VCodeInst}; -use log::debug; +use log::trace; use smallvec::SmallVec; use std::mem; @@ -35,10 +139,11 @@ pub struct MachBuffer { cur_srcloc: Option<(CodeOffset, SourceLoc)>, /// Known label offsets; `UNKNOWN_LABEL_OFFSET` if unknown. label_offsets: SmallVec<[CodeOffset; 16]>, - /// Label aliases: one label points to an unconditional jump to another - /// label, so references to the first should be resolved as references - /// to the second.
(We don't chase arbitrarily deep to avoid problems - /// with cycles.) + /// Label aliases: when one label points to an unconditional jump, and that + /// jump points to another label, we can redirect references to the first + /// label immediately to the second. (We don't chase arbitrarily deep to + /// avoid problems with cycles, but rather only one level, i.e. through one + /// jump.) label_aliases: SmallVec<[MachLabel; 16]>, /// Constants that must be emitted at some point. pending_constants: SmallVec<[MachLabelConstant; 16]>, @@ -129,13 +234,13 @@ impl MachBuffer { /// Add a byte. pub fn put1(&mut self, value: u8) { - debug!("MachBuffer: put byte @ {}: {:x}", self.cur_offset(), value); + trace!("MachBuffer: put byte @ {}: {:x}", self.cur_offset(), value); self.data.push(value); } /// Add 2 bytes. pub fn put2(&mut self, value: u16) { - debug!( + trace!( "MachBuffer: put 16-bit word @ {}: {:x}", self.cur_offset(), value @@ -146,7 +251,7 @@ impl MachBuffer { /// Add 4 bytes. pub fn put4(&mut self, value: u32) { - debug!( + trace!( "MachBuffer: put 32-bit word @ {}: {:x}", self.cur_offset(), value @@ -157,7 +262,7 @@ impl MachBuffer { /// Add 8 bytes. pub fn put8(&mut self, value: u64) { - debug!( + trace!( "MachBuffer: put 64-bit word @ {}: {:x}", self.cur_offset(), value @@ -168,7 +273,7 @@ impl MachBuffer { /// Add a slice of bytes. pub fn put_data(&mut self, data: &[u8]) { - debug!( + trace!( "MachBuffer: put data @ {}: len {}", self.cur_offset(), data.len() @@ -178,7 +283,7 @@ impl MachBuffer { /// Reserve appended space and return a mutable slice referring to it. pub fn get_appended_space(&mut self, len: usize) -> &mut [u8] { - debug!("MachBuffer: put data @ {}: len {}", self.cur_offset(), len); + trace!("MachBuffer: put data @ {}: len {}", self.cur_offset(), len); let off = self.data.len(); let new_len = self.data.len() + len; self.data.resize(new_len, 0); @@ -187,7 +292,7 @@ impl MachBuffer { /// Align up to the given alignment. pub fn align_to(&mut self, align_to: CodeOffset) { - debug!("MachBuffer: align to {}", align_to); + trace!("MachBuffer: align to {}", align_to); assert!(align_to.is_power_of_two()); while self.cur_offset() & (align_to - 1) != 0 { self.put1(0); @@ -200,13 +305,13 @@ impl MachBuffer { let l = self.label_offsets.len() as u32; self.label_offsets.push(UNKNOWN_LABEL_OFFSET); self.label_aliases.push(UNKNOWN_LABEL); - debug!("MachBuffer: new label -> {:?}", MachLabel(l)); + trace!("MachBuffer: new label -> {:?}", MachLabel(l)); MachLabel(l) } /// Reserve the first N MachLabels for blocks. pub fn reserve_labels_for_blocks(&mut self, blocks: BlockIndex) { - debug!("MachBuffer: first {} labels are for blocks", blocks); + trace!("MachBuffer: first {} labels are for blocks", blocks); debug_assert!(self.label_offsets.is_empty()); self.label_offsets .resize(blocks as usize, UNKNOWN_LABEL_OFFSET); @@ -215,7 +320,7 @@ impl MachBuffer { /// Bind a label to the current offset. pub fn bind_label(&mut self, label: MachLabel) { - debug!( + trace!( "MachBuffer: bind label {:?} at offset {}", label, self.cur_offset() @@ -244,9 +349,11 @@ impl MachBuffer { /// happen immediately, the buffer must already contain bytes at `offset` up /// to `offset + kind.patch_size()`. 
pub fn use_label_at_offset(&mut self, offset: CodeOffset, label: MachLabel, kind: I::LabelUse) { - debug!( + trace!( "MachBuffer: use_label_at_offset: offset {} label {:?} kind {:?}", - offset, label, kind + offset, + label, + kind ); debug_assert!(offset + kind.patch_size() <= self.cur_offset()); @@ -310,14 +417,15 @@ impl MachBuffer { self.data.truncate(b.start as usize); self.fixup_records.truncate(b.fixup); let cur_off = self.cur_offset(); - debug!( + trace!( "truncate_last_branch: truncated {:?}; off now {}", - b, cur_off + b, + cur_off ); for &mut (l, ref mut off) in self.labels_by_offset.iter_mut().rev() { if *off > cur_off { *off = cur_off; - debug!(" -> label {:?} reassigned to {}", l, cur_off); + trace!(" -> label {:?} reassigned to {}", l, cur_off); self.label_offsets[l.0 as usize] = cur_off; } else { break; @@ -326,13 +434,15 @@ impl MachBuffer { } fn optimize_branches(&mut self) { - debug!( + trace!( "enter optimize_branches:\n b = {:?}\n l = {:?}\n f = {:?}", - self.latest_branches, self.labels_by_offset, self.fixup_records + self.latest_branches, + self.labels_by_offset, + self.fixup_records ); while let Some(b) = self.latest_branches.last() { let cur_off = self.cur_offset(); - debug!("optimize_branches: last branch {:?} at off {}", b, cur_off); + trace!("optimize_branches: last branch {:?} at off {}", b, cur_off); // If there has been any code emission since the end of the last branch or // label definition, then there's nothing we can edit (because we // don't move code once placed, only back up and overwrite), so @@ -359,11 +469,11 @@ impl MachBuffer { // Set any label equal to current branch's start as an alias of // the branch's target. for &(l, off) in self.labels_by_offset.iter().rev() { - debug!(" -> uncond: latest label {:?} at off {}", l, off); + trace!(" -> uncond: latest label {:?} at off {}", l, off); if off > b.start { continue; } else if off == b.start { - debug!(" -> setting alias to {:?}", b.target); + trace!(" -> setting alias to {:?}", b.target); self.label_aliases[l.0 as usize] = b.target; } else { break; @@ -375,12 +485,12 @@ impl MachBuffer { // Examine any immediately preceding branch. if self.latest_branches.len() > 1 { let prev_b = &self.latest_branches[self.latest_branches.len() - 2]; - debug!(" -> more than one branch; prev_b = {:?}", prev_b); + trace!(" -> more than one branch; prev_b = {:?}", prev_b); // This uncond is immediately after another uncond; we've // already redirected labels to this uncond away; so we can // truncate this uncond. 
if prev_b.is_uncond() && prev_b.end == b.start { - debug!(" -> uncond follows another uncond; truncating"); + trace!(" -> uncond follows another uncond; truncating"); self.truncate_last_branch(); continue; } @@ -395,7 +505,7 @@ impl MachBuffer { && prev_b.end == b.start && self.resolve_label_offset(prev_b.target) == cur_off { - debug!(" -> uncond follows a conditional, and conditional's target resolves to current offset"); + trace!(" -> uncond follows a conditional, and conditional's target resolves to current offset"); let target = b.target; let data = prev_b.inverted.clone().unwrap(); self.truncate_last_branch(); @@ -407,7 +517,7 @@ impl MachBuffer { self.data.extend_from_slice(&data[..]); prev_b.inverted = Some(not_inverted); self.fixup_records[prev_b.fixup].label = target; - debug!(" -> reassigning target of condbr to {:?}", target); + trace!(" -> reassigning target of condbr to {:?}", target); prev_b.target = target; continue; } @@ -420,7 +530,7 @@ impl MachBuffer { // the current offset (end of branch) to the truncated // end-of-code. if self.resolve_label_offset(b.target) == cur_off { - debug!("branch with target == cur off; truncating"); + trace!("branch with target == cur off; truncating"); self.truncate_last_branch(); } @@ -430,9 +540,11 @@ impl MachBuffer { self.purge_latest_branches(); - debug!( + trace!( "leave optimize_branches:\n b = {:?}\n l = {:?}\n f = {:?}", - self.latest_branches, self.labels_by_offset, self.fixup_records + self.latest_branches, + self.labels_by_offset, + self.fixup_records ); } @@ -440,7 +552,7 @@ impl MachBuffer { let cur_off = self.cur_offset(); if let Some(l) = self.latest_branches.last() { if l.end < cur_off { - debug!("purge_latest_branches: removing branch {:?}", l); + trace!("purge_latest_branches: removing branch {:?}", l); self.latest_branches.clear(); } } @@ -498,9 +610,11 @@ impl MachBuffer { kind, } in fixup_records.into_iter() { - debug!( + trace!( "emit_island: fixup for label {:?} at offset {} kind {:?}", - label, offset, kind + label, + offset, + kind ); // We eagerly perform fixups whose label targets are known, if not out // of range, to avoid unnecessary veneers. @@ -516,7 +630,7 @@ impl MachBuffer { false }; - debug!( + trace!( " -> label_offset = {}, known = {}, in_range = {} (pos {} neg {})", label_offset, known, @@ -530,7 +644,7 @@ impl MachBuffer { if in_range { debug_assert!(known); // implied by in_range. let slice = &mut self.data[start..end]; - debug!("patching in-range!"); + trace!("patching in-range!"); kind.patch(slice, offset, label_offset); } else if !known && !kind.supports_veneer() { // Nothing for now. Keep it for next round. @@ -543,21 +657,23 @@ impl MachBuffer { // Allocate space for a veneer in the island. self.align_to(I::LabelUse::ALIGN); let veneer_offset = self.cur_offset(); - debug!("making a veneer at {}", veneer_offset); + trace!("making a veneer at {}", veneer_offset); let slice = &mut self.data[start..end]; // Patch the original label use to refer to teh veneer. - debug!( + trace!( "patching original at offset {} to veneer offset {}", - offset, veneer_offset + offset, + veneer_offset ); kind.patch(slice, offset, veneer_offset); // Generate the veneer. 
let veneer_slice = self.get_appended_space(kind.veneer_size() as usize); let (veneer_fixup_off, veneer_label_use) = kind.generate_veneer(veneer_slice, veneer_offset); - debug!( + trace!( "generated veneer; fixup offset {}, label_use {:?}", - veneer_fixup_off, veneer_label_use + veneer_fixup_off, + veneer_label_use ); // If the label is known (but was just out of range), do the // veneer label-use fixup now too; otherwise, save it for later. @@ -565,7 +681,7 @@ impl MachBuffer { let start = veneer_fixup_off as usize; let end = (veneer_fixup_off + veneer_label_use.patch_size()) as usize; let veneer_slice = &mut self.data[start..end]; - debug!("doing veneer fixup right away too"); + trace!("doing veneer fixup right away too"); veneer_label_use.patch(veneer_slice, veneer_fixup_off, label_offset); } else { new_fixups.push(MachLabelFixup { diff --git a/cranelift/codegen/src/machinst/lower.rs b/cranelift/codegen/src/machinst/lower.rs index fcbf3d2810..76663450ba 100644 --- a/cranelift/codegen/src/machinst/lower.rs +++ b/cranelift/codegen/src/machinst/lower.rs @@ -23,9 +23,9 @@ use alloc::vec::Vec; use log::debug; use smallvec::SmallVec; -/// An "instruction color" partitions instructions by side-effecting ops. All -/// instructions with the same "color" are guaranteed not to be separated by any -/// side-effecting op (for this purpose, loads are also considered +/// An "instruction color" partitions CLIF instructions by side-effecting ops. +/// All instructions with the same "color" are guaranteed not to be separated by +/// any side-effecting op (for this purpose, loads are also considered /// side-effecting, to avoid subtle questions w.r.t. the memory model), and /// furthermore, it is guaranteed that for any two instructions A and B such /// that color(A) == color(B), either A dominates B and B postdominates A, or @@ -33,7 +33,8 @@ use smallvec::SmallVec; /// have the same color, trivially providing the second condition.) Intuitively, /// this means that the ops of the same color must always execute "together", as /// part of one atomic contiguous section of the dynamic execution trace, and -/// they can be freely permuted without affecting program behavior. +/// they can be freely permuted (modulo true dataflow dependencies) without +/// affecting program behavior. #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] pub struct InstColor(u32); impl InstColor { @@ -122,7 +123,11 @@ pub trait LowerCtx { /// If the backend uses the register, rather than one of the other /// forms (constant or merging of the producing op), it must call /// `use_input_reg()` to ensure the producing inst is actually lowered - /// as well. + /// as well. Failing to do so may result in the instruction that generates + /// this value never being generated, thus resulting in incorrect execution. + /// For correctness, backends should thus wrap `get_input()` and + /// `use_input_reg()` with helpers that return a register only after + /// ensuring it is marked as used. fn get_input(&self, ir_inst: Inst, idx: usize) -> LowerInput; /// Get the `idx`th output register of the given IR instruction. When /// `backend.lower_inst_to_regs(ctx, inst)` is called, it is expected that @@ -133,7 +138,7 @@ pub trait LowerCtx { // ask for an input to be gen'd into a register. /// Get a new temp. - fn tmp(&mut self, rc: RegClass, ty: Type) -> Writable; + fn alloc_tmp(&mut self, rc: RegClass, ty: Type) -> Writable; /// Emit a machine instruction.
fn emit(&mut self, mach_inst: Self::I); /// Indicate that the given input uses the register returned by @@ -477,7 +482,7 @@ impl<'func, I: VCodeInst> Lower<'func, I> { // There's some overlap, so play safe and copy via temps. let mut tmp_regs: SmallVec<[Writable; 16]> = SmallVec::new(); for &ty in &phi_classes { - tmp_regs.push(self.tmp(I::rc_for_type(ty)?, ty)); + tmp_regs.push(self.alloc_tmp(I::rc_for_type(ty)?, ty)); } debug!("phi_temps = {:?}", tmp_regs); @@ -721,6 +726,9 @@ impl<'func, I: VCodeInst> Lower<'func, I> { Ok(vcode) } + /// Get the actual inputs for a value. This is the implementation for + /// `get_input()` but starting from the SSA value, which is not exposed to + /// the backend. fn get_input_for_val(&self, at_inst: Inst, val: Value) -> LowerInput { debug!("get_input_for_val: val {} at inst {}", val, at_inst); let mut reg = self.value_regs[val]; @@ -889,7 +897,7 @@ impl<'func, I: VCodeInst> LowerCtx for Lower<'func, I> { Writable::from_reg(self.value_regs[val]) } - fn tmp(&mut self, rc: RegClass, ty: Type) -> Writable { + fn alloc_tmp(&mut self, rc: RegClass, ty: Type) -> Writable { let v = self.next_vreg; self.next_vreg += 1; let vreg = Reg::new_virtual(rc, v);
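The module comment added to `machinst/buffer.rs` above centers on a "deadline" that drives island placement. The standalone sketch below is an illustration of that idea only, not Cranelift's `MachBuffer` API; `ToyBuffer`, `Fixup`, and `island_needed` are made-up names for this example. The check reduces to: every unresolved short-range label reference expires at `use_offset + max_range`, and an island (veneers plus a guard jump) must be emitted before the write cursor plus the next block's worst-case size would cross the earliest such expiry.

```rust
// Illustrative sketch only -- not Cranelift's MachBuffer. It models the
// "deadline" idea from the module comment: every unresolved short-range label
// reference must be resolved, or redirected to a longer-range veneer, before
// the write cursor passes `use_offset + max_range`.

#[derive(Clone, Copy, Debug)]
struct Fixup {
    use_offset: u32, // where the short-range reference was emitted
    label: usize,    // symbolic target, bound to an offset later
    max_range: u32,  // e.g. +/- 1 MB for an AArch64 conditional branch
}

struct ToyBuffer {
    code: Vec<u8>,
    label_offsets: Vec<Option<u32>>,
    fixups: Vec<Fixup>,
}

impl ToyBuffer {
    fn cur_offset(&self) -> u32 {
        self.code.len() as u32
    }

    // Earliest offset by which some still-unbound label reference goes out of range.
    fn deadline(&self) -> Option<u32> {
        self.fixups
            .iter()
            .filter(|f| self.label_offsets[f.label].is_none())
            .map(|f| f.use_offset + f.max_range)
            .min()
    }

    // Should an island be emitted before another `worst_case_size` bytes of
    // straight-line code?
    fn island_needed(&self, worst_case_size: u32) -> bool {
        match self.deadline() {
            Some(d) => self.cur_offset() + worst_case_size > d,
            None => false,
        }
    }
}

fn main() {
    // One pending conditional-branch fixup at offset 0 with a 1 MiB range.
    let buf = ToyBuffer {
        code: vec![0; 900_000],
        label_offsets: vec![None],
        fixups: vec![Fixup { use_offset: 0, label: 0, max_range: 1 << 20 }],
    };
    // Emitting another 200 KiB would cross the deadline, so an island with a
    // veneer must be emitted first.
    assert!(buf.island_needed(200_000));
}
```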
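Likewise, the "branch threading" rule described above (an unconditional branch that starts at a label becomes a label alias, chased only one level) can be pictured with a toy resolver. `Labels`, `resolve_offset`, and `NO_ALIAS` are illustrative stand-ins, not the buffer's real fields or methods.

```rust
// Illustrative sketch of one-level label aliasing ("branch threading"): when
// an unconditional branch starts exactly at a label and jumps to another
// label, references to the first label are redirected to the second. Aliases
// are chased only one level, so cycles cannot cause unbounded chasing.

const NO_ALIAS: usize = usize::MAX;

struct Labels {
    offsets: Vec<u32>,   // resolved code offset for each label
    aliases: Vec<usize>, // alias target for each label, or NO_ALIAS
}

impl Labels {
    // Chase at most one alias hop, mirroring the comment in the patch:
    // "We don't chase arbitrarily deep ... only one level, i.e. through one jump."
    fn resolve_offset(&self, label: usize) -> u32 {
        let target = if self.aliases[label] != NO_ALIAS {
            self.aliases[label]
        } else {
            label
        };
        self.offsets[target]
    }
}

fn main() {
    // Label 0 was bound at the start of an empty block whose only content was
    // an unconditional jump to label 1; the buffer records the alias 0 -> 1
    // and can then truncate the jump itself.
    let labels = Labels {
        offsets: vec![0x10, 0x40],
        aliases: vec![1, NO_ALIAS],
    };
    assert_eq!(labels.resolve_offset(0), 0x40);
}
```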