diff --git a/cranelift/codegen/src/context.rs b/cranelift/codegen/src/context.rs index 2c4ce6b492..e5d11c6963 100644 --- a/cranelift/codegen/src/context.rs +++ b/cranelift/codegen/src/context.rs @@ -227,7 +227,7 @@ impl Context { let _tt = timing::binemit(); let mut sink = MemoryCodeSink::new(mem, relocs, traps, stackmaps); if let Some(ref result) = &self.mach_compile_result { - result.sections.emit(&mut sink); + result.buffer.emit(&mut sink); } else { isa.emit_function_to_memory(&self.func, &mut sink); } diff --git a/cranelift/codegen/src/inst_predicates.rs b/cranelift/codegen/src/inst_predicates.rs index 9cefbc38f9..f0d6fdf6b5 100644 --- a/cranelift/codegen/src/inst_predicates.rs +++ b/cranelift/codegen/src/inst_predicates.rs @@ -40,3 +40,24 @@ pub fn has_side_effect(func: &Function, inst: Inst) -> bool { let opcode = data.opcode(); trivially_has_side_effects(opcode) || is_load_with_defined_trapping(opcode, data) } + +/// Does the given instruction have any side-effect as per [has_side_effect], or else is a load? +pub fn has_side_effect_or_load(func: &Function, inst: Inst) -> bool { + has_side_effect(func, inst) || func.dfg[inst].opcode().can_load() +} + +/// Is the given instruction a constant value (`iconst`, `fconst`, `bconst`) that can be +/// represented in 64 bits? +pub fn is_constant_64bit(func: &Function, inst: Inst) -> Option { + let data = &func.dfg[inst]; + if data.opcode() == Opcode::Null { + return Some(0); + } + match data { + &InstructionData::UnaryImm { imm, .. } => Some(imm.bits() as u64), + &InstructionData::UnaryIeee32 { imm, .. } => Some(imm.bits() as u64), + &InstructionData::UnaryIeee64 { imm, .. } => Some(imm.bits()), + &InstructionData::UnaryBool { imm, .. } => Some(if imm { 1 } else { 0 }), + _ => None, + } +} diff --git a/cranelift/codegen/src/isa/aarch64/abi.rs b/cranelift/codegen/src/isa/aarch64/abi.rs index d90c23421e..8f388665b5 100644 --- a/cranelift/codegen/src/isa/aarch64/abi.rs +++ b/cranelift/codegen/src/isa/aarch64/abi.rs @@ -504,7 +504,7 @@ impl AArch64ABIBody { rn: stack_reg(), rm: stack_limit, }); - insts.push(Inst::CondBrLowered { + insts.push(Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(8), // Here `Hs` == "higher or same" when interpreting the two // operands as unsigned integers. diff --git a/cranelift/codegen/src/isa/aarch64/inst/args.rs b/cranelift/codegen/src/isa/aarch64/inst/args.rs index 8eb3b9b02a..4b8142fbe5 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/args.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/args.rs @@ -3,14 +3,14 @@ // Some variants are never constructed, but we still want them as options in the future. #![allow(dead_code)] -use crate::binemit::CodeOffset; use crate::ir::Type; use crate::isa::aarch64::inst::*; use crate::isa::aarch64::lower::ty_bits; +use crate::machinst::MachLabel; use regalloc::{RealRegUniverse, Reg, Writable}; -use core::convert::{Into, TryFrom}; +use core::convert::Into; use std::string::String; /// A shift operator for a register or immediate. @@ -303,78 +303,44 @@ impl CondBrKind { /// A branch target. Either unresolved (basic-block index) or resolved (offset /// from end of current instruction). -#[derive(Clone, Copy, Debug)] +#[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum BranchTarget { - /// An unresolved reference to a BlockIndex, as passed into + /// An unresolved reference to a Label, as passed into /// `lower_branch_group()`. - Block(BlockIndex), - /// A resolved reference to another instruction, after - /// `Inst::with_block_offsets()`. 
+ Label(MachLabel), + /// A fixed PC offset. ResolvedOffset(isize), } impl BranchTarget { - /// Lower the branch target given offsets of each block. - pub fn lower(&mut self, targets: &[CodeOffset], my_offset: CodeOffset) { + /// Return the target's label, if it is a label-based target. + pub fn as_label(self) -> Option { match self { - &mut BranchTarget::Block(bix) => { - let bix = usize::try_from(bix).unwrap(); - assert!(bix < targets.len()); - let block_offset_in_func = targets[bix]; - let branch_offset = (block_offset_in_func as isize) - (my_offset as isize); - *self = BranchTarget::ResolvedOffset(branch_offset); - } - &mut BranchTarget::ResolvedOffset(..) => {} - } - } - - /// Get the block index. - pub fn as_block_index(&self) -> Option { - match self { - &BranchTarget::Block(bix) => Some(bix), + BranchTarget::Label(l) => Some(l), _ => None, } } - /// Get the offset as 4-byte words. Returns `0` if not - /// yet resolved (in that case, we're only computing - /// size and the offset doesn't matter). - pub fn as_offset_words(&self) -> isize { - match self { - &BranchTarget::ResolvedOffset(off) => off >> 2, + /// Return the target's offset, if specified, or zero if label-based. + pub fn as_offset19_or_zero(self) -> u32 { + let off = match self { + BranchTarget::ResolvedOffset(off) => off >> 2, _ => 0, - } + }; + assert!(off <= 0x3ffff); + assert!(off >= -0x40000); + (off as u32) & 0x7ffff } - /// Get the offset as a 26-bit offset suitable for a 26-bit jump, or `None` if overflow. - pub fn as_off26(&self) -> Option { - let off = self.as_offset_words(); - if (off < (1 << 25)) && (off >= -(1 << 25)) { - Some((off as u32) & ((1 << 26) - 1)) - } else { - None - } - } - - /// Get the offset as a 19-bit offset, or `None` if overflow. - pub fn as_off19(&self) -> Option { - let off = self.as_offset_words(); - if (off < (1 << 18)) && (off >= -(1 << 18)) { - Some((off as u32) & ((1 << 19) - 1)) - } else { - None - } - } - - /// Map the block index given a transform map. - pub fn map(&mut self, block_index_map: &[BlockIndex]) { - match self { - &mut BranchTarget::Block(ref mut bix) => { - let n = block_index_map[usize::try_from(*bix).unwrap()]; - *bix = n; - } - &mut BranchTarget::ResolvedOffset(_) => {} - } + /// Return the target's offset, if specified, or zero if label-based. 
+ pub fn as_offset26_or_zero(self) -> u32 { + let off = match self { + BranchTarget::ResolvedOffset(off) => off >> 2, + _ => 0, + }; + assert!(off <= 0x1ffffff); + assert!(off >= -0x2000000); + (off as u32) & 0x3ffffff } } @@ -507,7 +473,7 @@ impl ShowWithRRU for Cond { impl ShowWithRRU for BranchTarget { fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String { match self { - &BranchTarget::Block(block) => format!("block{}", block), + &BranchTarget::Label(label) => format!("label{:?}", label.get()), &BranchTarget::ResolvedOffset(off) => format!("{}", off), } } diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs index 5a9f9fef59..2d5ecd406d 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs @@ -4,7 +4,7 @@ use crate::binemit::{CodeOffset, Reloc}; use crate::ir::constant::ConstantData; use crate::ir::types::*; use crate::ir::TrapCode; -use crate::isa::aarch64::{inst::regs::PINNED_REG, inst::*}; +use crate::isa::aarch64::inst::*; use regalloc::{Reg, RegClass, Writable}; @@ -149,6 +149,14 @@ fn enc_cbr(op_31_24: u32, off_18_0: u32, op_4: u32, cond: u32) -> u32 { (op_31_24 << 24) | (off_18_0 << 5) | (op_4 << 4) | cond } +fn enc_conditional_br(taken: BranchTarget, kind: CondBrKind) -> u32 { + match kind { + CondBrKind::Zero(reg) => enc_cmpbr(0b1_011010_0, taken.as_offset19_or_zero(), reg), + CondBrKind::NotZero(reg) => enc_cmpbr(0b1_011010_1, taken.as_offset19_or_zero(), reg), + CondBrKind::Cond(c) => enc_cbr(0b01010100, taken.as_offset19_or_zero(), 0b0, c.bits()), + } +} + const MOVE_WIDE_FIXED: u32 = 0x92800000; #[repr(u32)] @@ -340,10 +348,10 @@ pub struct EmitState { virtual_sp_offset: i64, } -impl MachInstEmit for Inst { +impl MachInstEmit for Inst { type State = EmitState; - fn emit(&self, sink: &mut O, flags: &settings::Flags, state: &mut EmitState) { + fn emit(&self, sink: &mut MachBuffer, flags: &settings::Flags, state: &mut EmitState) { match self { &Inst::AluRRR { alu_op, rd, rn, rm } => { let top11 = match alu_op { @@ -616,7 +624,7 @@ impl MachInstEmit for Inst { ref mem, srcloc, } => { - let (mem_insts, mem) = mem_finalize(sink.cur_offset_from_start(), mem, state); + let (mem_insts, mem) = mem_finalize(sink.cur_offset(), mem, state); for inst in mem_insts.into_iter() { inst.emit(sink, flags, state); @@ -759,7 +767,7 @@ impl MachInstEmit for Inst { ref mem, srcloc, } => { - let (mem_insts, mem) = mem_finalize(sink.cur_offset_from_start(), mem, state); + let (mem_insts, mem) = mem_finalize(sink.cur_offset(), mem, state); for inst in mem_insts.into_iter() { inst.emit(sink, flags, state); @@ -1147,10 +1155,18 @@ impl MachInstEmit for Inst { panic!("Unsupported extend variant"); } &Inst::Jump { ref dest } => { - // TODO: differentiate between as_off26() returning `None` for - // out-of-range vs. not-yet-finalized. The latter happens when we - // do early (fake) emission for size computation. - sink.put4(enc_jump26(0b000101, dest.as_off26().unwrap())); + let off = sink.cur_offset(); + // Emit the jump itself. + sink.put4(enc_jump26(0b000101, dest.as_offset26_or_zero())); + // After the jump has been emitted, indicate that it uses a + // label, if so, so that a fixup can occur later. This happens + // after we emit the bytes because the fixup might occur right + // away (so the bytes must actually exist now). 
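// Illustrative aside -- a toy sketch, not the real MachBuffer API (the names
// `ToyBuffer`, `label_offsets`, etc. are invented for exposition): why the
// branch word has to be emitted *before* `use_label_at_offset` is called. If
// the label is already bound, the fixup is applied right away and must be
// able to rewrite the bytes that were just put into the buffer.
struct ToyBuffer {
    data: Vec<u8>,
    // Bound code offset for each label index, if known yet.
    label_offsets: Vec<Option<u32>>,
}

impl ToyBuffer {
    fn use_label_at_offset_branch26(&mut self, off: u32, label: usize) {
        if let Some(target) = self.label_offsets[label] {
            // The label is already bound, so the fixup runs immediately and
            // rewrites the 26-bit word-offset field of the `b` at `off` --
            // which is why those four bytes must already be in `data`.
            let rel_words = (((target as i64 - off as i64) >> 2) as u32) & 0x03ff_ffff;
            let i = off as usize;
            let w = u32::from_le_bytes([
                self.data[i],
                self.data[i + 1],
                self.data[i + 2],
                self.data[i + 3],
            ]);
            let patched = (w & !0x03ff_ffff) | rel_words;
            self.data[i..i + 4].copy_from_slice(&patched.to_le_bytes());
        }
        // Otherwise a fixup record is kept and applied when the label is
        // bound (or redirected through a veneer if it ends up out of range).
    }
}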
+ if let Some(l) = dest.as_label() { + sink.use_label_at_offset(off, l, LabelUse::Branch26); + let cur_off = sink.cur_offset(); + sink.add_uncond_branch(off, cur_off, l); + } } &Inst::Ret => { sink.put4(0xd65f03c0); @@ -1178,51 +1194,35 @@ impl MachInstEmit for Inst { sink.add_call_site(loc, opcode); } } - &Inst::CondBr { .. } => panic!("Unlowered CondBr during binemit!"), - &Inst::CondBrLowered { target, kind } => match kind { - // TODO: handle >2^19 case by emitting a compound sequence with - // an unconditional (26-bit) branch. We need branch-relaxation - // adjustment machinery to enable this (because we don't want to - // always emit the long form). - CondBrKind::Zero(reg) => { - sink.put4(enc_cmpbr(0b1_011010_0, target.as_off19().unwrap(), reg)); - } - CondBrKind::NotZero(reg) => { - sink.put4(enc_cmpbr(0b1_011010_1, target.as_off19().unwrap(), reg)); - } - CondBrKind::Cond(c) => { - sink.put4(enc_cbr( - 0b01010100, - target.as_off19().unwrap_or(0), - 0b0, - c.bits(), - )); - } - }, - &Inst::CondBrLoweredCompound { + &Inst::CondBr { taken, not_taken, kind, } => { // Conditional part first. - match kind { - CondBrKind::Zero(reg) => { - sink.put4(enc_cmpbr(0b1_011010_0, taken.as_off19().unwrap(), reg)); - } - CondBrKind::NotZero(reg) => { - sink.put4(enc_cmpbr(0b1_011010_1, taken.as_off19().unwrap(), reg)); - } - CondBrKind::Cond(c) => { - sink.put4(enc_cbr( - 0b01010100, - taken.as_off19().unwrap_or(0), - 0b0, - c.bits(), - )); - } + let cond_off = sink.cur_offset(); + sink.put4(enc_conditional_br(taken, kind)); + if let Some(l) = taken.as_label() { + sink.use_label_at_offset(cond_off, l, LabelUse::Branch19); + let cur_off = sink.cur_offset(); + let inverted = enc_conditional_br(taken, kind.invert()).to_le_bytes(); + sink.add_cond_branch(cond_off, cur_off, l, &inverted[..]); } // Unconditional part. - sink.put4(enc_jump26(0b000101, not_taken.as_off26().unwrap_or(0))); + let uncond_off = sink.cur_offset(); + sink.put4(enc_jump26(0b000101, not_taken.as_offset26_or_zero())); + if let Some(l) = not_taken.as_label() { + sink.use_label_at_offset(uncond_off, l, LabelUse::Branch26); + let cur_off = sink.cur_offset(); + sink.add_uncond_branch(uncond_off, cur_off, l); + } + } + &Inst::OneWayCondBr { target, kind } => { + let off = sink.cur_offset(); + sink.put4(enc_conditional_br(target, kind)); + if let Some(l) = target.as_label() { + sink.use_label_at_offset(off, l, LabelUse::Branch19); + } } &Inst::IndirectBr { rn, .. } => { sink.put4(enc_br(rn)); @@ -1239,8 +1239,7 @@ impl MachInstEmit for Inst { sink.add_trap(srcloc, code); sink.put4(0xd4a00000); } - &Inst::Adr { rd, ref label } => { - let off = memlabel_finalize(sink.cur_offset_from_start(), label); + &Inst::Adr { rd, off } => { assert!(off > -(1 << 20)); assert!(off < (1 << 20)); sink.put4(enc_adr(off, rd)); @@ -1261,19 +1260,13 @@ impl MachInstEmit for Inst { // This sequence is *one* instruction in the vcode, and is expanded only here at // emission time, because we cannot allow the regalloc to insert spills/reloads in // the middle; we depend on hardcoded PC-rel addressing below. - // - // N.B.: if PC-rel addressing on ADR below is changed, also update - // `Inst::with_block_offsets()` in aarch64/inst/mod.rs. // Save index in a tmp (the live range of ridx only goes to start of this // sequence; rtmp1 or rtmp2 may overwrite it). 
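// Illustrative arithmetic only (the helper below is hypothetical, not part of
// the patch): how each jump-table word ends up holding `target - jt_base`.
// The emission loop further below writes `entry_addr - jt_base` into the
// word, and the `LabelUse::PCRel32` fixup defined later in this patch *adds*
// `target - entry_addr` to it, so the finished word is `target - jt_base`.
// At run time, ADR materializes the table base and the sign-extended word is
// added back to it to form the branch target.
fn jump_table_entry(jt_base: u32, entry_addr: u32, target: u32) -> i32 {
    let stored_at_emit = entry_addr.wrapping_sub(jt_base) as i32;
    let pcrel32_fixup = target.wrapping_sub(entry_addr) as i32;
    // == target.wrapping_sub(jt_base) as i32
    stored_at_emit.wrapping_add(pcrel32_fixup)
}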
let inst = Inst::gen_move(rtmp2, ridx, I64); inst.emit(sink, flags, state); // Load address of jump table - let inst = Inst::Adr { - rd: rtmp1, - label: MemLabel::PCRel(16), - }; + let inst = Inst::Adr { rd: rtmp1, off: 16 }; inst.emit(sink, flags, state); // Load value out of jump table let inst = Inst::SLoad32 { @@ -1303,12 +1296,16 @@ impl MachInstEmit for Inst { }; inst.emit(sink, flags, state); // Emit jump table (table of 32-bit offsets). - for target in targets.iter() { - let off = target.as_offset_words() * 4; - let off = i32::try_from(off).unwrap(); - // cast i32 to u32 (two's-complement) - let off = off as u32; - sink.put4(off); + let jt_off = sink.cur_offset(); + for &target in targets.iter() { + let word_off = sink.cur_offset(); + let off_into_table = word_off - jt_off; + sink.put4(off_into_table); + sink.use_label_at_offset( + word_off, + target.as_label().unwrap(), + LabelUse::PCRel32, + ); } } &Inst::LoadConst64 { rd, const_data } => { @@ -1348,7 +1345,7 @@ impl MachInstEmit for Inst { } } &Inst::LoadAddr { rd, ref mem } => { - let (mem_insts, mem) = mem_finalize(sink.cur_offset_from_start(), mem, state); + let (mem_insts, mem) = mem_finalize(sink.cur_offset(), mem, state); for inst in mem_insts.into_iter() { inst.emit(sink, flags, state); } @@ -1401,20 +1398,6 @@ impl MachInstEmit for Inst { add.emit(sink, flags, state); } } - &Inst::GetPinnedReg { rd } => { - let inst = Inst::Mov { - rd, - rm: xreg(PINNED_REG), - }; - inst.emit(sink, flags, state); - } - &Inst::SetPinnedReg { rm } => { - let inst = Inst::Mov { - rd: Writable::from_reg(xreg(PINNED_REG)), - rm, - }; - inst.emit(sink, flags, state); - } &Inst::VirtualSPOffsetAdj { offset } => { debug!( "virtual sp offset adjusted by {} -> {}", @@ -1423,6 +1406,17 @@ impl MachInstEmit for Inst { ); state.virtual_sp_offset += offset; } + &Inst::EmitIsland { needed_space } => { + if sink.island_needed(needed_space + 4) { + let jump_around_label = sink.get_label(); + let jmp = Inst::Jump { + dest: BranchTarget::Label(jump_around_label), + }; + jmp.emit(sink, flags, state); + sink.emit_island(); + sink.bind_label(jump_around_label); + } + } } } } diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs index f98d3c6b00..55977796ce 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs @@ -1956,7 +1956,7 @@ fn test_aarch64_binemit() { )); insns.push(( - Inst::CondBrLowered { + Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(64), kind: CondBrKind::Zero(xreg(8)), }, @@ -1964,7 +1964,7 @@ fn test_aarch64_binemit() { "cbz x8, 64", )); insns.push(( - Inst::CondBrLowered { + Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(64), kind: CondBrKind::NotZero(xreg(8)), }, @@ -1972,7 +1972,7 @@ fn test_aarch64_binemit() { "cbnz x8, 64", )); insns.push(( - Inst::CondBrLowered { + Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(64), kind: CondBrKind::Cond(Cond::Eq), }, @@ -1980,7 +1980,7 @@ fn test_aarch64_binemit() { "b.eq 64", )); insns.push(( - Inst::CondBrLowered { + Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(64), kind: CondBrKind::Cond(Cond::Ne), }, @@ -1989,7 +1989,7 @@ fn test_aarch64_binemit() { )); insns.push(( - Inst::CondBrLowered { + Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(64), kind: CondBrKind::Cond(Cond::Hs), }, @@ -1997,7 +1997,7 @@ fn test_aarch64_binemit() { "b.hs 64", )); insns.push(( - Inst::CondBrLowered { + Inst::OneWayCondBr { 
target: BranchTarget::ResolvedOffset(64), kind: CondBrKind::Cond(Cond::Lo), }, @@ -2005,7 +2005,7 @@ fn test_aarch64_binemit() { "b.lo 64", )); insns.push(( - Inst::CondBrLowered { + Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(64), kind: CondBrKind::Cond(Cond::Mi), }, @@ -2013,7 +2013,7 @@ fn test_aarch64_binemit() { "b.mi 64", )); insns.push(( - Inst::CondBrLowered { + Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(64), kind: CondBrKind::Cond(Cond::Pl), }, @@ -2021,7 +2021,7 @@ fn test_aarch64_binemit() { "b.pl 64", )); insns.push(( - Inst::CondBrLowered { + Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(64), kind: CondBrKind::Cond(Cond::Vs), }, @@ -2029,7 +2029,7 @@ fn test_aarch64_binemit() { "b.vs 64", )); insns.push(( - Inst::CondBrLowered { + Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(64), kind: CondBrKind::Cond(Cond::Vc), }, @@ -2037,7 +2037,7 @@ fn test_aarch64_binemit() { "b.vc 64", )); insns.push(( - Inst::CondBrLowered { + Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(64), kind: CondBrKind::Cond(Cond::Hi), }, @@ -2045,7 +2045,7 @@ fn test_aarch64_binemit() { "b.hi 64", )); insns.push(( - Inst::CondBrLowered { + Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(64), kind: CondBrKind::Cond(Cond::Ls), }, @@ -2053,7 +2053,7 @@ fn test_aarch64_binemit() { "b.ls 64", )); insns.push(( - Inst::CondBrLowered { + Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(64), kind: CondBrKind::Cond(Cond::Ge), }, @@ -2061,7 +2061,7 @@ fn test_aarch64_binemit() { "b.ge 64", )); insns.push(( - Inst::CondBrLowered { + Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(64), kind: CondBrKind::Cond(Cond::Lt), }, @@ -2069,7 +2069,7 @@ fn test_aarch64_binemit() { "b.lt 64", )); insns.push(( - Inst::CondBrLowered { + Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(64), kind: CondBrKind::Cond(Cond::Gt), }, @@ -2077,7 +2077,7 @@ fn test_aarch64_binemit() { "b.gt 64", )); insns.push(( - Inst::CondBrLowered { + Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(64), kind: CondBrKind::Cond(Cond::Le), }, @@ -2085,7 +2085,7 @@ fn test_aarch64_binemit() { "b.le 64", )); insns.push(( - Inst::CondBrLowered { + Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(64), kind: CondBrKind::Cond(Cond::Al), }, @@ -2093,7 +2093,7 @@ fn test_aarch64_binemit() { "b.al 64", )); insns.push(( - Inst::CondBrLowered { + Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(64), kind: CondBrKind::Cond(Cond::Nv), }, @@ -2102,7 +2102,7 @@ fn test_aarch64_binemit() { )); insns.push(( - Inst::CondBrLoweredCompound { + Inst::CondBr { taken: BranchTarget::ResolvedOffset(64), not_taken: BranchTarget::ResolvedOffset(128), kind: CondBrKind::Cond(Cond::Le), @@ -2138,7 +2138,7 @@ fn test_aarch64_binemit() { insns.push(( Inst::IndirectBr { rn: xreg(3), - targets: vec![1, 2, 3], + targets: vec![], }, "60001FD6", "br x3", @@ -2149,7 +2149,7 @@ fn test_aarch64_binemit() { insns.push(( Inst::Adr { rd: writable_xreg(15), - label: MemLabel::PCRel((1 << 20) - 4), + off: (1 << 20) - 4, }, "EFFF7F10", "adr x15, pc+1048572", @@ -2792,19 +2792,11 @@ fn test_aarch64_binemit() { let actual_printing = insn.show_rru(Some(&rru)); assert_eq!(expected_printing, actual_printing); - // Check the encoding is as expected. 
- let text_size = { - let mut code_sec = MachSectionSize::new(0); - insn.emit(&mut code_sec, &flags, &mut Default::default()); - code_sec.size() - }; - let mut sink = test_utils::TestCodeSink::new(); - let mut sections = MachSections::new(); - let code_idx = sections.add_section(0, text_size); - let code_sec = sections.get_section(code_idx); - insn.emit(code_sec, &flags, &mut Default::default()); - sections.emit(&mut sink); + let mut buffer = MachBuffer::new(); + insn.emit(&mut buffer, &flags, &mut Default::default()); + let buffer = buffer.finish(); + buffer.emit(&mut sink); let actual_encoding = &sink.stringify(); assert_eq!(expected_encoding, actual_encoding); } diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs index 3e4247ac14..714ba1eb4d 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs @@ -645,35 +645,28 @@ pub enum Inst { dest: BranchTarget, }, - /// A conditional branch. + /// A conditional branch. Contains two targets; at emission time, both are emitted, but + /// the MachBuffer knows to truncate the trailing branch if fallthrough. We optimize the + /// choice of taken/not_taken (inverting the branch polarity as needed) based on the + /// fallthrough at the time of lowering. CondBr { taken: BranchTarget, not_taken: BranchTarget, kind: CondBrKind, }, - /// Lowered conditional branch: contains the original branch kind (or the - /// inverse), but only one BranchTarget is retained. The other is - /// implicitly the next instruction, given the final basic-block layout. - CondBrLowered { + /// A one-way conditional branch, invisible to the CFG processing; used *only* as part of + /// straight-line sequences in code to be emitted. + OneWayCondBr { target: BranchTarget, kind: CondBrKind, }, - /// As for `CondBrLowered`, but represents a condbr/uncond-br sequence (two - /// actual machine instructions). Needed when the final block layout implies - /// that neither arm of a conditional branch targets the fallthrough block. - CondBrLoweredCompound { - taken: BranchTarget, - not_taken: BranchTarget, - kind: CondBrKind, - }, - /// An indirect branch through a register, augmented with set of all /// possible successors. IndirectBr { rn: Reg, - targets: Vec, + targets: Vec, }, /// A "break" instruction, used for e.g. traps and debug breakpoints. @@ -685,11 +678,14 @@ pub enum Inst { trap_info: (SourceLoc, TrapCode), }, - /// Load the address (using a PC-relative offset) of a MemLabel, using the - /// `ADR` instruction. + /// Load the address (using a PC-relative offset) of a memory location, using the `ADR` + /// instruction. Note that we take a simple offset, not a `MemLabel`, here, because `Adr` is + /// only used for now in fixed lowering sequences with hardcoded offsets. In the future we may + /// need full `MemLabel` support. Adr { rd: Writable, - label: MemLabel, + /// Offset in range -2^20 .. 2^20. + off: i32, }, /// Raw 32-bit word, used for inline constants and jump-table entries. @@ -706,7 +702,7 @@ pub enum Inst { /// for rationale). JTSequence { targets: Box<[BranchTarget]>, - targets_for_term: Box<[BlockIndex]>, // needed for MachTerminator. + targets_for_term: Box<[MachLabel]>, // needed for MachTerminator. ridx: Reg, rtmp1: Writable, rtmp2: Writable, @@ -732,21 +728,19 @@ pub enum Inst { mem: MemArg, }, - /// Sets the value of the pinned register to the given register target. 
- GetPinnedReg { - rd: Writable, - }, - - /// Writes the value of the given source register to the pinned register. - SetPinnedReg { - rm: Reg, - }, - /// Marker, no-op in generated code: SP "virtual offset" is adjusted. This /// controls MemArg::NominalSPOffset args are lowered. VirtualSPOffsetAdj { offset: i64, }, + + /// Meta-insn, no-op in generated code: emit constant/branch veneer island at this point (with + /// a guard jump around it) if less than the needed space is available before the next branch + /// deadline. + EmitIsland { + /// The needed space before the next deadline. + needed_space: CodeOffset, + }, } fn count_zero_half_words(mut value: u64) -> usize { @@ -1111,9 +1105,7 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) { collector.add_defs(&*defs); collector.add_use(rn); } - &Inst::CondBr { ref kind, .. } - | &Inst::CondBrLowered { ref kind, .. } - | &Inst::CondBrLoweredCompound { ref kind, .. } => match kind { + &Inst::CondBr { ref kind, .. } | &Inst::OneWayCondBr { ref kind, .. } => match kind { CondBrKind::Zero(rt) | CondBrKind::NotZero(rt) => { collector.add_use(*rt); } @@ -1142,13 +1134,8 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) { &Inst::LoadAddr { rd, mem: _ } => { collector.add_def(rd); } - &Inst::GetPinnedReg { rd } => { - collector.add_def(rd); - } - &Inst::SetPinnedReg { rm } => { - collector.add_use(rm); - } &Inst::VirtualSPOffsetAdj { .. } => {} + &Inst::EmitIsland { .. } => {} } } @@ -1676,13 +1663,7 @@ fn aarch64_map_regs(inst: &mut Inst, mapper: &RegUsageMapper) { *defs = Box::new(new_defs); map_use(mapper, rn); } - &mut Inst::CondBr { ref mut kind, .. } => { - map_br(mapper, kind); - } - &mut Inst::CondBrLowered { ref mut kind, .. } => { - map_br(mapper, kind); - } - &mut Inst::CondBrLoweredCompound { ref mut kind, .. } => { + &mut Inst::CondBr { ref mut kind, .. } | &mut Inst::OneWayCondBr { ref mut kind, .. } => { map_br(mapper, kind); } &mut Inst::IndirectBr { ref mut rn, .. } => { @@ -1716,13 +1697,8 @@ fn aarch64_map_regs(inst: &mut Inst, mapper: &RegUsageMapper) { map_def(mapper, rd); map_mem(mapper, mem); } - &mut Inst::GetPinnedReg { ref mut rd } => { - map_def(mapper, rd); - } - &mut Inst::SetPinnedReg { ref mut rm } => { - map_use(mapper, rm); - } &mut Inst::VirtualSPOffsetAdj { .. } => {} + &mut Inst::EmitIsland { .. } => {} } } @@ -1730,6 +1706,8 @@ fn aarch64_map_regs(inst: &mut Inst, mapper: &RegUsageMapper) { // Instructions: misc functions and external interface impl MachInst for Inst { + type LabelUse = LabelUse; + fn get_regs(&self, collector: &mut RegUsageCollector) { aarch64_get_regs(self, collector) } @@ -1757,24 +1735,14 @@ impl MachInst for Inst { fn is_term<'a>(&'a self) -> MachTerminator<'a> { match self { &Inst::Ret | &Inst::EpiloguePlaceholder => MachTerminator::Ret, - &Inst::Jump { dest } => MachTerminator::Uncond(dest.as_block_index().unwrap()), + &Inst::Jump { dest } => MachTerminator::Uncond(dest.as_label().unwrap()), &Inst::CondBr { taken, not_taken, .. - } => MachTerminator::Cond( - taken.as_block_index().unwrap(), - not_taken.as_block_index().unwrap(), - ), - &Inst::CondBrLowered { .. } => { - // When this is used prior to branch finalization for branches - // within an open-coded sequence, i.e. with ResolvedOffsets, - // do not consider it a terminator. From the point of view of CFG analysis, - // it is part of a black-box single-in single-out region, hence is not - // denoted a terminator. 
+ } => MachTerminator::Cond(taken.as_label().unwrap(), not_taken.as_label().unwrap()), + &Inst::OneWayCondBr { .. } => { + // Explicitly invisible to CFG processing. MachTerminator::None } - &Inst::CondBrLoweredCompound { .. } => { - panic!("is_term() called after lowering branches"); - } &Inst::IndirectBr { ref targets, .. } => MachTerminator::Indirect(&targets[..]), &Inst::JTSequence { ref targets_for_term, @@ -1789,6 +1757,23 @@ impl MachInst for Inst { Inst::mov(to_reg, from_reg) } + fn gen_constant(to_reg: Writable, value: u64, ty: Type) -> SmallVec<[Inst; 4]> { + if ty == F64 { + let mut ret = SmallVec::new(); + ret.push(Inst::load_fp_constant64(to_reg, f64::from_bits(value))); + ret + } else if ty == F32 { + let mut ret = SmallVec::new(); + ret.push(Inst::load_fp_constant32( + to_reg, + f32::from_bits(value as u32), + )); + ret + } else { + Inst::load_constant(to_reg, value) + } + } + fn gen_zero_len_nop() -> Inst { Inst::Nop0 } @@ -1815,101 +1800,25 @@ impl MachInst for Inst { } } - fn gen_jump(blockindex: BlockIndex) -> Inst { + fn gen_jump(target: MachLabel) -> Inst { Inst::Jump { - dest: BranchTarget::Block(blockindex), + dest: BranchTarget::Label(target), } } - fn with_block_rewrites(&mut self, block_target_map: &[BlockIndex]) { - match self { - &mut Inst::Jump { ref mut dest } => { - dest.map(block_target_map); - } - &mut Inst::CondBr { - ref mut taken, - ref mut not_taken, - .. - } => { - taken.map(block_target_map); - not_taken.map(block_target_map); - } - &mut Inst::CondBrLowered { .. } => { - // See note in `is_term()`: this is used in open-coded sequences - // within blocks and should be left alone. - } - &mut Inst::CondBrLoweredCompound { .. } => { - panic!("with_block_rewrites called after branch lowering!"); - } - _ => {} - } + fn reg_universe(flags: &settings::Flags) -> RealRegUniverse { + create_reg_universe(flags) } - fn with_fallthrough_block(&mut self, fallthrough: Option) { - match self { - &mut Inst::CondBr { - taken, - not_taken, - kind, - } => { - if taken.as_block_index() == fallthrough - && not_taken.as_block_index() == fallthrough - { - *self = Inst::Nop0; - } else if taken.as_block_index() == fallthrough { - *self = Inst::CondBrLowered { - target: not_taken, - kind: kind.invert(), - }; - } else if not_taken.as_block_index() == fallthrough { - *self = Inst::CondBrLowered { - target: taken, - kind, - }; - } else { - // We need a compound sequence (condbr / uncond-br). - *self = Inst::CondBrLoweredCompound { - taken, - not_taken, - kind, - }; - } - } - &mut Inst::Jump { dest } => { - if dest.as_block_index() == fallthrough { - *self = Inst::Nop0; - } - } - _ => {} - } - } - - fn with_block_offsets(&mut self, my_offset: CodeOffset, targets: &[CodeOffset]) { - match self { - &mut Inst::CondBrLowered { ref mut target, .. } => { - target.lower(targets, my_offset); - } - &mut Inst::CondBrLoweredCompound { - ref mut taken, - ref mut not_taken, - .. - } => { - taken.lower(targets, my_offset); - not_taken.lower(targets, my_offset + 4); - } - &mut Inst::Jump { ref mut dest } => { - dest.lower(targets, my_offset); - } - &mut Inst::JTSequence { - targets: ref mut t, .. - } => { - for target in t.iter_mut() { - // offset+20: jumptable is 20 bytes into compound sequence. - target.lower(targets, my_offset + 20); - } - } - _ => {} - } + fn worst_case_size() -> CodeOffset { + // The maximum size, in bytes, of any `Inst`'s emitted code. 
We have at least one case of + // an 8-instruction sequence (saturating int-to-float conversions) with three embedded + // 64-bit f64 constants. + // + // Note that inline jump-tables handle island/pool insertion separately, so we do not need + // to account for them here (otherwise the worst case would be 2^31 * 4, clearly not + // feasible for other reasons). + 44 } } @@ -2550,12 +2459,12 @@ impl ShowWithRRU for Inst { } } } - &Inst::CondBrLowered { + &Inst::OneWayCondBr { ref target, ref kind, } => { let target = target.show_rru(mb_rru); - match &kind { + match kind { &CondBrKind::Zero(reg) => { let reg = reg.show_rru(mb_rru); format!("cbz {}, {}", reg, target) @@ -2570,30 +2479,15 @@ impl ShowWithRRU for Inst { } } } - &Inst::CondBrLoweredCompound { - ref taken, - ref not_taken, - ref kind, - } => { - let first = Inst::CondBrLowered { - target: taken.clone(), - kind: kind.clone(), - }; - let second = Inst::Jump { - dest: not_taken.clone(), - }; - first.show_rru(mb_rru) + " ; " + &second.show_rru(mb_rru) - } &Inst::IndirectBr { rn, .. } => { let rn = rn.show_rru(mb_rru); format!("br {}", rn) } &Inst::Brk => "brk #0".to_string(), &Inst::Udf { .. } => "udf".to_string(), - &Inst::Adr { rd, ref label } => { + &Inst::Adr { rd, off } => { let rd = rd.show_rru(mb_rru); - let label = label.show_rru(mb_rru); - format!("adr {}, {}", rd, label) + format!("adr {}, pc+{}", rd, off) } &Inst::Word4 { data } => format!("data.i32 {}", data), &Inst::Word8 { data } => format!("data.i64 {}", data), @@ -2683,15 +2577,134 @@ impl ShowWithRRU for Inst { } ret } - &Inst::GetPinnedReg { rd } => { - let rd = rd.show_rru(mb_rru); - format!("get_pinned_reg {}", rd) - } - &Inst::SetPinnedReg { rm } => { - let rm = rm.show_rru(mb_rru); - format!("set_pinned_reg {}", rm) - } &Inst::VirtualSPOffsetAdj { offset } => format!("virtual_sp_offset_adjust {}", offset), + &Inst::EmitIsland { needed_space } => format!("emit_island {}", needed_space), + } + } +} + +//============================================================================= +// Label fixups and jump veneers. + +/// Different forms of label references for different instruction formats. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum LabelUse { + /// 19-bit branch offset (conditional branches). PC-rel, offset is imm << 2. Immediate is 19 + /// signed bits, in bits 23:5. Used by cbz, cbnz, b.cond. + Branch19, + /// 26-bit branch offset (unconditional branches). PC-rel, offset is imm << 2. Immediate is 26 + /// signed bits, in bits 25:0. Used by b, bl. + Branch26, + /// 19-bit offset for LDR (load literal). PC-rel, offset is imm << 2. Immediate is 19 signed bits, + /// in bits 23:5. + Ldr19, + /// 21-bit offset for ADR (get address of label). PC-rel, offset is not shifted. Immediate is + /// 21 signed bits, with high 19 bits in bits 23:5 and low 2 bits in bits 30:29. + Adr21, + /// 32-bit PC relative constant offset (from address of constant itself). Used in jump tables. + PCRel32, +} + +impl MachInstLabelUse for LabelUse { + /// Alignment for veneer code. Every AArch64 instruction must be 4-byte-aligned. + const ALIGN: CodeOffset = 4; + + /// Maximum PC-relative range (positive), inclusive. + fn max_pos_range(self) -> CodeOffset { + match self { + // 19-bit immediate, left-shifted by 2, for 21 bits of total range. Signed, so +2^20 + // from zero. Likewise for two other shifted cases below. 
+ LabelUse::Branch19 => (1 << 20) - 1, + LabelUse::Branch26 => (1 << 27) - 1, + LabelUse::Ldr19 => (1 << 20) - 1, + // Adr does not shift its immediate, so the 21-bit immediate gives 21 bits of total + // range. + LabelUse::Adr21 => (1 << 20) - 1, + LabelUse::PCRel32 => 0x7fffffff, + } + } + + /// Maximum PC-relative range (negative). + fn max_neg_range(self) -> CodeOffset { + // All forms are twos-complement signed offsets, so negative limit is one more than + // positive limit. + self.max_pos_range() + 1 + } + + /// Size of window into code needed to do the patch. + fn patch_size(self) -> CodeOffset { + // Patch is on one instruction only for all of these label reference types. + 4 + } + + /// Perform the patch. + fn patch(self, buffer: &mut [u8], use_offset: CodeOffset, label_offset: CodeOffset) { + let pc_rel = (label_offset as i64) - (use_offset as i64); + debug_assert!(pc_rel <= self.max_pos_range() as i64); + debug_assert!(pc_rel >= -(self.max_neg_range() as i64)); + let pc_rel = pc_rel as u32; + let insn_word = u32::from_le_bytes([buffer[0], buffer[1], buffer[2], buffer[3]]); + let mask = match self { + LabelUse::Branch19 => 0x00ffffe0, // bits 23..5 inclusive + LabelUse::Branch26 => 0x03ffffff, // bits 25..0 inclusive + LabelUse::Ldr19 => 0x00ffffe0, // bits 23..5 inclusive + LabelUse::Adr21 => 0x60ffffe0, // bits 30..29, 25..5 inclusive + LabelUse::PCRel32 => 0xffffffff, + }; + let pc_rel_shifted = match self { + LabelUse::Adr21 | LabelUse::PCRel32 => pc_rel, + _ => { + debug_assert!(pc_rel & 3 == 0); + pc_rel >> 2 + } + }; + let pc_rel_inserted = match self { + LabelUse::Branch19 | LabelUse::Ldr19 => (pc_rel_shifted & 0x7ffff) << 5, + LabelUse::Branch26 => pc_rel_shifted & 0x3ffffff, + LabelUse::Adr21 => (pc_rel_shifted & 0x7ffff) << 5 | (pc_rel_shifted & 0x180000) << 10, + LabelUse::PCRel32 => pc_rel_shifted, + }; + let is_add = match self { + LabelUse::PCRel32 => true, + _ => false, + }; + let insn_word = if is_add { + insn_word.wrapping_add(pc_rel_inserted) + } else { + (insn_word & !mask) | pc_rel_inserted + }; + buffer[0..4].clone_from_slice(&u32::to_le_bytes(insn_word)); + } + + /// Is a veneer supported for this label reference type? + fn supports_veneer(self) -> bool { + match self { + LabelUse::Branch19 => true, // veneer is a Branch26 + _ => false, + } + } + + /// How large is the veneer, if supported? + fn veneer_size(self) -> CodeOffset { + 4 + } + + /// Generate a veneer into the buffer, given that this veneer is at `veneer_offset`, and return + /// an offset and label-use for the veneer's use of the original label. + fn generate_veneer( + self, + buffer: &mut [u8], + veneer_offset: CodeOffset, + ) -> (CodeOffset, LabelUse) { + match self { + LabelUse::Branch19 => { + // veneer is a Branch26 (unconditional branch). Just encode directly here -- don't + // bother with constructing an Inst. 
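// Illustrative aside (the constants and helper are for exposition only): why
// a `Branch26` veneer rescues an out-of-range `Branch19`. A 19-bit word
// offset reaches roughly +/-1 MiB, while an unconditional `b` has a 26-bit
// word offset and reaches roughly +/-128 MiB; the conditional branch only has
// to reach the nearby veneer, which then covers the full distance.
const BRANCH19_REACH_BYTES: i64 = 1 << 20; // (2^18 words) * 4
const BRANCH26_REACH_BYTES: i64 = 1 << 27; // (2^25 words) * 4
fn branch19_in_range(from: i64, to: i64) -> bool {
    let d = to - from;
    d < BRANCH19_REACH_BYTES && d >= -BRANCH19_REACH_BYTES
}
// A conditional branch for which this returns false gets a veneer placed
// within reach; the `b` encoded just below is that veneer.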
+ let insn_word = 0b000101 << 26; + buffer[0..4].clone_from_slice(&u32::to_le_bytes(insn_word)); + (veneer_offset, LabelUse::Branch26) + } + _ => panic!("Unsupported label-reference type for veneer generation!"), } } } diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs index f281c05af6..d1368a3d97 100644 --- a/cranelift/codegen/src/isa/aarch64/lower.rs +++ b/cranelift/codegen/src/isa/aarch64/lower.rs @@ -14,12 +14,14 @@ use crate::ir::Inst as IRInst; use crate::ir::{InstructionData, Opcode, TrapCode, Type}; use crate::machinst::lower::*; use crate::machinst::*; +use crate::CodegenResult; use crate::isa::aarch64::inst::*; use crate::isa::aarch64::AArch64Backend; use super::lower_inst; +use log::debug; use regalloc::{Reg, RegClass, Writable}; //============================================================================ @@ -104,18 +106,11 @@ pub(crate) enum ResultRegImmShift { } //============================================================================ -// Instruction input and output "slots". +// Instruction input "slots". // // We use these types to refer to operand numbers, and result numbers, together // with the associated instruction, in a type-safe way. -/// Identifier for a particular output of an instruction. -#[derive(Clone, Copy, Debug, PartialEq, Eq)] -pub(crate) struct InsnOutput { - pub(crate) insn: IRInst, - pub(crate) output: usize, -} - /// Identifier for a particular input of an instruction. #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub(crate) struct InsnInput { @@ -123,93 +118,28 @@ pub(crate) struct InsnInput { pub(crate) input: usize, } -/// Producer of a value: either a previous instruction's output, or a register that will be -/// codegen'd separately. +/// Identifier for a particular output of an instruction. #[derive(Clone, Copy, Debug, PartialEq, Eq)] -pub(crate) enum InsnInputSource { - Output(InsnOutput), - Reg(Reg), -} - -impl InsnInputSource { - fn as_output(self) -> Option { - match self { - InsnInputSource::Output(o) => Some(o), - _ => None, - } - } -} - -fn get_input>(ctx: &mut C, output: InsnOutput, num: usize) -> InsnInput { - assert!(num <= ctx.num_inputs(output.insn)); - InsnInput { - insn: output.insn, - input: num, - } -} - -/// Convert an instruction input to a producing instruction's output if possible (in same BB), or a -/// register otherwise. -fn input_source>(ctx: &mut C, input: InsnInput) -> InsnInputSource { - if let Some((input_inst, result_num)) = ctx.input_inst(input.insn, input.input) { - let out = InsnOutput { - insn: input_inst, - output: result_num, - }; - InsnInputSource::Output(out) - } else { - let reg = ctx.input(input.insn, input.input); - InsnInputSource::Reg(reg) - } +pub(crate) struct InsnOutput { + pub(crate) insn: IRInst, + pub(crate) output: usize, } //============================================================================ -// Lowering: convert instruction outputs to result types. +// Lowering: convert instruction inputs to forms that we can use. -/// Lower an instruction output to a 64-bit constant, if possible. -pub(crate) fn output_to_const>(ctx: &mut C, out: InsnOutput) -> Option { - if out.output > 0 { - None - } else { - let inst_data = ctx.data(out.insn); - if inst_data.opcode() == Opcode::Null { - Some(0) - } else { - match inst_data { - &InstructionData::UnaryImm { opcode: _, imm } => { - // Only has Into for i64; we use u64 elsewhere, so we cast. 
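// Illustrative only (the helpers are hypothetical): the "constant as a u64
// bit pattern" convention shared by `is_constant_64bit`/`get_constant` and
// consumed by `gen_constant`. Integers keep their bits, booleans become 0/1,
// and floats are stored as raw bits, with f32 bits zero-extended to 64 bits
// and recovered via `from_bits`.
fn f32_to_const_bits(x: f32) -> u64 {
    x.to_bits() as u64
}
fn const_bits_to_f32(bits: u64) -> f32 {
    f32::from_bits(bits as u32)
}
// e.g. const_bits_to_f32(f32_to_const_bits(1.5)) == 1.5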
- let imm: i64 = imm.into(); - Some(imm as u64) - } - &InstructionData::UnaryBool { opcode: _, imm } => Some(u64::from(imm)), - &InstructionData::UnaryIeee32 { opcode: _, imm } => Some(u64::from(imm.bits())), - &InstructionData::UnaryIeee64 { opcode: _, imm } => Some(imm.bits()), - _ => None, - } - } - } +/// Lower an instruction input to a 64-bit constant, if possible. +pub(crate) fn input_to_const>(ctx: &mut C, input: InsnInput) -> Option { + let input = ctx.get_input(input.insn, input.input); + input.constant } -pub(crate) fn output_to_const_f32>( +/// Lower an instruction input to a constant register-shift amount, if possible. +pub(crate) fn input_to_shiftimm>( ctx: &mut C, - out: InsnOutput, -) -> Option { - output_to_const(ctx, out).map(|value| f32::from_bits(value as u32)) -} - -pub(crate) fn output_to_const_f64>( - ctx: &mut C, - out: InsnOutput, -) -> Option { - output_to_const(ctx, out).map(|value| f64::from_bits(value)) -} - -/// Lower an instruction output to a constant register-shift amount, if possible. -pub(crate) fn output_to_shiftimm>( - ctx: &mut C, - out: InsnOutput, + input: InsnInput, ) -> Option { - output_to_const(ctx, out).and_then(ShiftOpShiftImm::maybe_from_shift) + input_to_const(ctx, input).and_then(ShiftOpShiftImm::maybe_from_shift) } /// How to handle narrow values loaded into registers; see note on `narrow_mode` @@ -237,9 +167,9 @@ impl NarrowValueMode { } } -/// Lower an instruction output to a reg. +/// Allocate a register for an instruction output and return it. pub(crate) fn output_to_reg>(ctx: &mut C, out: InsnOutput) -> Writable { - ctx.output(out.insn, out.output) + ctx.get_output(out.insn, out.output) } /// Lower an instruction input to a reg. @@ -252,9 +182,22 @@ pub(crate) fn input_to_reg>( input: InsnInput, narrow_mode: NarrowValueMode, ) -> Reg { + debug!("input_to_reg: input {:?}", input); let ty = ctx.input_ty(input.insn, input.input); let from_bits = ty_bits(ty) as u8; - let in_reg = ctx.input(input.insn, input.input); + let inputs = ctx.get_input(input.insn, input.input); + let in_reg = if let Some(c) = inputs.constant { + // Generate constants fresh at each use to minimize long-range register pressure. + let to_reg = ctx.tmp(Inst::rc_for_type(ty).unwrap(), ty); + for inst in Inst::gen_constant(to_reg, c, ty).into_iter() { + ctx.emit(inst); + } + to_reg.to_reg() + } else { + ctx.use_input_reg(inputs); + inputs.reg + }; + match (narrow_mode, from_bits) { (NarrowValueMode::None, _) => in_reg, (NarrowValueMode::ZeroExtend32, n) if n < 32 => { @@ -282,15 +225,20 @@ pub(crate) fn input_to_reg>( (NarrowValueMode::ZeroExtend32, 32) | (NarrowValueMode::SignExtend32, 32) => in_reg, (NarrowValueMode::ZeroExtend64, n) if n < 64 => { - let tmp = ctx.tmp(RegClass::I64, I32); - ctx.emit(Inst::Extend { - rd: tmp, - rn: in_reg, - signed: false, - from_bits, - to_bits: 64, - }); - tmp.to_reg() + if inputs.constant.is_some() { + // Constants are zero-extended to full 64-bit width on load already. + in_reg + } else { + let tmp = ctx.tmp(RegClass::I64, I32); + ctx.emit(Inst::Extend { + rd: tmp, + rn: in_reg, + signed: false, + from_bits, + to_bits: 64, + }); + tmp.to_reg() + } } (NarrowValueMode::SignExtend64, n) if n < 64 => { let tmp = ctx.tmp(RegClass::I64, I32); @@ -313,8 +261,6 @@ pub(crate) fn input_to_reg>( } /// Lower an instruction input to a reg or reg/shift, or reg/extend operand. -/// This does not actually codegen the source instruction; it just uses the -/// vreg into which the source instruction will generate its value. 
/// /// The `narrow_mode` flag indicates whether the consumer of this value needs /// the high bits clear. For many operations, such as an add/sub/mul or any @@ -330,23 +276,18 @@ fn input_to_rs>( input: InsnInput, narrow_mode: NarrowValueMode, ) -> ResultRS { - if let InsnInputSource::Output(out) = input_source(ctx, input) { - let insn = out.insn; - assert!(out.output <= ctx.num_outputs(insn)); + let inputs = ctx.get_input(input.insn, input.input); + if let Some((insn, 0)) = inputs.inst { let op = ctx.data(insn).opcode(); if op == Opcode::Ishl { - let shiftee = get_input(ctx, out, 0); - let shift_amt = get_input(ctx, out, 1); + let shiftee = InsnInput { insn, input: 0 }; + let shift_amt = InsnInput { insn, input: 1 }; // Can we get the shift amount as an immediate? - if let Some(shift_amt_out) = input_source(ctx, shift_amt).as_output() { - if let Some(shiftimm) = output_to_shiftimm(ctx, shift_amt_out) { - let reg = input_to_reg(ctx, shiftee, narrow_mode); - ctx.merged(insn); - ctx.merged(shift_amt_out.insn); - return ResultRS::RegShift(reg, ShiftOpAndAmt::new(ShiftOp::LSL, shiftimm)); - } + if let Some(shiftimm) = input_to_shiftimm(ctx, shift_amt) { + let reg = input_to_reg(ctx, shiftee, narrow_mode); + return ResultRS::RegShift(reg, ShiftOpAndAmt::new(ShiftOp::LSL, shiftimm)); } } } @@ -364,11 +305,10 @@ fn input_to_rse>( input: InsnInput, narrow_mode: NarrowValueMode, ) -> ResultRSE { - if let InsnInputSource::Output(out) = input_source(ctx, input) { - let insn = out.insn; - assert!(out.output <= ctx.num_outputs(insn)); + let inputs = ctx.get_input(input.insn, input.input); + if let Some((insn, 0)) = inputs.inst { let op = ctx.data(insn).opcode(); - let out_ty = ctx.output_ty(insn, out.output); + let out_ty = ctx.output_ty(insn, 0); let out_bits = ty_bits(out_ty); // If `out_ty` is smaller than 32 bits and we need to zero- or sign-extend, @@ -378,7 +318,7 @@ fn input_to_rse>( && ((narrow_mode.is_32bit() && out_bits < 32) || (!narrow_mode.is_32bit() && out_bits < 64)) { - let reg = output_to_reg(ctx, out); + let reg = input_to_reg(ctx, InsnInput { insn, input: 0 }, NarrowValueMode::None); let extendop = match (narrow_mode, out_bits) { (NarrowValueMode::SignExtend32, 1) | (NarrowValueMode::SignExtend64, 1) => { ExtendOp::SXTB @@ -402,15 +342,14 @@ fn input_to_rse>( (NarrowValueMode::ZeroExtend64, 32) => ExtendOp::UXTW, _ => unreachable!(), }; - return ResultRSE::RegExtend(reg.to_reg(), extendop); + return ResultRSE::RegExtend(reg, extendop); } // Is this a zero-extend or sign-extend and can we handle that with a register-mode operator? 
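// Illustrative only (free functions for exposition, not part of the
// lowering): the value semantics behind the ExtendOp choices in this
// function, written as plain Rust casts. UXTB/UXTH/UXTW zero-extend the low
// 8/16/32 bits; SXTB/SXTH/SXTW sign-extend them.
fn uxtb(x: u64) -> u64 { x as u8 as u64 }
fn sxtb(x: u64) -> u64 { x as u8 as i8 as i64 as u64 }
fn uxtw(x: u64) -> u64 { x as u32 as u64 }
fn sxtw(x: u64) -> u64 { x as u32 as i32 as i64 as u64 }
// e.g. uxtb(0xff) == 0xff, while sxtb(0xff) == u64::MAX (i.e. -1 as u64).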
if op == Opcode::Uextend || op == Opcode::Sextend { assert!(out_bits == 32 || out_bits == 64); let sign_extend = op == Opcode::Sextend; - let extendee = get_input(ctx, out, 0); - let inner_ty = ctx.input_ty(extendee.insn, extendee.input); + let inner_ty = ctx.input_ty(insn, 0); let inner_bits = ty_bits(inner_ty); assert!(inner_bits < out_bits); let extendop = match (sign_extend, inner_bits) { @@ -424,8 +363,7 @@ fn input_to_rse>( (false, 32) => ExtendOp::UXTW, _ => unreachable!(), }; - let reg = input_to_reg(ctx, extendee, NarrowValueMode::None); - ctx.merged(insn); + let reg = input_to_reg(ctx, InsnInput { insn, input: 0 }, NarrowValueMode::None); return ResultRSE::RegExtend(reg, extendop); } } @@ -438,12 +376,9 @@ pub(crate) fn input_to_rse_imm12>( input: InsnInput, narrow_mode: NarrowValueMode, ) -> ResultRSEImm12 { - if let InsnInputSource::Output(out) = input_source(ctx, input) { - if let Some(imm_value) = output_to_const(ctx, out) { - if let Some(i) = Imm12::maybe_from_u64(imm_value) { - ctx.merged(out.insn); - return ResultRSEImm12::Imm12(i); - } + if let Some(imm_value) = input_to_const(ctx, input) { + if let Some(i) = Imm12::maybe_from_u64(imm_value) { + return ResultRSEImm12::Imm12(i); } } @@ -455,14 +390,11 @@ pub(crate) fn input_to_rs_immlogic>( input: InsnInput, narrow_mode: NarrowValueMode, ) -> ResultRSImmLogic { - if let InsnInputSource::Output(out) = input_source(ctx, input) { - if let Some(imm_value) = output_to_const(ctx, out) { - let ty = ctx.output_ty(out.insn, out.output); - let ty = if ty_bits(ty) < 32 { I32 } else { ty }; - if let Some(i) = ImmLogic::maybe_from_u64(imm_value, ty) { - ctx.merged(out.insn); - return ResultRSImmLogic::ImmLogic(i); - } + if let Some(imm_value) = input_to_const(ctx, input) { + let ty = ctx.input_ty(input.insn, input.input); + let ty = if ty_bits(ty) < 32 { I32 } else { ty }; + if let Some(i) = ImmLogic::maybe_from_u64(imm_value, ty) { + return ResultRSImmLogic::ImmLogic(i); } } @@ -473,12 +405,9 @@ pub(crate) fn input_to_reg_immshift>( ctx: &mut C, input: InsnInput, ) -> ResultRegImmShift { - if let InsnInputSource::Output(out) = input_source(ctx, input) { - if let Some(imm_value) = output_to_const(ctx, out) { - if let Some(immshift) = ImmShift::maybe_from_u64(imm_value) { - ctx.merged(out.insn); - return ResultRegImmShift::ImmShift(immshift); - } + if let Some(imm_value) = input_to_const(ctx, input) { + if let Some(immshift) = ImmShift::maybe_from_u64(imm_value) { + return ResultRegImmShift::ImmShift(immshift); } } @@ -823,24 +752,29 @@ pub(crate) fn inst_trapcode(data: &InstructionData) -> Option { } } -/// Checks for an instance of `op` feeding the given input. Marks as merged (decrementing refcount) if so. +/// Checks for an instance of `op` feeding the given input. pub(crate) fn maybe_input_insn>( c: &mut C, input: InsnInput, op: Opcode, ) -> Option { - if let InsnInputSource::Output(out) = input_source(c, input) { - let data = c.data(out.insn); + let inputs = c.get_input(input.insn, input.input); + debug!( + "maybe_input_insn: input {:?} has options {:?}; looking for op {:?}", + input, inputs, op + ); + if let Some((src_inst, _)) = inputs.inst { + let data = c.data(src_inst); + debug!(" -> input inst {:?}", data); if data.opcode() == op { - c.merged(out.insn); - return Some(out.insn); + return Some(src_inst); } } None } /// Checks for an instance of `op` feeding the given input, possibly via a conversion `conv` (e.g., -/// Bint or a bitcast). Marks one or both as merged if so, as appropriate. +/// Bint or a bitcast). 
/// /// FIXME cfallin 2020-03-30: this is really ugly. Factor out tree-matching stuff and make it /// a bit more generic. @@ -850,21 +784,19 @@ pub(crate) fn maybe_input_insn_via_conv>( op: Opcode, conv: Opcode, ) -> Option { - if let Some(ret) = maybe_input_insn(c, input, op) { - return Some(ret); - } - - if let InsnInputSource::Output(out) = input_source(c, input) { - let data = c.data(out.insn); + let inputs = c.get_input(input.insn, input.input); + if let Some((src_inst, _)) = inputs.inst { + let data = c.data(src_inst); + if data.opcode() == op { + return Some(src_inst); + } if data.opcode() == conv { - let conv_insn = out.insn; - let conv_input = InsnInput { - insn: conv_insn, - input: 0, - }; - if let Some(inner) = maybe_input_insn(c, conv_input, op) { - c.merged(conv_insn); - return Some(inner); + let inputs = c.get_input(src_inst, 0); + if let Some((src_inst, _)) = inputs.inst { + let data = c.data(src_inst); + if data.opcode() == op { + return Some(src_inst); + } } } } @@ -876,6 +808,7 @@ pub(crate) fn lower_icmp_or_ifcmp_to_flags>( insn: IRInst, is_signed: bool, ) { + debug!("lower_icmp_or_ifcmp_to_flags: insn {}", insn); let ty = ctx.input_ty(insn, 0); let bits = ty_bits(ty); let narrow_mode = match (bits <= 32, is_signed) { @@ -897,6 +830,7 @@ pub(crate) fn lower_icmp_or_ifcmp_to_flags>( let ty = ctx.input_ty(insn, 0); let rn = input_to_reg(ctx, inputs[0], narrow_mode); let rm = input_to_rse_imm12(ctx, inputs[1], narrow_mode); + debug!("lower_icmp_or_ifcmp_to_flags: rn = {:?} rm = {:?}", rn, rm); let alu_op = choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64); let rd = writable_zero_reg(); ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm)); @@ -934,17 +868,21 @@ pub(crate) fn lower_fcmp_or_ffcmp_to_flags>(ctx: &mut C, i impl LowerBackend for AArch64Backend { type MInst = Inst; - fn lower>(&self, ctx: &mut C, ir_inst: IRInst) { - lower_inst::lower_insn_to_regs(ctx, ir_inst); + fn lower>(&self, ctx: &mut C, ir_inst: IRInst) -> CodegenResult<()> { + lower_inst::lower_insn_to_regs(ctx, ir_inst) } fn lower_branch_group>( &self, ctx: &mut C, branches: &[IRInst], - targets: &[BlockIndex], - fallthrough: Option, - ) { + targets: &[MachLabel], + fallthrough: Option, + ) -> CodegenResult<()> { lower_inst::lower_branch(ctx, branches, targets, fallthrough) } + + fn maybe_pinned_reg(&self) -> Option { + Some(xreg(PINNED_REG)) + } } diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs index ffa9e11012..8692d853de 100644 --- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs +++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs @@ -1,11 +1,13 @@ //! Lower a single Cranelift instruction into vcode. +use crate::binemit::CodeOffset; use crate::ir::condcodes::FloatCC; use crate::ir::types::*; use crate::ir::Inst as IRInst; use crate::ir::{InstructionData, Opcode, TrapCode}; use crate::machinst::lower::*; use crate::machinst::*; +use crate::CodegenResult; use crate::isa::aarch64::abi::*; use crate::isa::aarch64::inst::*; @@ -19,7 +21,10 @@ use smallvec::SmallVec; use super::lower::*; /// Actually codegen an instruction's results into registers. 
-pub(crate) fn lower_insn_to_regs>(ctx: &mut C, insn: IRInst) { +pub(crate) fn lower_insn_to_regs>( + ctx: &mut C, + insn: IRInst, +) -> CodegenResult<()> { let op = ctx.data(insn).opcode(); let inputs: SmallVec<[InsnInput; 4]> = (0..ctx.num_inputs(insn)) .map(|i| InsnInput { insn, input: i }) @@ -35,17 +40,17 @@ pub(crate) fn lower_insn_to_regs>(ctx: &mut C, insn: IRIns match op { Opcode::Iconst | Opcode::Bconst | Opcode::Null => { - let value = output_to_const(ctx, outputs[0]).unwrap(); + let value = ctx.get_constant(insn).unwrap(); let rd = output_to_reg(ctx, outputs[0]); lower_constant_u64(ctx, rd, value); } Opcode::F32const => { - let value = output_to_const_f32(ctx, outputs[0]).unwrap(); + let value = f32::from_bits(ctx.get_constant(insn).unwrap() as u32); let rd = output_to_reg(ctx, outputs[0]); lower_constant_f32(ctx, rd, value); } Opcode::F64const => { - let value = output_to_const_f64(ctx, outputs[0]).unwrap(); + let value = f64::from_bits(ctx.get_constant(insn).unwrap()); let rd = output_to_reg(ctx, outputs[0]); lower_constant_f64(ctx, rd, value); } @@ -271,7 +276,7 @@ pub(crate) fn lower_insn_to_regs>(ctx: &mut C, insn: IRIns // Check for divide by 0. let branch_size = 8; - ctx.emit(Inst::CondBrLowered { + ctx.emit(Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(branch_size), kind: CondBrKind::NotZero(rm), }); @@ -297,7 +302,7 @@ pub(crate) fn lower_insn_to_regs>(ctx: &mut C, insn: IRIns // Check for divide by 0. let branch_size = 20; - ctx.emit(Inst::CondBrLowered { + ctx.emit(Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(branch_size), kind: CondBrKind::Zero(rm), }); @@ -324,7 +329,7 @@ pub(crate) fn lower_insn_to_regs>(ctx: &mut C, insn: IRIns nzcv: NZCV::new(false, false, false, false), cond: Cond::Eq, }); - ctx.emit(Inst::CondBrLowered { + ctx.emit(Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(12), kind: CondBrKind::Cond(Cond::Vc), }); @@ -337,7 +342,7 @@ pub(crate) fn lower_insn_to_regs>(ctx: &mut C, insn: IRIns // Check for divide by 0. let branch_size = 8; - ctx.emit(Inst::CondBrLowered { + ctx.emit(Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(branch_size), kind: CondBrKind::NotZero(rm), }); @@ -1211,7 +1216,7 @@ pub(crate) fn lower_insn_to_regs>(ctx: &mut C, insn: IRIns // Branch around the break instruction with inverted cond. Go straight to lowered // one-target form; this is logically part of a single-in single-out template lowering. 
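// Illustrative only (constant and helper are for exposition): how the fixed
// ResolvedOffset values in these open-coded sequences are chosen. Every
// AArch64 instruction is 4 bytes and the branch offset is relative to the
// branch itself, so skipping N following instructions needs an offset of
// (N + 1) * 4 -- e.g. ResolvedOffset(8) just below skips exactly the one
// break/trap instruction.
const A64_INSN_BYTES: isize = 4;
fn offset_to_skip(n_insns: isize) -> isize {
    (n_insns + 1) * A64_INSN_BYTES
}
// offset_to_skip(1) == 8; offset_to_skip(4) == 20 (cf. `branch_size = 20`
// in the divide-by-zero check above).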
let cond = cond.invert(); - ctx.emit(Inst::CondBrLowered { + ctx.emit(Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(8), kind: CondBrKind::Cond(cond), }); @@ -1301,11 +1306,12 @@ pub(crate) fn lower_insn_to_regs>(ctx: &mut C, insn: IRIns Opcode::GetPinnedReg => { let rd = output_to_reg(ctx, outputs[0]); - ctx.emit(Inst::GetPinnedReg { rd }); + ctx.emit(Inst::mov(rd, xreg(PINNED_REG))); } + Opcode::SetPinnedReg => { let rm = input_to_reg(ctx, inputs[0], NarrowValueMode::None); - ctx.emit(Inst::SetPinnedReg { rm }); + ctx.emit(Inst::mov(writable_xreg(PINNED_REG), rm)); } Opcode::Spill @@ -1533,7 +1539,7 @@ pub(crate) fn lower_insn_to_regs>(ctx: &mut C, insn: IRIns } else { ctx.emit(Inst::FpuCmp64 { rn, rm: rn }); } - ctx.emit(Inst::CondBrLowered { + ctx.emit(Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(8), kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::Ordered)), }); @@ -1574,7 +1580,7 @@ pub(crate) fn lower_insn_to_regs>(ctx: &mut C, insn: IRIns rn, rm: tmp.to_reg(), }); - ctx.emit(Inst::CondBrLowered { + ctx.emit(Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(8), kind: CondBrKind::Cond(lower_fp_condcode(low_cond)), }); @@ -1587,7 +1593,7 @@ pub(crate) fn lower_insn_to_regs>(ctx: &mut C, insn: IRIns rn, rm: tmp.to_reg(), }); - ctx.emit(Inst::CondBrLowered { + ctx.emit(Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(8), kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::LessThan)), }); @@ -1617,7 +1623,7 @@ pub(crate) fn lower_insn_to_regs>(ctx: &mut C, insn: IRIns rn, rm: tmp.to_reg(), }); - ctx.emit(Inst::CondBrLowered { + ctx.emit(Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(8), kind: CondBrKind::Cond(lower_fp_condcode(low_cond)), }); @@ -1630,7 +1636,7 @@ pub(crate) fn lower_insn_to_regs>(ctx: &mut C, insn: IRIns rn, rm: tmp.to_reg(), }); - ctx.emit(Inst::CondBrLowered { + ctx.emit(Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(8), kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::LessThan)), }); @@ -1862,14 +1868,16 @@ pub(crate) fn lower_insn_to_regs>(ctx: &mut C, insn: IRIns Opcode::AvgRound => unimplemented!(), Opcode::TlsValue => unimplemented!(), } + + Ok(()) } pub(crate) fn lower_branch>( ctx: &mut C, branches: &[IRInst], - targets: &[BlockIndex], - fallthrough: Option, -) { + targets: &[MachLabel], + fallthrough: Option, +) -> CodegenResult<()> { // A block should end with at most two branches. The first may be a // conditional branch; a conditional branch can be followed only by an // unconditional branch or fallthrough. Otherwise, if only one branch, @@ -1883,18 +1891,14 @@ pub(crate) fn lower_branch>( let op0 = ctx.data(branches[0]).opcode(); let op1 = ctx.data(branches[1]).opcode(); - //println!( - // "lowering two-branch group: opcodes are {:?} and {:?}", - // op0, op1 - //); - assert!(op1 == Opcode::Jump || op1 == Opcode::Fallthrough); - let taken = BranchTarget::Block(targets[0]); + let taken = BranchTarget::Label(targets[0]); let not_taken = match op1 { - Opcode::Jump => BranchTarget::Block(targets[1]), - Opcode::Fallthrough => BranchTarget::Block(fallthrough.unwrap()), + Opcode::Jump => BranchTarget::Label(targets[1]), + Opcode::Fallthrough => BranchTarget::Label(fallthrough.unwrap()), _ => unreachable!(), // assert above. 
}; + match op0 { Opcode::Brz | Opcode::Brnz => { let flag_input = InsnInput { @@ -1954,6 +1958,8 @@ pub(crate) fn lower_branch>( Opcode::BrIcmp => { let condcode = inst_condcode(ctx.data(branches[0])).unwrap(); let cond = lower_condcode(condcode); + let kind = CondBrKind::Cond(cond); + let is_signed = condcode_is_signed(condcode); let ty = ctx.input_ty(branches[0], 0); let bits = ty_bits(ty); @@ -1986,13 +1992,15 @@ pub(crate) fn lower_branch>( ctx.emit(Inst::CondBr { taken, not_taken, - kind: CondBrKind::Cond(cond), + kind, }); } Opcode::Brif => { let condcode = inst_condcode(ctx.data(branches[0])).unwrap(); let cond = lower_condcode(condcode); + let kind = CondBrKind::Cond(cond); + let is_signed = condcode_is_signed(condcode); let flag_input = InsnInput { insn: branches[0], @@ -2003,7 +2011,7 @@ pub(crate) fn lower_branch>( ctx.emit(Inst::CondBr { taken, not_taken, - kind: CondBrKind::Cond(cond), + kind, }); } else { // If the ifcmp result is actually placed in a @@ -2013,7 +2021,7 @@ pub(crate) fn lower_branch>( ctx.emit(Inst::CondBr { taken, not_taken, - kind: CondBrKind::Cond(cond), + kind, }); } } @@ -2021,6 +2029,7 @@ pub(crate) fn lower_branch>( Opcode::Brff => { let condcode = inst_fp_condcode(ctx.data(branches[0])).unwrap(); let cond = lower_fp_condcode(condcode); + let kind = CondBrKind::Cond(cond); let flag_input = InsnInput { insn: branches[0], input: 0, @@ -2030,7 +2039,7 @@ pub(crate) fn lower_branch>( ctx.emit(Inst::CondBr { taken, not_taken, - kind: CondBrKind::Cond(cond), + kind, }); } else { // If the ffcmp result is actually placed in a @@ -2040,7 +2049,7 @@ pub(crate) fn lower_branch>( ctx.emit(Inst::CondBr { taken, not_taken, - kind: CondBrKind::Cond(cond), + kind, }); } } @@ -2057,12 +2066,13 @@ pub(crate) fn lower_branch>( // fills in `targets[0]` with our fallthrough block, so this // is valid for both Jump and Fallthrough. 
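// Editorial note (not part of this patch): emitting an explicit `Jump` here even for
// `Opcode::Fallthrough` is fine because `MachBuffer::optimize_branches` (added in
// buffer.rs later in this patch) deletes any unconditional branch whose target label
// resolves to the offset immediately after it, so a genuine fallthrough still costs
// zero bytes in the final code (see `test_elide_jump_to_next`).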
ctx.emit(Inst::Jump { - dest: BranchTarget::Block(targets[0]), + dest: BranchTarget::Label(targets[0]), }); } Opcode::BrTable => { // Expand `br_table index, default, JT` to: // + // (emit island with guard jump if needed) // subs idx, #jt_size // b.hs default // adr vTmp1, PC+16 @@ -2072,6 +2082,11 @@ pub(crate) fn lower_branch>( // [jumptable offsets relative to JT base] let jt_size = targets.len() - 1; assert!(jt_size <= std::u32::MAX as usize); + + ctx.emit(Inst::EmitIsland { + needed_space: 4 * (6 + jt_size) as CodeOffset, + }); + let ridx = input_to_reg( ctx, InsnInput { @@ -2101,10 +2116,10 @@ pub(crate) fn lower_branch>( rm: rtmp1.to_reg(), }); } - let default_target = BranchTarget::Block(targets[0]); - ctx.emit(Inst::CondBrLowered { - kind: CondBrKind::Cond(Cond::Hs), // unsigned >= + let default_target = BranchTarget::Label(targets[0]); + ctx.emit(Inst::OneWayCondBr { target: default_target.clone(), + kind: CondBrKind::Cond(Cond::Hs), // unsigned >= }); // Emit the compound instruction that does: @@ -2125,9 +2140,9 @@ pub(crate) fn lower_branch>( let jt_targets: Vec = targets .iter() .skip(1) - .map(|bix| BranchTarget::Block(*bix)) + .map(|bix| BranchTarget::Label(*bix)) .collect(); - let targets_for_term: Vec = targets.to_vec(); + let targets_for_term: Vec = targets.to_vec(); ctx.emit(Inst::JTSequence { ridx, rtmp1, @@ -2140,4 +2155,6 @@ pub(crate) fn lower_branch>( _ => panic!("Unknown branch type!"), } } + + Ok(()) } diff --git a/cranelift/codegen/src/isa/aarch64/mod.rs b/cranelift/codegen/src/isa/aarch64/mod.rs index d377d998c9..3aa8c779aa 100644 --- a/cranelift/codegen/src/isa/aarch64/mod.rs +++ b/cranelift/codegen/src/isa/aarch64/mod.rs @@ -15,7 +15,7 @@ use target_lexicon::{Aarch64Architecture, Architecture, Triple}; // New backend: mod abi; -mod inst; +pub(crate) mod inst; mod lower; mod lower_inst; @@ -59,7 +59,7 @@ impl MachBackend for AArch64Backend { ) -> CodegenResult { let flags = self.flags(); let vcode = self.compile_vcode(func, flags.clone())?; - let sections = vcode.emit(); + let buffer = vcode.emit(); let frame_size = vcode.frame_size(); let disasm = if want_disasm { @@ -68,8 +68,10 @@ impl MachBackend for AArch64Backend { None }; + let buffer = buffer.finish(); + Ok(MachCompileResult { - sections, + buffer, frame_size, disasm, }) @@ -140,8 +142,8 @@ mod test { Triple::from_str("aarch64").unwrap(), settings::Flags::new(shared_flags), ); - let sections = backend.compile_function(&mut func, false).unwrap().sections; - let code = §ions.sections[0].data; + let buffer = backend.compile_function(&mut func, false).unwrap().buffer; + let code = &buffer.data[..]; // stp x29, x30, [sp, #-16]! // mov x29, sp @@ -155,7 +157,7 @@ mod test { 0x01, 0x0b, 0xbf, 0x03, 0x00, 0x91, 0xfd, 0x7b, 0xc1, 0xa8, 0xc0, 0x03, 0x5f, 0xd6, ]; - assert_eq!(code, &golden); + assert_eq!(code, &golden[..]); } #[test] @@ -198,34 +200,32 @@ mod test { let result = backend .compile_function(&mut func, /* want_disasm = */ false) .unwrap(); - let code = &result.sections.sections[0].data; + let code = &result.buffer.data[..]; // stp x29, x30, [sp, #-16]! // mov x29, sp - // mov x1, x0 - // mov x0, #0x1234 - // add w1, w1, w0 - // mov w2, w1 - // cbz x2, ... - // mov w2, w1 - // cbz x2, ... 
- // sub w0, w1, w0 + // mov x1, #0x1234 // #4660 + // add w0, w0, w1 + // mov w1, w0 + // cbnz x1, 0x28 + // mov x1, #0x1234 // #4660 + // add w1, w0, w1 + // mov w1, w1 + // cbnz x1, 0x18 + // mov w1, w0 + // cbnz x1, 0x18 + // mov x1, #0x1234 // #4660 + // sub w0, w0, w1 // mov sp, x29 // ldp x29, x30, [sp], #16 // ret - // add w2, w1, w0 - // mov w2, w2 - // cbnz x2, ... <---- compound branch (cond / uncond) - // b ... <---- - let golden = vec![ - 0xfd, 0x7b, 0xbf, 0xa9, 0xfd, 0x03, 0x00, 0x91, 0xe1, 0x03, 0x00, 0xaa, 0x80, 0x46, - 0x82, 0xd2, 0x21, 0x00, 0x00, 0x0b, 0xe2, 0x03, 0x01, 0x2a, 0xe2, 0x00, 0x00, 0xb4, - 0xe2, 0x03, 0x01, 0x2a, 0xa2, 0x00, 0x00, 0xb5, 0x20, 0x00, 0x00, 0x4b, 0xbf, 0x03, - 0x00, 0x91, 0xfd, 0x7b, 0xc1, 0xa8, 0xc0, 0x03, 0x5f, 0xd6, 0x22, 0x00, 0x00, 0x0b, - 0xe2, 0x03, 0x02, 0x2a, 0xc2, 0xff, 0xff, 0xb5, 0xf7, 0xff, 0xff, 0x17, + 253, 123, 191, 169, 253, 3, 0, 145, 129, 70, 130, 210, 0, 0, 1, 11, 225, 3, 0, 42, 161, + 0, 0, 181, 129, 70, 130, 210, 1, 0, 1, 11, 225, 3, 1, 42, 161, 255, 255, 181, 225, 3, + 0, 42, 97, 255, 255, 181, 129, 70, 130, 210, 0, 0, 1, 75, 191, 3, 0, 145, 253, 123, + 193, 168, 192, 3, 95, 214, ]; - assert_eq!(code, &golden); + assert_eq!(code, &golden[..]); } } diff --git a/cranelift/codegen/src/isa/mod.rs b/cranelift/codegen/src/isa/mod.rs index 6c9a904f03..d193936a91 100644 --- a/cranelift/codegen/src/isa/mod.rs +++ b/cranelift/codegen/src/isa/mod.rs @@ -77,14 +77,14 @@ mod riscv; #[cfg(feature = "x86")] mod x86; -#[cfg(feature = "x64")] -mod x64; +//#[cfg(feature = "x64")] +//mod x64; #[cfg(feature = "arm32")] mod arm32; #[cfg(feature = "arm64")] -mod aarch64; +pub(crate) mod aarch64; #[cfg(feature = "unwind")] pub mod unwind; diff --git a/cranelift/codegen/src/isa/x86/mod.rs b/cranelift/codegen/src/isa/x86/mod.rs index 9386e60310..0cd825b161 100644 --- a/cranelift/codegen/src/isa/x86/mod.rs +++ b/cranelift/codegen/src/isa/x86/mod.rs @@ -57,11 +57,11 @@ fn isa_constructor( let isa_flags = settings::Flags::new(&shared_flags, builder); if isa_flags.use_new_backend() { - #[cfg(not(feature = "x64"))] + //#[cfg(not(feature = "x64"))] panic!("new backend x86 support not included by cargo features!"); - #[cfg(feature = "x64")] - super::x64::isa_builder(triple).finish(shared_flags) + //#[cfg(feature = "x64")] + //super::x64::isa_builder(triple).finish(shared_flags) } else { Box::new(Isa { triple, diff --git a/cranelift/codegen/src/lib.rs b/cranelift/codegen/src/lib.rs index 3483219fea..dd871924ab 100644 --- a/cranelift/codegen/src/lib.rs +++ b/cranelift/codegen/src/lib.rs @@ -99,7 +99,6 @@ mod iterators; mod legalizer; mod licm; mod nan_canonicalization; -mod num_uses; mod partition_slice; mod postopt; mod predicates; diff --git a/cranelift/codegen/src/machinst/blockorder.rs b/cranelift/codegen/src/machinst/blockorder.rs index dd826809c4..104b2f8c15 100644 --- a/cranelift/codegen/src/machinst/blockorder.rs +++ b/cranelift/codegen/src/machinst/blockorder.rs @@ -1,49 +1,579 @@ //! Computation of basic block order in emitted code. +//! +//! This module handles the translation from CLIF BBs to VCode BBs. +//! +//! The basic idea is that we compute a sequence of "lowered blocks" that +//! correspond to subgraphs of the CLIF CFG plus an implicit block on *every* +//! edge (not just critical edges). Conceptually, the lowering pipeline wants to +//! insert moves for phi-nodes on every block-to-block transfer; these blocks +//! always conceptually exist, but may be merged with an "original" CLIF block +//! 
(and hence not actually exist; this is equivalent to inserting the blocks +//! only on critical edges). +//! +//! Each `LoweredBlock` names just an original CLIF block, an original CLIF +//! block prepended or appended with an edge block (never both, though), or just +//! an edge block. +//! +//! To compute this lowering, we do a DFS over the CLIF-plus-edge-block graph +//! (never actually materialized, just defined by a "successors" function), and +//! compute the reverse postorder. +//! +//! This algorithm isn't perfect w.r.t. generated code quality: we don't, for +//! example, consider any information about whether edge blocks will actually +//! have content, because this computation happens as part of lowering *before* +//! regalloc, and regalloc may or may not insert moves/spills/reloads on any +//! particular edge. But it works relatively well and is conceptually simple. +use crate::entity::SecondaryMap; +use crate::fx::{FxHashMap, FxHashSet}; +use crate::ir::{Block, Function, Inst, Opcode}; +use crate::machinst::lower::visit_block_succs; use crate::machinst::*; -/// Simple reverse postorder-based block order emission. -/// -/// TODO: use a proper algorithm, such as the bottom-up straight-line-section -/// construction algorithm. -struct BlockRPO { - visited: Vec, - postorder: Vec, +use log::debug; +use smallvec::SmallVec; + +/// Mapping from CLIF BBs to VCode BBs. +#[derive(Debug)] +pub struct BlockLoweringOrder { + /// Lowered blocks, in BlockIndex order. Each block is some combination of + /// (i) a CLIF block, and (ii) inserted crit-edge blocks before or after; + /// see [LoweredBlock] for details. + lowered_order: Vec, + /// Successors for all lowered blocks, in one serialized vector. Indexed by + /// the ranges in `lowered_succ_ranges`. + lowered_succs: Vec<(Inst, LoweredBlock)>, + /// BlockIndex values for successors for all lowered blocks, in the same + /// order as `lowered_succs`. + lowered_succ_indices: Vec<(Inst, BlockIndex)>, + /// Ranges in `lowered_succs` giving the successor lists for each lowered + /// block. Indexed by lowering-order index (`BlockIndex`). + lowered_succ_ranges: Vec<(usize, usize)>, + /// Mapping from CLIF BB to BlockIndex (index in lowered order). Note that + /// some CLIF BBs may not be lowered; in particular, we skip unreachable + /// blocks. + orig_map: SecondaryMap>, } -impl BlockRPO { - fn new(vcode: &VCode) -> BlockRPO { - BlockRPO { - visited: vec![false; vcode.num_blocks()], - postorder: Vec::with_capacity(vcode.num_blocks()), +/// The origin of a block in the lowered block-order: either an original CLIF +/// block, or an inserted edge-block, or a combination of the two if an edge is +/// non-critical. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub enum LoweredBlock { + /// Block in original CLIF, with no merged edge-blocks. + Orig { + /// Original CLIF block. + block: Block, + }, + /// Block in the original CLIF, plus edge-block to one succ (which is the + /// one successor of the original block). + OrigAndEdge { + /// The original CLIF block contained in this lowered block. + block: Block, + /// The edge (jump) instruction transitioning from this block + /// to the next, i.e., corresponding to the included edge-block. This + /// will be an instruction in `block`. + edge_inst: Inst, + /// The successor CLIF block. + succ: Block, + }, + /// Block in the original CLIF, preceded by edge-block from one pred (which + /// is the one pred of the original block). 
+ EdgeAndOrig { + /// The previous CLIF block, i.e., the edge block's predecessor. + pred: Block, + /// The edge (jump) instruction corresponding to the included + /// edge-block. This will be an instruction in `pred`. + edge_inst: Inst, + /// The original CLIF block included in this lowered block. + block: Block, + }, + /// Split critical edge between two CLIF blocks. This lowered block does not + /// correspond to any original CLIF blocks; it only serves as an insertion + /// point for work to happen on the transition from `pred` to `succ`. + Edge { + /// The predecessor CLIF block. + pred: Block, + /// The edge (jump) instruction corresponding to this edge's transition. + /// This will be an instruction in `pred`. + edge_inst: Inst, + /// The successor CLIF block. + succ: Block, + }, +} + +impl LoweredBlock { + /// The associated original (CLIF) block included in this lowered block, if + /// any. + pub fn orig_block(self) -> Option { + match self { + LoweredBlock::Orig { block, .. } + | LoweredBlock::OrigAndEdge { block, .. } + | LoweredBlock::EdgeAndOrig { block, .. } => Some(block), + LoweredBlock::Edge { .. } => None, } } - fn visit(&mut self, vcode: &VCode, block: BlockIndex) { - self.visited[block as usize] = true; - for succ in vcode.succs(block) { - if !self.visited[succ.get() as usize] { - self.visit(vcode, succ.get()); + /// The associated in-edge, if any. + pub fn in_edge(self) -> Option<(Block, Inst, Block)> { + match self { + LoweredBlock::EdgeAndOrig { + pred, + edge_inst, + block, + } => Some((pred, edge_inst, block)), + _ => None, + } + } + + /// the associated out-edge, if any. Also includes edge-only blocks. + pub fn out_edge(self) -> Option<(Block, Inst, Block)> { + match self { + LoweredBlock::OrigAndEdge { + block, + edge_inst, + succ, + } => Some((block, edge_inst, succ)), + LoweredBlock::Edge { + pred, + edge_inst, + succ, + } => Some((pred, edge_inst, succ)), + _ => None, + } + } +} + +impl BlockLoweringOrder { + /// Compute and return a lowered block order for `f`. + pub fn new(f: &Function) -> BlockLoweringOrder { + debug!("BlockLoweringOrder: function body {:?}", f); + + // Step 1: compute the in-edge and out-edge count of every block. + let mut block_in_count = SecondaryMap::with_default(0); + let mut block_out_count = SecondaryMap::with_default(0); + + // Cache the block successors to avoid re-examining branches below. + let mut block_succs: SmallVec<[(Inst, Block); 128]> = SmallVec::new(); + let mut block_succ_range = SecondaryMap::with_default((0, 0)); + let mut fallthrough_return_block = None; + for block in f.layout.blocks() { + let block_succ_start = block_succs.len(); + visit_block_succs(f, block, |inst, succ| { + block_out_count[block] += 1; + block_in_count[succ] += 1; + block_succs.push((inst, succ)); + }); + let block_succ_end = block_succs.len(); + block_succ_range[block] = (block_succ_start, block_succ_end); + + for inst in f.layout.block_likely_branches(block) { + if f.dfg[inst].opcode() == Opcode::Return { + // Implicit output edge for any return. + block_out_count[block] += 1; + } + if f.dfg[inst].opcode() == Opcode::FallthroughReturn { + // Fallthrough return block must come last. + debug_assert!(fallthrough_return_block == None); + fallthrough_return_block = Some(block); + } } } - if Some(block) != vcode.fallthrough_return_block { - self.postorder.push(block); + // Implicit input edge for entry block. + if let Some(entry) = f.layout.entry_block() { + block_in_count[entry] += 1; } + + // Here we define the implicit CLIF-plus-edges graph. 
There are + // conceptually two such graphs: the original, with every edge explicit, + // and the merged one, with blocks (represented by `LoweredBlock` + // values) that contain original CLIF blocks, edges, or both. This + // function returns a lowered block's successors as per the latter, with + // consideration to edge-block merging. + // + // Note that there is a property of the block-merging rules below + // that is very important to ensure we don't miss any lowered blocks: + // any block in the implicit CLIF-plus-edges graph will *only* be + // included in one block in the merged graph. + // + // This, combined with the property that every edge block is reachable + // only from one predecessor (and hence cannot be reached by a DFS + // backedge), means that it is sufficient in our DFS below to track + // visited-bits per original CLIF block only, not per edge. This greatly + // simplifies the data structures (no need to keep a sparse hash-set of + // (block, block) tuples). + let compute_lowered_succs = |ret: &mut Vec<(Inst, LoweredBlock)>, block: LoweredBlock| { + let start_idx = ret.len(); + match block { + LoweredBlock::Orig { block } | LoweredBlock::EdgeAndOrig { block, .. } => { + // At an orig block; successors are always edge blocks, + // possibly with orig blocks following. + let range = block_succ_range[block]; + for &(edge_inst, succ) in &block_succs[range.0..range.1] { + if block_in_count[succ] == 1 { + ret.push(( + edge_inst, + LoweredBlock::EdgeAndOrig { + pred: block, + edge_inst, + block: succ, + }, + )); + } else { + ret.push(( + edge_inst, + LoweredBlock::Edge { + pred: block, + edge_inst, + succ, + }, + )); + } + } + } + LoweredBlock::Edge { + succ, edge_inst, .. + } + | LoweredBlock::OrigAndEdge { + succ, edge_inst, .. + } => { + // At an edge block; successors are always orig blocks, + // possibly with edge blocks following. + if block_out_count[succ] == 1 { + let range = block_succ_range[succ]; + // check if the one succ is a real CFG edge (vs. + // implicit return succ). + if range.1 - range.0 > 0 { + debug_assert!(range.1 - range.0 == 1); + let (succ_edge_inst, succ_succ) = block_succs[range.0]; + ret.push(( + edge_inst, + LoweredBlock::OrigAndEdge { + block: succ, + edge_inst: succ_edge_inst, + succ: succ_succ, + }, + )); + } else { + ret.push((edge_inst, LoweredBlock::Orig { block: succ })); + } + } else { + ret.push((edge_inst, LoweredBlock::Orig { block: succ })); + } + } + } + let end_idx = ret.len(); + (start_idx, end_idx) + }; + + // Build the explicit LoweredBlock-to-LoweredBlock successors list. + let mut lowered_succs = vec![]; + let mut lowered_succ_indices = vec![]; + + // Step 2: Compute RPO traversal of the implicit CLIF-plus-edge-block graph. Use an + // explicit stack so we don't overflow the real stack with a deep DFS. + #[derive(Debug)] + struct StackEntry { + this: LoweredBlock, + succs: (usize, usize), // range in lowered_succs + cur_succ: usize, // index in lowered_succs + } + + let mut stack: SmallVec<[StackEntry; 16]> = SmallVec::new(); + let mut visited = FxHashSet::default(); + let mut postorder = vec![]; + if let Some(entry) = f.layout.entry_block() { + // FIXME(cfallin): we might be able to use OrigAndEdge. Find a way + // to not special-case the entry block here. 
+ let block = LoweredBlock::Orig { block: entry }; + visited.insert(block); + let range = compute_lowered_succs(&mut lowered_succs, block); + lowered_succ_indices.resize(lowered_succs.len(), 0); + stack.push(StackEntry { + this: block, + succs: range, + cur_succ: range.1, + }); + } + + let mut deferred_last = None; + while !stack.is_empty() { + let stack_entry = stack.last_mut().unwrap(); + let range = stack_entry.succs; + if stack_entry.cur_succ == range.0 { + let orig_block = stack_entry.this.orig_block(); + if orig_block.is_some() && orig_block == fallthrough_return_block { + deferred_last = Some((stack_entry.this, range)); + } else { + postorder.push((stack_entry.this, range)); + } + stack.pop(); + } else { + // Heuristic: chase the children in reverse. This puts the first + // successor block first in RPO, all other things being equal, + // which tends to prioritize loop backedges over out-edges, + // putting the edge-block closer to the loop body and minimizing + // live-ranges in linear instruction space. + let next = lowered_succs[stack_entry.cur_succ - 1].1; + stack_entry.cur_succ -= 1; + if visited.contains(&next) { + continue; + } + visited.insert(next); + let range = compute_lowered_succs(&mut lowered_succs, next); + lowered_succ_indices.resize(lowered_succs.len(), 0); + stack.push(StackEntry { + this: next, + succs: range, + cur_succ: range.1, + }); + } + } + + postorder.reverse(); + let mut rpo = postorder; + if let Some(d) = deferred_last { + rpo.push(d); + } + + // Step 3: now that we have RPO, build the BlockIndex/BB fwd/rev maps. + let mut lowered_order = vec![]; + let mut lowered_succ_ranges = vec![]; + let mut lb_to_bindex = FxHashMap::default(); + for (block, succ_range) in rpo.into_iter() { + lb_to_bindex.insert(block, lowered_order.len() as BlockIndex); + lowered_order.push(block); + lowered_succ_ranges.push(succ_range); + } + + let lowered_succ_indices = lowered_succs + .iter() + .map(|&(inst, succ)| (inst, lb_to_bindex.get(&succ).cloned().unwrap())) + .collect(); + + let mut orig_map = SecondaryMap::with_default(None); + for (i, lb) in lowered_order.iter().enumerate() { + let i = i as BlockIndex; + if let Some(b) = lb.orig_block() { + orig_map[b] = Some(i); + } + } + + let result = BlockLoweringOrder { + lowered_order, + lowered_succs, + lowered_succ_indices, + lowered_succ_ranges, + orig_map, + }; + debug!("BlockLoweringOrder: {:?}", result); + result } - fn rpo(self, vcode: &VCode) -> Vec { - let mut rpo = self.postorder; - rpo.reverse(); - if let Some(block) = vcode.fallthrough_return_block { - rpo.push(block); - } - rpo + /// Get the lowered order of blocks. + pub fn lowered_order(&self) -> &[LoweredBlock] { + &self.lowered_order[..] + } + + /// Get the successors for a lowered block, by index in `lowered_order()`'s + /// returned slice. Each successsor is paired with the edge-instruction + /// (branch) corresponding to this edge. + pub fn succs(&self, block: BlockIndex) -> &[(Inst, LoweredBlock)] { + let range = self.lowered_succ_ranges[block as usize]; + &self.lowered_succs[range.0..range.1] + } + + /// Get the successor indices for a lowered block. + pub fn succ_indices(&self, block: BlockIndex) -> &[(Inst, BlockIndex)] { + let range = self.lowered_succ_ranges[block as usize]; + &self.lowered_succ_indices[range.0..range.1] + } + + /// Get the lowered block index containing a CLIF block, if any. (May not be + /// present if the original CLIF block was unreachable.) 
+ pub fn lowered_block_for_bb(&self, bb: Block) -> Option { + self.orig_map[bb] } } -/// Compute the final block order. -pub fn compute_final_block_order(vcode: &VCode) -> Vec { - let mut rpo = BlockRPO::new(vcode); - rpo.visit(vcode, vcode.entry()); - rpo.rpo(vcode) +#[cfg(test)] +mod test { + use super::*; + use crate::cursor::{Cursor, FuncCursor}; + use crate::ir::types::*; + use crate::ir::{AbiParam, ExternalName, Function, InstBuilder, Signature}; + use crate::isa::CallConv; + + fn build_test_func(n_blocks: usize, edges: &[(usize, usize)]) -> Function { + assert!(n_blocks > 0); + + let name = ExternalName::testcase("test0"); + let mut sig = Signature::new(CallConv::SystemV); + sig.params.push(AbiParam::new(I32)); + let mut func = Function::with_name_signature(name, sig); + let blocks = (0..n_blocks) + .map(|i| { + let bb = func.dfg.make_block(); + assert!(bb.as_u32() == i as u32); + bb + }) + .collect::>(); + + let arg0 = func.dfg.append_block_param(blocks[0], I32); + + let mut pos = FuncCursor::new(&mut func); + + let mut edge = 0; + for i in 0..n_blocks { + pos.insert_block(blocks[i]); + let mut succs = vec![]; + while edge < edges.len() && edges[edge].0 == i { + succs.push(edges[edge].1); + edge += 1; + } + if succs.len() == 0 { + pos.ins().return_(&[arg0]); + } else if succs.len() == 1 { + pos.ins().jump(blocks[succs[0]], &[]); + } else if succs.len() == 2 { + pos.ins().brnz(arg0, blocks[succs[0]], &[]); + pos.ins().jump(blocks[succs[1]], &[]); + } else { + panic!("Too many successors"); + } + } + + func + } + + #[test] + fn test_blockorder_diamond() { + let func = build_test_func(4, &[(0, 1), (0, 2), (1, 3), (2, 3)]); + let order = BlockLoweringOrder::new(&func); + + assert_eq!(order.lowered_order.len(), 6); + + assert!(order.lowered_order[0].orig_block().unwrap().as_u32() == 0); + assert!(order.lowered_order[0].in_edge().is_none()); + assert!(order.lowered_order[0].out_edge().is_none()); + + assert!(order.lowered_order[1].orig_block().unwrap().as_u32() == 1); + assert!(order.lowered_order[1].in_edge().unwrap().0.as_u32() == 0); + assert!(order.lowered_order[1].in_edge().unwrap().2.as_u32() == 1); + + assert!(order.lowered_order[2].orig_block().is_none()); + assert!(order.lowered_order[2].in_edge().is_none()); + assert!(order.lowered_order[2].out_edge().unwrap().0.as_u32() == 1); + assert!(order.lowered_order[2].out_edge().unwrap().2.as_u32() == 3); + + assert!(order.lowered_order[3].orig_block().unwrap().as_u32() == 2); + assert!(order.lowered_order[3].in_edge().unwrap().0.as_u32() == 0); + assert!(order.lowered_order[3].in_edge().unwrap().2.as_u32() == 2); + assert!(order.lowered_order[3].out_edge().is_none()); + + assert!(order.lowered_order[4].orig_block().is_none()); + assert!(order.lowered_order[4].in_edge().is_none()); + assert!(order.lowered_order[4].out_edge().unwrap().0.as_u32() == 2); + assert!(order.lowered_order[4].out_edge().unwrap().2.as_u32() == 3); + + assert!(order.lowered_order[5].orig_block().unwrap().as_u32() == 3); + assert!(order.lowered_order[5].in_edge().is_none()); + assert!(order.lowered_order[5].out_edge().is_none()); + } + + #[test] + fn test_blockorder_critedge() { + // 0 + // / \ + // 1 2 + // / \ \ + // 3 4 | + // |\ _|____| + // | \/ | + // | /\ | + // 5 6 + // + // (3 -> 5, 3 -> 6, 4 -> 6 are critical edges and must be split) + // + let func = build_test_func( + 7, + &[ + (0, 1), + (0, 2), + (1, 3), + (1, 4), + (2, 5), + (3, 5), + (3, 6), + (4, 6), + ], + ); + let order = BlockLoweringOrder::new(&func); + + assert_eq!(order.lowered_order.len(), 
11); + println!("ordered = {:?}", order.lowered_order); + + // block 0 + assert!(order.lowered_order[0].orig_block().unwrap().as_u32() == 0); + assert!(order.lowered_order[0].in_edge().is_none()); + assert!(order.lowered_order[0].out_edge().is_none()); + + // edge 0->1 + block 1 + assert!(order.lowered_order[1].orig_block().unwrap().as_u32() == 1); + assert!(order.lowered_order[1].in_edge().unwrap().0.as_u32() == 0); + assert!(order.lowered_order[1].in_edge().unwrap().2.as_u32() == 1); + assert!(order.lowered_order[1].out_edge().is_none()); + + // edge 1->3 + block 3 + assert!(order.lowered_order[2].orig_block().unwrap().as_u32() == 3); + assert!(order.lowered_order[2].in_edge().unwrap().0.as_u32() == 1); + assert!(order.lowered_order[2].in_edge().unwrap().2.as_u32() == 3); + assert!(order.lowered_order[2].out_edge().is_none()); + + // edge 3->5 + assert!(order.lowered_order[3].orig_block().is_none()); + assert!(order.lowered_order[3].in_edge().is_none()); + assert!(order.lowered_order[3].out_edge().unwrap().0.as_u32() == 3); + assert!(order.lowered_order[3].out_edge().unwrap().2.as_u32() == 5); + + // edge 3->6 + assert!(order.lowered_order[4].orig_block().is_none()); + assert!(order.lowered_order[4].in_edge().is_none()); + assert!(order.lowered_order[4].out_edge().unwrap().0.as_u32() == 3); + assert!(order.lowered_order[4].out_edge().unwrap().2.as_u32() == 6); + + // edge 1->4 + block 4 + assert!(order.lowered_order[5].orig_block().unwrap().as_u32() == 4); + assert!(order.lowered_order[5].in_edge().unwrap().0.as_u32() == 1); + assert!(order.lowered_order[5].in_edge().unwrap().2.as_u32() == 4); + assert!(order.lowered_order[5].out_edge().is_none()); + + // edge 4->6 + assert!(order.lowered_order[6].orig_block().is_none()); + assert!(order.lowered_order[6].in_edge().is_none()); + assert!(order.lowered_order[6].out_edge().unwrap().0.as_u32() == 4); + assert!(order.lowered_order[6].out_edge().unwrap().2.as_u32() == 6); + + // block 6 + assert!(order.lowered_order[7].orig_block().unwrap().as_u32() == 6); + assert!(order.lowered_order[7].in_edge().is_none()); + assert!(order.lowered_order[7].out_edge().is_none()); + + // edge 0->2 + block 2 + assert!(order.lowered_order[8].orig_block().unwrap().as_u32() == 2); + assert!(order.lowered_order[8].in_edge().unwrap().0.as_u32() == 0); + assert!(order.lowered_order[8].in_edge().unwrap().2.as_u32() == 2); + assert!(order.lowered_order[8].out_edge().is_none()); + + // edge 2->5 + assert!(order.lowered_order[9].orig_block().is_none()); + assert!(order.lowered_order[9].in_edge().is_none()); + assert!(order.lowered_order[9].out_edge().unwrap().0.as_u32() == 2); + assert!(order.lowered_order[9].out_edge().unwrap().2.as_u32() == 5); + + // block 5 + assert!(order.lowered_order[10].orig_block().unwrap().as_u32() == 5); + assert!(order.lowered_order[10].in_edge().is_none()); + assert!(order.lowered_order[10].out_edge().is_none()); + } } diff --git a/cranelift/codegen/src/machinst/buffer.rs b/cranelift/codegen/src/machinst/buffer.rs new file mode 100644 index 0000000000..b9e3bb3c1e --- /dev/null +++ b/cranelift/codegen/src/machinst/buffer.rs @@ -0,0 +1,1035 @@ +//! In-memory representation of compiled machine code, with labels and fixups to +//! refer to those labels. Handles constant-pool island insertion and also +//! veneer insertion for out-of-range jumps. 
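// Editorial sketch (not part of this patch): the emission pattern the island machinery
// below is designed for, mirroring `test_island` further down. `worst_case_inst_size`
// is a hypothetical per-backend bound, not an API defined in this file.
//
//     // Before emitting each instruction, check whether pending constants/veneers
//     // would drift out of range if emission continued, and if so flush them here.
//     if buf.island_needed(worst_case_inst_size) {
//         buf.emit_island();
//     }
//     inst.emit(&mut buf, &flags, &mut state); // ordinary per-instruction emission
//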
+ +use crate::binemit::{Addend, CodeOffset, CodeSink, Reloc}; +use crate::ir::{ExternalName, Opcode, SourceLoc, TrapCode}; +use crate::machinst::{BlockIndex, MachInstLabelUse, VCodeInst}; + +use log::debug; +use smallvec::SmallVec; +use std::mem; + +/// A buffer of output to be produced, fixed up, and then emitted to a CodeSink +/// in bulk. +/// +/// This struct uses `SmallVec`s to support small-ish function bodies without +/// any heap allocation. As such, it will be several kilobytes large. This is +/// likely fine as long as it is stack-allocated for function emission then +/// thrown away; but beware if many buffer objects are retained persistently. +pub struct MachBuffer { + /// The buffer contents, as raw bytes. + data: SmallVec<[u8; 1024]>, + /// Any relocations referring to this code. Note that only *external* + /// relocations are tracked here; references to labels within the buffer are + /// resolved before emission. + relocs: SmallVec<[MachReloc; 16]>, + /// Any trap records referring to this code. + traps: SmallVec<[MachTrap; 16]>, + /// Any call site records referring to this code. + call_sites: SmallVec<[MachCallSite; 16]>, + /// Any source location mappings referring to this code. + srclocs: SmallVec<[MachSrcLoc; 64]>, + /// The current source location in progress (after `start_srcloc()` and + /// before `end_srcloc()`). This is a (start_offset, src_loc) tuple. + cur_srcloc: Option<(CodeOffset, SourceLoc)>, + /// Known label offsets; `UNKNOWN_LABEL_OFFSET` if unknown. + label_offsets: SmallVec<[CodeOffset; 16]>, + /// Label aliases: one label points to an unconditional jump to another + /// label, so references to the first should be resolved as references + /// to the second. (We don't chase arbitrarily deep to avoid problems + /// with cycles.) + label_aliases: SmallVec<[MachLabel; 16]>, + /// Constants that must be emitted at some point. + pending_constants: SmallVec<[MachLabelConstant; 16]>, + /// Fixups that must be performed after all code is emitted. + fixup_records: SmallVec<[MachLabelFixup; 16]>, + /// Current deadline at which all constants are flushed and all code labels + /// are extended by emitting long-range jumps in an island. This flush + /// should be rare (e.g., on AArch64, the shortest-range PC-rel references + /// are +/- 1MB for conditional jumps and load-literal instructions), so + /// it's acceptable to track a minimum and flush-all rather than doing more + /// detailed "current minimum" / sort-by-deadline trickery. + island_deadline: CodeOffset, + /// How many bytes are needed in the worst case for an island, given all + /// pending constants and fixups. + island_worst_case_size: CodeOffset, + /// Latest branches, to facilitate in-place editing for better fallthrough + /// behavior and empty-block removal. + latest_branches: SmallVec<[MachBranch; 4]>, + /// All labels, in offset order. + labels_by_offset: SmallVec<[(MachLabel, CodeOffset); 16]>, +} + +/// A `MachBuffer` once emission is completed: holds generated code and records, +/// without fixups. This allows the type to be independent of the backend. +pub struct MachBufferFinalized { + /// The buffer contents, as raw bytes. + pub data: SmallVec<[u8; 1024]>, + /// Any relocations referring to this code. Note that only *external* + /// relocations are tracked here; references to labels within the buffer are + /// resolved before emission. + relocs: SmallVec<[MachReloc; 16]>, + /// Any trap records referring to this code. 
+ traps: SmallVec<[MachTrap; 16]>, + /// Any call site records referring to this code. + call_sites: SmallVec<[MachCallSite; 16]>, + /// Any source location mappings referring to this code. + srclocs: SmallVec<[MachSrcLoc; 64]>, +} + +static UNKNOWN_LABEL_OFFSET: CodeOffset = 0xffff_ffff; +static UNKNOWN_LABEL: MachLabel = MachLabel(0xffff_ffff); + +/// A label refers to some offset in a `MachBuffer`. It may not be resolved at +/// the point at which it is used by emitted code; the buffer records "fixups" +/// for references to the label, and will come back and patch the code +/// appropriately when the label's location is eventually known. +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct MachLabel(u32); + +impl MachLabel { + /// Get a label for a block. (The first N MachLabels are always reseved for + /// the N blocks in the vcode.) + pub fn from_block(bindex: BlockIndex) -> MachLabel { + MachLabel(bindex) + } + + /// Get the numeric label index. + pub fn get(self) -> u32 { + self.0 + } +} + +impl MachBuffer { + /// Create a new section, known to start at `start_offset` and with a size limited to `length_limit`. + pub fn new() -> MachBuffer { + MachBuffer { + data: SmallVec::new(), + relocs: SmallVec::new(), + traps: SmallVec::new(), + call_sites: SmallVec::new(), + srclocs: SmallVec::new(), + cur_srcloc: None, + label_offsets: SmallVec::new(), + label_aliases: SmallVec::new(), + pending_constants: SmallVec::new(), + fixup_records: SmallVec::new(), + island_deadline: UNKNOWN_LABEL_OFFSET, + island_worst_case_size: 0, + latest_branches: SmallVec::new(), + labels_by_offset: SmallVec::new(), + } + } + + /// Current offset from start of buffer. + pub fn cur_offset(&self) -> CodeOffset { + self.data.len() as CodeOffset + } + + /// Add a byte. + pub fn put1(&mut self, value: u8) { + debug!("MachBuffer: put byte @ {}: {:x}", self.cur_offset(), value); + self.data.push(value); + } + + /// Add 2 bytes. + pub fn put2(&mut self, value: u16) { + debug!( + "MachBuffer: put 16-bit word @ {}: {:x}", + self.cur_offset(), + value + ); + let bytes = value.to_le_bytes(); + self.data.extend_from_slice(&bytes[..]); + } + + /// Add 4 bytes. + pub fn put4(&mut self, value: u32) { + debug!( + "MachBuffer: put 32-bit word @ {}: {:x}", + self.cur_offset(), + value + ); + let bytes = value.to_le_bytes(); + self.data.extend_from_slice(&bytes[..]); + } + + /// Add 8 bytes. + pub fn put8(&mut self, value: u64) { + debug!( + "MachBuffer: put 64-bit word @ {}: {:x}", + self.cur_offset(), + value + ); + let bytes = value.to_le_bytes(); + self.data.extend_from_slice(&bytes[..]); + } + + /// Add a slice of bytes. + pub fn put_data(&mut self, data: &[u8]) { + debug!( + "MachBuffer: put data @ {}: len {}", + self.cur_offset(), + data.len() + ); + self.data.extend_from_slice(data); + } + + /// Reserve appended space and return a mutable slice referring to it. + pub fn get_appended_space(&mut self, len: usize) -> &mut [u8] { + debug!("MachBuffer: put data @ {}: len {}", self.cur_offset(), len); + let off = self.data.len(); + let new_len = self.data.len() + len; + self.data.resize(new_len, 0); + &mut self.data[off..] + } + + /// Align up to the given alignment. + pub fn align_to(&mut self, align_to: CodeOffset) { + debug!("MachBuffer: align to {}", align_to); + assert!(align_to.is_power_of_two()); + while self.cur_offset() & (align_to - 1) != 0 { + self.put1(0); + } + } + + /// Allocate a `Label` to refer to some offset. May not be bound to a fixed + /// offset yet. 
+ pub fn get_label(&mut self) -> MachLabel { + let l = self.label_offsets.len() as u32; + self.label_offsets.push(UNKNOWN_LABEL_OFFSET); + self.label_aliases.push(UNKNOWN_LABEL); + debug!("MachBuffer: new label -> {:?}", MachLabel(l)); + MachLabel(l) + } + + /// Reserve the first N MachLabels for blocks. + pub fn reserve_labels_for_blocks(&mut self, blocks: BlockIndex) { + debug!("MachBuffer: first {} labels are for blocks", blocks); + debug_assert!(self.label_offsets.is_empty()); + self.label_offsets + .resize(blocks as usize, UNKNOWN_LABEL_OFFSET); + self.label_aliases.resize(blocks as usize, UNKNOWN_LABEL); + } + + /// Bind a label to the current offset. + pub fn bind_label(&mut self, label: MachLabel) { + debug!( + "MachBuffer: bind label {:?} at offset {}", + label, + self.cur_offset() + ); + let offset = self.cur_offset(); + self.label_offsets[label.0 as usize] = offset; + self.labels_by_offset.push((label, offset)); + self.optimize_branches(); + } + + /// Resolve a label to an offset, if known. May return `UNKNOWN_LABEL_OFFSET`. + fn resolve_label_offset(&self, label: MachLabel) -> CodeOffset { + let alias = self.label_aliases[label.0 as usize]; + if alias != UNKNOWN_LABEL { + self.label_offsets[alias.0 as usize] + } else { + self.label_offsets[label.0 as usize] + } + } + + /// Emit a reference to the given label with the given reference type (i.e., + /// branch-instruction format) at the current offset. This is like a + /// relocation, but handled internally. + /// + /// Because the offset of the label may already be known and the patch may + /// happen immediately, the buffer must already contain bytes at `offset` up + /// to `offset + kind.patch_size()`. + pub fn use_label_at_offset(&mut self, offset: CodeOffset, label: MachLabel, kind: I::LabelUse) { + debug!( + "MachBuffer: use_label_at_offset: offset {} label {:?} kind {:?}", + offset, label, kind + ); + debug_assert!(offset + kind.patch_size() <= self.cur_offset()); + + // Add the fixup, and update the worst-case island size based on a + // veneer for this label use. + self.fixup_records.push(MachLabelFixup { + label, + offset, + kind, + }); + if kind.supports_veneer() { + self.island_worst_case_size += kind.veneer_size(); + self.island_worst_case_size &= !(I::LabelUse::ALIGN - 1); + } + let deadline = offset + kind.max_pos_range(); + if deadline < self.island_deadline { + self.island_deadline = deadline; + } + } + + /// Inform the buffer of an unconditional branch at the given offset, + /// targetting the given label. May be used to optimize branches. + /// The last added label-use must correspond to this branch. + pub fn add_uncond_branch(&mut self, start: CodeOffset, end: CodeOffset, target: MachLabel) { + assert!(!self.fixup_records.is_empty()); + let fixup = self.fixup_records.len() - 1; + self.latest_branches.push(MachBranch { + start, + end, + target, + fixup, + inverted: None, + }); + } + + /// Inform the buffer of a conditional branch at the given offset, + /// targetting the given label. May be used to optimize branches. + /// The last added label-use must correspond to this branch. 
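// Editorial sketch (not part of this patch): what "the last added label-use must
// correspond to this branch" means for a backend's emitter. `enc` and `enc_inverted`
// are hypothetical 32-bit encodings of the branch and its condition-inverted form, and
// `kind` stands in for the backend's `LabelUse` flavor for this branch format.
//
//     let start = buf.cur_offset();
//     buf.put4(enc);                                 // emit the branch bytes
//     buf.use_label_at_offset(start, target, kind);  // fixup recorded last...
//     let end = buf.cur_offset();
//     buf.add_cond_branch(start, end, target, &enc_inverted.to_le_bytes());
//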
+ pub fn add_cond_branch( + &mut self, + start: CodeOffset, + end: CodeOffset, + target: MachLabel, + inverted: &[u8], + ) { + assert!(!self.fixup_records.is_empty()); + let fixup = self.fixup_records.len() - 1; + let inverted = Some(SmallVec::from(inverted)); + self.latest_branches.push(MachBranch { + start, + end, + target, + fixup, + inverted, + }); + } + + fn truncate_last_branch(&mut self) { + let b = self.latest_branches.pop().unwrap(); + assert!(b.end == self.cur_offset()); + self.data.truncate(b.start as usize); + self.fixup_records.truncate(b.fixup); + let cur_off = self.cur_offset(); + debug!( + "truncate_last_branch: truncated {:?}; off now {}", + b, cur_off + ); + for &mut (l, ref mut off) in self.labels_by_offset.iter_mut().rev() { + if *off > cur_off { + *off = cur_off; + debug!(" -> label {:?} reassigned to {}", l, cur_off); + self.label_offsets[l.0 as usize] = cur_off; + } else { + break; + } + } + } + + fn optimize_branches(&mut self) { + debug!( + "enter optimize_branches:\n b = {:?}\n l = {:?}\n f = {:?}", + self.latest_branches, self.labels_by_offset, self.fixup_records + ); + while let Some(b) = self.latest_branches.last() { + let cur_off = self.cur_offset(); + debug!("optimize_branches: last branch {:?} at off {}", b, cur_off); + // If there has been any code emission since the end of the last branch or + // label definition, then there's nothing we can edit (because we + // don't move code once placed, only back up and overwrite), so + // clear the records and finish. + if b.end < cur_off { + break; + } + + // If latest is an unconditional branch: + // - For each label at this point, make the label an alias of + // the branch target. We can now assume below that the + // unconditional branch is reachable only via fallthrough, and we + // are free to remove it in an optimization. + // - If there is a prior unconditional branch that ends just before + // this one begins, then we can truncate this branch, because it is + // entirely unreachable (due to above). Trim the end of the + // `labels_by_offset` array and continue around the loop. + // - If there is a prior conditional branch whose target label + // resolves to the current offset (branches around the + // unconditional branch), then remove the unconditional branch, + // and make the target of the unconditional the target of the + // conditional instead. + if b.is_uncond() { + // Set any label equal to current branch's start as an alias of + // the branch's target. + for &(l, off) in self.labels_by_offset.iter().rev() { + debug!(" -> uncond: latest label {:?} at off {}", l, off); + if off > b.start { + continue; + } else if off == b.start { + debug!(" -> setting alias to {:?}", b.target); + self.label_aliases[l.0 as usize] = b.target; + } else { + break; + } + } + + // If the branch target is the next offset, + + // Examine any immediately preceding branch. + if self.latest_branches.len() > 1 { + let prev_b = &self.latest_branches[self.latest_branches.len() - 2]; + debug!(" -> more than one branch; prev_b = {:?}", prev_b); + // This uncond is immediately after another uncond; we've + // already redirected labels to this uncond away; so we can + // truncate this uncond. 
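// Editorial example (not part of this patch): the shape handled by the next check.
//
//     b  L_a     <- prev_b (unconditional)
//     b  L_b     <- b (latest branch): reachable only by falling through prev_b, which
//                   never falls through, so its bytes are dead. Labels bound at b's
//                   start were already aliased to L_b just above, so truncating b
//                   loses nothing.
//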
+ if prev_b.is_uncond() && prev_b.end == b.start { + debug!(" -> uncond follows another uncond; truncating"); + self.truncate_last_branch(); + continue; + } + + // This uncond is immediately after a conditional, and the + // conditional's target is the end of this uncond, and we've + // already redirected labels to this uncond away; so we can + // truncate this uncond, flip the sense of the conditional, and + // set the conditional's target (in `latest_branches` and in + // `fixup_records`) to the uncond's target. + if prev_b.is_cond() + && prev_b.end == b.start + && self.resolve_label_offset(prev_b.target) == cur_off + { + debug!(" -> uncond follows a conditional, and conditional's target resolves to current offset"); + let target = b.target; + let data = prev_b.inverted.clone().unwrap(); + self.truncate_last_branch(); + let prev_b = self.latest_branches.last_mut().unwrap(); + let not_inverted = SmallVec::from( + &self.data[(prev_b.start as usize)..(prev_b.end as usize)], + ); + self.data.truncate(prev_b.start as usize); + self.data.extend_from_slice(&data[..]); + prev_b.inverted = Some(not_inverted); + self.fixup_records[prev_b.fixup].label = target; + debug!(" -> reassigning target of condbr to {:?}", target); + prev_b.target = target; + continue; + } + } + } + + // For any branch, conditional or unconditional: + // - If the target is a label at the current offset, then remove + // the conditional branch, and reset all labels that targetted + // the current offset (end of branch) to the truncated + // end-of-code. + if self.resolve_label_offset(b.target) == cur_off { + debug!("branch with target == cur off; truncating"); + self.truncate_last_branch(); + } + + // If we couldn't do anything with the last branch, then break. + break; + } + + self.purge_latest_branches(); + + debug!( + "leave optimize_branches:\n b = {:?}\n l = {:?}\n f = {:?}", + self.latest_branches, self.labels_by_offset, self.fixup_records + ); + } + + fn purge_latest_branches(&mut self) { + let cur_off = self.cur_offset(); + if let Some(l) = self.latest_branches.last() { + if l.end < cur_off { + debug!("purge_latest_branches: removing branch {:?}", l); + self.latest_branches.clear(); + } + } + } + + /// Emit a constant at some point in the future, binding the given label to + /// its offset. The constant will be placed at most `max_distance` from the + /// current offset. + pub fn defer_constant( + &mut self, + label: MachLabel, + align: CodeOffset, + data: &[u8], + max_distance: CodeOffset, + ) { + let deadline = self.cur_offset() + max_distance; + self.island_worst_case_size += data.len() as CodeOffset; + self.island_worst_case_size &= !(I::LabelUse::ALIGN - 1); + self.pending_constants.push(MachLabelConstant { + label, + align, + data: SmallVec::from(data), + }); + if deadline < self.island_deadline { + self.island_deadline = deadline; + } + } + + /// Is an island needed within the next N bytes? + pub fn island_needed(&self, distance: CodeOffset) -> bool { + let worst_case_end_of_island = self.cur_offset() + distance + self.island_worst_case_size; + worst_case_end_of_island > self.island_deadline + } + + /// Emit all pending constants and veneers. Should only be called if + /// `island_needed()` returns true, i.e., if we actually reach a deadline: + /// otherwise, unnecessary veneers may be inserted. + pub fn emit_island(&mut self) { + // We're going to purge fixups, so no latest-branch editing can happen + // anymore. 
+ self.latest_branches.clear(); + + let pending_constants = mem::replace(&mut self.pending_constants, SmallVec::new()); + for MachLabelConstant { label, align, data } in pending_constants.into_iter() { + self.align_to(align); + self.bind_label(label); + self.put_data(&data[..]); + } + + let fixup_records = mem::replace(&mut self.fixup_records, SmallVec::new()); + let mut new_fixups = SmallVec::new(); + for MachLabelFixup { + label, + offset, + kind, + } in fixup_records.into_iter() + { + debug!( + "emit_island: fixup for label {:?} at offset {} kind {:?}", + label, offset, kind + ); + // We eagerly perform fixups whose label targets are known, if not out + // of range, to avoid unnecessary veneers. + let label_offset = self.resolve_label_offset(label); + let known = label_offset != UNKNOWN_LABEL_OFFSET; + let in_range = if known { + if label_offset >= offset { + (label_offset - offset) <= kind.max_pos_range() + } else { + (offset - label_offset) <= kind.max_neg_range() + } + } else { + false + }; + + debug!( + " -> label_offset = {}, known = {}, in_range = {} (pos {} neg {})", + label_offset, + known, + in_range, + kind.max_pos_range(), + kind.max_neg_range() + ); + + let start = offset as usize; + let end = (offset + kind.patch_size()) as usize; + if in_range { + debug_assert!(known); // implied by in_range. + let slice = &mut self.data[start..end]; + debug!("patching in-range!"); + kind.patch(slice, offset, label_offset); + } else if !known && !kind.supports_veneer() { + // Nothing for now. Keep it for next round. + new_fixups.push(MachLabelFixup { + label, + offset, + kind, + }); + } else if !in_range && kind.supports_veneer() { + // Allocate space for a veneer in the island. + self.align_to(I::LabelUse::ALIGN); + let veneer_offset = self.cur_offset(); + debug!("making a veneer at {}", veneer_offset); + let slice = &mut self.data[start..end]; + // Patch the original label use to refer to teh veneer. + debug!( + "patching original at offset {} to veneer offset {}", + offset, veneer_offset + ); + kind.patch(slice, offset, veneer_offset); + // Generate the veneer. + let veneer_slice = self.get_appended_space(kind.veneer_size() as usize); + let (veneer_fixup_off, veneer_label_use) = + kind.generate_veneer(veneer_slice, veneer_offset); + debug!( + "generated veneer; fixup offset {}, label_use {:?}", + veneer_fixup_off, veneer_label_use + ); + // If the label is known (but was just out of range), do the + // veneer label-use fixup now too; otherwise, save it for later. + if known { + let start = veneer_fixup_off as usize; + let end = (veneer_fixup_off + veneer_label_use.patch_size()) as usize; + let veneer_slice = &mut self.data[start..end]; + debug!("doing veneer fixup right away too"); + veneer_label_use.patch(veneer_slice, veneer_fixup_off, label_offset); + } else { + new_fixups.push(MachLabelFixup { + label, + offset: veneer_fixup_off, + kind: veneer_label_use, + }); + } + } else { + panic!( + "Cannot support label-use {:?} (known = {}, in-range = {})", + kind, known, in_range + ); + } + } + + self.fixup_records = new_fixups; + self.island_deadline = UNKNOWN_LABEL_OFFSET; + } + + /// Finish any deferred emissions and/or fixups. + pub fn finish(mut self) -> MachBufferFinalized { + // Ensure that all labels are defined. This is a full (release-mode) + // assert because we must avoid looping indefinitely below; an + // unresolved label will prevent the fixup_records vec from emptying. 
+ assert!(self + .label_offsets + .iter() + .all(|&off| off != UNKNOWN_LABEL_OFFSET)); + + while !self.pending_constants.is_empty() || !self.fixup_records.is_empty() { + // `emit_island()` will emit any pending veneers and constants, and + // as a side-effect, will also take care of any fixups with resolved + // labels eagerly. + self.emit_island(); + } + + MachBufferFinalized { + data: self.data, + relocs: self.relocs, + traps: self.traps, + call_sites: self.call_sites, + srclocs: self.srclocs, + } + } + + /// Add an external relocation at the current offset. + pub fn add_reloc( + &mut self, + srcloc: SourceLoc, + kind: Reloc, + name: &ExternalName, + addend: Addend, + ) { + let name = name.clone(); + self.relocs.push(MachReloc { + offset: self.data.len() as CodeOffset, + srcloc, + kind, + name, + addend, + }); + } + + /// Add a trap record at the current offset. + pub fn add_trap(&mut self, srcloc: SourceLoc, code: TrapCode) { + self.traps.push(MachTrap { + offset: self.data.len() as CodeOffset, + srcloc, + code, + }); + } + + /// Add a call-site record at the current offset. + pub fn add_call_site(&mut self, srcloc: SourceLoc, opcode: Opcode) { + self.call_sites.push(MachCallSite { + ret_addr: self.data.len() as CodeOffset, + srcloc, + opcode, + }); + } + + /// Set the `SourceLoc` for code from this offset until the offset at the + /// next call to `end_srcloc()`. + pub fn start_srcloc(&mut self, loc: SourceLoc) { + self.cur_srcloc = Some((self.cur_offset(), loc)); + } + + /// Mark the end of the `SourceLoc` segment started at the last + /// `start_srcloc()` call. + pub fn end_srcloc(&mut self) { + let (start, loc) = self + .cur_srcloc + .take() + .expect("end_srcloc() called without start_srcloc()"); + let end = self.cur_offset(); + // Skip zero-length extends. + debug_assert!(end >= start); + if end > start { + self.srclocs.push(MachSrcLoc { start, end, loc }); + } + } +} + +impl MachBufferFinalized { + /// Get a list of source location mapping tuples in sorted-by-start-offset order. + pub fn get_srclocs_sorted(&self) -> &[MachSrcLoc] { + &self.srclocs[..] + } + + /// Get the total required size for the code. + pub fn total_size(&self) -> CodeOffset { + self.data.len() as CodeOffset + } + + /// Emit this buffer to the given CodeSink. + pub fn emit(&self, sink: &mut CS) { + // N.B.: we emit every section into the .text section as far as + // the `CodeSink` is concerned; we do not bother to segregate + // the contents into the actual program text, the jumptable and the + // rodata (constant pool). This allows us to generate code assuming + // that these will not be relocated relative to each other, and avoids + // having to designate each section as belonging in one of the three + // fixed categories defined by `CodeSink`. If this becomes a problem + // later (e.g. because of memory permissions or similar), we can + // add this designation and segregate the output; take care, however, + // to add the appropriate relocations in this case. 
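// Editorial note (not part of this patch): the loop below is a merge of four already
// sorted streams: the raw bytes plus the reloc, trap, and call-site records. The
// record vectors are in offset order by construction, since `add_reloc`, `add_trap`,
// and `add_call_site` always record the current, monotonically growing, buffer offset.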
+ + let mut next_reloc = 0; + let mut next_trap = 0; + let mut next_call_site = 0; + for (idx, byte) in self.data.iter().enumerate() { + if next_reloc < self.relocs.len() { + let reloc = &self.relocs[next_reloc]; + if reloc.offset == idx as CodeOffset { + sink.reloc_external(reloc.srcloc, reloc.kind, &reloc.name, reloc.addend); + next_reloc += 1; + } + } + if next_trap < self.traps.len() { + let trap = &self.traps[next_trap]; + if trap.offset == idx as CodeOffset { + sink.trap(trap.code, trap.srcloc); + next_trap += 1; + } + } + if next_call_site < self.call_sites.len() { + let call_site = &self.call_sites[next_call_site]; + if call_site.ret_addr == idx as CodeOffset { + sink.add_call_site(call_site.opcode, call_site.srcloc); + next_call_site += 1; + } + } + sink.put1(*byte); + } + + sink.begin_jumptables(); + sink.begin_rodata(); + sink.end_codegen(); + } +} + +/// A constant that is deferred to the next constant-pool opportunity. +struct MachLabelConstant { + /// This label will refer to the constant's offset. + label: MachLabel, + /// Required alignment. + align: CodeOffset, + /// This data will be emitted when able. + data: SmallVec<[u8; 16]>, +} + +/// A fixup to perform on the buffer once code is emitted. Fixups always refer +/// to labels and patch the code based on label offsets. Hence, they are like +/// relocations, but internal to one buffer. +#[derive(Debug)] +struct MachLabelFixup { + /// The label whose offset controls this fixup. + label: MachLabel, + /// The offset to fix up / patch to refer to this label. + offset: CodeOffset, + /// The kind of fixup. This is architecture-specific; each architecture may have, + /// e.g., several types of branch instructions, each with differently-sized + /// offset fields and different places within the instruction to place the + /// bits. + kind: I::LabelUse, +} + +/// A relocation resulting from a compilation. +struct MachReloc { + /// The offset at which the relocation applies, *relative to the + /// containing section*. + offset: CodeOffset, + /// The original source location. + srcloc: SourceLoc, + /// The kind of relocation. + kind: Reloc, + /// The external symbol / name to which this relocation refers. + name: ExternalName, + /// The addend to add to the symbol value. + addend: i64, +} + +/// A trap record resulting from a compilation. +struct MachTrap { + /// The offset at which the trap instruction occurs, *relative to the + /// containing section*. + offset: CodeOffset, + /// The original source location. + srcloc: SourceLoc, + /// The trap code. + code: TrapCode, +} + +/// A call site record resulting from a compilation. +struct MachCallSite { + /// The offset of the call's return address, *relative to the containing section*. + ret_addr: CodeOffset, + /// The original source location. + srcloc: SourceLoc, + /// The call's opcode. + opcode: Opcode, +} + +/// A source-location mapping resulting from a compilation. +#[derive(Clone, Debug)] +pub struct MachSrcLoc { + /// The start of the region of code corresponding to a source location. + /// This is relative to the start of the function, not to the start of the + /// section. + pub start: CodeOffset, + /// The end of the region of code corresponding to a source location. + /// This is relative to the start of the section, not to the start of the + /// section. + pub end: CodeOffset, + /// The source location. + pub loc: SourceLoc, +} + +/// Record of branch instruction in the buffer, to facilitate editing. 
+#[derive(Clone, Debug)] +struct MachBranch { + start: CodeOffset, + end: CodeOffset, + target: MachLabel, + fixup: usize, + inverted: Option>, +} + +impl MachBranch { + fn is_cond(&self) -> bool { + self.inverted.is_some() + } + fn is_uncond(&self) -> bool { + self.inverted.is_none() + } +} + +// We use an actual instruction definition to do tests, so we depend on the `arm64` feature here. +#[cfg(all(test, feature = "arm64"))] +mod test { + use super::*; + use crate::isa::aarch64::inst::xreg; + use crate::isa::aarch64::inst::{BranchTarget, CondBrKind, Inst}; + use crate::machinst::MachInstEmit; + use crate::settings; + use std::default::Default; + + fn label(n: u32) -> MachLabel { + MachLabel::from_block(n) + } + fn target(n: u32) -> BranchTarget { + BranchTarget::Label(label(n)) + } + + #[test] + fn test_elide_jump_to_next() { + let flags = settings::Flags::new(settings::builder()); + let mut buf = MachBuffer::new(); + let mut state = Default::default(); + + buf.reserve_labels_for_blocks(2); + buf.bind_label(label(0)); + let inst = Inst::Jump { dest: target(1) }; + inst.emit(&mut buf, &flags, &mut state); + buf.bind_label(label(1)); + let buf = buf.finish(); + assert_eq!(0, buf.total_size()); + } + + #[test] + fn test_elide_trivial_jump_blocks() { + let flags = settings::Flags::new(settings::builder()); + let mut buf = MachBuffer::new(); + let mut state = Default::default(); + + buf.reserve_labels_for_blocks(4); + + buf.bind_label(label(0)); + let inst = Inst::CondBr { + kind: CondBrKind::NotZero(xreg(0)), + taken: target(1), + not_taken: target(2), + }; + inst.emit(&mut buf, &flags, &mut state); + + buf.bind_label(label(1)); + let inst = Inst::Jump { dest: target(3) }; + inst.emit(&mut buf, &flags, &mut state); + + buf.bind_label(label(2)); + let inst = Inst::Jump { dest: target(3) }; + inst.emit(&mut buf, &flags, &mut state); + + buf.bind_label(label(3)); + + let buf = buf.finish(); + assert_eq!(0, buf.total_size()); + } + + #[test] + fn test_flip_cond() { + let flags = settings::Flags::new(settings::builder()); + let mut buf = MachBuffer::new(); + let mut state = Default::default(); + + buf.reserve_labels_for_blocks(4); + + buf.bind_label(label(0)); + let inst = Inst::CondBr { + kind: CondBrKind::NotZero(xreg(0)), + taken: target(1), + not_taken: target(2), + }; + inst.emit(&mut buf, &flags, &mut state); + + buf.bind_label(label(1)); + let inst = Inst::Nop4; + inst.emit(&mut buf, &flags, &mut state); + + buf.bind_label(label(2)); + let inst = Inst::Nop4; + inst.emit(&mut buf, &flags, &mut state); + + buf.bind_label(label(3)); + + let buf = buf.finish(); + + let mut buf2 = MachBuffer::new(); + let mut state = Default::default(); + let inst = Inst::OneWayCondBr { + kind: CondBrKind::Zero(xreg(0)), + target: BranchTarget::ResolvedOffset(8), + }; + inst.emit(&mut buf2, &flags, &mut state); + let inst = Inst::Nop4; + inst.emit(&mut buf2, &flags, &mut state); + inst.emit(&mut buf2, &flags, &mut state); + + let buf2 = buf2.finish(); + + assert_eq!(buf.data, buf2.data); + } + + #[test] + fn test_island() { + let flags = settings::Flags::new(settings::builder()); + let mut buf = MachBuffer::new(); + let mut state = Default::default(); + + buf.reserve_labels_for_blocks(4); + + buf.bind_label(label(0)); + let inst = Inst::CondBr { + kind: CondBrKind::NotZero(xreg(0)), + taken: target(2), + not_taken: target(3), + }; + inst.emit(&mut buf, &flags, &mut state); + + buf.bind_label(label(1)); + while buf.cur_offset() < 2000000 { + if buf.island_needed(0) { + buf.emit_island(); + } + let inst = 
Inst::Nop4; + inst.emit(&mut buf, &flags, &mut state); + } + + buf.bind_label(label(2)); + let inst = Inst::Nop4; + inst.emit(&mut buf, &flags, &mut state); + + buf.bind_label(label(3)); + let inst = Inst::Nop4; + inst.emit(&mut buf, &flags, &mut state); + + let buf = buf.finish(); + + assert_eq!(2000000 + 8, buf.total_size()); + + let mut buf2 = MachBuffer::new(); + let mut state = Default::default(); + let inst = Inst::CondBr { + kind: CondBrKind::NotZero(xreg(0)), + taken: BranchTarget::ResolvedOffset(1048576 - 4), + not_taken: BranchTarget::ResolvedOffset(2000000 + 4 - 4), + }; + inst.emit(&mut buf2, &flags, &mut state); + + let buf2 = buf2.finish(); + + assert_eq!(&buf.data[0..8], &buf2.data[..]); + } + + #[test] + fn test_island_backward() { + let flags = settings::Flags::new(settings::builder()); + let mut buf = MachBuffer::new(); + let mut state = Default::default(); + + buf.reserve_labels_for_blocks(4); + + buf.bind_label(label(0)); + let inst = Inst::Nop4; + inst.emit(&mut buf, &flags, &mut state); + + buf.bind_label(label(1)); + let inst = Inst::Nop4; + inst.emit(&mut buf, &flags, &mut state); + + buf.bind_label(label(2)); + while buf.cur_offset() < 2000000 { + let inst = Inst::Nop4; + inst.emit(&mut buf, &flags, &mut state); + } + + buf.bind_label(label(3)); + let inst = Inst::CondBr { + kind: CondBrKind::NotZero(xreg(0)), + taken: target(0), + not_taken: target(1), + }; + inst.emit(&mut buf, &flags, &mut state); + + let buf = buf.finish(); + + assert_eq!(2000000 + 12, buf.total_size()); + + let mut buf2 = MachBuffer::new(); + let mut state = Default::default(); + let inst = Inst::CondBr { + kind: CondBrKind::NotZero(xreg(0)), + taken: BranchTarget::ResolvedOffset(8), + not_taken: BranchTarget::ResolvedOffset(4 - (2000000 + 4)), + }; + inst.emit(&mut buf2, &flags, &mut state); + let inst = Inst::Jump { + dest: BranchTarget::ResolvedOffset(-(2000000 + 8)), + }; + inst.emit(&mut buf2, &flags, &mut state); + + let buf2 = buf2.finish(); + + assert_eq!(&buf.data[2000000..], &buf2.data[..]); + } +} diff --git a/cranelift/codegen/src/machinst/compile.rs b/cranelift/codegen/src/machinst/compile.rs index 8f81320fd3..508e242cd7 100644 --- a/cranelift/codegen/src/machinst/compile.rs +++ b/cranelift/codegen/src/machinst/compile.rs @@ -18,8 +18,12 @@ pub fn compile( where B::MInst: ShowWithRRU, { - // This lowers the CL IR. - let mut vcode = Lower::new(f, abi)?.lower(b)?; + // Compute lowered block order. + let block_order = BlockLoweringOrder::new(f); + // Build the lowering context. + let lower = Lower::new(f, abi, block_order)?; + // Lower the IR. + let mut vcode = lower.lower(b)?; debug!( "vcode from lowering: \n{}", @@ -65,11 +69,6 @@ where // all at once. This also inserts prologues/epilogues. vcode.replace_insns_from_regalloc(result); - vcode.remove_redundant_branches(); - - // Do final passes over code to finalize branches. - vcode.finalize_branches(); - debug!( "vcode after regalloc: final version:\n{}", vcode.show_rru(Some(b.reg_universe())) diff --git a/cranelift/codegen/src/machinst/lower.rs b/cranelift/codegen/src/machinst/lower.rs index 47384f462e..fcbf3d2810 100644 --- a/cranelift/codegen/src/machinst/lower.rs +++ b/cranelift/codegen/src/machinst/lower.rs @@ -3,54 +3,97 @@ //! machine code, except for register allocation. 
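The island tests in the buffer tests above exercise deadline tracking: every pending label fixup can only reach a bounded distance forward (see the MachInstLabelUse range methods later in this diff), so before any fixup would go out of range the buffer must emit an island holding veneers and any deferred constants. A hedged standalone sketch of the bookkeeping involved; the field names, the 1 MiB figure for a 19-bit word-scaled branch field, and the exact island_needed signature are assumptions for illustration, not the actual MachBuffer internals:

    /// One pending fixup: where the to-be-patched field lives and how far
    /// forward its encoding can reach.
    struct PendingFixup {
        offset: u32,
        max_pos_range: u32, // e.g. roughly 1 MiB for a 19-bit word-scaled branch field
    }

    /// Earliest offset by which every pending fixup must be resolved or veneered.
    fn deadline(fixups: &[PendingFixup]) -> u32 {
        fixups
            .iter()
            .map(|f| f.offset + f.max_pos_range)
            .min()
            .unwrap_or(u32::MAX)
    }

    /// Is an island needed before emitting `distance` more bytes, leaving room
    /// for one worst-case-sized instruction?
    fn island_needed(cur_offset: u32, distance: u32, worst_case: u32, fixups: &[PendingFixup]) -> bool {
        cur_offset + distance + worst_case > deadline(fixups)
    }

    fn main() {
        let fixups = [PendingFixup { offset: 0, max_pos_range: 1 << 20 }];
        assert!(!island_needed(4, 0, 4, &fixups));
        assert!(island_needed((1 << 20) - 4, 0, 8, &fixups));
    }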
use crate::entity::SecondaryMap; -use crate::inst_predicates::has_side_effect; +use crate::fx::{FxHashMap, FxHashSet}; +use crate::inst_predicates::{has_side_effect_or_load, is_constant_64bit}; use crate::ir::instructions::BranchInfo; use crate::ir::{ ArgumentExtension, Block, ExternalName, Function, GlobalValueData, Inst, InstructionData, MemFlags, Opcode, Signature, SourceLoc, Type, Value, ValueDef, }; -use crate::machinst::{ABIBody, BlockIndex, VCode, VCodeBuilder, VCodeInst}; -use crate::{num_uses::NumUses, CodegenResult}; +use crate::machinst::{ + ABIBody, BlockIndex, BlockLoweringOrder, LoweredBlock, MachLabel, VCode, VCodeBuilder, + VCodeInst, +}; +use crate::CodegenResult; -use regalloc::{Reg, RegClass, Set, VirtualReg, Writable}; +use regalloc::{Reg, RegClass, VirtualReg, Writable}; use alloc::boxed::Box; use alloc::vec::Vec; use log::debug; use smallvec::SmallVec; -use std::collections::VecDeque; -/// A context that machine-specific lowering code can use to emit lowered instructions. This is the -/// view of the machine-independent per-function lowering context that is seen by the machine -/// backend. +/// An "instruction color" partitions instructions by side-effecting ops. All +/// instructions with the same "color" are guaranteed not to be separated by any +/// side-effecting op (for this purpose, loads are also considered +/// side-effecting, to avoid subtle questions w.r.t. the memory model), and +/// furthermore, it is guaranteed that for any two instructions A and B such +/// that color(A) == color(B), either A dominates B and B postdominates A, or +/// vice-versa. (For now, in practice, only ops in the same basic block can ever +/// have the same color, trivially providing the second condition.) Intuitively, +/// this means that the ops of the same color must always execute "together", as +/// part of one atomic contiguous section of the dynamic execution trace, and +/// they can be freely permuted without affecting program behavior. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub struct InstColor(u32); +impl InstColor { + fn new(n: u32) -> InstColor { + InstColor(n) + } + + /// Get an arbitrary index representing this color. The index is unique + /// *within a single function compilation*, but indices may be reused across + /// functions. + pub fn get(self) -> u32 { + self.0 + } +} + +/// A context that machine-specific lowering code can use to emit lowered +/// instructions. This is the view of the machine-independent per-function +/// lowering context that is seen by the machine backend. pub trait LowerCtx { /// The instruction type for which this lowering framework is instantiated. - type I; + type I: VCodeInst; + + // Function-level queries: + + /// Get the `ABIBody`. + fn abi(&mut self) -> &dyn ABIBody; + /// Get the (virtual) register that receives the return value. A return + /// instruction should lower into a sequence that fills this register. (Why + /// not allow the backend to specify its own result register for the return? + /// Because there may be multiple return points.) + fn retval(&self, idx: usize) -> Writable; + + // General instruction queries: /// Get the instdata for a given IR instruction. fn data(&self, ir_inst: Inst) -> &InstructionData; /// Get the controlling type for a polymorphic IR instruction. fn ty(&self, ir_inst: Inst) -> Type; - /// Get the `ABIBody`. - fn abi(&mut self) -> &dyn ABIBody; - /// Emit a machine instruction. 
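A standalone illustration of the coloring rule documented on InstColor above (purely illustrative; the real pass is the loop added to Lower::new further down in this file): every instruction receives the current color, and the color is bumped immediately after any side-effecting instruction, so instructions sharing a color are never separated by a side effect and may be merged freely.

    #[derive(Clone, Copy, Debug, PartialEq, Eq)]
    struct Color(u32);

    /// Assign a color to each op; bump the color *after* each side-effecting op.
    fn color_ops(side_effecting: &[bool]) -> Vec<Color> {
        let mut cur = 0;
        let mut out = Vec::with_capacity(side_effecting.len());
        for &se in side_effecting {
            out.push(Color(cur));
            if se {
                cur += 1;
            }
        }
        out
    }

    fn main() {
        // pure, load, pure, pure, store, pure
        let colors = color_ops(&[false, true, false, false, true, false]);
        assert_eq!(
            colors,
            vec![Color(0), Color(0), Color(1), Color(1), Color(1), Color(2)]
        );
        // The two pure ops between the load and the store share color 1, so either
        // can be sunk into a consumer in that region without crossing a side effect.
    }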
- fn emit(&mut self, mach_inst: Self::I); - /// Indicate that an IR instruction has been merged, and so one of its - /// uses is gone (replaced by uses of the instruction's inputs). This - /// helps the lowering algorithm to perform on-the-fly DCE, skipping over - /// unused instructions (such as immediates incorporated directly). - fn merged(&mut self, from_inst: Inst); - /// Get the producing instruction, if any, and output number, for the `idx`th input to the - /// given IR instruction - fn input_inst(&self, ir_inst: Inst, idx: usize) -> Option<(Inst, usize)>; - /// Map a Value to its associated writable (probably virtual) Reg. - fn value_to_writable_reg(&self, val: Value) -> Writable; - /// Map a Value to its associated (probably virtual) Reg. - fn value_to_reg(&self, val: Value) -> Reg; - /// Get the `idx`th input to the given IR instruction as a virtual register. - fn input(&self, ir_inst: Inst, idx: usize) -> Reg; - /// Get the `idx`th output of the given IR instruction as a virtual register. - fn output(&self, ir_inst: Inst, idx: usize) -> Writable; + /// Get the target for a call instruction, as an `ExternalName`. Returns a tuple + /// providing this name and the "relocation distance", i.e., whether the backend + /// can assume the target will be "nearby" (within some small offset) or an + /// arbitrary address. (This comes from the `colocated` bit in the CLIF.) + fn call_target<'b>(&'b self, ir_inst: Inst) -> Option<(&'b ExternalName, RelocDistance)>; + /// Get the signature for a call or call-indirect instruction. + fn call_sig<'b>(&'b self, ir_inst: Inst) -> Option<&'b Signature>; + /// Get the symbol name, relocation distance estimate, and offset for a + /// symbol_value instruction. + fn symbol_value<'b>(&'b self, ir_inst: Inst) -> Option<(&'b ExternalName, RelocDistance, i64)>; + /// Returns the memory flags of a given memory access. + fn memflags(&self, ir_inst: Inst) -> Option; + /// Get the source location for a given instruction. + fn srcloc(&self, ir_inst: Inst) -> SourceLoc; + /// Get the side-effect color of the given instruction (specifically, at the + /// program point just prior to the instruction). The "color" changes at + /// every side-effecting op; the backend should not try to merge across + /// side-effect colors unless the op being merged is known to be pure. + fn inst_color(&self, ir_inst: Inst) -> InstColor; + + // Instruction input/output queries: + /// Get the number of inputs to the given IR instruction. fn num_inputs(&self, ir_inst: Inst) -> usize; /// Get the number of outputs to the given IR instruction. @@ -59,27 +102,60 @@ pub trait LowerCtx { fn input_ty(&self, ir_inst: Inst, idx: usize) -> Type; /// Get the type for an instruction's output. fn output_ty(&self, ir_inst: Inst, idx: usize) -> Type; + /// Get the value of a constant instruction (`iconst`, etc.) as a 64-bit + /// value, if possible. + fn get_constant(&self, ir_inst: Inst) -> Option; + /// Get the input in any combination of three forms: + /// + /// - An instruction, if the same color as this instruction or if the + /// producing instruction has no side effects (thus in both cases + /// mergeable); + /// - A constant, if the value is a constant; + /// - A register. + /// + /// The instruction input may be available in some or all of these + /// forms. More than one is possible: e.g., it may be produced by an + /// instruction in the same block, but may also have been forced into a + /// register already by an earlier op. It will *always* be available + /// in a register, at least. 
+ /// + /// If the backend uses the register, rather than one of the other + /// forms (constant or merging of the producing op), it must call + /// `use_input_reg()` to ensure the producing inst is actually lowered + /// as well. + fn get_input(&self, ir_inst: Inst, idx: usize) -> LowerInput; + /// Get the `idx`th output register of the given IR instruction. When + /// `backend.lower_inst_to_regs(ctx, inst)` is called, it is expected that + /// the backend will write results to these output register(s). + fn get_output(&mut self, ir_inst: Inst, idx: usize) -> Writable; + + // Codegen primitives: allocate temps, emit instructions, set result registers, + // ask for an input to be gen'd into a register. + /// Get a new temp. fn tmp(&mut self, rc: RegClass, ty: Type) -> Writable; - /// Get the number of block params. - fn num_bb_params(&self, bb: Block) -> usize; - /// Get the register for a block param. - fn bb_param(&self, bb: Block, idx: usize) -> Reg; - /// Get the register for a return value. - fn retval(&self, idx: usize) -> Writable; - /// Get the target for a call instruction, as an `ExternalName`. Returns a tuple - /// providing this name and the "relocation distance", i.e., whether the backend - /// can assume the target will be "nearby" (within some small offset) or an - /// arbitrary address. (This comes from the `colocated` bit in the CLIF.) - fn call_target<'b>(&'b self, ir_inst: Inst) -> Option<(&'b ExternalName, RelocDistance)>; - /// Get the signature for a call or call-indirect instruction. - fn call_sig<'b>(&'b self, ir_inst: Inst) -> Option<&'b Signature>; - /// Get the symbol name, relocation distance estimate, and offset for a symbol_value instruction. - fn symbol_value<'b>(&'b self, ir_inst: Inst) -> Option<(&'b ExternalName, RelocDistance, i64)>; - /// Returns the memory flags of a given memory access. - fn memflags(&self, ir_inst: Inst) -> Option; - /// Get the source location for a given instruction. - fn srcloc(&self, ir_inst: Inst) -> SourceLoc; + /// Emit a machine instruction. + fn emit(&mut self, mach_inst: Self::I); + /// Indicate that the given input uses the register returned by + /// `get_input()`. Codegen may not happen otherwise for the producing + /// instruction if it has no side effects and no uses. + fn use_input_reg(&mut self, input: LowerInput); +} + +/// A representation of all of the ways in which an instruction input is +/// available: as a producing instruction (in the same color-partition), as a +/// constant, and/or in an existing register. See [LowerCtx::get_input] for more +/// details. +#[derive(Clone, Copy, Debug)] +pub struct LowerInput { + /// The value is live in a register. This option is always available. Call + /// [LowerCtx::use_input_reg()] if the register is used. + pub reg: Reg, + /// An instruction produces this value; the instruction's result index that + /// produces this value is given. + pub inst: Option<(Inst, usize)>, + /// The value is a known constant. + pub constant: Option, } /// A machine backend. @@ -87,20 +163,31 @@ pub trait LowerBackend { /// The machine instruction type. type MInst: VCodeInst; - /// Lower a single instruction. Instructions are lowered in reverse order. - /// This function need not handle branches; those are always passed to - /// `lower_branch_group` below. - fn lower>(&self, ctx: &mut C, inst: Inst); + /// Lower a single instruction. + /// + /// For a branch, this function should not generate the actual branch + /// instruction. 
However, it must force any values it needs for the branch + /// edge (block-param actuals) into registers, because the actual branch + /// generation (`lower_branch_group()`) happens *after* any possible merged + /// out-edge. + fn lower>(&self, ctx: &mut C, inst: Inst) -> CodegenResult<()>; - /// Lower a block-terminating group of branches (which together can be seen as one - /// N-way branch), given a vcode BlockIndex for each target. + /// Lower a block-terminating group of branches (which together can be seen + /// as one N-way branch), given a vcode MachLabel for each target. fn lower_branch_group>( &self, ctx: &mut C, insts: &[Inst], - targets: &[BlockIndex], - fallthrough: Option, - ); + targets: &[MachLabel], + fallthrough: Option, + ) -> CodegenResult<()>; + + /// A bit of a hack: give a fixed register that always holds the result of a + /// `get_pinned_reg` instruction, if known. This allows elision of moves + /// into the associated vreg, instead using the real reg directly. + fn maybe_pinned_reg(&self) -> Option { + None + } } /// Machine-independent lowering driver / machine-instruction container. Maintains a correspondence @@ -112,17 +199,42 @@ pub struct Lower<'func, I: VCodeInst> { /// Lowered machine instructions. vcode: VCodeBuilder, - /// Number of active uses (minus `dec_use()` calls by backend) of each instruction. - num_uses: SecondaryMap, - /// Mapping from `Value` (SSA value in IR) to virtual register. value_regs: SecondaryMap, /// Return-value vregs. retval_regs: Vec<(Reg, ArgumentExtension)>, + /// Instruction colors. + inst_colors: SecondaryMap, + + /// Instruction constant values, if known. + inst_constants: FxHashMap, + + /// Instruction has a side-effect and must be codegen'd. + inst_needed: SecondaryMap, + + /// Value (vreg) is needed and producer must be codegen'd. + vreg_needed: Vec, + /// Next virtual register number to allocate. next_vreg: u32, + + /// Insts in reverse block order, before final copy to vcode. + block_insts: Vec<(SourceLoc, I)>, + + /// Ranges in `block_insts` constituting BBs. + block_ranges: Vec<(usize, usize)>, + + /// Instructions collected for the BB in progress, in reverse order, with + /// source-locs attached. + bb_insts: Vec<(SourceLoc, I)>, + + /// Instructions collected for the CLIF inst in progress, in forward order. + ir_insts: Vec, + + /// The register to use for GetPinnedReg, if any, on this architecture. + pinned_reg: Option, } /// Notion of "relocation distance". This gives an estimate of how far away a symbol will be from a @@ -143,7 +255,7 @@ fn alloc_vreg( value: Value, next_vreg: &mut u32, ) -> VirtualReg { - if value_regs[value].get_index() == 0 { + if value_regs[value].is_invalid() { // default value in map. let v = *next_vreg; *next_vreg += 1; @@ -159,41 +271,35 @@ enum GenerateReturn { impl<'func, I: VCodeInst> Lower<'func, I> { /// Prepare a new lowering context for the given IR function. - pub fn new(f: &'func Function, abi: Box>) -> CodegenResult> { - let mut vcode = VCodeBuilder::new(abi); + pub fn new( + f: &'func Function, + abi: Box>, + block_order: BlockLoweringOrder, + ) -> CodegenResult> { + let mut vcode = VCodeBuilder::new(abi, block_order); - let num_uses = NumUses::compute(f).take_uses(); + let mut next_vreg: u32 = 0; - let mut next_vreg: u32 = 1; + let mut value_regs = SecondaryMap::with_default(Reg::invalid()); - // Default register should never be seen, but the `value_regs` map needs a default and we - // don't want to push `Option` everywhere. 
All values will be assigned registers by the - // loops over block parameters and instruction results below. - // - // We do not use vreg 0 so that we can detect any unassigned register that leaks through. - let default_register = Reg::new_virtual(RegClass::I32, 0); - let mut value_regs = SecondaryMap::with_default(default_register); - - // Assign a vreg to each value. + // Assign a vreg to each block param and each inst result. for bb in f.layout.blocks() { - for param in f.dfg.block_params(bb) { - let vreg = alloc_vreg( - &mut value_regs, - I::rc_for_type(f.dfg.value_type(*param))?, - *param, - &mut next_vreg, - ); - vcode.set_vreg_type(vreg, f.dfg.value_type(*param)); + for ¶m in f.dfg.block_params(bb) { + let ty = f.dfg.value_type(param); + let vreg = alloc_vreg(&mut value_regs, I::rc_for_type(ty)?, param, &mut next_vreg); + vcode.set_vreg_type(vreg, ty); + debug!("bb {} param {}: vreg {:?}", bb, param, vreg); } for inst in f.layout.block_insts(bb) { - for result in f.dfg.inst_results(inst) { - let vreg = alloc_vreg( - &mut value_regs, - I::rc_for_type(f.dfg.value_type(*result))?, - *result, - &mut next_vreg, + for &result in f.dfg.inst_results(inst) { + let ty = f.dfg.value_type(result); + let vreg = + alloc_vreg(&mut value_regs, I::rc_for_type(ty)?, result, &mut next_vreg); + vcode.set_vreg_type(vreg, ty); + debug!( + "bb {} inst {} ({:?}): result vreg {:?}", + bb, inst, f.dfg[inst], vreg ); - vcode.set_vreg_type(vreg, f.dfg.value_type(*result)); } } } @@ -209,13 +315,51 @@ impl<'func, I: VCodeInst> Lower<'func, I> { vcode.set_vreg_type(vreg.as_virtual_reg().unwrap(), ret.value_type); } + // Compute instruction colors, find constant instructions, and find instructions with + // side-effects, in one combined pass. + let mut cur_color = 0; + let mut inst_colors = SecondaryMap::with_default(InstColor::new(0)); + let mut inst_constants = FxHashMap::default(); + let mut inst_needed = SecondaryMap::with_default(false); + for bb in f.layout.blocks() { + cur_color += 1; + for inst in f.layout.block_insts(bb) { + let side_effect = has_side_effect_or_load(f, inst); + + // Assign colors. A new color is chosen *after* any side-effecting instruction. + inst_colors[inst] = InstColor::new(cur_color); + debug!("bb {} inst {} has color {}", bb, inst, cur_color); + if side_effect { + debug!(" -> side-effecting"); + inst_needed[inst] = true; + cur_color += 1; + } + + // Determine if this is a constant; if so, add to the table. 
+ if let Some(c) = is_constant_64bit(f, inst) { + debug!(" -> constant: {}", c); + inst_constants.insert(inst, c); + } + } + } + + let vreg_needed = std::iter::repeat(false).take(next_vreg as usize).collect(); + Ok(Lower { f, vcode, - num_uses, value_regs, retval_regs, + inst_colors, + inst_constants, + inst_needed, + vreg_needed, next_vreg, + block_insts: vec![], + block_ranges: vec![], + bb_insts: vec![], + ir_insts: vec![], + pinned_reg: None, }) } @@ -229,452 +373,427 @@ impl<'func, I: VCodeInst> Lower<'func, I> { for (i, param) in self.f.dfg.block_params(entry_bb).iter().enumerate() { let reg = Writable::from_reg(self.value_regs[*param]); let insn = self.vcode.abi().gen_copy_arg_to_reg(i, reg); - self.vcode.push(insn); + self.emit(insn); } } } fn gen_retval_setup(&mut self, gen_ret_inst: GenerateReturn) { - for (i, (reg, ext)) in self.retval_regs.iter().enumerate() { - let reg = Writable::from_reg(*reg); - let insns = self.vcode.abi().gen_copy_reg_to_retval(i, reg, *ext); + let retval_regs = self.retval_regs.clone(); + for (i, (reg, ext)) in retval_regs.into_iter().enumerate() { + let reg = Writable::from_reg(reg); + let insns = self.vcode.abi().gen_copy_reg_to_retval(i, reg, ext); for insn in insns { - self.vcode.push(insn); + self.emit(insn); } } let inst = match gen_ret_inst { GenerateReturn::Yes => self.vcode.abi().gen_ret(), GenerateReturn::No => self.vcode.abi().gen_epilogue_placeholder(), }; - self.vcode.push(inst); + self.emit(inst); } - fn find_reachable_bbs(&self) -> SmallVec<[Block; 16]> { - if let Some(entry) = self.f.layout.entry_block() { - let mut ret = SmallVec::new(); - let mut queue = VecDeque::new(); - let mut visited = SecondaryMap::with_default(false); - queue.push_back(entry); - visited[entry] = true; - while !queue.is_empty() { - let b = queue.pop_front().unwrap(); - ret.push(b); - let mut succs: SmallVec<[Block; 16]> = SmallVec::new(); - for inst in self.f.layout.block_likely_branches(b) { - if self.f.dfg[inst].opcode().is_branch() { - visit_branch_targets(self.f, b, inst, |succ| { - succs.push(succ); - }); - } + fn lower_edge(&mut self, pred: Block, inst: Inst, succ: Block) -> CodegenResult<()> { + debug!("lower_edge: pred {} succ {}", pred, succ); + + let mut src_regs: SmallVec<[Option; 16]> = SmallVec::new(); + let mut src_consts: SmallVec<[Option; 16]> = SmallVec::new(); + let mut dst_regs: SmallVec<[Writable; 16]> = SmallVec::new(); + + fn overlap(a: &[Option], b: &[Writable]) -> bool { + let mut set = FxHashSet::default(); + for &maybe_reg in a { + if let Some(r) = maybe_reg { + set.insert(r); } - for succ in succs.into_iter() { - if !visited[succ] { - queue.push_back(succ); - visited[succ] = true; + } + for ® in b { + if set.contains(®.to_reg()) { + return true; + } + } + false + } + + // Create a temporary for each block parameter. + let phi_classes: SmallVec<[Type; 16]> = self + .f + .dfg + .block_params(succ) + .iter() + .map(|p| self.f.dfg.value_type(*p)) + .collect(); + + // Create all of the phi uses (reads) from jump args to temps. 
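The surrounding lower_edge body implements a parallel copy for block-parameter moves: conceptually all jump-argument sources are read at once, so if any source register is also a destination, copying in place could clobber a source before it is read. The code checks for overlap and, only when needed, routes every value through a temporary. A standalone sketch of that decision, with plain names standing in for registers and the constant case omitted (illustrative only):

    use std::collections::{HashMap, HashSet};

    /// Does any source register also appear as a destination?
    fn overlap(srcs: &[&str], dsts: &[&str]) -> bool {
        let set: HashSet<&str> = srcs.iter().copied().collect();
        dsts.iter().any(|d| set.contains(d))
    }

    /// Parallel copy dst[i] <- src[i]: copy directly when safe, otherwise read
    /// every source into a temporary first (the same two-pass strategy as here).
    fn parallel_copy(regs: &mut HashMap<String, i64>, srcs: &[&str], dsts: &[&str]) {
        if !overlap(srcs, dsts) {
            for (s, d) in srcs.iter().zip(dsts) {
                let v = regs[*s];
                regs.insert(d.to_string(), v);
            }
        } else {
            let tmps: Vec<i64> = srcs.iter().map(|s| regs[*s]).collect();
            for (v, d) in tmps.into_iter().zip(dsts) {
                regs.insert(d.to_string(), v);
            }
        }
    }

    fn main() {
        let mut regs: HashMap<String, i64> =
            [("v0", 1), ("v1", 2)].iter().map(|&(k, v)| (k.to_string(), v)).collect();
        // A swap-like edge (v0, v1) -> (v1, v0): a direct copy would clobber v1.
        parallel_copy(&mut regs, &["v0", "v1"], &["v1", "v0"]);
        assert_eq!((regs["v0"], regs["v1"]), (2, 1));
    }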
+ // Round up all the source and destination regs + for (i, arg) in self.f.dfg.inst_variable_args(inst).iter().enumerate() { + let arg = self.f.dfg.resolve_aliases(*arg); + let input = self.get_input_for_val(inst, arg); + debug!("jump arg {} is {}, reg {:?}", i, arg, input.reg); + if let Some(c) = input.constant { + src_consts.push(Some(c)); + src_regs.push(None); + } else { + self.use_input_reg(input); + src_regs.push(Some(input.reg)); + src_consts.push(None); + } + } + for (i, param) in self.f.dfg.block_params(succ).iter().enumerate() { + debug!("bb arg {} is {}", i, param); + dst_regs.push(Writable::from_reg(self.value_regs[*param])); + } + debug_assert!(src_regs.len() == dst_regs.len()); + debug_assert!(src_consts.len() == dst_regs.len()); + debug_assert!(phi_classes.len() == dst_regs.len()); + debug!( + "src_regs = {:?} src_consts = {:?} dst_regs = {:?}", + src_regs, src_consts, dst_regs + ); + + // If, as is mostly the case, the source and destination register + // sets are non overlapping, then we can copy directly, so as to + // save the register allocator work. + if !overlap(&src_regs[..], &dst_regs[..]) { + for i in 0..dst_regs.len() { + let src_reg = src_regs[i]; + let src_const = src_consts[i]; + let dst_reg = dst_regs[i]; + let ty = phi_classes[i]; + if let Some(r) = src_reg { + self.emit(I::gen_move(dst_reg, r, ty)); + } else { + // Generate constants fresh in phi-edge to avoid long + // live-ranges. Note that these are also excluded from the + // overlap check, which increases the chance that we don't + // have to do a two-stage copy. + for inst in I::gen_constant(dst_reg, src_const.unwrap(), ty).into_iter() { + self.emit(inst); } } } - - ret } else { - SmallVec::new() + // There's some overlap, so play safe and copy via temps. + let mut tmp_regs: SmallVec<[Writable; 16]> = SmallVec::new(); + for &ty in &phi_classes { + tmp_regs.push(self.tmp(I::rc_for_type(ty)?, ty)); + } + + debug!("phi_temps = {:?}", tmp_regs); + debug_assert!(tmp_regs.len() == src_regs.len()); + + for i in 0..dst_regs.len() { + let src_reg = src_regs[i]; + let tmp_reg = tmp_regs[i]; + let ty = phi_classes[i]; + let src_const = src_consts[i]; + if let Some(src_reg) = src_reg { + self.emit(I::gen_move(tmp_reg, src_reg, ty)); + } else { + for inst in I::gen_constant(tmp_reg, src_const.unwrap(), ty).into_iter() { + self.emit(inst); + } + } + } + for i in 0..dst_regs.len() { + let tmp_reg = tmp_regs[i].to_reg(); + let dst_reg = dst_regs[i]; + let ty = phi_classes[i]; + self.emit(I::gen_move(dst_reg, tmp_reg, ty)); + } + } + Ok(()) + } + + fn lower_clif_block>( + &mut self, + backend: &B, + block: Block, + ) -> CodegenResult<()> { + // Lowering loop: + // - For each non-branch instruction, in reverse order: + // - If side-effecting (load, store, branch/call/return, possible trap), or if + // used outside of this block, or if demanded by another inst, then lower. + // + // That's it! Lowering of side-effecting ops will force all *needed* + // (live) non-side-effecting ops to be lowered at the right places, via + // the `use_input_reg()` callback on the `LowerCtx` (that's us). That's + // because `use_input_reg()` sets the eager/demand bit for any insts + // whose result registers are used. + // + // We build up the BB in reverse instruction order in `bb_insts`. + // Because the machine backend calls `ctx.emit()` in forward order, we + // collect per-IR-inst lowered instructions in `ir_insts`, then reverse + // these and append to `bb_insts` as we go backward through the block. 
+ // `bb_insts` are then reversed again and appended to the VCode at the + // end of the BB (in the toplevel driver `lower()`). + for inst in self.f.layout.block_insts(block).rev() { + let data = &self.f.dfg[inst]; + let value_needed = self + .f + .dfg + .inst_results(inst) + .iter() + .any(|&result| self.vreg_needed[self.value_regs[result].get_index()]); + debug!( + "lower_clif_block: block {} inst {} ({:?}) is_branch {} inst_needed {} value_needed {}", + block, + inst, + data, + data.opcode().is_branch(), + self.inst_needed[inst], + value_needed, + ); + if self.f.dfg[inst].opcode().is_branch() { + continue; + } + // Normal instruction: codegen if eager bit is set. (Other instructions may also be + // codegened if not eager when they are used by another instruction.) + if self.inst_needed[inst] || value_needed { + debug!("lowering: inst {}: {:?}", inst, self.f.dfg[inst]); + backend.lower(self, inst)?; + } + if data.opcode().is_return() { + // Return: handle specially, using ABI-appropriate sequence. + let gen_ret = if data.opcode() == Opcode::Return { + GenerateReturn::Yes + } else { + debug_assert!(data.opcode() == Opcode::FallthroughReturn); + GenerateReturn::No + }; + self.gen_retval_setup(gen_ret); + } + + let loc = self.srcloc(inst); + self.finish_ir_inst(loc); + } + Ok(()) + } + + fn finish_ir_inst(&mut self, loc: SourceLoc) { + for inst in self.ir_insts.drain(..).rev() { + self.bb_insts.push((loc, inst)); + } + } + + fn finish_bb(&mut self) { + let start = self.block_insts.len(); + for pair in self.bb_insts.drain(..).rev() { + self.block_insts.push(pair); + } + let end = self.block_insts.len(); + self.block_ranges.push((start, end)); + } + + fn copy_bbs_to_vcode(&mut self) { + for &(start, end) in self.block_ranges.iter().rev() { + for &(loc, ref inst) in &self.block_insts[start..end] { + self.vcode.set_srcloc(loc); + self.vcode.push(inst.clone()); + } + self.vcode.end_bb(); + } + } + + fn lower_clif_branches>( + &mut self, + backend: &B, + block: Block, + branches: &SmallVec<[Inst; 2]>, + targets: &SmallVec<[MachLabel; 2]>, + maybe_fallthrough: Option, + ) -> CodegenResult<()> { + debug!( + "lower_clif_branches: block {} branches {:?} targets {:?} maybe_fallthrough {:?}", + block, branches, targets, maybe_fallthrough + ); + backend.lower_branch_group(self, branches, targets, maybe_fallthrough)?; + let loc = self.srcloc(branches[0]); + self.finish_ir_inst(loc); + Ok(()) + } + + fn collect_branches_and_targets( + &self, + bindex: BlockIndex, + _bb: Block, + branches: &mut SmallVec<[Inst; 2]>, + targets: &mut SmallVec<[MachLabel; 2]>, + ) { + branches.clear(); + targets.clear(); + let mut last_inst = None; + for &(inst, succ) in self.vcode.block_order().succ_indices(bindex) { + // Avoid duplicates: this ensures a br_table is only inserted once. + if last_inst != Some(inst) { + branches.push(inst); + } else { + debug_assert!(self.f.dfg[inst].opcode() == Opcode::BrTable); + debug_assert!(branches.len() == 1); + } + last_inst = Some(inst); + targets.push(MachLabel::from_block(succ)); } } /// Lower the function. pub fn lower>(mut self, backend: &B) -> CodegenResult> { - // Find all reachable blocks. - let bbs = self.find_reachable_bbs(); - - // This records a Block-to-BlockIndex map so that branch targets can be resolved. - let mut next_bindex = self.vcode.init_bb_map(&bbs[..]); - - // Allocate a separate BlockIndex for each control-flow instruction so that we can create - // the edge blocks later. 
Each entry for a control-flow inst is the edge block; the list - // has (control flow inst, edge block, orig block) tuples. - // - // In general, a given inst may have only one target, except for jump tables which have - // more. But SmallVec may store inline more than the spec'd number, so ask for slightly - // more. - let mut edge_blocks_by_inst: SecondaryMap> = - SecondaryMap::with_default(SmallVec::new()); - - // Each basic block may at most have two edge blocks, since it may have a most two branch - // instructions. If we omit jump tables, we can model that 50% of branches are direct jumps - // (1 successor), and 50% are tests (2 successors). A distribution of edge_blocks per block - // matches this rough estimate that there are 1.5 edge block per block. - let mut edge_blocks: Vec<(Inst, BlockIndex, Block)> = Vec::with_capacity(bbs.len() * 3 / 2); - debug!("about to lower function: {:?}", self.f); - debug!("bb map: {:?}", self.vcode.blocks_by_bb()); - // Work backward (reverse block order, reverse through each block), skipping insns with zero - // uses. - for bb in bbs.iter().rev() { - for inst in self.f.layout.block_likely_branches(*bb) { - if self.f.dfg[inst].opcode().is_branch() { - // Find the original target. - let mut add_succ = |next_bb| { - let edge_block = next_bindex; - next_bindex += 1; - edge_blocks_by_inst[inst].push(edge_block); - edge_blocks.push((inst, edge_block, next_bb)); - }; - visit_branch_targets(self.f, *bb, inst, |succ| { - add_succ(succ); - }); - } - } - } + // Get the pinned reg here (we only parameterize this function on `B`, + // not the whole `Lower` impl). + self.pinned_reg = backend.maybe_pinned_reg(); - // Temporary vectors whose memory is reused in the loop below. + self.vcode.set_entry(0); + + // Reused vectors for branch lowering. let mut branches: SmallVec<[Inst; 2]> = SmallVec::new(); - let mut targets: SmallVec<[BlockIndex; 2]> = SmallVec::new(); + let mut targets: SmallVec<[MachLabel; 2]> = SmallVec::new(); - for bb in bbs.iter() { - debug!("lowering bb: {}", bb); + // get a copy of the lowered order; we hold this separately because we + // need a mut ref to the vcode to mutate it below. + let lowered_order: SmallVec<[LoweredBlock; 64]> = self + .vcode + .block_order() + .lowered_order() + .iter() + .cloned() + .collect(); - // If this is a return block, produce the return value setup. N.B.: this comes - // *before* the below because it must occur *after* any other instructions, and - // instructions are lowered in reverse order. - let last_insn = self.f.layout.block_insts(*bb).last().unwrap(); - let last_insn_opcode = self.f.dfg[last_insn].opcode(); - if last_insn_opcode.is_return() { - let gen_ret = if last_insn_opcode == Opcode::Return { - GenerateReturn::Yes - } else { - debug_assert!(last_insn_opcode == Opcode::FallthroughReturn); - self.vcode.set_fallthrough_return_block(*bb); - GenerateReturn::No - }; - self.gen_retval_setup(gen_ret); - self.vcode.end_ir_inst(); - } + // Main lowering loop over lowered blocks. + for (bindex, lb) in lowered_order.iter().enumerate().rev() { + let bindex = bindex as BlockIndex; - // Find the branches at the end first, and process those, if any. - for inst in self.f.layout.block_insts(*bb).rev() { - debug!("lower: inst {}", inst); - if edge_blocks_by_inst[inst].len() > 0 { - branches.push(inst); - for target in edge_blocks_by_inst[inst].iter().rev().cloned() { - targets.push(target); - } - } else { - // We've reached the end of the branches -- process all as a group, first. 
- if branches.len() > 0 { - let fallthrough = self.f.layout.next_block(*bb); - let fallthrough = fallthrough.map(|bb| self.vcode.bb_to_bindex(bb)); - branches.reverse(); - targets.reverse(); - debug!( - "lower_branch_group: targets = {:?} branches = {:?}", - targets, branches - ); - self.vcode.set_srcloc(self.srcloc(branches[0])); - backend.lower_branch_group( - &mut self, - &branches[..], - &targets[..], - fallthrough, - ); - self.vcode.end_ir_inst(); - branches.clear(); - targets.clear(); - } + // Lower the block body in reverse order (see comment in + // `lower_clif_block()` for rationale). - // Only codegen an instruction if it either has a side effect, or has at least - // one use of one of its results. - if self.num_uses[inst] > 0 || has_side_effect(self.f, inst) { - self.vcode.set_srcloc(self.srcloc(inst)); - backend.lower(&mut self, inst); - self.vcode.end_ir_inst(); + // End branches. + if let Some(bb) = lb.orig_block() { + self.collect_branches_and_targets(bindex, bb, &mut branches, &mut targets); + if branches.len() > 0 { + let maybe_fallthrough = if (bindex + 1) < (lowered_order.len() as BlockIndex) { + Some(MachLabel::from_block(bindex + 1)) } else { - // If we're skipping the instruction, we need to dec-ref its arguments. - for arg in self.f.dfg.inst_args(inst) { - let val = self.f.dfg.resolve_aliases(*arg); - match self.f.dfg.value_def(val) { - ValueDef::Result(src_inst, _) => { - self.dec_use(src_inst); - } - _ => {} - } - } - } - } - } - - // There are possibly some branches left if the block contained only branches. - if branches.len() > 0 { - let fallthrough = self.f.layout.next_block(*bb); - let fallthrough = fallthrough.map(|bb| self.vcode.bb_to_bindex(bb)); - branches.reverse(); - targets.reverse(); - debug!( - "lower_branch_group: targets = {:?} branches = {:?}", - targets, branches - ); - self.vcode.set_srcloc(self.srcloc(branches[0])); - backend.lower_branch_group(&mut self, &branches[..], &targets[..], fallthrough); - self.vcode.end_ir_inst(); - branches.clear(); - targets.clear(); - } - - // If this is the entry block, produce the argument setup. - if Some(*bb) == self.f.layout.entry_block() { - self.gen_arg_setup(); - self.vcode.end_ir_inst(); - } - - let vcode_bb = self.vcode.end_bb(); - debug!("finished building bb: BlockIndex {}", vcode_bb); - debug!("bb_to_bindex map says: {}", self.vcode.bb_to_bindex(*bb)); - assert!(vcode_bb == self.vcode.bb_to_bindex(*bb)); - if Some(*bb) == self.f.layout.entry_block() { - self.vcode.set_entry(vcode_bb); - } - } - - // Temporary vectors whose memory is reused in the loop below. - // TODO accomodate changes in regalloc.rs to use small vecs here? - let mut src_regs = Vec::new(); - let mut dst_regs = Vec::new(); - - // Now create the edge blocks, with phi lowering (block parameter copies). - for (inst, edge_block, orig_block) in edge_blocks.into_iter() { - debug!( - "creating edge block: inst {}, edge_block {}, orig_block {}", - inst, edge_block, orig_block - ); - - // Create a temporary for each block parameter. - let phi_classes: Vec = self - .f - .dfg - .block_params(orig_block) - .iter() - .map(|p| self.f.dfg.value_type(*p)) - .collect(); - - // Create all of the phi uses (reads) from jump args to temps. 
- // Round up all the source and destination regs - src_regs.clear(); - dst_regs.clear(); - for (i, arg) in self.f.dfg.inst_variable_args(inst).iter().enumerate() { - let arg = self.f.dfg.resolve_aliases(*arg); - debug!("jump arg {} is {}", i, arg); - src_regs.push(self.value_regs[arg]); - } - for (i, param) in self.f.dfg.block_params(orig_block).iter().enumerate() { - debug!("bb arg {} is {}", i, param); - dst_regs.push(Writable::from_reg(self.value_regs[*param])); - } - debug_assert!(src_regs.len() == dst_regs.len()); - debug_assert!(phi_classes.len() == dst_regs.len()); - - // If, as is mostly the case, the source and destination register - // sets are non overlapping, then we can copy directly, so as to - // save the register allocator work. - if !Set::::from_vec(src_regs.clone()).intersects(&Set::::from_vec( - dst_regs.iter().map(|r| r.to_reg()).collect(), - )) { - for (dst_reg, (src_reg, ty)) in - dst_regs.iter().zip(src_regs.iter().zip(phi_classes)) - { - self.vcode.push(I::gen_move(*dst_reg, *src_reg, ty)); + None + }; + self.lower_clif_branches(backend, bb, &branches, &targets, maybe_fallthrough)?; + self.finish_ir_inst(self.srcloc(branches[0])); } } else { - // There's some overlap, so play safe and copy via temps. - let mut tmp_regs = Vec::with_capacity(phi_classes.len()); - for &ty in &phi_classes { - tmp_regs.push(self.tmp(I::rc_for_type(ty)?, ty)); - } - - debug!("phi_temps = {:?}", tmp_regs); - debug_assert!(tmp_regs.len() == src_regs.len()); - - for (tmp_reg, (src_reg, &ty)) in - tmp_regs.iter().zip(src_regs.iter().zip(phi_classes.iter())) - { - self.vcode.push(I::gen_move(*tmp_reg, *src_reg, ty)); - } - for (dst_reg, (tmp_reg, &ty)) in - dst_regs.iter().zip(tmp_regs.iter().zip(phi_classes.iter())) - { - self.vcode.push(I::gen_move(*dst_reg, tmp_reg.to_reg(), ty)); - } + // If no orig block, this must be a pure edge block; get the successor and + // emit a jump. + let (_, succ) = self.vcode.block_order().succ_indices(bindex)[0]; + self.emit(I::gen_jump(MachLabel::from_block(succ))); + self.finish_ir_inst(SourceLoc::default()); } - // Create the unconditional jump to the original target block. - self.vcode - .push(I::gen_jump(self.vcode.bb_to_bindex(orig_block))); + // Out-edge phi moves. + if let Some((pred, inst, succ)) = lb.out_edge() { + self.lower_edge(pred, inst, succ)?; + self.finish_ir_inst(SourceLoc::default()); + } + // Original block body. + if let Some(bb) = lb.orig_block() { + self.lower_clif_block(backend, bb)?; + } + // In-edge phi moves. + if let Some((pred, inst, succ)) = lb.in_edge() { + self.lower_edge(pred, inst, succ)?; + self.finish_ir_inst(SourceLoc::default()); + } - // End the IR inst and block. (We lower this as if it were one IR instruction so that - // we can emit machine instructions in forward order.) - self.vcode.end_ir_inst(); - let blocknum = self.vcode.end_bb(); - assert!(blocknum == edge_block); + if bindex == 0 { + // Set up the function with arg vreg inits. + self.gen_arg_setup(); + self.finish_ir_inst(SourceLoc::default()); + } + + self.finish_bb(); } + self.copy_bbs_to_vcode(); + // Now that we've emitted all instructions into the VCodeBuilder, let's build the VCode. - Ok(self.vcode.build()) + let vcode = self.vcode.build(); + debug!("built vcode: {:?}", vcode); + + Ok(vcode) } - /// Reduce the use-count of an IR instruction. Use this when, e.g., isel incorporates the - /// computation of an input instruction directly, so that input instruction has one - /// fewer use. 
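The removed dec_use/inc_use reference counting below is superseded by the demand bits added above (inst_needed and vreg_needed): walking each block backward, an instruction is lowered only if it is side-effecting or one of its result vregs has been demanded, and the use_input_reg callback is what sets the demand bit for a producer whose register the backend actually uses. A standalone sketch of that backward, demand-driven selection on a toy IR (in this sketch every input of a lowered op is demanded; the real scheme only demands inputs the backend does not merge or fold):

    /// A toy instruction: the values it reads, the value it defines (if any),
    /// and whether it must execute for its side effect.
    struct Op {
        uses: Vec<usize>,
        def: Option<usize>,
        side_effect: bool,
    }

    /// Reverse walk with demand bits, the analogue of `inst_needed` / `vreg_needed`.
    /// Returns, in forward order, the ops that actually get lowered.
    fn select_for_lowering(ops: &[Op], num_values: usize) -> Vec<usize> {
        let mut value_needed = vec![false; num_values];
        let mut lowered = Vec::new();
        for (idx, op) in ops.iter().enumerate().rev() {
            let demanded = op.def.map_or(false, |d| value_needed[d]);
            if op.side_effect || demanded {
                lowered.push(idx);
                for &u in &op.uses {
                    // The analogue of `use_input_reg`: force the producer to be emitted.
                    value_needed[u] = true;
                }
            }
        }
        lowered.reverse();
        lowered
    }

    fn main() {
        // v0 = iconst (pure), v1 = iconst (pure, dead), v2 = iadd v0, v0 (pure), store v2
        let ops = vec![
            Op { uses: vec![], def: Some(0), side_effect: false },
            Op { uses: vec![], def: Some(1), side_effect: false },
            Op { uses: vec![0, 0], def: Some(2), side_effect: false },
            Op { uses: vec![2], def: None, side_effect: true },
        ];
        // The dead iconst (op 1) is skipped; everything feeding the store is kept.
        assert_eq!(select_for_lowering(&ops, 3), vec![0, 2, 3]);
    }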
- fn dec_use(&mut self, ir_inst: Inst) { - assert!(self.num_uses[ir_inst] > 0); - self.num_uses[ir_inst] -= 1; - debug!( - "incref: ir_inst {} now has {} uses", - ir_inst, self.num_uses[ir_inst] - ); - } + fn get_input_for_val(&self, at_inst: Inst, val: Value) -> LowerInput { + debug!("get_input_for_val: val {} at inst {}", val, at_inst); + let mut reg = self.value_regs[val]; + debug!(" -> reg {:?}", reg); + assert!(reg.is_valid()); + let mut inst = match self.f.dfg.value_def(val) { + // OK to merge source instruction if (i) we have a source + // instruction, and either (ii-a) it has no side effects, or (ii-b) + // it has the same color as this instruction. + ValueDef::Result(src_inst, result_idx) => { + debug!(" -> src inst {}", src_inst); + debug!( + " -> has side effect: {}", + has_side_effect_or_load(self.f, src_inst) + ); + debug!( + " -> our color is {:?}, src inst is {:?}", + self.inst_color(at_inst), + self.inst_color(src_inst) + ); + if !has_side_effect_or_load(self.f, src_inst) + || self.inst_color(at_inst) == self.inst_color(src_inst) + { + Some((src_inst, result_idx)) + } else { + None + } + } + _ => None, + }; + let constant = inst.and_then(|(inst, _)| self.get_constant(inst)); - /// Increase the use-count of an IR instruction. Use this when, e.g., isel incorporates - /// the computation of an input instruction directly, so that input instruction's - /// inputs are now used directly by the merged instruction. - fn inc_use(&mut self, ir_inst: Inst) { - self.num_uses[ir_inst] += 1; - debug!( - "decref: ir_inst {} now has {} uses", - ir_inst, self.num_uses[ir_inst] - ); + // Pinned-reg hack: if backend specifies a fixed pinned register, use it + // directly when we encounter a GetPinnedReg op, rather than lowering + // the actual op, and do not return the source inst to the caller; the + // value comes "out of the ether" and we will not force generation of + // the superfluous move. + if let Some((i, _)) = inst { + if self.f.dfg[i].opcode() == Opcode::GetPinnedReg { + if let Some(pr) = self.pinned_reg { + reg = pr; + } + inst = None; + } + } + + LowerInput { + reg, + inst, + constant, + } } } impl<'func, I: VCodeInst> LowerCtx for Lower<'func, I> { type I = I; - /// Get the instdata for a given IR instruction. - fn data(&self, ir_inst: Inst) -> &InstructionData { - &self.f.dfg[ir_inst] - } - - /// Get the controlling type for a polymorphic IR instruction. - fn ty(&self, ir_inst: Inst) -> Type { - self.f.dfg.ctrl_typevar(ir_inst) - } - fn abi(&mut self) -> &dyn ABIBody { self.vcode.abi() } - /// Emit a machine instruction. - fn emit(&mut self, mach_inst: I) { - self.vcode.push(mach_inst); - } - - /// Indicate that a merge has occurred. - fn merged(&mut self, from_inst: Inst) { - debug!("merged: inst {}", from_inst); - // First, inc-ref all inputs of `from_inst`, because they are now used - // directly by `into_inst`. - for arg in self.f.dfg.inst_args(from_inst) { - let arg = self.f.dfg.resolve_aliases(*arg); - match self.f.dfg.value_def(arg) { - ValueDef::Result(src_inst, _) => { - debug!(" -> inc-reffing src inst {}", src_inst); - self.inc_use(src_inst); - } - _ => {} - } - } - // Then, dec-ref the merged instruction itself. It still retains references - // to its arguments (inc-ref'd above). If its refcount has reached zero, - // it will be skipped during emission and its args will be dec-ref'd at that - // time. - self.dec_use(from_inst); - } - - /// Get the producing instruction, if any, and output number, for the `idx`th input to the - /// given IR instruction. 
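On the backend side, the LowerInput handed out by get_input() (and built by get_input_for_val above) is typically consumed in priority order: fold the constant into an immediate when the encoding allows, otherwise merge the producing instruction if one is offered, otherwise fall back to the register and call use_input_reg() so the producer really gets emitted. A hedged sketch of that pattern; the Input and Operand types and fits_in_imm12 are illustrative stand-ins, not backend code from this patch:

    /// Mirrors the shape of `LowerInput` (plain ids stand in for `Reg` / `Inst`).
    #[derive(Clone, Copy)]
    struct Input {
        reg: u32,
        inst: Option<(u32, usize)>,
        constant: Option<u64>,
    }

    /// What a backend decided to do with one input operand.
    #[derive(Debug, PartialEq)]
    enum Operand {
        /// Fold the value into an immediate field.
        Imm(u64),
        /// Merge the producing instruction (e.g. a load or extend) into this one.
        Merge(u32),
        /// Use the register; the caller must then also mark it used,
        /// the analogue of `use_input_reg`.
        Reg(u32),
    }

    /// Illustrative immediate-range check, standing in for an ISA-specific predicate.
    fn fits_in_imm12(c: u64) -> bool {
        c < (1 << 12)
    }

    fn choose_operand(input: Input) -> Operand {
        if let Some(c) = input.constant {
            if fits_in_imm12(c) {
                return Operand::Imm(c);
            }
        }
        if let Some((src_inst, _result_idx)) = input.inst {
            // A real backend would only merge producers it has patterns for.
            return Operand::Merge(src_inst);
        }
        Operand::Reg(input.reg)
    }

    fn main() {
        assert_eq!(choose_operand(Input { reg: 1, inst: None, constant: Some(42) }), Operand::Imm(42));
        assert_eq!(choose_operand(Input { reg: 2, inst: Some((7, 0)), constant: None }), Operand::Merge(7));
        assert_eq!(choose_operand(Input { reg: 3, inst: None, constant: None }), Operand::Reg(3));
    }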
- fn input_inst(&self, ir_inst: Inst, idx: usize) -> Option<(Inst, usize)> { - let val = self.f.dfg.inst_args(ir_inst)[idx]; - let val = self.f.dfg.resolve_aliases(val); - match self.f.dfg.value_def(val) { - ValueDef::Result(src_inst, result_idx) => Some((src_inst, result_idx)), - _ => None, - } - } - - /// Map a Value to its associated writable (probably virtual) Reg. - fn value_to_writable_reg(&self, val: Value) -> Writable { - let val = self.f.dfg.resolve_aliases(val); - Writable::from_reg(self.value_regs[val]) - } - - /// Map a Value to its associated (probably virtual) Reg. - fn value_to_reg(&self, val: Value) -> Reg { - let val = self.f.dfg.resolve_aliases(val); - self.value_regs[val] - } - - /// Get the `idx`th input to the given IR instruction as a virtual register. - fn input(&self, ir_inst: Inst, idx: usize) -> Reg { - let val = self.f.dfg.inst_args(ir_inst)[idx]; - let val = self.f.dfg.resolve_aliases(val); - self.value_to_reg(val) - } - - /// Get the `idx`th output of the given IR instruction as a virtual register. - fn output(&self, ir_inst: Inst, idx: usize) -> Writable { - let val = self.f.dfg.inst_results(ir_inst)[idx]; - self.value_to_writable_reg(val) - } - - /// Get a new temp. - fn tmp(&mut self, rc: RegClass, ty: Type) -> Writable { - let v = self.next_vreg; - self.next_vreg += 1; - let vreg = Reg::new_virtual(rc, v); - self.vcode.set_vreg_type(vreg.as_virtual_reg().unwrap(), ty); - Writable::from_reg(vreg) - } - - /// Get the number of inputs for the given IR instruction. - fn num_inputs(&self, ir_inst: Inst) -> usize { - self.f.dfg.inst_args(ir_inst).len() - } - - /// Get the number of outputs for the given IR instruction. - fn num_outputs(&self, ir_inst: Inst) -> usize { - self.f.dfg.inst_results(ir_inst).len() - } - - /// Get the type for an instruction's input. - fn input_ty(&self, ir_inst: Inst, idx: usize) -> Type { - let val = self.f.dfg.inst_args(ir_inst)[idx]; - let val = self.f.dfg.resolve_aliases(val); - self.f.dfg.value_type(val) - } - - /// Get the type for an instruction's output. - fn output_ty(&self, ir_inst: Inst, idx: usize) -> Type { - self.f.dfg.value_type(self.f.dfg.inst_results(ir_inst)[idx]) - } - - /// Get the number of block params. - fn num_bb_params(&self, bb: Block) -> usize { - self.f.dfg.block_params(bb).len() - } - - /// Get the register for a block param. - fn bb_param(&self, bb: Block, idx: usize) -> Reg { - let val = self.f.dfg.block_params(bb)[idx]; - self.value_regs[val] - } - - /// Get the register for a return value. fn retval(&self, idx: usize) -> Writable { Writable::from_reg(self.retval_regs[idx].0) } - /// Get the target for a call instruction, as an `ExternalName`. Returns a tuple - /// providing this name and the "relocation distance", i.e., whether the backend - /// can assume the target will be "nearby" (within some small offset) or an - /// arbitrary address. (This comes from the `colocated` bit in the CLIF.) + fn data(&self, ir_inst: Inst) -> &InstructionData { + &self.f.dfg[ir_inst] + } + + fn ty(&self, ir_inst: Inst) -> Type { + self.f.dfg.ctrl_typevar(ir_inst) + } + fn call_target<'b>(&'b self, ir_inst: Inst) -> Option<(&'b ExternalName, RelocDistance)> { match &self.f.dfg[ir_inst] { &InstructionData::Call { func_ref, .. } @@ -686,7 +805,7 @@ impl<'func, I: VCodeInst> LowerCtx for Lower<'func, I> { _ => None, } } - /// Get the signature for a call or call-indirect instruction. + fn call_sig<'b>(&'b self, ir_inst: Inst) -> Option<&'b Signature> { match &self.f.dfg[ir_inst] { &InstructionData::Call { func_ref, .. 
} => { @@ -698,7 +817,6 @@ impl<'func, I: VCodeInst> LowerCtx for Lower<'func, I> { } } - /// Get the symbol name, relocation distance estimate, and offset for a symbol_value instruction. fn symbol_value<'b>(&'b self, ir_inst: Inst) -> Option<(&'b ExternalName, RelocDistance, i64)> { match &self.f.dfg[ir_inst] { &InstructionData::UnaryGlobalValue { global_value, .. } => { @@ -720,7 +838,6 @@ impl<'func, I: VCodeInst> LowerCtx for Lower<'func, I> { } } - /// Returns the memory flags of a given memory access. fn memflags(&self, ir_inst: Inst) -> Option { match &self.f.dfg[ir_inst] { &InstructionData::Load { flags, .. } @@ -731,27 +848,94 @@ impl<'func, I: VCodeInst> LowerCtx for Lower<'func, I> { } } - /// Get the source location for a given instruction. fn srcloc(&self, ir_inst: Inst) -> SourceLoc { self.f.srclocs[ir_inst] } + + fn inst_color(&self, ir_inst: Inst) -> InstColor { + self.inst_colors[ir_inst] + } + + fn num_inputs(&self, ir_inst: Inst) -> usize { + self.f.dfg.inst_args(ir_inst).len() + } + + fn num_outputs(&self, ir_inst: Inst) -> usize { + self.f.dfg.inst_results(ir_inst).len() + } + + fn input_ty(&self, ir_inst: Inst, idx: usize) -> Type { + let val = self.f.dfg.inst_args(ir_inst)[idx]; + let val = self.f.dfg.resolve_aliases(val); + self.f.dfg.value_type(val) + } + + fn output_ty(&self, ir_inst: Inst, idx: usize) -> Type { + self.f.dfg.value_type(self.f.dfg.inst_results(ir_inst)[idx]) + } + + fn get_constant(&self, ir_inst: Inst) -> Option { + self.inst_constants.get(&ir_inst).cloned() + } + + fn get_input(&self, ir_inst: Inst, idx: usize) -> LowerInput { + let val = self.f.dfg.inst_args(ir_inst)[idx]; + let val = self.f.dfg.resolve_aliases(val); + self.get_input_for_val(ir_inst, val) + } + + fn get_output(&mut self, ir_inst: Inst, idx: usize) -> Writable { + let val = self.f.dfg.inst_results(ir_inst)[idx]; + Writable::from_reg(self.value_regs[val]) + } + + fn tmp(&mut self, rc: RegClass, ty: Type) -> Writable { + let v = self.next_vreg; + self.next_vreg += 1; + let vreg = Reg::new_virtual(rc, v); + self.vcode.set_vreg_type(vreg.as_virtual_reg().unwrap(), ty); + Writable::from_reg(vreg) + } + + fn emit(&mut self, mach_inst: I) { + self.ir_insts.push(mach_inst); + } + + fn use_input_reg(&mut self, input: LowerInput) { + debug!("use_input_reg: vreg {:?} is needed", input.reg); + self.vreg_needed[input.reg.get_index()] = true; + } } -fn visit_branch_targets(f: &Function, block: Block, inst: Inst, mut visit: F) { +/// Visit all successors of a block with a given visitor closure. 
+pub(crate) fn visit_block_succs(f: &Function, block: Block, mut visit: F) { + for inst in f.layout.block_likely_branches(block) { + if f.dfg[inst].opcode().is_branch() { + visit_branch_targets(f, block, inst, &mut visit); + } + } +} + +fn visit_branch_targets( + f: &Function, + block: Block, + inst: Inst, + visit: &mut F, +) { if f.dfg[inst].opcode() == Opcode::Fallthrough { - visit(f.layout.next_block(block).unwrap()); + visit(inst, f.layout.next_block(block).unwrap()); } else { match f.dfg[inst].analyze_branch(&f.dfg.value_lists) { BranchInfo::NotABranch => {} BranchInfo::SingleDest(dest, _) => { - visit(dest); + visit(inst, dest); } BranchInfo::Table(table, maybe_dest) => { if let Some(dest) = maybe_dest { - visit(dest); + visit(inst, dest); } for &dest in f.jump_tables[table].as_slice() { - visit(dest); + visit(inst, dest); } } } diff --git a/cranelift/codegen/src/machinst/mod.rs b/cranelift/codegen/src/machinst/mod.rs index cc92982a84..517c3ac81c 100644 --- a/cranelift/codegen/src/machinst/mod.rs +++ b/cranelift/codegen/src/machinst/mod.rs @@ -109,6 +109,7 @@ use regalloc::RegUsageCollector; use regalloc::{ RealReg, RealRegUniverse, Reg, RegClass, RegUsageMapper, SpillSlot, VirtualReg, Writable, }; +use smallvec::SmallVec; use std::string::String; use target_lexicon::Triple; @@ -124,8 +125,8 @@ pub mod abi; pub use abi::*; pub mod pretty_print; pub use pretty_print::*; -pub mod sections; -pub use sections::*; +pub mod buffer; +pub use buffer::*; pub mod adapter; pub use adapter::*; @@ -152,6 +153,9 @@ pub trait MachInst: Clone + Debug { /// Generate a move. fn gen_move(to_reg: Writable, from_reg: Reg, ty: Type) -> Self; + /// Generate a constant into a reg. + fn gen_constant(to_reg: Writable, value: u64, ty: Type) -> SmallVec<[Self; 4]>; + /// Generate a zero-length no-op. fn gen_zero_len_nop() -> Self; @@ -166,7 +170,7 @@ pub trait MachInst: Clone + Debug { /// Generate a jump to another target. Used during lowering of /// control flow. - fn gen_jump(target: BlockIndex) -> Self; + fn gen_jump(target: MachLabel) -> Self; /// Generate a NOP. The `preferred_size` parameter allows the caller to /// request a NOP of that size, or as close to it as possible. The machine @@ -175,22 +179,62 @@ pub trait MachInst: Clone + Debug { /// the instruction must have a nonzero size. fn gen_nop(preferred_size: usize) -> Self; - /// Rewrite block targets using the block-target map. - fn with_block_rewrites(&mut self, block_target_map: &[BlockIndex]); - - /// Finalize branches once the block order (fallthrough) is known. - fn with_fallthrough_block(&mut self, fallthrough_block: Option); - - /// Update instruction once block offsets are known. These offsets are - /// relative to the beginning of the function. `targets` is indexed by - /// BlockIndex. - fn with_block_offsets(&mut self, my_offset: CodeOffset, targets: &[CodeOffset]); + /// Get the register universe for this backend. + fn reg_universe(flags: &Flags) -> RealRegUniverse; /// Align a basic block offset (from start of function). By default, no /// alignment occurs. fn align_basic_block(offset: CodeOffset) -> CodeOffset { offset } + + /// What is the worst-case instruction size emitted by this instruction type? + fn worst_case_size() -> CodeOffset; + + /// A label-use kind: a type that describes the types of label references that + /// can occur in an instruction. + type LabelUse: MachInstLabelUse; +} + +/// A descriptor of a label reference (use) in an instruction set. 
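As a concrete illustration of the label-use descriptor trait declared immediately below: a toy kind describing a 32-bit little-endian PC-relative field, showing how the patch contract rewrites buffer bytes once both the use offset and the label offset are known. This is illustrative only and does not implement the real trait; actual kinds, such as the 26-bit ARM jump form mentioned in the veneer documentation below, pack a word-scaled delta into specific instruction bits instead.

    /// A toy label-use kind: a 4-byte little-endian PC-relative word at `use_offset`.
    /// Mirrors the shape of the `MachInstLabelUse` methods.
    #[derive(Clone, Copy, Debug, PartialEq, Eq)]
    struct PcRel32;

    impl PcRel32 {
        fn max_pos_range(self) -> u32 {
            0x7fff_ffff
        }
        fn patch_size(self) -> u32 {
            4
        }
        /// Write `label_offset - use_offset` into the 4 bytes at `use_offset`.
        fn patch(self, buffer: &mut [u8], use_offset: u32, label_offset: u32) {
            let delta = label_offset.wrapping_sub(use_offset) as i32;
            let start = use_offset as usize;
            buffer[start..start + 4].copy_from_slice(&delta.to_le_bytes());
        }
    }

    fn main() {
        let mut code = vec![0u8; 12];
        // A label bound at offset 12, referenced from a fixup at offset 4.
        assert!(12 - 4 <= PcRel32.max_pos_range());
        assert_eq!(PcRel32.patch_size(), 4);
        PcRel32.patch(&mut code, 4, 12);
        assert_eq!(&code[4..8], &[8u8, 0, 0, 0]);
    }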
+pub trait MachInstLabelUse: Clone + Copy + Debug + Eq { + /// Required alignment for any veneer. Usually the required instruction + /// alignment (e.g., 4 for a RISC with 32-bit instructions, or 1 for x86). + const ALIGN: CodeOffset; + + /// What is the maximum PC-relative range (positive)? E.g., if `1024`, a + /// label-reference fixup at offset `x` is valid if the label resolves to `x + /// + 1024`. + fn max_pos_range(self) -> CodeOffset; + /// What is the maximum PC-relative range (negative)? This is the absolute + /// value; i.e., if `1024`, then a label-reference fixup at offset `x` is + /// valid if the label resolves to `x - 1024`. + fn max_neg_range(self) -> CodeOffset; + /// What is the size of code-buffer slice this label-use needs to patch in + /// the label's value? + fn patch_size(self) -> CodeOffset; + /// Perform a code-patch, given the offset into the buffer of this label use + /// and the offset into the buffer of the label's definition. + /// It is guaranteed that, given `delta = offset - label_offset`, we will + /// have `offset >= -self.max_neg_range()` and `offset <= + /// self.max_pos_range()`. + fn patch(self, buffer: &mut [u8], use_offset: CodeOffset, label_offset: CodeOffset); + /// Can the label-use be patched to a veneer that supports a longer range? + /// Usually valid for jumps (a short-range jump can jump to a longer-range + /// jump), but not for e.g. constant pool references, because the constant + /// load would require different code (one more level of indirection). + fn supports_veneer(self) -> bool; + /// How many bytes are needed for a veneer? + fn veneer_size(self) -> CodeOffset; + /// Generate a veneer. The given code-buffer slice is `self.veneer_size()` + /// bytes long at offset `veneer_offset` in the buffer. The original + /// label-use will be patched to refer to this veneer's offset. A new + /// (offset, LabelUse) is returned that allows the veneer to use the actual + /// label. For veneers to work properly, it is expected that the new veneer + /// has a larger range; on most platforms this probably means either a + /// "long-range jump" (e.g., on ARM, the 26-bit form), or if already at that + /// stage, a jump that supports a full 32-bit range, for example. + fn generate_veneer(self, buffer: &mut [u8], veneer_offset: CodeOffset) -> (CodeOffset, Self); } /// Describes a block terminator (not call) in the vcode, when its branches @@ -202,26 +246,26 @@ pub enum MachTerminator<'a> { /// A return instruction. Ret, /// An unconditional branch to another block. - Uncond(BlockIndex), + Uncond(MachLabel), /// A conditional branch to one of two other blocks. - Cond(BlockIndex, BlockIndex), + Cond(MachLabel, MachLabel), /// An indirect branch with known possible targets. - Indirect(&'a [BlockIndex]), + Indirect(&'a [MachLabel]), } /// A trait describing the ability to encode a MachInst into binary machine code. -pub trait MachInstEmit { +pub trait MachInstEmit: MachInst { /// Persistent state carried across `emit` invocations. type State: Default + Clone + Debug; /// Emit the instruction. - fn emit(&self, code: &mut O, flags: &Flags, state: &mut Self::State); + fn emit(&self, code: &mut MachBuffer, flags: &Flags, state: &mut Self::State); } /// The result of a `MachBackend::compile_function()` call. Contains machine /// code (as bytes) and a disassembly, if requested. pub struct MachCompileResult { /// Machine code. - pub sections: MachSections, + pub buffer: MachBufferFinalized, /// Size of stack frame, in bytes. 
pub frame_size: u32, /// Disassembly, if requested. @@ -231,7 +275,7 @@ pub struct MachCompileResult { impl MachCompileResult { /// Get a `CodeInfo` describing section sizes from this compilation result. pub fn code_info(&self) -> CodeInfo { - let code_size = self.sections.total_size(); + let code_size = self.buffer.total_size(); CodeInfo { code_size, jumptables_size: 0, diff --git a/cranelift/codegen/src/machinst/sections.rs b/cranelift/codegen/src/machinst/sections.rs deleted file mode 100644 index 0bd97dcdb6..0000000000 --- a/cranelift/codegen/src/machinst/sections.rs +++ /dev/null @@ -1,460 +0,0 @@ -//! In-memory representation of compiled machine code, in multiple sections -//! (text, constant pool / rodata, etc). Emission occurs into multiple sections -//! simultaneously, so we buffer the result in memory and hand off to the -//! caller at the end of compilation. - -use crate::binemit::{Addend, CodeOffset, CodeSink, Reloc}; -use crate::ir::{ExternalName, Opcode, SourceLoc, TrapCode}; - -use alloc::vec::Vec; - -/// A collection of sections with defined start-offsets. -pub struct MachSections { - /// Sections, in offset order. - pub sections: Vec, -} - -impl MachSections { - /// New, empty set of sections. - pub fn new() -> MachSections { - MachSections { sections: vec![] } - } - - /// Add a section with a known offset and size. Returns the index. - pub fn add_section(&mut self, start: CodeOffset, length: CodeOffset) -> usize { - let idx = self.sections.len(); - self.sections.push(MachSection::new(start, length)); - idx - } - - /// Mutably borrow the given section by index. - pub fn get_section<'a>(&'a mut self, idx: usize) -> &'a mut MachSection { - &mut self.sections[idx] - } - - /// Get mutable borrows of two sections simultaneously. Used during - /// instruction emission to provide references to the .text and .rodata - /// (constant pool) sections. - pub fn two_sections<'a>( - &'a mut self, - idx1: usize, - idx2: usize, - ) -> (&'a mut MachSection, &'a mut MachSection) { - assert!(idx1 < idx2); - assert!(idx1 < self.sections.len()); - assert!(idx2 < self.sections.len()); - let (first, rest) = self.sections.split_at_mut(idx2); - (&mut first[idx1], &mut rest[0]) - } - - /// Emit this set of sections to a set of sinks for the code, - /// relocations, traps, and stackmap. - pub fn emit(&self, sink: &mut CS) { - // N.B.: we emit every section into the .text section as far as - // the `CodeSink` is concerned; we do not bother to segregate - // the contents into the actual program text, the jumptable and the - // rodata (constant pool). This allows us to generate code assuming - // that these will not be relocated relative to each other, and avoids - // having to designate each section as belonging in one of the three - // fixed categories defined by `CodeSink`. If this becomes a problem - // later (e.g. because of memory permissions or similar), we can - // add this designation and segregate the output; take care, however, - // to add the appropriate relocations in this case. - - for section in &self.sections { - if section.data.len() > 0 { - while sink.offset() < section.start_offset { - sink.put1(0); - } - section.emit(sink); - } - } - sink.begin_jumptables(); - sink.begin_rodata(); - sink.end_codegen(); - } - - /// Get a list of source location mapping tuples in sorted-by-start-offset order. - pub fn get_srclocs_sorted<'a>(&'a self) -> MachSectionsSrcLocs<'a> { - MachSectionsSrcLocs::new(&self.sections) - } - - /// Get the total required size for these sections. 
- pub fn total_size(&self) -> CodeOffset { - if self.sections.len() == 0 { - 0 - } else { - // Find the last non-empty section. - self.sections - .iter() - .rev() - .find(|s| s.data.len() > 0) - .map(|s| s.cur_offset_from_start()) - .unwrap_or(0) - } - } -} - -/// An iterator over the srclocs in each section. -/// Returns MachSrcLocs in an order sorted by start location. -pub struct MachSectionsSrcLocs<'a> { - sections: &'a [MachSection], - cur_section: usize, - cur_srcloc: usize, - // For validation: - last_offset: CodeOffset, -} - -impl<'a> MachSectionsSrcLocs<'a> { - fn new(sections: &'a [MachSection]) -> MachSectionsSrcLocs<'a> { - MachSectionsSrcLocs { - sections, - cur_section: 0, - cur_srcloc: 0, - last_offset: 0, - } - } -} - -impl<'a> Iterator for MachSectionsSrcLocs<'a> { - type Item = &'a MachSrcLoc; - - fn next(&mut self) -> Option<&'a MachSrcLoc> { - // We simply iterate through sections and srcloc records in order. This produces a - // sorted order naturally because sections are in starting-offset-order, and srclocs - // are produced as a section is emitted into, so are in order as well. - - // If we're out of sections, we're done. - if self.cur_section >= self.sections.len() { - return None; - } - - // Otherwise, make sure we have a srcloc in the current section left to return, and - // advance to the next section if not. Done if we run out of sections. - while self.cur_srcloc >= self.sections[self.cur_section].srclocs.len() { - self.cur_srcloc = 0; - self.cur_section += 1; - if self.cur_section >= self.sections.len() { - return None; - } - } - - let loc = &self.sections[self.cur_section].srclocs[self.cur_srcloc]; - self.cur_srcloc += 1; - debug_assert!(loc.start >= self.last_offset); - self.last_offset = loc.start; - Some(loc) - } -} - -/// An abstraction over MachSection and MachSectionSize: some -/// receiver of section data. -pub trait MachSectionOutput { - /// Get the current offset from the start of all sections. - fn cur_offset_from_start(&self) -> CodeOffset; - - /// Get the start offset of this section. - fn start_offset(&self) -> CodeOffset; - - /// Add 1 byte to the section. - fn put1(&mut self, _: u8); - - /// Add 2 bytes to the section. - fn put2(&mut self, value: u16) { - let [b0, b1] = value.to_le_bytes(); - self.put1(b0); - self.put1(b1); - } - - /// Add 4 bytes to the section. - fn put4(&mut self, value: u32) { - let [b0, b1, b2, b3] = value.to_le_bytes(); - self.put1(b0); - self.put1(b1); - self.put1(b2); - self.put1(b3); - } - - /// Add 8 bytes to the section. - fn put8(&mut self, value: u64) { - let [b0, b1, b2, b3, b4, b5, b6, b7] = value.to_le_bytes(); - self.put1(b0); - self.put1(b1); - self.put1(b2); - self.put1(b3); - self.put1(b4); - self.put1(b5); - self.put1(b6); - self.put1(b7); - } - - /// Add a slice of bytes to the section. - fn put_data(&mut self, data: &[u8]); - - /// Add a relocation at the current offset. - fn add_reloc(&mut self, loc: SourceLoc, kind: Reloc, name: &ExternalName, addend: Addend); - - /// Add a trap record at the current offset. - fn add_trap(&mut self, loc: SourceLoc, code: TrapCode); - - /// Add a call return address record at the current offset. - fn add_call_site(&mut self, loc: SourceLoc, opcode: Opcode); - - /// Start the output for the given source-location at the current offset. - fn start_srcloc(&mut self, loc: SourceLoc); - - /// End the output for the previously-given source-location at the current offset. - fn end_srcloc(&mut self); - - /// Align up to the given alignment. 
- fn align_to(&mut self, align_to: CodeOffset) { - assert!(align_to.is_power_of_two()); - while self.cur_offset_from_start() & (align_to - 1) != 0 { - self.put1(0); - } - } -} - -/// A section of output to be emitted to a CodeSink / RelocSink in bulk. -/// Multiple sections may be created with known start offsets in advance; the -/// usual use-case is to create the .text (code) and .rodata (constant pool) at -/// once, after computing the length of the code, so that constant references -/// can use known offsets as instructions are emitted. -pub struct MachSection { - /// The starting offset of this section. - pub start_offset: CodeOffset, - /// The limit of this section, defined by the start of the next section. - pub length_limit: CodeOffset, - /// The section contents, as raw bytes. - pub data: Vec, - /// Any relocations referring to this section. - pub relocs: Vec, - /// Any trap records referring to this section. - pub traps: Vec, - /// Any call site records referring to this section. - pub call_sites: Vec, - /// Any source location mappings referring to this section. - pub srclocs: Vec, - /// The current source location in progress (after `start_srcloc()` and before `end_srcloc()`). - /// This is a (start_offset, src_loc) tuple. - pub cur_srcloc: Option<(CodeOffset, SourceLoc)>, -} - -impl MachSection { - /// Create a new section, known to start at `start_offset` and with a size limited to `length_limit`. - pub fn new(start_offset: CodeOffset, length_limit: CodeOffset) -> MachSection { - MachSection { - start_offset, - length_limit, - data: vec![], - relocs: vec![], - traps: vec![], - call_sites: vec![], - srclocs: vec![], - cur_srcloc: None, - } - } - - /// Emit this section to the CodeSink and other associated sinks. The - /// current offset of the CodeSink must match the starting offset of this - /// section. 
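The `align_to` helper above pads byte-by-byte until the offset reaches a power-of-two boundary. The closed-form equivalent is the usual round-up bit trick, shown here as a small self-contained sketch (the alignment value 4 is just an example):

    fn align_up(offset: u32, align: u32) -> u32 {
        assert!(align.is_power_of_two());
        (offset + align - 1) & !(align - 1)
    }

    fn main() {
        assert_eq!(align_up(13, 4), 16);
        assert_eq!(align_up(16, 4), 16); // already aligned: no padding
        // Number of zero bytes the byte-at-a-time loop would emit:
        assert_eq!(align_up(13, 4) - 13, 3);
    }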
- pub fn emit(&self, sink: &mut CS) { - assert!(sink.offset() == self.start_offset); - - let mut next_reloc = 0; - let mut next_trap = 0; - let mut next_call_site = 0; - for (idx, byte) in self.data.iter().enumerate() { - if next_reloc < self.relocs.len() { - let reloc = &self.relocs[next_reloc]; - if reloc.offset == idx as CodeOffset { - sink.reloc_external(reloc.srcloc, reloc.kind, &reloc.name, reloc.addend); - next_reloc += 1; - } - } - if next_trap < self.traps.len() { - let trap = &self.traps[next_trap]; - if trap.offset == idx as CodeOffset { - sink.trap(trap.code, trap.srcloc); - next_trap += 1; - } - } - if next_call_site < self.call_sites.len() { - let call_site = &self.call_sites[next_call_site]; - if call_site.ret_addr == idx as CodeOffset { - sink.add_call_site(call_site.opcode, call_site.srcloc); - next_call_site += 1; - } - } - sink.put1(*byte); - } - } -} - -impl MachSectionOutput for MachSection { - fn cur_offset_from_start(&self) -> CodeOffset { - self.start_offset + self.data.len() as CodeOffset - } - - fn start_offset(&self) -> CodeOffset { - self.start_offset - } - - fn put1(&mut self, value: u8) { - assert!(((self.data.len() + 1) as CodeOffset) <= self.length_limit); - self.data.push(value); - } - - fn put_data(&mut self, data: &[u8]) { - assert!(((self.data.len() + data.len()) as CodeOffset) <= self.length_limit); - self.data.extend_from_slice(data); - } - - fn add_reloc(&mut self, srcloc: SourceLoc, kind: Reloc, name: &ExternalName, addend: Addend) { - let name = name.clone(); - self.relocs.push(MachReloc { - offset: self.data.len() as CodeOffset, - srcloc, - kind, - name, - addend, - }); - } - - fn add_trap(&mut self, srcloc: SourceLoc, code: TrapCode) { - self.traps.push(MachTrap { - offset: self.data.len() as CodeOffset, - srcloc, - code, - }); - } - - fn add_call_site(&mut self, srcloc: SourceLoc, opcode: Opcode) { - self.call_sites.push(MachCallSite { - ret_addr: self.data.len() as CodeOffset, - srcloc, - opcode, - }); - } - - fn start_srcloc(&mut self, loc: SourceLoc) { - self.cur_srcloc = Some((self.cur_offset_from_start(), loc)); - } - - fn end_srcloc(&mut self) { - let (start, loc) = self - .cur_srcloc - .take() - .expect("end_srcloc() called without start_srcloc()"); - let end = self.cur_offset_from_start(); - // Skip zero-length extends. - debug_assert!(end >= start); - if end > start { - self.srclocs.push(MachSrcLoc { start, end, loc }); - } - } -} - -/// A MachSectionOutput implementation that records only size. -pub struct MachSectionSize { - /// The starting offset of this section. - pub start_offset: CodeOffset, - /// The current offset of this section. - pub offset: CodeOffset, -} - -impl MachSectionSize { - /// Create a new size-counting dummy section. - pub fn new(start_offset: CodeOffset) -> MachSectionSize { - MachSectionSize { - start_offset, - offset: start_offset, - } - } - - /// Return the size this section would take if emitted with a real sink. - pub fn size(&self) -> CodeOffset { - self.offset - self.start_offset - } -} - -impl MachSectionOutput for MachSectionSize { - fn cur_offset_from_start(&self) -> CodeOffset { - // All size-counting sections conceptually start at offset 0; this doesn't - // matter when counting code size. 
- self.offset - } - - fn start_offset(&self) -> CodeOffset { - self.start_offset - } - - fn put1(&mut self, _: u8) { - self.offset += 1; - } - - fn put_data(&mut self, data: &[u8]) { - self.offset += data.len() as CodeOffset; - } - - fn add_reloc(&mut self, _: SourceLoc, _: Reloc, _: &ExternalName, _: Addend) {} - - fn add_trap(&mut self, _: SourceLoc, _: TrapCode) {} - - fn add_call_site(&mut self, _: SourceLoc, _: Opcode) {} - - fn start_srcloc(&mut self, _: SourceLoc) {} - - fn end_srcloc(&mut self) {} -} - -/// A relocation resulting from a compilation. -pub struct MachReloc { - /// The offset at which the relocation applies, *relative to the - /// containing section*. - pub offset: CodeOffset, - /// The original source location. - pub srcloc: SourceLoc, - /// The kind of relocation. - pub kind: Reloc, - /// The external symbol / name to which this relocation refers. - pub name: ExternalName, - /// The addend to add to the symbol value. - pub addend: i64, -} - -/// A trap record resulting from a compilation. -pub struct MachTrap { - /// The offset at which the trap instruction occurs, *relative to the - /// containing section*. - pub offset: CodeOffset, - /// The original source location. - pub srcloc: SourceLoc, - /// The trap code. - pub code: TrapCode, -} - -/// A call site record resulting from a compilation. -pub struct MachCallSite { - /// The offset of the call's return address, *relative to the containing section*. - pub ret_addr: CodeOffset, - /// The original source location. - pub srcloc: SourceLoc, - /// The call's opcode. - pub opcode: Opcode, -} - -/// A source-location mapping resulting from a compilation. -#[derive(Clone, Debug)] -pub struct MachSrcLoc { - /// The start of the region of code corresponding to a source location. - /// This is relative to the start of the function, not to the start of the - /// section. - pub start: CodeOffset, - /// The end of the region of code corresponding to a source location. - /// This is relative to the start of the section, not to the start of the - /// section. - pub end: CodeOffset, - /// The source location. - pub loc: SourceLoc, -} diff --git a/cranelift/codegen/src/machinst/vcode.rs b/cranelift/codegen/src/machinst/vcode.rs index ff9961aefa..d4c13bff0c 100644 --- a/cranelift/codegen/src/machinst/vcode.rs +++ b/cranelift/codegen/src/machinst/vcode.rs @@ -17,8 +17,7 @@ //! See the main module comment in `mod.rs` for more details on the VCode-based //! backend pipeline. -use crate::entity::SecondaryMap; -use crate::ir::{self, Block, SourceLoc}; +use crate::ir::{self, SourceLoc}; use crate::machinst::*; use crate::settings; @@ -30,8 +29,6 @@ use regalloc::{ use alloc::boxed::Box; use alloc::{borrow::Cow, vec::Vec}; -use log::debug; -use smallvec::SmallVec; use std::fmt; use std::iter; use std::string::String; @@ -43,8 +40,8 @@ pub type BlockIndex = u32; /// VCodeInst wraps all requirements for a MachInst to be in VCode: it must be /// a `MachInst` and it must be able to emit itself at least to a `SizeCodeSink`. -pub trait VCodeInst: MachInst + MachInstEmit + MachInstEmit {} -impl + MachInstEmit> VCodeInst for I {} +pub trait VCodeInst: MachInst + MachInstEmit {} +impl VCodeInst for I {} /// A function in "VCode" (virtualized-register code) form, after lowering. /// This is essentially a standard CFG of basic blocks, where each basic block @@ -80,29 +77,11 @@ pub struct VCode { /// correspond to each basic block's successors. block_succs: Vec, - /// Block indices by IR block. 
- block_by_bb: SecondaryMap, - - /// IR block for each VCode Block. The length of this Vec will likely be - /// less than the total number of Blocks, because new Blocks (for edge - /// splits, for example) are appended during lowering. - bb_by_block: Vec, - - /// Order of block IDs in final generated code. - final_block_order: Vec, - - /// Final block offsets. Computed during branch finalization and used - /// during emission. - final_block_offsets: Vec, - - /// Size of code, accounting for block layout / alignment. - code_size: CodeOffset, + /// Block-order information. + block_order: BlockLoweringOrder, /// ABI object. abi: Box>, - - /// The block targeted by fallthrough_returns, if there's one. - pub fallthrough_return_block: Option, } /// A builder for a VCode function body. This builder is designed for the @@ -123,12 +102,8 @@ pub struct VCodeBuilder { /// In-progress VCode. vcode: VCode, - /// Current basic block instructions, in reverse order (because blocks are - /// built bottom-to-top). - bb_insns: SmallVec<[(I, SourceLoc); 32]>, - - /// Current IR-inst instructions, in forward order. - ir_inst_insns: SmallVec<[(I, SourceLoc); 4]>, + /// Index of the last block-start in the vcode. + block_start: InsnIndex, /// Start of succs for the current block in the concatenated succs list. succ_start: usize, @@ -139,12 +114,11 @@ pub struct VCodeBuilder { impl VCodeBuilder { /// Create a new VCodeBuilder. - pub fn new(abi: Box>) -> VCodeBuilder { - let vcode = VCode::new(abi); + pub fn new(abi: Box>, block_order: BlockLoweringOrder) -> VCodeBuilder { + let vcode = VCode::new(abi, block_order); VCodeBuilder { vcode, - bb_insns: SmallVec::new(), - ir_inst_insns: SmallVec::new(), + block_start: 0, succ_start: 0, cur_srcloc: SourceLoc::default(), } @@ -155,14 +129,9 @@ impl VCodeBuilder { &mut *self.vcode.abi } - /// Set the fallthrough_return target block for this function. There must be at most once per - /// function. - pub fn set_fallthrough_return_block(&mut self, bb: Block) { - debug_assert!( - self.vcode.fallthrough_return_block.is_none(), - "a function must have at most one fallthrough-return instruction" - ); - self.vcode.fallthrough_return_block = Some(self.bb_to_bindex(bb)); + /// Access to the BlockLoweringOrder object. + pub fn block_order(&self) -> &BlockLoweringOrder { + &self.vcode.block_order } /// Set the type of a VReg. @@ -173,53 +142,17 @@ impl VCodeBuilder { self.vcode.vreg_types[vreg.get_index()] = ty; } - /// Return the underlying bb-to-BlockIndex map. - pub fn blocks_by_bb(&self) -> &SecondaryMap { - &self.vcode.block_by_bb - } - - /// Initialize the bb-to-BlockIndex map. Returns the first free - /// BlockIndex. - pub fn init_bb_map(&mut self, blocks: &[ir::Block]) -> BlockIndex { - let mut bindex: BlockIndex = 0; - for bb in blocks.iter() { - self.vcode.block_by_bb[*bb] = bindex; - self.vcode.bb_by_block.push(*bb); - bindex += 1; - } - bindex - } - - /// Get the BlockIndex for an IR block. - pub fn bb_to_bindex(&self, bb: ir::Block) -> BlockIndex { - self.vcode.block_by_bb[bb] - } - /// Set the current block as the entry block. pub fn set_entry(&mut self, block: BlockIndex) { self.vcode.entry = block; } - /// End the current IR instruction. Must be called after pushing any - /// instructions and prior to ending the basic block. - pub fn end_ir_inst(&mut self) { - while let Some(pair) = self.ir_inst_insns.pop() { - self.bb_insns.push(pair); - } - } - /// End the current basic block. 
Must be called after emitting vcode insts /// for IR insts and prior to ending the function (building the VCode). - pub fn end_bb(&mut self) -> BlockIndex { - assert!(self.ir_inst_insns.is_empty()); - let block_num = self.vcode.block_ranges.len() as BlockIndex; - // Push the instructions. - let start_idx = self.vcode.insts.len() as InsnIndex; - while let Some((i, loc)) = self.bb_insns.pop() { - self.vcode.insts.push(i); - self.vcode.srclocs.push(loc); - } + pub fn end_bb(&mut self) { + let start_idx = self.block_start; let end_idx = self.vcode.insts.len() as InsnIndex; + self.block_start = end_idx; // Add the instruction index range to the list of blocks. self.vcode.block_ranges.push((start_idx, end_idx)); // End the successors list. @@ -228,8 +161,6 @@ impl VCodeBuilder { .block_succ_range .push((self.succ_start, succ_end)); self.succ_start = succ_end; - - block_num } /// Push an instruction for the current BB and current IR inst within the BB. @@ -237,19 +168,27 @@ impl VCodeBuilder { match insn.is_term() { MachTerminator::None | MachTerminator::Ret => {} MachTerminator::Uncond(target) => { - self.vcode.block_succs.push(BlockIx::new(target)); + self.vcode.block_succs.push(BlockIx::new(target.get())); } MachTerminator::Cond(true_branch, false_branch) => { - self.vcode.block_succs.push(BlockIx::new(true_branch)); - self.vcode.block_succs.push(BlockIx::new(false_branch)); + self.vcode.block_succs.push(BlockIx::new(true_branch.get())); + self.vcode + .block_succs + .push(BlockIx::new(false_branch.get())); } MachTerminator::Indirect(targets) => { for target in targets { - self.vcode.block_succs.push(BlockIx::new(*target)); + self.vcode.block_succs.push(BlockIx::new(target.get())); } } } - self.ir_inst_insns.push((insn, self.cur_srcloc)); + self.vcode.insts.push(insn); + self.vcode.srclocs.push(self.cur_srcloc); + } + + /// Get the current source location. + pub fn get_srcloc(&self) -> SourceLoc { + self.cur_srcloc } /// Set the current source location. @@ -259,8 +198,6 @@ impl VCodeBuilder { /// Build the final VCode. pub fn build(self) -> VCode { - assert!(self.ir_inst_insns.is_empty()); - assert!(self.bb_insns.is_empty()); self.vcode } } @@ -282,35 +219,9 @@ fn is_redundant_move(insn: &I) -> bool { } } -fn is_trivial_jump_block(vcode: &VCode, block: BlockIndex) -> Option { - let range = vcode.block_insns(BlockIx::new(block)); - - debug!( - "is_trivial_jump_block: block {} has len {}", - block, - range.len() - ); - - if range.len() != 1 { - return None; - } - let insn = range.first(); - - debug!( - " -> only insn is: {:?} with terminator {:?}", - vcode.get_insn(insn), - vcode.get_insn(insn).is_term() - ); - - match vcode.get_insn(insn).is_term() { - MachTerminator::Uncond(target) => Some(target), - _ => None, - } -} - impl VCode { /// New empty VCode. - fn new(abi: Box>) -> VCode { + fn new(abi: Box>, block_order: BlockLoweringOrder) -> VCode { VCode { liveins: abi.liveins(), liveouts: abi.liveouts(), @@ -321,13 +232,8 @@ impl VCode { block_ranges: vec![], block_succ_range: vec![], block_succs: vec![], - block_by_bb: SecondaryMap::with_default(0), - bb_by_block: vec![], - final_block_order: vec![], - final_block_offsets: vec![], - code_size: 0, + block_order, abi, - fallthrough_return_block: None, } } @@ -367,8 +273,6 @@ impl VCode { /// instructions including spliced fill/reload/move instructions, and replace /// the VCode with them. 
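The new `end_bb` above no longer buffers instructions per block: instructions go straight into one flat vector, and each block is just a half-open index range over it, so the builder only has to remember where the current block started. A tiny stand-alone sketch of that bookkeeping (simplified types, not the real `VCodeBuilder`):

    struct Builder {
        insts: Vec<&'static str>,
        block_start: usize,
        block_ranges: Vec<(usize, usize)>,
    }

    impl Builder {
        fn push(&mut self, inst: &'static str) {
            self.insts.push(inst);
        }
        fn end_bb(&mut self) {
            // The current block covers everything pushed since the last end_bb().
            let start = self.block_start;
            let end = self.insts.len();
            self.block_start = end;
            self.block_ranges.push((start, end));
        }
    }

    fn main() {
        let mut b = Builder { insts: vec![], block_start: 0, block_ranges: vec![] };
        b.push("add");
        b.push("ret");
        b.end_bb();
        b.push("brz");
        b.end_bb();
        assert_eq!(b.block_ranges, vec![(0, 2), (2, 3)]);
    }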
pub fn replace_insns_from_regalloc(&mut self, result: RegAllocResult) { - self.final_block_order = compute_final_block_order(self); - // Record the spillslot count and clobbered registers for the ABI/stack // setup code. self.abi.set_num_spillslots(result.num_spill_slots as usize); @@ -383,11 +287,12 @@ impl VCode { let mut final_block_ranges = vec![(0, 0); self.num_blocks()]; let mut final_srclocs = vec![]; - for block in &self.final_block_order { - let (start, end) = block_ranges[*block as usize]; + for block in 0..self.num_blocks() { + let block = block as BlockIndex; + let (start, end) = block_ranges[block as usize]; let final_start = final_insns.len() as InsnIndex; - if *block == self.entry { + if block == self.entry { // Start with the prologue. let prologue = self.abi.gen_prologue(); let len = prologue.len(); @@ -429,7 +334,7 @@ impl VCode { } let final_end = final_insns.len() as InsnIndex; - final_block_ranges[*block as usize] = (final_start, final_end); + final_block_ranges[block as usize] = (final_start, final_end); } debug_assert!(final_insns.len() == final_srclocs.len()); @@ -439,175 +344,68 @@ impl VCode { self.block_ranges = final_block_ranges; } - /// Removes redundant branches, rewriting targets to point directly to the - /// ultimate block at the end of a chain of trivial one-target jumps. - pub fn remove_redundant_branches(&mut self) { - // For each block, compute the actual target block, looking through up to one - // block with single-target jumps (this will remove empty edge blocks inserted - // by phi-lowering). - let block_rewrites: Vec = (0..self.num_blocks() as u32) - .map(|bix| is_trivial_jump_block(self, bix).unwrap_or(bix)) - .collect(); - let mut refcounts: Vec = vec![0; self.num_blocks()]; - - debug!( - "remove_redundant_branches: block_rewrites = {:?}", - block_rewrites - ); - - refcounts[self.entry as usize] = 1; - - for block in 0..self.num_blocks() as u32 { - for insn in self.block_insns(BlockIx::new(block)) { - self.get_insn_mut(insn) - .with_block_rewrites(&block_rewrites[..]); - match self.get_insn(insn).is_term() { - MachTerminator::Uncond(bix) => { - refcounts[bix as usize] += 1; - } - MachTerminator::Cond(bix1, bix2) => { - refcounts[bix1 as usize] += 1; - refcounts[bix2 as usize] += 1; - } - MachTerminator::Indirect(blocks) => { - for block in blocks { - refcounts[*block as usize] += 1; - } - } - _ => {} - } - } - } - - let deleted: Vec = refcounts.iter().map(|r| *r == 0).collect(); - - let block_order = std::mem::replace(&mut self.final_block_order, vec![]); - self.final_block_order = block_order - .into_iter() - .filter(|b| !deleted[*b as usize]) - .collect(); - - // Rewrite successor information based on the block-rewrite map. - for succ in &mut self.block_succs { - let new_succ = block_rewrites[succ.get() as usize]; - *succ = BlockIx::new(new_succ); - } - } - - /// Mutate branch instructions to (i) lower two-way condbrs to one-way, - /// depending on fallthrough; and (ii) use concrete offsets. - pub fn finalize_branches(&mut self) + /// Emit the instructions to a `MachBuffer`, containing fixed-up code and external + /// reloc/trap/etc. records ready for use. + pub fn emit(&self) -> MachBuffer where - I: MachInstEmit, + I: MachInstEmit, { - // Compute fallthrough block, indexed by block. 
- let num_final_blocks = self.final_block_order.len(); - let mut block_fallthrough: Vec> = vec![None; self.num_blocks()]; - for i in 0..(num_final_blocks - 1) { - let from = self.final_block_order[i]; - let to = self.final_block_order[i + 1]; - block_fallthrough[from as usize] = Some(to); - } - - // Pass over VCode instructions and finalize two-way branches into - // one-way branches with fallthrough. - for block in 0..self.num_blocks() { - let next_block = block_fallthrough[block]; - let (start, end) = self.block_ranges[block]; - - for iix in start..end { - let insn = &mut self.insts[iix as usize]; - insn.with_fallthrough_block(next_block); - } - } - - let flags = self.abi.flags(); - - // Compute block offsets. - let mut code_section = MachSectionSize::new(0); - let mut block_offsets = vec![0; self.num_blocks()]; + let mut buffer = MachBuffer::new(); let mut state = Default::default(); - for &block in &self.final_block_order { - code_section.offset = I::align_basic_block(code_section.offset); - block_offsets[block as usize] = code_section.offset; - let (start, end) = self.block_ranges[block as usize]; - for iix in start..end { - self.insts[iix as usize].emit(&mut code_section, flags, &mut state); - } - } - // We now have the section layout. - self.final_block_offsets = block_offsets; - self.code_size = code_section.size(); - - // Update branches with known block offsets. This looks like the - // traversal above, but (i) does not update block_offsets, rather uses - // it (so forward references are now possible), and (ii) mutates the - // instructions. - let mut code_section = MachSectionSize::new(0); - let mut state = Default::default(); - for &block in &self.final_block_order { - code_section.offset = I::align_basic_block(code_section.offset); - let (start, end) = self.block_ranges[block as usize]; - for iix in start..end { - self.insts[iix as usize] - .with_block_offsets(code_section.offset, &self.final_block_offsets[..]); - self.insts[iix as usize].emit(&mut code_section, flags, &mut state); - } - } - } - - /// Emit the instructions to a list of sections. - pub fn emit(&self) -> MachSections - where - I: MachInstEmit, - { - let mut sections = MachSections::new(); - let code_idx = sections.add_section(0, self.code_size); - let code_section = sections.get_section(code_idx); - let mut state = Default::default(); + buffer.reserve_labels_for_blocks(self.num_blocks() as BlockIndex); // first N MachLabels are simply block indices. let flags = self.abi.flags(); let mut cur_srcloc = None; - for &block in &self.final_block_order { - let new_offset = I::align_basic_block(code_section.cur_offset_from_start()); - while new_offset > code_section.cur_offset_from_start() { + for block in 0..self.num_blocks() { + let block = block as BlockIndex; + let new_offset = I::align_basic_block(buffer.cur_offset()); + while new_offset > buffer.cur_offset() { // Pad with NOPs up to the aligned block offset. 
- let nop = I::gen_nop((new_offset - code_section.cur_offset_from_start()) as usize); - nop.emit(code_section, flags, &mut Default::default()); + let nop = I::gen_nop((new_offset - buffer.cur_offset()) as usize); + nop.emit(&mut buffer, flags, &mut Default::default()); } - assert_eq!(code_section.cur_offset_from_start(), new_offset); + assert_eq!(buffer.cur_offset(), new_offset); let (start, end) = self.block_ranges[block as usize]; + buffer.bind_label(MachLabel::from_block(block)); for iix in start..end { let srcloc = self.srclocs[iix as usize]; if cur_srcloc != Some(srcloc) { if cur_srcloc.is_some() { - code_section.end_srcloc(); + buffer.end_srcloc(); } - code_section.start_srcloc(srcloc); + buffer.start_srcloc(srcloc); cur_srcloc = Some(srcloc); } - self.insts[iix as usize].emit(code_section, flags, &mut state); + self.insts[iix as usize].emit(&mut buffer, flags, &mut state); } if cur_srcloc.is_some() { - code_section.end_srcloc(); + buffer.end_srcloc(); cur_srcloc = None; } + + // Do we need an island? Get the worst-case size of the next BB and see if, having + // emitted that many bytes, we will be beyond the deadline. + if block < (self.num_blocks() - 1) as BlockIndex { + let next_block = block + 1; + let next_block_range = self.block_ranges[next_block as usize]; + let next_block_size = next_block_range.1 - next_block_range.0; + let worst_case_next_bb = I::worst_case_size() * next_block_size; + if buffer.island_needed(worst_case_next_bb) { + buffer.emit_island(); + } + } } - sections + buffer } /// Get the IR block for a BlockIndex, if one exists. pub fn bindex_to_bb(&self, block: BlockIndex) -> Option { - if (block as usize) < self.bb_by_block.len() { - Some(self.bb_by_block[block as usize]) - } else { - None - } + self.block_order.lowered_order()[block as usize].orig_block() } } @@ -712,7 +510,6 @@ impl fmt::Debug for VCode { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { writeln!(f, "VCode_Debug {{")?; writeln!(f, " Entry block: {}", self.entry)?; - writeln!(f, " Final block order: {:?}", self.final_block_order)?; for block in 0..self.num_blocks() { writeln!(f, "Block {}:", block,)?; @@ -736,52 +533,21 @@ impl ShowWithRRU for VCode { fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String { use std::fmt::Write; - // Calculate an order in which to display the blocks. This is the same - // as final_block_order, but also includes blocks which are in the - // representation but not in final_block_order. - let mut display_order = Vec::::new(); - // First display blocks in `final_block_order` - for bix in &self.final_block_order { - assert!((*bix as usize) < self.num_blocks()); - display_order.push(*bix as usize); - } - // Now also take care of those not listed in `final_block_order`. - // This is quadratic, but it's also debug-only code. 
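The island check in the new `emit` loop above is a deadline comparison: take a conservative bound on the next block's size (worst-case bytes per instruction times instruction count) and emit an island now if that many more bytes could strand a pending short-range fixup. A self-contained sketch of the idea; the `island_deadline` field and the constants are illustrative, not the real `MachBuffer` internals:

    struct Buffer {
        cur_offset: u32,
        // Offset by which every pending label fixup must be resolved or veneered.
        island_deadline: u32,
    }

    impl Buffer {
        fn island_needed(&self, distance: u32) -> bool {
            self.cur_offset + distance > self.island_deadline
        }
    }

    fn main() {
        const WORST_CASE_INST_SIZE: u32 = 44; // illustrative bound, not the real value
        let buf = Buffer { cur_offset: 1_000_000, island_deadline: 1_048_000 };
        let next_block_insts = 2_000;
        let worst_case = WORST_CASE_INST_SIZE * next_block_insts;
        assert!(buf.island_needed(worst_case)); // 1_000_000 + 88_000 > 1_048_000
    }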
- for bix in 0..self.num_blocks() { - if display_order.contains(&bix) { - continue; - } - display_order.push(bix); - } - let mut s = String::new(); write!(&mut s, "VCode_ShowWithRRU {{{{\n").unwrap(); write!(&mut s, " Entry block: {}\n", self.entry).unwrap(); - write!( - &mut s, - " Final block order: {:?}\n", - self.final_block_order - ) - .unwrap(); for i in 0..self.num_blocks() { - let block = display_order[i]; + let block = i as BlockIndex; - let omitted = if !self.final_block_order.is_empty() && i >= self.final_block_order.len() - { - "** OMITTED **" - } else { - "" - }; - - write!(&mut s, "Block {}: {}\n", block, omitted).unwrap(); - if let Some(bb) = self.bindex_to_bb(block as BlockIndex) { + write!(&mut s, "Block {}:\n", block).unwrap(); + if let Some(bb) = self.bindex_to_bb(block) { write!(&mut s, " (original IR block: {})\n", bb).unwrap(); } - for succ in self.succs(block as BlockIndex) { + for succ in self.succs(block) { write!(&mut s, " (successor: Block {})\n", succ.get()).unwrap(); } - let (start, end) = self.block_ranges[block]; + let (start, end) = self.block_ranges[block as usize]; write!(&mut s, " (instruction range: {} .. {})\n", start, end).unwrap(); for inst in start..end { write!( diff --git a/cranelift/codegen/src/num_uses.rs b/cranelift/codegen/src/num_uses.rs deleted file mode 100644 index fd6eee8ec1..0000000000 --- a/cranelift/codegen/src/num_uses.rs +++ /dev/null @@ -1,52 +0,0 @@ -//! A pass that computes the number of uses of any given instruction. - -use crate::entity::SecondaryMap; -use crate::ir::dfg::ValueDef; -use crate::ir::Value; -use crate::ir::{DataFlowGraph, Function, Inst}; - -/// Auxiliary data structure that counts the number of uses of any given -/// instruction in a Function. This is used during instruction selection -/// to essentially do incremental DCE: when an instruction is no longer -/// needed because its computation has been isel'd into another machine -/// instruction at every use site, we can skip it. -#[derive(Clone, Debug)] -pub struct NumUses { - uses: SecondaryMap, -} - -impl NumUses { - fn new() -> NumUses { - NumUses { - uses: SecondaryMap::with_default(0), - } - } - - /// Compute the NumUses analysis result for a function. - pub fn compute(func: &Function) -> NumUses { - let mut uses = NumUses::new(); - for bb in func.layout.blocks() { - for inst in func.layout.block_insts(bb) { - for arg in func.dfg.inst_args(inst) { - let v = func.dfg.resolve_aliases(*arg); - uses.add_value(&func.dfg, v); - } - } - } - uses - } - - fn add_value(&mut self, dfg: &DataFlowGraph, v: Value) { - match dfg.value_def(v) { - ValueDef::Result(inst, _) => { - self.uses[inst] += 1; - } - _ => {} - } - } - - /// Take the complete uses map, consuming this analysis result. 
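For context on the pass being deleted above: `NumUses` walked every instruction's arguments and bumped a counter on the defining instruction, so lowering could skip instructions whose only uses had already been merged into other machine instructions. A simplified, self-contained sketch of that counting, using plain maps in place of the real `DataFlowGraph` and `SecondaryMap` types:

    use std::collections::HashMap;

    // `def_of` maps a value to the instruction that defines it, if any.
    fn count_uses(args_per_inst: &[(u32, Vec<u32>)], def_of: &HashMap<u32, u32>) -> HashMap<u32, u32> {
        let mut uses: HashMap<u32, u32> = HashMap::new();
        for (_inst, args) in args_per_inst {
            for arg in args {
                if let Some(def_inst) = def_of.get(arg) {
                    *uses.entry(*def_inst).or_insert(0) += 1;
                }
            }
        }
        uses
    }

    fn main() {
        // inst 0 defines v0; inst 1 defines v1 = f(v0); inst 2 uses v0 and v1.
        let def_of: HashMap<u32, u32> = HashMap::from([(0, 0), (1, 1)]);
        let insts = vec![(0, vec![]), (1, vec![0]), (2, vec![0, 1])];
        let uses = count_uses(&insts, &def_of);
        assert_eq!(uses[&0], 2); // v0 is used by inst 1 and inst 2
        assert_eq!(uses[&1], 1);
    }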
- pub fn take_uses(self) -> SecondaryMap { - self.uses - } -} diff --git a/cranelift/filetests/filetests/vcode/aarch64/arithmetic.clif b/cranelift/filetests/filetests/vcode/aarch64/arithmetic.clif index 08ecb31d35..9a95c52c64 100644 --- a/cranelift/filetests/filetests/vcode/aarch64/arithmetic.clif +++ b/cranelift/filetests/filetests/vcode/aarch64/arithmetic.clif @@ -1,7 +1,7 @@ test vcode target aarch64 -function %f(i64, i64) -> i64 { +function %f1(i64, i64) -> i64 { block0(v0: i64, v1: i64): v2 = iadd.i64 v0, v1 return v2 @@ -15,7 +15,7 @@ block0(v0: i64, v1: i64): ; nextln: ret -function %f(i64, i64) -> i64 { +function %f2(i64, i64) -> i64 { block0(v0: i64, v1: i64): v2 = isub.i64 v0, v1 return v2 @@ -28,7 +28,7 @@ block0(v0: i64, v1: i64): ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret -function %f(i64, i64) -> i64 { +function %f3(i64, i64) -> i64 { block0(v0: i64, v1: i64): v2 = imul.i64 v0, v1 return v2 @@ -41,7 +41,7 @@ block0(v0: i64, v1: i64): ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret -function %f(i64, i64) -> i64 { +function %f4(i64, i64) -> i64 { block0(v0: i64, v1: i64): v2 = umulhi.i64 v0, v1 return v2 @@ -54,7 +54,7 @@ block0(v0: i64, v1: i64): ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret -function %f(i64, i64) -> i64 { +function %f5(i64, i64) -> i64 { block0(v0: i64, v1: i64): v2 = smulhi.i64 v0, v1 return v2 @@ -67,7 +67,7 @@ block0(v0: i64, v1: i64): ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret -function %f(i64, i64) -> i64 { +function %f6(i64, i64) -> i64 { block0(v0: i64, v1: i64): v2 = sdiv.i64 v0, v1 return v2 @@ -87,7 +87,7 @@ block0(v0: i64, v1: i64): ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret -function %f(i64) -> i64 { +function %f7(i64) -> i64 { block0(v0: i64): v1 = iconst.i64 2 v2 = sdiv.i64 v0, v1 @@ -109,7 +109,7 @@ block0(v0: i64): ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret -function %f(i64, i64) -> i64 { +function %f8(i64, i64) -> i64 { block0(v0: i64, v1: i64): v2 = udiv.i64 v0, v1 return v2 @@ -124,7 +124,7 @@ block0(v0: i64, v1: i64): ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret -function %f(i64) -> i64 { +function %f9(i64) -> i64 { block0(v0: i64): v1 = iconst.i64 2 v2 = udiv.i64 v0, v1 @@ -141,7 +141,7 @@ block0(v0: i64): ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret -function %f(i64, i64) -> i64 { +function %f10(i64, i64) -> i64 { block0(v0: i64, v1: i64): v2 = srem.i64 v0, v1 return v2 @@ -157,7 +157,7 @@ block0(v0: i64, v1: i64): ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret -function %f(i64, i64) -> i64 { +function %f11(i64, i64) -> i64 { block0(v0: i64, v1: i64): v2 = urem.i64 v0, v1 return v2 @@ -174,7 +174,7 @@ block0(v0: i64, v1: i64): ; nextln: ret -function %f(i32, i32) -> i32 { +function %f12(i32, i32) -> i32 { block0(v0: i32, v1: i32): v2 = sdiv.i32 v0, v1 return v2 @@ -195,48 +195,48 @@ block0(v0: i32, v1: i32): ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret -function %f(i32) -> i32 { +function %f13(i32) -> i32 { block0(v0: i32): v1 = iconst.i32 2 v2 = sdiv.i32 v0, v1 return v2 } -; check: stp fp, lr, [sp, #-16]! -; nextln: mov fp, sp -; nextln: mov x1, x0 -; nextln: movz x0, #2 -; nextln: sxtw x1, w1 -; nextln: sxtw x2, w0 -; nextln: sdiv x0, x1, x2 -; nextln: cbz x2, 20 -; nextln: adds wzr, w2, #1 -; nextln: ccmp w1, #1, #nzcv, eq -; nextln: b.vc 12 -; nextln: udf -; nextln: udf -; nextln: mov sp, fp -; nextln: ldp fp, lr, [sp], #16 -; nextln: ret +; check: stp fp, lr, [sp, #-16]! 
+; nextln: mov fp, sp +; nextln: sxtw x1, w0 +; nextln: movz x0, #2 +; nextln: sxtw x2, w0 +; nextln: sdiv x0, x1, x2 +; nextln: cbz x2, 20 +; nextln: adds wzr, w2, #1 +; nextln: ccmp w1, #1, #nzcv, eq +; nextln: b.vc 12 +; nextln: udf +; nextln: udf +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret -function %f(i32, i32) -> i32 { +function %f14(i32, i32) -> i32 { block0(v0: i32, v1: i32): v2 = udiv.i32 v0, v1 return v2 } -; check: stp fp, lr, [sp, #-16]! -; nextln: mov fp, sp -; nextln: mov w0, w0 -; nextln: mov w1, w1 -; nextln: udiv x0, x0, x1 -; nextln: cbnz x1, 8 -; nextln: udf -; nextln: mov sp, fp -; nextln: ldp fp, lr, [sp], #16 -; nextln: ret +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: mov w0, w0 +; nextln: mov w1, w1 +; nextln: udiv x0, x0, x1 +; nextln: cbnz x1, 8 +; nextln: udf +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret -function %f(i32) -> i32 { + +function %f15(i32) -> i32 { block0(v0: i32): v1 = iconst.i32 2 v2 = udiv.i32 v0, v1 @@ -245,9 +245,8 @@ block0(v0: i32): ; check: stp fp, lr, [sp, #-16]! ; nextln: mov fp, sp -; nextln: movz x1, #2 ; nextln: mov w0, w0 -; nextln: mov w1, w1 +; nextln: movz x1, #2 ; nextln: udiv x0, x0, x1 ; nextln: cbnz x1, 8 ; nextln: udf @@ -255,7 +254,7 @@ block0(v0: i32): ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret -function %f(i32, i32) -> i32 { +function %f16(i32, i32) -> i32 { block0(v0: i32, v1: i32): v2 = srem.i32 v0, v1 return v2 @@ -273,7 +272,7 @@ block0(v0: i32, v1: i32): ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret -function %f(i32, i32) -> i32 { +function %f17(i32, i32) -> i32 { block0(v0: i32, v1: i32): v2 = urem.i32 v0, v1 return v2 @@ -291,7 +290,7 @@ block0(v0: i32, v1: i32): ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret -function %f(i64, i64) -> i64 { +function %f18(i64, i64) -> i64 { block0(v0: i64, v1: i64): v2 = band.i64 v0, v1 return v2 @@ -304,7 +303,7 @@ block0(v0: i64, v1: i64): ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret -function %f(i64, i64) -> i64 { +function %f19(i64, i64) -> i64 { block0(v0: i64, v1: i64): v2 = bor.i64 v0, v1 return v2 @@ -317,7 +316,7 @@ block0(v0: i64, v1: i64): ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret -function %f(i64, i64) -> i64 { +function %f20(i64, i64) -> i64 { block0(v0: i64, v1: i64): v2 = bxor.i64 v0, v1 return v2 @@ -330,7 +329,7 @@ block0(v0: i64, v1: i64): ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret -function %f(i64, i64) -> i64 { +function %f21(i64, i64) -> i64 { block0(v0: i64, v1: i64): v2 = band_not.i64 v0, v1 return v2 @@ -343,7 +342,7 @@ block0(v0: i64, v1: i64): ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret -function %f(i64, i64) -> i64 { +function %f22(i64, i64) -> i64 { block0(v0: i64, v1: i64): v2 = bor_not.i64 v0, v1 return v2 @@ -356,7 +355,7 @@ block0(v0: i64, v1: i64): ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret -function %f(i64, i64) -> i64 { +function %f23(i64, i64) -> i64 { block0(v0: i64, v1: i64): v2 = bxor_not.i64 v0, v1 return v2 @@ -369,7 +368,7 @@ block0(v0: i64, v1: i64): ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret -function %f(i64, i64) -> i64 { +function %f24(i64, i64) -> i64 { block0(v0: i64, v1: i64): v2 = bnot.i64 v0 return v2 diff --git a/cranelift/filetests/filetests/vcode/aarch64/condbr.clif b/cranelift/filetests/filetests/vcode/aarch64/condbr.clif index 596557d8e0..3f0c0766d7 100644 --- a/cranelift/filetests/filetests/vcode/aarch64/condbr.clif +++ b/cranelift/filetests/filetests/vcode/aarch64/condbr.clif @@ -30,17 +30,18 @@ block2: return v5 } +; check: Block 0: ; 
check: stp fp, lr, [sp, #-16]! ; nextln: mov fp, sp ; nextln: subs xzr, x0, x1 -; nextln: b.eq 20 -; check: Block 2: -; check: movz x0, #2 +; nextln: b.eq label1 ; b label2 +; check: Block 1: +; check: movz x0, #1 ; nextln: mov sp, fp ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret -; check: Block 1: -; check: movz x0, #1 +; check: Block 2: +; check: movz x0, #2 ; nextln: mov sp, fp ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret diff --git a/cranelift/filetests/filetests/vcode/aarch64/jumptable.clif b/cranelift/filetests/filetests/vcode/aarch64/jumptable.clif index 0789173acb..f7c94c50b6 100644 --- a/cranelift/filetests/filetests/vcode/aarch64/jumptable.clif +++ b/cranelift/filetests/filetests/vcode/aarch64/jumptable.clif @@ -30,15 +30,15 @@ block5(v5: i64): ; check: subs wzr, w0, #3 ; nextln: b.hs -; nextln: adr x2, pc+16 ; ldrsw x1, [x2, x0, LSL 2] ; add x2, x2, x1 ; br x2 ; jt_entries +; nextln: adr x1, pc+16 ; ldrsw x2, [x1, x0, LSL 2] ; add x1, x1, x2 ; br x1 ; jt_entries -; check: movz x1, #3 +; check: movz x1, #1 ; nextln: b ; check: movz x1, #2 ; nextln: b -; check: movz x1, #1 +; check: movz x1, #3 ; check: add x0, x0, x1 diff --git a/cranelift/filetests/filetests/vcode/aarch64/saturating-ops.clif b/cranelift/filetests/filetests/vcode/aarch64/saturating-ops.clif index 60b45cc07a..dcb76e0f26 100644 --- a/cranelift/filetests/filetests/vcode/aarch64/saturating-ops.clif +++ b/cranelift/filetests/filetests/vcode/aarch64/saturating-ops.clif @@ -25,10 +25,10 @@ block0(v0: i8, v1: i8): ; check: stp fp, lr, [sp, #-16]! ; nextln: mov fp, sp -; nextln: uxtb x0, w0 -; nextln: uxtb x1, w1 -; nextln: mov v0.d[0], x0 -; nextln: mov v1.d[0], x1 +; nextln: uxtb x2, w0 +; nextln: uxtb x0, w1 +; nextln: mov v0.d[0], x2 +; nextln: mov v1.d[0], x0 ; nextln: uqadd d0, d0, d1 ; nextln: mov x0, v0.d[0] ; nextln: mov sp, fp diff --git a/cranelift/filetests/filetests/vcode/aarch64/shift-rotate.clif b/cranelift/filetests/filetests/vcode/aarch64/shift-rotate.clif index 86bdb2ea34..26a6922b39 100644 --- a/cranelift/filetests/filetests/vcode/aarch64/shift-rotate.clif +++ b/cranelift/filetests/filetests/vcode/aarch64/shift-rotate.clif @@ -366,15 +366,15 @@ block0(v0: i16): return v2 } -; check: stp fp, lr, [sp, #-16]! -; nextln: mov fp, sp -; nextln: uxth w0, w0 -; nextln: lsr w1, w0, #6 -; nextln: lsl w0, w0, #10 -; nextln: orr w0, w0, w1 -; nextln: mov sp, fp -; nextln: ldp fp, lr, [sp], #16 -; nextln: ret +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: uxth w1, w0 +; nextln: lsr w0, w1, #6 +; nextln: lsl w1, w1, #10 +; nextln: orr w0, w1, w0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret function %f24(i8) -> i8 { block0(v0: i8): @@ -385,10 +385,10 @@ block0(v0: i8): ; check: stp fp, lr, [sp, #-16]! 
; nextln: mov fp, sp -; nextln: uxtb w0, w0 -; nextln: lsr w1, w0, #5 -; nextln: lsl w0, w0, #3 -; nextln: orr w0, w0, w1 +; nextln: uxtb w1, w0 +; nextln: lsr w0, w1, #5 +; nextln: lsl w1, w1, #3 +; nextln: orr w0, w1, w0 ; nextln: mov sp, fp ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret diff --git a/crates/environ/src/cranelift.rs b/crates/environ/src/cranelift.rs index 409a6cdad6..e8e05c2438 100644 --- a/crates/environ/src/cranelift.rs +++ b/crates/environ/src/cranelift.rs @@ -93,7 +93,7 @@ use crate::compilation::{ use crate::func_environ::{get_func_name, FuncEnvironment}; use crate::{CacheConfig, FunctionBodyData, ModuleLocal, ModuleTranslation, Tunables}; use cranelift_codegen::ir::{self, ExternalName}; -use cranelift_codegen::machinst::sections::MachSrcLoc; +use cranelift_codegen::machinst::buffer::MachSrcLoc; use cranelift_codegen::print_errors::pretty_error; use cranelift_codegen::{binemit, isa, Context}; use cranelift_entity::PrimaryMap; @@ -215,7 +215,7 @@ fn get_function_address_map<'data>( if let Some(ref mcr) = &context.mach_compile_result { // New-style backend: we have a `MachCompileResult` that will give us `MachSrcLoc` mapping // tuples. - for &MachSrcLoc { start, end, loc } in mcr.sections.get_srclocs_sorted() { + for &MachSrcLoc { start, end, loc } in mcr.buffer.get_srclocs_sorted() { instructions.push(InstructionAddressMap { srcloc: loc, code_offset: start as usize,
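The `get_srclocs_sorted` consumer above relies on each `MachSrcLoc` record mapping a `[start, end)` byte range of machine code back to the source location that produced it, so building an address map is a single in-order pass. A sketch of that consumption, with stand-in types (`SrcLoc`, `AddressMapEntry`, and this local `MachSrcLoc` are illustrative, not the crate's):

    #[derive(Clone, Copy, Debug, PartialEq)]
    struct SrcLoc(u32);

    struct MachSrcLoc { start: u32, end: u32, loc: SrcLoc }

    struct AddressMapEntry { code_offset: usize, code_len: usize, loc: SrcLoc }

    fn address_map(srclocs: &[MachSrcLoc]) -> Vec<AddressMapEntry> {
        srclocs
            .iter()
            .map(|s| AddressMapEntry {
                code_offset: s.start as usize,
                code_len: (s.end - s.start) as usize,
                loc: s.loc,
            })
            .collect()
    }

    fn main() {
        let map = address_map(&[
            MachSrcLoc { start: 0, end: 8, loc: SrcLoc(1) },
            MachSrcLoc { start: 8, end: 20, loc: SrcLoc(2) },
        ]);
        assert_eq!(map.len(), 2);
        assert_eq!(map[1].code_offset, 8);
        assert_eq!(map[1].code_len, 12);
    }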