Rework of MachInst isel, branch fixups and lowering, and block ordering.
This patch includes:
- A complete rework of the way that CLIF blocks and edge blocks are
  lowered into VCode blocks. The new mechanism in `BlockLoweringOrder`
  computes an RPO over the CFG, but with a twist: it merges edge blocks into
  the heads or tails of the original CLIF blocks wherever possible, and it
  does this without ever actually materializing the full nodes-plus-edges
  graph first. The backend driver lowers blocks in final order, so there is
  no need to reshuffle them later.
- A new `MachBuffer` that replaces the `MachSection`. This is a special
  kind of code sink that is far more than a humble `Vec<u8>`. In
  particular, it keeps a record of label definitions and label uses, with
  a machine-pluggable `LabelUse` trait that defines various kinds of
  fixups (basically internal relocations).

  Importantly, it implements some simple peephole-style branch rewrites
  *inline in the emission pass*, without any separate traversals over the
  code to use fallthroughs, swap taken/not-taken arms, etc. It tracks
  branches at the tail of the buffer and can (i) remove blocks that are
  just unconditional branches (by redirecting the label), (ii) recognize a
  conditional/unconditional pair and invert the conditional's polarity
  when that is helpful, and (iii) remove branches that target the
  fallthrough PC. (A sketch of the label-use mechanism appears at the end
  of this message, before the diff.)

  The `MachBuffer` also implements branch-island support. On
  architectures like AArch64, this is needed to keep conditional branches
  within their reachable ranges (+/- 1MB on AArch64 specifically). It does
  this inline while streaming through emission, without any sort of
  fixpoint algorithm or later moving of code, by simply tracking
  outstanding references and their "deadlines" and emitting an island
  just-in-time when we are in danger of going out of range. (See the
  island sketch at the end of this message.)
- A rework of the instruction selector driver. This largely follows the
  same algorithm as before, but is cleaned up significantly, in particular
  in its API: the machine backend can ask for an input arg and get any of
  three forms (constant, register, or producing instruction), indicating
  whether it needs the register or can merge the constant or the producing
  instruction as appropriate (see the input-query sketch at the end of
  this message). The new driver takes special care to emit constants right
  at their use sites (and at phi inputs), minimizing their live ranges,
  and it also special-cases the "pinned register" to avoid superfluous
  moves.
Overall, on `bz2.wasm`, the results are:

  wasmtime full run (compile + runtime) of bz2:
    baseline:   9774M insns, 9742M cycles, 3.918s
    w/ changes: 7012M insns, 6888M cycles, 2.958s (24.5% faster, 28.3% fewer insns)

  clif-util wasm compile bz2:
    baseline:   2633M insns, 3278M cycles, 1.034s
    w/ changes: 2366M insns, 2920M cycles, 0.923s (10.7% faster, 10.1% fewer insns)

All numbers are averages of two runs on an Ampere eMAG.
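
As a concrete illustration of the label-use mechanism, here is a rough
sketch of emitting an unconditional jump against the `MachBuffer`. It
mirrors the new AArch64 `Inst::Jump` emission arm in the diff below; the
exact method signatures are illustrative rather than authoritative.

    // Sketch: the instruction bytes are emitted first (with a zero offset),
    // and only then is the label use recorded, so that a fixup can be applied
    // immediately if the label is already bound.
    fn emit_jump(sink: &mut MachBuffer<Inst>, dest: BranchTarget) {
        let start = sink.cur_offset();
        sink.put4(enc_jump26(0b000101, dest.as_offset26_or_zero()));
        if let Some(label) = dest.as_label() {
            // Record the fixup: a 26-bit PC-relative branch at `start`.
            sink.use_label_at_offset(start, label, LabelUse::Branch26);
            // Register the branch so the buffer can apply its inline peephole
            // rewrites (fallthrough elision, label redirection, ...).
            let end = sink.cur_offset();
            sink.add_uncond_branch(start, end, label);
        }
    }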
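
The just-in-time island emission is visible in the new `Inst::EmitIsland`
arm; schematically (signatures again approximate):

    // Sketch: if any outstanding label use would go out of range within the
    // next `needed_space + 4` bytes, emit an island now, jumping around it so
    // straight-line execution skips the veneers and constants.
    fn maybe_emit_island(
        sink: &mut MachBuffer<Inst>,
        flags: &settings::Flags,
        state: &mut EmitState,
        needed_space: CodeOffset,
    ) {
        if sink.island_needed(needed_space + 4) {
            let jump_around = sink.get_label();
            Inst::Jump { dest: BranchTarget::Label(jump_around) }.emit(sink, flags, state);
            sink.emit_island();
            sink.bind_label(jump_around);
        }
    }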
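
The three-form input query in the new lowering driver looks roughly like
the following (field and method names follow the lowering code in the diff
below, but this is a sketch, not an exact API reference):

    // Sketch: ask for an input and use whichever form is cheapest. Only when
    // we actually fall back to the register do we mark the input as used,
    // keeping the producing instruction mergeable otherwise.
    fn lower_input<C: LowerCtx<I = Inst>>(ctx: &mut C, input: InsnInput) -> Reg {
        let src = ctx.get_input(input.insn, input.input);
        if let Some(c) = src.constant {
            // Materialize the constant right at the use site, minimizing its
            // live range.
            let ty = ctx.input_ty(input.insn, input.input);
            let tmp = ctx.tmp(Inst::rc_for_type(ty).unwrap(), ty);
            for inst in Inst::gen_constant(tmp, c, ty).into_iter() {
                ctx.emit(inst);
            }
            tmp.to_reg()
        } else {
            // `src.inst` (the producing instruction, if in the same block) could
            // be pattern-matched and merged here instead; otherwise take the
            // register form.
            ctx.use_input_reg(src);
            src.reg
        }
    }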

@@ -227,7 +227,7 @@ impl Context {
         let _tt = timing::binemit();
         let mut sink = MemoryCodeSink::new(mem, relocs, traps, stackmaps);
         if let Some(ref result) = &self.mach_compile_result {
-            result.sections.emit(&mut sink);
+            result.buffer.emit(&mut sink);
         } else {
             isa.emit_function_to_memory(&self.func, &mut sink);
         }

@@ -40,3 +40,24 @@ pub fn has_side_effect(func: &Function, inst: Inst) -> bool {
     let opcode = data.opcode();
     trivially_has_side_effects(opcode) || is_load_with_defined_trapping(opcode, data)
 }
+
+/// Does the given instruction have any side-effect as per [has_side_effect], or else is a load?
+pub fn has_side_effect_or_load(func: &Function, inst: Inst) -> bool {
+    has_side_effect(func, inst) || func.dfg[inst].opcode().can_load()
+}
+
+/// Is the given instruction a constant value (`iconst`, `fconst`, `bconst`) that can be
+/// represented in 64 bits?
+pub fn is_constant_64bit(func: &Function, inst: Inst) -> Option<u64> {
+    let data = &func.dfg[inst];
+    if data.opcode() == Opcode::Null {
+        return Some(0);
+    }
+    match data {
+        &InstructionData::UnaryImm { imm, .. } => Some(imm.bits() as u64),
+        &InstructionData::UnaryIeee32 { imm, .. } => Some(imm.bits() as u64),
+        &InstructionData::UnaryIeee64 { imm, .. } => Some(imm.bits()),
+        &InstructionData::UnaryBool { imm, .. } => Some(if imm { 1 } else { 0 }),
+        _ => None,
+    }
+}

@@ -504,7 +504,7 @@ impl AArch64ABIBody {
             rn: stack_reg(),
             rm: stack_limit,
         });
-        insts.push(Inst::CondBrLowered {
+        insts.push(Inst::OneWayCondBr {
             target: BranchTarget::ResolvedOffset(8),
             // Here `Hs` == "higher or same" when interpreting the two
             // operands as unsigned integers.

@@ -3,14 +3,14 @@
|
||||
// Some variants are never constructed, but we still want them as options in the future.
|
||||
#![allow(dead_code)]
|
||||
|
||||
use crate::binemit::CodeOffset;
|
||||
use crate::ir::Type;
|
||||
use crate::isa::aarch64::inst::*;
|
||||
use crate::isa::aarch64::lower::ty_bits;
|
||||
use crate::machinst::MachLabel;
|
||||
|
||||
use regalloc::{RealRegUniverse, Reg, Writable};
|
||||
|
||||
use core::convert::{Into, TryFrom};
|
||||
use core::convert::Into;
|
||||
use std::string::String;
|
||||
|
||||
/// A shift operator for a register or immediate.
|
||||
@@ -303,78 +303,44 @@ impl CondBrKind {
|
||||
|
||||
/// A branch target. Either unresolved (basic-block index) or resolved (offset
|
||||
/// from end of current instruction).
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
|
||||
pub enum BranchTarget {
|
||||
/// An unresolved reference to a BlockIndex, as passed into
|
||||
/// An unresolved reference to a Label, as passed into
|
||||
/// `lower_branch_group()`.
|
||||
Block(BlockIndex),
|
||||
/// A resolved reference to another instruction, after
|
||||
/// `Inst::with_block_offsets()`.
|
||||
Label(MachLabel),
|
||||
/// A fixed PC offset.
|
||||
ResolvedOffset(isize),
|
||||
}
|
||||
|
||||
impl BranchTarget {
|
||||
/// Lower the branch target given offsets of each block.
|
||||
pub fn lower(&mut self, targets: &[CodeOffset], my_offset: CodeOffset) {
|
||||
/// Return the target's label, if it is a label-based target.
|
||||
pub fn as_label(self) -> Option<MachLabel> {
|
||||
match self {
|
||||
&mut BranchTarget::Block(bix) => {
|
||||
let bix = usize::try_from(bix).unwrap();
|
||||
assert!(bix < targets.len());
|
||||
let block_offset_in_func = targets[bix];
|
||||
let branch_offset = (block_offset_in_func as isize) - (my_offset as isize);
|
||||
*self = BranchTarget::ResolvedOffset(branch_offset);
|
||||
}
|
||||
&mut BranchTarget::ResolvedOffset(..) => {}
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the block index.
|
||||
pub fn as_block_index(&self) -> Option<BlockIndex> {
|
||||
match self {
|
||||
&BranchTarget::Block(bix) => Some(bix),
|
||||
BranchTarget::Label(l) => Some(l),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the offset as 4-byte words. Returns `0` if not
|
||||
/// yet resolved (in that case, we're only computing
|
||||
/// size and the offset doesn't matter).
|
||||
pub fn as_offset_words(&self) -> isize {
|
||||
match self {
|
||||
&BranchTarget::ResolvedOffset(off) => off >> 2,
|
||||
/// Return the target's offset, if specified, or zero if label-based.
|
||||
pub fn as_offset19_or_zero(self) -> u32 {
|
||||
let off = match self {
|
||||
BranchTarget::ResolvedOffset(off) => off >> 2,
|
||||
_ => 0,
|
||||
}
|
||||
};
|
||||
assert!(off <= 0x3ffff);
|
||||
assert!(off >= -0x40000);
|
||||
(off as u32) & 0x7ffff
|
||||
}
|
||||
|
||||
/// Get the offset as a 26-bit offset suitable for a 26-bit jump, or `None` if overflow.
|
||||
pub fn as_off26(&self) -> Option<u32> {
|
||||
let off = self.as_offset_words();
|
||||
if (off < (1 << 25)) && (off >= -(1 << 25)) {
|
||||
Some((off as u32) & ((1 << 26) - 1))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the offset as a 19-bit offset, or `None` if overflow.
|
||||
pub fn as_off19(&self) -> Option<u32> {
|
||||
let off = self.as_offset_words();
|
||||
if (off < (1 << 18)) && (off >= -(1 << 18)) {
|
||||
Some((off as u32) & ((1 << 19) - 1))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Map the block index given a transform map.
|
||||
pub fn map(&mut self, block_index_map: &[BlockIndex]) {
|
||||
match self {
|
||||
&mut BranchTarget::Block(ref mut bix) => {
|
||||
let n = block_index_map[usize::try_from(*bix).unwrap()];
|
||||
*bix = n;
|
||||
}
|
||||
&mut BranchTarget::ResolvedOffset(_) => {}
|
||||
}
|
||||
/// Return the target's offset, if specified, or zero if label-based.
|
||||
pub fn as_offset26_or_zero(self) -> u32 {
|
||||
let off = match self {
|
||||
BranchTarget::ResolvedOffset(off) => off >> 2,
|
||||
_ => 0,
|
||||
};
|
||||
assert!(off <= 0x1ffffff);
|
||||
assert!(off >= -0x2000000);
|
||||
(off as u32) & 0x3ffffff
|
||||
}
|
||||
}
|
||||
|
||||
@@ -507,7 +473,7 @@ impl ShowWithRRU for Cond {
|
||||
impl ShowWithRRU for BranchTarget {
|
||||
fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
|
||||
match self {
|
||||
&BranchTarget::Block(block) => format!("block{}", block),
|
||||
&BranchTarget::Label(label) => format!("label{:?}", label.get()),
|
||||
&BranchTarget::ResolvedOffset(off) => format!("{}", off),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@ use crate::binemit::{CodeOffset, Reloc};
 use crate::ir::constant::ConstantData;
 use crate::ir::types::*;
 use crate::ir::TrapCode;
-use crate::isa::aarch64::{inst::regs::PINNED_REG, inst::*};
+use crate::isa::aarch64::inst::*;

 use regalloc::{Reg, RegClass, Writable};

@@ -149,6 +149,14 @@ fn enc_cbr(op_31_24: u32, off_18_0: u32, op_4: u32, cond: u32) -> u32 {
     (op_31_24 << 24) | (off_18_0 << 5) | (op_4 << 4) | cond
 }

+fn enc_conditional_br(taken: BranchTarget, kind: CondBrKind) -> u32 {
+    match kind {
+        CondBrKind::Zero(reg) => enc_cmpbr(0b1_011010_0, taken.as_offset19_or_zero(), reg),
+        CondBrKind::NotZero(reg) => enc_cmpbr(0b1_011010_1, taken.as_offset19_or_zero(), reg),
+        CondBrKind::Cond(c) => enc_cbr(0b01010100, taken.as_offset19_or_zero(), 0b0, c.bits()),
+    }
+}
+
 const MOVE_WIDE_FIXED: u32 = 0x92800000;

 #[repr(u32)]

@@ -340,10 +348,10 @@ pub struct EmitState {
     virtual_sp_offset: i64,
 }

-impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
+impl MachInstEmit for Inst {
     type State = EmitState;

-    fn emit(&self, sink: &mut O, flags: &settings::Flags, state: &mut EmitState) {
+    fn emit(&self, sink: &mut MachBuffer<Inst>, flags: &settings::Flags, state: &mut EmitState) {
         match self {
             &Inst::AluRRR { alu_op, rd, rn, rm } => {
                 let top11 = match alu_op {

@@ -616,7 +624,7 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
|
||||
ref mem,
|
||||
srcloc,
|
||||
} => {
|
||||
let (mem_insts, mem) = mem_finalize(sink.cur_offset_from_start(), mem, state);
|
||||
let (mem_insts, mem) = mem_finalize(sink.cur_offset(), mem, state);
|
||||
|
||||
for inst in mem_insts.into_iter() {
|
||||
inst.emit(sink, flags, state);
|
||||
@@ -759,7 +767,7 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
|
||||
ref mem,
|
||||
srcloc,
|
||||
} => {
|
||||
let (mem_insts, mem) = mem_finalize(sink.cur_offset_from_start(), mem, state);
|
||||
let (mem_insts, mem) = mem_finalize(sink.cur_offset(), mem, state);
|
||||
|
||||
for inst in mem_insts.into_iter() {
|
||||
inst.emit(sink, flags, state);
|
||||
@@ -1147,10 +1155,18 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
|
||||
panic!("Unsupported extend variant");
|
||||
}
|
||||
&Inst::Jump { ref dest } => {
|
||||
// TODO: differentiate between as_off26() returning `None` for
|
||||
// out-of-range vs. not-yet-finalized. The latter happens when we
|
||||
// do early (fake) emission for size computation.
|
||||
sink.put4(enc_jump26(0b000101, dest.as_off26().unwrap()));
|
||||
let off = sink.cur_offset();
|
||||
// Emit the jump itself.
|
||||
sink.put4(enc_jump26(0b000101, dest.as_offset26_or_zero()));
|
||||
// After the jump has been emitted, indicate that it uses a
|
||||
// label, if so, so that a fixup can occur later. This happens
|
||||
// after we emit the bytes because the fixup might occur right
|
||||
// away (so the bytes must actually exist now).
|
||||
if let Some(l) = dest.as_label() {
|
||||
sink.use_label_at_offset(off, l, LabelUse::Branch26);
|
||||
let cur_off = sink.cur_offset();
|
||||
sink.add_uncond_branch(off, cur_off, l);
|
||||
}
|
||||
}
|
||||
&Inst::Ret => {
|
||||
sink.put4(0xd65f03c0);
|
||||
@@ -1178,51 +1194,35 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
|
||||
sink.add_call_site(loc, opcode);
|
||||
}
|
||||
}
|
||||
&Inst::CondBr { .. } => panic!("Unlowered CondBr during binemit!"),
|
||||
&Inst::CondBrLowered { target, kind } => match kind {
|
||||
// TODO: handle >2^19 case by emitting a compound sequence with
|
||||
// an unconditional (26-bit) branch. We need branch-relaxation
|
||||
// adjustment machinery to enable this (because we don't want to
|
||||
// always emit the long form).
|
||||
CondBrKind::Zero(reg) => {
|
||||
sink.put4(enc_cmpbr(0b1_011010_0, target.as_off19().unwrap(), reg));
|
||||
}
|
||||
CondBrKind::NotZero(reg) => {
|
||||
sink.put4(enc_cmpbr(0b1_011010_1, target.as_off19().unwrap(), reg));
|
||||
}
|
||||
CondBrKind::Cond(c) => {
|
||||
sink.put4(enc_cbr(
|
||||
0b01010100,
|
||||
target.as_off19().unwrap_or(0),
|
||||
0b0,
|
||||
c.bits(),
|
||||
));
|
||||
}
|
||||
},
|
||||
&Inst::CondBrLoweredCompound {
|
||||
&Inst::CondBr {
|
||||
taken,
|
||||
not_taken,
|
||||
kind,
|
||||
} => {
|
||||
// Conditional part first.
|
||||
match kind {
|
||||
CondBrKind::Zero(reg) => {
|
||||
sink.put4(enc_cmpbr(0b1_011010_0, taken.as_off19().unwrap(), reg));
|
||||
}
|
||||
CondBrKind::NotZero(reg) => {
|
||||
sink.put4(enc_cmpbr(0b1_011010_1, taken.as_off19().unwrap(), reg));
|
||||
}
|
||||
CondBrKind::Cond(c) => {
|
||||
sink.put4(enc_cbr(
|
||||
0b01010100,
|
||||
taken.as_off19().unwrap_or(0),
|
||||
0b0,
|
||||
c.bits(),
|
||||
));
|
||||
}
|
||||
let cond_off = sink.cur_offset();
|
||||
sink.put4(enc_conditional_br(taken, kind));
|
||||
if let Some(l) = taken.as_label() {
|
||||
sink.use_label_at_offset(cond_off, l, LabelUse::Branch19);
|
||||
let cur_off = sink.cur_offset();
|
||||
let inverted = enc_conditional_br(taken, kind.invert()).to_le_bytes();
|
||||
sink.add_cond_branch(cond_off, cur_off, l, &inverted[..]);
|
||||
}
|
||||
// Unconditional part.
|
||||
sink.put4(enc_jump26(0b000101, not_taken.as_off26().unwrap_or(0)));
|
||||
let uncond_off = sink.cur_offset();
|
||||
sink.put4(enc_jump26(0b000101, not_taken.as_offset26_or_zero()));
|
||||
if let Some(l) = not_taken.as_label() {
|
||||
sink.use_label_at_offset(uncond_off, l, LabelUse::Branch26);
|
||||
let cur_off = sink.cur_offset();
|
||||
sink.add_uncond_branch(uncond_off, cur_off, l);
|
||||
}
|
||||
}
|
||||
&Inst::OneWayCondBr { target, kind } => {
|
||||
let off = sink.cur_offset();
|
||||
sink.put4(enc_conditional_br(target, kind));
|
||||
if let Some(l) = target.as_label() {
|
||||
sink.use_label_at_offset(off, l, LabelUse::Branch19);
|
||||
}
|
||||
}
|
||||
&Inst::IndirectBr { rn, .. } => {
|
||||
sink.put4(enc_br(rn));
|
||||
@@ -1239,8 +1239,7 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
|
||||
sink.add_trap(srcloc, code);
|
||||
sink.put4(0xd4a00000);
|
||||
}
|
||||
&Inst::Adr { rd, ref label } => {
|
||||
let off = memlabel_finalize(sink.cur_offset_from_start(), label);
|
||||
&Inst::Adr { rd, off } => {
|
||||
assert!(off > -(1 << 20));
|
||||
assert!(off < (1 << 20));
|
||||
sink.put4(enc_adr(off, rd));
|
||||
@@ -1261,19 +1260,13 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
|
||||
// This sequence is *one* instruction in the vcode, and is expanded only here at
|
||||
// emission time, because we cannot allow the regalloc to insert spills/reloads in
|
||||
// the middle; we depend on hardcoded PC-rel addressing below.
|
||||
//
|
||||
// N.B.: if PC-rel addressing on ADR below is changed, also update
|
||||
// `Inst::with_block_offsets()` in aarch64/inst/mod.rs.
|
||||
|
||||
// Save index in a tmp (the live range of ridx only goes to start of this
|
||||
// sequence; rtmp1 or rtmp2 may overwrite it).
|
||||
let inst = Inst::gen_move(rtmp2, ridx, I64);
|
||||
inst.emit(sink, flags, state);
|
||||
// Load address of jump table
|
||||
let inst = Inst::Adr {
|
||||
rd: rtmp1,
|
||||
label: MemLabel::PCRel(16),
|
||||
};
|
||||
let inst = Inst::Adr { rd: rtmp1, off: 16 };
|
||||
inst.emit(sink, flags, state);
|
||||
// Load value out of jump table
|
||||
let inst = Inst::SLoad32 {
|
||||
@@ -1303,12 +1296,16 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
|
||||
};
|
||||
inst.emit(sink, flags, state);
|
||||
// Emit jump table (table of 32-bit offsets).
|
||||
for target in targets.iter() {
|
||||
let off = target.as_offset_words() * 4;
|
||||
let off = i32::try_from(off).unwrap();
|
||||
// cast i32 to u32 (two's-complement)
|
||||
let off = off as u32;
|
||||
sink.put4(off);
|
||||
let jt_off = sink.cur_offset();
|
||||
for &target in targets.iter() {
|
||||
let word_off = sink.cur_offset();
|
||||
let off_into_table = word_off - jt_off;
|
||||
sink.put4(off_into_table);
|
||||
sink.use_label_at_offset(
|
||||
word_off,
|
||||
target.as_label().unwrap(),
|
||||
LabelUse::PCRel32,
|
||||
);
|
||||
}
|
||||
}
|
||||
&Inst::LoadConst64 { rd, const_data } => {
|
||||
@@ -1348,7 +1345,7 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
|
||||
}
|
||||
}
|
||||
&Inst::LoadAddr { rd, ref mem } => {
|
||||
let (mem_insts, mem) = mem_finalize(sink.cur_offset_from_start(), mem, state);
|
||||
let (mem_insts, mem) = mem_finalize(sink.cur_offset(), mem, state);
|
||||
for inst in mem_insts.into_iter() {
|
||||
inst.emit(sink, flags, state);
|
||||
}
|
||||
@@ -1401,20 +1398,6 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
|
||||
add.emit(sink, flags, state);
|
||||
}
|
||||
}
|
||||
&Inst::GetPinnedReg { rd } => {
|
||||
let inst = Inst::Mov {
|
||||
rd,
|
||||
rm: xreg(PINNED_REG),
|
||||
};
|
||||
inst.emit(sink, flags, state);
|
||||
}
|
||||
&Inst::SetPinnedReg { rm } => {
|
||||
let inst = Inst::Mov {
|
||||
rd: Writable::from_reg(xreg(PINNED_REG)),
|
||||
rm,
|
||||
};
|
||||
inst.emit(sink, flags, state);
|
||||
}
|
||||
&Inst::VirtualSPOffsetAdj { offset } => {
|
||||
debug!(
|
||||
"virtual sp offset adjusted by {} -> {}",
|
||||
@@ -1423,6 +1406,17 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
|
||||
);
|
||||
state.virtual_sp_offset += offset;
|
||||
}
|
||||
&Inst::EmitIsland { needed_space } => {
|
||||
if sink.island_needed(needed_space + 4) {
|
||||
let jump_around_label = sink.get_label();
|
||||
let jmp = Inst::Jump {
|
||||
dest: BranchTarget::Label(jump_around_label),
|
||||
};
|
||||
jmp.emit(sink, flags, state);
|
||||
sink.emit_island();
|
||||
sink.bind_label(jump_around_label);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1956,7 +1956,7 @@ fn test_aarch64_binemit() {
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::CondBrLowered {
|
||||
Inst::OneWayCondBr {
|
||||
target: BranchTarget::ResolvedOffset(64),
|
||||
kind: CondBrKind::Zero(xreg(8)),
|
||||
},
|
||||
@@ -1964,7 +1964,7 @@ fn test_aarch64_binemit() {
|
||||
"cbz x8, 64",
|
||||
));
|
||||
insns.push((
|
||||
Inst::CondBrLowered {
|
||||
Inst::OneWayCondBr {
|
||||
target: BranchTarget::ResolvedOffset(64),
|
||||
kind: CondBrKind::NotZero(xreg(8)),
|
||||
},
|
||||
@@ -1972,7 +1972,7 @@ fn test_aarch64_binemit() {
|
||||
"cbnz x8, 64",
|
||||
));
|
||||
insns.push((
|
||||
Inst::CondBrLowered {
|
||||
Inst::OneWayCondBr {
|
||||
target: BranchTarget::ResolvedOffset(64),
|
||||
kind: CondBrKind::Cond(Cond::Eq),
|
||||
},
|
||||
@@ -1980,7 +1980,7 @@ fn test_aarch64_binemit() {
|
||||
"b.eq 64",
|
||||
));
|
||||
insns.push((
|
||||
Inst::CondBrLowered {
|
||||
Inst::OneWayCondBr {
|
||||
target: BranchTarget::ResolvedOffset(64),
|
||||
kind: CondBrKind::Cond(Cond::Ne),
|
||||
},
|
||||
@@ -1989,7 +1989,7 @@ fn test_aarch64_binemit() {
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::CondBrLowered {
|
||||
Inst::OneWayCondBr {
|
||||
target: BranchTarget::ResolvedOffset(64),
|
||||
kind: CondBrKind::Cond(Cond::Hs),
|
||||
},
|
||||
@@ -1997,7 +1997,7 @@ fn test_aarch64_binemit() {
|
||||
"b.hs 64",
|
||||
));
|
||||
insns.push((
|
||||
Inst::CondBrLowered {
|
||||
Inst::OneWayCondBr {
|
||||
target: BranchTarget::ResolvedOffset(64),
|
||||
kind: CondBrKind::Cond(Cond::Lo),
|
||||
},
|
||||
@@ -2005,7 +2005,7 @@ fn test_aarch64_binemit() {
|
||||
"b.lo 64",
|
||||
));
|
||||
insns.push((
|
||||
Inst::CondBrLowered {
|
||||
Inst::OneWayCondBr {
|
||||
target: BranchTarget::ResolvedOffset(64),
|
||||
kind: CondBrKind::Cond(Cond::Mi),
|
||||
},
|
||||
@@ -2013,7 +2013,7 @@ fn test_aarch64_binemit() {
|
||||
"b.mi 64",
|
||||
));
|
||||
insns.push((
|
||||
Inst::CondBrLowered {
|
||||
Inst::OneWayCondBr {
|
||||
target: BranchTarget::ResolvedOffset(64),
|
||||
kind: CondBrKind::Cond(Cond::Pl),
|
||||
},
|
||||
@@ -2021,7 +2021,7 @@ fn test_aarch64_binemit() {
|
||||
"b.pl 64",
|
||||
));
|
||||
insns.push((
|
||||
Inst::CondBrLowered {
|
||||
Inst::OneWayCondBr {
|
||||
target: BranchTarget::ResolvedOffset(64),
|
||||
kind: CondBrKind::Cond(Cond::Vs),
|
||||
},
|
||||
@@ -2029,7 +2029,7 @@ fn test_aarch64_binemit() {
|
||||
"b.vs 64",
|
||||
));
|
||||
insns.push((
|
||||
Inst::CondBrLowered {
|
||||
Inst::OneWayCondBr {
|
||||
target: BranchTarget::ResolvedOffset(64),
|
||||
kind: CondBrKind::Cond(Cond::Vc),
|
||||
},
|
||||
@@ -2037,7 +2037,7 @@ fn test_aarch64_binemit() {
|
||||
"b.vc 64",
|
||||
));
|
||||
insns.push((
|
||||
Inst::CondBrLowered {
|
||||
Inst::OneWayCondBr {
|
||||
target: BranchTarget::ResolvedOffset(64),
|
||||
kind: CondBrKind::Cond(Cond::Hi),
|
||||
},
|
||||
@@ -2045,7 +2045,7 @@ fn test_aarch64_binemit() {
|
||||
"b.hi 64",
|
||||
));
|
||||
insns.push((
|
||||
Inst::CondBrLowered {
|
||||
Inst::OneWayCondBr {
|
||||
target: BranchTarget::ResolvedOffset(64),
|
||||
kind: CondBrKind::Cond(Cond::Ls),
|
||||
},
|
||||
@@ -2053,7 +2053,7 @@ fn test_aarch64_binemit() {
|
||||
"b.ls 64",
|
||||
));
|
||||
insns.push((
|
||||
Inst::CondBrLowered {
|
||||
Inst::OneWayCondBr {
|
||||
target: BranchTarget::ResolvedOffset(64),
|
||||
kind: CondBrKind::Cond(Cond::Ge),
|
||||
},
|
||||
@@ -2061,7 +2061,7 @@ fn test_aarch64_binemit() {
|
||||
"b.ge 64",
|
||||
));
|
||||
insns.push((
|
||||
Inst::CondBrLowered {
|
||||
Inst::OneWayCondBr {
|
||||
target: BranchTarget::ResolvedOffset(64),
|
||||
kind: CondBrKind::Cond(Cond::Lt),
|
||||
},
|
||||
@@ -2069,7 +2069,7 @@ fn test_aarch64_binemit() {
|
||||
"b.lt 64",
|
||||
));
|
||||
insns.push((
|
||||
Inst::CondBrLowered {
|
||||
Inst::OneWayCondBr {
|
||||
target: BranchTarget::ResolvedOffset(64),
|
||||
kind: CondBrKind::Cond(Cond::Gt),
|
||||
},
|
||||
@@ -2077,7 +2077,7 @@ fn test_aarch64_binemit() {
|
||||
"b.gt 64",
|
||||
));
|
||||
insns.push((
|
||||
Inst::CondBrLowered {
|
||||
Inst::OneWayCondBr {
|
||||
target: BranchTarget::ResolvedOffset(64),
|
||||
kind: CondBrKind::Cond(Cond::Le),
|
||||
},
|
||||
@@ -2085,7 +2085,7 @@ fn test_aarch64_binemit() {
|
||||
"b.le 64",
|
||||
));
|
||||
insns.push((
|
||||
Inst::CondBrLowered {
|
||||
Inst::OneWayCondBr {
|
||||
target: BranchTarget::ResolvedOffset(64),
|
||||
kind: CondBrKind::Cond(Cond::Al),
|
||||
},
|
||||
@@ -2093,7 +2093,7 @@ fn test_aarch64_binemit() {
|
||||
"b.al 64",
|
||||
));
|
||||
insns.push((
|
||||
Inst::CondBrLowered {
|
||||
Inst::OneWayCondBr {
|
||||
target: BranchTarget::ResolvedOffset(64),
|
||||
kind: CondBrKind::Cond(Cond::Nv),
|
||||
},
|
||||
@@ -2102,7 +2102,7 @@ fn test_aarch64_binemit() {
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::CondBrLoweredCompound {
|
||||
Inst::CondBr {
|
||||
taken: BranchTarget::ResolvedOffset(64),
|
||||
not_taken: BranchTarget::ResolvedOffset(128),
|
||||
kind: CondBrKind::Cond(Cond::Le),
|
||||
@@ -2138,7 +2138,7 @@ fn test_aarch64_binemit() {
|
||||
insns.push((
|
||||
Inst::IndirectBr {
|
||||
rn: xreg(3),
|
||||
targets: vec![1, 2, 3],
|
||||
targets: vec![],
|
||||
},
|
||||
"60001FD6",
|
||||
"br x3",
|
||||
@@ -2149,7 +2149,7 @@ fn test_aarch64_binemit() {
|
||||
insns.push((
|
||||
Inst::Adr {
|
||||
rd: writable_xreg(15),
|
||||
label: MemLabel::PCRel((1 << 20) - 4),
|
||||
off: (1 << 20) - 4,
|
||||
},
|
||||
"EFFF7F10",
|
||||
"adr x15, pc+1048572",
|
||||
@@ -2792,19 +2792,11 @@ fn test_aarch64_binemit() {
|
||||
let actual_printing = insn.show_rru(Some(&rru));
|
||||
assert_eq!(expected_printing, actual_printing);
|
||||
|
||||
// Check the encoding is as expected.
|
||||
let text_size = {
|
||||
let mut code_sec = MachSectionSize::new(0);
|
||||
insn.emit(&mut code_sec, &flags, &mut Default::default());
|
||||
code_sec.size()
|
||||
};
|
||||
|
||||
let mut sink = test_utils::TestCodeSink::new();
|
||||
let mut sections = MachSections::new();
|
||||
let code_idx = sections.add_section(0, text_size);
|
||||
let code_sec = sections.get_section(code_idx);
|
||||
insn.emit(code_sec, &flags, &mut Default::default());
|
||||
sections.emit(&mut sink);
|
||||
let mut buffer = MachBuffer::new();
|
||||
insn.emit(&mut buffer, &flags, &mut Default::default());
|
||||
let buffer = buffer.finish();
|
||||
buffer.emit(&mut sink);
|
||||
let actual_encoding = &sink.stringify();
|
||||
assert_eq!(expected_encoding, actual_encoding);
|
||||
}
|
||||
|
||||
@@ -645,35 +645,28 @@ pub enum Inst {
|
||||
dest: BranchTarget,
|
||||
},
|
||||
|
||||
/// A conditional branch.
|
||||
/// A conditional branch. Contains two targets; at emission time, both are emitted, but
|
||||
/// the MachBuffer knows to truncate the trailing branch if fallthrough. We optimize the
|
||||
/// choice of taken/not_taken (inverting the branch polarity as needed) based on the
|
||||
/// fallthrough at the time of lowering.
|
||||
CondBr {
|
||||
taken: BranchTarget,
|
||||
not_taken: BranchTarget,
|
||||
kind: CondBrKind,
|
||||
},
|
||||
|
||||
/// Lowered conditional branch: contains the original branch kind (or the
|
||||
/// inverse), but only one BranchTarget is retained. The other is
|
||||
/// implicitly the next instruction, given the final basic-block layout.
|
||||
CondBrLowered {
|
||||
/// A one-way conditional branch, invisible to the CFG processing; used *only* as part of
|
||||
/// straight-line sequences in code to be emitted.
|
||||
OneWayCondBr {
|
||||
target: BranchTarget,
|
||||
kind: CondBrKind,
|
||||
},
|
||||
|
||||
/// As for `CondBrLowered`, but represents a condbr/uncond-br sequence (two
|
||||
/// actual machine instructions). Needed when the final block layout implies
|
||||
/// that neither arm of a conditional branch targets the fallthrough block.
|
||||
CondBrLoweredCompound {
|
||||
taken: BranchTarget,
|
||||
not_taken: BranchTarget,
|
||||
kind: CondBrKind,
|
||||
},
|
||||
|
||||
/// An indirect branch through a register, augmented with set of all
|
||||
/// possible successors.
|
||||
IndirectBr {
|
||||
rn: Reg,
|
||||
targets: Vec<BlockIndex>,
|
||||
targets: Vec<MachLabel>,
|
||||
},
|
||||
|
||||
/// A "break" instruction, used for e.g. traps and debug breakpoints.
|
||||
@@ -685,11 +678,14 @@ pub enum Inst {
|
||||
trap_info: (SourceLoc, TrapCode),
|
||||
},
|
||||
|
||||
/// Load the address (using a PC-relative offset) of a MemLabel, using the
|
||||
/// `ADR` instruction.
|
||||
/// Load the address (using a PC-relative offset) of a memory location, using the `ADR`
|
||||
/// instruction. Note that we take a simple offset, not a `MemLabel`, here, because `Adr` is
|
||||
/// only used for now in fixed lowering sequences with hardcoded offsets. In the future we may
|
||||
/// need full `MemLabel` support.
|
||||
Adr {
|
||||
rd: Writable<Reg>,
|
||||
label: MemLabel,
|
||||
/// Offset in range -2^20 .. 2^20.
|
||||
off: i32,
|
||||
},
|
||||
|
||||
/// Raw 32-bit word, used for inline constants and jump-table entries.
|
||||
@@ -706,7 +702,7 @@ pub enum Inst {
|
||||
/// for rationale).
|
||||
JTSequence {
|
||||
targets: Box<[BranchTarget]>,
|
||||
targets_for_term: Box<[BlockIndex]>, // needed for MachTerminator.
|
||||
targets_for_term: Box<[MachLabel]>, // needed for MachTerminator.
|
||||
ridx: Reg,
|
||||
rtmp1: Writable<Reg>,
|
||||
rtmp2: Writable<Reg>,
|
||||
@@ -732,21 +728,19 @@ pub enum Inst {
|
||||
mem: MemArg,
|
||||
},
|
||||
|
||||
/// Sets the value of the pinned register to the given register target.
|
||||
GetPinnedReg {
|
||||
rd: Writable<Reg>,
|
||||
},
|
||||
|
||||
/// Writes the value of the given source register to the pinned register.
|
||||
SetPinnedReg {
|
||||
rm: Reg,
|
||||
},
|
||||
|
||||
/// Marker, no-op in generated code: SP "virtual offset" is adjusted. This
|
||||
/// controls MemArg::NominalSPOffset args are lowered.
|
||||
VirtualSPOffsetAdj {
|
||||
offset: i64,
|
||||
},
|
||||
|
||||
/// Meta-insn, no-op in generated code: emit constant/branch veneer island at this point (with
|
||||
/// a guard jump around it) if less than the needed space is available before the next branch
|
||||
/// deadline.
|
||||
EmitIsland {
|
||||
/// The needed space before the next deadline.
|
||||
needed_space: CodeOffset,
|
||||
},
|
||||
}
|
||||
|
||||
fn count_zero_half_words(mut value: u64) -> usize {
|
||||
@@ -1111,9 +1105,7 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
|
||||
collector.add_defs(&*defs);
|
||||
collector.add_use(rn);
|
||||
}
|
||||
&Inst::CondBr { ref kind, .. }
|
||||
| &Inst::CondBrLowered { ref kind, .. }
|
||||
| &Inst::CondBrLoweredCompound { ref kind, .. } => match kind {
|
||||
&Inst::CondBr { ref kind, .. } | &Inst::OneWayCondBr { ref kind, .. } => match kind {
|
||||
CondBrKind::Zero(rt) | CondBrKind::NotZero(rt) => {
|
||||
collector.add_use(*rt);
|
||||
}
|
||||
@@ -1142,13 +1134,8 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
|
||||
&Inst::LoadAddr { rd, mem: _ } => {
|
||||
collector.add_def(rd);
|
||||
}
|
||||
&Inst::GetPinnedReg { rd } => {
|
||||
collector.add_def(rd);
|
||||
}
|
||||
&Inst::SetPinnedReg { rm } => {
|
||||
collector.add_use(rm);
|
||||
}
|
||||
&Inst::VirtualSPOffsetAdj { .. } => {}
|
||||
&Inst::EmitIsland { .. } => {}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1676,13 +1663,7 @@ fn aarch64_map_regs(inst: &mut Inst, mapper: &RegUsageMapper) {
|
||||
*defs = Box::new(new_defs);
|
||||
map_use(mapper, rn);
|
||||
}
|
||||
&mut Inst::CondBr { ref mut kind, .. } => {
|
||||
map_br(mapper, kind);
|
||||
}
|
||||
&mut Inst::CondBrLowered { ref mut kind, .. } => {
|
||||
map_br(mapper, kind);
|
||||
}
|
||||
&mut Inst::CondBrLoweredCompound { ref mut kind, .. } => {
|
||||
&mut Inst::CondBr { ref mut kind, .. } | &mut Inst::OneWayCondBr { ref mut kind, .. } => {
|
||||
map_br(mapper, kind);
|
||||
}
|
||||
&mut Inst::IndirectBr { ref mut rn, .. } => {
|
||||
@@ -1716,13 +1697,8 @@ fn aarch64_map_regs(inst: &mut Inst, mapper: &RegUsageMapper) {
|
||||
map_def(mapper, rd);
|
||||
map_mem(mapper, mem);
|
||||
}
|
||||
&mut Inst::GetPinnedReg { ref mut rd } => {
|
||||
map_def(mapper, rd);
|
||||
}
|
||||
&mut Inst::SetPinnedReg { ref mut rm } => {
|
||||
map_use(mapper, rm);
|
||||
}
|
||||
&mut Inst::VirtualSPOffsetAdj { .. } => {}
|
||||
&mut Inst::EmitIsland { .. } => {}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1730,6 +1706,8 @@ fn aarch64_map_regs(inst: &mut Inst, mapper: &RegUsageMapper) {
|
||||
// Instructions: misc functions and external interface
|
||||
|
||||
impl MachInst for Inst {
|
||||
type LabelUse = LabelUse;
|
||||
|
||||
fn get_regs(&self, collector: &mut RegUsageCollector) {
|
||||
aarch64_get_regs(self, collector)
|
||||
}
|
||||
@@ -1757,24 +1735,14 @@ impl MachInst for Inst {
|
||||
fn is_term<'a>(&'a self) -> MachTerminator<'a> {
|
||||
match self {
|
||||
&Inst::Ret | &Inst::EpiloguePlaceholder => MachTerminator::Ret,
|
||||
&Inst::Jump { dest } => MachTerminator::Uncond(dest.as_block_index().unwrap()),
|
||||
&Inst::Jump { dest } => MachTerminator::Uncond(dest.as_label().unwrap()),
|
||||
&Inst::CondBr {
|
||||
taken, not_taken, ..
|
||||
} => MachTerminator::Cond(
|
||||
taken.as_block_index().unwrap(),
|
||||
not_taken.as_block_index().unwrap(),
|
||||
),
|
||||
&Inst::CondBrLowered { .. } => {
|
||||
// When this is used prior to branch finalization for branches
|
||||
// within an open-coded sequence, i.e. with ResolvedOffsets,
|
||||
// do not consider it a terminator. From the point of view of CFG analysis,
|
||||
// it is part of a black-box single-in single-out region, hence is not
|
||||
// denoted a terminator.
|
||||
} => MachTerminator::Cond(taken.as_label().unwrap(), not_taken.as_label().unwrap()),
|
||||
&Inst::OneWayCondBr { .. } => {
|
||||
// Explicitly invisible to CFG processing.
|
||||
MachTerminator::None
|
||||
}
|
||||
&Inst::CondBrLoweredCompound { .. } => {
|
||||
panic!("is_term() called after lowering branches");
|
||||
}
|
||||
&Inst::IndirectBr { ref targets, .. } => MachTerminator::Indirect(&targets[..]),
|
||||
&Inst::JTSequence {
|
||||
ref targets_for_term,
|
||||
@@ -1789,6 +1757,23 @@ impl MachInst for Inst {
|
||||
Inst::mov(to_reg, from_reg)
|
||||
}
|
||||
|
||||
fn gen_constant(to_reg: Writable<Reg>, value: u64, ty: Type) -> SmallVec<[Inst; 4]> {
|
||||
if ty == F64 {
|
||||
let mut ret = SmallVec::new();
|
||||
ret.push(Inst::load_fp_constant64(to_reg, f64::from_bits(value)));
|
||||
ret
|
||||
} else if ty == F32 {
|
||||
let mut ret = SmallVec::new();
|
||||
ret.push(Inst::load_fp_constant32(
|
||||
to_reg,
|
||||
f32::from_bits(value as u32),
|
||||
));
|
||||
ret
|
||||
} else {
|
||||
Inst::load_constant(to_reg, value)
|
||||
}
|
||||
}
|
||||
|
||||
fn gen_zero_len_nop() -> Inst {
|
||||
Inst::Nop0
|
||||
}
|
||||
@@ -1815,101 +1800,25 @@ impl MachInst for Inst {
|
||||
}
|
||||
}
|
||||
|
||||
fn gen_jump(blockindex: BlockIndex) -> Inst {
|
||||
fn gen_jump(target: MachLabel) -> Inst {
|
||||
Inst::Jump {
|
||||
dest: BranchTarget::Block(blockindex),
|
||||
dest: BranchTarget::Label(target),
|
||||
}
|
||||
}
|
||||
|
||||
fn with_block_rewrites(&mut self, block_target_map: &[BlockIndex]) {
|
||||
match self {
|
||||
&mut Inst::Jump { ref mut dest } => {
|
||||
dest.map(block_target_map);
|
||||
}
|
||||
&mut Inst::CondBr {
|
||||
ref mut taken,
|
||||
ref mut not_taken,
|
||||
..
|
||||
} => {
|
||||
taken.map(block_target_map);
|
||||
not_taken.map(block_target_map);
|
||||
}
|
||||
&mut Inst::CondBrLowered { .. } => {
|
||||
// See note in `is_term()`: this is used in open-coded sequences
|
||||
// within blocks and should be left alone.
|
||||
}
|
||||
&mut Inst::CondBrLoweredCompound { .. } => {
|
||||
panic!("with_block_rewrites called after branch lowering!");
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
fn reg_universe(flags: &settings::Flags) -> RealRegUniverse {
|
||||
create_reg_universe(flags)
|
||||
}
|
||||
|
||||
fn with_fallthrough_block(&mut self, fallthrough: Option<BlockIndex>) {
|
||||
match self {
|
||||
&mut Inst::CondBr {
|
||||
taken,
|
||||
not_taken,
|
||||
kind,
|
||||
} => {
|
||||
if taken.as_block_index() == fallthrough
|
||||
&& not_taken.as_block_index() == fallthrough
|
||||
{
|
||||
*self = Inst::Nop0;
|
||||
} else if taken.as_block_index() == fallthrough {
|
||||
*self = Inst::CondBrLowered {
|
||||
target: not_taken,
|
||||
kind: kind.invert(),
|
||||
};
|
||||
} else if not_taken.as_block_index() == fallthrough {
|
||||
*self = Inst::CondBrLowered {
|
||||
target: taken,
|
||||
kind,
|
||||
};
|
||||
} else {
|
||||
// We need a compound sequence (condbr / uncond-br).
|
||||
*self = Inst::CondBrLoweredCompound {
|
||||
taken,
|
||||
not_taken,
|
||||
kind,
|
||||
};
|
||||
}
|
||||
}
|
||||
&mut Inst::Jump { dest } => {
|
||||
if dest.as_block_index() == fallthrough {
|
||||
*self = Inst::Nop0;
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
fn with_block_offsets(&mut self, my_offset: CodeOffset, targets: &[CodeOffset]) {
|
||||
match self {
|
||||
&mut Inst::CondBrLowered { ref mut target, .. } => {
|
||||
target.lower(targets, my_offset);
|
||||
}
|
||||
&mut Inst::CondBrLoweredCompound {
|
||||
ref mut taken,
|
||||
ref mut not_taken,
|
||||
..
|
||||
} => {
|
||||
taken.lower(targets, my_offset);
|
||||
not_taken.lower(targets, my_offset + 4);
|
||||
}
|
||||
&mut Inst::Jump { ref mut dest } => {
|
||||
dest.lower(targets, my_offset);
|
||||
}
|
||||
&mut Inst::JTSequence {
|
||||
targets: ref mut t, ..
|
||||
} => {
|
||||
for target in t.iter_mut() {
|
||||
// offset+20: jumptable is 20 bytes into compound sequence.
|
||||
target.lower(targets, my_offset + 20);
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
fn worst_case_size() -> CodeOffset {
|
||||
// The maximum size, in bytes, of any `Inst`'s emitted code. We have at least one case of
|
||||
// an 8-instruction sequence (saturating int-to-float conversions) with three embedded
|
||||
// 64-bit f64 constants.
|
||||
//
|
||||
// Note that inline jump-tables handle island/pool insertion separately, so we do not need
|
||||
// to account for them here (otherwise the worst case would be 2^31 * 4, clearly not
|
||||
// feasible for other reasons).
|
||||
44
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2550,12 +2459,12 @@ impl ShowWithRRU for Inst {
|
||||
}
|
||||
}
|
||||
}
|
||||
&Inst::CondBrLowered {
|
||||
&Inst::OneWayCondBr {
|
||||
ref target,
|
||||
ref kind,
|
||||
} => {
|
||||
let target = target.show_rru(mb_rru);
|
||||
match &kind {
|
||||
match kind {
|
||||
&CondBrKind::Zero(reg) => {
|
||||
let reg = reg.show_rru(mb_rru);
|
||||
format!("cbz {}, {}", reg, target)
|
||||
@@ -2570,30 +2479,15 @@ impl ShowWithRRU for Inst {
|
||||
}
|
||||
}
|
||||
}
|
||||
&Inst::CondBrLoweredCompound {
|
||||
ref taken,
|
||||
ref not_taken,
|
||||
ref kind,
|
||||
} => {
|
||||
let first = Inst::CondBrLowered {
|
||||
target: taken.clone(),
|
||||
kind: kind.clone(),
|
||||
};
|
||||
let second = Inst::Jump {
|
||||
dest: not_taken.clone(),
|
||||
};
|
||||
first.show_rru(mb_rru) + " ; " + &second.show_rru(mb_rru)
|
||||
}
|
||||
&Inst::IndirectBr { rn, .. } => {
|
||||
let rn = rn.show_rru(mb_rru);
|
||||
format!("br {}", rn)
|
||||
}
|
||||
&Inst::Brk => "brk #0".to_string(),
|
||||
&Inst::Udf { .. } => "udf".to_string(),
|
||||
&Inst::Adr { rd, ref label } => {
|
||||
&Inst::Adr { rd, off } => {
|
||||
let rd = rd.show_rru(mb_rru);
|
||||
let label = label.show_rru(mb_rru);
|
||||
format!("adr {}, {}", rd, label)
|
||||
format!("adr {}, pc+{}", rd, off)
|
||||
}
|
||||
&Inst::Word4 { data } => format!("data.i32 {}", data),
|
||||
&Inst::Word8 { data } => format!("data.i64 {}", data),
|
||||
@@ -2683,15 +2577,134 @@ impl ShowWithRRU for Inst {
|
||||
}
|
||||
ret
|
||||
}
|
||||
&Inst::GetPinnedReg { rd } => {
|
||||
let rd = rd.show_rru(mb_rru);
|
||||
format!("get_pinned_reg {}", rd)
|
||||
}
|
||||
&Inst::SetPinnedReg { rm } => {
|
||||
let rm = rm.show_rru(mb_rru);
|
||||
format!("set_pinned_reg {}", rm)
|
||||
}
|
||||
&Inst::VirtualSPOffsetAdj { offset } => format!("virtual_sp_offset_adjust {}", offset),
|
||||
&Inst::EmitIsland { needed_space } => format!("emit_island {}", needed_space),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//=============================================================================
|
||||
// Label fixups and jump veneers.
|
||||
|
||||
/// Different forms of label references for different instruction formats.
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
|
||||
pub enum LabelUse {
|
||||
/// 19-bit branch offset (conditional branches). PC-rel, offset is imm << 2. Immediate is 19
|
||||
/// signed bits, in bits 23:5. Used by cbz, cbnz, b.cond.
|
||||
Branch19,
|
||||
/// 26-bit branch offset (unconditional branches). PC-rel, offset is imm << 2. Immediate is 26
|
||||
/// signed bits, in bits 25:0. Used by b, bl.
|
||||
Branch26,
|
||||
/// 19-bit offset for LDR (load literal). PC-rel, offset is imm << 2. Immediate is 19 signed bits,
|
||||
/// in bits 23:5.
|
||||
Ldr19,
|
||||
/// 21-bit offset for ADR (get address of label). PC-rel, offset is not shifted. Immediate is
|
||||
/// 21 signed bits, with high 19 bits in bits 23:5 and low 2 bits in bits 30:29.
|
||||
Adr21,
|
||||
/// 32-bit PC relative constant offset (from address of constant itself). Used in jump tables.
|
||||
PCRel32,
|
||||
}
|
||||
|
||||
impl MachInstLabelUse for LabelUse {
|
||||
/// Alignment for veneer code. Every AArch64 instruction must be 4-byte-aligned.
|
||||
const ALIGN: CodeOffset = 4;
|
||||
|
||||
/// Maximum PC-relative range (positive), inclusive.
|
||||
fn max_pos_range(self) -> CodeOffset {
|
||||
match self {
|
||||
// 19-bit immediate, left-shifted by 2, for 21 bits of total range. Signed, so +2^20
|
||||
// from zero. Likewise for two other shifted cases below.
|
||||
LabelUse::Branch19 => (1 << 20) - 1,
|
||||
LabelUse::Branch26 => (1 << 27) - 1,
|
||||
LabelUse::Ldr19 => (1 << 20) - 1,
|
||||
// Adr does not shift its immediate, so the 21-bit immediate gives 21 bits of total
|
||||
// range.
|
||||
LabelUse::Adr21 => (1 << 20) - 1,
|
||||
LabelUse::PCRel32 => 0x7fffffff,
|
||||
}
|
||||
}
|
||||
|
||||
/// Maximum PC-relative range (negative).
|
||||
fn max_neg_range(self) -> CodeOffset {
|
||||
// All forms are twos-complement signed offsets, so negative limit is one more than
|
||||
// positive limit.
|
||||
self.max_pos_range() + 1
|
||||
}
|
||||
|
||||
/// Size of window into code needed to do the patch.
|
||||
fn patch_size(self) -> CodeOffset {
|
||||
// Patch is on one instruction only for all of these label reference types.
|
||||
4
|
||||
}
|
||||
|
||||
/// Perform the patch.
|
||||
fn patch(self, buffer: &mut [u8], use_offset: CodeOffset, label_offset: CodeOffset) {
|
||||
let pc_rel = (label_offset as i64) - (use_offset as i64);
|
||||
debug_assert!(pc_rel <= self.max_pos_range() as i64);
|
||||
debug_assert!(pc_rel >= -(self.max_neg_range() as i64));
|
||||
let pc_rel = pc_rel as u32;
|
||||
let insn_word = u32::from_le_bytes([buffer[0], buffer[1], buffer[2], buffer[3]]);
|
||||
let mask = match self {
|
||||
LabelUse::Branch19 => 0x00ffffe0, // bits 23..5 inclusive
|
||||
LabelUse::Branch26 => 0x03ffffff, // bits 25..0 inclusive
|
||||
LabelUse::Ldr19 => 0x00ffffe0, // bits 23..5 inclusive
|
||||
LabelUse::Adr21 => 0x60ffffe0, // bits 30..29, 25..5 inclusive
|
||||
LabelUse::PCRel32 => 0xffffffff,
|
||||
};
|
||||
let pc_rel_shifted = match self {
|
||||
LabelUse::Adr21 | LabelUse::PCRel32 => pc_rel,
|
||||
_ => {
|
||||
debug_assert!(pc_rel & 3 == 0);
|
||||
pc_rel >> 2
|
||||
}
|
||||
};
|
||||
let pc_rel_inserted = match self {
|
||||
LabelUse::Branch19 | LabelUse::Ldr19 => (pc_rel_shifted & 0x7ffff) << 5,
|
||||
LabelUse::Branch26 => pc_rel_shifted & 0x3ffffff,
|
||||
LabelUse::Adr21 => (pc_rel_shifted & 0x7ffff) << 5 | (pc_rel_shifted & 0x180000) << 10,
|
||||
LabelUse::PCRel32 => pc_rel_shifted,
|
||||
};
|
||||
let is_add = match self {
|
||||
LabelUse::PCRel32 => true,
|
||||
_ => false,
|
||||
};
|
||||
let insn_word = if is_add {
|
||||
insn_word.wrapping_add(pc_rel_inserted)
|
||||
} else {
|
||||
(insn_word & !mask) | pc_rel_inserted
|
||||
};
|
||||
buffer[0..4].clone_from_slice(&u32::to_le_bytes(insn_word));
|
||||
}
|
||||
|
||||
/// Is a veneer supported for this label reference type?
|
||||
fn supports_veneer(self) -> bool {
|
||||
match self {
|
||||
LabelUse::Branch19 => true, // veneer is a Branch26
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
/// How large is the veneer, if supported?
|
||||
fn veneer_size(self) -> CodeOffset {
|
||||
4
|
||||
}
|
||||
|
||||
/// Generate a veneer into the buffer, given that this veneer is at `veneer_offset`, and return
|
||||
/// an offset and label-use for the veneer's use of the original label.
|
||||
fn generate_veneer(
|
||||
self,
|
||||
buffer: &mut [u8],
|
||||
veneer_offset: CodeOffset,
|
||||
) -> (CodeOffset, LabelUse) {
|
||||
match self {
|
||||
LabelUse::Branch19 => {
|
||||
// veneer is a Branch26 (unconditional branch). Just encode directly here -- don't
|
||||
// bother with constructing an Inst.
|
||||
let insn_word = 0b000101 << 26;
|
||||
buffer[0..4].clone_from_slice(&u32::to_le_bytes(insn_word));
|
||||
(veneer_offset, LabelUse::Branch26)
|
||||
}
|
||||
_ => panic!("Unsupported label-reference type for veneer generation!"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -14,12 +14,14 @@ use crate::ir::Inst as IRInst;
|
||||
use crate::ir::{InstructionData, Opcode, TrapCode, Type};
|
||||
use crate::machinst::lower::*;
|
||||
use crate::machinst::*;
|
||||
use crate::CodegenResult;
|
||||
|
||||
use crate::isa::aarch64::inst::*;
|
||||
use crate::isa::aarch64::AArch64Backend;
|
||||
|
||||
use super::lower_inst;
|
||||
|
||||
use log::debug;
|
||||
use regalloc::{Reg, RegClass, Writable};
|
||||
|
||||
//============================================================================
|
||||
@@ -104,18 +106,11 @@ pub(crate) enum ResultRegImmShift {
|
||||
}
|
||||
|
||||
//============================================================================
|
||||
// Instruction input and output "slots".
|
||||
// Instruction input "slots".
|
||||
//
|
||||
// We use these types to refer to operand numbers, and result numbers, together
|
||||
// with the associated instruction, in a type-safe way.
|
||||
|
||||
/// Identifier for a particular output of an instruction.
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
|
||||
pub(crate) struct InsnOutput {
|
||||
pub(crate) insn: IRInst,
|
||||
pub(crate) output: usize,
|
||||
}
|
||||
|
||||
/// Identifier for a particular input of an instruction.
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
|
||||
pub(crate) struct InsnInput {
|
||||
@@ -123,93 +118,28 @@ pub(crate) struct InsnInput {
|
||||
pub(crate) input: usize,
|
||||
}
|
||||
|
||||
/// Producer of a value: either a previous instruction's output, or a register that will be
|
||||
/// codegen'd separately.
|
||||
/// Identifier for a particular output of an instruction.
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
|
||||
pub(crate) enum InsnInputSource {
|
||||
Output(InsnOutput),
|
||||
Reg(Reg),
|
||||
}
|
||||
|
||||
impl InsnInputSource {
|
||||
fn as_output(self) -> Option<InsnOutput> {
|
||||
match self {
|
||||
InsnInputSource::Output(o) => Some(o),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn get_input<C: LowerCtx<I = Inst>>(ctx: &mut C, output: InsnOutput, num: usize) -> InsnInput {
|
||||
assert!(num <= ctx.num_inputs(output.insn));
|
||||
InsnInput {
|
||||
insn: output.insn,
|
||||
input: num,
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert an instruction input to a producing instruction's output if possible (in same BB), or a
|
||||
/// register otherwise.
|
||||
fn input_source<C: LowerCtx<I = Inst>>(ctx: &mut C, input: InsnInput) -> InsnInputSource {
|
||||
if let Some((input_inst, result_num)) = ctx.input_inst(input.insn, input.input) {
|
||||
let out = InsnOutput {
|
||||
insn: input_inst,
|
||||
output: result_num,
|
||||
};
|
||||
InsnInputSource::Output(out)
|
||||
} else {
|
||||
let reg = ctx.input(input.insn, input.input);
|
||||
InsnInputSource::Reg(reg)
|
||||
}
|
||||
pub(crate) struct InsnOutput {
|
||||
pub(crate) insn: IRInst,
|
||||
pub(crate) output: usize,
|
||||
}
|
||||
|
||||
//============================================================================
|
||||
// Lowering: convert instruction outputs to result types.
|
||||
// Lowering: convert instruction inputs to forms that we can use.
|
||||
|
||||
/// Lower an instruction output to a 64-bit constant, if possible.
|
||||
pub(crate) fn output_to_const<C: LowerCtx<I = Inst>>(ctx: &mut C, out: InsnOutput) -> Option<u64> {
|
||||
if out.output > 0 {
|
||||
None
|
||||
} else {
|
||||
let inst_data = ctx.data(out.insn);
|
||||
if inst_data.opcode() == Opcode::Null {
|
||||
Some(0)
|
||||
} else {
|
||||
match inst_data {
|
||||
&InstructionData::UnaryImm { opcode: _, imm } => {
|
||||
// Only has Into for i64; we use u64 elsewhere, so we cast.
|
||||
let imm: i64 = imm.into();
|
||||
Some(imm as u64)
|
||||
}
|
||||
&InstructionData::UnaryBool { opcode: _, imm } => Some(u64::from(imm)),
|
||||
&InstructionData::UnaryIeee32 { opcode: _, imm } => Some(u64::from(imm.bits())),
|
||||
&InstructionData::UnaryIeee64 { opcode: _, imm } => Some(imm.bits()),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
/// Lower an instruction input to a 64-bit constant, if possible.
|
||||
pub(crate) fn input_to_const<C: LowerCtx<I = Inst>>(ctx: &mut C, input: InsnInput) -> Option<u64> {
|
||||
let input = ctx.get_input(input.insn, input.input);
|
||||
input.constant
|
||||
}
|
||||
|
||||
pub(crate) fn output_to_const_f32<C: LowerCtx<I = Inst>>(
|
||||
/// Lower an instruction input to a constant register-shift amount, if possible.
|
||||
pub(crate) fn input_to_shiftimm<C: LowerCtx<I = Inst>>(
|
||||
ctx: &mut C,
|
||||
out: InsnOutput,
|
||||
) -> Option<f32> {
|
||||
output_to_const(ctx, out).map(|value| f32::from_bits(value as u32))
|
||||
}
|
||||
|
||||
pub(crate) fn output_to_const_f64<C: LowerCtx<I = Inst>>(
|
||||
ctx: &mut C,
|
||||
out: InsnOutput,
|
||||
) -> Option<f64> {
|
||||
output_to_const(ctx, out).map(|value| f64::from_bits(value))
|
||||
}
|
||||
|
||||
/// Lower an instruction output to a constant register-shift amount, if possible.
|
||||
pub(crate) fn output_to_shiftimm<C: LowerCtx<I = Inst>>(
|
||||
ctx: &mut C,
|
||||
out: InsnOutput,
|
||||
input: InsnInput,
|
||||
) -> Option<ShiftOpShiftImm> {
|
||||
output_to_const(ctx, out).and_then(ShiftOpShiftImm::maybe_from_shift)
|
||||
input_to_const(ctx, input).and_then(ShiftOpShiftImm::maybe_from_shift)
|
||||
}
|
||||
|
||||
/// How to handle narrow values loaded into registers; see note on `narrow_mode`
|
||||
@@ -237,9 +167,9 @@ impl NarrowValueMode {
|
||||
}
|
||||
}
|
||||
|
||||
/// Lower an instruction output to a reg.
|
||||
/// Allocate a register for an instruction output and return it.
|
||||
pub(crate) fn output_to_reg<C: LowerCtx<I = Inst>>(ctx: &mut C, out: InsnOutput) -> Writable<Reg> {
|
||||
ctx.output(out.insn, out.output)
|
||||
ctx.get_output(out.insn, out.output)
|
||||
}
|
||||
|
||||
/// Lower an instruction input to a reg.
|
||||
@@ -252,9 +182,22 @@ pub(crate) fn input_to_reg<C: LowerCtx<I = Inst>>(
|
||||
input: InsnInput,
|
||||
narrow_mode: NarrowValueMode,
|
||||
) -> Reg {
|
||||
debug!("input_to_reg: input {:?}", input);
|
||||
let ty = ctx.input_ty(input.insn, input.input);
|
||||
let from_bits = ty_bits(ty) as u8;
|
||||
let in_reg = ctx.input(input.insn, input.input);
|
||||
let inputs = ctx.get_input(input.insn, input.input);
|
||||
let in_reg = if let Some(c) = inputs.constant {
|
||||
// Generate constants fresh at each use to minimize long-range register pressure.
|
||||
let to_reg = ctx.tmp(Inst::rc_for_type(ty).unwrap(), ty);
|
||||
for inst in Inst::gen_constant(to_reg, c, ty).into_iter() {
|
||||
ctx.emit(inst);
|
||||
}
|
||||
to_reg.to_reg()
|
||||
} else {
|
||||
ctx.use_input_reg(inputs);
|
||||
inputs.reg
|
||||
};
|
||||
|
||||
match (narrow_mode, from_bits) {
|
||||
(NarrowValueMode::None, _) => in_reg,
|
||||
(NarrowValueMode::ZeroExtend32, n) if n < 32 => {
|
||||
@@ -282,6 +225,10 @@ pub(crate) fn input_to_reg<C: LowerCtx<I = Inst>>(
|
||||
(NarrowValueMode::ZeroExtend32, 32) | (NarrowValueMode::SignExtend32, 32) => in_reg,
|
||||
|
||||
(NarrowValueMode::ZeroExtend64, n) if n < 64 => {
|
||||
if inputs.constant.is_some() {
|
||||
// Constants are zero-extended to full 64-bit width on load already.
|
||||
in_reg
|
||||
} else {
|
||||
let tmp = ctx.tmp(RegClass::I64, I32);
|
||||
ctx.emit(Inst::Extend {
|
||||
rd: tmp,
|
||||
@@ -292,6 +239,7 @@ pub(crate) fn input_to_reg<C: LowerCtx<I = Inst>>(
|
||||
});
|
||||
tmp.to_reg()
|
||||
}
|
||||
}
|
||||
(NarrowValueMode::SignExtend64, n) if n < 64 => {
|
||||
let tmp = ctx.tmp(RegClass::I64, I32);
|
||||
ctx.emit(Inst::Extend {
|
||||
@@ -313,8 +261,6 @@ pub(crate) fn input_to_reg<C: LowerCtx<I = Inst>>(
|
||||
}
|
||||
|
||||
/// Lower an instruction input to a reg or reg/shift, or reg/extend operand.
|
||||
/// This does not actually codegen the source instruction; it just uses the
|
||||
/// vreg into which the source instruction will generate its value.
|
||||
///
|
||||
/// The `narrow_mode` flag indicates whether the consumer of this value needs
|
||||
/// the high bits clear. For many operations, such as an add/sub/mul or any
|
||||
@@ -330,26 +276,21 @@ fn input_to_rs<C: LowerCtx<I = Inst>>(
|
||||
input: InsnInput,
|
||||
narrow_mode: NarrowValueMode,
|
||||
) -> ResultRS {
|
||||
if let InsnInputSource::Output(out) = input_source(ctx, input) {
|
||||
let insn = out.insn;
|
||||
assert!(out.output <= ctx.num_outputs(insn));
|
||||
let inputs = ctx.get_input(input.insn, input.input);
|
||||
if let Some((insn, 0)) = inputs.inst {
|
||||
let op = ctx.data(insn).opcode();
|
||||
|
||||
if op == Opcode::Ishl {
|
||||
let shiftee = get_input(ctx, out, 0);
|
||||
let shift_amt = get_input(ctx, out, 1);
|
||||
let shiftee = InsnInput { insn, input: 0 };
|
||||
let shift_amt = InsnInput { insn, input: 1 };
|
||||
|
||||
// Can we get the shift amount as an immediate?
|
||||
if let Some(shift_amt_out) = input_source(ctx, shift_amt).as_output() {
|
||||
if let Some(shiftimm) = output_to_shiftimm(ctx, shift_amt_out) {
|
||||
if let Some(shiftimm) = input_to_shiftimm(ctx, shift_amt) {
|
||||
let reg = input_to_reg(ctx, shiftee, narrow_mode);
|
||||
ctx.merged(insn);
|
||||
ctx.merged(shift_amt_out.insn);
|
||||
return ResultRS::RegShift(reg, ShiftOpAndAmt::new(ShiftOp::LSL, shiftimm));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ResultRS::Reg(input_to_reg(ctx, input, narrow_mode))
|
||||
}
|
||||
@@ -364,11 +305,10 @@ fn input_to_rse<C: LowerCtx<I = Inst>>(
|
||||
input: InsnInput,
|
||||
narrow_mode: NarrowValueMode,
|
||||
) -> ResultRSE {
|
||||
if let InsnInputSource::Output(out) = input_source(ctx, input) {
|
||||
let insn = out.insn;
|
||||
assert!(out.output <= ctx.num_outputs(insn));
|
||||
let inputs = ctx.get_input(input.insn, input.input);
|
||||
if let Some((insn, 0)) = inputs.inst {
|
||||
let op = ctx.data(insn).opcode();
|
||||
let out_ty = ctx.output_ty(insn, out.output);
|
||||
let out_ty = ctx.output_ty(insn, 0);
|
||||
let out_bits = ty_bits(out_ty);
|
||||
|
||||
// If `out_ty` is smaller than 32 bits and we need to zero- or sign-extend,
|
||||
@@ -378,7 +318,7 @@ fn input_to_rse<C: LowerCtx<I = Inst>>(
|
||||
&& ((narrow_mode.is_32bit() && out_bits < 32)
|
||||
|| (!narrow_mode.is_32bit() && out_bits < 64))
|
||||
{
|
||||
let reg = output_to_reg(ctx, out);
|
||||
let reg = input_to_reg(ctx, InsnInput { insn, input: 0 }, NarrowValueMode::None);
|
||||
let extendop = match (narrow_mode, out_bits) {
|
||||
(NarrowValueMode::SignExtend32, 1) | (NarrowValueMode::SignExtend64, 1) => {
|
||||
ExtendOp::SXTB
|
||||
@@ -402,15 +342,14 @@ fn input_to_rse<C: LowerCtx<I = Inst>>(
|
||||
(NarrowValueMode::ZeroExtend64, 32) => ExtendOp::UXTW,
|
||||
_ => unreachable!(),
|
||||
};
|
||||
return ResultRSE::RegExtend(reg.to_reg(), extendop);
|
||||
return ResultRSE::RegExtend(reg, extendop);
|
||||
}
|
||||
|
||||
// Is this a zero-extend or sign-extend and can we handle that with a register-mode operator?
|
||||
if op == Opcode::Uextend || op == Opcode::Sextend {
|
||||
assert!(out_bits == 32 || out_bits == 64);
|
||||
let sign_extend = op == Opcode::Sextend;
|
||||
let extendee = get_input(ctx, out, 0);
|
||||
let inner_ty = ctx.input_ty(extendee.insn, extendee.input);
|
||||
let inner_ty = ctx.input_ty(insn, 0);
|
||||
let inner_bits = ty_bits(inner_ty);
|
||||
assert!(inner_bits < out_bits);
|
||||
let extendop = match (sign_extend, inner_bits) {
|
||||
@@ -424,8 +363,7 @@ fn input_to_rse<C: LowerCtx<I = Inst>>(
|
||||
(false, 32) => ExtendOp::UXTW,
|
||||
_ => unreachable!(),
|
||||
};
|
||||
let reg = input_to_reg(ctx, extendee, NarrowValueMode::None);
|
||||
ctx.merged(insn);
|
||||
let reg = input_to_reg(ctx, InsnInput { insn, input: 0 }, NarrowValueMode::None);
|
||||
return ResultRSE::RegExtend(reg, extendop);
|
||||
}
|
||||
}
|
||||
@@ -438,14 +376,11 @@ pub(crate) fn input_to_rse_imm12<C: LowerCtx<I = Inst>>(
|
||||
input: InsnInput,
|
||||
narrow_mode: NarrowValueMode,
|
||||
) -> ResultRSEImm12 {
|
||||
if let InsnInputSource::Output(out) = input_source(ctx, input) {
|
||||
if let Some(imm_value) = output_to_const(ctx, out) {
|
||||
if let Some(imm_value) = input_to_const(ctx, input) {
|
||||
if let Some(i) = Imm12::maybe_from_u64(imm_value) {
|
||||
ctx.merged(out.insn);
|
||||
return ResultRSEImm12::Imm12(i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ResultRSEImm12::from_rse(input_to_rse(ctx, input, narrow_mode))
|
||||
}
|
||||
@@ -455,16 +390,13 @@ pub(crate) fn input_to_rs_immlogic<C: LowerCtx<I = Inst>>(
|
||||
input: InsnInput,
|
||||
narrow_mode: NarrowValueMode,
|
||||
) -> ResultRSImmLogic {
|
||||
if let InsnInputSource::Output(out) = input_source(ctx, input) {
|
||||
if let Some(imm_value) = output_to_const(ctx, out) {
|
||||
let ty = ctx.output_ty(out.insn, out.output);
|
||||
if let Some(imm_value) = input_to_const(ctx, input) {
|
||||
let ty = ctx.input_ty(input.insn, input.input);
|
||||
let ty = if ty_bits(ty) < 32 { I32 } else { ty };
|
||||
if let Some(i) = ImmLogic::maybe_from_u64(imm_value, ty) {
|
||||
ctx.merged(out.insn);
|
||||
return ResultRSImmLogic::ImmLogic(i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ResultRSImmLogic::from_rs(input_to_rs(ctx, input, narrow_mode))
|
||||
}
|
||||
@@ -473,14 +405,11 @@ pub(crate) fn input_to_reg_immshift<C: LowerCtx<I = Inst>>(
|
||||
ctx: &mut C,
|
||||
input: InsnInput,
|
||||
) -> ResultRegImmShift {
|
||||
if let InsnInputSource::Output(out) = input_source(ctx, input) {
|
||||
if let Some(imm_value) = output_to_const(ctx, out) {
|
||||
if let Some(imm_value) = input_to_const(ctx, input) {
|
||||
if let Some(immshift) = ImmShift::maybe_from_u64(imm_value) {
|
||||
ctx.merged(out.insn);
|
||||
return ResultRegImmShift::ImmShift(immshift);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ResultRegImmShift::Reg(input_to_reg(ctx, input, NarrowValueMode::None))
|
||||
}
|
||||
@@ -823,24 +752,29 @@ pub(crate) fn inst_trapcode(data: &InstructionData) -> Option<TrapCode> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Checks for an instance of `op` feeding the given input. Marks as merged (decrementing refcount) if so.
|
||||
/// Checks for an instance of `op` feeding the given input.
|
||||
pub(crate) fn maybe_input_insn<C: LowerCtx<I = Inst>>(
|
||||
c: &mut C,
|
||||
input: InsnInput,
|
||||
op: Opcode,
|
||||
) -> Option<IRInst> {
|
||||
if let InsnInputSource::Output(out) = input_source(c, input) {
|
||||
let data = c.data(out.insn);
|
||||
let inputs = c.get_input(input.insn, input.input);
|
||||
debug!(
|
||||
"maybe_input_insn: input {:?} has options {:?}; looking for op {:?}",
|
||||
input, inputs, op
|
||||
);
|
||||
if let Some((src_inst, _)) = inputs.inst {
|
||||
let data = c.data(src_inst);
|
||||
debug!(" -> input inst {:?}", data);
|
||||
if data.opcode() == op {
|
||||
c.merged(out.insn);
|
||||
return Some(out.insn);
|
||||
return Some(src_inst);
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Checks for an instance of `op` feeding the given input, possibly via a conversion `conv` (e.g.,
|
||||
/// Bint or a bitcast). Marks one or both as merged if so, as appropriate.
|
||||
/// Bint or a bitcast).
|
||||
///
|
||||
/// FIXME cfallin 2020-03-30: this is really ugly. Factor out tree-matching stuff and make it
|
||||
/// a bit more generic.
|
||||
@@ -850,21 +784,19 @@ pub(crate) fn maybe_input_insn_via_conv<C: LowerCtx<I = Inst>>(
|
||||
op: Opcode,
|
||||
conv: Opcode,
|
||||
) -> Option<IRInst> {
|
||||
if let Some(ret) = maybe_input_insn(c, input, op) {
|
||||
return Some(ret);
|
||||
let inputs = c.get_input(input.insn, input.input);
|
||||
if let Some((src_inst, _)) = inputs.inst {
|
||||
let data = c.data(src_inst);
|
||||
if data.opcode() == op {
|
||||
return Some(src_inst);
|
||||
}
|
||||
|
||||
if let InsnInputSource::Output(out) = input_source(c, input) {
|
||||
let data = c.data(out.insn);
|
||||
if data.opcode() == conv {
|
||||
let conv_insn = out.insn;
|
||||
let conv_input = InsnInput {
|
||||
insn: conv_insn,
|
||||
input: 0,
|
||||
};
|
||||
if let Some(inner) = maybe_input_insn(c, conv_input, op) {
|
||||
c.merged(conv_insn);
|
||||
return Some(inner);
|
||||
let inputs = c.get_input(src_inst, 0);
|
||||
if let Some((src_inst, _)) = inputs.inst {
|
||||
let data = c.data(src_inst);
|
||||
if data.opcode() == op {
|
||||
return Some(src_inst);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -876,6 +808,7 @@ pub(crate) fn lower_icmp_or_ifcmp_to_flags<C: LowerCtx<I = Inst>>(
|
||||
insn: IRInst,
|
||||
is_signed: bool,
|
||||
) {
|
||||
debug!("lower_icmp_or_ifcmp_to_flags: insn {}", insn);
|
||||
let ty = ctx.input_ty(insn, 0);
|
||||
let bits = ty_bits(ty);
|
||||
let narrow_mode = match (bits <= 32, is_signed) {
|
||||
@@ -897,6 +830,7 @@ pub(crate) fn lower_icmp_or_ifcmp_to_flags<C: LowerCtx<I = Inst>>(
|
||||
let ty = ctx.input_ty(insn, 0);
|
||||
let rn = input_to_reg(ctx, inputs[0], narrow_mode);
|
||||
let rm = input_to_rse_imm12(ctx, inputs[1], narrow_mode);
|
||||
debug!("lower_icmp_or_ifcmp_to_flags: rn = {:?} rm = {:?}", rn, rm);
|
||||
let alu_op = choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64);
|
||||
let rd = writable_zero_reg();
|
||||
ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
|
||||
@@ -934,17 +868,21 @@ pub(crate) fn lower_fcmp_or_ffcmp_to_flags<C: LowerCtx<I = Inst>>(ctx: &mut C, i
|
||||
impl LowerBackend for AArch64Backend {
|
||||
type MInst = Inst;
|
||||
|
||||
fn lower<C: LowerCtx<I = Inst>>(&self, ctx: &mut C, ir_inst: IRInst) {
|
||||
lower_inst::lower_insn_to_regs(ctx, ir_inst);
|
||||
fn lower<C: LowerCtx<I = Inst>>(&self, ctx: &mut C, ir_inst: IRInst) -> CodegenResult<()> {
|
||||
lower_inst::lower_insn_to_regs(ctx, ir_inst)
|
||||
}
|
||||
|
||||
fn lower_branch_group<C: LowerCtx<I = Inst>>(
|
||||
&self,
|
||||
ctx: &mut C,
|
||||
branches: &[IRInst],
|
||||
targets: &[BlockIndex],
|
||||
fallthrough: Option<BlockIndex>,
|
||||
) {
|
||||
targets: &[MachLabel],
|
||||
fallthrough: Option<MachLabel>,
|
||||
) -> CodegenResult<()> {
|
||||
lower_inst::lower_branch(ctx, branches, targets, fallthrough)
|
||||
}
|
||||
|
||||
fn maybe_pinned_reg(&self) -> Option<Reg> {
|
||||
Some(xreg(PINNED_REG))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,11 +1,13 @@
|
||||
//! Lower a single Cranelift instruction into vcode.
|
||||
|
||||
use crate::binemit::CodeOffset;
|
||||
use crate::ir::condcodes::FloatCC;
|
||||
use crate::ir::types::*;
|
||||
use crate::ir::Inst as IRInst;
|
||||
use crate::ir::{InstructionData, Opcode, TrapCode};
|
||||
use crate::machinst::lower::*;
|
||||
use crate::machinst::*;
|
||||
use crate::CodegenResult;
|
||||
|
||||
use crate::isa::aarch64::abi::*;
|
||||
use crate::isa::aarch64::inst::*;
|
||||
@@ -19,7 +21,10 @@ use smallvec::SmallVec;
|
||||
use super::lower::*;
|
||||
|
||||
/// Actually codegen an instruction's results into registers.
|
||||
pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRInst) {
|
||||
pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
||||
ctx: &mut C,
|
||||
insn: IRInst,
|
||||
) -> CodegenResult<()> {
|
||||
let op = ctx.data(insn).opcode();
|
||||
let inputs: SmallVec<[InsnInput; 4]> = (0..ctx.num_inputs(insn))
|
||||
.map(|i| InsnInput { insn, input: i })
|
||||
@@ -35,17 +40,17 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
|
||||
|
||||
match op {
|
||||
Opcode::Iconst | Opcode::Bconst | Opcode::Null => {
|
||||
let value = output_to_const(ctx, outputs[0]).unwrap();
|
||||
let value = ctx.get_constant(insn).unwrap();
|
||||
let rd = output_to_reg(ctx, outputs[0]);
|
||||
lower_constant_u64(ctx, rd, value);
|
||||
}
|
||||
Opcode::F32const => {
|
||||
let value = output_to_const_f32(ctx, outputs[0]).unwrap();
|
||||
let value = f32::from_bits(ctx.get_constant(insn).unwrap() as u32);
|
||||
let rd = output_to_reg(ctx, outputs[0]);
|
||||
lower_constant_f32(ctx, rd, value);
|
||||
}
|
||||
Opcode::F64const => {
|
||||
let value = output_to_const_f64(ctx, outputs[0]).unwrap();
|
||||
let value = f64::from_bits(ctx.get_constant(insn).unwrap());
|
||||
let rd = output_to_reg(ctx, outputs[0]);
|
||||
lower_constant_f64(ctx, rd, value);
|
||||
}
|
||||
@@ -271,7 +276,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
|
||||
|
||||
// Check for divide by 0.
|
||||
let branch_size = 8;
|
||||
ctx.emit(Inst::CondBrLowered {
|
||||
ctx.emit(Inst::OneWayCondBr {
|
||||
target: BranchTarget::ResolvedOffset(branch_size),
|
||||
kind: CondBrKind::NotZero(rm),
|
||||
});
|
||||
@@ -297,7 +302,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
|
||||
|
||||
// Check for divide by 0.
|
||||
let branch_size = 20;
|
||||
ctx.emit(Inst::CondBrLowered {
|
||||
ctx.emit(Inst::OneWayCondBr {
|
||||
target: BranchTarget::ResolvedOffset(branch_size),
|
||||
kind: CondBrKind::Zero(rm),
|
||||
});
|
||||
@@ -324,7 +329,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
|
||||
nzcv: NZCV::new(false, false, false, false),
|
||||
cond: Cond::Eq,
|
||||
});
|
||||
ctx.emit(Inst::CondBrLowered {
|
||||
ctx.emit(Inst::OneWayCondBr {
|
||||
target: BranchTarget::ResolvedOffset(12),
|
||||
kind: CondBrKind::Cond(Cond::Vc),
|
||||
});
|
||||
@@ -337,7 +342,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
|
||||
|
||||
// Check for divide by 0.
|
||||
let branch_size = 8;
|
||||
ctx.emit(Inst::CondBrLowered {
|
||||
ctx.emit(Inst::OneWayCondBr {
|
||||
target: BranchTarget::ResolvedOffset(branch_size),
|
||||
kind: CondBrKind::NotZero(rm),
|
||||
});
|
||||
@@ -1211,7 +1216,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
|
||||
// Branch around the break instruction with inverted cond. Go straight to lowered
|
||||
// one-target form; this is logically part of a single-in single-out template lowering.
|
||||
let cond = cond.invert();
|
||||
ctx.emit(Inst::CondBrLowered {
|
||||
ctx.emit(Inst::OneWayCondBr {
|
||||
target: BranchTarget::ResolvedOffset(8),
|
||||
kind: CondBrKind::Cond(cond),
|
||||
});
|
||||
@@ -1301,11 +1306,12 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
|
||||
|
||||
Opcode::GetPinnedReg => {
|
||||
let rd = output_to_reg(ctx, outputs[0]);
|
||||
ctx.emit(Inst::GetPinnedReg { rd });
|
||||
ctx.emit(Inst::mov(rd, xreg(PINNED_REG)));
|
||||
}
|
||||
|
||||
Opcode::SetPinnedReg => {
|
||||
let rm = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
|
||||
ctx.emit(Inst::SetPinnedReg { rm });
|
||||
ctx.emit(Inst::mov(writable_xreg(PINNED_REG), rm));
|
||||
}
|
||||
|
||||
Opcode::Spill
|
||||
@@ -1533,7 +1539,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
|
||||
} else {
|
||||
ctx.emit(Inst::FpuCmp64 { rn, rm: rn });
|
||||
}
|
||||
ctx.emit(Inst::CondBrLowered {
|
||||
ctx.emit(Inst::OneWayCondBr {
|
||||
target: BranchTarget::ResolvedOffset(8),
|
||||
kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::Ordered)),
|
||||
});
|
||||
@@ -1574,7 +1580,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
|
||||
rn,
|
||||
rm: tmp.to_reg(),
|
||||
});
|
||||
ctx.emit(Inst::CondBrLowered {
|
||||
ctx.emit(Inst::OneWayCondBr {
|
||||
target: BranchTarget::ResolvedOffset(8),
|
||||
kind: CondBrKind::Cond(lower_fp_condcode(low_cond)),
|
||||
});
|
||||
@@ -1587,7 +1593,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
|
||||
rn,
|
||||
rm: tmp.to_reg(),
|
||||
});
|
||||
ctx.emit(Inst::CondBrLowered {
|
||||
ctx.emit(Inst::OneWayCondBr {
|
||||
target: BranchTarget::ResolvedOffset(8),
|
||||
kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::LessThan)),
|
||||
});
|
||||
@@ -1617,7 +1623,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
|
||||
rn,
|
||||
rm: tmp.to_reg(),
|
||||
});
|
||||
ctx.emit(Inst::CondBrLowered {
|
||||
ctx.emit(Inst::OneWayCondBr {
|
||||
target: BranchTarget::ResolvedOffset(8),
|
||||
kind: CondBrKind::Cond(lower_fp_condcode(low_cond)),
|
||||
});
|
||||
@@ -1630,7 +1636,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
|
||||
rn,
|
||||
rm: tmp.to_reg(),
|
||||
});
|
||||
ctx.emit(Inst::CondBrLowered {
|
||||
ctx.emit(Inst::OneWayCondBr {
|
||||
target: BranchTarget::ResolvedOffset(8),
|
||||
kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::LessThan)),
|
||||
});
|
||||
@@ -1862,14 +1868,16 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
|
||||
Opcode::AvgRound => unimplemented!(),
|
||||
Opcode::TlsValue => unimplemented!(),
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
|
||||
ctx: &mut C,
|
||||
branches: &[IRInst],
|
||||
targets: &[BlockIndex],
|
||||
fallthrough: Option<BlockIndex>,
|
||||
) {
|
||||
targets: &[MachLabel],
|
||||
fallthrough: Option<MachLabel>,
|
||||
) -> CodegenResult<()> {
|
||||
// A block should end with at most two branches. The first may be a
|
||||
// conditional branch; a conditional branch can be followed only by an
|
||||
// unconditional branch or fallthrough. Otherwise, if only one branch,
|
||||
@@ -1883,18 +1891,14 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
|
||||
let op0 = ctx.data(branches[0]).opcode();
|
||||
let op1 = ctx.data(branches[1]).opcode();
|
||||
|
||||
//println!(
|
||||
// "lowering two-branch group: opcodes are {:?} and {:?}",
|
||||
// op0, op1
|
||||
//);
|
||||
|
||||
assert!(op1 == Opcode::Jump || op1 == Opcode::Fallthrough);
|
||||
let taken = BranchTarget::Block(targets[0]);
|
||||
let taken = BranchTarget::Label(targets[0]);
|
||||
let not_taken = match op1 {
|
||||
Opcode::Jump => BranchTarget::Block(targets[1]),
|
||||
Opcode::Fallthrough => BranchTarget::Block(fallthrough.unwrap()),
|
||||
Opcode::Jump => BranchTarget::Label(targets[1]),
|
||||
Opcode::Fallthrough => BranchTarget::Label(fallthrough.unwrap()),
|
||||
_ => unreachable!(), // assert above.
|
||||
};
|
||||
|
||||
match op0 {
|
||||
Opcode::Brz | Opcode::Brnz => {
|
||||
let flag_input = InsnInput {
|
||||
@@ -1954,6 +1958,8 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
|
||||
Opcode::BrIcmp => {
|
||||
let condcode = inst_condcode(ctx.data(branches[0])).unwrap();
|
||||
let cond = lower_condcode(condcode);
|
||||
let kind = CondBrKind::Cond(cond);
|
||||
|
||||
let is_signed = condcode_is_signed(condcode);
|
||||
let ty = ctx.input_ty(branches[0], 0);
|
||||
let bits = ty_bits(ty);
|
||||
@@ -1986,13 +1992,15 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
|
||||
ctx.emit(Inst::CondBr {
|
||||
taken,
|
||||
not_taken,
|
||||
kind: CondBrKind::Cond(cond),
|
||||
kind,
|
||||
});
|
||||
}
|
||||
|
||||
Opcode::Brif => {
|
||||
let condcode = inst_condcode(ctx.data(branches[0])).unwrap();
|
||||
let cond = lower_condcode(condcode);
|
||||
let kind = CondBrKind::Cond(cond);
|
||||
|
||||
let is_signed = condcode_is_signed(condcode);
|
||||
let flag_input = InsnInput {
|
||||
insn: branches[0],
|
||||
@@ -2003,7 +2011,7 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
|
||||
ctx.emit(Inst::CondBr {
|
||||
taken,
|
||||
not_taken,
|
||||
kind: CondBrKind::Cond(cond),
|
||||
kind,
|
||||
});
|
||||
} else {
|
||||
// If the ifcmp result is actually placed in a
|
||||
@@ -2013,7 +2021,7 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
|
||||
ctx.emit(Inst::CondBr {
|
||||
taken,
|
||||
not_taken,
|
||||
kind: CondBrKind::Cond(cond),
|
||||
kind,
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -2021,6 +2029,7 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
|
||||
Opcode::Brff => {
|
||||
let condcode = inst_fp_condcode(ctx.data(branches[0])).unwrap();
|
||||
let cond = lower_fp_condcode(condcode);
|
||||
let kind = CondBrKind::Cond(cond);
|
||||
let flag_input = InsnInput {
|
||||
insn: branches[0],
|
||||
input: 0,
|
||||
@@ -2030,7 +2039,7 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
|
||||
ctx.emit(Inst::CondBr {
|
||||
taken,
|
||||
not_taken,
|
||||
kind: CondBrKind::Cond(cond),
|
||||
kind,
|
||||
});
|
||||
} else {
|
||||
// If the ffcmp result is actually placed in a
|
||||
@@ -2040,7 +2049,7 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
|
||||
ctx.emit(Inst::CondBr {
|
||||
taken,
|
||||
not_taken,
|
||||
kind: CondBrKind::Cond(cond),
|
||||
kind,
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -2057,12 +2066,13 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
|
||||
// fills in `targets[0]` with our fallthrough block, so this
|
||||
// is valid for both Jump and Fallthrough.
|
||||
ctx.emit(Inst::Jump {
|
||||
dest: BranchTarget::Block(targets[0]),
|
||||
dest: BranchTarget::Label(targets[0]),
|
||||
});
|
||||
}
|
||||
Opcode::BrTable => {
|
||||
// Expand `br_table index, default, JT` to:
|
||||
//
|
||||
// (emit island with guard jump if needed)
|
||||
// subs idx, #jt_size
|
||||
// b.hs default
|
||||
// adr vTmp1, PC+16
|
||||
@@ -2072,6 +2082,11 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
|
||||
// [jumptable offsets relative to JT base]
|
||||
let jt_size = targets.len() - 1;
|
||||
assert!(jt_size <= std::u32::MAX as usize);
|
||||
|
||||
ctx.emit(Inst::EmitIsland {
|
||||
needed_space: 4 * (6 + jt_size) as CodeOffset,
|
||||
});
|
||||
|
||||
let ridx = input_to_reg(
|
||||
ctx,
|
||||
InsnInput {
|
||||
@@ -2101,10 +2116,10 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
|
||||
rm: rtmp1.to_reg(),
|
||||
});
|
||||
}
|
||||
let default_target = BranchTarget::Block(targets[0]);
|
||||
ctx.emit(Inst::CondBrLowered {
|
||||
kind: CondBrKind::Cond(Cond::Hs), // unsigned >=
|
||||
let default_target = BranchTarget::Label(targets[0]);
|
||||
ctx.emit(Inst::OneWayCondBr {
|
||||
target: default_target.clone(),
|
||||
kind: CondBrKind::Cond(Cond::Hs), // unsigned >=
|
||||
});
|
||||
|
||||
// Emit the compound instruction that does:
|
||||
@@ -2125,9 +2140,9 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
|
||||
let jt_targets: Vec<BranchTarget> = targets
|
||||
.iter()
|
||||
.skip(1)
|
||||
.map(|bix| BranchTarget::Block(*bix))
|
||||
.map(|bix| BranchTarget::Label(*bix))
|
||||
.collect();
|
||||
let targets_for_term: Vec<BlockIndex> = targets.to_vec();
|
||||
let targets_for_term: Vec<MachLabel> = targets.to_vec();
|
||||
ctx.emit(Inst::JTSequence {
|
||||
ridx,
|
||||
rtmp1,
|
||||
@@ -2140,4 +2155,6 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
|
||||
_ => panic!("Unknown branch type!"),
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -15,7 +15,7 @@ use target_lexicon::{Aarch64Architecture, Architecture, Triple};
|
||||
|
||||
// New backend:
|
||||
mod abi;
|
||||
mod inst;
|
||||
pub(crate) mod inst;
|
||||
mod lower;
|
||||
mod lower_inst;
|
||||
|
||||
@@ -59,7 +59,7 @@ impl MachBackend for AArch64Backend {
|
||||
) -> CodegenResult<MachCompileResult> {
|
||||
let flags = self.flags();
|
||||
let vcode = self.compile_vcode(func, flags.clone())?;
|
||||
let sections = vcode.emit();
|
||||
let buffer = vcode.emit();
|
||||
let frame_size = vcode.frame_size();
|
||||
|
||||
let disasm = if want_disasm {
|
||||
@@ -68,8 +68,10 @@ impl MachBackend for AArch64Backend {
|
||||
None
|
||||
};
|
||||
|
||||
let buffer = buffer.finish();
|
||||
|
||||
Ok(MachCompileResult {
|
||||
sections,
|
||||
buffer,
|
||||
frame_size,
|
||||
disasm,
|
||||
})
|
||||
@@ -140,8 +142,8 @@ mod test {
|
||||
Triple::from_str("aarch64").unwrap(),
|
||||
settings::Flags::new(shared_flags),
|
||||
);
|
||||
let sections = backend.compile_function(&mut func, false).unwrap().sections;
|
||||
let code = &sections.sections[0].data;
|
||||
let buffer = backend.compile_function(&mut func, false).unwrap().buffer;
|
||||
let code = &buffer.data[..];
|
||||
|
||||
// stp x29, x30, [sp, #-16]!
|
||||
// mov x29, sp
|
||||
@@ -155,7 +157,7 @@ mod test {
|
||||
0x01, 0x0b, 0xbf, 0x03, 0x00, 0x91, 0xfd, 0x7b, 0xc1, 0xa8, 0xc0, 0x03, 0x5f, 0xd6,
|
||||
];
|
||||
|
||||
assert_eq!(code, &golden);
|
||||
assert_eq!(code, &golden[..]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -198,34 +200,32 @@ mod test {
|
||||
let result = backend
|
||||
.compile_function(&mut func, /* want_disasm = */ false)
|
||||
.unwrap();
|
||||
let code = &result.sections.sections[0].data;
|
||||
let code = &result.buffer.data[..];
|
||||
|
||||
// stp x29, x30, [sp, #-16]!
|
||||
// mov x29, sp
|
||||
// mov x1, x0
|
||||
// mov x0, #0x1234
|
||||
// add w1, w1, w0
|
||||
// mov w2, w1
|
||||
// cbz x2, ...
|
||||
// mov w2, w1
|
||||
// cbz x2, ...
|
||||
// sub w0, w1, w0
|
||||
// mov x1, #0x1234 // #4660
|
||||
// add w0, w0, w1
|
||||
// mov w1, w0
|
||||
// cbnz x1, 0x28
|
||||
// mov x1, #0x1234 // #4660
|
||||
// add w1, w0, w1
|
||||
// mov w1, w1
|
||||
// cbnz x1, 0x18
|
||||
// mov w1, w0
|
||||
// cbnz x1, 0x18
|
||||
// mov x1, #0x1234 // #4660
|
||||
// sub w0, w0, w1
|
||||
// mov sp, x29
|
||||
// ldp x29, x30, [sp], #16
|
||||
// ret
|
||||
// add w2, w1, w0
|
||||
// mov w2, w2
|
||||
// cbnz x2, ... <---- compound branch (cond / uncond)
|
||||
// b ... <----
|
||||
|
||||
let golden = vec![
|
||||
0xfd, 0x7b, 0xbf, 0xa9, 0xfd, 0x03, 0x00, 0x91, 0xe1, 0x03, 0x00, 0xaa, 0x80, 0x46,
|
||||
0x82, 0xd2, 0x21, 0x00, 0x00, 0x0b, 0xe2, 0x03, 0x01, 0x2a, 0xe2, 0x00, 0x00, 0xb4,
|
||||
0xe2, 0x03, 0x01, 0x2a, 0xa2, 0x00, 0x00, 0xb5, 0x20, 0x00, 0x00, 0x4b, 0xbf, 0x03,
|
||||
0x00, 0x91, 0xfd, 0x7b, 0xc1, 0xa8, 0xc0, 0x03, 0x5f, 0xd6, 0x22, 0x00, 0x00, 0x0b,
|
||||
0xe2, 0x03, 0x02, 0x2a, 0xc2, 0xff, 0xff, 0xb5, 0xf7, 0xff, 0xff, 0x17,
|
||||
253, 123, 191, 169, 253, 3, 0, 145, 129, 70, 130, 210, 0, 0, 1, 11, 225, 3, 0, 42, 161,
|
||||
0, 0, 181, 129, 70, 130, 210, 1, 0, 1, 11, 225, 3, 1, 42, 161, 255, 255, 181, 225, 3,
|
||||
0, 42, 97, 255, 255, 181, 129, 70, 130, 210, 0, 0, 1, 75, 191, 3, 0, 145, 253, 123,
|
||||
193, 168, 192, 3, 95, 214,
|
||||
];
|
||||
|
||||
assert_eq!(code, &golden);
|
||||
assert_eq!(code, &golden[..]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -77,14 +77,14 @@ mod riscv;
|
||||
#[cfg(feature = "x86")]
|
||||
mod x86;
|
||||
|
||||
#[cfg(feature = "x64")]
|
||||
mod x64;
|
||||
//#[cfg(feature = "x64")]
|
||||
//mod x64;
|
||||
|
||||
#[cfg(feature = "arm32")]
|
||||
mod arm32;
|
||||
|
||||
#[cfg(feature = "arm64")]
|
||||
mod aarch64;
|
||||
pub(crate) mod aarch64;
|
||||
|
||||
#[cfg(feature = "unwind")]
|
||||
pub mod unwind;
|
||||
|
||||
@@ -57,11 +57,11 @@ fn isa_constructor(
|
||||
let isa_flags = settings::Flags::new(&shared_flags, builder);
|
||||
|
||||
if isa_flags.use_new_backend() {
|
||||
#[cfg(not(feature = "x64"))]
|
||||
//#[cfg(not(feature = "x64"))]
|
||||
panic!("new backend x86 support not included by cargo features!");
|
||||
|
||||
#[cfg(feature = "x64")]
|
||||
super::x64::isa_builder(triple).finish(shared_flags)
|
||||
//#[cfg(feature = "x64")]
|
||||
//super::x64::isa_builder(triple).finish(shared_flags)
|
||||
} else {
|
||||
Box::new(Isa {
|
||||
triple,
|
||||
|
||||
@@ -99,7 +99,6 @@ mod iterators;
|
||||
mod legalizer;
|
||||
mod licm;
|
||||
mod nan_canonicalization;
|
||||
mod num_uses;
|
||||
mod partition_slice;
|
||||
mod postopt;
|
||||
mod predicates;
|
||||
|
||||
@@ -1,49 +1,579 @@
|
||||
//! Computation of basic block order in emitted code.
|
||||
//!
|
||||
//! This module handles the translation from CLIF BBs to VCode BBs.
|
||||
//!
|
||||
//! The basic idea is that we compute a sequence of "lowered blocks" that
|
||||
//! correspond to subgraphs of the CLIF CFG plus an implicit block on *every*
|
||||
//! edge (not just critical edges). Conceptually, the lowering pipeline wants to
|
||||
//! insert moves for phi-nodes on every block-to-block transfer; these blocks
|
||||
//! always conceptually exist, but may be merged with an "original" CLIF block
|
||||
//! (and hence not actually exist; this is equivalent to inserting the blocks
|
||||
//! only on critical edges).
|
||||
//!
|
||||
//! Each `LoweredBlock` names just an original CLIF block, an original CLIF
|
||||
//! block prepended or appended with an edge block (never both, though), or just
|
||||
//! an edge block.
|
||||
//!
|
||||
//! To compute this lowering, we do a DFS over the CLIF-plus-edge-block graph
|
||||
//! (never actually materialized, just defined by a "successors" function), and
|
||||
//! compute the reverse postorder.
|
||||
//!
|
||||
//! This algorithm isn't perfect w.r.t. generated code quality: we don't, for
|
||||
//! example, consider any information about whether edge blocks will actually
|
||||
//! have content, because this computation happens as part of lowering *before*
|
||||
//! regalloc, and regalloc may or may not insert moves/spills/reloads on any
|
||||
//! particular edge. But it works relatively well and is conceptually simple.
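//!
//! As a small worked example (the same shape exercised by
//! `test_blockorder_diamond` at the bottom of this module): for a diamond CFG
//! `block0 -> {block1, block2} -> block3`, the lowered order is conceptually
//! `Orig(block0)`, `EdgeAndOrig(block0->block1, block1)`,
//! `Edge(block1->block3)`, `EdgeAndOrig(block0->block2, block2)`,
//! `Edge(block2->block3)`, `Orig(block3)`: edges into single-predecessor
//! blocks are merged into those blocks, while the two edges into `block3`
//! must remain standalone edge blocks.
//!
//! A minimal usage sketch (illustrative only; `func` is a previously-built
//! CLIF `Function`):
//!
//! ```ignore
//! let order = BlockLoweringOrder::new(&func);
//! for (i, lb) in order.lowered_order().iter().enumerate() {
//!     // Each lowered block is an original CLIF block, an edge block, or a merge.
//!     println!("vcode block {}: {:?}", i, lb);
//! }
//! ```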
|
||||
|
||||
use crate::entity::SecondaryMap;
|
||||
use crate::fx::{FxHashMap, FxHashSet};
|
||||
use crate::ir::{Block, Function, Inst, Opcode};
|
||||
use crate::machinst::lower::visit_block_succs;
|
||||
use crate::machinst::*;
|
||||
|
||||
/// Simple reverse postorder-based block order emission.
|
||||
///
|
||||
/// TODO: use a proper algorithm, such as the bottom-up straight-line-section
|
||||
/// construction algorithm.
|
||||
struct BlockRPO {
|
||||
visited: Vec<bool>,
|
||||
postorder: Vec<BlockIndex>,
|
||||
use log::debug;
|
||||
use smallvec::SmallVec;
|
||||
|
||||
/// Mapping from CLIF BBs to VCode BBs.
|
||||
#[derive(Debug)]
|
||||
pub struct BlockLoweringOrder {
|
||||
/// Lowered blocks, in BlockIndex order. Each block is some combination of
|
||||
/// (i) a CLIF block, and (ii) inserted crit-edge blocks before or after;
|
||||
/// see [LoweredBlock] for details.
|
||||
lowered_order: Vec<LoweredBlock>,
|
||||
/// Successors for all lowered blocks, in one serialized vector. Indexed by
|
||||
/// the ranges in `lowered_succ_ranges`.
|
||||
lowered_succs: Vec<(Inst, LoweredBlock)>,
|
||||
/// BlockIndex values for successors for all lowered blocks, in the same
|
||||
/// order as `lowered_succs`.
|
||||
lowered_succ_indices: Vec<(Inst, BlockIndex)>,
|
||||
/// Ranges in `lowered_succs` giving the successor lists for each lowered
|
||||
/// block. Indexed by lowering-order index (`BlockIndex`).
|
||||
lowered_succ_ranges: Vec<(usize, usize)>,
|
||||
/// Mapping from CLIF BB to BlockIndex (index in lowered order). Note that
|
||||
/// some CLIF BBs may not be lowered; in particular, we skip unreachable
|
||||
/// blocks.
|
||||
orig_map: SecondaryMap<Block, Option<BlockIndex>>,
|
||||
}
|
||||
|
||||
impl BlockRPO {
|
||||
fn new<I: VCodeInst>(vcode: &VCode<I>) -> BlockRPO {
|
||||
BlockRPO {
|
||||
visited: vec![false; vcode.num_blocks()],
|
||||
postorder: Vec::with_capacity(vcode.num_blocks()),
|
||||
/// The origin of a block in the lowered block-order: either an original CLIF
|
||||
/// block, or an inserted edge-block, or a combination of the two if an edge is
|
||||
/// non-critical.
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
|
||||
pub enum LoweredBlock {
|
||||
/// Block in original CLIF, with no merged edge-blocks.
|
||||
Orig {
|
||||
/// Original CLIF block.
|
||||
block: Block,
|
||||
},
|
||||
/// Block in the original CLIF, plus edge-block to one succ (which is the
|
||||
/// one successor of the original block).
|
||||
OrigAndEdge {
|
||||
/// The original CLIF block contained in this lowered block.
|
||||
block: Block,
|
||||
/// The edge (jump) instruction transitioning from this block
|
||||
/// to the next, i.e., corresponding to the included edge-block. This
|
||||
/// will be an instruction in `block`.
|
||||
edge_inst: Inst,
|
||||
/// The successor CLIF block.
|
||||
succ: Block,
|
||||
},
|
||||
/// Block in the original CLIF, preceded by edge-block from one pred (which
|
||||
/// is the one pred of the original block).
|
||||
EdgeAndOrig {
|
||||
/// The previous CLIF block, i.e., the edge block's predecessor.
|
||||
pred: Block,
|
||||
/// The edge (jump) instruction corresponding to the included
|
||||
/// edge-block. This will be an instruction in `pred`.
|
||||
edge_inst: Inst,
|
||||
/// The original CLIF block included in this lowered block.
|
||||
block: Block,
|
||||
},
|
||||
/// Split critical edge between two CLIF blocks. This lowered block does not
|
||||
/// correspond to any original CLIF blocks; it only serves as an insertion
|
||||
/// point for work to happen on the transition from `pred` to `succ`.
|
||||
Edge {
|
||||
/// The predecessor CLIF block.
|
||||
pred: Block,
|
||||
/// The edge (jump) instruction corresponding to this edge's transition.
|
||||
/// This will be an instruction in `pred`.
|
||||
edge_inst: Inst,
|
||||
/// The successor CLIF block.
|
||||
succ: Block,
|
||||
},
|
||||
}
|
||||
|
||||
impl LoweredBlock {
|
||||
/// The associated original (CLIF) block included in this lowered block, if
|
||||
/// any.
|
||||
pub fn orig_block(self) -> Option<Block> {
|
||||
match self {
|
||||
LoweredBlock::Orig { block, .. }
|
||||
| LoweredBlock::OrigAndEdge { block, .. }
|
||||
| LoweredBlock::EdgeAndOrig { block, .. } => Some(block),
|
||||
LoweredBlock::Edge { .. } => None,
|
||||
}
|
||||
}
|
||||
|
||||
fn visit<I: VCodeInst>(&mut self, vcode: &VCode<I>, block: BlockIndex) {
|
||||
self.visited[block as usize] = true;
|
||||
for succ in vcode.succs(block) {
|
||||
if !self.visited[succ.get() as usize] {
|
||||
self.visit(vcode, succ.get());
|
||||
}
|
||||
}
|
||||
if Some(block) != vcode.fallthrough_return_block {
|
||||
self.postorder.push(block);
|
||||
/// The associated in-edge, if any.
|
||||
pub fn in_edge(self) -> Option<(Block, Inst, Block)> {
|
||||
match self {
|
||||
LoweredBlock::EdgeAndOrig {
|
||||
pred,
|
||||
edge_inst,
|
||||
block,
|
||||
} => Some((pred, edge_inst, block)),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
fn rpo<I: VCodeInst>(self, vcode: &VCode<I>) -> Vec<BlockIndex> {
|
||||
let mut rpo = self.postorder;
|
||||
rpo.reverse();
|
||||
if let Some(block) = vcode.fallthrough_return_block {
|
||||
rpo.push(block);
|
||||
/// The associated out-edge, if any. Also includes edge-only blocks.
|
||||
pub fn out_edge(self) -> Option<(Block, Inst, Block)> {
|
||||
match self {
|
||||
LoweredBlock::OrigAndEdge {
|
||||
block,
|
||||
edge_inst,
|
||||
succ,
|
||||
} => Some((block, edge_inst, succ)),
|
||||
LoweredBlock::Edge {
|
||||
pred,
|
||||
edge_inst,
|
||||
succ,
|
||||
} => Some((pred, edge_inst, succ)),
|
||||
_ => None,
|
||||
}
|
||||
rpo
|
||||
}
|
||||
}
|
||||
|
||||
/// Compute the final block order.
|
||||
pub fn compute_final_block_order<I: VCodeInst>(vcode: &VCode<I>) -> Vec<BlockIndex> {
|
||||
let mut rpo = BlockRPO::new(vcode);
|
||||
rpo.visit(vcode, vcode.entry());
|
||||
rpo.rpo(vcode)
|
||||
impl BlockLoweringOrder {
|
||||
/// Compute and return a lowered block order for `f`.
|
||||
pub fn new(f: &Function) -> BlockLoweringOrder {
|
||||
debug!("BlockLoweringOrder: function body {:?}", f);
|
||||
|
||||
// Step 1: compute the in-edge and out-edge count of every block.
|
||||
let mut block_in_count = SecondaryMap::with_default(0);
|
||||
let mut block_out_count = SecondaryMap::with_default(0);
|
||||
|
||||
// Cache the block successors to avoid re-examining branches below.
|
||||
let mut block_succs: SmallVec<[(Inst, Block); 128]> = SmallVec::new();
|
||||
let mut block_succ_range = SecondaryMap::with_default((0, 0));
|
||||
let mut fallthrough_return_block = None;
|
||||
for block in f.layout.blocks() {
|
||||
let block_succ_start = block_succs.len();
|
||||
visit_block_succs(f, block, |inst, succ| {
|
||||
block_out_count[block] += 1;
|
||||
block_in_count[succ] += 1;
|
||||
block_succs.push((inst, succ));
|
||||
});
|
||||
let block_succ_end = block_succs.len();
|
||||
block_succ_range[block] = (block_succ_start, block_succ_end);
|
||||
|
||||
for inst in f.layout.block_likely_branches(block) {
|
||||
if f.dfg[inst].opcode() == Opcode::Return {
|
||||
// Implicit output edge for any return.
|
||||
block_out_count[block] += 1;
|
||||
}
|
||||
if f.dfg[inst].opcode() == Opcode::FallthroughReturn {
|
||||
// Fallthrough return block must come last.
|
||||
debug_assert!(fallthrough_return_block == None);
|
||||
fallthrough_return_block = Some(block);
|
||||
}
|
||||
}
|
||||
}
|
||||
// Implicit input edge for entry block.
|
||||
if let Some(entry) = f.layout.entry_block() {
|
||||
block_in_count[entry] += 1;
|
||||
}
|
||||
|
||||
// Here we define the implicit CLIF-plus-edges graph. There are
|
||||
// conceptually two such graphs: the original, with every edge explicit,
|
||||
// and the merged one, with blocks (represented by `LoweredBlock`
|
||||
// values) that contain original CLIF blocks, edges, or both. This
|
||||
// function returns a lowered block's successors as per the latter, with
|
||||
// consideration to edge-block merging.
|
||||
//
|
||||
// Note that there is a property of the block-merging rules below
|
||||
// that is very important to ensure we don't miss any lowered blocks:
|
||||
// any block in the implicit CLIF-plus-edges graph will *only* be
|
||||
// included in one block in the merged graph.
|
||||
//
|
||||
// This, combined with the property that every edge block is reachable
|
||||
// only from one predecessor (and hence cannot be reached by a DFS
|
||||
// backedge), means that it is sufficient in our DFS below to track
|
||||
// visited-bits per original CLIF block only, not per edge. This greatly
|
||||
// simplifies the data structures (no need to keep a sparse hash-set of
|
||||
// (block, block) tuples).
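// As a concrete (illustrative) instance of the rules implemented just below:
// if block A ends in a two-way branch to B and C, where B has only A as a
// predecessor but C has several predecessors, then A's successor list
// contains `EdgeAndOrig { pred: A, block: B, .. }` (the A->B edge block is
// merged into B) and `Edge { pred: A, succ: C, .. }` (the A->C edge block
// stays separate).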
|
||||
let compute_lowered_succs = |ret: &mut Vec<(Inst, LoweredBlock)>, block: LoweredBlock| {
|
||||
let start_idx = ret.len();
|
||||
match block {
|
||||
LoweredBlock::Orig { block } | LoweredBlock::EdgeAndOrig { block, .. } => {
|
||||
// At an orig block; successors are always edge blocks,
|
||||
// possibly with orig blocks following.
|
||||
let range = block_succ_range[block];
|
||||
for &(edge_inst, succ) in &block_succs[range.0..range.1] {
|
||||
if block_in_count[succ] == 1 {
|
||||
ret.push((
|
||||
edge_inst,
|
||||
LoweredBlock::EdgeAndOrig {
|
||||
pred: block,
|
||||
edge_inst,
|
||||
block: succ,
|
||||
},
|
||||
));
|
||||
} else {
|
||||
ret.push((
|
||||
edge_inst,
|
||||
LoweredBlock::Edge {
|
||||
pred: block,
|
||||
edge_inst,
|
||||
succ,
|
||||
},
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
LoweredBlock::Edge {
|
||||
succ, edge_inst, ..
|
||||
}
|
||||
| LoweredBlock::OrigAndEdge {
|
||||
succ, edge_inst, ..
|
||||
} => {
|
||||
// At an edge block; successors are always orig blocks,
|
||||
// possibly with edge blocks following.
|
||||
if block_out_count[succ] == 1 {
|
||||
let range = block_succ_range[succ];
|
||||
// check if the one succ is a real CFG edge (vs.
|
||||
// implicit return succ).
|
||||
if range.1 - range.0 > 0 {
|
||||
debug_assert!(range.1 - range.0 == 1);
|
||||
let (succ_edge_inst, succ_succ) = block_succs[range.0];
|
||||
ret.push((
|
||||
edge_inst,
|
||||
LoweredBlock::OrigAndEdge {
|
||||
block: succ,
|
||||
edge_inst: succ_edge_inst,
|
||||
succ: succ_succ,
|
||||
},
|
||||
));
|
||||
} else {
|
||||
ret.push((edge_inst, LoweredBlock::Orig { block: succ }));
|
||||
}
|
||||
} else {
|
||||
ret.push((edge_inst, LoweredBlock::Orig { block: succ }));
|
||||
}
|
||||
}
|
||||
}
|
||||
let end_idx = ret.len();
|
||||
(start_idx, end_idx)
|
||||
};
|
||||
|
||||
// Build the explicit LoweredBlock-to-LoweredBlock successors list.
|
||||
let mut lowered_succs = vec![];
|
||||
let mut lowered_succ_indices = vec![];
|
||||
|
||||
// Step 2: Compute RPO traversal of the implicit CLIF-plus-edge-block graph. Use an
|
||||
// explicit stack so we don't overflow the real stack with a deep DFS.
|
||||
#[derive(Debug)]
|
||||
struct StackEntry {
|
||||
this: LoweredBlock,
|
||||
succs: (usize, usize), // range in lowered_succs
|
||||
cur_succ: usize, // index in lowered_succs
|
||||
}
|
||||
|
||||
let mut stack: SmallVec<[StackEntry; 16]> = SmallVec::new();
|
||||
let mut visited = FxHashSet::default();
|
||||
let mut postorder = vec![];
|
||||
if let Some(entry) = f.layout.entry_block() {
|
||||
// FIXME(cfallin): we might be able to use OrigAndEdge. Find a way
|
||||
// to not special-case the entry block here.
|
||||
let block = LoweredBlock::Orig { block: entry };
|
||||
visited.insert(block);
|
||||
let range = compute_lowered_succs(&mut lowered_succs, block);
|
||||
lowered_succ_indices.resize(lowered_succs.len(), 0);
|
||||
stack.push(StackEntry {
|
||||
this: block,
|
||||
succs: range,
|
||||
cur_succ: range.1,
|
||||
});
|
||||
}
|
||||
|
||||
let mut deferred_last = None;
|
||||
while !stack.is_empty() {
|
||||
let stack_entry = stack.last_mut().unwrap();
|
||||
let range = stack_entry.succs;
|
||||
if stack_entry.cur_succ == range.0 {
|
||||
let orig_block = stack_entry.this.orig_block();
|
||||
if orig_block.is_some() && orig_block == fallthrough_return_block {
|
||||
deferred_last = Some((stack_entry.this, range));
|
||||
} else {
|
||||
postorder.push((stack_entry.this, range));
|
||||
}
|
||||
stack.pop();
|
||||
} else {
|
||||
// Heuristic: chase the children in reverse. This puts the first
|
||||
// successor block first in RPO, all other things being equal,
|
||||
// which tends to prioritize loop backedges over out-edges,
|
||||
// putting the edge-block closer to the loop body and minimizing
|
||||
// live-ranges in linear instruction space.
|
||||
let next = lowered_succs[stack_entry.cur_succ - 1].1;
|
||||
stack_entry.cur_succ -= 1;
|
||||
if visited.contains(&next) {
|
||||
continue;
|
||||
}
|
||||
visited.insert(next);
|
||||
let range = compute_lowered_succs(&mut lowered_succs, next);
|
||||
lowered_succ_indices.resize(lowered_succs.len(), 0);
|
||||
stack.push(StackEntry {
|
||||
this: next,
|
||||
succs: range,
|
||||
cur_succ: range.1,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
postorder.reverse();
|
||||
let mut rpo = postorder;
|
||||
if let Some(d) = deferred_last {
|
||||
rpo.push(d);
|
||||
}
|
||||
|
||||
// Step 3: now that we have RPO, build the BlockIndex/BB fwd/rev maps.
|
||||
let mut lowered_order = vec![];
|
||||
let mut lowered_succ_ranges = vec![];
|
||||
let mut lb_to_bindex = FxHashMap::default();
|
||||
for (block, succ_range) in rpo.into_iter() {
|
||||
lb_to_bindex.insert(block, lowered_order.len() as BlockIndex);
|
||||
lowered_order.push(block);
|
||||
lowered_succ_ranges.push(succ_range);
|
||||
}
|
||||
|
||||
let lowered_succ_indices = lowered_succs
|
||||
.iter()
|
||||
.map(|&(inst, succ)| (inst, lb_to_bindex.get(&succ).cloned().unwrap()))
|
||||
.collect();
|
||||
|
||||
let mut orig_map = SecondaryMap::with_default(None);
|
||||
for (i, lb) in lowered_order.iter().enumerate() {
|
||||
let i = i as BlockIndex;
|
||||
if let Some(b) = lb.orig_block() {
|
||||
orig_map[b] = Some(i);
|
||||
}
|
||||
}
|
||||
|
||||
let result = BlockLoweringOrder {
|
||||
lowered_order,
|
||||
lowered_succs,
|
||||
lowered_succ_indices,
|
||||
lowered_succ_ranges,
|
||||
orig_map,
|
||||
};
|
||||
debug!("BlockLoweringOrder: {:?}", result);
|
||||
result
|
||||
}
|
||||
|
||||
/// Get the lowered order of blocks.
|
||||
pub fn lowered_order(&self) -> &[LoweredBlock] {
|
||||
&self.lowered_order[..]
|
||||
}
|
||||
|
||||
/// Get the successors for a lowered block, by index in `lowered_order()`'s
|
||||
/// returned slice. Each successor is paired with the edge-instruction
|
||||
/// (branch) corresponding to this edge.
|
||||
pub fn succs(&self, block: BlockIndex) -> &[(Inst, LoweredBlock)] {
|
||||
let range = self.lowered_succ_ranges[block as usize];
|
||||
&self.lowered_succs[range.0..range.1]
|
||||
}
|
||||
|
||||
/// Get the successor indices for a lowered block.
|
||||
pub fn succ_indices(&self, block: BlockIndex) -> &[(Inst, BlockIndex)] {
|
||||
let range = self.lowered_succ_ranges[block as usize];
|
||||
&self.lowered_succ_indices[range.0..range.1]
|
||||
}
|
||||
|
||||
/// Get the lowered block index containing a CLIF block, if any. (May not be
|
||||
/// present if the original CLIF block was unreachable.)
|
||||
pub fn lowered_block_for_bb(&self, bb: Block) -> Option<BlockIndex> {
|
||||
self.orig_map[bb]
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::*;
|
||||
use crate::cursor::{Cursor, FuncCursor};
|
||||
use crate::ir::types::*;
|
||||
use crate::ir::{AbiParam, ExternalName, Function, InstBuilder, Signature};
|
||||
use crate::isa::CallConv;
|
||||
|
||||
fn build_test_func(n_blocks: usize, edges: &[(usize, usize)]) -> Function {
|
||||
assert!(n_blocks > 0);
|
||||
|
||||
let name = ExternalName::testcase("test0");
|
||||
let mut sig = Signature::new(CallConv::SystemV);
|
||||
sig.params.push(AbiParam::new(I32));
|
||||
let mut func = Function::with_name_signature(name, sig);
|
||||
let blocks = (0..n_blocks)
|
||||
.map(|i| {
|
||||
let bb = func.dfg.make_block();
|
||||
assert!(bb.as_u32() == i as u32);
|
||||
bb
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let arg0 = func.dfg.append_block_param(blocks[0], I32);
|
||||
|
||||
let mut pos = FuncCursor::new(&mut func);
|
||||
|
||||
let mut edge = 0;
|
||||
for i in 0..n_blocks {
|
||||
pos.insert_block(blocks[i]);
|
||||
let mut succs = vec![];
|
||||
while edge < edges.len() && edges[edge].0 == i {
|
||||
succs.push(edges[edge].1);
|
||||
edge += 1;
|
||||
}
|
||||
if succs.len() == 0 {
|
||||
pos.ins().return_(&[arg0]);
|
||||
} else if succs.len() == 1 {
|
||||
pos.ins().jump(blocks[succs[0]], &[]);
|
||||
} else if succs.len() == 2 {
|
||||
pos.ins().brnz(arg0, blocks[succs[0]], &[]);
|
||||
pos.ins().jump(blocks[succs[1]], &[]);
|
||||
} else {
|
||||
panic!("Too many successors");
|
||||
}
|
||||
}
|
||||
|
||||
func
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_blockorder_diamond() {
|
||||
let func = build_test_func(4, &[(0, 1), (0, 2), (1, 3), (2, 3)]);
|
||||
let order = BlockLoweringOrder::new(&func);
|
||||
|
||||
assert_eq!(order.lowered_order.len(), 6);
|
||||
|
||||
assert!(order.lowered_order[0].orig_block().unwrap().as_u32() == 0);
|
||||
assert!(order.lowered_order[0].in_edge().is_none());
|
||||
assert!(order.lowered_order[0].out_edge().is_none());
|
||||
|
||||
assert!(order.lowered_order[1].orig_block().unwrap().as_u32() == 1);
|
||||
assert!(order.lowered_order[1].in_edge().unwrap().0.as_u32() == 0);
|
||||
assert!(order.lowered_order[1].in_edge().unwrap().2.as_u32() == 1);
|
||||
|
||||
assert!(order.lowered_order[2].orig_block().is_none());
|
||||
assert!(order.lowered_order[2].in_edge().is_none());
|
||||
assert!(order.lowered_order[2].out_edge().unwrap().0.as_u32() == 1);
|
||||
assert!(order.lowered_order[2].out_edge().unwrap().2.as_u32() == 3);
|
||||
|
||||
assert!(order.lowered_order[3].orig_block().unwrap().as_u32() == 2);
|
||||
assert!(order.lowered_order[3].in_edge().unwrap().0.as_u32() == 0);
|
||||
assert!(order.lowered_order[3].in_edge().unwrap().2.as_u32() == 2);
|
||||
assert!(order.lowered_order[3].out_edge().is_none());
|
||||
|
||||
assert!(order.lowered_order[4].orig_block().is_none());
|
||||
assert!(order.lowered_order[4].in_edge().is_none());
|
||||
assert!(order.lowered_order[4].out_edge().unwrap().0.as_u32() == 2);
|
||||
assert!(order.lowered_order[4].out_edge().unwrap().2.as_u32() == 3);
|
||||
|
||||
assert!(order.lowered_order[5].orig_block().unwrap().as_u32() == 3);
|
||||
assert!(order.lowered_order[5].in_edge().is_none());
|
||||
assert!(order.lowered_order[5].out_edge().is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_blockorder_critedge() {
|
||||
// 0
|
||||
// / \
|
||||
// 1 2
|
||||
// / \ \
|
||||
// 3 4 |
|
||||
// |\ _|____|
|
||||
// | \/ |
|
||||
// | /\ |
|
||||
// 5 6
|
||||
//
|
||||
// (3 -> 5, 3 -> 6, 4 -> 6 are critical edges and must be split)
|
||||
//
|
||||
let func = build_test_func(
|
||||
7,
|
||||
&[
|
||||
(0, 1),
|
||||
(0, 2),
|
||||
(1, 3),
|
||||
(1, 4),
|
||||
(2, 5),
|
||||
(3, 5),
|
||||
(3, 6),
|
||||
(4, 6),
|
||||
],
|
||||
);
|
||||
let order = BlockLoweringOrder::new(&func);
|
||||
|
||||
assert_eq!(order.lowered_order.len(), 11);
|
||||
println!("ordered = {:?}", order.lowered_order);
|
||||
|
||||
// block 0
|
||||
assert!(order.lowered_order[0].orig_block().unwrap().as_u32() == 0);
|
||||
assert!(order.lowered_order[0].in_edge().is_none());
|
||||
assert!(order.lowered_order[0].out_edge().is_none());
|
||||
|
||||
// edge 0->1 + block 1
|
||||
assert!(order.lowered_order[1].orig_block().unwrap().as_u32() == 1);
|
||||
assert!(order.lowered_order[1].in_edge().unwrap().0.as_u32() == 0);
|
||||
assert!(order.lowered_order[1].in_edge().unwrap().2.as_u32() == 1);
|
||||
assert!(order.lowered_order[1].out_edge().is_none());
|
||||
|
||||
// edge 1->3 + block 3
|
||||
assert!(order.lowered_order[2].orig_block().unwrap().as_u32() == 3);
|
||||
assert!(order.lowered_order[2].in_edge().unwrap().0.as_u32() == 1);
|
||||
assert!(order.lowered_order[2].in_edge().unwrap().2.as_u32() == 3);
|
||||
assert!(order.lowered_order[2].out_edge().is_none());
|
||||
|
||||
// edge 3->5
|
||||
assert!(order.lowered_order[3].orig_block().is_none());
|
||||
assert!(order.lowered_order[3].in_edge().is_none());
|
||||
assert!(order.lowered_order[3].out_edge().unwrap().0.as_u32() == 3);
|
||||
assert!(order.lowered_order[3].out_edge().unwrap().2.as_u32() == 5);
|
||||
|
||||
// edge 3->6
|
||||
assert!(order.lowered_order[4].orig_block().is_none());
|
||||
assert!(order.lowered_order[4].in_edge().is_none());
|
||||
assert!(order.lowered_order[4].out_edge().unwrap().0.as_u32() == 3);
|
||||
assert!(order.lowered_order[4].out_edge().unwrap().2.as_u32() == 6);
|
||||
|
||||
// edge 1->4 + block 4
|
||||
assert!(order.lowered_order[5].orig_block().unwrap().as_u32() == 4);
|
||||
assert!(order.lowered_order[5].in_edge().unwrap().0.as_u32() == 1);
|
||||
assert!(order.lowered_order[5].in_edge().unwrap().2.as_u32() == 4);
|
||||
assert!(order.lowered_order[5].out_edge().is_none());
|
||||
|
||||
// edge 4->6
|
||||
assert!(order.lowered_order[6].orig_block().is_none());
|
||||
assert!(order.lowered_order[6].in_edge().is_none());
|
||||
assert!(order.lowered_order[6].out_edge().unwrap().0.as_u32() == 4);
|
||||
assert!(order.lowered_order[6].out_edge().unwrap().2.as_u32() == 6);
|
||||
|
||||
// block 6
|
||||
assert!(order.lowered_order[7].orig_block().unwrap().as_u32() == 6);
|
||||
assert!(order.lowered_order[7].in_edge().is_none());
|
||||
assert!(order.lowered_order[7].out_edge().is_none());
|
||||
|
||||
// edge 0->2 + block 2
|
||||
assert!(order.lowered_order[8].orig_block().unwrap().as_u32() == 2);
|
||||
assert!(order.lowered_order[8].in_edge().unwrap().0.as_u32() == 0);
|
||||
assert!(order.lowered_order[8].in_edge().unwrap().2.as_u32() == 2);
|
||||
assert!(order.lowered_order[8].out_edge().is_none());
|
||||
|
||||
// edge 2->5
|
||||
assert!(order.lowered_order[9].orig_block().is_none());
|
||||
assert!(order.lowered_order[9].in_edge().is_none());
|
||||
assert!(order.lowered_order[9].out_edge().unwrap().0.as_u32() == 2);
|
||||
assert!(order.lowered_order[9].out_edge().unwrap().2.as_u32() == 5);
|
||||
|
||||
// block 5
|
||||
assert!(order.lowered_order[10].orig_block().unwrap().as_u32() == 5);
|
||||
assert!(order.lowered_order[10].in_edge().is_none());
|
||||
assert!(order.lowered_order[10].out_edge().is_none());
|
||||
}
|
||||
}
|
||||
|
||||
cranelift/codegen/src/machinst/buffer.rs (new file, 1035 lines)
File diff suppressed because it is too large
@@ -18,8 +18,12 @@ pub fn compile<B: LowerBackend + MachBackend>(
|
||||
where
|
||||
B::MInst: ShowWithRRU,
|
||||
{
|
||||
// This lowers the CL IR.
|
||||
let mut vcode = Lower::new(f, abi)?.lower(b)?;
|
||||
// Compute lowered block order.
|
||||
let block_order = BlockLoweringOrder::new(f);
|
||||
// Build the lowering context.
|
||||
let lower = Lower::new(f, abi, block_order)?;
|
||||
// Lower the IR.
|
||||
let mut vcode = lower.lower(b)?;
|
||||
|
||||
debug!(
|
||||
"vcode from lowering: \n{}",
|
||||
@@ -65,11 +69,6 @@ where
|
||||
// all at once. This also inserts prologues/epilogues.
|
||||
vcode.replace_insns_from_regalloc(result);
|
||||
|
||||
vcode.remove_redundant_branches();
|
||||
|
||||
// Do final passes over code to finalize branches.
|
||||
vcode.finalize_branches();
|
||||
|
||||
debug!(
|
||||
"vcode after regalloc: final version:\n{}",
|
||||
vcode.show_rru(Some(b.reg_universe()))
|
||||
|
||||
File diff suppressed because it is too large
@@ -109,6 +109,7 @@ use regalloc::RegUsageCollector;
|
||||
use regalloc::{
|
||||
RealReg, RealRegUniverse, Reg, RegClass, RegUsageMapper, SpillSlot, VirtualReg, Writable,
|
||||
};
|
||||
use smallvec::SmallVec;
|
||||
use std::string::String;
|
||||
use target_lexicon::Triple;
|
||||
|
||||
@@ -124,8 +125,8 @@ pub mod abi;
|
||||
pub use abi::*;
|
||||
pub mod pretty_print;
|
||||
pub use pretty_print::*;
|
||||
pub mod sections;
|
||||
pub use sections::*;
|
||||
pub mod buffer;
|
||||
pub use buffer::*;
|
||||
pub mod adapter;
|
||||
pub use adapter::*;
|
||||
|
||||
@@ -152,6 +153,9 @@ pub trait MachInst: Clone + Debug {
|
||||
/// Generate a move.
|
||||
fn gen_move(to_reg: Writable<Reg>, from_reg: Reg, ty: Type) -> Self;
|
||||
|
||||
/// Generate a constant into a reg.
|
||||
fn gen_constant(to_reg: Writable<Reg>, value: u64, ty: Type) -> SmallVec<[Self; 4]>;
|
||||
|
||||
/// Generate a zero-length no-op.
|
||||
fn gen_zero_len_nop() -> Self;
|
||||
|
||||
@@ -166,7 +170,7 @@ pub trait MachInst: Clone + Debug {
|
||||
|
||||
/// Generate a jump to another target. Used during lowering of
|
||||
/// control flow.
|
||||
fn gen_jump(target: BlockIndex) -> Self;
|
||||
fn gen_jump(target: MachLabel) -> Self;
|
||||
|
||||
/// Generate a NOP. The `preferred_size` parameter allows the caller to
|
||||
/// request a NOP of that size, or as close to it as possible. The machine
|
||||
@@ -175,22 +179,62 @@ pub trait MachInst: Clone + Debug {
|
||||
/// the instruction must have a nonzero size.
|
||||
fn gen_nop(preferred_size: usize) -> Self;
|
||||
|
||||
/// Rewrite block targets using the block-target map.
|
||||
fn with_block_rewrites(&mut self, block_target_map: &[BlockIndex]);
|
||||
|
||||
/// Finalize branches once the block order (fallthrough) is known.
|
||||
fn with_fallthrough_block(&mut self, fallthrough_block: Option<BlockIndex>);
|
||||
|
||||
/// Update instruction once block offsets are known. These offsets are
|
||||
/// relative to the beginning of the function. `targets` is indexed by
|
||||
/// BlockIndex.
|
||||
fn with_block_offsets(&mut self, my_offset: CodeOffset, targets: &[CodeOffset]);
|
||||
/// Get the register universe for this backend.
|
||||
fn reg_universe(flags: &Flags) -> RealRegUniverse;
|
||||
|
||||
/// Align a basic block offset (from start of function). By default, no
|
||||
/// alignment occurs.
|
||||
fn align_basic_block(offset: CodeOffset) -> CodeOffset {
|
||||
offset
|
||||
}
|
||||
|
||||
/// What is the worst-case instruction size emitted by this instruction type?
|
||||
fn worst_case_size() -> CodeOffset;
|
||||
|
||||
/// A label-use kind: a type that describes the types of label references that
|
||||
/// can occur in an instruction.
|
||||
type LabelUse: MachInstLabelUse;
|
||||
}
|
||||
|
||||
/// A descriptor of a label reference (use) in an instruction set.
|
||||
pub trait MachInstLabelUse: Clone + Copy + Debug + Eq {
|
||||
/// Required alignment for any veneer. Usually the required instruction
|
||||
/// alignment (e.g., 4 for a RISC with 32-bit instructions, or 1 for x86).
|
||||
const ALIGN: CodeOffset;
|
||||
|
||||
/// What is the maximum PC-relative range (positive)? E.g., if `1024`, a
|
||||
/// label-reference fixup at offset `x` is valid if the label resolves to `x
|
||||
/// + 1024`.
|
||||
fn max_pos_range(self) -> CodeOffset;
|
||||
/// What is the maximum PC-relative range (negative)? This is the absolute
|
||||
/// value; i.e., if `1024`, then a label-reference fixup at offset `x` is
|
||||
/// valid if the label resolves to `x - 1024`.
|
||||
fn max_neg_range(self) -> CodeOffset;
|
||||
/// What is the size of code-buffer slice this label-use needs to patch in
|
||||
/// the label's value?
|
||||
fn patch_size(self) -> CodeOffset;
|
||||
/// Perform a code-patch, given the offset into the buffer of this label use
|
||||
/// and the offset into the buffer of the label's definition.
|
||||
/// It is guaranteed that, given `delta = label_offset - use_offset`, we will
/// have `delta >= -self.max_neg_range()` and `delta <= self.max_pos_range()`.
|
||||
fn patch(self, buffer: &mut [u8], use_offset: CodeOffset, label_offset: CodeOffset);
|
||||
/// Can the label-use be patched to a veneer that supports a longer range?
|
||||
/// Usually valid for jumps (a short-range jump can jump to a longer-range
|
||||
/// jump), but not for e.g. constant pool references, because the constant
|
||||
/// load would require different code (one more level of indirection).
|
||||
fn supports_veneer(self) -> bool;
|
||||
/// How many bytes are needed for a veneer?
|
||||
fn veneer_size(self) -> CodeOffset;
|
||||
/// Generate a veneer. The given code-buffer slice is `self.veneer_size()`
|
||||
/// bytes long at offset `veneer_offset` in the buffer. The original
|
||||
/// label-use will be patched to refer to this veneer's offset. A new
|
||||
/// (offset, LabelUse) is returned that allows the veneer to use the actual
|
||||
/// label. For veneers to work properly, it is expected that the new veneer
|
||||
/// has a larger range; on most platforms this probably means either a
|
||||
/// "long-range jump" (e.g., on ARM, the 26-bit form), or if already at that
|
||||
/// stage, a jump that supports a full 32-bit range, for example.
|
||||
fn generate_veneer(self, buffer: &mut [u8], veneer_offset: CodeOffset) -> (CodeOffset, Self);
|
||||
}
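
// Illustrative sketch (not part of this change): one way a backend might
// implement `MachInstLabelUse` for a single label-use kind. `ToyLabelUse`,
// the 19-bit field at bits [5..24], the +/-1 MiB range, and the assumption
// that `patch` receives exactly the `patch_size()`-byte slice at the use site
// are all hypothetical here, not the AArch64 backend's actual encoding.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum ToyLabelUse {
    /// A 19-bit, 4-byte-scaled, signed PC-relative branch field.
    Branch19,
}

impl MachInstLabelUse for ToyLabelUse {
    const ALIGN: CodeOffset = 4;

    fn max_pos_range(self) -> CodeOffset {
        // 18 bits of positive magnitude, scaled by 4: just under +1 MiB.
        (1 << 20) - 4
    }

    fn max_neg_range(self) -> CodeOffset {
        // 1 MiB backwards.
        1 << 20
    }

    fn patch_size(self) -> CodeOffset {
        // One 32-bit instruction word is rewritten.
        4
    }

    fn patch(self, buffer: &mut [u8], use_offset: CodeOffset, label_offset: CodeOffset) {
        // Scale the byte delta to instruction units and splice it into the
        // immediate field of the instruction word.
        let delta = (label_offset as i64) - (use_offset as i64);
        let imm19 = ((delta >> 2) as u32) & 0x7ffff;
        let mut word = u32::from_le_bytes([buffer[0], buffer[1], buffer[2], buffer[3]]);
        word |= imm19 << 5;
        buffer[0..4].copy_from_slice(&word.to_le_bytes());
    }

    fn supports_veneer(self) -> bool {
        // This sketch defines no longer-range form to promote to.
        false
    }

    fn veneer_size(self) -> CodeOffset {
        0
    }

    fn generate_veneer(self, _buffer: &mut [u8], _veneer_offset: CodeOffset) -> (CodeOffset, Self) {
        unreachable!("no veneer support in this sketch")
    }
}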
|
||||
|
||||
/// Describes a block terminator (not call) in the vcode, when its branches
|
||||
@@ -202,26 +246,26 @@ pub enum MachTerminator<'a> {
|
||||
/// A return instruction.
|
||||
Ret,
|
||||
/// An unconditional branch to another block.
|
||||
Uncond(BlockIndex),
|
||||
Uncond(MachLabel),
|
||||
/// A conditional branch to one of two other blocks.
|
||||
Cond(BlockIndex, BlockIndex),
|
||||
Cond(MachLabel, MachLabel),
|
||||
/// An indirect branch with known possible targets.
|
||||
Indirect(&'a [BlockIndex]),
|
||||
Indirect(&'a [MachLabel]),
|
||||
}
|
||||
|
||||
/// A trait describing the ability to encode a MachInst into binary machine code.
|
||||
pub trait MachInstEmit<O: MachSectionOutput> {
|
||||
pub trait MachInstEmit: MachInst {
|
||||
/// Persistent state carried across `emit` invocations.
|
||||
type State: Default + Clone + Debug;
|
||||
/// Emit the instruction.
|
||||
fn emit(&self, code: &mut O, flags: &Flags, state: &mut Self::State);
|
||||
fn emit(&self, code: &mut MachBuffer<Self>, flags: &Flags, state: &mut Self::State);
|
||||
}
|
||||
|
||||
/// The result of a `MachBackend::compile_function()` call. Contains machine
|
||||
/// code (as bytes) and a disassembly, if requested.
|
||||
pub struct MachCompileResult {
|
||||
/// Machine code.
|
||||
pub sections: MachSections,
|
||||
pub buffer: MachBufferFinalized,
|
||||
/// Size of stack frame, in bytes.
|
||||
pub frame_size: u32,
|
||||
/// Disassembly, if requested.
|
||||
@@ -231,7 +275,7 @@ pub struct MachCompileResult {
|
||||
impl MachCompileResult {
|
||||
/// Get a `CodeInfo` describing section sizes from this compilation result.
|
||||
pub fn code_info(&self) -> CodeInfo {
|
||||
let code_size = self.sections.total_size();
|
||||
let code_size = self.buffer.total_size();
|
||||
CodeInfo {
|
||||
code_size,
|
||||
jumptables_size: 0,
|
||||
|
||||
@@ -1,460 +0,0 @@
|
||||
//! In-memory representation of compiled machine code, in multiple sections
|
||||
//! (text, constant pool / rodata, etc). Emission occurs into multiple sections
|
||||
//! simultaneously, so we buffer the result in memory and hand off to the
|
||||
//! caller at the end of compilation.
|
||||
|
||||
use crate::binemit::{Addend, CodeOffset, CodeSink, Reloc};
|
||||
use crate::ir::{ExternalName, Opcode, SourceLoc, TrapCode};
|
||||
|
||||
use alloc::vec::Vec;
|
||||
|
||||
/// A collection of sections with defined start-offsets.
|
||||
pub struct MachSections {
|
||||
/// Sections, in offset order.
|
||||
pub sections: Vec<MachSection>,
|
||||
}
|
||||
|
||||
impl MachSections {
|
||||
/// New, empty set of sections.
|
||||
pub fn new() -> MachSections {
|
||||
MachSections { sections: vec![] }
|
||||
}
|
||||
|
||||
/// Add a section with a known offset and size. Returns the index.
|
||||
pub fn add_section(&mut self, start: CodeOffset, length: CodeOffset) -> usize {
|
||||
let idx = self.sections.len();
|
||||
self.sections.push(MachSection::new(start, length));
|
||||
idx
|
||||
}
|
||||
|
||||
/// Mutably borrow the given section by index.
|
||||
pub fn get_section<'a>(&'a mut self, idx: usize) -> &'a mut MachSection {
|
||||
&mut self.sections[idx]
|
||||
}
|
||||
|
||||
/// Get mutable borrows of two sections simultaneously. Used during
|
||||
/// instruction emission to provide references to the .text and .rodata
|
||||
/// (constant pool) sections.
|
||||
pub fn two_sections<'a>(
|
||||
&'a mut self,
|
||||
idx1: usize,
|
||||
idx2: usize,
|
||||
) -> (&'a mut MachSection, &'a mut MachSection) {
|
||||
assert!(idx1 < idx2);
|
||||
assert!(idx1 < self.sections.len());
|
||||
assert!(idx2 < self.sections.len());
|
||||
let (first, rest) = self.sections.split_at_mut(idx2);
|
||||
(&mut first[idx1], &mut rest[0])
|
||||
}
|
||||
|
||||
/// Emit this set of sections to a set of sinks for the code,
|
||||
/// relocations, traps, and stackmap.
|
||||
pub fn emit<CS: CodeSink>(&self, sink: &mut CS) {
|
||||
// N.B.: we emit every section into the .text section as far as
|
||||
// the `CodeSink` is concerned; we do not bother to segregate
|
||||
// the contents into the actual program text, the jumptable and the
|
||||
// rodata (constant pool). This allows us to generate code assuming
|
||||
// that these will not be relocated relative to each other, and avoids
|
||||
// having to designate each section as belonging in one of the three
|
||||
// fixed categories defined by `CodeSink`. If this becomes a problem
|
||||
// later (e.g. because of memory permissions or similar), we can
|
||||
// add this designation and segregate the output; take care, however,
|
||||
// to add the appropriate relocations in this case.
|
||||
|
||||
for section in &self.sections {
|
||||
if section.data.len() > 0 {
|
||||
while sink.offset() < section.start_offset {
|
||||
sink.put1(0);
|
||||
}
|
||||
section.emit(sink);
|
||||
}
|
||||
}
|
||||
sink.begin_jumptables();
|
||||
sink.begin_rodata();
|
||||
sink.end_codegen();
|
||||
}
|
||||
|
||||
/// Get a list of source location mapping tuples in sorted-by-start-offset order.
|
||||
pub fn get_srclocs_sorted<'a>(&'a self) -> MachSectionsSrcLocs<'a> {
|
||||
MachSectionsSrcLocs::new(&self.sections)
|
||||
}
|
||||
|
||||
/// Get the total required size for these sections.
|
||||
pub fn total_size(&self) -> CodeOffset {
|
||||
if self.sections.len() == 0 {
|
||||
0
|
||||
} else {
|
||||
// Find the last non-empty section.
|
||||
self.sections
|
||||
.iter()
|
||||
.rev()
|
||||
.find(|s| s.data.len() > 0)
|
||||
.map(|s| s.cur_offset_from_start())
|
||||
.unwrap_or(0)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// An iterator over the srclocs in each section.
|
||||
/// Returns MachSrcLocs in an order sorted by start location.
|
||||
pub struct MachSectionsSrcLocs<'a> {
|
||||
sections: &'a [MachSection],
|
||||
cur_section: usize,
|
||||
cur_srcloc: usize,
|
||||
// For validation:
|
||||
last_offset: CodeOffset,
|
||||
}
|
||||
|
||||
impl<'a> MachSectionsSrcLocs<'a> {
|
||||
fn new(sections: &'a [MachSection]) -> MachSectionsSrcLocs<'a> {
|
||||
MachSectionsSrcLocs {
|
||||
sections,
|
||||
cur_section: 0,
|
||||
cur_srcloc: 0,
|
||||
last_offset: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for MachSectionsSrcLocs<'a> {
|
||||
type Item = &'a MachSrcLoc;
|
||||
|
||||
fn next(&mut self) -> Option<&'a MachSrcLoc> {
|
||||
// We simply iterate through sections and srcloc records in order. This produces a
|
||||
// sorted order naturally because sections are in starting-offset-order, and srclocs
|
||||
// are produced as a section is emitted into, so are in order as well.
|
||||
|
||||
// If we're out of sections, we're done.
|
||||
if self.cur_section >= self.sections.len() {
|
||||
return None;
|
||||
}
|
||||
|
||||
// Otherwise, make sure we have a srcloc in the current section left to return, and
|
||||
// advance to the next section if not. Done if we run out of sections.
|
||||
while self.cur_srcloc >= self.sections[self.cur_section].srclocs.len() {
|
||||
self.cur_srcloc = 0;
|
||||
self.cur_section += 1;
|
||||
if self.cur_section >= self.sections.len() {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
|
||||
let loc = &self.sections[self.cur_section].srclocs[self.cur_srcloc];
|
||||
self.cur_srcloc += 1;
|
||||
debug_assert!(loc.start >= self.last_offset);
|
||||
self.last_offset = loc.start;
|
||||
Some(loc)
|
||||
}
|
||||
}
|
||||
|
||||
/// An abstraction over MachSection and MachSectionSize: some
|
||||
/// receiver of section data.
|
||||
pub trait MachSectionOutput {
|
||||
/// Get the current offset from the start of all sections.
|
||||
fn cur_offset_from_start(&self) -> CodeOffset;
|
||||
|
||||
/// Get the start offset of this section.
|
||||
fn start_offset(&self) -> CodeOffset;
|
||||
|
||||
/// Add 1 byte to the section.
|
||||
fn put1(&mut self, _: u8);
|
||||
|
||||
/// Add 2 bytes to the section.
|
||||
fn put2(&mut self, value: u16) {
|
||||
let [b0, b1] = value.to_le_bytes();
|
||||
self.put1(b0);
|
||||
self.put1(b1);
|
||||
}
|
||||
|
||||
/// Add 4 bytes to the section.
|
||||
fn put4(&mut self, value: u32) {
|
||||
let [b0, b1, b2, b3] = value.to_le_bytes();
|
||||
self.put1(b0);
|
||||
self.put1(b1);
|
||||
self.put1(b2);
|
||||
self.put1(b3);
|
||||
}
|
||||
|
||||
/// Add 8 bytes to the section.
|
||||
fn put8(&mut self, value: u64) {
|
||||
let [b0, b1, b2, b3, b4, b5, b6, b7] = value.to_le_bytes();
|
||||
self.put1(b0);
|
||||
self.put1(b1);
|
||||
self.put1(b2);
|
||||
self.put1(b3);
|
||||
self.put1(b4);
|
||||
self.put1(b5);
|
||||
self.put1(b6);
|
||||
self.put1(b7);
|
||||
}
|
||||
|
||||
/// Add a slice of bytes to the section.
|
||||
fn put_data(&mut self, data: &[u8]);
|
||||
|
||||
/// Add a relocation at the current offset.
|
||||
fn add_reloc(&mut self, loc: SourceLoc, kind: Reloc, name: &ExternalName, addend: Addend);
|
||||
|
||||
/// Add a trap record at the current offset.
|
||||
fn add_trap(&mut self, loc: SourceLoc, code: TrapCode);
|
||||
|
||||
/// Add a call return address record at the current offset.
|
||||
fn add_call_site(&mut self, loc: SourceLoc, opcode: Opcode);
|
||||
|
||||
/// Start the output for the given source-location at the current offset.
|
||||
fn start_srcloc(&mut self, loc: SourceLoc);
|
||||
|
||||
/// End the output for the previously-given source-location at the current offset.
|
||||
fn end_srcloc(&mut self);
|
||||
|
||||
/// Align up to the given alignment.
|
||||
fn align_to(&mut self, align_to: CodeOffset) {
|
||||
assert!(align_to.is_power_of_two());
|
||||
while self.cur_offset_from_start() & (align_to - 1) != 0 {
|
||||
self.put1(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A section of output to be emitted to a CodeSink / RelocSink in bulk.
|
||||
/// Multiple sections may be created with known start offsets in advance; the
|
||||
/// usual use-case is to create the .text (code) and .rodata (constant pool) at
|
||||
/// once, after computing the length of the code, so that constant references
|
||||
/// can use known offsets as instructions are emitted.
|
||||
pub struct MachSection {
|
||||
/// The starting offset of this section.
|
||||
pub start_offset: CodeOffset,
|
||||
/// The limit of this section, defined by the start of the next section.
|
||||
pub length_limit: CodeOffset,
|
||||
/// The section contents, as raw bytes.
|
||||
pub data: Vec<u8>,
|
||||
/// Any relocations referring to this section.
|
||||
pub relocs: Vec<MachReloc>,
|
||||
/// Any trap records referring to this section.
|
||||
pub traps: Vec<MachTrap>,
|
||||
/// Any call site records referring to this section.
|
||||
pub call_sites: Vec<MachCallSite>,
|
||||
/// Any source location mappings referring to this section.
|
||||
pub srclocs: Vec<MachSrcLoc>,
|
||||
/// The current source location in progress (after `start_srcloc()` and before `end_srcloc()`).
|
||||
/// This is a (start_offset, src_loc) tuple.
|
||||
pub cur_srcloc: Option<(CodeOffset, SourceLoc)>,
|
||||
}
|
||||
|
||||
impl MachSection {
|
||||
/// Create a new section, known to start at `start_offset` and with a size limited to `length_limit`.
|
||||
pub fn new(start_offset: CodeOffset, length_limit: CodeOffset) -> MachSection {
|
||||
MachSection {
|
||||
start_offset,
|
||||
length_limit,
|
||||
data: vec![],
|
||||
relocs: vec![],
|
||||
traps: vec![],
|
||||
call_sites: vec![],
|
||||
srclocs: vec![],
|
||||
cur_srcloc: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Emit this section to the CodeSink and other associated sinks. The
|
||||
/// current offset of the CodeSink must match the starting offset of this
|
||||
/// section.
|
||||
pub fn emit<CS: CodeSink>(&self, sink: &mut CS) {
|
||||
assert!(sink.offset() == self.start_offset);
|
||||
|
||||
let mut next_reloc = 0;
|
||||
let mut next_trap = 0;
|
||||
let mut next_call_site = 0;
|
||||
for (idx, byte) in self.data.iter().enumerate() {
|
||||
if next_reloc < self.relocs.len() {
|
||||
let reloc = &self.relocs[next_reloc];
|
||||
if reloc.offset == idx as CodeOffset {
|
||||
sink.reloc_external(reloc.srcloc, reloc.kind, &reloc.name, reloc.addend);
|
||||
next_reloc += 1;
|
||||
}
|
||||
}
|
||||
if next_trap < self.traps.len() {
|
||||
let trap = &self.traps[next_trap];
|
||||
if trap.offset == idx as CodeOffset {
|
||||
sink.trap(trap.code, trap.srcloc);
|
||||
next_trap += 1;
|
||||
}
|
||||
}
|
||||
if next_call_site < self.call_sites.len() {
|
||||
let call_site = &self.call_sites[next_call_site];
|
||||
if call_site.ret_addr == idx as CodeOffset {
|
||||
sink.add_call_site(call_site.opcode, call_site.srcloc);
|
||||
next_call_site += 1;
|
||||
}
|
||||
}
|
||||
sink.put1(*byte);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl MachSectionOutput for MachSection {
|
||||
fn cur_offset_from_start(&self) -> CodeOffset {
|
||||
self.start_offset + self.data.len() as CodeOffset
|
||||
}
|
||||
|
||||
fn start_offset(&self) -> CodeOffset {
|
||||
self.start_offset
|
||||
}
|
||||
|
||||
fn put1(&mut self, value: u8) {
|
||||
assert!(((self.data.len() + 1) as CodeOffset) <= self.length_limit);
|
||||
self.data.push(value);
|
||||
}
|
||||
|
||||
fn put_data(&mut self, data: &[u8]) {
|
||||
assert!(((self.data.len() + data.len()) as CodeOffset) <= self.length_limit);
|
||||
self.data.extend_from_slice(data);
|
||||
}
|
||||
|
||||
fn add_reloc(&mut self, srcloc: SourceLoc, kind: Reloc, name: &ExternalName, addend: Addend) {
|
||||
let name = name.clone();
|
||||
self.relocs.push(MachReloc {
|
||||
offset: self.data.len() as CodeOffset,
|
||||
srcloc,
|
||||
kind,
|
||||
name,
|
||||
addend,
|
||||
});
|
||||
}
|
||||
|
||||
fn add_trap(&mut self, srcloc: SourceLoc, code: TrapCode) {
|
||||
self.traps.push(MachTrap {
|
||||
offset: self.data.len() as CodeOffset,
|
||||
srcloc,
|
||||
code,
|
||||
});
|
||||
}
|
||||
|
||||
fn add_call_site(&mut self, srcloc: SourceLoc, opcode: Opcode) {
|
||||
self.call_sites.push(MachCallSite {
|
||||
ret_addr: self.data.len() as CodeOffset,
|
||||
srcloc,
|
||||
opcode,
|
||||
});
|
||||
}
|
||||
|
||||
fn start_srcloc(&mut self, loc: SourceLoc) {
|
||||
self.cur_srcloc = Some((self.cur_offset_from_start(), loc));
|
||||
}
|
||||
|
||||
fn end_srcloc(&mut self) {
|
||||
let (start, loc) = self
|
||||
.cur_srcloc
|
||||
.take()
|
||||
.expect("end_srcloc() called without start_srcloc()");
|
||||
let end = self.cur_offset_from_start();
|
||||
// Skip zero-length extents.
|
||||
debug_assert!(end >= start);
|
||||
if end > start {
|
||||
self.srclocs.push(MachSrcLoc { start, end, loc });
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A MachSectionOutput implementation that records only size.
|
||||
pub struct MachSectionSize {
|
||||
/// The starting offset of this section.
|
||||
pub start_offset: CodeOffset,
|
||||
/// The current offset of this section.
|
||||
pub offset: CodeOffset,
|
||||
}
|
||||
|
||||
impl MachSectionSize {
|
||||
/// Create a new size-counting dummy section.
|
||||
pub fn new(start_offset: CodeOffset) -> MachSectionSize {
|
||||
MachSectionSize {
|
||||
start_offset,
|
||||
offset: start_offset,
|
||||
}
|
||||
}
|
||||
|
||||
/// Return the size this section would take if emitted with a real sink.
|
||||
pub fn size(&self) -> CodeOffset {
|
||||
self.offset - self.start_offset
|
||||
}
|
||||
}
|
||||
|
||||
impl MachSectionOutput for MachSectionSize {
|
||||
fn cur_offset_from_start(&self) -> CodeOffset {
|
||||
// All size-counting sections conceptually start at offset 0; this doesn't
|
||||
// matter when counting code size.
|
||||
self.offset
|
||||
}
|
||||
|
||||
fn start_offset(&self) -> CodeOffset {
|
||||
self.start_offset
|
||||
}
|
||||
|
||||
fn put1(&mut self, _: u8) {
|
||||
self.offset += 1;
|
||||
}
|
||||
|
||||
fn put_data(&mut self, data: &[u8]) {
|
||||
self.offset += data.len() as CodeOffset;
|
||||
}
|
||||
|
||||
fn add_reloc(&mut self, _: SourceLoc, _: Reloc, _: &ExternalName, _: Addend) {}
|
||||
|
||||
fn add_trap(&mut self, _: SourceLoc, _: TrapCode) {}
|
||||
|
||||
fn add_call_site(&mut self, _: SourceLoc, _: Opcode) {}
|
||||
|
||||
fn start_srcloc(&mut self, _: SourceLoc) {}
|
||||
|
||||
fn end_srcloc(&mut self) {}
|
||||
}
|
||||
|
||||
/// A relocation resulting from a compilation.
|
||||
pub struct MachReloc {
|
||||
/// The offset at which the relocation applies, *relative to the
|
||||
/// containing section*.
|
||||
pub offset: CodeOffset,
|
||||
/// The original source location.
|
||||
pub srcloc: SourceLoc,
|
||||
/// The kind of relocation.
|
||||
pub kind: Reloc,
|
||||
/// The external symbol / name to which this relocation refers.
|
||||
pub name: ExternalName,
|
||||
/// The addend to add to the symbol value.
|
||||
pub addend: i64,
|
||||
}
|
||||
|
||||
/// A trap record resulting from a compilation.
|
||||
pub struct MachTrap {
|
||||
/// The offset at which the trap instruction occurs, *relative to the
|
||||
/// containing section*.
|
||||
pub offset: CodeOffset,
|
||||
/// The original source location.
|
||||
pub srcloc: SourceLoc,
|
||||
/// The trap code.
|
||||
pub code: TrapCode,
|
||||
}
|
||||
|
||||
/// A call site record resulting from a compilation.
|
||||
pub struct MachCallSite {
|
||||
/// The offset of the call's return address, *relative to the containing section*.
|
||||
pub ret_addr: CodeOffset,
|
||||
/// The original source location.
|
||||
pub srcloc: SourceLoc,
|
||||
/// The call's opcode.
|
||||
pub opcode: Opcode,
|
||||
}
|
||||
|
||||
/// A source-location mapping resulting from a compilation.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct MachSrcLoc {
|
||||
/// The start of the region of code corresponding to a source location.
|
||||
/// This is relative to the start of the function, not to the start of the
|
||||
/// section.
|
||||
pub start: CodeOffset,
|
||||
/// The end of the region of code corresponding to a source location.
|
||||
/// This is relative to the start of the function, not to the start of the
|
||||
/// section.
|
||||
pub end: CodeOffset,
|
||||
/// The source location.
|
||||
pub loc: SourceLoc,
|
||||
}
|
||||
@@ -17,8 +17,7 @@
|
||||
//! See the main module comment in `mod.rs` for more details on the VCode-based
|
||||
//! backend pipeline.
|
||||
|
||||
use crate::entity::SecondaryMap;
|
||||
use crate::ir::{self, Block, SourceLoc};
|
||||
use crate::ir::{self, SourceLoc};
|
||||
use crate::machinst::*;
|
||||
use crate::settings;
|
||||
|
||||
@@ -30,8 +29,6 @@ use regalloc::{
|
||||
|
||||
use alloc::boxed::Box;
|
||||
use alloc::{borrow::Cow, vec::Vec};
|
||||
use log::debug;
|
||||
use smallvec::SmallVec;
|
||||
use std::fmt;
|
||||
use std::iter;
|
||||
use std::string::String;
|
||||
@@ -43,8 +40,8 @@ pub type BlockIndex = u32;
|
||||
|
||||
/// VCodeInst wraps all requirements for a MachInst to be in VCode: it must be
|
||||
/// a `MachInst` and it must be able to emit itself at least to a `SizeCodeSink`.
|
||||
pub trait VCodeInst: MachInst + MachInstEmit<MachSection> + MachInstEmit<MachSectionSize> {}
|
||||
impl<I: MachInst + MachInstEmit<MachSection> + MachInstEmit<MachSectionSize>> VCodeInst for I {}
|
||||
pub trait VCodeInst: MachInst + MachInstEmit {}
|
||||
impl<I: MachInst + MachInstEmit> VCodeInst for I {}
|
||||
|
||||
/// A function in "VCode" (virtualized-register code) form, after lowering.
|
||||
/// This is essentially a standard CFG of basic blocks, where each basic block
|
||||
@@ -80,29 +77,11 @@ pub struct VCode<I: VCodeInst> {
|
||||
/// correspond to each basic block's successors.
|
||||
block_succs: Vec<BlockIx>,
|
||||
|
||||
/// Block indices by IR block.
|
||||
block_by_bb: SecondaryMap<ir::Block, BlockIndex>,
|
||||
|
||||
/// IR block for each VCode Block. The length of this Vec will likely be
|
||||
/// less than the total number of Blocks, because new Blocks (for edge
|
||||
/// splits, for example) are appended during lowering.
|
||||
bb_by_block: Vec<ir::Block>,
|
||||
|
||||
/// Order of block IDs in final generated code.
|
||||
final_block_order: Vec<BlockIndex>,
|
||||
|
||||
/// Final block offsets. Computed during branch finalization and used
|
||||
/// during emission.
|
||||
final_block_offsets: Vec<CodeOffset>,
|
||||
|
||||
/// Size of code, accounting for block layout / alignment.
|
||||
code_size: CodeOffset,
|
||||
/// Block-order information.
|
||||
block_order: BlockLoweringOrder,
|
||||
|
||||
/// ABI object.
|
||||
abi: Box<dyn ABIBody<I = I>>,
|
||||
|
||||
/// The block targeted by fallthrough_returns, if there's one.
|
||||
pub fallthrough_return_block: Option<BlockIndex>,
|
||||
}
|
||||
|
||||
/// A builder for a VCode function body. This builder is designed for the
|
||||
@@ -123,12 +102,8 @@ pub struct VCodeBuilder<I: VCodeInst> {
|
||||
/// In-progress VCode.
|
||||
vcode: VCode<I>,
|
||||
|
||||
/// Current basic block instructions, in reverse order (because blocks are
|
||||
/// built bottom-to-top).
|
||||
bb_insns: SmallVec<[(I, SourceLoc); 32]>,
|
||||
|
||||
/// Current IR-inst instructions, in forward order.
|
||||
ir_inst_insns: SmallVec<[(I, SourceLoc); 4]>,
|
||||
/// Index of the last block-start in the vcode.
|
||||
block_start: InsnIndex,
|
||||
|
||||
/// Start of succs for the current block in the concatenated succs list.
|
||||
succ_start: usize,
|
||||
@@ -139,12 +114,11 @@ pub struct VCodeBuilder<I: VCodeInst> {
|
||||
|
||||
impl<I: VCodeInst> VCodeBuilder<I> {
|
||||
/// Create a new VCodeBuilder.
|
||||
pub fn new(abi: Box<dyn ABIBody<I = I>>) -> VCodeBuilder<I> {
|
||||
let vcode = VCode::new(abi);
|
||||
pub fn new(abi: Box<dyn ABIBody<I = I>>, block_order: BlockLoweringOrder) -> VCodeBuilder<I> {
|
||||
let vcode = VCode::new(abi, block_order);
|
||||
VCodeBuilder {
|
||||
vcode,
|
||||
bb_insns: SmallVec::new(),
|
||||
ir_inst_insns: SmallVec::new(),
|
||||
block_start: 0,
|
||||
succ_start: 0,
|
||||
cur_srcloc: SourceLoc::default(),
|
||||
}
|
||||
@@ -155,14 +129,9 @@ impl<I: VCodeInst> VCodeBuilder<I> {
|
||||
&mut *self.vcode.abi
|
||||
}
|
||||
|
||||
/// Set the fallthrough_return target block for this function. This must be done at most once per
|
||||
/// function.
|
||||
pub fn set_fallthrough_return_block(&mut self, bb: Block) {
|
||||
debug_assert!(
|
||||
self.vcode.fallthrough_return_block.is_none(),
|
||||
"a function must have at most one fallthrough-return instruction"
|
||||
);
|
||||
self.vcode.fallthrough_return_block = Some(self.bb_to_bindex(bb));
|
||||
/// Access to the BlockLoweringOrder object.
|
||||
pub fn block_order(&self) -> &BlockLoweringOrder {
|
||||
&self.vcode.block_order
|
||||
}
|
||||
|
||||
/// Set the type of a VReg.
|
||||
@@ -173,53 +142,17 @@ impl<I: VCodeInst> VCodeBuilder<I> {
|
||||
self.vcode.vreg_types[vreg.get_index()] = ty;
|
||||
}
|
||||
|
||||
/// Return the underlying bb-to-BlockIndex map.
|
||||
pub fn blocks_by_bb(&self) -> &SecondaryMap<ir::Block, BlockIndex> {
|
||||
&self.vcode.block_by_bb
|
||||
}
|
||||
|
||||
/// Initialize the bb-to-BlockIndex map. Returns the first free
|
||||
/// BlockIndex.
|
||||
pub fn init_bb_map(&mut self, blocks: &[ir::Block]) -> BlockIndex {
|
||||
let mut bindex: BlockIndex = 0;
|
||||
for bb in blocks.iter() {
|
||||
self.vcode.block_by_bb[*bb] = bindex;
|
||||
self.vcode.bb_by_block.push(*bb);
|
||||
bindex += 1;
|
||||
}
|
||||
bindex
|
||||
}
|
||||
|
||||
/// Get the BlockIndex for an IR block.
|
||||
pub fn bb_to_bindex(&self, bb: ir::Block) -> BlockIndex {
|
||||
self.vcode.block_by_bb[bb]
|
||||
}
|
||||
|
||||
/// Set the current block as the entry block.
|
||||
pub fn set_entry(&mut self, block: BlockIndex) {
|
||||
self.vcode.entry = block;
|
||||
}
|
||||
|
||||
/// End the current IR instruction. Must be called after pushing any
|
||||
/// instructions and prior to ending the basic block.
|
||||
pub fn end_ir_inst(&mut self) {
|
||||
while let Some(pair) = self.ir_inst_insns.pop() {
|
||||
self.bb_insns.push(pair);
|
||||
}
|
||||
}
|
||||
|
||||
/// End the current basic block. Must be called after emitting vcode insts
|
||||
/// for IR insts and prior to ending the function (building the VCode).
|
||||
pub fn end_bb(&mut self) -> BlockIndex {
|
||||
assert!(self.ir_inst_insns.is_empty());
|
||||
let block_num = self.vcode.block_ranges.len() as BlockIndex;
|
||||
// Push the instructions.
|
||||
let start_idx = self.vcode.insts.len() as InsnIndex;
|
||||
while let Some((i, loc)) = self.bb_insns.pop() {
|
||||
self.vcode.insts.push(i);
|
||||
self.vcode.srclocs.push(loc);
|
||||
}
|
||||
pub fn end_bb(&mut self) {
|
||||
let start_idx = self.block_start;
|
||||
let end_idx = self.vcode.insts.len() as InsnIndex;
|
||||
self.block_start = end_idx;
|
||||
// Add the instruction index range to the list of blocks.
|
||||
self.vcode.block_ranges.push((start_idx, end_idx));
|
||||
// End the successors list.
|
||||
@@ -228,8 +161,6 @@ impl<I: VCodeInst> VCodeBuilder<I> {
|
||||
.block_succ_range
|
||||
.push((self.succ_start, succ_end));
|
||||
self.succ_start = succ_end;
|
||||
|
||||
block_num
|
||||
}
|
||||
|
||||
/// Push an instruction for the current BB and current IR inst within the BB.
|
||||
@@ -237,19 +168,27 @@ impl<I: VCodeInst> VCodeBuilder<I> {
|
||||
match insn.is_term() {
|
||||
MachTerminator::None | MachTerminator::Ret => {}
|
||||
MachTerminator::Uncond(target) => {
|
||||
self.vcode.block_succs.push(BlockIx::new(target));
|
||||
self.vcode.block_succs.push(BlockIx::new(target.get()));
|
||||
}
|
||||
MachTerminator::Cond(true_branch, false_branch) => {
|
||||
self.vcode.block_succs.push(BlockIx::new(true_branch));
|
||||
self.vcode.block_succs.push(BlockIx::new(false_branch));
|
||||
self.vcode.block_succs.push(BlockIx::new(true_branch.get()));
|
||||
self.vcode
|
||||
.block_succs
|
||||
.push(BlockIx::new(false_branch.get()));
|
||||
}
|
||||
MachTerminator::Indirect(targets) => {
|
||||
for target in targets {
|
||||
self.vcode.block_succs.push(BlockIx::new(*target));
|
||||
self.vcode.block_succs.push(BlockIx::new(target.get()));
|
||||
}
|
||||
}
|
||||
}
|
||||
self.ir_inst_insns.push((insn, self.cur_srcloc));
|
||||
self.vcode.insts.push(insn);
|
||||
self.vcode.srclocs.push(self.cur_srcloc);
|
||||
}
|
||||
|
||||
/// Get the current source location.
|
||||
pub fn get_srcloc(&self) -> SourceLoc {
|
||||
self.cur_srcloc
|
||||
}
|
||||
|
||||
/// Set the current source location.
|
||||
@@ -259,8 +198,6 @@ impl<I: VCodeInst> VCodeBuilder<I> {
|
||||
|
||||
/// Build the final VCode.
|
||||
pub fn build(self) -> VCode<I> {
|
||||
assert!(self.ir_inst_insns.is_empty());
|
||||
assert!(self.bb_insns.is_empty());
|
||||
self.vcode
|
||||
}
|
||||
}
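A hedged sketch of how a lowering driver might use this builder, assuming `abi`, `block_order`, and per-block vectors of already-lowered instructions exist (none are constructed here, and the method names follow the signatures shown above): blocks are finished in their final lowered order, one `end_bb()` call per block, before `build()` produces the `VCode`.

let mut builder: VCodeBuilder<HypoInst> = VCodeBuilder::new(abi, block_order);
builder.set_entry(0); // assumption: the entry block is lowered first
for lowered_insts in lowered_blocks {
    for inst in lowered_insts {
        builder.push(inst); // successors are recorded via `is_term()` as shown above
    }
    builder.end_bb();
}
let vcode = builder.build();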
|
||||
@@ -282,35 +219,9 @@ fn is_redundant_move<I: VCodeInst>(insn: &I) -> bool {
|
||||
}
|
||||
}
|
||||
|
||||
fn is_trivial_jump_block<I: VCodeInst>(vcode: &VCode<I>, block: BlockIndex) -> Option<BlockIndex> {
|
||||
let range = vcode.block_insns(BlockIx::new(block));
|
||||
|
||||
debug!(
|
||||
"is_trivial_jump_block: block {} has len {}",
|
||||
block,
|
||||
range.len()
|
||||
);
|
||||
|
||||
if range.len() != 1 {
|
||||
return None;
|
||||
}
|
||||
let insn = range.first();
|
||||
|
||||
debug!(
|
||||
" -> only insn is: {:?} with terminator {:?}",
|
||||
vcode.get_insn(insn),
|
||||
vcode.get_insn(insn).is_term()
|
||||
);
|
||||
|
||||
match vcode.get_insn(insn).is_term() {
|
||||
MachTerminator::Uncond(target) => Some(target),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
impl<I: VCodeInst> VCode<I> {
|
||||
/// New empty VCode.
|
||||
fn new(abi: Box<dyn ABIBody<I = I>>) -> VCode<I> {
|
||||
fn new(abi: Box<dyn ABIBody<I = I>>, block_order: BlockLoweringOrder) -> VCode<I> {
|
||||
VCode {
|
||||
liveins: abi.liveins(),
|
||||
liveouts: abi.liveouts(),
|
||||
@@ -321,13 +232,8 @@ impl<I: VCodeInst> VCode<I> {
|
||||
block_ranges: vec![],
|
||||
block_succ_range: vec![],
|
||||
block_succs: vec![],
|
||||
block_by_bb: SecondaryMap::with_default(0),
|
||||
bb_by_block: vec![],
|
||||
final_block_order: vec![],
|
||||
final_block_offsets: vec![],
|
||||
code_size: 0,
|
||||
block_order,
|
||||
abi,
|
||||
fallthrough_return_block: None,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -367,8 +273,6 @@ impl<I: VCodeInst> VCode<I> {
|
||||
/// instructions including spliced fill/reload/move instructions, and replace
|
||||
/// the VCode with them.
|
||||
pub fn replace_insns_from_regalloc(&mut self, result: RegAllocResult<Self>) {
|
||||
self.final_block_order = compute_final_block_order(self);
|
||||
|
||||
// Record the spillslot count and clobbered registers for the ABI/stack
|
||||
// setup code.
|
||||
self.abi.set_num_spillslots(result.num_spill_slots as usize);
|
||||
@@ -383,11 +287,12 @@ impl<I: VCodeInst> VCode<I> {
|
||||
let mut final_block_ranges = vec![(0, 0); self.num_blocks()];
|
||||
let mut final_srclocs = vec![];
|
||||
|
||||
for block in &self.final_block_order {
|
||||
let (start, end) = block_ranges[*block as usize];
|
||||
for block in 0..self.num_blocks() {
|
||||
let block = block as BlockIndex;
|
||||
let (start, end) = block_ranges[block as usize];
|
||||
let final_start = final_insns.len() as InsnIndex;
|
||||
|
||||
if *block == self.entry {
|
||||
if block == self.entry {
|
||||
// Start with the prologue.
|
||||
let prologue = self.abi.gen_prologue();
|
||||
let len = prologue.len();
|
||||
@@ -429,7 +334,7 @@ impl<I: VCodeInst> VCode<I> {
|
||||
}
|
||||
|
||||
let final_end = final_insns.len() as InsnIndex;
|
||||
final_block_ranges[*block as usize] = (final_start, final_end);
|
||||
final_block_ranges[block as usize] = (final_start, final_end);
|
||||
}
|
||||
|
||||
debug_assert!(final_insns.len() == final_srclocs.len());
|
||||
@@ -439,175 +344,68 @@ impl<I: VCodeInst> VCode<I> {
|
||||
self.block_ranges = final_block_ranges;
|
||||
}
|
||||
|
||||
/// Removes redundant branches, rewriting targets to point directly to the
|
||||
/// ultimate block at the end of a chain of trivial one-target jumps.
|
||||
pub fn remove_redundant_branches(&mut self) {
|
||||
// For each block, compute the actual target block, looking through up to one
|
||||
// block with single-target jumps (this will remove empty edge blocks inserted
|
||||
// by phi-lowering).
|
||||
let block_rewrites: Vec<BlockIndex> = (0..self.num_blocks() as u32)
|
||||
.map(|bix| is_trivial_jump_block(self, bix).unwrap_or(bix))
|
||||
.collect();
|
||||
let mut refcounts: Vec<usize> = vec![0; self.num_blocks()];
|
||||
|
||||
debug!(
|
||||
"remove_redundant_branches: block_rewrites = {:?}",
|
||||
block_rewrites
|
||||
);
|
||||
|
||||
refcounts[self.entry as usize] = 1;
|
||||
|
||||
for block in 0..self.num_blocks() as u32 {
|
||||
for insn in self.block_insns(BlockIx::new(block)) {
|
||||
self.get_insn_mut(insn)
|
||||
.with_block_rewrites(&block_rewrites[..]);
|
||||
match self.get_insn(insn).is_term() {
|
||||
MachTerminator::Uncond(bix) => {
|
||||
refcounts[bix as usize] += 1;
|
||||
}
|
||||
MachTerminator::Cond(bix1, bix2) => {
|
||||
refcounts[bix1 as usize] += 1;
|
||||
refcounts[bix2 as usize] += 1;
|
||||
}
|
||||
MachTerminator::Indirect(blocks) => {
|
||||
for block in blocks {
|
||||
refcounts[*block as usize] += 1;
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let deleted: Vec<bool> = refcounts.iter().map(|r| *r == 0).collect();
|
||||
|
||||
let block_order = std::mem::replace(&mut self.final_block_order, vec![]);
|
||||
self.final_block_order = block_order
|
||||
.into_iter()
|
||||
.filter(|b| !deleted[*b as usize])
|
||||
.collect();
|
||||
|
||||
// Rewrite successor information based on the block-rewrite map.
|
||||
for succ in &mut self.block_succs {
|
||||
let new_succ = block_rewrites[succ.get() as usize];
|
||||
*succ = BlockIx::new(new_succ);
|
||||
}
|
||||
}
|
||||
|
||||
/// Mutate branch instructions to (i) lower two-way condbrs to one-way,
|
||||
/// depending on fallthrough; and (ii) use concrete offsets.
|
||||
pub fn finalize_branches(&mut self)
|
||||
/// Emit the instructions to a `MachBuffer`, containing fixed-up code and external
|
||||
/// reloc/trap/etc. records ready for use.
|
||||
pub fn emit(&self) -> MachBuffer<I>
|
||||
where
|
||||
I: MachInstEmit<MachSectionSize>,
|
||||
I: MachInstEmit,
|
||||
{
|
||||
// Compute fallthrough block, indexed by block.
|
||||
let num_final_blocks = self.final_block_order.len();
|
||||
let mut block_fallthrough: Vec<Option<BlockIndex>> = vec![None; self.num_blocks()];
|
||||
for i in 0..(num_final_blocks - 1) {
|
||||
let from = self.final_block_order[i];
|
||||
let to = self.final_block_order[i + 1];
|
||||
block_fallthrough[from as usize] = Some(to);
|
||||
}
|
||||
|
||||
// Pass over VCode instructions and finalize two-way branches into
|
||||
// one-way branches with fallthrough.
|
||||
for block in 0..self.num_blocks() {
|
||||
let next_block = block_fallthrough[block];
|
||||
let (start, end) = self.block_ranges[block];
|
||||
|
||||
for iix in start..end {
|
||||
let insn = &mut self.insts[iix as usize];
|
||||
insn.with_fallthrough_block(next_block);
|
||||
}
|
||||
}
|
||||
|
||||
let flags = self.abi.flags();
|
||||
|
||||
// Compute block offsets.
|
||||
let mut code_section = MachSectionSize::new(0);
|
||||
let mut block_offsets = vec![0; self.num_blocks()];
|
||||
let mut buffer = MachBuffer::new();
|
||||
let mut state = Default::default();
|
||||
for &block in &self.final_block_order {
|
||||
code_section.offset = I::align_basic_block(code_section.offset);
|
||||
block_offsets[block as usize] = code_section.offset;
|
||||
let (start, end) = self.block_ranges[block as usize];
|
||||
for iix in start..end {
|
||||
self.insts[iix as usize].emit(&mut code_section, flags, &mut state);
|
||||
}
|
||||
}
|
||||
|
||||
// We now have the section layout.
|
||||
self.final_block_offsets = block_offsets;
|
||||
self.code_size = code_section.size();
|
||||
|
||||
// Update branches with known block offsets. This looks like the
|
||||
// traversal above, but (i) does not update block_offsets, rather uses
|
||||
// it (so forward references are now possible), and (ii) mutates the
|
||||
// instructions.
|
||||
let mut code_section = MachSectionSize::new(0);
|
||||
let mut state = Default::default();
|
||||
for &block in &self.final_block_order {
|
||||
code_section.offset = I::align_basic_block(code_section.offset);
|
||||
let (start, end) = self.block_ranges[block as usize];
|
||||
for iix in start..end {
|
||||
self.insts[iix as usize]
|
||||
.with_block_offsets(code_section.offset, &self.final_block_offsets[..]);
|
||||
self.insts[iix as usize].emit(&mut code_section, flags, &mut state);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Emit the instructions to a list of sections.
|
||||
pub fn emit(&self) -> MachSections
|
||||
where
|
||||
I: MachInstEmit<MachSection>,
|
||||
{
|
||||
let mut sections = MachSections::new();
|
||||
let code_idx = sections.add_section(0, self.code_size);
|
||||
let code_section = sections.get_section(code_idx);
|
||||
let mut state = Default::default();
|
||||
buffer.reserve_labels_for_blocks(self.num_blocks() as BlockIndex); // first N MachLabels are simply block indices.
|
||||
|
||||
let flags = self.abi.flags();
|
||||
let mut cur_srcloc = None;
|
||||
for &block in &self.final_block_order {
|
||||
let new_offset = I::align_basic_block(code_section.cur_offset_from_start());
|
||||
while new_offset > code_section.cur_offset_from_start() {
|
||||
for block in 0..self.num_blocks() {
|
||||
let block = block as BlockIndex;
|
||||
let new_offset = I::align_basic_block(buffer.cur_offset());
|
||||
while new_offset > buffer.cur_offset() {
|
||||
// Pad with NOPs up to the aligned block offset.
|
||||
let nop = I::gen_nop((new_offset - code_section.cur_offset_from_start()) as usize);
|
||||
nop.emit(code_section, flags, &mut Default::default());
|
||||
let nop = I::gen_nop((new_offset - buffer.cur_offset()) as usize);
|
||||
nop.emit(&mut buffer, flags, &mut Default::default());
|
||||
}
|
||||
assert_eq!(code_section.cur_offset_from_start(), new_offset);
|
||||
assert_eq!(buffer.cur_offset(), new_offset);
|
||||
|
||||
let (start, end) = self.block_ranges[block as usize];
|
||||
buffer.bind_label(MachLabel::from_block(block));
|
||||
for iix in start..end {
|
||||
let srcloc = self.srclocs[iix as usize];
|
||||
if cur_srcloc != Some(srcloc) {
|
||||
if cur_srcloc.is_some() {
|
||||
code_section.end_srcloc();
|
||||
buffer.end_srcloc();
|
||||
}
|
||||
code_section.start_srcloc(srcloc);
|
||||
buffer.start_srcloc(srcloc);
|
||||
cur_srcloc = Some(srcloc);
|
||||
}
|
||||
|
||||
self.insts[iix as usize].emit(code_section, flags, &mut state);
|
||||
self.insts[iix as usize].emit(&mut buffer, flags, &mut state);
|
||||
}
|
||||
|
||||
if cur_srcloc.is_some() {
|
||||
code_section.end_srcloc();
|
||||
buffer.end_srcloc();
|
||||
cur_srcloc = None;
|
||||
}
|
||||
|
||||
// Do we need an island? Get the worst-case size of the next BB and see if, having
|
||||
// emitted that many bytes, we will be beyond the deadline.
|
||||
if block < (self.num_blocks() - 1) as BlockIndex {
|
||||
let next_block = block + 1;
|
||||
let next_block_range = self.block_ranges[next_block as usize];
|
||||
let next_block_size = next_block_range.1 - next_block_range.0;
|
||||
let worst_case_next_bb = I::worst_case_size() * next_block_size;
|
||||
if buffer.island_needed(worst_case_next_bb) {
|
||||
buffer.emit_island();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
sections
|
||||
buffer
|
||||
}
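The island check above is just offset arithmetic; this self-contained sketch uses made-up numbers to show the decision. Conceptually, `island_needed` asks whether finishing the next block at worst-case size could push emission past the earliest outstanding fixup deadline.

// Assumed per-ISA worst-case instruction size and a made-up next-block length.
let worst_case_insn_size: u32 = 20;
let next_block_insns: u32 = 37;
let worst_case_next_bb = worst_case_insn_size * next_block_insns; // 740 bytes

// Current emission point, and the deadline of the tightest pending fixup
// (e.g. an AArch64-style conditional branch with +/-1MB reach recorded near offset 0).
let cur_offset: u32 = 0x000f_ff00;
let earliest_deadline: u32 = 0x0010_0000;

// If the worst-case end of the next block would overshoot the deadline,
// emit an island (with veneers) now, before binding the next block's label.
let island_needed = cur_offset + worst_case_next_bb > earliest_deadline;
assert!(island_needed); // 1_048_320 + 740 > 1_048_576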
|
||||
|
||||
/// Get the IR block for a BlockIndex, if one exists.
|
||||
pub fn bindex_to_bb(&self, block: BlockIndex) -> Option<ir::Block> {
|
||||
if (block as usize) < self.bb_by_block.len() {
|
||||
Some(self.bb_by_block[block as usize])
|
||||
} else {
|
||||
None
|
||||
}
|
||||
self.block_order.lowered_order()[block as usize].orig_block()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -712,7 +510,6 @@ impl<I: VCodeInst> fmt::Debug for VCode<I> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
writeln!(f, "VCode_Debug {{")?;
|
||||
writeln!(f, " Entry block: {}", self.entry)?;
|
||||
writeln!(f, " Final block order: {:?}", self.final_block_order)?;
|
||||
|
||||
for block in 0..self.num_blocks() {
|
||||
writeln!(f, "Block {}:", block,)?;
|
||||
@@ -736,52 +533,21 @@ impl<I: VCodeInst + ShowWithRRU> ShowWithRRU for VCode<I> {
|
||||
fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String {
|
||||
use std::fmt::Write;
|
||||
|
||||
// Calculate an order in which to display the blocks. This is the same
|
||||
// as final_block_order, but also includes blocks which are in the
|
||||
// representation but not in final_block_order.
|
||||
let mut display_order = Vec::<usize>::new();
|
||||
// First display blocks in `final_block_order`
|
||||
for bix in &self.final_block_order {
|
||||
assert!((*bix as usize) < self.num_blocks());
|
||||
display_order.push(*bix as usize);
|
||||
}
|
||||
// Now also take care of those not listed in `final_block_order`.
|
||||
// This is quadratic, but it's also debug-only code.
|
||||
for bix in 0..self.num_blocks() {
|
||||
if display_order.contains(&bix) {
|
||||
continue;
|
||||
}
|
||||
display_order.push(bix);
|
||||
}
|
||||
|
||||
let mut s = String::new();
|
||||
write!(&mut s, "VCode_ShowWithRRU {{{{\n").unwrap();
|
||||
write!(&mut s, " Entry block: {}\n", self.entry).unwrap();
|
||||
write!(
|
||||
&mut s,
|
||||
" Final block order: {:?}\n",
|
||||
self.final_block_order
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
for i in 0..self.num_blocks() {
|
||||
let block = display_order[i];
|
||||
let block = i as BlockIndex;
|
||||
|
||||
let omitted = if !self.final_block_order.is_empty() && i >= self.final_block_order.len()
|
||||
{
|
||||
"** OMITTED **"
|
||||
} else {
|
||||
""
|
||||
};
|
||||
|
||||
write!(&mut s, "Block {}: {}\n", block, omitted).unwrap();
|
||||
if let Some(bb) = self.bindex_to_bb(block as BlockIndex) {
|
||||
write!(&mut s, "Block {}:\n", block).unwrap();
|
||||
if let Some(bb) = self.bindex_to_bb(block) {
|
||||
write!(&mut s, " (original IR block: {})\n", bb).unwrap();
|
||||
}
|
||||
for succ in self.succs(block as BlockIndex) {
|
||||
for succ in self.succs(block) {
|
||||
write!(&mut s, " (successor: Block {})\n", succ.get()).unwrap();
|
||||
}
|
||||
let (start, end) = self.block_ranges[block];
|
||||
let (start, end) = self.block_ranges[block as usize];
|
||||
write!(&mut s, " (instruction range: {} .. {})\n", start, end).unwrap();
|
||||
for inst in start..end {
|
||||
write!(
|
||||
|
||||
@@ -1,52 +0,0 @@
|
||||
//! A pass that computes the number of uses of any given instruction.
|
||||
|
||||
use crate::entity::SecondaryMap;
|
||||
use crate::ir::dfg::ValueDef;
|
||||
use crate::ir::Value;
|
||||
use crate::ir::{DataFlowGraph, Function, Inst};
|
||||
|
||||
/// Auxiliary data structure that counts the number of uses of any given
|
||||
/// instruction in a Function. This is used during instruction selection
|
||||
/// to essentially do incremental DCE: when an instruction is no longer
|
||||
/// needed because its computation has been isel'd into another machine
|
||||
/// instruction at every use site, we can skip it.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct NumUses {
|
||||
uses: SecondaryMap<Inst, u32>,
|
||||
}
|
||||
|
||||
impl NumUses {
|
||||
fn new() -> NumUses {
|
||||
NumUses {
|
||||
uses: SecondaryMap::with_default(0),
|
||||
}
|
||||
}
|
||||
|
||||
/// Compute the NumUses analysis result for a function.
|
||||
pub fn compute(func: &Function) -> NumUses {
|
||||
let mut uses = NumUses::new();
|
||||
for bb in func.layout.blocks() {
|
||||
for inst in func.layout.block_insts(bb) {
|
||||
for arg in func.dfg.inst_args(inst) {
|
||||
let v = func.dfg.resolve_aliases(*arg);
|
||||
uses.add_value(&func.dfg, v);
|
||||
}
|
||||
}
|
||||
}
|
||||
uses
|
||||
}
|
||||
|
||||
fn add_value(&mut self, dfg: &DataFlowGraph, v: Value) {
|
||||
match dfg.value_def(v) {
|
||||
ValueDef::Result(inst, _) => {
|
||||
self.uses[inst] += 1;
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
/// Take the complete uses map, consuming this analysis result.
|
||||
pub fn take_uses(self) -> SecondaryMap<Inst, u32> {
|
||||
self.uses
|
||||
}
|
||||
}
|
||||
@@ -1,7 +1,7 @@
|
||||
test vcode
|
||||
target aarch64
|
||||
|
||||
function %f(i64, i64) -> i64 {
|
||||
function %f1(i64, i64) -> i64 {
|
||||
block0(v0: i64, v1: i64):
|
||||
v2 = iadd.i64 v0, v1
|
||||
return v2
|
||||
@@ -15,7 +15,7 @@ block0(v0: i64, v1: i64):
|
||||
; nextln: ret
|
||||
|
||||
|
||||
function %f(i64, i64) -> i64 {
|
||||
function %f2(i64, i64) -> i64 {
|
||||
block0(v0: i64, v1: i64):
|
||||
v2 = isub.i64 v0, v1
|
||||
return v2
|
||||
@@ -28,7 +28,7 @@ block0(v0: i64, v1: i64):
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
function %f(i64, i64) -> i64 {
|
||||
function %f3(i64, i64) -> i64 {
|
||||
block0(v0: i64, v1: i64):
|
||||
v2 = imul.i64 v0, v1
|
||||
return v2
|
||||
@@ -41,7 +41,7 @@ block0(v0: i64, v1: i64):
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
function %f(i64, i64) -> i64 {
|
||||
function %f4(i64, i64) -> i64 {
|
||||
block0(v0: i64, v1: i64):
|
||||
v2 = umulhi.i64 v0, v1
|
||||
return v2
|
||||
@@ -54,7 +54,7 @@ block0(v0: i64, v1: i64):
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
function %f(i64, i64) -> i64 {
|
||||
function %f5(i64, i64) -> i64 {
|
||||
block0(v0: i64, v1: i64):
|
||||
v2 = smulhi.i64 v0, v1
|
||||
return v2
|
||||
@@ -67,7 +67,7 @@ block0(v0: i64, v1: i64):
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
function %f(i64, i64) -> i64 {
|
||||
function %f6(i64, i64) -> i64 {
|
||||
block0(v0: i64, v1: i64):
|
||||
v2 = sdiv.i64 v0, v1
|
||||
return v2
|
||||
@@ -87,7 +87,7 @@ block0(v0: i64, v1: i64):
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
function %f(i64) -> i64 {
|
||||
function %f7(i64) -> i64 {
|
||||
block0(v0: i64):
|
||||
v1 = iconst.i64 2
|
||||
v2 = sdiv.i64 v0, v1
|
||||
@@ -109,7 +109,7 @@ block0(v0: i64):
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
function %f(i64, i64) -> i64 {
|
||||
function %f8(i64, i64) -> i64 {
|
||||
block0(v0: i64, v1: i64):
|
||||
v2 = udiv.i64 v0, v1
|
||||
return v2
|
||||
@@ -124,7 +124,7 @@ block0(v0: i64, v1: i64):
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
function %f(i64) -> i64 {
|
||||
function %f9(i64) -> i64 {
|
||||
block0(v0: i64):
|
||||
v1 = iconst.i64 2
|
||||
v2 = udiv.i64 v0, v1
|
||||
@@ -141,7 +141,7 @@ block0(v0: i64):
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
function %f(i64, i64) -> i64 {
|
||||
function %f10(i64, i64) -> i64 {
|
||||
block0(v0: i64, v1: i64):
|
||||
v2 = srem.i64 v0, v1
|
||||
return v2
|
||||
@@ -157,7 +157,7 @@ block0(v0: i64, v1: i64):
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
function %f(i64, i64) -> i64 {
|
||||
function %f11(i64, i64) -> i64 {
|
||||
block0(v0: i64, v1: i64):
|
||||
v2 = urem.i64 v0, v1
|
||||
return v2
|
||||
@@ -174,7 +174,7 @@ block0(v0: i64, v1: i64):
|
||||
; nextln: ret
|
||||
|
||||
|
||||
function %f(i32, i32) -> i32 {
|
||||
function %f12(i32, i32) -> i32 {
|
||||
block0(v0: i32, v1: i32):
|
||||
v2 = sdiv.i32 v0, v1
|
||||
return v2
|
||||
@@ -195,7 +195,7 @@ block0(v0: i32, v1: i32):
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
function %f(i32) -> i32 {
|
||||
function %f13(i32) -> i32 {
|
||||
block0(v0: i32):
|
||||
v1 = iconst.i32 2
|
||||
v2 = sdiv.i32 v0, v1
|
||||
@@ -204,9 +204,8 @@ block0(v0: i32):
|
||||
|
||||
; check: stp fp, lr, [sp, #-16]!
|
||||
; nextln: mov fp, sp
|
||||
; nextln: mov x1, x0
|
||||
; nextln: sxtw x1, w0
|
||||
; nextln: movz x0, #2
|
||||
; nextln: sxtw x1, w1
|
||||
; nextln: sxtw x2, w0
|
||||
; nextln: sdiv x0, x1, x2
|
||||
; nextln: cbz x2, 20
|
||||
@@ -219,7 +218,7 @@ block0(v0: i32):
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
function %f(i32, i32) -> i32 {
|
||||
function %f14(i32, i32) -> i32 {
|
||||
block0(v0: i32, v1: i32):
|
||||
v2 = udiv.i32 v0, v1
|
||||
return v2
|
||||
@@ -236,7 +235,8 @@ block0(v0: i32, v1: i32):
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
function %f(i32) -> i32 {
|
||||
|
||||
function %f15(i32) -> i32 {
|
||||
block0(v0: i32):
|
||||
v1 = iconst.i32 2
|
||||
v2 = udiv.i32 v0, v1
|
||||
@@ -245,9 +245,8 @@ block0(v0: i32):
|
||||
|
||||
; check: stp fp, lr, [sp, #-16]!
|
||||
; nextln: mov fp, sp
|
||||
; nextln: movz x1, #2
|
||||
; nextln: mov w0, w0
|
||||
; nextln: mov w1, w1
|
||||
; nextln: movz x1, #2
|
||||
; nextln: udiv x0, x0, x1
|
||||
; nextln: cbnz x1, 8
|
||||
; nextln: udf
|
||||
@@ -255,7 +254,7 @@ block0(v0: i32):
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
function %f(i32, i32) -> i32 {
|
||||
function %f16(i32, i32) -> i32 {
|
||||
block0(v0: i32, v1: i32):
|
||||
v2 = srem.i32 v0, v1
|
||||
return v2
|
||||
@@ -273,7 +272,7 @@ block0(v0: i32, v1: i32):
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
function %f(i32, i32) -> i32 {
|
||||
function %f17(i32, i32) -> i32 {
|
||||
block0(v0: i32, v1: i32):
|
||||
v2 = urem.i32 v0, v1
|
||||
return v2
|
||||
@@ -291,7 +290,7 @@ block0(v0: i32, v1: i32):
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
function %f(i64, i64) -> i64 {
|
||||
function %f18(i64, i64) -> i64 {
|
||||
block0(v0: i64, v1: i64):
|
||||
v2 = band.i64 v0, v1
|
||||
return v2
|
||||
@@ -304,7 +303,7 @@ block0(v0: i64, v1: i64):
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
function %f(i64, i64) -> i64 {
|
||||
function %f19(i64, i64) -> i64 {
|
||||
block0(v0: i64, v1: i64):
|
||||
v2 = bor.i64 v0, v1
|
||||
return v2
|
||||
@@ -317,7 +316,7 @@ block0(v0: i64, v1: i64):
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
function %f(i64, i64) -> i64 {
|
||||
function %f20(i64, i64) -> i64 {
|
||||
block0(v0: i64, v1: i64):
|
||||
v2 = bxor.i64 v0, v1
|
||||
return v2
|
||||
@@ -330,7 +329,7 @@ block0(v0: i64, v1: i64):
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
function %f(i64, i64) -> i64 {
|
||||
function %f21(i64, i64) -> i64 {
|
||||
block0(v0: i64, v1: i64):
|
||||
v2 = band_not.i64 v0, v1
|
||||
return v2
|
||||
@@ -343,7 +342,7 @@ block0(v0: i64, v1: i64):
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
function %f(i64, i64) -> i64 {
|
||||
function %f22(i64, i64) -> i64 {
|
||||
block0(v0: i64, v1: i64):
|
||||
v2 = bor_not.i64 v0, v1
|
||||
return v2
|
||||
@@ -356,7 +355,7 @@ block0(v0: i64, v1: i64):
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
function %f(i64, i64) -> i64 {
|
||||
function %f23(i64, i64) -> i64 {
|
||||
block0(v0: i64, v1: i64):
|
||||
v2 = bxor_not.i64 v0, v1
|
||||
return v2
|
||||
@@ -369,7 +368,7 @@ block0(v0: i64, v1: i64):
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
function %f(i64, i64) -> i64 {
|
||||
function %f24(i64, i64) -> i64 {
|
||||
block0(v0: i64, v1: i64):
|
||||
v2 = bnot.i64 v0
|
||||
return v2
|
||||
|
||||
@@ -30,17 +30,18 @@ block2:
|
||||
return v5
|
||||
}
|
||||
|
||||
; check: Block 0:
|
||||
; check: stp fp, lr, [sp, #-16]!
|
||||
; nextln: mov fp, sp
|
||||
; nextln: subs xzr, x0, x1
|
||||
; nextln: b.eq 20
|
||||
; check: Block 2:
|
||||
; check: movz x0, #2
|
||||
; nextln: b.eq label1 ; b label2
|
||||
; check: Block 1:
|
||||
; check: movz x0, #1
|
||||
; nextln: mov sp, fp
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
; check: Block 1:
|
||||
; check: movz x0, #1
|
||||
; check: Block 2:
|
||||
; check: movz x0, #2
|
||||
; nextln: mov sp, fp
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
@@ -30,15 +30,15 @@ block5(v5: i64):
|
||||
|
||||
; check: subs wzr, w0, #3
|
||||
; nextln: b.hs
|
||||
; nextln: adr x2, pc+16 ; ldrsw x1, [x2, x0, LSL 2] ; add x2, x2, x1 ; br x2 ; jt_entries
|
||||
; nextln: adr x1, pc+16 ; ldrsw x2, [x1, x0, LSL 2] ; add x1, x1, x2 ; br x1 ; jt_entries
|
||||
|
||||
; check: movz x1, #3
|
||||
; check: movz x1, #1
|
||||
; nextln: b
|
||||
|
||||
; check: movz x1, #2
|
||||
; nextln: b
|
||||
|
||||
; check: movz x1, #1
|
||||
; check: movz x1, #3
|
||||
|
||||
; check: add x0, x0, x1
|
||||
|
||||
|
||||
@@ -25,10 +25,10 @@ block0(v0: i8, v1: i8):
|
||||
|
||||
; check: stp fp, lr, [sp, #-16]!
|
||||
; nextln: mov fp, sp
|
||||
; nextln: uxtb x0, w0
|
||||
; nextln: uxtb x1, w1
|
||||
; nextln: mov v0.d[0], x0
|
||||
; nextln: mov v1.d[0], x1
|
||||
; nextln: uxtb x2, w0
|
||||
; nextln: uxtb x0, w1
|
||||
; nextln: mov v0.d[0], x2
|
||||
; nextln: mov v1.d[0], x0
|
||||
; nextln: uqadd d0, d0, d1
|
||||
; nextln: mov x0, v0.d[0]
|
||||
; nextln: mov sp, fp
|
||||
|
||||
@@ -368,10 +368,10 @@ block0(v0: i16):
|
||||
|
||||
; check: stp fp, lr, [sp, #-16]!
|
||||
; nextln: mov fp, sp
|
||||
; nextln: uxth w0, w0
|
||||
; nextln: lsr w1, w0, #6
|
||||
; nextln: lsl w0, w0, #10
|
||||
; nextln: orr w0, w0, w1
|
||||
; nextln: uxth w1, w0
|
||||
; nextln: lsr w0, w1, #6
|
||||
; nextln: lsl w1, w1, #10
|
||||
; nextln: orr w0, w1, w0
|
||||
; nextln: mov sp, fp
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
@@ -385,10 +385,10 @@ block0(v0: i8):
|
||||
|
||||
; check: stp fp, lr, [sp, #-16]!
|
||||
; nextln: mov fp, sp
|
||||
; nextln: uxtb w0, w0
|
||||
; nextln: lsr w1, w0, #5
|
||||
; nextln: lsl w0, w0, #3
|
||||
; nextln: orr w0, w0, w1
|
||||
; nextln: uxtb w1, w0
|
||||
; nextln: lsr w0, w1, #5
|
||||
; nextln: lsl w1, w1, #3
|
||||
; nextln: orr w0, w1, w0
|
||||
; nextln: mov sp, fp
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
@@ -93,7 +93,7 @@ use crate::compilation::{
|
||||
use crate::func_environ::{get_func_name, FuncEnvironment};
|
||||
use crate::{CacheConfig, FunctionBodyData, ModuleLocal, ModuleTranslation, Tunables};
|
||||
use cranelift_codegen::ir::{self, ExternalName};
|
||||
use cranelift_codegen::machinst::sections::MachSrcLoc;
|
||||
use cranelift_codegen::machinst::buffer::MachSrcLoc;
|
||||
use cranelift_codegen::print_errors::pretty_error;
|
||||
use cranelift_codegen::{binemit, isa, Context};
|
||||
use cranelift_entity::PrimaryMap;
|
||||
@@ -215,7 +215,7 @@ fn get_function_address_map<'data>(
|
||||
if let Some(ref mcr) = &context.mach_compile_result {
|
||||
// New-style backend: we have a `MachCompileResult` that will give us `MachSrcLoc` mapping
|
||||
// tuples.
|
||||
for &MachSrcLoc { start, end, loc } in mcr.sections.get_srclocs_sorted() {
|
||||
for &MachSrcLoc { start, end, loc } in mcr.buffer.get_srclocs_sorted() {
|
||||
instructions.push(InstructionAddressMap {
|
||||
srcloc: loc,
|
||||
code_offset: start as usize,
|
||||
|
||||