Merge pull request #1718 from cfallin/machinst-codebuffer

Rework of MachInst isel, branch fixups and lowering, and block ordering.
Chris Fallin
2020-05-19 07:17:22 -07:00
committed by GitHub
32 changed files with 3465 additions and 2353 deletions

View File

@@ -227,7 +227,7 @@ impl Context {
let _tt = timing::binemit();
let mut sink = MemoryCodeSink::new(mem, relocs, traps, stackmaps);
if let Some(ref result) = &self.mach_compile_result {
result.sections.emit(&mut sink);
result.buffer.emit(&mut sink);
} else {
isa.emit_function_to_memory(&self.func, &mut sink);
}

View File

@@ -40,3 +40,24 @@ pub fn has_side_effect(func: &Function, inst: Inst) -> bool {
let opcode = data.opcode();
trivially_has_side_effects(opcode) || is_load_with_defined_trapping(opcode, data)
}
/// Does the given instruction have any side effect as per [has_side_effect], or is it a load?
pub fn has_side_effect_or_load(func: &Function, inst: Inst) -> bool {
has_side_effect(func, inst) || func.dfg[inst].opcode().can_load()
}
/// Is the given instruction a constant value (`iconst`, `f32const`, `f64const`, `bconst`, or
/// `null`) that can be represented in 64 bits?
pub fn is_constant_64bit(func: &Function, inst: Inst) -> Option<u64> {
let data = &func.dfg[inst];
if data.opcode() == Opcode::Null {
return Some(0);
}
match data {
&InstructionData::UnaryImm { imm, .. } => Some(imm.bits() as u64),
&InstructionData::UnaryIeee32 { imm, .. } => Some(imm.bits() as u64),
&InstructionData::UnaryIeee64 { imm, .. } => Some(imm.bits()),
&InstructionData::UnaryBool { imm, .. } => Some(if imm { 1 } else { 0 }),
_ => None,
}
}
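A small sketch of how a caller might combine these two predicates; the wrapper name is hypothetical and assumes this module's existing imports:

/// Hypothetical caller: a value may be rematerialized at each use site only if
/// it is a pure constant expressible in 64 bits, with no side effects and no
/// load behavior.
pub fn rematerializable_const(func: &Function, inst: Inst) -> Option<u64> {
    if has_side_effect_or_load(func, inst) {
        return None;
    }
    is_constant_64bit(func, inst)
}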

View File

@@ -504,7 +504,7 @@ impl AArch64ABIBody {
rn: stack_reg(),
rm: stack_limit,
});
insts.push(Inst::CondBrLowered {
insts.push(Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(8),
// Here `Hs` == "higher or same" when interpreting the two
// operands as unsigned integers.

View File

@@ -3,14 +3,14 @@
// Some variants are never constructed, but we still want them as options in the future.
#![allow(dead_code)]
use crate::binemit::CodeOffset;
use crate::ir::Type;
use crate::isa::aarch64::inst::*;
use crate::isa::aarch64::lower::ty_bits;
use crate::machinst::MachLabel;
use regalloc::{RealRegUniverse, Reg, Writable};
use core::convert::{Into, TryFrom};
use core::convert::Into;
use std::string::String;
/// A shift operator for a register or immediate.
@@ -303,78 +303,44 @@ impl CondBrKind {
/// A branch target. Either unresolved (basic-block index) or resolved (offset
/// from end of current instruction).
#[derive(Clone, Copy, Debug)]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum BranchTarget {
/// An unresolved reference to a BlockIndex, as passed into
/// An unresolved reference to a Label, as passed into
/// `lower_branch_group()`.
Block(BlockIndex),
/// A resolved reference to another instruction, after
/// `Inst::with_block_offsets()`.
Label(MachLabel),
/// A fixed PC offset.
ResolvedOffset(isize),
}
impl BranchTarget {
/// Lower the branch target given offsets of each block.
pub fn lower(&mut self, targets: &[CodeOffset], my_offset: CodeOffset) {
/// Return the target's label, if it is a label-based target.
pub fn as_label(self) -> Option<MachLabel> {
match self {
&mut BranchTarget::Block(bix) => {
let bix = usize::try_from(bix).unwrap();
assert!(bix < targets.len());
let block_offset_in_func = targets[bix];
let branch_offset = (block_offset_in_func as isize) - (my_offset as isize);
*self = BranchTarget::ResolvedOffset(branch_offset);
}
&mut BranchTarget::ResolvedOffset(..) => {}
}
}
/// Get the block index.
pub fn as_block_index(&self) -> Option<BlockIndex> {
match self {
&BranchTarget::Block(bix) => Some(bix),
BranchTarget::Label(l) => Some(l),
_ => None,
}
}
/// Get the offset as 4-byte words. Returns `0` if not
/// yet resolved (in that case, we're only computing
/// size and the offset doesn't matter).
pub fn as_offset_words(&self) -> isize {
match self {
&BranchTarget::ResolvedOffset(off) => off >> 2,
/// Return the target's offset, if specified, or zero if label-based.
pub fn as_offset19_or_zero(self) -> u32 {
let off = match self {
BranchTarget::ResolvedOffset(off) => off >> 2,
_ => 0,
}
};
assert!(off <= 0x3ffff);
assert!(off >= -0x40000);
(off as u32) & 0x7ffff
}
/// Get the offset as a 26-bit offset suitable for a 26-bit jump, or `None` if overflow.
pub fn as_off26(&self) -> Option<u32> {
let off = self.as_offset_words();
if (off < (1 << 25)) && (off >= -(1 << 25)) {
Some((off as u32) & ((1 << 26) - 1))
} else {
None
}
}
/// Get the offset as a 19-bit offset, or `None` if overflow.
pub fn as_off19(&self) -> Option<u32> {
let off = self.as_offset_words();
if (off < (1 << 18)) && (off >= -(1 << 18)) {
Some((off as u32) & ((1 << 19) - 1))
} else {
None
}
}
/// Map the block index given a transform map.
pub fn map(&mut self, block_index_map: &[BlockIndex]) {
match self {
&mut BranchTarget::Block(ref mut bix) => {
let n = block_index_map[usize::try_from(*bix).unwrap()];
*bix = n;
}
&mut BranchTarget::ResolvedOffset(_) => {}
}
/// Return the target's offset, if specified, or zero if label-based.
pub fn as_offset26_or_zero(self) -> u32 {
let off = match self {
BranchTarget::ResolvedOffset(off) => off >> 2,
_ => 0,
};
assert!(off <= 0x1ffffff);
assert!(off >= -0x2000000);
(off as u32) & 0x3ffffff
}
}
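For reference, a quick sketch of the values these accessors produce, checked against the code above (the test name is illustrative):

#[test]
fn branch_target_offset_encoding() {
    let fwd = BranchTarget::ResolvedOffset(64);
    assert_eq!(fwd.as_offset19_or_zero(), 16); // 64 bytes => 16 four-byte words
    assert_eq!(fwd.as_offset26_or_zero(), 16);

    let back = BranchTarget::ResolvedOffset(-8);
    // Negative word offsets are truncated, two's-complement, to the field width.
    assert_eq!(back.as_offset19_or_zero(), 0x7fffe);
    assert_eq!(back.as_offset26_or_zero(), 0x3fffffe);

    // Label-based targets encode as zero here; the MachBuffer patches in the
    // real offset later via label fixups.
}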
@@ -507,7 +473,7 @@ impl ShowWithRRU for Cond {
impl ShowWithRRU for BranchTarget {
fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
match self {
&BranchTarget::Block(block) => format!("block{}", block),
&BranchTarget::Label(label) => format!("label{:?}", label.get()),
&BranchTarget::ResolvedOffset(off) => format!("{}", off),
}
}

View File

@@ -4,7 +4,7 @@ use crate::binemit::{CodeOffset, Reloc};
use crate::ir::constant::ConstantData;
use crate::ir::types::*;
use crate::ir::TrapCode;
use crate::isa::aarch64::{inst::regs::PINNED_REG, inst::*};
use crate::isa::aarch64::inst::*;
use regalloc::{Reg, RegClass, Writable};
@@ -149,6 +149,14 @@ fn enc_cbr(op_31_24: u32, off_18_0: u32, op_4: u32, cond: u32) -> u32 {
(op_31_24 << 24) | (off_18_0 << 5) | (op_4 << 4) | cond
}
fn enc_conditional_br(taken: BranchTarget, kind: CondBrKind) -> u32 {
match kind {
CondBrKind::Zero(reg) => enc_cmpbr(0b1_011010_0, taken.as_offset19_or_zero(), reg),
CondBrKind::NotZero(reg) => enc_cmpbr(0b1_011010_1, taken.as_offset19_or_zero(), reg),
CondBrKind::Cond(c) => enc_cbr(0b01010100, taken.as_offset19_or_zero(), 0b0, c.bits()),
}
}
const MOVE_WIDE_FIXED: u32 = 0x92800000;
#[repr(u32)]
@@ -340,10 +348,17 @@ pub struct EmitState {
virtual_sp_offset: i64,
}
impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
impl MachInstEmit for Inst {
type State = EmitState;
fn emit(&self, sink: &mut O, flags: &settings::Flags, state: &mut EmitState) {
fn emit(&self, sink: &mut MachBuffer<Inst>, flags: &settings::Flags, state: &mut EmitState) {
// N.B.: we *must* not exceed the "worst-case size" used to compute
// where to insert islands, except when islands are explicitly triggered
// (with an `EmitIsland`). We check this in debug builds. This is `mut`
// to allow disabling the check for `JTSequence`, which is always
// emitted following an `EmitIsland`.
let mut start_off = sink.cur_offset();
match self {
&Inst::AluRRR { alu_op, rd, rn, rm } => {
let top11 = match alu_op {
@@ -616,7 +631,7 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
ref mem,
srcloc,
} => {
let (mem_insts, mem) = mem_finalize(sink.cur_offset_from_start(), mem, state);
let (mem_insts, mem) = mem_finalize(sink.cur_offset(), mem, state);
for inst in mem_insts.into_iter() {
inst.emit(sink, flags, state);
@@ -759,7 +774,7 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
ref mem,
srcloc,
} => {
let (mem_insts, mem) = mem_finalize(sink.cur_offset_from_start(), mem, state);
let (mem_insts, mem) = mem_finalize(sink.cur_offset(), mem, state);
for inst in mem_insts.into_iter() {
inst.emit(sink, flags, state);
@@ -1147,10 +1162,18 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
panic!("Unsupported extend variant");
}
&Inst::Jump { ref dest } => {
// TODO: differentiate between as_off26() returning `None` for
// out-of-range vs. not-yet-finalized. The latter happens when we
// do early (fake) emission for size computation.
sink.put4(enc_jump26(0b000101, dest.as_off26().unwrap()));
let off = sink.cur_offset();
// Emit the jump itself.
sink.put4(enc_jump26(0b000101, dest.as_offset26_or_zero()));
// After the jump has been emitted, record that it uses a label
// (if it does) so that a fixup can occur later. This happens
// after we emit the bytes because the fixup might occur right
// away (so the bytes must actually exist now).
if let Some(l) = dest.as_label() {
sink.use_label_at_offset(off, l, LabelUse::Branch26);
let cur_off = sink.cur_offset();
sink.add_uncond_branch(off, cur_off, l);
}
}
&Inst::Ret => {
sink.put4(0xd65f03c0);
@@ -1178,51 +1201,35 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
sink.add_call_site(loc, opcode);
}
}
&Inst::CondBr { .. } => panic!("Unlowered CondBr during binemit!"),
&Inst::CondBrLowered { target, kind } => match kind {
// TODO: handle >2^19 case by emitting a compound sequence with
// an unconditional (26-bit) branch. We need branch-relaxation
// adjustment machinery to enable this (because we don't want to
// always emit the long form).
CondBrKind::Zero(reg) => {
sink.put4(enc_cmpbr(0b1_011010_0, target.as_off19().unwrap(), reg));
}
CondBrKind::NotZero(reg) => {
sink.put4(enc_cmpbr(0b1_011010_1, target.as_off19().unwrap(), reg));
}
CondBrKind::Cond(c) => {
sink.put4(enc_cbr(
0b01010100,
target.as_off19().unwrap_or(0),
0b0,
c.bits(),
));
}
},
&Inst::CondBrLoweredCompound {
&Inst::CondBr {
taken,
not_taken,
kind,
} => {
// Conditional part first.
match kind {
CondBrKind::Zero(reg) => {
sink.put4(enc_cmpbr(0b1_011010_0, taken.as_off19().unwrap(), reg));
}
CondBrKind::NotZero(reg) => {
sink.put4(enc_cmpbr(0b1_011010_1, taken.as_off19().unwrap(), reg));
}
CondBrKind::Cond(c) => {
sink.put4(enc_cbr(
0b01010100,
taken.as_off19().unwrap_or(0),
0b0,
c.bits(),
));
}
let cond_off = sink.cur_offset();
sink.put4(enc_conditional_br(taken, kind));
if let Some(l) = taken.as_label() {
sink.use_label_at_offset(cond_off, l, LabelUse::Branch19);
let cur_off = sink.cur_offset();
let inverted = enc_conditional_br(taken, kind.invert()).to_le_bytes();
sink.add_cond_branch(cond_off, cur_off, l, &inverted[..]);
}
// Unconditional part.
sink.put4(enc_jump26(0b000101, not_taken.as_off26().unwrap_or(0)));
let uncond_off = sink.cur_offset();
sink.put4(enc_jump26(0b000101, not_taken.as_offset26_or_zero()));
if let Some(l) = not_taken.as_label() {
sink.use_label_at_offset(uncond_off, l, LabelUse::Branch26);
let cur_off = sink.cur_offset();
sink.add_uncond_branch(uncond_off, cur_off, l);
}
}
&Inst::OneWayCondBr { target, kind } => {
let off = sink.cur_offset();
sink.put4(enc_conditional_br(target, kind));
if let Some(l) = target.as_label() {
sink.use_label_at_offset(off, l, LabelUse::Branch19);
}
}
&Inst::IndirectBr { rn, .. } => {
sink.put4(enc_br(rn));
@@ -1239,8 +1246,7 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
sink.add_trap(srcloc, code);
sink.put4(0xd4a00000);
}
&Inst::Adr { rd, ref label } => {
let off = memlabel_finalize(sink.cur_offset_from_start(), label);
&Inst::Adr { rd, off } => {
assert!(off > -(1 << 20));
assert!(off < (1 << 20));
sink.put4(enc_adr(off, rd));
@@ -1261,19 +1267,13 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
// This sequence is *one* instruction in the vcode, and is expanded only here at
// emission time, because we cannot allow the regalloc to insert spills/reloads in
// the middle; we depend on hardcoded PC-rel addressing below.
//
// N.B.: if PC-rel addressing on ADR below is changed, also update
// `Inst::with_block_offsets()` in aarch64/inst/mod.rs.
// Save index in a tmp (the live range of ridx only goes to start of this
// sequence; rtmp1 or rtmp2 may overwrite it).
let inst = Inst::gen_move(rtmp2, ridx, I64);
inst.emit(sink, flags, state);
// Load address of jump table
let inst = Inst::Adr {
rd: rtmp1,
label: MemLabel::PCRel(16),
};
let inst = Inst::Adr { rd: rtmp1, off: 16 };
inst.emit(sink, flags, state);
// Load value out of jump table
let inst = Inst::SLoad32 {
@@ -1303,13 +1303,21 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
};
inst.emit(sink, flags, state);
// Emit jump table (table of 32-bit offsets).
for target in targets.iter() {
let off = target.as_offset_words() * 4;
let off = i32::try_from(off).unwrap();
// cast i32 to u32 (two's-complement)
let off = off as u32;
sink.put4(off);
let jt_off = sink.cur_offset();
for &target in targets.iter() {
let word_off = sink.cur_offset();
let off_into_table = word_off - jt_off;
sink.put4(off_into_table);
sink.use_label_at_offset(
word_off,
target.as_label().unwrap(),
LabelUse::PCRel32,
);
}
// Lowering produces an EmitIsland before using a JTSequence, so we can safely
// disable the worst-case-size check in this case.
start_off = sink.cur_offset();
}
&Inst::LoadConst64 { rd, const_data } => {
let inst = Inst::ULoad64 {
@@ -1348,7 +1356,7 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
}
}
&Inst::LoadAddr { rd, ref mem } => {
let (mem_insts, mem) = mem_finalize(sink.cur_offset_from_start(), mem, state);
let (mem_insts, mem) = mem_finalize(sink.cur_offset(), mem, state);
for inst in mem_insts.into_iter() {
inst.emit(sink, flags, state);
}
@@ -1401,20 +1409,6 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
add.emit(sink, flags, state);
}
}
&Inst::GetPinnedReg { rd } => {
let inst = Inst::Mov {
rd,
rm: xreg(PINNED_REG),
};
inst.emit(sink, flags, state);
}
&Inst::SetPinnedReg { rm } => {
let inst = Inst::Mov {
rd: Writable::from_reg(xreg(PINNED_REG)),
rm,
};
inst.emit(sink, flags, state);
}
&Inst::VirtualSPOffsetAdj { offset } => {
debug!(
"virtual sp offset adjusted by {} -> {}",
@@ -1423,6 +1417,20 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
);
state.virtual_sp_offset += offset;
}
&Inst::EmitIsland { needed_space } => {
if sink.island_needed(needed_space + 4) {
let jump_around_label = sink.get_label();
let jmp = Inst::Jump {
dest: BranchTarget::Label(jump_around_label),
};
jmp.emit(sink, flags, state);
sink.emit_island();
sink.bind_label(jump_around_label);
}
}
}
let end_off = sink.cur_offset();
debug_assert!((end_off - start_off) <= Inst::worst_case_size());
}
}
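A hedged sketch of the emission flow this implementation assumes, using only APIs that appear elsewhere in this commit (MachBuffer, get_label/bind_label, finish, and the test code sink); the particular instruction sequence is illustrative:

#[cfg(test)]
fn emit_with_label_fixup_sketch() {
    let flags = settings::Flags::new(settings::builder());
    let mut buffer = MachBuffer::new();
    let mut state = EmitState::default();

    // Reserve a label, emit a conditional branch to it, and bind it after some
    // code. Emission records a `LabelUse::Branch19` fixup at the branch's
    // offset; binding (or `finish()`) resolves it, emitting a veneer first if
    // the target were out of range.
    let skip = buffer.get_label();
    Inst::OneWayCondBr {
        target: BranchTarget::Label(skip),
        kind: CondBrKind::Zero(xreg(8)),
    }
    .emit(&mut buffer, &flags, &mut state);
    Inst::Brk.emit(&mut buffer, &flags, &mut state);
    buffer.bind_label(skip);
    Inst::Ret.emit(&mut buffer, &flags, &mut state);

    // Finalize and write out: cbz x8 / brk #0 / ret (three 4-byte words).
    let buffer = buffer.finish();
    let mut sink = test_utils::TestCodeSink::new();
    buffer.emit(&mut sink);
}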

View File

@@ -1956,7 +1956,7 @@ fn test_aarch64_binemit() {
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Zero(xreg(8)),
},
@@ -1964,7 +1964,7 @@ fn test_aarch64_binemit() {
"cbz x8, 64",
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::NotZero(xreg(8)),
},
@@ -1972,7 +1972,7 @@ fn test_aarch64_binemit() {
"cbnz x8, 64",
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Cond(Cond::Eq),
},
@@ -1980,7 +1980,7 @@ fn test_aarch64_binemit() {
"b.eq 64",
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Cond(Cond::Ne),
},
@@ -1989,7 +1989,7 @@ fn test_aarch64_binemit() {
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Cond(Cond::Hs),
},
@@ -1997,7 +1997,7 @@ fn test_aarch64_binemit() {
"b.hs 64",
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Cond(Cond::Lo),
},
@@ -2005,7 +2005,7 @@ fn test_aarch64_binemit() {
"b.lo 64",
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Cond(Cond::Mi),
},
@@ -2013,7 +2013,7 @@ fn test_aarch64_binemit() {
"b.mi 64",
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Cond(Cond::Pl),
},
@@ -2021,7 +2021,7 @@ fn test_aarch64_binemit() {
"b.pl 64",
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Cond(Cond::Vs),
},
@@ -2029,7 +2029,7 @@ fn test_aarch64_binemit() {
"b.vs 64",
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Cond(Cond::Vc),
},
@@ -2037,7 +2037,7 @@ fn test_aarch64_binemit() {
"b.vc 64",
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Cond(Cond::Hi),
},
@@ -2045,7 +2045,7 @@ fn test_aarch64_binemit() {
"b.hi 64",
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Cond(Cond::Ls),
},
@@ -2053,7 +2053,7 @@ fn test_aarch64_binemit() {
"b.ls 64",
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Cond(Cond::Ge),
},
@@ -2061,7 +2061,7 @@ fn test_aarch64_binemit() {
"b.ge 64",
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Cond(Cond::Lt),
},
@@ -2069,7 +2069,7 @@ fn test_aarch64_binemit() {
"b.lt 64",
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Cond(Cond::Gt),
},
@@ -2077,7 +2077,7 @@ fn test_aarch64_binemit() {
"b.gt 64",
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Cond(Cond::Le),
},
@@ -2085,7 +2085,7 @@ fn test_aarch64_binemit() {
"b.le 64",
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Cond(Cond::Al),
},
@@ -2093,7 +2093,7 @@ fn test_aarch64_binemit() {
"b.al 64",
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Cond(Cond::Nv),
},
@@ -2102,7 +2102,7 @@ fn test_aarch64_binemit() {
));
insns.push((
Inst::CondBrLoweredCompound {
Inst::CondBr {
taken: BranchTarget::ResolvedOffset(64),
not_taken: BranchTarget::ResolvedOffset(128),
kind: CondBrKind::Cond(Cond::Le),
@@ -2138,7 +2138,7 @@ fn test_aarch64_binemit() {
insns.push((
Inst::IndirectBr {
rn: xreg(3),
targets: vec![1, 2, 3],
targets: vec![],
},
"60001FD6",
"br x3",
@@ -2149,7 +2149,7 @@ fn test_aarch64_binemit() {
insns.push((
Inst::Adr {
rd: writable_xreg(15),
label: MemLabel::PCRel((1 << 20) - 4),
off: (1 << 20) - 4,
},
"EFFF7F10",
"adr x15, pc+1048572",
@@ -2792,19 +2792,11 @@ fn test_aarch64_binemit() {
let actual_printing = insn.show_rru(Some(&rru));
assert_eq!(expected_printing, actual_printing);
// Check the encoding is as expected.
let text_size = {
let mut code_sec = MachSectionSize::new(0);
insn.emit(&mut code_sec, &flags, &mut Default::default());
code_sec.size()
};
let mut sink = test_utils::TestCodeSink::new();
let mut sections = MachSections::new();
let code_idx = sections.add_section(0, text_size);
let code_sec = sections.get_section(code_idx);
insn.emit(code_sec, &flags, &mut Default::default());
sections.emit(&mut sink);
let mut buffer = MachBuffer::new();
insn.emit(&mut buffer, &flags, &mut Default::default());
let buffer = buffer.finish();
buffer.emit(&mut sink);
let actual_encoding = &sink.stringify();
assert_eq!(expected_encoding, actual_encoding);
}

View File

@@ -645,35 +645,37 @@ pub enum Inst {
dest: BranchTarget,
},
/// A conditional branch.
/// A conditional branch. Contains two targets; at emission time, both are emitted, but
/// the MachBuffer knows to truncate the trailing unconditional branch if it targets the
/// fallthrough block. We optimize the choice of taken/not_taken (inverting the branch
/// polarity as needed) based on the known fallthrough block at lowering time.
CondBr {
taken: BranchTarget,
not_taken: BranchTarget,
kind: CondBrKind,
},
/// Lowered conditional branch: contains the original branch kind (or the
/// inverse), but only one BranchTarget is retained. The other is
/// implicitly the next instruction, given the final basic-block layout.
CondBrLowered {
/// A one-way conditional branch, invisible to the CFG processing; used *only* as part of
/// straight-line sequences in code to be emitted.
///
/// In more detail:
/// - This branch is lowered to a branch at the machine-code level, but does not end a basic
/// block, and does not create edges in the CFG seen by regalloc.
/// - Thus, it is *only* valid to use as part of a single-in, single-out sequence that is
/// lowered from a single CLIF instruction. For example, certain arithmetic operations may
/// use these branches to handle certain conditions, such as overflows, traps, etc.
///
/// See, e.g., the lowering of `trapif` (conditional trap) for an example.
OneWayCondBr {
target: BranchTarget,
kind: CondBrKind,
},
/// As for `CondBrLowered`, but represents a condbr/uncond-br sequence (two
/// actual machine instructions). Needed when the final block layout implies
/// that neither arm of a conditional branch targets the fallthrough block.
CondBrLoweredCompound {
taken: BranchTarget,
not_taken: BranchTarget,
kind: CondBrKind,
},
/// An indirect branch through a register, augmented with set of all
/// possible successors.
IndirectBr {
rn: Reg,
targets: Vec<BlockIndex>,
targets: Vec<MachLabel>,
},
/// A "break" instruction, used for e.g. traps and debug breakpoints.
@@ -685,11 +687,14 @@ pub enum Inst {
trap_info: (SourceLoc, TrapCode),
},
/// Load the address (using a PC-relative offset) of a MemLabel, using the
/// `ADR` instruction.
/// Compute the address (using a PC-relative offset) of a memory location, using the `ADR`
/// instruction. Note that we take a simple offset, not a `MemLabel`, here, because `Adr` is,
/// for now, only used in fixed lowering sequences with hardcoded offsets. In the future we may
/// need full `MemLabel` support.
Adr {
rd: Writable<Reg>,
label: MemLabel,
/// Offset in range -2^20 .. 2^20.
off: i32,
},
/// Raw 32-bit word, used for inline constants and jump-table entries.
@@ -706,7 +711,7 @@ pub enum Inst {
/// for rationale).
JTSequence {
targets: Box<[BranchTarget]>,
targets_for_term: Box<[BlockIndex]>, // needed for MachTerminator.
targets_for_term: Box<[MachLabel]>, // needed for MachTerminator.
ridx: Reg,
rtmp1: Writable<Reg>,
rtmp2: Writable<Reg>,
@@ -732,21 +737,36 @@ pub enum Inst {
mem: MemArg,
},
/// Sets the value of the pinned register to the given register target.
GetPinnedReg {
rd: Writable<Reg>,
},
/// Writes the value of the given source register to the pinned register.
SetPinnedReg {
rm: Reg,
},
/// Marker, no-op in generated code: SP "virtual offset" is adjusted. This
/// controls how MemArg::NominalSPOffset args are lowered.
VirtualSPOffsetAdj {
offset: i64,
},
/// Meta-insn, no-op in generated code: emit constant/branch veneer island
/// at this point (with a guard jump around it) if less than the needed
/// space is available before the next branch deadline. See the `MachBuffer`
/// implementation in `machinst/buffer.rs` for the overall algorithm. In
/// brief, we retain a set of "pending/unresolved label references" from
/// branches as we scan forward through instructions to emit machine code;
/// if we notice we're about to go out of range on an unresolved reference,
/// we stop, emit a bunch of "veneers" (branches in a form that has a longer
/// range, e.g. a 26-bit-offset unconditional jump), and point the original
/// label references to those. This is an "island" because it comes in the
/// middle of the code.
///
/// This meta-instruction is a necessary part of the logic that determines
/// where to place islands. Ordinarily, we want to place them between basic
/// blocks, so we compute the worst-case size of each block, and emit the
/// island before starting a block if we would exceed a deadline before the
/// end of the block. However, some sequences (such as an inline jumptable)
/// are variable-length and not accounted for by this logic; so these
/// lowered sequences include an `EmitIsland` to trigger island generation
/// where necessary.
EmitIsland {
/// The needed space before the next deadline.
needed_space: CodeOffset,
},
}
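As the `EmitIsland` documentation notes, variable-length lowered sequences request their own island. A sketch of how a lowering might pair `EmitIsland` with a `JTSequence`; `ctx`, the registers, and the targets are assumed bindings from a surrounding lowering, and the size bound is a hypothetical stand-in for the backend's actual formula:

// Ask for an island *before* the variable-length jump-table sequence, so any
// pending veneers are emitted around the table rather than inside it.
// `jt_size` is a hypothetical worst-case bound: the fixed instructions of the
// sequence plus one 32-bit word per table entry.
let jt_size = (32 + 4 * jt_targets.len()) as CodeOffset;
ctx.emit(Inst::EmitIsland { needed_space: jt_size });
ctx.emit(Inst::JTSequence {
    ridx,
    rtmp1,
    rtmp2,
    targets: jt_targets,
    targets_for_term: jt_labels,
});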
fn count_zero_half_words(mut value: u64) -> usize {
@@ -1111,9 +1131,7 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
collector.add_defs(&*defs);
collector.add_use(rn);
}
&Inst::CondBr { ref kind, .. }
| &Inst::CondBrLowered { ref kind, .. }
| &Inst::CondBrLoweredCompound { ref kind, .. } => match kind {
&Inst::CondBr { ref kind, .. } | &Inst::OneWayCondBr { ref kind, .. } => match kind {
CondBrKind::Zero(rt) | CondBrKind::NotZero(rt) => {
collector.add_use(*rt);
}
@@ -1142,13 +1160,8 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
&Inst::LoadAddr { rd, mem: _ } => {
collector.add_def(rd);
}
&Inst::GetPinnedReg { rd } => {
collector.add_def(rd);
}
&Inst::SetPinnedReg { rm } => {
collector.add_use(rm);
}
&Inst::VirtualSPOffsetAdj { .. } => {}
&Inst::EmitIsland { .. } => {}
}
}
@@ -1676,13 +1689,7 @@ fn aarch64_map_regs(inst: &mut Inst, mapper: &RegUsageMapper) {
*defs = Box::new(new_defs);
map_use(mapper, rn);
}
&mut Inst::CondBr { ref mut kind, .. } => {
map_br(mapper, kind);
}
&mut Inst::CondBrLowered { ref mut kind, .. } => {
map_br(mapper, kind);
}
&mut Inst::CondBrLoweredCompound { ref mut kind, .. } => {
&mut Inst::CondBr { ref mut kind, .. } | &mut Inst::OneWayCondBr { ref mut kind, .. } => {
map_br(mapper, kind);
}
&mut Inst::IndirectBr { ref mut rn, .. } => {
@@ -1716,13 +1723,8 @@ fn aarch64_map_regs(inst: &mut Inst, mapper: &RegUsageMapper) {
map_def(mapper, rd);
map_mem(mapper, mem);
}
&mut Inst::GetPinnedReg { ref mut rd } => {
map_def(mapper, rd);
}
&mut Inst::SetPinnedReg { ref mut rm } => {
map_use(mapper, rm);
}
&mut Inst::VirtualSPOffsetAdj { .. } => {}
&mut Inst::EmitIsland { .. } => {}
}
}
@@ -1730,6 +1732,8 @@ fn aarch64_map_regs(inst: &mut Inst, mapper: &RegUsageMapper) {
// Instructions: misc functions and external interface
impl MachInst for Inst {
type LabelUse = LabelUse;
fn get_regs(&self, collector: &mut RegUsageCollector) {
aarch64_get_regs(self, collector)
}
@@ -1757,24 +1761,14 @@ impl MachInst for Inst {
fn is_term<'a>(&'a self) -> MachTerminator<'a> {
match self {
&Inst::Ret | &Inst::EpiloguePlaceholder => MachTerminator::Ret,
&Inst::Jump { dest } => MachTerminator::Uncond(dest.as_block_index().unwrap()),
&Inst::Jump { dest } => MachTerminator::Uncond(dest.as_label().unwrap()),
&Inst::CondBr {
taken, not_taken, ..
} => MachTerminator::Cond(
taken.as_block_index().unwrap(),
not_taken.as_block_index().unwrap(),
),
&Inst::CondBrLowered { .. } => {
// When this is used prior to branch finalization for branches
// within an open-coded sequence, i.e. with ResolvedOffsets,
// do not consider it a terminator. From the point of view of CFG analysis,
// it is part of a black-box single-in single-out region, hence is not
// denoted a terminator.
} => MachTerminator::Cond(taken.as_label().unwrap(), not_taken.as_label().unwrap()),
&Inst::OneWayCondBr { .. } => {
// Explicitly invisible to CFG processing.
MachTerminator::None
}
&Inst::CondBrLoweredCompound { .. } => {
panic!("is_term() called after lowering branches");
}
&Inst::IndirectBr { ref targets, .. } => MachTerminator::Indirect(&targets[..]),
&Inst::JTSequence {
ref targets_for_term,
@@ -1789,6 +1783,35 @@ impl MachInst for Inst {
Inst::mov(to_reg, from_reg)
}
fn gen_constant(to_reg: Writable<Reg>, value: u64, ty: Type) -> SmallVec<[Inst; 4]> {
if ty == F64 {
let mut ret = SmallVec::new();
ret.push(Inst::load_fp_constant64(to_reg, f64::from_bits(value)));
ret
} else if ty == F32 {
let mut ret = SmallVec::new();
ret.push(Inst::load_fp_constant32(
to_reg,
f32::from_bits(value as u32),
));
ret
} else {
// Must be an integer type.
debug_assert!(
ty == B1
|| ty == I8
|| ty == B8
|| ty == I16
|| ty == B16
|| ty == I32
|| ty == B32
|| ty == I64
|| ty == B64
);
Inst::load_constant(to_reg, value)
}
}
fn gen_zero_len_nop() -> Inst {
Inst::Nop0
}
@@ -1815,101 +1838,25 @@ impl MachInst for Inst {
}
}
fn gen_jump(blockindex: BlockIndex) -> Inst {
fn gen_jump(target: MachLabel) -> Inst {
Inst::Jump {
dest: BranchTarget::Block(blockindex),
dest: BranchTarget::Label(target),
}
}
fn with_block_rewrites(&mut self, block_target_map: &[BlockIndex]) {
match self {
&mut Inst::Jump { ref mut dest } => {
dest.map(block_target_map);
}
&mut Inst::CondBr {
ref mut taken,
ref mut not_taken,
..
} => {
taken.map(block_target_map);
not_taken.map(block_target_map);
}
&mut Inst::CondBrLowered { .. } => {
// See note in `is_term()`: this is used in open-coded sequences
// within blocks and should be left alone.
}
&mut Inst::CondBrLoweredCompound { .. } => {
panic!("with_block_rewrites called after branch lowering!");
}
_ => {}
}
fn reg_universe(flags: &settings::Flags) -> RealRegUniverse {
create_reg_universe(flags)
}
fn with_fallthrough_block(&mut self, fallthrough: Option<BlockIndex>) {
match self {
&mut Inst::CondBr {
taken,
not_taken,
kind,
} => {
if taken.as_block_index() == fallthrough
&& not_taken.as_block_index() == fallthrough
{
*self = Inst::Nop0;
} else if taken.as_block_index() == fallthrough {
*self = Inst::CondBrLowered {
target: not_taken,
kind: kind.invert(),
};
} else if not_taken.as_block_index() == fallthrough {
*self = Inst::CondBrLowered {
target: taken,
kind,
};
} else {
// We need a compound sequence (condbr / uncond-br).
*self = Inst::CondBrLoweredCompound {
taken,
not_taken,
kind,
};
}
}
&mut Inst::Jump { dest } => {
if dest.as_block_index() == fallthrough {
*self = Inst::Nop0;
}
}
_ => {}
}
}
fn with_block_offsets(&mut self, my_offset: CodeOffset, targets: &[CodeOffset]) {
match self {
&mut Inst::CondBrLowered { ref mut target, .. } => {
target.lower(targets, my_offset);
}
&mut Inst::CondBrLoweredCompound {
ref mut taken,
ref mut not_taken,
..
} => {
taken.lower(targets, my_offset);
not_taken.lower(targets, my_offset + 4);
}
&mut Inst::Jump { ref mut dest } => {
dest.lower(targets, my_offset);
}
&mut Inst::JTSequence {
targets: ref mut t, ..
} => {
for target in t.iter_mut() {
// offset+20: jumptable is 20 bytes into compound sequence.
target.lower(targets, my_offset + 20);
}
}
_ => {}
}
fn worst_case_size() -> CodeOffset {
// The maximum size, in bytes, of any `Inst`'s emitted code. We have at least one case of
// an 8-instruction sequence (saturating int-to-float conversions) with three embedded
// 64-bit f64 constants.
//
// Note that inline jump-tables handle island/pool insertion separately, so we do not need
// to account for them here (otherwise the worst case would be 2^31 * 4, clearly not
// feasible for other reasons).
44
}
}
@@ -2550,12 +2497,12 @@ impl ShowWithRRU for Inst {
}
}
}
&Inst::CondBrLowered {
&Inst::OneWayCondBr {
ref target,
ref kind,
} => {
let target = target.show_rru(mb_rru);
match &kind {
match kind {
&CondBrKind::Zero(reg) => {
let reg = reg.show_rru(mb_rru);
format!("cbz {}, {}", reg, target)
@@ -2570,30 +2517,15 @@ impl ShowWithRRU for Inst {
}
}
}
&Inst::CondBrLoweredCompound {
ref taken,
ref not_taken,
ref kind,
} => {
let first = Inst::CondBrLowered {
target: taken.clone(),
kind: kind.clone(),
};
let second = Inst::Jump {
dest: not_taken.clone(),
};
first.show_rru(mb_rru) + " ; " + &second.show_rru(mb_rru)
}
&Inst::IndirectBr { rn, .. } => {
let rn = rn.show_rru(mb_rru);
format!("br {}", rn)
}
&Inst::Brk => "brk #0".to_string(),
&Inst::Udf { .. } => "udf".to_string(),
&Inst::Adr { rd, ref label } => {
&Inst::Adr { rd, off } => {
let rd = rd.show_rru(mb_rru);
let label = label.show_rru(mb_rru);
format!("adr {}, {}", rd, label)
format!("adr {}, pc+{}", rd, off)
}
&Inst::Word4 { data } => format!("data.i32 {}", data),
&Inst::Word8 { data } => format!("data.i64 {}", data),
@@ -2683,15 +2615,135 @@ impl ShowWithRRU for Inst {
}
ret
}
&Inst::GetPinnedReg { rd } => {
let rd = rd.show_rru(mb_rru);
format!("get_pinned_reg {}", rd)
}
&Inst::SetPinnedReg { rm } => {
let rm = rm.show_rru(mb_rru);
format!("set_pinned_reg {}", rm)
}
&Inst::VirtualSPOffsetAdj { offset } => format!("virtual_sp_offset_adjust {}", offset),
&Inst::EmitIsland { needed_space } => format!("emit_island {}", needed_space),
}
}
}
//=============================================================================
// Label fixups and jump veneers.
/// Different forms of label references for different instruction formats.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum LabelUse {
/// 19-bit branch offset (conditional branches). PC-rel, offset is imm << 2. Immediate is 19
/// signed bits, in bits 23:5. Used by cbz, cbnz, b.cond.
Branch19,
/// 26-bit branch offset (unconditional branches). PC-rel, offset is imm << 2. Immediate is 26
/// signed bits, in bits 25:0. Used by b, bl.
Branch26,
/// 19-bit offset for LDR (load literal). PC-rel, offset is imm << 2. Immediate is 19 signed bits,
/// in bits 23:5.
Ldr19,
/// 21-bit offset for ADR (get address of label). PC-rel, offset is not shifted. Immediate is
/// 21 signed bits, with high 19 bits in bits 23:5 and low 2 bits in bits 30:29.
Adr21,
/// 32-bit PC relative constant offset (from address of constant itself),
/// signed. Used in jump tables.
PCRel32,
}
impl MachInstLabelUse for LabelUse {
/// Alignment for veneer code. Every AArch64 instruction must be 4-byte-aligned.
const ALIGN: CodeOffset = 4;
/// Maximum PC-relative range (positive), inclusive.
fn max_pos_range(self) -> CodeOffset {
match self {
// 19-bit immediate, left-shifted by 2, for 21 bits of total range. Signed, so +2^20
// from zero. Likewise for two other shifted cases below.
LabelUse::Branch19 => (1 << 20) - 1,
LabelUse::Branch26 => (1 << 27) - 1,
LabelUse::Ldr19 => (1 << 20) - 1,
// Adr does not shift its immediate, so the 21-bit immediate gives 21 bits of total
// range.
LabelUse::Adr21 => (1 << 20) - 1,
LabelUse::PCRel32 => 0x7fffffff,
}
}
/// Maximum PC-relative range (negative).
fn max_neg_range(self) -> CodeOffset {
// All forms are two's-complement signed offsets, so the negative limit is one more than
// the positive limit.
self.max_pos_range() + 1
}
/// Size of window into code needed to do the patch.
fn patch_size(self) -> CodeOffset {
// Patch is on one instruction only for all of these label reference types.
4
}
/// Perform the patch.
fn patch(self, buffer: &mut [u8], use_offset: CodeOffset, label_offset: CodeOffset) {
let pc_rel = (label_offset as i64) - (use_offset as i64);
debug_assert!(pc_rel <= self.max_pos_range() as i64);
debug_assert!(pc_rel >= -(self.max_neg_range() as i64));
let pc_rel = pc_rel as u32;
let insn_word = u32::from_le_bytes([buffer[0], buffer[1], buffer[2], buffer[3]]);
let mask = match self {
LabelUse::Branch19 => 0x00ffffe0, // bits 23..5 inclusive
LabelUse::Branch26 => 0x03ffffff, // bits 25..0 inclusive
LabelUse::Ldr19 => 0x00ffffe0, // bits 23..5 inclusive
LabelUse::Adr21 => 0x60ffffe0, // bits 30..29, 23..5 inclusive
LabelUse::PCRel32 => 0xffffffff,
};
let pc_rel_shifted = match self {
LabelUse::Adr21 | LabelUse::PCRel32 => pc_rel,
_ => {
debug_assert!(pc_rel & 3 == 0);
pc_rel >> 2
}
};
let pc_rel_inserted = match self {
LabelUse::Branch19 | LabelUse::Ldr19 => (pc_rel_shifted & 0x7ffff) << 5,
LabelUse::Branch26 => pc_rel_shifted & 0x3ffffff,
LabelUse::Adr21 => (pc_rel_shifted & 0x7ffff) << 5 | (pc_rel_shifted & 0x180000) << 10,
LabelUse::PCRel32 => pc_rel_shifted,
};
let is_add = match self {
LabelUse::PCRel32 => true,
_ => false,
};
let insn_word = if is_add {
insn_word.wrapping_add(pc_rel_inserted)
} else {
(insn_word & !mask) | pc_rel_inserted
};
buffer[0..4].clone_from_slice(&u32::to_le_bytes(insn_word));
}
/// Is a veneer supported for this label reference type?
fn supports_veneer(self) -> bool {
match self {
LabelUse::Branch19 => true, // veneer is a Branch26
_ => false,
}
}
/// How large is the veneer, if supported?
fn veneer_size(self) -> CodeOffset {
4
}
/// Generate a veneer into the buffer, given that this veneer is at `veneer_offset`, and return
/// an offset and label-use for the veneer's use of the original label.
fn generate_veneer(
self,
buffer: &mut [u8],
veneer_offset: CodeOffset,
) -> (CodeOffset, LabelUse) {
match self {
LabelUse::Branch19 => {
// veneer is a Branch26 (unconditional branch). Just encode directly here -- don't
// bother with constructing an Inst.
let insn_word = 0b000101 << 26;
buffer[0..4].clone_from_slice(&u32::to_le_bytes(insn_word));
(veneer_offset, LabelUse::Branch26)
}
_ => panic!("Unsupported label-reference type for veneer generation!"),
}
}
}
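A worked example of this patching arithmetic, as a sketch against the methods above (offsets are illustrative; constants come from the code):

#[test]
fn branch26_patch_example() {
    // A `b` (LabelUse::Branch26) emitted at offset 0x40 with a zero offset,
    // patched once its label is bound at offset 0x100.
    let mut bytes = (0b000101u32 << 26).to_le_bytes();
    LabelUse::Branch26.patch(&mut bytes, 0x40, 0x100);
    // pc_rel = 0xc0 bytes = 0x30 words, inserted into bits 25:0.
    assert_eq!(u32::from_le_bytes(bytes), (0b000101u32 << 26) | 0x30);

    // Branch19 reaches only +/- 1 MiB; past that, the MachBuffer calls
    // `generate_veneer`, which rewrites the use as a Branch26 (~128 MiB reach).
    assert_eq!(LabelUse::Branch19.max_pos_range(), (1 << 20) - 1);
    assert!(LabelUse::Branch19.supports_veneer());
}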

View File

@@ -14,12 +14,14 @@ use crate::ir::Inst as IRInst;
use crate::ir::{InstructionData, Opcode, TrapCode, Type};
use crate::machinst::lower::*;
use crate::machinst::*;
use crate::CodegenResult;
use crate::isa::aarch64::inst::*;
use crate::isa::aarch64::AArch64Backend;
use super::lower_inst;
use log::debug;
use regalloc::{Reg, RegClass, Writable};
//============================================================================
@@ -104,18 +106,11 @@ pub(crate) enum ResultRegImmShift {
}
//============================================================================
// Instruction input and output "slots".
// Instruction input "slots".
//
// We use these types to refer to operand numbers, and result numbers, together
// with the associated instruction, in a type-safe way.
/// Identifier for a particular output of an instruction.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) struct InsnOutput {
pub(crate) insn: IRInst,
pub(crate) output: usize,
}
/// Identifier for a particular input of an instruction.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) struct InsnInput {
@@ -123,93 +118,28 @@ pub(crate) struct InsnInput {
pub(crate) input: usize,
}
/// Producer of a value: either a previous instruction's output, or a register that will be
/// codegen'd separately.
/// Identifier for a particular output of an instruction.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) enum InsnInputSource {
Output(InsnOutput),
Reg(Reg),
}
impl InsnInputSource {
fn as_output(self) -> Option<InsnOutput> {
match self {
InsnInputSource::Output(o) => Some(o),
_ => None,
}
}
}
fn get_input<C: LowerCtx<I = Inst>>(ctx: &mut C, output: InsnOutput, num: usize) -> InsnInput {
assert!(num <= ctx.num_inputs(output.insn));
InsnInput {
insn: output.insn,
input: num,
}
}
/// Convert an instruction input to a producing instruction's output if possible (in same BB), or a
/// register otherwise.
fn input_source<C: LowerCtx<I = Inst>>(ctx: &mut C, input: InsnInput) -> InsnInputSource {
if let Some((input_inst, result_num)) = ctx.input_inst(input.insn, input.input) {
let out = InsnOutput {
insn: input_inst,
output: result_num,
};
InsnInputSource::Output(out)
} else {
let reg = ctx.input(input.insn, input.input);
InsnInputSource::Reg(reg)
}
pub(crate) struct InsnOutput {
pub(crate) insn: IRInst,
pub(crate) output: usize,
}
//============================================================================
// Lowering: convert instruction outputs to result types.
// Lowering: convert instruction inputs to forms that we can use.
/// Lower an instruction output to a 64-bit constant, if possible.
pub(crate) fn output_to_const<C: LowerCtx<I = Inst>>(ctx: &mut C, out: InsnOutput) -> Option<u64> {
if out.output > 0 {
None
} else {
let inst_data = ctx.data(out.insn);
if inst_data.opcode() == Opcode::Null {
Some(0)
} else {
match inst_data {
&InstructionData::UnaryImm { opcode: _, imm } => {
// Only has Into for i64; we use u64 elsewhere, so we cast.
let imm: i64 = imm.into();
Some(imm as u64)
}
&InstructionData::UnaryBool { opcode: _, imm } => Some(u64::from(imm)),
&InstructionData::UnaryIeee32 { opcode: _, imm } => Some(u64::from(imm.bits())),
&InstructionData::UnaryIeee64 { opcode: _, imm } => Some(imm.bits()),
_ => None,
}
}
}
/// Lower an instruction input to a 64-bit constant, if possible.
pub(crate) fn input_to_const<C: LowerCtx<I = Inst>>(ctx: &mut C, input: InsnInput) -> Option<u64> {
let input = ctx.get_input(input.insn, input.input);
input.constant
}
pub(crate) fn output_to_const_f32<C: LowerCtx<I = Inst>>(
/// Lower an instruction input to a constant register-shift amount, if possible.
pub(crate) fn input_to_shiftimm<C: LowerCtx<I = Inst>>(
ctx: &mut C,
out: InsnOutput,
) -> Option<f32> {
output_to_const(ctx, out).map(|value| f32::from_bits(value as u32))
}
pub(crate) fn output_to_const_f64<C: LowerCtx<I = Inst>>(
ctx: &mut C,
out: InsnOutput,
) -> Option<f64> {
output_to_const(ctx, out).map(|value| f64::from_bits(value))
}
/// Lower an instruction output to a constant register-shift amount, if possible.
pub(crate) fn output_to_shiftimm<C: LowerCtx<I = Inst>>(
ctx: &mut C,
out: InsnOutput,
input: InsnInput,
) -> Option<ShiftOpShiftImm> {
output_to_const(ctx, out).and_then(ShiftOpShiftImm::maybe_from_shift)
input_to_const(ctx, input).and_then(ShiftOpShiftImm::maybe_from_shift)
}
/// How to handle narrow values loaded into registers; see note on `narrow_mode`
@@ -237,9 +167,9 @@ impl NarrowValueMode {
}
}
/// Lower an instruction output to a reg.
/// Allocate a register for an instruction output and return it.
pub(crate) fn output_to_reg<C: LowerCtx<I = Inst>>(ctx: &mut C, out: InsnOutput) -> Writable<Reg> {
ctx.output(out.insn, out.output)
ctx.get_output(out.insn, out.output)
}
/// Lower an instruction input to a reg.
@@ -252,13 +182,26 @@ pub(crate) fn input_to_reg<C: LowerCtx<I = Inst>>(
input: InsnInput,
narrow_mode: NarrowValueMode,
) -> Reg {
debug!("input_to_reg: input {:?}", input);
let ty = ctx.input_ty(input.insn, input.input);
let from_bits = ty_bits(ty) as u8;
let in_reg = ctx.input(input.insn, input.input);
let inputs = ctx.get_input(input.insn, input.input);
let in_reg = if let Some(c) = inputs.constant {
// Generate constants fresh at each use to minimize long-range register pressure.
let to_reg = ctx.alloc_tmp(Inst::rc_for_type(ty).unwrap(), ty);
for inst in Inst::gen_constant(to_reg, c, ty).into_iter() {
ctx.emit(inst);
}
to_reg.to_reg()
} else {
ctx.use_input_reg(inputs);
inputs.reg
};
match (narrow_mode, from_bits) {
(NarrowValueMode::None, _) => in_reg,
(NarrowValueMode::ZeroExtend32, n) if n < 32 => {
let tmp = ctx.tmp(RegClass::I64, I32);
let tmp = ctx.alloc_tmp(RegClass::I64, I32);
ctx.emit(Inst::Extend {
rd: tmp,
rn: in_reg,
@@ -269,7 +212,7 @@ pub(crate) fn input_to_reg<C: LowerCtx<I = Inst>>(
tmp.to_reg()
}
(NarrowValueMode::SignExtend32, n) if n < 32 => {
let tmp = ctx.tmp(RegClass::I64, I32);
let tmp = ctx.alloc_tmp(RegClass::I64, I32);
ctx.emit(Inst::Extend {
rd: tmp,
rn: in_reg,
@@ -282,18 +225,23 @@ pub(crate) fn input_to_reg<C: LowerCtx<I = Inst>>(
(NarrowValueMode::ZeroExtend32, 32) | (NarrowValueMode::SignExtend32, 32) => in_reg,
(NarrowValueMode::ZeroExtend64, n) if n < 64 => {
let tmp = ctx.tmp(RegClass::I64, I32);
ctx.emit(Inst::Extend {
rd: tmp,
rn: in_reg,
signed: false,
from_bits,
to_bits: 64,
});
tmp.to_reg()
if inputs.constant.is_some() {
// Constants are zero-extended to full 64-bit width on load already.
in_reg
} else {
let tmp = ctx.alloc_tmp(RegClass::I64, I32);
ctx.emit(Inst::Extend {
rd: tmp,
rn: in_reg,
signed: false,
from_bits,
to_bits: 64,
});
tmp.to_reg()
}
}
(NarrowValueMode::SignExtend64, n) if n < 64 => {
let tmp = ctx.tmp(RegClass::I64, I32);
let tmp = ctx.alloc_tmp(RegClass::I64, I32);
ctx.emit(Inst::Extend {
rd: tmp,
rn: in_reg,
@@ -313,8 +261,6 @@ pub(crate) fn input_to_reg<C: LowerCtx<I = Inst>>(
}
/// Lower an instruction input to a reg or reg/shift, or reg/extend operand.
/// This does not actually codegen the source instruction; it just uses the
/// vreg into which the source instruction will generate its value.
///
/// The `narrow_mode` flag indicates whether the consumer of this value needs
/// the high bits clear. For many operations, such as an add/sub/mul or any
@@ -330,23 +276,18 @@ fn input_to_rs<C: LowerCtx<I = Inst>>(
input: InsnInput,
narrow_mode: NarrowValueMode,
) -> ResultRS {
if let InsnInputSource::Output(out) = input_source(ctx, input) {
let insn = out.insn;
assert!(out.output <= ctx.num_outputs(insn));
let inputs = ctx.get_input(input.insn, input.input);
if let Some((insn, 0)) = inputs.inst {
let op = ctx.data(insn).opcode();
if op == Opcode::Ishl {
let shiftee = get_input(ctx, out, 0);
let shift_amt = get_input(ctx, out, 1);
let shiftee = InsnInput { insn, input: 0 };
let shift_amt = InsnInput { insn, input: 1 };
// Can we get the shift amount as an immediate?
if let Some(shift_amt_out) = input_source(ctx, shift_amt).as_output() {
if let Some(shiftimm) = output_to_shiftimm(ctx, shift_amt_out) {
let reg = input_to_reg(ctx, shiftee, narrow_mode);
ctx.merged(insn);
ctx.merged(shift_amt_out.insn);
return ResultRS::RegShift(reg, ShiftOpAndAmt::new(ShiftOp::LSL, shiftimm));
}
if let Some(shiftimm) = input_to_shiftimm(ctx, shift_amt) {
let reg = input_to_reg(ctx, shiftee, narrow_mode);
return ResultRS::RegShift(reg, ShiftOpAndAmt::new(ShiftOp::LSL, shiftimm));
}
}
}
@@ -364,11 +305,10 @@ fn input_to_rse<C: LowerCtx<I = Inst>>(
input: InsnInput,
narrow_mode: NarrowValueMode,
) -> ResultRSE {
if let InsnInputSource::Output(out) = input_source(ctx, input) {
let insn = out.insn;
assert!(out.output <= ctx.num_outputs(insn));
let inputs = ctx.get_input(input.insn, input.input);
if let Some((insn, 0)) = inputs.inst {
let op = ctx.data(insn).opcode();
let out_ty = ctx.output_ty(insn, out.output);
let out_ty = ctx.output_ty(insn, 0);
let out_bits = ty_bits(out_ty);
// If `out_ty` is smaller than 32 bits and we need to zero- or sign-extend,
@@ -378,7 +318,7 @@ fn input_to_rse<C: LowerCtx<I = Inst>>(
&& ((narrow_mode.is_32bit() && out_bits < 32)
|| (!narrow_mode.is_32bit() && out_bits < 64))
{
let reg = output_to_reg(ctx, out);
let reg = input_to_reg(ctx, InsnInput { insn, input: 0 }, NarrowValueMode::None);
let extendop = match (narrow_mode, out_bits) {
(NarrowValueMode::SignExtend32, 1) | (NarrowValueMode::SignExtend64, 1) => {
ExtendOp::SXTB
@@ -402,15 +342,14 @@ fn input_to_rse<C: LowerCtx<I = Inst>>(
(NarrowValueMode::ZeroExtend64, 32) => ExtendOp::UXTW,
_ => unreachable!(),
};
return ResultRSE::RegExtend(reg.to_reg(), extendop);
return ResultRSE::RegExtend(reg, extendop);
}
// Is this a zero-extend or sign-extend and can we handle that with a register-mode operator?
if op == Opcode::Uextend || op == Opcode::Sextend {
assert!(out_bits == 32 || out_bits == 64);
let sign_extend = op == Opcode::Sextend;
let extendee = get_input(ctx, out, 0);
let inner_ty = ctx.input_ty(extendee.insn, extendee.input);
let inner_ty = ctx.input_ty(insn, 0);
let inner_bits = ty_bits(inner_ty);
assert!(inner_bits < out_bits);
let extendop = match (sign_extend, inner_bits) {
@@ -424,8 +363,7 @@ fn input_to_rse<C: LowerCtx<I = Inst>>(
(false, 32) => ExtendOp::UXTW,
_ => unreachable!(),
};
let reg = input_to_reg(ctx, extendee, NarrowValueMode::None);
ctx.merged(insn);
let reg = input_to_reg(ctx, InsnInput { insn, input: 0 }, NarrowValueMode::None);
return ResultRSE::RegExtend(reg, extendop);
}
}
@@ -438,12 +376,9 @@ pub(crate) fn input_to_rse_imm12<C: LowerCtx<I = Inst>>(
input: InsnInput,
narrow_mode: NarrowValueMode,
) -> ResultRSEImm12 {
if let InsnInputSource::Output(out) = input_source(ctx, input) {
if let Some(imm_value) = output_to_const(ctx, out) {
if let Some(i) = Imm12::maybe_from_u64(imm_value) {
ctx.merged(out.insn);
return ResultRSEImm12::Imm12(i);
}
if let Some(imm_value) = input_to_const(ctx, input) {
if let Some(i) = Imm12::maybe_from_u64(imm_value) {
return ResultRSEImm12::Imm12(i);
}
}
@@ -455,14 +390,11 @@ pub(crate) fn input_to_rs_immlogic<C: LowerCtx<I = Inst>>(
input: InsnInput,
narrow_mode: NarrowValueMode,
) -> ResultRSImmLogic {
if let InsnInputSource::Output(out) = input_source(ctx, input) {
if let Some(imm_value) = output_to_const(ctx, out) {
let ty = ctx.output_ty(out.insn, out.output);
let ty = if ty_bits(ty) < 32 { I32 } else { ty };
if let Some(i) = ImmLogic::maybe_from_u64(imm_value, ty) {
ctx.merged(out.insn);
return ResultRSImmLogic::ImmLogic(i);
}
if let Some(imm_value) = input_to_const(ctx, input) {
let ty = ctx.input_ty(input.insn, input.input);
let ty = if ty_bits(ty) < 32 { I32 } else { ty };
if let Some(i) = ImmLogic::maybe_from_u64(imm_value, ty) {
return ResultRSImmLogic::ImmLogic(i);
}
}
@@ -473,12 +405,9 @@ pub(crate) fn input_to_reg_immshift<C: LowerCtx<I = Inst>>(
ctx: &mut C,
input: InsnInput,
) -> ResultRegImmShift {
if let InsnInputSource::Output(out) = input_source(ctx, input) {
if let Some(imm_value) = output_to_const(ctx, out) {
if let Some(immshift) = ImmShift::maybe_from_u64(imm_value) {
ctx.merged(out.insn);
return ResultRegImmShift::ImmShift(immshift);
}
if let Some(imm_value) = input_to_const(ctx, input) {
if let Some(immshift) = ImmShift::maybe_from_u64(imm_value) {
return ResultRegImmShift::ImmShift(immshift);
}
}
@@ -600,7 +529,7 @@ pub(crate) fn lower_address<C: LowerCtx<I = Inst>>(
}
// Otherwise, generate add instructions.
let addr = ctx.tmp(RegClass::I64, I64);
let addr = ctx.alloc_tmp(RegClass::I64, I64);
// Get the const into a reg.
lower_constant_u64(ctx, addr.clone(), offset as u64);
@@ -612,7 +541,7 @@ pub(crate) fn lower_address<C: LowerCtx<I = Inst>>(
// In an addition, the stack register is the zero register, so divert it to another
// register just before doing the actual add.
let reg = if reg == stack_reg() {
let tmp = ctx.tmp(RegClass::I64, I64);
let tmp = ctx.alloc_tmp(RegClass::I64, I64);
ctx.emit(Inst::Mov {
rd: tmp,
rm: stack_reg(),
@@ -823,24 +752,29 @@ pub(crate) fn inst_trapcode(data: &InstructionData) -> Option<TrapCode> {
}
}
/// Checks for an instance of `op` feeding the given input. Marks as merged (decrementing refcount) if so.
/// Checks for an instance of `op` feeding the given input.
pub(crate) fn maybe_input_insn<C: LowerCtx<I = Inst>>(
c: &mut C,
input: InsnInput,
op: Opcode,
) -> Option<IRInst> {
if let InsnInputSource::Output(out) = input_source(c, input) {
let data = c.data(out.insn);
let inputs = c.get_input(input.insn, input.input);
debug!(
"maybe_input_insn: input {:?} has options {:?}; looking for op {:?}",
input, inputs, op
);
if let Some((src_inst, _)) = inputs.inst {
let data = c.data(src_inst);
debug!(" -> input inst {:?}", data);
if data.opcode() == op {
c.merged(out.insn);
return Some(out.insn);
return Some(src_inst);
}
}
None
}
/// Checks for an instance of `op` feeding the given input, possibly via a conversion `conv` (e.g.,
/// Bint or a bitcast). Marks one or both as merged if so, as appropriate.
/// Bint or a bitcast).
///
/// FIXME cfallin 2020-03-30: this is really ugly. Factor out tree-matching stuff and make it
/// a bit more generic.
@@ -850,21 +784,19 @@ pub(crate) fn maybe_input_insn_via_conv<C: LowerCtx<I = Inst>>(
op: Opcode,
conv: Opcode,
) -> Option<IRInst> {
if let Some(ret) = maybe_input_insn(c, input, op) {
return Some(ret);
}
if let InsnInputSource::Output(out) = input_source(c, input) {
let data = c.data(out.insn);
let inputs = c.get_input(input.insn, input.input);
if let Some((src_inst, _)) = inputs.inst {
let data = c.data(src_inst);
if data.opcode() == op {
return Some(src_inst);
}
if data.opcode() == conv {
let conv_insn = out.insn;
let conv_input = InsnInput {
insn: conv_insn,
input: 0,
};
if let Some(inner) = maybe_input_insn(c, conv_input, op) {
c.merged(conv_insn);
return Some(inner);
let inputs = c.get_input(src_inst, 0);
if let Some((src_inst, _)) = inputs.inst {
let data = c.data(src_inst);
if data.opcode() == op {
return Some(src_inst);
}
}
}
}
@@ -876,6 +808,7 @@ pub(crate) fn lower_icmp_or_ifcmp_to_flags<C: LowerCtx<I = Inst>>(
insn: IRInst,
is_signed: bool,
) {
debug!("lower_icmp_or_ifcmp_to_flags: insn {}", insn);
let ty = ctx.input_ty(insn, 0);
let bits = ty_bits(ty);
let narrow_mode = match (bits <= 32, is_signed) {
@@ -897,6 +830,7 @@ pub(crate) fn lower_icmp_or_ifcmp_to_flags<C: LowerCtx<I = Inst>>(
let ty = ctx.input_ty(insn, 0);
let rn = input_to_reg(ctx, inputs[0], narrow_mode);
let rm = input_to_rse_imm12(ctx, inputs[1], narrow_mode);
debug!("lower_icmp_or_ifcmp_to_flags: rn = {:?} rm = {:?}", rn, rm);
let alu_op = choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64);
let rd = writable_zero_reg();
ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
@@ -934,17 +868,21 @@ pub(crate) fn lower_fcmp_or_ffcmp_to_flags<C: LowerCtx<I = Inst>>(ctx: &mut C, i
impl LowerBackend for AArch64Backend {
type MInst = Inst;
fn lower<C: LowerCtx<I = Inst>>(&self, ctx: &mut C, ir_inst: IRInst) {
lower_inst::lower_insn_to_regs(ctx, ir_inst);
fn lower<C: LowerCtx<I = Inst>>(&self, ctx: &mut C, ir_inst: IRInst) -> CodegenResult<()> {
lower_inst::lower_insn_to_regs(ctx, ir_inst)
}
fn lower_branch_group<C: LowerCtx<I = Inst>>(
&self,
ctx: &mut C,
branches: &[IRInst],
targets: &[BlockIndex],
fallthrough: Option<BlockIndex>,
) {
targets: &[MachLabel],
fallthrough: Option<MachLabel>,
) -> CodegenResult<()> {
lower_inst::lower_branch(ctx, branches, targets, fallthrough)
}
fn maybe_pinned_reg(&self) -> Option<Reg> {
Some(xreg(PINNED_REG))
}
}
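A minimal sketch of the constant-folding pattern these helpers now share, in the style of `input_to_shiftimm`; the helper name is hypothetical:

/// Hypothetical helper: fold a 12-bit arithmetic immediate directly from a
/// constant input, if the producing value is a known constant.
pub(crate) fn input_to_imm12<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    input: InsnInput,
) -> Option<Imm12> {
    input_to_const(ctx, input).and_then(Imm12::maybe_from_u64)
}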

View File

@@ -1,11 +1,13 @@
//! Lower a single Cranelift instruction into vcode.
use crate::binemit::CodeOffset;
use crate::ir::condcodes::FloatCC;
use crate::ir::types::*;
use crate::ir::Inst as IRInst;
use crate::ir::{InstructionData, Opcode, TrapCode};
use crate::machinst::lower::*;
use crate::machinst::*;
use crate::CodegenResult;
use crate::isa::aarch64::abi::*;
use crate::isa::aarch64::inst::*;
@@ -19,7 +21,10 @@ use smallvec::SmallVec;
use super::lower::*;
/// Actually codegen an instruction's results into registers.
pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRInst) {
pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
ctx: &mut C,
insn: IRInst,
) -> CodegenResult<()> {
let op = ctx.data(insn).opcode();
let inputs: SmallVec<[InsnInput; 4]> = (0..ctx.num_inputs(insn))
.map(|i| InsnInput { insn, input: i })
@@ -35,17 +40,17 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
match op {
Opcode::Iconst | Opcode::Bconst | Opcode::Null => {
let value = output_to_const(ctx, outputs[0]).unwrap();
let value = ctx.get_constant(insn).unwrap();
let rd = output_to_reg(ctx, outputs[0]);
lower_constant_u64(ctx, rd, value);
}
Opcode::F32const => {
let value = output_to_const_f32(ctx, outputs[0]).unwrap();
let value = f32::from_bits(ctx.get_constant(insn).unwrap() as u32);
let rd = output_to_reg(ctx, outputs[0]);
lower_constant_f32(ctx, rd, value);
}
Opcode::F64const => {
let value = output_to_const_f64(ctx, outputs[0]).unwrap();
let value = f64::from_bits(ctx.get_constant(insn).unwrap());
let rd = output_to_reg(ctx, outputs[0]);
lower_constant_f64(ctx, rd, value);
}
@@ -79,8 +84,8 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
} else {
VecALUOp::UQAddScalar
};
let va = ctx.tmp(RegClass::V128, I128);
let vb = ctx.tmp(RegClass::V128, I128);
let va = ctx.alloc_tmp(RegClass::V128, I128);
let vb = ctx.alloc_tmp(RegClass::V128, I128);
let ra = input_to_reg(ctx, inputs[0], narrow_mode);
let rb = input_to_reg(ctx, inputs[1], narrow_mode);
let rd = output_to_reg(ctx, outputs[0]);
@@ -110,8 +115,8 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
} else {
VecALUOp::UQSubScalar
};
let va = ctx.tmp(RegClass::V128, I128);
let vb = ctx.tmp(RegClass::V128, I128);
let va = ctx.alloc_tmp(RegClass::V128, I128);
let vb = ctx.alloc_tmp(RegClass::V128, I128);
let ra = input_to_reg(ctx, inputs[0], narrow_mode);
let rb = input_to_reg(ctx, inputs[1], narrow_mode);
let rd = output_to_reg(ctx, outputs[0]);
@@ -271,7 +276,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
// Check for divide by 0.
let branch_size = 8;
ctx.emit(Inst::CondBrLowered {
ctx.emit(Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(branch_size),
kind: CondBrKind::NotZero(rm),
});
@@ -297,7 +302,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
// Check for divide by 0.
let branch_size = 20;
ctx.emit(Inst::CondBrLowered {
ctx.emit(Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(branch_size),
kind: CondBrKind::Zero(rm),
});
@@ -324,7 +329,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
nzcv: NZCV::new(false, false, false, false),
cond: Cond::Eq,
});
ctx.emit(Inst::CondBrLowered {
ctx.emit(Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(12),
kind: CondBrKind::Cond(Cond::Vc),
});
@@ -337,7 +342,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
// Check for divide by 0.
let branch_size = 8;
ctx.emit(Inst::CondBrLowered {
ctx.emit(Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(branch_size),
kind: CondBrKind::NotZero(rm),
});
@@ -493,7 +498,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
// ignored (because of the implicit masking done by the instruction),
// so this is equivalent to negating the input.
let alu_op = choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64);
let tmp = ctx.tmp(RegClass::I64, ty);
let tmp = ctx.alloc_tmp(RegClass::I64, ty);
ctx.emit(Inst::AluRRR {
alu_op,
rd: tmp,
@@ -516,7 +521,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
// Really ty_bits_size - rn, but the upper bits of the result are
// ignored (because of the implicit masking done by the instruction),
// so this is equivalent to negating the input.
let tmp = ctx.tmp(RegClass::I64, I32);
let tmp = ctx.alloc_tmp(RegClass::I64, I32);
ctx.emit(Inst::AluRRR {
alu_op: ALUOp::Sub32,
rd: tmp,
@@ -529,7 +534,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
};
// Explicitly mask the rotation count.
let tmp_masked_rm = ctx.tmp(RegClass::I64, I32);
let tmp_masked_rm = ctx.alloc_tmp(RegClass::I64, I32);
ctx.emit(Inst::AluRRImmLogic {
alu_op: ALUOp::And32,
rd: tmp_masked_rm,
@@ -538,8 +543,8 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
});
let tmp_masked_rm = tmp_masked_rm.to_reg();
let tmp1 = ctx.tmp(RegClass::I64, I32);
let tmp2 = ctx.tmp(RegClass::I64, I32);
let tmp1 = ctx.alloc_tmp(RegClass::I64, I32);
let tmp2 = ctx.alloc_tmp(RegClass::I64, I32);
ctx.emit(Inst::AluRRImm12 {
alu_op: ALUOp::Sub32,
rd: tmp1,
@@ -578,7 +583,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
}
immshift.imm &= ty_bits_size - 1;
let tmp1 = ctx.tmp(RegClass::I64, I32);
let tmp1 = ctx.alloc_tmp(RegClass::I64, I32);
ctx.emit(Inst::AluRRImmShift {
alu_op: ALUOp::Lsr32,
rd: tmp1,
@@ -683,7 +688,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
// and fix the sequence below to work properly for this.
let narrow_mode = NarrowValueMode::ZeroExtend64;
let rn = input_to_reg(ctx, inputs[0], narrow_mode);
let tmp = ctx.tmp(RegClass::I64, I64);
let tmp = ctx.alloc_tmp(RegClass::I64, I64);
// If this is a 32-bit Popcnt, use Lsr32 to clear the top 32 bits of the register, then
// the rest of the code is identical to the 64-bit version.
@@ -992,7 +997,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
}
Opcode::Bitselect => {
let tmp = ctx.tmp(RegClass::I64, I64);
let tmp = ctx.alloc_tmp(RegClass::I64, I64);
let rd = output_to_reg(ctx, outputs[0]);
let rcond = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
let rn = input_to_reg(ctx, inputs[1], NarrowValueMode::None);
@@ -1211,7 +1216,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
// Branch around the break instruction with inverted cond. Go straight to lowered
// one-target form; this is logically part of a single-in single-out template lowering.
let cond = cond.invert();
ctx.emit(Inst::CondBrLowered {
ctx.emit(Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(8),
kind: CondBrKind::Cond(cond),
});
@@ -1301,11 +1306,12 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
Opcode::GetPinnedReg => {
let rd = output_to_reg(ctx, outputs[0]);
ctx.emit(Inst::GetPinnedReg { rd });
ctx.emit(Inst::mov(rd, xreg(PINNED_REG)));
}
Opcode::SetPinnedReg => {
let rm = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
ctx.emit(Inst::SetPinnedReg { rm });
ctx.emit(Inst::mov(writable_xreg(PINNED_REG), rm));
}
Opcode::Spill
@@ -1469,8 +1475,8 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
let rm = input_to_reg(ctx, inputs[1], NarrowValueMode::None);
let rd = output_to_reg(ctx, outputs[0]);
let tmp1 = ctx.tmp(RegClass::I64, I64);
let tmp2 = ctx.tmp(RegClass::I64, I64);
let tmp1 = ctx.alloc_tmp(RegClass::I64, I64);
let tmp2 = ctx.alloc_tmp(RegClass::I64, I64);
ctx.emit(Inst::MovFromVec64 { rd: tmp1, rn: rn });
ctx.emit(Inst::MovFromVec64 { rd: tmp2, rn: rm });
let imml = if bits == 32 {
@@ -1533,14 +1539,14 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
} else {
ctx.emit(Inst::FpuCmp64 { rn, rm: rn });
}
ctx.emit(Inst::CondBrLowered {
ctx.emit(Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(8),
kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::Ordered)),
});
let trap_info = (ctx.srcloc(insn), TrapCode::BadConversionToInteger);
ctx.emit(Inst::Udf { trap_info });
let tmp = ctx.tmp(RegClass::V128, I128);
let tmp = ctx.alloc_tmp(RegClass::V128, I128);
// Check that the input is in range, with "truncate towards zero" semantics. This means
// we allow values that are slightly out of range:
@@ -1574,7 +1580,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
rn,
rm: tmp.to_reg(),
});
ctx.emit(Inst::CondBrLowered {
ctx.emit(Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(8),
kind: CondBrKind::Cond(lower_fp_condcode(low_cond)),
});
@@ -1587,7 +1593,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
rn,
rm: tmp.to_reg(),
});
ctx.emit(Inst::CondBrLowered {
ctx.emit(Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(8),
kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::LessThan)),
});
@@ -1617,7 +1623,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
rn,
rm: tmp.to_reg(),
});
ctx.emit(Inst::CondBrLowered {
ctx.emit(Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(8),
kind: CondBrKind::Cond(lower_fp_condcode(low_cond)),
});
@@ -1630,7 +1636,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
rn,
rm: tmp.to_reg(),
});
ctx.emit(Inst::CondBrLowered {
ctx.emit(Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(8),
kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::LessThan)),
});
@@ -1706,8 +1712,8 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
_ => unreachable!(),
};
let rtmp1 = ctx.tmp(RegClass::V128, in_ty);
let rtmp2 = ctx.tmp(RegClass::V128, in_ty);
let rtmp1 = ctx.alloc_tmp(RegClass::V128, in_ty);
let rtmp2 = ctx.alloc_tmp(RegClass::V128, in_ty);
if in_bits == 32 {
ctx.emit(Inst::LoadFpuConst32 {
@@ -1862,14 +1868,16 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
Opcode::AvgRound => unimplemented!(),
Opcode::TlsValue => unimplemented!(),
}
Ok(())
}
pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
ctx: &mut C,
branches: &[IRInst],
targets: &[BlockIndex],
fallthrough: Option<BlockIndex>,
) {
targets: &[MachLabel],
fallthrough: Option<MachLabel>,
) -> CodegenResult<()> {
// A block should end with at most two branches. The first may be a
// conditional branch; a conditional branch can be followed only by an
// unconditional branch or fallthrough. Otherwise, if only one branch,
@@ -1883,18 +1891,14 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
let op0 = ctx.data(branches[0]).opcode();
let op1 = ctx.data(branches[1]).opcode();
//println!(
// "lowering two-branch group: opcodes are {:?} and {:?}",
// op0, op1
//);
assert!(op1 == Opcode::Jump || op1 == Opcode::Fallthrough);
let taken = BranchTarget::Block(targets[0]);
let taken = BranchTarget::Label(targets[0]);
let not_taken = match op1 {
Opcode::Jump => BranchTarget::Block(targets[1]),
Opcode::Fallthrough => BranchTarget::Block(fallthrough.unwrap()),
Opcode::Jump => BranchTarget::Label(targets[1]),
Opcode::Fallthrough => BranchTarget::Label(fallthrough.unwrap()),
_ => unreachable!(), // assert above.
};
match op0 {
Opcode::Brz | Opcode::Brnz => {
let flag_input = InsnInput {
@@ -1954,6 +1958,8 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
Opcode::BrIcmp => {
let condcode = inst_condcode(ctx.data(branches[0])).unwrap();
let cond = lower_condcode(condcode);
let kind = CondBrKind::Cond(cond);
let is_signed = condcode_is_signed(condcode);
let ty = ctx.input_ty(branches[0], 0);
let bits = ty_bits(ty);
@@ -1986,13 +1992,15 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
ctx.emit(Inst::CondBr {
taken,
not_taken,
kind: CondBrKind::Cond(cond),
kind,
});
}
Opcode::Brif => {
let condcode = inst_condcode(ctx.data(branches[0])).unwrap();
let cond = lower_condcode(condcode);
let kind = CondBrKind::Cond(cond);
let is_signed = condcode_is_signed(condcode);
let flag_input = InsnInput {
insn: branches[0],
@@ -2003,7 +2011,7 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
ctx.emit(Inst::CondBr {
taken,
not_taken,
kind: CondBrKind::Cond(cond),
kind,
});
} else {
// If the ifcmp result is actually placed in a
@@ -2013,7 +2021,7 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
ctx.emit(Inst::CondBr {
taken,
not_taken,
kind: CondBrKind::Cond(cond),
kind,
});
}
}
@@ -2021,6 +2029,7 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
Opcode::Brff => {
let condcode = inst_fp_condcode(ctx.data(branches[0])).unwrap();
let cond = lower_fp_condcode(condcode);
let kind = CondBrKind::Cond(cond);
let flag_input = InsnInput {
insn: branches[0],
input: 0,
@@ -2030,7 +2039,7 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
ctx.emit(Inst::CondBr {
taken,
not_taken,
kind: CondBrKind::Cond(cond),
kind,
});
} else {
// If the ffcmp result is actually placed in a
@@ -2040,7 +2049,7 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
ctx.emit(Inst::CondBr {
taken,
not_taken,
kind: CondBrKind::Cond(cond),
kind,
});
}
}
@@ -2057,12 +2066,15 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
// fills in `targets[0]` with our fallthrough block, so this
// is valid for both Jump and Fallthrough.
ctx.emit(Inst::Jump {
dest: BranchTarget::Block(targets[0]),
dest: BranchTarget::Label(targets[0]),
});
}
Opcode::BrTable => {
// Expand `br_table index, default, JT` to:
//
// emit_island // this forces an island at this point
// // if the jumptable would push us past
// // the deadline
// subs idx, #jt_size
// b.hs default
// adr vTmp1, PC+16
@@ -2072,6 +2084,11 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
// [jumptable offsets relative to JT base]
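// Note: `targets[0]` is the default target and `targets[1..]` are the
// jump-table entries (see `default_target` and the `.skip(1)` below), hence
// `jt_size = targets.len() - 1`.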
let jt_size = targets.len() - 1;
assert!(jt_size <= std::u32::MAX as usize);
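// Every AArch64 instruction and every 32-bit table entry is 4 bytes, so
// reserve room for the six fixed instructions of the expansion above plus
// one word per jump-table entry.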
ctx.emit(Inst::EmitIsland {
needed_space: 4 * (6 + jt_size) as CodeOffset,
});
let ridx = input_to_reg(
ctx,
InsnInput {
@@ -2081,8 +2098,8 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
NarrowValueMode::ZeroExtend32,
);
let rtmp1 = ctx.tmp(RegClass::I64, I32);
let rtmp2 = ctx.tmp(RegClass::I64, I32);
let rtmp1 = ctx.alloc_tmp(RegClass::I64, I32);
let rtmp2 = ctx.alloc_tmp(RegClass::I64, I32);
// Bounds-check and branch to default.
if let Some(imm12) = Imm12::maybe_from_u64(jt_size as u64) {
@@ -2101,10 +2118,10 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
rm: rtmp1.to_reg(),
});
}
let default_target = BranchTarget::Block(targets[0]);
ctx.emit(Inst::CondBrLowered {
kind: CondBrKind::Cond(Cond::Hs), // unsigned >=
let default_target = BranchTarget::Label(targets[0]);
ctx.emit(Inst::OneWayCondBr {
target: default_target.clone(),
kind: CondBrKind::Cond(Cond::Hs), // unsigned >=
});
// Emit the compound instruction that does:
@@ -2125,9 +2142,9 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
let jt_targets: Vec<BranchTarget> = targets
.iter()
.skip(1)
.map(|bix| BranchTarget::Block(*bix))
.map(|bix| BranchTarget::Label(*bix))
.collect();
let targets_for_term: Vec<BlockIndex> = targets.to_vec();
let targets_for_term: Vec<MachLabel> = targets.to_vec();
ctx.emit(Inst::JTSequence {
ridx,
rtmp1,
@@ -2140,4 +2157,6 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
_ => panic!("Unknown branch type!"),
}
}
Ok(())
}


@@ -15,7 +15,7 @@ use target_lexicon::{Aarch64Architecture, Architecture, Triple};
// New backend:
mod abi;
mod inst;
pub(crate) mod inst;
mod lower;
mod lower_inst;
@@ -59,7 +59,7 @@ impl MachBackend for AArch64Backend {
) -> CodegenResult<MachCompileResult> {
let flags = self.flags();
let vcode = self.compile_vcode(func, flags.clone())?;
let sections = vcode.emit();
let buffer = vcode.emit();
let frame_size = vcode.frame_size();
let disasm = if want_disasm {
@@ -68,8 +68,10 @@ impl MachBackend for AArch64Backend {
None
};
let buffer = buffer.finish();
Ok(MachCompileResult {
sections,
buffer,
frame_size,
disasm,
})
@@ -140,8 +142,8 @@ mod test {
Triple::from_str("aarch64").unwrap(),
settings::Flags::new(shared_flags),
);
let sections = backend.compile_function(&mut func, false).unwrap().sections;
let code = &sections.sections[0].data;
let buffer = backend.compile_function(&mut func, false).unwrap().buffer;
let code = &buffer.data[..];
// stp x29, x30, [sp, #-16]!
// mov x29, sp
@@ -155,7 +157,7 @@ mod test {
0x01, 0x0b, 0xbf, 0x03, 0x00, 0x91, 0xfd, 0x7b, 0xc1, 0xa8, 0xc0, 0x03, 0x5f, 0xd6,
];
assert_eq!(code, &golden);
assert_eq!(code, &golden[..]);
}
#[test]
@@ -198,34 +200,32 @@ mod test {
let result = backend
.compile_function(&mut func, /* want_disasm = */ false)
.unwrap();
let code = &result.sections.sections[0].data;
let code = &result.buffer.data[..];
// stp x29, x30, [sp, #-16]!
// mov x29, sp
// mov x1, x0
// mov x0, #0x1234
// add w1, w1, w0
// mov w2, w1
// cbz x2, ...
// mov w2, w1
// cbz x2, ...
// sub w0, w1, w0
// mov x1, #0x1234 // #4660
// add w0, w0, w1
// mov w1, w0
// cbnz x1, 0x28
// mov x1, #0x1234 // #4660
// add w1, w0, w1
// mov w1, w1
// cbnz x1, 0x18
// mov w1, w0
// cbnz x1, 0x18
// mov x1, #0x1234 // #4660
// sub w0, w0, w1
// mov sp, x29
// ldp x29, x30, [sp], #16
// ret
// add w2, w1, w0
// mov w2, w2
// cbnz x2, ... <---- compound branch (cond / uncond)
// b ... <----
let golden = vec![
0xfd, 0x7b, 0xbf, 0xa9, 0xfd, 0x03, 0x00, 0x91, 0xe1, 0x03, 0x00, 0xaa, 0x80, 0x46,
0x82, 0xd2, 0x21, 0x00, 0x00, 0x0b, 0xe2, 0x03, 0x01, 0x2a, 0xe2, 0x00, 0x00, 0xb4,
0xe2, 0x03, 0x01, 0x2a, 0xa2, 0x00, 0x00, 0xb5, 0x20, 0x00, 0x00, 0x4b, 0xbf, 0x03,
0x00, 0x91, 0xfd, 0x7b, 0xc1, 0xa8, 0xc0, 0x03, 0x5f, 0xd6, 0x22, 0x00, 0x00, 0x0b,
0xe2, 0x03, 0x02, 0x2a, 0xc2, 0xff, 0xff, 0xb5, 0xf7, 0xff, 0xff, 0x17,
253, 123, 191, 169, 253, 3, 0, 145, 129, 70, 130, 210, 0, 0, 1, 11, 225, 3, 0, 42, 161,
0, 0, 181, 129, 70, 130, 210, 1, 0, 1, 11, 225, 3, 1, 42, 161, 255, 255, 181, 225, 3,
0, 42, 97, 255, 255, 181, 129, 70, 130, 210, 0, 0, 1, 75, 191, 3, 0, 145, 253, 123,
193, 168, 192, 3, 95, 214,
];
assert_eq!(code, &golden);
assert_eq!(code, &golden[..]);
}
}


@@ -84,7 +84,7 @@ mod x64;
mod arm32;
#[cfg(feature = "arm64")]
mod aarch64;
pub(crate) mod aarch64;
#[cfg(feature = "unwind")]
pub mod unwind;


@@ -5,7 +5,6 @@ use std::string::{String, ToString};
use regalloc::{RealRegUniverse, Reg, RegClass, RegUsageCollector};
use crate::binemit::CodeOffset;
use crate::machinst::*;
use super::regs::show_ireg_sized;
@@ -375,43 +374,27 @@ impl fmt::Debug for CC {
/// from end of current instruction).
#[derive(Clone, Copy, Debug)]
pub enum BranchTarget {
/// An unresolved reference to a BlockIndex, as passed into
/// `lower_branch_group()`.
Block(BlockIndex),
/// An unresolved reference to a MachLabel.
Label(MachLabel),
/// A resolved reference to another instruction, after
/// `Inst::with_block_offsets()`. This offset is in bytes.
ResolvedOffset(BlockIndex, isize),
/// A resolved reference to another instruction, in bytes.
ResolvedOffset(isize),
}
impl ShowWithRRU for BranchTarget {
fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
match self {
BranchTarget::Block(bix) => format!("(Block {})", bix),
BranchTarget::ResolvedOffset(bix, offs) => format!("(Block {}, offset {})", bix, offs),
BranchTarget::Label(l) => format!("{:?}", l),
BranchTarget::ResolvedOffset(offs) => format!("(offset {})", offs),
}
}
}
impl BranchTarget {
/// Lower the branch target given offsets of each block.
pub fn lower(&mut self, targets: &[CodeOffset], my_offset: CodeOffset) {
/// Get the label.
pub fn as_label(&self) -> Option<MachLabel> {
match self {
&mut BranchTarget::Block(bix) => {
let bix = bix as usize;
assert!(bix < targets.len());
let block_offset_in_func = targets[bix];
let branch_offset = (block_offset_in_func as isize) - (my_offset as isize);
*self = BranchTarget::ResolvedOffset(bix as BlockIndex, branch_offset);
}
&mut BranchTarget::ResolvedOffset(..) => {}
}
}
/// Get the block index.
pub fn as_block_index(&self) -> Option<BlockIndex> {
match self {
&BranchTarget::Block(bix) => Some(bix),
&BranchTarget::Label(l) => Some(l),
_ => None,
}
}
@@ -421,31 +404,17 @@ impl BranchTarget {
/// byte of the target. It does not take into account the Intel-specific
/// rule that a branch offset is encoded as relative to the start of the
/// following instruction. That is a problem for the emitter to deal
/// with.
pub fn as_offset_i32(&self) -> Option<i32> {
/// with. If a label, returns zero.
pub fn as_offset32_or_zero(&self) -> i32 {
match self {
&BranchTarget::ResolvedOffset(_, off) => {
&BranchTarget::ResolvedOffset(off) => {
// Leave a bit of slack so that the emitter is guaranteed to
// be able to add the length of the jump instruction encoding
// to this value and still have a value in signed-32 range.
if off >= -0x7FFF_FF00isize && off <= 0x7FFF_FF00isize {
Some(off as i32)
} else {
None
}
assert!(off >= -0x7FFF_FF00 && off <= 0x7FFF_FF00);
off as i32
}
_ => None,
}
}
/// Map the block index given a transform map.
pub fn map(&mut self, block_index_map: &[BlockIndex]) {
match self {
&mut BranchTarget::Block(ref mut bix) => {
let n = block_index_map[*bix as usize];
*bix = n;
}
_ => panic!("BranchTarget::map() called on already-lowered BranchTarget!"),
_ => 0,
}
}
}


@@ -80,8 +80,8 @@ const F_PREFIX_66: u32 = 4;
/// deleted if it is redundant (0x40). Note that for a 64-bit operation, the
/// REX prefix will normally never be redundant, since REX.W must be 1 to
/// indicate a 64-bit operation.
fn emit_REX_OPCODES_MODRM_SIB_IMM_encG_memE<O: MachSectionOutput>(
sink: &mut O,
fn emit_REX_OPCODES_MODRM_SIB_IMM_encG_memE(
sink: &mut MachBuffer<Inst>,
opcodes: u32,
mut numOpcodes: usize,
encG: u8,
@@ -199,8 +199,8 @@ fn emit_REX_OPCODES_MODRM_SIB_IMM_encG_memE<O: MachSectionOutput>(
/// emit_REX_OPCODES_MODRM_SIB_IMM_encG_memE, except it is for the case
/// where the E operand is a register rather than memory. Hence it is much
/// simpler.
fn emit_REX_OPCODES_MODRM_encG_encE<O: MachSectionOutput>(
sink: &mut O,
fn emit_REX_OPCODES_MODRM_encG_encE(
sink: &mut MachBuffer<Inst>,
opcodes: u32,
mut numOpcodes: usize,
encG: u8,
@@ -240,8 +240,8 @@ fn emit_REX_OPCODES_MODRM_encG_encE<O: MachSectionOutput>(
// These are merely wrappers for the above two functions that facilitate passing
// actual `Reg`s rather than their encodings.
fn emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE<O: MachSectionOutput>(
sink: &mut O,
fn emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(
sink: &mut MachBuffer<Inst>,
opcodes: u32,
numOpcodes: usize,
regG: Reg,
@@ -253,8 +253,8 @@ fn emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE<O: MachSectionOutput>(
emit_REX_OPCODES_MODRM_SIB_IMM_encG_memE(sink, opcodes, numOpcodes, encG, memE, flags);
}
fn emit_REX_OPCODES_MODRM_regG_regE<O: MachSectionOutput>(
sink: &mut O,
fn emit_REX_OPCODES_MODRM_regG_regE(
sink: &mut MachBuffer<Inst>,
opcodes: u32,
numOpcodes: usize,
regG: Reg,
@@ -268,7 +268,7 @@ fn emit_REX_OPCODES_MODRM_regG_regE<O: MachSectionOutput>(
}
/// Write a suitable number of bits from an imm64 to the sink.
fn emit_simm<O: MachSectionOutput>(sink: &mut O, size: u8, simm32: u32) {
fn emit_simm(sink: &mut MachBuffer<Inst>, size: u8, simm32: u32) {
match size {
8 | 4 => sink.put4(simm32),
2 => sink.put2(simm32 as u16),
@@ -329,7 +329,7 @@ fn emit_simm<O: MachSectionOutput>(sink: &mut O, size: u8, simm32: u32) {
///
/// * there's a shorter encoding for shl/shr/sar by a 1-bit immediate. (Do we
/// care?)
pub(crate) fn emit<O: MachSectionOutput>(inst: &Inst, sink: &mut O) {
pub(crate) fn emit(inst: &Inst, sink: &mut MachBuffer<Inst>) {
match inst {
Inst::Nop { len: 0 } => {}
Inst::Alu_RMI_R {
@@ -808,55 +808,59 @@ pub(crate) fn emit<O: MachSectionOutput>(inst: &Inst, sink: &mut O) {
}
Inst::Ret {} => sink.put1(0xC3),
Inst::JmpKnown {
dest: BranchTarget::Block(..),
} => {
// Computation of block offsets/sizes.
sink.put1(0);
sink.put4(0);
}
Inst::JmpKnown {
dest: BranchTarget::ResolvedOffset(_bix, offset),
} if *offset >= -0x7FFF_FF00 && *offset <= 0x7FFF_FF00 => {
// And now for real
let mut offs_i32 = *offset as i32;
offs_i32 -= 5;
let offs_u32 = offs_i32 as u32;
Inst::JmpKnown { dest } => {
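// `jmp rel32` is 5 bytes (0xE9 + imm32). A resolved offset is relative to the
// start of this instruction, while x86 encodes the displacement relative to the
// next instruction, hence the -5 adjustment. For a label target the offset is
// zero here, and the bytes we emit act as the addend for the Rel32 fixup
// registered below.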
let disp = dest.as_offset32_or_zero() - 5;
let disp = disp as u32;
let br_start = sink.cur_offset();
sink.put1(0xE9);
sink.put4(offs_u32);
let br_disp_off = sink.cur_offset();
sink.put4(disp);
let br_end = sink.cur_offset();
if let Some(l) = dest.as_label() {
sink.use_label_at_offset(br_disp_off, l, LabelUse::Rel32);
sink.add_uncond_branch(br_start, br_end, l);
}
}
//
// ** Inst::JmpCondSymm XXXX should never happen
//
Inst::JmpCond {
cc: _,
target: BranchTarget::Block(..),
} => {
// This case occurs when we are computing block offsets / sizes,
// prior to lowering block-index targets to concrete-offset targets.
// Only the size matters, so let's emit 6 bytes, as below.
sink.put1(0);
sink.put1(0);
sink.put4(0);
}
Inst::JmpCond {
Inst::JmpCondSymm {
cc,
target: BranchTarget::ResolvedOffset(_bix, offset),
} if *offset >= -0x7FFF_FF00 && *offset <= 0x7FFF_FF00 => {
taken,
not_taken,
} => {
// Conditional part.
// This insn is 6 bytes long. Currently `offset` is relative to
// the start of this insn, but the Intel encoding requires it to
// be relative to the start of the next instruction. Hence the
// adjustment.
let mut offs_i32 = *offset as i32;
offs_i32 -= 6;
let offs_u32 = offs_i32 as u32;
let taken_disp = taken.as_offset32_or_zero() - 6;
let taken_disp = taken_disp as u32;
let cond_start = sink.cur_offset();
sink.put1(0x0F);
sink.put1(0x80 + cc.get_enc());
sink.put4(offs_u32);
let cond_disp_off = sink.cur_offset();
sink.put4(taken_disp);
let cond_end = sink.cur_offset();
if let Some(l) = taken.as_label() {
sink.use_label_at_offset(cond_disp_off, l, LabelUse::Rel32);
let inverted: [u8; 6] =
[0x0F, 0x80 + (cc.invert().get_enc()), 0xFA, 0xFF, 0xFF, 0xFF];
sink.add_cond_branch(cond_start, cond_end, l, &inverted[..]);
}
// Unconditional part.
let nt_disp = not_taken.as_offset32_or_zero() - 5;
let nt_disp = nt_disp as u32;
let uncond_start = sink.cur_offset();
sink.put1(0xE9);
let uncond_disp_off = sink.cur_offset();
sink.put4(nt_disp);
let uncond_end = sink.cur_offset();
if let Some(l) = not_taken.as_label() {
sink.use_label_at_offset(uncond_disp_off, l, LabelUse::Rel32);
sink.add_uncond_branch(uncond_start, uncond_end, l);
}
}
//
// ** Inst::JmpCondCompound XXXX should never happen
//
Inst::JmpUnknown { target } => {
match target {
RM::R { reg } => {


@@ -2180,19 +2180,11 @@ fn test_x64_emit() {
let actual_printing = insn.show_rru(Some(&rru));
assert_eq!(expected_printing, actual_printing);
// Check the encoding is as expected.
let text_size = {
let mut code_sec = MachSectionSize::new(0);
insn.emit(&mut code_sec, &flags, &mut Default::default());
code_sec.size()
};
let mut sink = test_utils::TestCodeSink::new();
let mut sections = MachSections::new();
let code_idx = sections.add_section(0, text_size);
let code_sec = sections.get_section(code_idx);
insn.emit(code_sec, &flags, &mut Default::default());
sections.emit(&mut sink);
let mut buffer = MachBuffer::new();
insn.emit(&mut buffer, &flags, &mut Default::default());
let buffer = buffer.finish();
buffer.emit(&mut sink);
let actual_encoding = &sink.stringify();
assert_eq!(expected_encoding, actual_encoding);
}


@@ -4,6 +4,8 @@
#![allow(non_snake_case)]
#![allow(non_camel_case_types)]
use core::convert::TryFrom;
use smallvec::SmallVec;
use std::fmt;
use std::string::{String, ToString};
@@ -16,6 +18,7 @@ use crate::ir::types::{B1, B128, B16, B32, B64, B8, F32, F64, I128, I16, I32, I6
use crate::ir::ExternalName;
use crate::ir::Type;
use crate::machinst::*;
use crate::settings::Flags;
use crate::{settings, CodegenError, CodegenResult};
pub mod args;
@@ -25,7 +28,7 @@ mod emit_tests;
pub mod regs;
use args::*;
use regs::show_ireg_sized;
use regs::{create_reg_universe_systemv, show_ireg_sized};
//=============================================================================
// Instructions (top level): definition
@@ -136,34 +139,15 @@ pub(crate) enum Inst {
JmpKnown { dest: BranchTarget },
/// jcond cond target target
// Symmetrical two-way conditional branch.
// Should never reach the emitter.
/// Symmetrical two-way conditional branch.
/// Emitted as a compound sequence; the MachBuffer will shrink it
/// as appropriate.
JmpCondSymm {
cc: CC,
taken: BranchTarget,
not_taken: BranchTarget,
},
/// Lowered conditional branch: contains the original instruction, and a
/// flag indicating whether to invert the taken-condition or not. Only one
/// BranchTarget is retained, and the other is implicitly the next
/// instruction, given the final basic-block layout.
JmpCond {
cc: CC,
//inverted: bool, is this needed?
target: BranchTarget,
},
/// As for `CondBrLowered`, but represents a condbr/uncond-br sequence (two
/// actual machine instructions). Needed when the final block layout implies
/// that neither arm of a conditional branch targets the fallthrough block.
// Should never reach the emitter
JmpCondCompound {
cc: CC,
taken: BranchTarget,
not_taken: BranchTarget,
},
/// jmpq (reg mem)
JmpUnknown { target: RM },
}
@@ -298,18 +282,6 @@ impl Inst {
}
}
pub(crate) fn jmp_cond(cc: CC, target: BranchTarget) -> Inst {
Inst::JmpCond { cc, target }
}
pub(crate) fn jmp_cond_compound(cc: CC, taken: BranchTarget, not_taken: BranchTarget) -> Inst {
Inst::JmpCondCompound {
cc,
taken,
not_taken,
}
}
pub(crate) fn jmp_unknown(target: RM) -> Inst {
Inst::JmpUnknown { target }
}
@@ -485,13 +457,6 @@ impl ShowWithRRU for Inst {
not_taken.show_rru(mb_rru)
),
//
Inst::JmpCond { cc, ref target } => format!(
"{} {}",
ljustify2("j".to_string(), cc.to_string()),
target.show_rru(None)
),
//
Inst::JmpCondCompound { .. } => "**JmpCondCompound**".to_string(),
Inst::JmpUnknown { target } => format!(
"{} *{}",
ljustify("jmp".to_string()),
@@ -601,18 +566,10 @@ fn x64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
taken: _,
not_taken: _,
} => {}
//
// ** JmpCond
//
// ** JmpCondCompound
//
//Inst::JmpUnknown { target } => {
// target.get_regs_as_uses(collector);
//}
Inst::Nop { .. }
| Inst::JmpCond { .. }
| Inst::JmpCondCompound { .. }
| Inst::JmpUnknown { .. } => unimplemented!("x64_get_regs inst"),
Inst::Nop { .. } | Inst::JmpUnknown { .. } => unimplemented!("x64_get_regs inst"),
}
}
@@ -767,18 +724,10 @@ fn x64_map_regs(inst: &mut Inst, mapper: &RegUsageMapper) {
taken: _,
not_taken: _,
} => {}
//
// ** JmpCond
//
// ** JmpCondCompound
//
//Inst::JmpUnknown { target } => {
// target.apply_map(mapper);
//}
Inst::Nop { .. }
| Inst::JmpCond { .. }
| Inst::JmpCondCompound { .. }
| Inst::JmpUnknown { .. } => unimplemented!("x64_map_regs opcode"),
Inst::Nop { .. } | Inst::JmpUnknown { .. } => unimplemented!("x64_map_regs opcode"),
}
}
@@ -817,18 +766,12 @@ impl MachInst for Inst {
match self {
// Interesting cases.
&Self::Ret | &Self::EpiloguePlaceholder => MachTerminator::Ret,
&Self::JmpKnown { dest } => MachTerminator::Uncond(dest.as_block_index().unwrap()),
&Self::JmpKnown { dest } => MachTerminator::Uncond(dest.as_label().unwrap()),
&Self::JmpCondSymm {
cc: _,
taken,
not_taken,
} => MachTerminator::Cond(
taken.as_block_index().unwrap(),
not_taken.as_block_index().unwrap(),
),
&Self::JmpCond { .. } | &Self::JmpCondCompound { .. } => {
panic!("is_term() called after lowering branches");
}
} => MachTerminator::Cond(taken.as_label().unwrap(), not_taken.as_label().unwrap()),
// All other cases are boring.
_ => MachTerminator::None,
}
@@ -868,87 +811,95 @@ impl MachInst for Inst {
}
}
fn gen_jump(blockindex: BlockIndex) -> Inst {
Inst::jmp_known(BranchTarget::Block(blockindex))
fn gen_jump(label: MachLabel) -> Inst {
Inst::jmp_known(BranchTarget::Label(label))
}
fn with_block_rewrites(&mut self, block_target_map: &[BlockIndex]) {
// This is identical (modulo renaming) to the arm64 version.
match self {
&mut Inst::JmpKnown { ref mut dest } => {
dest.map(block_target_map);
}
&mut Inst::JmpCondSymm {
cc: _,
ref mut taken,
ref mut not_taken,
} => {
taken.map(block_target_map);
not_taken.map(block_target_map);
}
&mut Inst::JmpCond { .. } | &mut Inst::JmpCondCompound { .. } => {
panic!("with_block_rewrites called after branch lowering!");
}
_ => {}
}
fn gen_constant(to_reg: Writable<Reg>, value: u64, _: Type) -> SmallVec<[Self; 4]> {
let mut ret = SmallVec::new();
let is64 = value > 0xffff_ffff;
ret.push(Inst::imm_r(is64, value, to_reg));
ret
}
fn with_fallthrough_block(&mut self, fallthrough: Option<BlockIndex>) {
// This is identical (modulo renaming) to the arm64 version.
match self {
&mut Inst::JmpCondSymm {
cc,
taken,
not_taken,
} => {
if taken.as_block_index() == fallthrough {
*self = Inst::jmp_cond(cc.invert(), not_taken);
} else if not_taken.as_block_index() == fallthrough {
*self = Inst::jmp_cond(cc, taken);
} else {
// We need a compound sequence (condbr / uncond-br).
*self = Inst::jmp_cond_compound(cc, taken, not_taken);
}
}
&mut Inst::JmpKnown { dest } => {
if dest.as_block_index() == fallthrough {
*self = Inst::nop(0);
}
}
_ => {}
}
fn reg_universe(flags: &Flags) -> RealRegUniverse {
create_reg_universe_systemv(flags)
}
fn with_block_offsets(&mut self, my_offset: CodeOffset, targets: &[CodeOffset]) {
// This is identical (modulo renaming) to the arm64 version.
match self {
&mut Self::JmpCond {
cc: _,
ref mut target,
} => {
target.lower(targets, my_offset);
}
&mut Self::JmpCondCompound {
cc: _,
ref mut taken,
ref mut not_taken,
..
} => {
taken.lower(targets, my_offset);
not_taken.lower(targets, my_offset);
}
&mut Self::JmpKnown { ref mut dest } => {
dest.lower(targets, my_offset);
}
_ => {}
}
fn worst_case_size() -> CodeOffset {
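// An x86-64 instruction is at most 15 bytes; this also covers the
// two-instruction conditional-branch sequence (6 + 5 bytes) emitted for
// `JmpCondSymm`.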
15
}
type LabelUse = LabelUse;
}
impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
impl MachInstEmit for Inst {
type State = ();
fn emit(&self, sink: &mut O, _flags: &settings::Flags, _: &mut Self::State) {
fn emit(&self, sink: &mut MachBuffer<Inst>, _flags: &settings::Flags, _: &mut Self::State) {
emit::emit(self, sink);
}
}
/// A label-use (internal relocation) in generated code.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) enum LabelUse {
/// A 32-bit offset from location of relocation itself, added to the
/// existing value at that location.
Rel32,
}
impl MachInstLabelUse for LabelUse {
const ALIGN: CodeOffset = 1;
fn max_pos_range(self) -> CodeOffset {
match self {
LabelUse::Rel32 => 0x7fff_ffff,
}
}
fn max_neg_range(self) -> CodeOffset {
match self {
LabelUse::Rel32 => 0x8000_0000,
}
}
fn patch_size(self) -> CodeOffset {
match self {
LabelUse::Rel32 => 4,
}
}
fn patch(self, buffer: &mut [u8], use_offset: CodeOffset, label_offset: CodeOffset) {
match self {
LabelUse::Rel32 => {
let addend = i32::from_le_bytes([buffer[0], buffer[1], buffer[2], buffer[3]]);
let value = i32::try_from(label_offset)
.unwrap()
.wrapping_sub(i32::try_from(use_offset).unwrap())
.wrapping_add(addend);
buffer.copy_from_slice(&value.to_le_bytes()[..]);
}
}
}
fn supports_veneer(self) -> bool {
match self {
LabelUse::Rel32 => false,
}
}
fn veneer_size(self) -> CodeOffset {
match self {
LabelUse::Rel32 => 0,
}
}
fn generate_veneer(self, _: &mut [u8], _: CodeOffset) -> (CodeOffset, LabelUse) {
match self {
LabelUse::Rel32 => {
panic!("Veneer not supported for Rel32 label-use.");
}
}
}
}


@@ -12,6 +12,7 @@ use crate::ir::{InstructionData, Opcode, Type};
use crate::machinst::lower::*;
use crate::machinst::*;
use crate::result::CodegenResult;
use crate::isa::x64::inst::args::*;
use crate::isa::x64::inst::*;
@@ -94,6 +95,16 @@ fn intCC_to_x64_CC(cc: IntCC) -> CC {
}
}
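/// Get the register holding input `input` of instruction `iri`, recording the
/// register use with the lowering context (`use_input_reg`).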
fn input_to_reg<'a>(ctx: Ctx<'a>, iri: IRInst, input: usize) -> Reg {
let inputs = ctx.get_input(iri, input);
ctx.use_input_reg(inputs);
inputs.reg
}
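/// Get the writable register for output `output` of instruction `iri`.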
fn output_to_reg<'a>(ctx: Ctx<'a>, iri: IRInst, output: usize) -> Writable<Reg> {
ctx.get_output(iri, output)
}
//=============================================================================
// Top-level instruction lowering entry point, for one instruction.
@@ -114,7 +125,7 @@ fn lower_insn_to_regs<'a>(ctx: Ctx<'a>, iri: IRInst) {
// Get exactly the bit pattern in 'w64' into the dest. No
// monkeying with sign extension etc.
let dstIs64 = w64 > 0xFFFF_FFFF;
let regD = ctx.output(iri, 0);
let regD = output_to_reg(ctx, iri, 0);
ctx.emit(Inst::imm_r(dstIs64, w64, regD));
} else {
unimplemented!();
@@ -122,9 +133,9 @@ fn lower_insn_to_regs<'a>(ctx: Ctx<'a>, iri: IRInst) {
}
Opcode::Iadd | Opcode::Isub => {
let regD = ctx.output(iri, 0);
let regL = ctx.input(iri, 0);
let regR = ctx.input(iri, 1);
let regD = output_to_reg(ctx, iri, 0);
let regL = input_to_reg(ctx, iri, 0);
let regR = input_to_reg(ctx, iri, 1);
let is64 = int_ty_to_is64(ty.unwrap());
let how = if op == Opcode::Iadd {
RMI_R_Op::Add
@@ -139,9 +150,9 @@ fn lower_insn_to_regs<'a>(ctx: Ctx<'a>, iri: IRInst) {
// TODO: implement imm shift value into insn
let tySL = ctx.input_ty(iri, 0);
let tyD = ctx.output_ty(iri, 0); // should be the same as tySL
let regSL = ctx.input(iri, 0);
let regSR = ctx.input(iri, 1);
let regD = ctx.output(iri, 0);
let regSL = input_to_reg(ctx, iri, 0);
let regSR = input_to_reg(ctx, iri, 1);
let regD = output_to_reg(ctx, iri, 0);
if tyD == tySL && (tyD == types::I32 || tyD == types::I64) {
let how = match op {
Opcode::Ishl => ShiftKind::Left,
@@ -168,8 +179,8 @@ fn lower_insn_to_regs<'a>(ctx: Ctx<'a>, iri: IRInst) {
let isZX = op == Opcode::Uextend;
let tyS = ctx.input_ty(iri, 0);
let tyD = ctx.output_ty(iri, 0);
let regS = ctx.input(iri, 0);
let regD = ctx.output(iri, 0);
let regS = input_to_reg(ctx, iri, 0);
let regD = output_to_reg(ctx, iri, 0);
ctx.emit(Inst::mov_r_r(true, regS, regD));
match (tyS, tyD, isZX) {
(types::I8, types::I64, false) => {
@@ -182,7 +193,7 @@ fn lower_insn_to_regs<'a>(ctx: Ctx<'a>, iri: IRInst) {
Opcode::FallthroughReturn | Opcode::Return => {
for i in 0..ctx.num_inputs(iri) {
let src_reg = ctx.input(iri, i);
let src_reg = input_to_reg(ctx, iri, i);
let retval_reg = ctx.retval(i);
ctx.emit(Inst::mov_r_r(true, src_reg, retval_reg));
}
@@ -219,35 +230,6 @@ fn lower_insn_to_regs<'a>(ctx: Ctx<'a>, iri: IRInst) {
panic!("ALU+imm and ALU+carry ops should not appear here!");
}
Opcode::X86Udivmodx
| Opcode::X86Sdivmodx
| Opcode::X86Umulx
| Opcode::X86Smulx
| Opcode::X86Cvtt2si
| Opcode::X86Fmin
| Opcode::X86Fmax
| Opcode::X86Push
| Opcode::X86Pop
| Opcode::X86Bsr
| Opcode::X86Bsf
| Opcode::X86Pshufd
| Opcode::X86Pshufb
| Opcode::X86Pextr
| Opcode::X86Pinsr
| Opcode::X86Insertps
| Opcode::X86Movsd
| Opcode::X86Movlhps
| Opcode::X86Psll
| Opcode::X86Psrl
| Opcode::X86Psra
| Opcode::X86Ptest
| Opcode::X86Pmaxs
| Opcode::X86Pmaxu
| Opcode::X86Pmins
| Opcode::X86Pminu => {
panic!("x86-specific opcode in supposedly arch-neutral IR!");
}
_ => unimplemented!("unimplemented lowering for opcode {:?}", op),
}
}
@@ -258,17 +240,18 @@ fn lower_insn_to_regs<'a>(ctx: Ctx<'a>, iri: IRInst) {
impl LowerBackend for X64Backend {
type MInst = Inst;
fn lower<C: LowerCtx<I = Inst>>(&self, ctx: &mut C, ir_inst: IRInst) {
fn lower<C: LowerCtx<I = Inst>>(&self, ctx: &mut C, ir_inst: IRInst) -> CodegenResult<()> {
lower_insn_to_regs(ctx, ir_inst);
Ok(())
}
fn lower_branch_group<C: LowerCtx<I = Inst>>(
&self,
ctx: &mut C,
branches: &[IRInst],
targets: &[BlockIndex],
fallthrough: Option<BlockIndex>,
) {
targets: &[MachLabel],
fallthrough: Option<MachLabel>,
) -> CodegenResult<()> {
// A block should end with at most two branches. The first may be a
// conditional branch; a conditional branch can be followed only by an
// unconditional branch or fallthrough. Otherwise, if only one branch,
@@ -290,17 +273,17 @@ impl LowerBackend for X64Backend {
);
assert!(op1 == Opcode::Jump || op1 == Opcode::Fallthrough);
let taken = BranchTarget::Block(targets[0]);
let taken = BranchTarget::Label(targets[0]);
let not_taken = match op1 {
Opcode::Jump => BranchTarget::Block(targets[1]),
Opcode::Fallthrough => BranchTarget::Block(fallthrough.unwrap()),
Opcode::Jump => BranchTarget::Label(targets[1]),
Opcode::Fallthrough => BranchTarget::Label(fallthrough.unwrap()),
_ => unreachable!(), // assert above.
};
match op0 {
Opcode::Brz | Opcode::Brnz => {
let tyS = ctx.input_ty(branches[0], 0);
if is_int_ty(tyS) {
let rS = ctx.input(branches[0], 0);
let rS = input_to_reg(ctx, branches[0], 0);
let cc = match op0 {
Opcode::Brz => CC::Z,
Opcode::Brnz => CC::NZ,
@@ -316,8 +299,8 @@ impl LowerBackend for X64Backend {
Opcode::BrIcmp => {
let tyS = ctx.input_ty(branches[0], 0);
if is_int_ty(tyS) {
let rSL = ctx.input(branches[0], 0);
let rSR = ctx.input(branches[0], 1);
let rSL = input_to_reg(ctx, branches[0], 0);
let rSR = input_to_reg(ctx, branches[0], 1);
let cc = intCC_to_x64_CC(inst_condcode(ctx.data(branches[0])));
let sizeB = int_ty_to_sizeB(tyS);
// FIXME verify rSR vs rSL ordering
@@ -339,10 +322,10 @@ impl LowerBackend for X64Backend {
let op = ctx.data(branches[0]).opcode();
match op {
Opcode::Jump => {
ctx.emit(Inst::jmp_known(BranchTarget::Block(targets[0])));
ctx.emit(Inst::jmp_known(BranchTarget::Label(targets[0])));
}
Opcode::Fallthrough => {
ctx.emit(Inst::jmp_known(BranchTarget::Block(targets[0])));
ctx.emit(Inst::jmp_known(BranchTarget::Label(targets[0])));
}
Opcode::Trap => {
unimplemented = true;
@@ -354,5 +337,7 @@ impl LowerBackend for X64Backend {
if unimplemented {
unimplemented!("lower_branch_group(x64): can't handle: {:?}", branches);
}
Ok(())
}
}


@@ -52,7 +52,8 @@ impl MachBackend for X64Backend {
) -> CodegenResult<MachCompileResult> {
let flags = self.flags();
let vcode = self.compile_vcode(func, flags.clone())?;
let sections = vcode.emit();
let buffer = vcode.emit();
let buffer = buffer.finish();
let frame_size = vcode.frame_size();
let disasm = if want_disasm {
@@ -62,7 +63,7 @@ impl MachBackend for X64Backend {
};
Ok(MachCompileResult {
sections,
buffer,
frame_size,
disasm,
})


@@ -99,7 +99,6 @@ mod iterators;
mod legalizer;
mod licm;
mod nan_canonicalization;
mod num_uses;
mod partition_slice;
mod postopt;
mod predicates;


@@ -1,49 +1,624 @@
//! Computation of basic block order in emitted code.
//!
//! This module handles the translation from CLIF BBs to VCode BBs.
//!
//! The basic idea is that we compute a sequence of "lowered blocks" that
//! correspond to one or more blocks in the graph: (CLIF CFG) `union` (implicit
//! block on *every* edge). Conceptually, the lowering pipeline wants to insert
//! moves for phi-nodes on every block-to-block transfer; these blocks always
//! conceptually exist, but may be merged with an "original" CLIF block (and
//! hence not actually exist; this is equivalent to inserting the blocks only on
//! critical edges).
//!
//! In other words, starting from a CFG like this (where each "CLIF block" and
//! "(edge N->M)" is a separate basic block):
//!
//! ```plain
//!
//!           CLIF block 0
//!            /        \
//!   (edge 0->1)      (edge 0->2)
//!        |                |
//!   CLIF block 1      CLIF block 2
//!            \        /
//!      (edge 1->3)  (edge 2->3)
//!              \    /
//!           CLIF block 3
//! ```
//!
//! We can produce a CFG of lowered blocks like so:
//!
//! ```plain
//!              +--------------+
//!              | CLIF block 0 |
//!              +--------------+
//!               /            \
//!   +--------------+      +--------------+
//!   | (edge 0->1)  |      | (edge 0->2)  |
//!   | CLIF block 1 |      | CLIF block 2 |
//!   +--------------+      +--------------+
//!               \            /
//!        +-----------+  +-----------+
//!        |(edge 1->3)|  |(edge 2->3)|
//!        +-----------+  +-----------+
//!                \        /
//!              +------------+
//!              |CLIF block 3|
//!              +------------+
//! ```
//!
//! (note that the edges into CLIF blocks 1 and 2 could be merged with those
//! blocks' original bodies, but the out-edges could not because for simplicity
//! in the successor-function definition, we only ever merge an edge onto one
//! side of an original CLIF block.)
//!
//! Each `LoweredBlock` names just an original CLIF block, an original CLIF
//! block prepended or appended with an edge block (never both, though), or just
//! an edge block.
//!
//! To compute this lowering, we do a DFS over the CLIF-plus-edge-block graph
//! (never actually materialized, just defined by a "successors" function), and
//! compute the reverse postorder.
//!
//! This algorithm isn't perfect w.r.t. generated code quality: we don't, for
//! example, consider any information about whether edge blocks will actually
//! have content, because this computation happens as part of lowering *before*
//! regalloc, and regalloc may or may not insert moves/spills/reloads on any
//! particular edge. But it works relatively well and is conceptually simple.
//! Furthermore, the [MachBuffer] machine-code sink performs final peephole-like
//! branch editing that in practice elides empty blocks and simplifies some of
//! the other redundancies that this scheme produces.
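//!
//! For the diamond example above, the lowered order that results (and that
//! `test_blockorder_diamond` below checks) is:
//!
//! ```plain
//! [ CLIF block 0 ]
//! [ (edge 0->1) CLIF block 1 ]
//! [ (edge 1->3) ]
//! [ (edge 0->2) CLIF block 2 ]
//! [ (edge 2->3) ]
//! [ CLIF block 3 ]
//! ```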
use crate::entity::SecondaryMap;
use crate::fx::{FxHashMap, FxHashSet};
use crate::ir::{Block, Function, Inst, Opcode};
use crate::machinst::lower::visit_block_succs;
use crate::machinst::*;
/// Simple reverse postorder-based block order emission.
///
/// TODO: use a proper algorithm, such as the bottom-up straight-line-section
/// construction algorithm.
struct BlockRPO {
visited: Vec<bool>,
postorder: Vec<BlockIndex>,
use log::debug;
use smallvec::SmallVec;
/// Mapping from CLIF BBs to VCode BBs.
#[derive(Debug)]
pub struct BlockLoweringOrder {
/// Lowered blocks, in BlockIndex order. Each block is some combination of
/// (i) a CLIF block, and (ii) inserted crit-edge blocks before or after;
/// see [LoweredBlock] for details.
lowered_order: Vec<LoweredBlock>,
/// Successors for all lowered blocks, in one serialized vector. Indexed by
/// the ranges in `lowered_succ_ranges`.
lowered_succs: Vec<(Inst, LoweredBlock)>,
/// BlockIndex values for successors for all lowered blocks, in the same
/// order as `lowered_succs`.
lowered_succ_indices: Vec<(Inst, BlockIndex)>,
/// Ranges in `lowered_succs` giving the successor lists for each lowered
/// block. Indexed by lowering-order index (`BlockIndex`).
lowered_succ_ranges: Vec<(usize, usize)>,
/// Mapping from CLIF BB to BlockIndex (index in lowered order). Note that
/// some CLIF BBs may not be lowered; in particular, we skip unreachable
/// blocks.
orig_map: SecondaryMap<Block, Option<BlockIndex>>,
}
impl BlockRPO {
fn new<I: VCodeInst>(vcode: &VCode<I>) -> BlockRPO {
BlockRPO {
visited: vec![false; vcode.num_blocks()],
postorder: Vec::with_capacity(vcode.num_blocks()),
/// The origin of a block in the lowered block-order: either an original CLIF
/// block, or an inserted edge-block, or a combination of the two if an edge is
/// non-critical.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum LoweredBlock {
/// Block in original CLIF, with no merged edge-blocks.
Orig {
/// Original CLIF block.
block: Block,
},
/// Block in the original CLIF, plus edge-block to one succ (which is the
/// one successor of the original block).
OrigAndEdge {
/// The original CLIF block contained in this lowered block.
block: Block,
/// The edge (jump) instruction transitioning from this block
/// to the next, i.e., corresponding to the included edge-block. This
/// will be an instruction in `block`.
edge_inst: Inst,
/// The successor CLIF block.
succ: Block,
},
/// Block in the original CLIF, preceded by edge-block from one pred (which
/// is the one pred of the original block).
EdgeAndOrig {
/// The previous CLIF block, i.e., the edge block's predecessor.
pred: Block,
/// The edge (jump) instruction corresponding to the included
/// edge-block. This will be an instruction in `pred`.
edge_inst: Inst,
/// The original CLIF block included in this lowered block.
block: Block,
},
/// Split critical edge between two CLIF blocks. This lowered block does not
/// correspond to any original CLIF blocks; it only serves as an insertion
/// point for work to happen on the transition from `pred` to `succ`.
Edge {
/// The predecessor CLIF block.
pred: Block,
/// The edge (jump) instruction corresponding to this edge's transition.
/// This will be an instruction in `pred`.
edge_inst: Inst,
/// The successor CLIF block.
succ: Block,
},
}
impl LoweredBlock {
/// The associated original (CLIF) block included in this lowered block, if
/// any.
pub fn orig_block(self) -> Option<Block> {
match self {
LoweredBlock::Orig { block, .. }
| LoweredBlock::OrigAndEdge { block, .. }
| LoweredBlock::EdgeAndOrig { block, .. } => Some(block),
LoweredBlock::Edge { .. } => None,
}
}
fn visit<I: VCodeInst>(&mut self, vcode: &VCode<I>, block: BlockIndex) {
self.visited[block as usize] = true;
for succ in vcode.succs(block) {
if !self.visited[succ.get() as usize] {
self.visit(vcode, succ.get());
/// The associated in-edge, if any.
pub fn in_edge(self) -> Option<(Block, Inst, Block)> {
match self {
LoweredBlock::EdgeAndOrig {
pred,
edge_inst,
block,
} => Some((pred, edge_inst, block)),
_ => None,
}
}
/// The associated out-edge, if any. Also includes edge-only blocks.
pub fn out_edge(self) -> Option<(Block, Inst, Block)> {
match self {
LoweredBlock::OrigAndEdge {
block,
edge_inst,
succ,
} => Some((block, edge_inst, succ)),
LoweredBlock::Edge {
pred,
edge_inst,
succ,
} => Some((pred, edge_inst, succ)),
_ => None,
}
}
}
impl BlockLoweringOrder {
/// Compute and return a lowered block order for `f`.
pub fn new(f: &Function) -> BlockLoweringOrder {
debug!("BlockLoweringOrder: function body {:?}", f);
// Step 1: compute the in-edge and out-edge count of every block.
let mut block_in_count = SecondaryMap::with_default(0);
let mut block_out_count = SecondaryMap::with_default(0);
// Cache the block successors to avoid re-examining branches below.
let mut block_succs: SmallVec<[(Inst, Block); 128]> = SmallVec::new();
let mut block_succ_range = SecondaryMap::with_default((0, 0));
let mut fallthrough_return_block = None;
for block in f.layout.blocks() {
let block_succ_start = block_succs.len();
visit_block_succs(f, block, |inst, succ| {
block_out_count[block] += 1;
block_in_count[succ] += 1;
block_succs.push((inst, succ));
});
let block_succ_end = block_succs.len();
block_succ_range[block] = (block_succ_start, block_succ_end);
for inst in f.layout.block_likely_branches(block) {
if f.dfg[inst].opcode() == Opcode::Return {
// Implicit output edge for any return.
block_out_count[block] += 1;
}
if f.dfg[inst].opcode() == Opcode::FallthroughReturn {
// Fallthrough return block must come last.
debug_assert!(fallthrough_return_block == None);
fallthrough_return_block = Some(block);
}
}
}
if Some(block) != vcode.fallthrough_return_block {
self.postorder.push(block);
// Implicit input edge for entry block.
if let Some(entry) = f.layout.entry_block() {
block_in_count[entry] += 1;
}
// Here we define the implicit CLIF-plus-edges graph. There are
// conceptually two such graphs: the original, with every edge explicit,
// and the merged one, with blocks (represented by `LoweredBlock`
// values) that contain original CLIF blocks, edges, or both. This
// function returns a lowered block's successors as per the latter, with
// consideration to edge-block merging.
//
// Note that there is a property of the block-merging rules below
// that is very important to ensure we don't miss any lowered blocks:
// any block in the implicit CLIF-plus-edges graph will *only* be
// included in one block in the merged graph.
//
// This, combined with the property that every edge block is reachable
// only from one predecessor (and hence cannot be reached by a DFS
// backedge), means that it is sufficient in our DFS below to track
// visited-bits per original CLIF block only, not per edge. This greatly
// simplifies the data structures (no need to keep a sparse hash-set of
// (block, block) tuples).
let compute_lowered_succs = |ret: &mut Vec<(Inst, LoweredBlock)>, block: LoweredBlock| {
let start_idx = ret.len();
match block {
LoweredBlock::Orig { block } | LoweredBlock::EdgeAndOrig { block, .. } => {
// At an orig block; successors are always edge blocks,
// possibly with orig blocks following.
let range = block_succ_range[block];
for &(edge_inst, succ) in &block_succs[range.0..range.1] {
if block_in_count[succ] == 1 {
ret.push((
edge_inst,
LoweredBlock::EdgeAndOrig {
pred: block,
edge_inst,
block: succ,
},
));
} else {
ret.push((
edge_inst,
LoweredBlock::Edge {
pred: block,
edge_inst,
succ,
},
));
}
}
}
LoweredBlock::Edge {
succ, edge_inst, ..
}
| LoweredBlock::OrigAndEdge {
succ, edge_inst, ..
} => {
// At an edge block; successors are always orig blocks,
// possibly with edge blocks following.
if block_out_count[succ] == 1 {
let range = block_succ_range[succ];
// check if the one succ is a real CFG edge (vs.
// implicit return succ).
if range.1 - range.0 > 0 {
debug_assert!(range.1 - range.0 == 1);
let (succ_edge_inst, succ_succ) = block_succs[range.0];
ret.push((
edge_inst,
LoweredBlock::OrigAndEdge {
block: succ,
edge_inst: succ_edge_inst,
succ: succ_succ,
},
));
} else {
ret.push((edge_inst, LoweredBlock::Orig { block: succ }));
}
} else {
ret.push((edge_inst, LoweredBlock::Orig { block: succ }));
}
}
}
let end_idx = ret.len();
(start_idx, end_idx)
};
// Build the explicit LoweredBlock-to-LoweredBlock successors list.
let mut lowered_succs = vec![];
let mut lowered_succ_indices = vec![];
// Step 2: Compute RPO traversal of the implicit CLIF-plus-edge-block graph. Use an
// explicit stack so we don't overflow the real stack with a deep DFS.
#[derive(Debug)]
struct StackEntry {
this: LoweredBlock,
succs: (usize, usize), // range in lowered_succs
cur_succ: usize, // index in lowered_succs
}
let mut stack: SmallVec<[StackEntry; 16]> = SmallVec::new();
let mut visited = FxHashSet::default();
let mut postorder = vec![];
if let Some(entry) = f.layout.entry_block() {
// FIXME(cfallin): we might be able to use OrigAndEdge. Find a way
// to not special-case the entry block here.
let block = LoweredBlock::Orig { block: entry };
visited.insert(block);
let range = compute_lowered_succs(&mut lowered_succs, block);
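// Keep the indices vector the same length as `lowered_succs`; the real
// BlockIndex values are computed later, once the LoweredBlock -> BlockIndex
// map exists.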
lowered_succ_indices.resize(lowered_succs.len(), 0);
stack.push(StackEntry {
this: block,
succs: range,
cur_succ: range.1,
});
}
let mut deferred_last = None;
while !stack.is_empty() {
let stack_entry = stack.last_mut().unwrap();
let range = stack_entry.succs;
if stack_entry.cur_succ == range.0 {
let orig_block = stack_entry.this.orig_block();
if orig_block.is_some() && orig_block == fallthrough_return_block {
deferred_last = Some((stack_entry.this, range));
} else {
postorder.push((stack_entry.this, range));
}
stack.pop();
} else {
// Heuristic: chase the children in reverse. This puts the first
// successor block first in RPO, all other things being equal,
// which tends to prioritize loop backedges over out-edges,
// putting the edge-block closer to the loop body and minimizing
// live-ranges in linear instruction space.
let next = lowered_succs[stack_entry.cur_succ - 1].1;
stack_entry.cur_succ -= 1;
if visited.contains(&next) {
continue;
}
visited.insert(next);
let range = compute_lowered_succs(&mut lowered_succs, next);
lowered_succ_indices.resize(lowered_succs.len(), 0);
stack.push(StackEntry {
this: next,
succs: range,
cur_succ: range.1,
});
}
}
postorder.reverse();
let mut rpo = postorder;
if let Some(d) = deferred_last {
rpo.push(d);
}
// Step 3: now that we have RPO, build the BlockIndex/BB fwd/rev maps.
let mut lowered_order = vec![];
let mut lowered_succ_ranges = vec![];
let mut lb_to_bindex = FxHashMap::default();
for (block, succ_range) in rpo.into_iter() {
lb_to_bindex.insert(block, lowered_order.len() as BlockIndex);
lowered_order.push(block);
lowered_succ_ranges.push(succ_range);
}
let lowered_succ_indices = lowered_succs
.iter()
.map(|&(inst, succ)| (inst, lb_to_bindex.get(&succ).cloned().unwrap()))
.collect();
let mut orig_map = SecondaryMap::with_default(None);
for (i, lb) in lowered_order.iter().enumerate() {
let i = i as BlockIndex;
if let Some(b) = lb.orig_block() {
orig_map[b] = Some(i);
}
}
let result = BlockLoweringOrder {
lowered_order,
lowered_succs,
lowered_succ_indices,
lowered_succ_ranges,
orig_map,
};
debug!("BlockLoweringOrder: {:?}", result);
result
}
fn rpo<I: VCodeInst>(self, vcode: &VCode<I>) -> Vec<BlockIndex> {
let mut rpo = self.postorder;
rpo.reverse();
if let Some(block) = vcode.fallthrough_return_block {
rpo.push(block);
}
rpo
/// Get the lowered order of blocks.
pub fn lowered_order(&self) -> &[LoweredBlock] {
&self.lowered_order[..]
}
/// Get the successors for a lowered block, by index in `lowered_order()`'s
/// returned slice. Each successor is paired with the edge-instruction
/// (branch) corresponding to this edge.
pub fn succs(&self, block: BlockIndex) -> &[(Inst, LoweredBlock)] {
let range = self.lowered_succ_ranges[block as usize];
&self.lowered_succs[range.0..range.1]
}
/// Get the successor indices for a lowered block.
pub fn succ_indices(&self, block: BlockIndex) -> &[(Inst, BlockIndex)] {
let range = self.lowered_succ_ranges[block as usize];
&self.lowered_succ_indices[range.0..range.1]
}
/// Get the lowered block index containing a CLIF block, if any. (May not be
/// present if the original CLIF block was unreachable.)
pub fn lowered_block_for_bb(&self, bb: Block) -> Option<BlockIndex> {
self.orig_map[bb]
}
}
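// Illustrative sketch only (not part of this change): exercising the API
// above, assuming a `Function` `f` is already in scope.
//
//     let order = BlockLoweringOrder::new(&f);
//     for (i, lb) in order.lowered_order().iter().enumerate() {
//         let succs = order.succ_indices(i as BlockIndex);
//         println!("lowered block {} = {:?} -> {:?}", i, lb, succs);
//     }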
/// Compute the final block order.
pub fn compute_final_block_order<I: VCodeInst>(vcode: &VCode<I>) -> Vec<BlockIndex> {
let mut rpo = BlockRPO::new(vcode);
rpo.visit(vcode, vcode.entry());
rpo.rpo(vcode)
#[cfg(test)]
mod test {
use super::*;
use crate::cursor::{Cursor, FuncCursor};
use crate::ir::types::*;
use crate::ir::{AbiParam, ExternalName, Function, InstBuilder, Signature};
use crate::isa::CallConv;
fn build_test_func(n_blocks: usize, edges: &[(usize, usize)]) -> Function {
assert!(n_blocks > 0);
let name = ExternalName::testcase("test0");
let mut sig = Signature::new(CallConv::SystemV);
sig.params.push(AbiParam::new(I32));
let mut func = Function::with_name_signature(name, sig);
let blocks = (0..n_blocks)
.map(|i| {
let bb = func.dfg.make_block();
assert!(bb.as_u32() == i as u32);
bb
})
.collect::<Vec<_>>();
let arg0 = func.dfg.append_block_param(blocks[0], I32);
let mut pos = FuncCursor::new(&mut func);
let mut edge = 0;
for i in 0..n_blocks {
pos.insert_block(blocks[i]);
let mut succs = vec![];
while edge < edges.len() && edges[edge].0 == i {
succs.push(edges[edge].1);
edge += 1;
}
if succs.len() == 0 {
pos.ins().return_(&[arg0]);
} else if succs.len() == 1 {
pos.ins().jump(blocks[succs[0]], &[]);
} else if succs.len() == 2 {
pos.ins().brnz(arg0, blocks[succs[0]], &[]);
pos.ins().jump(blocks[succs[1]], &[]);
} else {
panic!("Too many successors");
}
}
func
}
#[test]
fn test_blockorder_diamond() {
let func = build_test_func(4, &[(0, 1), (0, 2), (1, 3), (2, 3)]);
let order = BlockLoweringOrder::new(&func);
assert_eq!(order.lowered_order.len(), 6);
assert!(order.lowered_order[0].orig_block().unwrap().as_u32() == 0);
assert!(order.lowered_order[0].in_edge().is_none());
assert!(order.lowered_order[0].out_edge().is_none());
assert!(order.lowered_order[1].orig_block().unwrap().as_u32() == 1);
assert!(order.lowered_order[1].in_edge().unwrap().0.as_u32() == 0);
assert!(order.lowered_order[1].in_edge().unwrap().2.as_u32() == 1);
assert!(order.lowered_order[2].orig_block().is_none());
assert!(order.lowered_order[2].in_edge().is_none());
assert!(order.lowered_order[2].out_edge().unwrap().0.as_u32() == 1);
assert!(order.lowered_order[2].out_edge().unwrap().2.as_u32() == 3);
assert!(order.lowered_order[3].orig_block().unwrap().as_u32() == 2);
assert!(order.lowered_order[3].in_edge().unwrap().0.as_u32() == 0);
assert!(order.lowered_order[3].in_edge().unwrap().2.as_u32() == 2);
assert!(order.lowered_order[3].out_edge().is_none());
assert!(order.lowered_order[4].orig_block().is_none());
assert!(order.lowered_order[4].in_edge().is_none());
assert!(order.lowered_order[4].out_edge().unwrap().0.as_u32() == 2);
assert!(order.lowered_order[4].out_edge().unwrap().2.as_u32() == 3);
assert!(order.lowered_order[5].orig_block().unwrap().as_u32() == 3);
assert!(order.lowered_order[5].in_edge().is_none());
assert!(order.lowered_order[5].out_edge().is_none());
}
#[test]
fn test_blockorder_critedge() {
// CFG under test (blocks and edges):
//
//   0 -> 1, 2
//   1 -> 3, 4
//   2 -> 5
//   3 -> 5, 6
//   4 -> 6
//
// (3 -> 5, 3 -> 6, 4 -> 6 are critical edges and must be split)
//
let func = build_test_func(
7,
&[
(0, 1),
(0, 2),
(1, 3),
(1, 4),
(2, 5),
(3, 5),
(3, 6),
(4, 6),
],
);
let order = BlockLoweringOrder::new(&func);
assert_eq!(order.lowered_order.len(), 11);
println!("ordered = {:?}", order.lowered_order);
// block 0
assert!(order.lowered_order[0].orig_block().unwrap().as_u32() == 0);
assert!(order.lowered_order[0].in_edge().is_none());
assert!(order.lowered_order[0].out_edge().is_none());
// edge 0->1 + block 1
assert!(order.lowered_order[1].orig_block().unwrap().as_u32() == 1);
assert!(order.lowered_order[1].in_edge().unwrap().0.as_u32() == 0);
assert!(order.lowered_order[1].in_edge().unwrap().2.as_u32() == 1);
assert!(order.lowered_order[1].out_edge().is_none());
// edge 1->3 + block 3
assert!(order.lowered_order[2].orig_block().unwrap().as_u32() == 3);
assert!(order.lowered_order[2].in_edge().unwrap().0.as_u32() == 1);
assert!(order.lowered_order[2].in_edge().unwrap().2.as_u32() == 3);
assert!(order.lowered_order[2].out_edge().is_none());
// edge 3->5
assert!(order.lowered_order[3].orig_block().is_none());
assert!(order.lowered_order[3].in_edge().is_none());
assert!(order.lowered_order[3].out_edge().unwrap().0.as_u32() == 3);
assert!(order.lowered_order[3].out_edge().unwrap().2.as_u32() == 5);
// edge 3->6
assert!(order.lowered_order[4].orig_block().is_none());
assert!(order.lowered_order[4].in_edge().is_none());
assert!(order.lowered_order[4].out_edge().unwrap().0.as_u32() == 3);
assert!(order.lowered_order[4].out_edge().unwrap().2.as_u32() == 6);
// edge 1->4 + block 4
assert!(order.lowered_order[5].orig_block().unwrap().as_u32() == 4);
assert!(order.lowered_order[5].in_edge().unwrap().0.as_u32() == 1);
assert!(order.lowered_order[5].in_edge().unwrap().2.as_u32() == 4);
assert!(order.lowered_order[5].out_edge().is_none());
// edge 4->6
assert!(order.lowered_order[6].orig_block().is_none());
assert!(order.lowered_order[6].in_edge().is_none());
assert!(order.lowered_order[6].out_edge().unwrap().0.as_u32() == 4);
assert!(order.lowered_order[6].out_edge().unwrap().2.as_u32() == 6);
// block 6
assert!(order.lowered_order[7].orig_block().unwrap().as_u32() == 6);
assert!(order.lowered_order[7].in_edge().is_none());
assert!(order.lowered_order[7].out_edge().is_none());
// edge 0->2 + block 2
assert!(order.lowered_order[8].orig_block().unwrap().as_u32() == 2);
assert!(order.lowered_order[8].in_edge().unwrap().0.as_u32() == 0);
assert!(order.lowered_order[8].in_edge().unwrap().2.as_u32() == 2);
assert!(order.lowered_order[8].out_edge().is_none());
// edge 2->5
assert!(order.lowered_order[9].orig_block().is_none());
assert!(order.lowered_order[9].in_edge().is_none());
assert!(order.lowered_order[9].out_edge().unwrap().0.as_u32() == 2);
assert!(order.lowered_order[9].out_edge().unwrap().2.as_u32() == 5);
// block 5
assert!(order.lowered_order[10].orig_block().unwrap().as_u32() == 5);
assert!(order.lowered_order[10].in_edge().is_none());
assert!(order.lowered_order[10].out_edge().is_none());
}
}

File diff suppressed because it is too large

View File

@@ -18,8 +18,12 @@ pub fn compile<B: LowerBackend + MachBackend>(
where
B::MInst: ShowWithRRU,
{
// This lowers the CL IR.
let mut vcode = Lower::new(f, abi)?.lower(b)?;
// Compute lowered block order.
let block_order = BlockLoweringOrder::new(f);
// Build the lowering context.
let lower = Lower::new(f, abi, block_order)?;
// Lower the IR.
let mut vcode = lower.lower(b)?;
debug!(
"vcode from lowering: \n{}",
@@ -65,11 +69,6 @@ where
// all at once. This also inserts prologues/epilogues.
vcode.replace_insns_from_regalloc(result);
vcode.remove_redundant_branches();
// Do final passes over code to finalize branches.
vcode.finalize_branches();
debug!(
"vcode after regalloc: final version:\n{}",
vcode.show_rru(Some(b.reg_universe()))
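Condensed, the reworked pipeline in `compile()` now reads roughly as below. This is a sketch for orientation only: `run_regalloc` is a stand-in name for the register-allocation step elided here, not an actual function in this change.
// Sketch: block order is computed up front, lowering consumes it, and the old
// finalize_branches pass is gone because MachBuffer resolves labels at emission.
let block_order = BlockLoweringOrder::new(f);                // lowered order, incl. edge blocks
let mut vcode = Lower::new(f, abi, block_order)?.lower(b)?;  // lower CLIF to VCode
let result = run_regalloc(&vcode);                           // placeholder for the regalloc step
vcode.replace_insns_from_regalloc(result);
vcode.remove_redundant_branches();
let buffer = vcode.emit();                                   // branches fixed up here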

File diff suppressed because it is too large

View File

@@ -109,6 +109,7 @@ use regalloc::RegUsageCollector;
use regalloc::{
RealReg, RealRegUniverse, Reg, RegClass, RegUsageMapper, SpillSlot, VirtualReg, Writable,
};
use smallvec::SmallVec;
use std::string::String;
use target_lexicon::Triple;
@@ -124,8 +125,8 @@ pub mod abi;
pub use abi::*;
pub mod pretty_print;
pub use pretty_print::*;
pub mod sections;
pub use sections::*;
pub mod buffer;
pub use buffer::*;
pub mod adapter;
pub use adapter::*;
@@ -152,6 +153,9 @@ pub trait MachInst: Clone + Debug {
/// Generate a move.
fn gen_move(to_reg: Writable<Reg>, from_reg: Reg, ty: Type) -> Self;
/// Generate a constant into a reg.
fn gen_constant(to_reg: Writable<Reg>, value: u64, ty: Type) -> SmallVec<[Self; 4]>;
/// Generate a zero-length no-op.
fn gen_zero_len_nop() -> Self;
@@ -166,7 +170,7 @@ pub trait MachInst: Clone + Debug {
/// Generate a jump to another target. Used during lowering of
/// control flow.
fn gen_jump(target: BlockIndex) -> Self;
fn gen_jump(target: MachLabel) -> Self;
/// Generate a NOP. The `preferred_size` parameter allows the caller to
/// request a NOP of that size, or as close to it as possible. The machine
@@ -175,22 +179,62 @@ pub trait MachInst: Clone + Debug {
/// the instruction must have a nonzero size.
fn gen_nop(preferred_size: usize) -> Self;
/// Rewrite block targets using the block-target map.
fn with_block_rewrites(&mut self, block_target_map: &[BlockIndex]);
/// Finalize branches once the block order (fallthrough) is known.
fn with_fallthrough_block(&mut self, fallthrough_block: Option<BlockIndex>);
/// Update instruction once block offsets are known. These offsets are
/// relative to the beginning of the function. `targets` is indexed by
/// BlockIndex.
fn with_block_offsets(&mut self, my_offset: CodeOffset, targets: &[CodeOffset]);
/// Get the register universe for this backend.
fn reg_universe(flags: &Flags) -> RealRegUniverse;
/// Align a basic block offset (from start of function). By default, no
/// alignment occurs.
fn align_basic_block(offset: CodeOffset) -> CodeOffset {
offset
}
/// What is the worst-case instruction size emitted by this instruction type?
fn worst_case_size() -> CodeOffset;
/// A label-use kind: a type that describes the types of label references that
/// can occur in an instruction.
type LabelUse: MachInstLabelUse;
}
/// A descriptor of a label reference (use) in an instruction set.
pub trait MachInstLabelUse: Clone + Copy + Debug + Eq {
/// Required alignment for any veneer. Usually the required instruction
/// alignment (e.g., 4 for a RISC with 32-bit instructions, or 1 for x86).
const ALIGN: CodeOffset;
/// What is the maximum PC-relative range (positive)? E.g., if `1024`, a
/// label-reference fixup at offset `x` is valid if the label resolves to an
/// offset no greater than `x + 1024`.
fn max_pos_range(self) -> CodeOffset;
/// What is the maximum PC-relative range (negative)? This is the absolute
/// value; i.e., if `1024`, then a label-reference fixup at offset `x` is
/// valid if the label resolves to an offset no less than `x - 1024`.
fn max_neg_range(self) -> CodeOffset;
/// What is the size of code-buffer slice this label-use needs to patch in
/// the label's value?
fn patch_size(self) -> CodeOffset;
/// Perform a code-patch, given the offset into the buffer of this label use
/// and the offset into the buffer of the label's definition.
/// It is guaranteed that, given `delta = label_offset - use_offset`, we will
/// have `delta >= -self.max_neg_range()` and `delta <=
/// self.max_pos_range()`.
fn patch(self, buffer: &mut [u8], use_offset: CodeOffset, label_offset: CodeOffset);
/// Can the label-use be patched to a veneer that supports a longer range?
/// Usually valid for jumps (a short-range jump can jump to a longer-range
/// jump), but not for e.g. constant pool references, because the constant
/// load would require different code (one more level of indirection).
fn supports_veneer(self) -> bool;
/// How many bytes are needed for a veneer?
fn veneer_size(self) -> CodeOffset;
/// Generate a veneer. The given code-buffer slice is `self.veneer_size()`
/// bytes long at offset `veneer_offset` in the buffer. The original
/// label-use will be patched to refer to this veneer's offset. A new
/// (offset, LabelUse) is returned that allows the veneer to use the actual
/// label. For veneers to work properly, it is expected that the new veneer
/// has a larger range; on most platforms this probably means either a
/// "long-range jump" (e.g., on ARM, the 26-bit form), or if already at that
/// stage, a jump that supports a full 32-bit range, for example.
fn generate_veneer(self, buffer: &mut [u8], veneer_offset: CodeOffset) -> (CodeOffset, Self);
}
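To make the trait concrete, here is a minimal sketch of an implementation for a hypothetical RISC-style backend with a 19-bit conditional-branch field and a 26-bit jump field. Everything here (the enum, field layouts, and opcode bits) is invented for illustration and is not part of this change; it assumes `CodeOffset` and `MachInstLabelUse` are in scope.
// Purely illustrative label-use kinds: a 19-bit conditional-branch field and a
// 26-bit unconditional-jump field, both PC-relative and scaled by 4 bytes, as
// on a typical RISC with 32-bit instructions. Opcode bits below are invented.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum ExampleLabelUse {
    Branch19,
    Jump26,
}
impl MachInstLabelUse for ExampleLabelUse {
    const ALIGN: CodeOffset = 4;
    fn max_pos_range(self) -> CodeOffset {
        match self {
            ExampleLabelUse::Branch19 => (1 << 20) - 4, // +/- 1 MiB
            ExampleLabelUse::Jump26 => (1 << 27) - 4,   // +/- 128 MiB
        }
    }
    fn max_neg_range(self) -> CodeOffset {
        self.max_pos_range() + 4
    }
    fn patch_size(self) -> CodeOffset {
        4 // one 32-bit instruction word in both cases
    }
    fn patch(self, buffer: &mut [u8], use_offset: CodeOffset, label_offset: CodeOffset) {
        // Write the scaled, signed word offset into the instruction's immediate field.
        let delta = ((label_offset as i64) - (use_offset as i64)) >> 2;
        let (mask, shift) = match self {
            ExampleLabelUse::Branch19 => (0x7ffff, 5),
            ExampleLabelUse::Jump26 => (0x3ff_ffff, 0),
        };
        let mut word = u32::from_le_bytes([buffer[0], buffer[1], buffer[2], buffer[3]]);
        word |= ((delta as u32) & mask) << shift;
        buffer[0..4].copy_from_slice(&word.to_le_bytes());
    }
    fn supports_veneer(self) -> bool {
        // A short conditional branch can be redirected through a longer jump;
        // the 26-bit jump has no longer form in this sketch.
        self == ExampleLabelUse::Branch19
    }
    fn veneer_size(self) -> CodeOffset {
        4
    }
    fn generate_veneer(self, buffer: &mut [u8], veneer_offset: CodeOffset) -> (CodeOffset, Self) {
        // Emit an unconditional jump as the veneer (opcode bits invented) and
        // hand back a label-use with the larger 26-bit range at the veneer's offset.
        buffer[0..4].copy_from_slice(&0x1400_0000u32.to_le_bytes());
        (veneer_offset, ExampleLabelUse::Jump26)
    }
}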
/// Describes a block terminator (not call) in the vcode, when its branches
@@ -202,26 +246,26 @@ pub enum MachTerminator<'a> {
/// A return instruction.
Ret,
/// An unconditional branch to another block.
Uncond(BlockIndex),
Uncond(MachLabel),
/// A conditional branch to one of two other blocks.
Cond(BlockIndex, BlockIndex),
Cond(MachLabel, MachLabel),
/// An indirect branch with known possible targets.
Indirect(&'a [BlockIndex]),
Indirect(&'a [MachLabel]),
}
/// A trait describing the ability to encode a MachInst into binary machine code.
pub trait MachInstEmit<O: MachSectionOutput> {
pub trait MachInstEmit: MachInst {
/// Persistent state carried across `emit` invocations.
type State: Default + Clone + Debug;
/// Emit the instruction.
fn emit(&self, code: &mut O, flags: &Flags, state: &mut Self::State);
fn emit(&self, code: &mut MachBuffer<Self>, flags: &Flags, state: &mut Self::State);
}
/// The result of a `MachBackend::compile_function()` call. Contains machine
/// code (as bytes) and a disassembly, if requested.
pub struct MachCompileResult {
/// Machine code.
pub sections: MachSections,
pub buffer: MachBufferFinalized,
/// Size of stack frame, in bytes.
pub frame_size: u32,
/// Disassembly, if requested.
@@ -231,7 +275,7 @@ pub struct MachCompileResult {
impl MachCompileResult {
/// Get a `CodeInfo` describing section sizes from this compilation result.
pub fn code_info(&self) -> CodeInfo {
let code_size = self.sections.total_size();
let code_size = self.buffer.total_size();
CodeInfo {
code_size,
jumptables_size: 0,

View File

@@ -1,460 +0,0 @@
//! In-memory representation of compiled machine code, in multiple sections
//! (text, constant pool / rodata, etc). Emission occurs into multiple sections
//! simultaneously, so we buffer the result in memory and hand off to the
//! caller at the end of compilation.
use crate::binemit::{Addend, CodeOffset, CodeSink, Reloc};
use crate::ir::{ExternalName, Opcode, SourceLoc, TrapCode};
use alloc::vec::Vec;
/// A collection of sections with defined start-offsets.
pub struct MachSections {
/// Sections, in offset order.
pub sections: Vec<MachSection>,
}
impl MachSections {
/// New, empty set of sections.
pub fn new() -> MachSections {
MachSections { sections: vec![] }
}
/// Add a section with a known offset and size. Returns the index.
pub fn add_section(&mut self, start: CodeOffset, length: CodeOffset) -> usize {
let idx = self.sections.len();
self.sections.push(MachSection::new(start, length));
idx
}
/// Mutably borrow the given section by index.
pub fn get_section<'a>(&'a mut self, idx: usize) -> &'a mut MachSection {
&mut self.sections[idx]
}
/// Get mutable borrows of two sections simultaneously. Used during
/// instruction emission to provide references to the .text and .rodata
/// (constant pool) sections.
pub fn two_sections<'a>(
&'a mut self,
idx1: usize,
idx2: usize,
) -> (&'a mut MachSection, &'a mut MachSection) {
assert!(idx1 < idx2);
assert!(idx1 < self.sections.len());
assert!(idx2 < self.sections.len());
let (first, rest) = self.sections.split_at_mut(idx2);
(&mut first[idx1], &mut rest[0])
}
/// Emit this set of sections to a set of sinks for the code,
/// relocations, traps, and stackmap.
pub fn emit<CS: CodeSink>(&self, sink: &mut CS) {
// N.B.: we emit every section into the .text section as far as
// the `CodeSink` is concerned; we do not bother to segregate
// the contents into the actual program text, the jumptable and the
// rodata (constant pool). This allows us to generate code assuming
// that these will not be relocated relative to each other, and avoids
// having to designate each section as belonging in one of the three
// fixed categories defined by `CodeSink`. If this becomes a problem
// later (e.g. because of memory permissions or similar), we can
// add this designation and segregate the output; take care, however,
// to add the appropriate relocations in this case.
for section in &self.sections {
if section.data.len() > 0 {
while sink.offset() < section.start_offset {
sink.put1(0);
}
section.emit(sink);
}
}
sink.begin_jumptables();
sink.begin_rodata();
sink.end_codegen();
}
/// Get a list of source location mapping tuples in sorted-by-start-offset order.
pub fn get_srclocs_sorted<'a>(&'a self) -> MachSectionsSrcLocs<'a> {
MachSectionsSrcLocs::new(&self.sections)
}
/// Get the total required size for these sections.
pub fn total_size(&self) -> CodeOffset {
if self.sections.len() == 0 {
0
} else {
// Find the last non-empty section.
self.sections
.iter()
.rev()
.find(|s| s.data.len() > 0)
.map(|s| s.cur_offset_from_start())
.unwrap_or(0)
}
}
}
/// An iterator over the srclocs in each section.
/// Returns MachSrcLocs in an order sorted by start location.
pub struct MachSectionsSrcLocs<'a> {
sections: &'a [MachSection],
cur_section: usize,
cur_srcloc: usize,
// For validation:
last_offset: CodeOffset,
}
impl<'a> MachSectionsSrcLocs<'a> {
fn new(sections: &'a [MachSection]) -> MachSectionsSrcLocs<'a> {
MachSectionsSrcLocs {
sections,
cur_section: 0,
cur_srcloc: 0,
last_offset: 0,
}
}
}
impl<'a> Iterator for MachSectionsSrcLocs<'a> {
type Item = &'a MachSrcLoc;
fn next(&mut self) -> Option<&'a MachSrcLoc> {
// We simply iterate through sections and srcloc records in order. This produces a
// sorted order naturally because sections are in starting-offset-order, and srclocs
// are produced as a section is emitted into, so are in order as well.
// If we're out of sections, we're done.
if self.cur_section >= self.sections.len() {
return None;
}
// Otherwise, make sure we have a srcloc in the current section left to return, and
// advance to the next section if not. Done if we run out of sections.
while self.cur_srcloc >= self.sections[self.cur_section].srclocs.len() {
self.cur_srcloc = 0;
self.cur_section += 1;
if self.cur_section >= self.sections.len() {
return None;
}
}
let loc = &self.sections[self.cur_section].srclocs[self.cur_srcloc];
self.cur_srcloc += 1;
debug_assert!(loc.start >= self.last_offset);
self.last_offset = loc.start;
Some(loc)
}
}
/// An abstraction over MachSection and MachSectionSize: some
/// receiver of section data.
pub trait MachSectionOutput {
/// Get the current offset from the start of all sections.
fn cur_offset_from_start(&self) -> CodeOffset;
/// Get the start offset of this section.
fn start_offset(&self) -> CodeOffset;
/// Add 1 byte to the section.
fn put1(&mut self, _: u8);
/// Add 2 bytes to the section.
fn put2(&mut self, value: u16) {
let [b0, b1] = value.to_le_bytes();
self.put1(b0);
self.put1(b1);
}
/// Add 4 bytes to the section.
fn put4(&mut self, value: u32) {
let [b0, b1, b2, b3] = value.to_le_bytes();
self.put1(b0);
self.put1(b1);
self.put1(b2);
self.put1(b3);
}
/// Add 8 bytes to the section.
fn put8(&mut self, value: u64) {
let [b0, b1, b2, b3, b4, b5, b6, b7] = value.to_le_bytes();
self.put1(b0);
self.put1(b1);
self.put1(b2);
self.put1(b3);
self.put1(b4);
self.put1(b5);
self.put1(b6);
self.put1(b7);
}
/// Add a slice of bytes to the section.
fn put_data(&mut self, data: &[u8]);
/// Add a relocation at the current offset.
fn add_reloc(&mut self, loc: SourceLoc, kind: Reloc, name: &ExternalName, addend: Addend);
/// Add a trap record at the current offset.
fn add_trap(&mut self, loc: SourceLoc, code: TrapCode);
/// Add a call return address record at the current offset.
fn add_call_site(&mut self, loc: SourceLoc, opcode: Opcode);
/// Start the output for the given source-location at the current offset.
fn start_srcloc(&mut self, loc: SourceLoc);
/// End the output for the previously-given source-location at the current offset.
fn end_srcloc(&mut self);
/// Align up to the given alignment.
fn align_to(&mut self, align_to: CodeOffset) {
assert!(align_to.is_power_of_two());
while self.cur_offset_from_start() & (align_to - 1) != 0 {
self.put1(0);
}
}
}
/// A section of output to be emitted to a CodeSink / RelocSink in bulk.
/// Multiple sections may be created with known start offsets in advance; the
/// usual use-case is to create the .text (code) and .rodata (constant pool) at
/// once, after computing the length of the code, so that constant references
/// can use known offsets as instructions are emitted.
pub struct MachSection {
/// The starting offset of this section.
pub start_offset: CodeOffset,
/// The limit of this section, defined by the start of the next section.
pub length_limit: CodeOffset,
/// The section contents, as raw bytes.
pub data: Vec<u8>,
/// Any relocations referring to this section.
pub relocs: Vec<MachReloc>,
/// Any trap records referring to this section.
pub traps: Vec<MachTrap>,
/// Any call site records referring to this section.
pub call_sites: Vec<MachCallSite>,
/// Any source location mappings referring to this section.
pub srclocs: Vec<MachSrcLoc>,
/// The current source location in progress (after `start_srcloc()` and before `end_srcloc()`).
/// This is a (start_offset, src_loc) tuple.
pub cur_srcloc: Option<(CodeOffset, SourceLoc)>,
}
impl MachSection {
/// Create a new section, known to start at `start_offset` and with a size limited to `length_limit`.
pub fn new(start_offset: CodeOffset, length_limit: CodeOffset) -> MachSection {
MachSection {
start_offset,
length_limit,
data: vec![],
relocs: vec![],
traps: vec![],
call_sites: vec![],
srclocs: vec![],
cur_srcloc: None,
}
}
/// Emit this section to the CodeSink and other associated sinks. The
/// current offset of the CodeSink must match the starting offset of this
/// section.
pub fn emit<CS: CodeSink>(&self, sink: &mut CS) {
assert!(sink.offset() == self.start_offset);
let mut next_reloc = 0;
let mut next_trap = 0;
let mut next_call_site = 0;
for (idx, byte) in self.data.iter().enumerate() {
if next_reloc < self.relocs.len() {
let reloc = &self.relocs[next_reloc];
if reloc.offset == idx as CodeOffset {
sink.reloc_external(reloc.srcloc, reloc.kind, &reloc.name, reloc.addend);
next_reloc += 1;
}
}
if next_trap < self.traps.len() {
let trap = &self.traps[next_trap];
if trap.offset == idx as CodeOffset {
sink.trap(trap.code, trap.srcloc);
next_trap += 1;
}
}
if next_call_site < self.call_sites.len() {
let call_site = &self.call_sites[next_call_site];
if call_site.ret_addr == idx as CodeOffset {
sink.add_call_site(call_site.opcode, call_site.srcloc);
next_call_site += 1;
}
}
sink.put1(*byte);
}
}
}
impl MachSectionOutput for MachSection {
fn cur_offset_from_start(&self) -> CodeOffset {
self.start_offset + self.data.len() as CodeOffset
}
fn start_offset(&self) -> CodeOffset {
self.start_offset
}
fn put1(&mut self, value: u8) {
assert!(((self.data.len() + 1) as CodeOffset) <= self.length_limit);
self.data.push(value);
}
fn put_data(&mut self, data: &[u8]) {
assert!(((self.data.len() + data.len()) as CodeOffset) <= self.length_limit);
self.data.extend_from_slice(data);
}
fn add_reloc(&mut self, srcloc: SourceLoc, kind: Reloc, name: &ExternalName, addend: Addend) {
let name = name.clone();
self.relocs.push(MachReloc {
offset: self.data.len() as CodeOffset,
srcloc,
kind,
name,
addend,
});
}
fn add_trap(&mut self, srcloc: SourceLoc, code: TrapCode) {
self.traps.push(MachTrap {
offset: self.data.len() as CodeOffset,
srcloc,
code,
});
}
fn add_call_site(&mut self, srcloc: SourceLoc, opcode: Opcode) {
self.call_sites.push(MachCallSite {
ret_addr: self.data.len() as CodeOffset,
srcloc,
opcode,
});
}
fn start_srcloc(&mut self, loc: SourceLoc) {
self.cur_srcloc = Some((self.cur_offset_from_start(), loc));
}
fn end_srcloc(&mut self) {
let (start, loc) = self
.cur_srcloc
.take()
.expect("end_srcloc() called without start_srcloc()");
let end = self.cur_offset_from_start();
// Skip zero-length extends.
debug_assert!(end >= start);
if end > start {
self.srclocs.push(MachSrcLoc { start, end, loc });
}
}
}
/// A MachSectionOutput implementation that records only size.
pub struct MachSectionSize {
/// The starting offset of this section.
pub start_offset: CodeOffset,
/// The current offset of this section.
pub offset: CodeOffset,
}
impl MachSectionSize {
/// Create a new size-counting dummy section.
pub fn new(start_offset: CodeOffset) -> MachSectionSize {
MachSectionSize {
start_offset,
offset: start_offset,
}
}
/// Return the size this section would take if emitted with a real sink.
pub fn size(&self) -> CodeOffset {
self.offset - self.start_offset
}
}
impl MachSectionOutput for MachSectionSize {
fn cur_offset_from_start(&self) -> CodeOffset {
// All size-counting sections conceptually start at offset 0; this doesn't
// matter when counting code size.
self.offset
}
fn start_offset(&self) -> CodeOffset {
self.start_offset
}
fn put1(&mut self, _: u8) {
self.offset += 1;
}
fn put_data(&mut self, data: &[u8]) {
self.offset += data.len() as CodeOffset;
}
fn add_reloc(&mut self, _: SourceLoc, _: Reloc, _: &ExternalName, _: Addend) {}
fn add_trap(&mut self, _: SourceLoc, _: TrapCode) {}
fn add_call_site(&mut self, _: SourceLoc, _: Opcode) {}
fn start_srcloc(&mut self, _: SourceLoc) {}
fn end_srcloc(&mut self) {}
}
/// A relocation resulting from a compilation.
pub struct MachReloc {
/// The offset at which the relocation applies, *relative to the
/// containing section*.
pub offset: CodeOffset,
/// The original source location.
pub srcloc: SourceLoc,
/// The kind of relocation.
pub kind: Reloc,
/// The external symbol / name to which this relocation refers.
pub name: ExternalName,
/// The addend to add to the symbol value.
pub addend: i64,
}
/// A trap record resulting from a compilation.
pub struct MachTrap {
/// The offset at which the trap instruction occurs, *relative to the
/// containing section*.
pub offset: CodeOffset,
/// The original source location.
pub srcloc: SourceLoc,
/// The trap code.
pub code: TrapCode,
}
/// A call site record resulting from a compilation.
pub struct MachCallSite {
/// The offset of the call's return address, *relative to the containing section*.
pub ret_addr: CodeOffset,
/// The original source location.
pub srcloc: SourceLoc,
/// The call's opcode.
pub opcode: Opcode,
}
/// A source-location mapping resulting from a compilation.
#[derive(Clone, Debug)]
pub struct MachSrcLoc {
/// The start of the region of code corresponding to a source location.
/// This is relative to the start of the function, not to the start of the
/// section.
pub start: CodeOffset,
/// The end of the region of code corresponding to a source location.
/// This is relative to the start of the function, not to the start of the
/// section.
pub end: CodeOffset,
/// The source location.
pub loc: SourceLoc,
}

View File

@@ -17,8 +17,7 @@
//! See the main module comment in `mod.rs` for more details on the VCode-based
//! backend pipeline.
use crate::entity::SecondaryMap;
use crate::ir::{self, Block, SourceLoc};
use crate::ir::{self, SourceLoc};
use crate::machinst::*;
use crate::settings;
@@ -30,8 +29,6 @@ use regalloc::{
use alloc::boxed::Box;
use alloc::{borrow::Cow, vec::Vec};
use log::debug;
use smallvec::SmallVec;
use std::fmt;
use std::iter;
use std::string::String;
@@ -43,8 +40,8 @@ pub type BlockIndex = u32;
/// VCodeInst wraps all requirements for a MachInst to be in VCode: it must be
/// a `MachInst` and it must implement `MachInstEmit` so it can emit itself into
/// a `MachBuffer`.
pub trait VCodeInst: MachInst + MachInstEmit<MachSection> + MachInstEmit<MachSectionSize> {}
impl<I: MachInst + MachInstEmit<MachSection> + MachInstEmit<MachSectionSize>> VCodeInst for I {}
pub trait VCodeInst: MachInst + MachInstEmit {}
impl<I: MachInst + MachInstEmit> VCodeInst for I {}
/// A function in "VCode" (virtualized-register code) form, after lowering.
/// This is essentially a standard CFG of basic blocks, where each basic block
@@ -80,29 +77,11 @@ pub struct VCode<I: VCodeInst> {
/// correspond to each basic block's successors.
block_succs: Vec<BlockIx>,
/// Block indices by IR block.
block_by_bb: SecondaryMap<ir::Block, BlockIndex>,
/// IR block for each VCode Block. The length of this Vec will likely be
/// less than the total number of Blocks, because new Blocks (for edge
/// splits, for example) are appended during lowering.
bb_by_block: Vec<ir::Block>,
/// Order of block IDs in final generated code.
final_block_order: Vec<BlockIndex>,
/// Final block offsets. Computed during branch finalization and used
/// during emission.
final_block_offsets: Vec<CodeOffset>,
/// Size of code, accounting for block layout / alignment.
code_size: CodeOffset,
/// Block-order information.
block_order: BlockLoweringOrder,
/// ABI object.
abi: Box<dyn ABIBody<I = I>>,
/// The block targeted by fallthrough_returns, if there's one.
pub fallthrough_return_block: Option<BlockIndex>,
}
/// A builder for a VCode function body. This builder is designed for the
@@ -123,12 +102,8 @@ pub struct VCodeBuilder<I: VCodeInst> {
/// In-progress VCode.
vcode: VCode<I>,
/// Current basic block instructions, in reverse order (because blocks are
/// built bottom-to-top).
bb_insns: SmallVec<[(I, SourceLoc); 32]>,
/// Current IR-inst instructions, in forward order.
ir_inst_insns: SmallVec<[(I, SourceLoc); 4]>,
/// Index of the last block-start in the vcode.
block_start: InsnIndex,
/// Start of succs for the current block in the concatenated succs list.
succ_start: usize,
@@ -139,12 +114,11 @@ pub struct VCodeBuilder<I: VCodeInst> {
impl<I: VCodeInst> VCodeBuilder<I> {
/// Create a new VCodeBuilder.
pub fn new(abi: Box<dyn ABIBody<I = I>>) -> VCodeBuilder<I> {
let vcode = VCode::new(abi);
pub fn new(abi: Box<dyn ABIBody<I = I>>, block_order: BlockLoweringOrder) -> VCodeBuilder<I> {
let vcode = VCode::new(abi, block_order);
VCodeBuilder {
vcode,
bb_insns: SmallVec::new(),
ir_inst_insns: SmallVec::new(),
block_start: 0,
succ_start: 0,
cur_srcloc: SourceLoc::default(),
}
@@ -155,14 +129,9 @@ impl<I: VCodeInst> VCodeBuilder<I> {
&mut *self.vcode.abi
}
/// Set the fallthrough_return target block for this function. This must be set at most once
/// per function.
pub fn set_fallthrough_return_block(&mut self, bb: Block) {
debug_assert!(
self.vcode.fallthrough_return_block.is_none(),
"a function must have at most one fallthrough-return instruction"
);
self.vcode.fallthrough_return_block = Some(self.bb_to_bindex(bb));
/// Access to the BlockLoweringOrder object.
pub fn block_order(&self) -> &BlockLoweringOrder {
&self.vcode.block_order
}
/// Set the type of a VReg.
@@ -173,53 +142,17 @@ impl<I: VCodeInst> VCodeBuilder<I> {
self.vcode.vreg_types[vreg.get_index()] = ty;
}
/// Return the underlying bb-to-BlockIndex map.
pub fn blocks_by_bb(&self) -> &SecondaryMap<ir::Block, BlockIndex> {
&self.vcode.block_by_bb
}
/// Initialize the bb-to-BlockIndex map. Returns the first free
/// BlockIndex.
pub fn init_bb_map(&mut self, blocks: &[ir::Block]) -> BlockIndex {
let mut bindex: BlockIndex = 0;
for bb in blocks.iter() {
self.vcode.block_by_bb[*bb] = bindex;
self.vcode.bb_by_block.push(*bb);
bindex += 1;
}
bindex
}
/// Get the BlockIndex for an IR block.
pub fn bb_to_bindex(&self, bb: ir::Block) -> BlockIndex {
self.vcode.block_by_bb[bb]
}
/// Set the current block as the entry block.
pub fn set_entry(&mut self, block: BlockIndex) {
self.vcode.entry = block;
}
/// End the current IR instruction. Must be called after pushing any
/// instructions and prior to ending the basic block.
pub fn end_ir_inst(&mut self) {
while let Some(pair) = self.ir_inst_insns.pop() {
self.bb_insns.push(pair);
}
}
/// End the current basic block. Must be called after emitting vcode insts
/// for IR insts and prior to ending the function (building the VCode).
pub fn end_bb(&mut self) -> BlockIndex {
assert!(self.ir_inst_insns.is_empty());
let block_num = self.vcode.block_ranges.len() as BlockIndex;
// Push the instructions.
let start_idx = self.vcode.insts.len() as InsnIndex;
while let Some((i, loc)) = self.bb_insns.pop() {
self.vcode.insts.push(i);
self.vcode.srclocs.push(loc);
}
pub fn end_bb(&mut self) {
let start_idx = self.block_start;
let end_idx = self.vcode.insts.len() as InsnIndex;
self.block_start = end_idx;
// Add the instruction index range to the list of blocks.
self.vcode.block_ranges.push((start_idx, end_idx));
// End the successors list.
@@ -228,8 +161,6 @@ impl<I: VCodeInst> VCodeBuilder<I> {
.block_succ_range
.push((self.succ_start, succ_end));
self.succ_start = succ_end;
block_num
}
/// Push an instruction for the current BB and current IR inst within the BB.
@@ -237,19 +168,27 @@ impl<I: VCodeInst> VCodeBuilder<I> {
match insn.is_term() {
MachTerminator::None | MachTerminator::Ret => {}
MachTerminator::Uncond(target) => {
self.vcode.block_succs.push(BlockIx::new(target));
self.vcode.block_succs.push(BlockIx::new(target.get()));
}
MachTerminator::Cond(true_branch, false_branch) => {
self.vcode.block_succs.push(BlockIx::new(true_branch));
self.vcode.block_succs.push(BlockIx::new(false_branch));
self.vcode.block_succs.push(BlockIx::new(true_branch.get()));
self.vcode
.block_succs
.push(BlockIx::new(false_branch.get()));
}
MachTerminator::Indirect(targets) => {
for target in targets {
self.vcode.block_succs.push(BlockIx::new(*target));
self.vcode.block_succs.push(BlockIx::new(target.get()));
}
}
}
self.ir_inst_insns.push((insn, self.cur_srcloc));
self.vcode.insts.push(insn);
self.vcode.srclocs.push(self.cur_srcloc);
}
/// Get the current source location.
pub fn get_srcloc(&self) -> SourceLoc {
self.cur_srcloc
}
/// Set the current source location.
@@ -259,8 +198,6 @@ impl<I: VCodeInst> VCodeBuilder<I> {
/// Build the final VCode.
pub fn build(self) -> VCode<I> {
assert!(self.ir_inst_insns.is_empty());
assert!(self.bb_insns.is_empty());
self.vcode
}
}
@@ -282,35 +219,9 @@ fn is_redundant_move<I: VCodeInst>(insn: &I) -> bool {
}
}
fn is_trivial_jump_block<I: VCodeInst>(vcode: &VCode<I>, block: BlockIndex) -> Option<BlockIndex> {
let range = vcode.block_insns(BlockIx::new(block));
debug!(
"is_trivial_jump_block: block {} has len {}",
block,
range.len()
);
if range.len() != 1 {
return None;
}
let insn = range.first();
debug!(
" -> only insn is: {:?} with terminator {:?}",
vcode.get_insn(insn),
vcode.get_insn(insn).is_term()
);
match vcode.get_insn(insn).is_term() {
MachTerminator::Uncond(target) => Some(target),
_ => None,
}
}
impl<I: VCodeInst> VCode<I> {
/// New empty VCode.
fn new(abi: Box<dyn ABIBody<I = I>>) -> VCode<I> {
fn new(abi: Box<dyn ABIBody<I = I>>, block_order: BlockLoweringOrder) -> VCode<I> {
VCode {
liveins: abi.liveins(),
liveouts: abi.liveouts(),
@@ -321,13 +232,8 @@ impl<I: VCodeInst> VCode<I> {
block_ranges: vec![],
block_succ_range: vec![],
block_succs: vec![],
block_by_bb: SecondaryMap::with_default(0),
bb_by_block: vec![],
final_block_order: vec![],
final_block_offsets: vec![],
code_size: 0,
block_order,
abi,
fallthrough_return_block: None,
}
}
@@ -367,8 +273,6 @@ impl<I: VCodeInst> VCode<I> {
/// instructions including spliced fill/reload/move instructions, and replace
/// the VCode with them.
pub fn replace_insns_from_regalloc(&mut self, result: RegAllocResult<Self>) {
self.final_block_order = compute_final_block_order(self);
// Record the spillslot count and clobbered registers for the ABI/stack
// setup code.
self.abi.set_num_spillslots(result.num_spill_slots as usize);
@@ -383,11 +287,12 @@ impl<I: VCodeInst> VCode<I> {
let mut final_block_ranges = vec![(0, 0); self.num_blocks()];
let mut final_srclocs = vec![];
for block in &self.final_block_order {
let (start, end) = block_ranges[*block as usize];
for block in 0..self.num_blocks() {
let block = block as BlockIndex;
let (start, end) = block_ranges[block as usize];
let final_start = final_insns.len() as InsnIndex;
if *block == self.entry {
if block == self.entry {
// Start with the prologue.
let prologue = self.abi.gen_prologue();
let len = prologue.len();
@@ -429,7 +334,7 @@ impl<I: VCodeInst> VCode<I> {
}
let final_end = final_insns.len() as InsnIndex;
final_block_ranges[*block as usize] = (final_start, final_end);
final_block_ranges[block as usize] = (final_start, final_end);
}
debug_assert!(final_insns.len() == final_srclocs.len());
@@ -439,175 +344,68 @@ impl<I: VCodeInst> VCode<I> {
self.block_ranges = final_block_ranges;
}
/// Removes redundant branches, rewriting targets to point directly to the
/// ultimate block at the end of a chain of trivial one-target jumps.
pub fn remove_redundant_branches(&mut self) {
// For each block, compute the actual target block, looking through up to one
// block with single-target jumps (this will remove empty edge blocks inserted
// by phi-lowering).
let block_rewrites: Vec<BlockIndex> = (0..self.num_blocks() as u32)
.map(|bix| is_trivial_jump_block(self, bix).unwrap_or(bix))
.collect();
let mut refcounts: Vec<usize> = vec![0; self.num_blocks()];
debug!(
"remove_redundant_branches: block_rewrites = {:?}",
block_rewrites
);
refcounts[self.entry as usize] = 1;
for block in 0..self.num_blocks() as u32 {
for insn in self.block_insns(BlockIx::new(block)) {
self.get_insn_mut(insn)
.with_block_rewrites(&block_rewrites[..]);
match self.get_insn(insn).is_term() {
MachTerminator::Uncond(bix) => {
refcounts[bix as usize] += 1;
}
MachTerminator::Cond(bix1, bix2) => {
refcounts[bix1 as usize] += 1;
refcounts[bix2 as usize] += 1;
}
MachTerminator::Indirect(blocks) => {
for block in blocks {
refcounts[*block as usize] += 1;
}
}
_ => {}
}
}
}
let deleted: Vec<bool> = refcounts.iter().map(|r| *r == 0).collect();
let block_order = std::mem::replace(&mut self.final_block_order, vec![]);
self.final_block_order = block_order
.into_iter()
.filter(|b| !deleted[*b as usize])
.collect();
// Rewrite successor information based on the block-rewrite map.
for succ in &mut self.block_succs {
let new_succ = block_rewrites[succ.get() as usize];
*succ = BlockIx::new(new_succ);
}
}
/// Mutate branch instructions to (i) lower two-way condbrs to one-way,
/// depending on fallthrough; and (ii) use concrete offsets.
pub fn finalize_branches(&mut self)
/// Emit the instructions to a `MachBuffer`, containing fixed-up code and external
/// reloc/trap/etc. records ready for use.
pub fn emit(&self) -> MachBuffer<I>
where
I: MachInstEmit<MachSectionSize>,
I: MachInstEmit,
{
// Compute fallthrough block, indexed by block.
let num_final_blocks = self.final_block_order.len();
let mut block_fallthrough: Vec<Option<BlockIndex>> = vec![None; self.num_blocks()];
for i in 0..(num_final_blocks - 1) {
let from = self.final_block_order[i];
let to = self.final_block_order[i + 1];
block_fallthrough[from as usize] = Some(to);
}
// Pass over VCode instructions and finalize two-way branches into
// one-way branches with fallthrough.
for block in 0..self.num_blocks() {
let next_block = block_fallthrough[block];
let (start, end) = self.block_ranges[block];
for iix in start..end {
let insn = &mut self.insts[iix as usize];
insn.with_fallthrough_block(next_block);
}
}
let flags = self.abi.flags();
// Compute block offsets.
let mut code_section = MachSectionSize::new(0);
let mut block_offsets = vec![0; self.num_blocks()];
let mut buffer = MachBuffer::new();
let mut state = Default::default();
for &block in &self.final_block_order {
code_section.offset = I::align_basic_block(code_section.offset);
block_offsets[block as usize] = code_section.offset;
let (start, end) = self.block_ranges[block as usize];
for iix in start..end {
self.insts[iix as usize].emit(&mut code_section, flags, &mut state);
}
}
// We now have the section layout.
self.final_block_offsets = block_offsets;
self.code_size = code_section.size();
// Update branches with known block offsets. This looks like the
// traversal above, but (i) does not update block_offsets, rather uses
// it (so forward references are now possible), and (ii) mutates the
// instructions.
let mut code_section = MachSectionSize::new(0);
let mut state = Default::default();
for &block in &self.final_block_order {
code_section.offset = I::align_basic_block(code_section.offset);
let (start, end) = self.block_ranges[block as usize];
for iix in start..end {
self.insts[iix as usize]
.with_block_offsets(code_section.offset, &self.final_block_offsets[..]);
self.insts[iix as usize].emit(&mut code_section, flags, &mut state);
}
}
}
/// Emit the instructions to a list of sections.
pub fn emit(&self) -> MachSections
where
I: MachInstEmit<MachSection>,
{
let mut sections = MachSections::new();
let code_idx = sections.add_section(0, self.code_size);
let code_section = sections.get_section(code_idx);
let mut state = Default::default();
buffer.reserve_labels_for_blocks(self.num_blocks() as BlockIndex); // first N MachLabels are simply block indices.
let flags = self.abi.flags();
let mut cur_srcloc = None;
for &block in &self.final_block_order {
let new_offset = I::align_basic_block(code_section.cur_offset_from_start());
while new_offset > code_section.cur_offset_from_start() {
for block in 0..self.num_blocks() {
let block = block as BlockIndex;
let new_offset = I::align_basic_block(buffer.cur_offset());
while new_offset > buffer.cur_offset() {
// Pad with NOPs up to the aligned block offset.
let nop = I::gen_nop((new_offset - code_section.cur_offset_from_start()) as usize);
nop.emit(code_section, flags, &mut Default::default());
let nop = I::gen_nop((new_offset - buffer.cur_offset()) as usize);
nop.emit(&mut buffer, flags, &mut Default::default());
}
assert_eq!(code_section.cur_offset_from_start(), new_offset);
assert_eq!(buffer.cur_offset(), new_offset);
let (start, end) = self.block_ranges[block as usize];
buffer.bind_label(MachLabel::from_block(block));
for iix in start..end {
let srcloc = self.srclocs[iix as usize];
if cur_srcloc != Some(srcloc) {
if cur_srcloc.is_some() {
code_section.end_srcloc();
buffer.end_srcloc();
}
code_section.start_srcloc(srcloc);
buffer.start_srcloc(srcloc);
cur_srcloc = Some(srcloc);
}
self.insts[iix as usize].emit(code_section, flags, &mut state);
self.insts[iix as usize].emit(&mut buffer, flags, &mut state);
}
if cur_srcloc.is_some() {
code_section.end_srcloc();
buffer.end_srcloc();
cur_srcloc = None;
}
// Do we need an island? Get the worst-case size of the next BB and see if, having
// emitted that many bytes, we will be beyond the deadline.
if block < (self.num_blocks() - 1) as BlockIndex {
let next_block = block + 1;
let next_block_range = self.block_ranges[next_block as usize];
let next_block_size = next_block_range.1 - next_block_range.0;
let worst_case_next_bb = I::worst_case_size() * next_block_size;
if buffer.island_needed(worst_case_next_bb) {
buffer.emit_island();
}
}
}
sections
buffer
}
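For intuition, the island check at the end of the block loop boils down to simple deadline arithmetic. A self-contained sketch with invented numbers (it mirrors the logic conceptually and does not touch the real `MachBuffer` internals):
// Hypothetical numbers only. Suppose the nearest pending fixup is a branch
// emitted at offset 0 with a +1 MiB positive range, so it must be resolved by
// offset 1_048_576. If we are at offset 1_046_000 and the next block could add
// up to 16 * 200 = 3_200 bytes in the worst case, we might overshoot that
// deadline, so an island (veneers / constants) is emitted before the block.
let deadline: u32 = 1_048_576;
let cur_offset: u32 = 1_046_000;
let worst_case_next_bb: u32 = 16 * 200;
let island_needed = cur_offset + worst_case_next_bb > deadline; // true here
assert!(island_needed);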
/// Get the IR block for a BlockIndex, if one exists.
pub fn bindex_to_bb(&self, block: BlockIndex) -> Option<ir::Block> {
if (block as usize) < self.bb_by_block.len() {
Some(self.bb_by_block[block as usize])
} else {
None
}
self.block_order.lowered_order()[block as usize].orig_block()
}
}
@@ -712,7 +510,6 @@ impl<I: VCodeInst> fmt::Debug for VCode<I> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
writeln!(f, "VCode_Debug {{")?;
writeln!(f, " Entry block: {}", self.entry)?;
writeln!(f, " Final block order: {:?}", self.final_block_order)?;
for block in 0..self.num_blocks() {
writeln!(f, "Block {}:", block,)?;
@@ -736,52 +533,21 @@ impl<I: VCodeInst + ShowWithRRU> ShowWithRRU for VCode<I> {
fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String {
use std::fmt::Write;
// Calculate an order in which to display the blocks. This is the same
// as final_block_order, but also includes blocks which are in the
// representation but not in final_block_order.
let mut display_order = Vec::<usize>::new();
// First display blocks in `final_block_order`
for bix in &self.final_block_order {
assert!((*bix as usize) < self.num_blocks());
display_order.push(*bix as usize);
}
// Now also take care of those not listed in `final_block_order`.
// This is quadratic, but it's also debug-only code.
for bix in 0..self.num_blocks() {
if display_order.contains(&bix) {
continue;
}
display_order.push(bix);
}
let mut s = String::new();
write!(&mut s, "VCode_ShowWithRRU {{{{\n").unwrap();
write!(&mut s, " Entry block: {}\n", self.entry).unwrap();
write!(
&mut s,
" Final block order: {:?}\n",
self.final_block_order
)
.unwrap();
for i in 0..self.num_blocks() {
let block = display_order[i];
let block = i as BlockIndex;
let omitted = if !self.final_block_order.is_empty() && i >= self.final_block_order.len()
{
"** OMITTED **"
} else {
""
};
write!(&mut s, "Block {}: {}\n", block, omitted).unwrap();
if let Some(bb) = self.bindex_to_bb(block as BlockIndex) {
write!(&mut s, "Block {}:\n", block).unwrap();
if let Some(bb) = self.bindex_to_bb(block) {
write!(&mut s, " (original IR block: {})\n", bb).unwrap();
}
for succ in self.succs(block as BlockIndex) {
for succ in self.succs(block) {
write!(&mut s, " (successor: Block {})\n", succ.get()).unwrap();
}
let (start, end) = self.block_ranges[block];
let (start, end) = self.block_ranges[block as usize];
write!(&mut s, " (instruction range: {} .. {})\n", start, end).unwrap();
for inst in start..end {
write!(

View File

@@ -1,52 +0,0 @@
//! A pass that computes the number of uses of any given instruction.
use crate::entity::SecondaryMap;
use crate::ir::dfg::ValueDef;
use crate::ir::Value;
use crate::ir::{DataFlowGraph, Function, Inst};
/// Auxiliary data structure that counts the number of uses of any given
/// instruction in a Function. This is used during instruction selection
/// to essentially do incremental DCE: when an instruction is no longer
/// needed because its computation has been isel'd into another machine
/// instruction at every use site, we can skip it.
#[derive(Clone, Debug)]
pub struct NumUses {
uses: SecondaryMap<Inst, u32>,
}
impl NumUses {
fn new() -> NumUses {
NumUses {
uses: SecondaryMap::with_default(0),
}
}
/// Compute the NumUses analysis result for a function.
pub fn compute(func: &Function) -> NumUses {
let mut uses = NumUses::new();
for bb in func.layout.blocks() {
for inst in func.layout.block_insts(bb) {
for arg in func.dfg.inst_args(inst) {
let v = func.dfg.resolve_aliases(*arg);
uses.add_value(&func.dfg, v);
}
}
}
uses
}
fn add_value(&mut self, dfg: &DataFlowGraph, v: Value) {
match dfg.value_def(v) {
ValueDef::Result(inst, _) => {
self.uses[inst] += 1;
}
_ => {}
}
}
/// Take the complete uses map, consuming this analysis result.
pub fn take_uses(self) -> SecondaryMap<Inst, u32> {
self.uses
}
}

View File

@@ -1,7 +1,7 @@
test vcode
target aarch64
function %f(i64, i64) -> i64 {
function %f1(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = iadd.i64 v0, v1
return v2
@@ -15,7 +15,7 @@ block0(v0: i64, v1: i64):
; nextln: ret
function %f(i64, i64) -> i64 {
function %f2(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = isub.i64 v0, v1
return v2
@@ -28,7 +28,7 @@ block0(v0: i64, v1: i64):
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f(i64, i64) -> i64 {
function %f3(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = imul.i64 v0, v1
return v2
@@ -41,7 +41,7 @@ block0(v0: i64, v1: i64):
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f(i64, i64) -> i64 {
function %f4(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = umulhi.i64 v0, v1
return v2
@@ -54,7 +54,7 @@ block0(v0: i64, v1: i64):
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f(i64, i64) -> i64 {
function %f5(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = smulhi.i64 v0, v1
return v2
@@ -67,7 +67,7 @@ block0(v0: i64, v1: i64):
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f(i64, i64) -> i64 {
function %f6(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = sdiv.i64 v0, v1
return v2
@@ -87,7 +87,7 @@ block0(v0: i64, v1: i64):
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f(i64) -> i64 {
function %f7(i64) -> i64 {
block0(v0: i64):
v1 = iconst.i64 2
v2 = sdiv.i64 v0, v1
@@ -109,7 +109,7 @@ block0(v0: i64):
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f(i64, i64) -> i64 {
function %f8(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = udiv.i64 v0, v1
return v2
@@ -124,7 +124,7 @@ block0(v0: i64, v1: i64):
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f(i64) -> i64 {
function %f9(i64) -> i64 {
block0(v0: i64):
v1 = iconst.i64 2
v2 = udiv.i64 v0, v1
@@ -141,7 +141,7 @@ block0(v0: i64):
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f(i64, i64) -> i64 {
function %f10(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = srem.i64 v0, v1
return v2
@@ -157,7 +157,7 @@ block0(v0: i64, v1: i64):
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f(i64, i64) -> i64 {
function %f11(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = urem.i64 v0, v1
return v2
@@ -174,7 +174,7 @@ block0(v0: i64, v1: i64):
; nextln: ret
function %f(i32, i32) -> i32 {
function %f12(i32, i32) -> i32 {
block0(v0: i32, v1: i32):
v2 = sdiv.i32 v0, v1
return v2
@@ -195,48 +195,48 @@ block0(v0: i32, v1: i32):
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f(i32) -> i32 {
function %f13(i32) -> i32 {
block0(v0: i32):
v1 = iconst.i32 2
v2 = sdiv.i32 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: mov x1, x0
; nextln: movz x0, #2
; nextln: sxtw x1, w1
; nextln: sxtw x2, w0
; nextln: sdiv x0, x1, x2
; nextln: cbz x2, 20
; nextln: adds wzr, w2, #1
; nextln: ccmp w1, #1, #nzcv, eq
; nextln: b.vc 12
; nextln: udf
; nextln: udf
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: sxtw x1, w0
; nextln: movz x0, #2
; nextln: sxtw x2, w0
; nextln: sdiv x0, x1, x2
; nextln: cbz x2, 20
; nextln: adds wzr, w2, #1
; nextln: ccmp w1, #1, #nzcv, eq
; nextln: b.vc 12
; nextln: udf
; nextln: udf
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f(i32, i32) -> i32 {
function %f14(i32, i32) -> i32 {
block0(v0: i32, v1: i32):
v2 = udiv.i32 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: mov w0, w0
; nextln: mov w1, w1
; nextln: udiv x0, x0, x1
; nextln: cbnz x1, 8
; nextln: udf
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: mov w0, w0
; nextln: mov w1, w1
; nextln: udiv x0, x0, x1
; nextln: cbnz x1, 8
; nextln: udf
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f(i32) -> i32 {
function %f15(i32) -> i32 {
block0(v0: i32):
v1 = iconst.i32 2
v2 = udiv.i32 v0, v1
@@ -245,9 +245,8 @@ block0(v0: i32):
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: movz x1, #2
; nextln: mov w0, w0
; nextln: mov w1, w1
; nextln: movz x1, #2
; nextln: udiv x0, x0, x1
; nextln: cbnz x1, 8
; nextln: udf
@@ -255,7 +254,7 @@ block0(v0: i32):
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f(i32, i32) -> i32 {
function %f16(i32, i32) -> i32 {
block0(v0: i32, v1: i32):
v2 = srem.i32 v0, v1
return v2
@@ -273,7 +272,7 @@ block0(v0: i32, v1: i32):
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f(i32, i32) -> i32 {
function %f17(i32, i32) -> i32 {
block0(v0: i32, v1: i32):
v2 = urem.i32 v0, v1
return v2
@@ -291,7 +290,7 @@ block0(v0: i32, v1: i32):
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f(i64, i64) -> i64 {
function %f18(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = band.i64 v0, v1
return v2
@@ -304,7 +303,7 @@ block0(v0: i64, v1: i64):
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f(i64, i64) -> i64 {
function %f19(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = bor.i64 v0, v1
return v2
@@ -317,7 +316,7 @@ block0(v0: i64, v1: i64):
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f(i64, i64) -> i64 {
function %f20(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = bxor.i64 v0, v1
return v2
@@ -330,7 +329,7 @@ block0(v0: i64, v1: i64):
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f(i64, i64) -> i64 {
function %f21(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = band_not.i64 v0, v1
return v2
@@ -343,7 +342,7 @@ block0(v0: i64, v1: i64):
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f(i64, i64) -> i64 {
function %f22(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = bor_not.i64 v0, v1
return v2
@@ -356,7 +355,7 @@ block0(v0: i64, v1: i64):
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f(i64, i64) -> i64 {
function %f23(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = bxor_not.i64 v0, v1
return v2
@@ -369,7 +368,7 @@ block0(v0: i64, v1: i64):
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f(i64, i64) -> i64 {
function %f24(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = bnot.i64 v0
return v2

View File

@@ -30,17 +30,18 @@ block2:
return v5
}
; check: Block 0:
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: subs xzr, x0, x1
; nextln: b.eq 20
; check: Block 2:
; check: movz x0, #2
; nextln: b.eq label1 ; b label2
; check: Block 1:
; check: movz x0, #1
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
; check: Block 1:
; check: movz x0, #1
; check: Block 2:
; check: movz x0, #2
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret

View File

@@ -30,15 +30,15 @@ block5(v5: i64):
; check: subs wzr, w0, #3
; nextln: b.hs
; nextln: adr x2, pc+16 ; ldrsw x1, [x2, x0, LSL 2] ; add x2, x2, x1 ; br x2 ; jt_entries
; nextln: adr x1, pc+16 ; ldrsw x2, [x1, x0, LSL 2] ; add x1, x1, x2 ; br x1 ; jt_entries
; check: movz x1, #3
; check: movz x1, #1
; nextln: b
; check: movz x1, #2
; nextln: b
; check: movz x1, #1
; check: movz x1, #3
; check: add x0, x0, x1

View File

@@ -25,10 +25,10 @@ block0(v0: i8, v1: i8):
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: uxtb x0, w0
; nextln: uxtb x1, w1
; nextln: mov v0.d[0], x0
; nextln: mov v1.d[0], x1
; nextln: uxtb x2, w0
; nextln: uxtb x0, w1
; nextln: mov v0.d[0], x2
; nextln: mov v1.d[0], x0
; nextln: uqadd d0, d0, d1
; nextln: mov x0, v0.d[0]
; nextln: mov sp, fp

View File

@@ -366,15 +366,15 @@ block0(v0: i16):
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: uxth w0, w0
; nextln: lsr w1, w0, #6
; nextln: lsl w0, w0, #10
; nextln: orr w0, w0, w1
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: uxth w1, w0
; nextln: lsr w0, w1, #6
; nextln: lsl w1, w1, #10
; nextln: orr w0, w1, w0
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f24(i8) -> i8 {
block0(v0: i8):
@@ -385,10 +385,10 @@ block0(v0: i8):
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: uxtb w0, w0
; nextln: lsr w1, w0, #5
; nextln: lsl w0, w0, #3
; nextln: orr w0, w0, w1
; nextln: uxtb w1, w0
; nextln: lsr w0, w1, #5
; nextln: lsl w1, w1, #3
; nextln: orr w0, w1, w0
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret

View File

@@ -93,7 +93,7 @@ use crate::compilation::{
use crate::func_environ::{get_func_name, FuncEnvironment};
use crate::{CacheConfig, FunctionBodyData, ModuleLocal, ModuleTranslation, Tunables};
use cranelift_codegen::ir::{self, ExternalName};
use cranelift_codegen::machinst::sections::MachSrcLoc;
use cranelift_codegen::machinst::buffer::MachSrcLoc;
use cranelift_codegen::print_errors::pretty_error;
use cranelift_codegen::{binemit, isa, Context};
use cranelift_entity::PrimaryMap;
@@ -215,7 +215,7 @@ fn get_function_address_map<'data>(
if let Some(ref mcr) = &context.mach_compile_result {
// New-style backend: we have a `MachCompileResult` that will give us `MachSrcLoc` mapping
// tuples.
for &MachSrcLoc { start, end, loc } in mcr.sections.get_srclocs_sorted() {
for &MachSrcLoc { start, end, loc } in mcr.buffer.get_srclocs_sorted() {
instructions.push(InstructionAddressMap {
srcloc: loc,
code_offset: start as usize,