Rework of MachInst isel, branch fixups and lowering, and block ordering.

This patch includes:

- A complete rework of the way that CLIF blocks and edge blocks are
  lowered into VCode blocks. The new mechanism in `BlockLoweringOrder`
  computes RPO over the CFG, but with a twist: it merges edge blocks into
  the heads or tails of original CLIF blocks wherever possible, and it
  does this without ever materializing the full nodes-plus-edges graph
  first. The backend driver lowers blocks in final order, so there is no
  need to reshuffle them later. (A minimal sketch of the edge-merging
  decision appears after this list.)

- A new `MachBuffer` that replaces the `MachSection`. This is a special
  kind of code sink that is far more than a humble `Vec<u8>`. In
  particular, it keeps a record of label definitions and label uses,
  with a machine-pluggable `LabelUse` trait that defines the various
  kinds of fixups (essentially internal relocations). (A simplified
  sketch of the fixup bookkeeping appears after this list.)

  Importantly, it implements some simple peephole-style branch rewrites
  *inline in the emission pass*, without any separate traversal over the
  code to use fallthroughs, swap taken/not-taken arms, etc. It tracks
  branches at the tail of the buffer and can (i) remove blocks that are
  just unconditional branches (by redirecting the label); (ii) recognize
  a conditional/unconditional pair and swap the conditional's polarity
  when that is helpful; and (iii) remove branches that target the
  fallthrough PC.

  The `MachBuffer` also implements branch-island support. On
  architectures like AArch64, this is needed to keep conditional
  branches within their reachable ranges (+/- 1MB on AArch64
  specifically). It does this inline as well, while streaming through
  emission, without any sort of fixpoint algorithm or later moving of
  code: it simply tracks outstanding label references and their
  "deadlines" and emits an island just in time when a reference is in
  danger of going out of range.

- A rework of the instruction selector driver. This largely follows the
  same algorithm as before, but is cleaned up significantly, in
  particular in its API: the machine backend can ask for an input arg
  and get it in any of three forms (constant, register, or producing
  instruction), indicating whether it needs the value in a register or
  can merge the constant or the producing instruction directly. The new
  driver takes special care to emit constants right at their use sites
  (and at phi inputs), minimizing their live ranges, and it
  special-cases the "pinned register" to avoid superfluous moves. (A
  tiny sketch of the three input forms appears after this list.)

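To make the edge-merging rule concrete, here is a minimal, self-contained
sketch. The names (`LoweredEdge`, `classify_edge`) and the CFG representation
are invented for illustration and are not the actual `BlockLoweringOrder`
types; the rule itself is the one described above: an edge can be folded into
its predecessor's tail when the predecessor has exactly one successor, into
its successor's head when the successor has exactly one predecessor, and
otherwise (a critical edge) needs a standalone edge block.

    /// Where the code for one CFG edge ends up after lowering (illustrative only).
    #[derive(Debug, PartialEq, Eq)]
    enum LoweredEdge {
        /// Edge code goes at the end of the predecessor block.
        MergedIntoPredTail { pred: usize },
        /// Edge code goes at the start of the successor block.
        MergedIntoSuccHead { succ: usize },
        /// Neither end is free; the edge needs its own block.
        Standalone { pred: usize, succ: usize },
    }

    /// `succs[b]` lists the successors of block `b`; `preds_count[b]` is its
    /// number of predecessors.
    fn classify_edge(
        succs: &[Vec<usize>],
        preds_count: &[usize],
        pred: usize,
        succ: usize,
    ) -> LoweredEdge {
        if succs[pred].len() == 1 {
            // Only one way out of `pred`: its tail can hold the edge code.
            LoweredEdge::MergedIntoPredTail { pred }
        } else if preds_count[succ] == 1 {
            // Only one way into `succ`: its head can hold the edge code.
            LoweredEdge::MergedIntoSuccHead { succ }
        } else {
            LoweredEdge::Standalone { pred, succ }
        }
    }

    fn main() {
        // Diamond CFG: 0 -> {1, 2}, 1 -> 3, 2 -> 3.
        let succs = vec![vec![1, 2], vec![3], vec![3], vec![]];
        let mut preds_count = vec![0usize; succs.len()];
        for ss in &succs {
            for &s in ss {
                preds_count[s] += 1;
            }
        }
        // Block 0 has two successors, but blocks 1 and 2 each have a single
        // predecessor, so edges 0->1 and 0->2 merge into the successors' heads;
        // edges 1->3 and 2->3 merge into the predecessors' tails.
        assert_eq!(
            classify_edge(&succs, &preds_count, 0, 1),
            LoweredEdge::MergedIntoSuccHead { succ: 1 }
        );
        assert_eq!(
            classify_edge(&succs, &preds_count, 1, 3),
            LoweredEdge::MergedIntoPredTail { pred: 1 }
        );
    }
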
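The label/fixup bookkeeping and the island "deadline" check can be sketched in
a deliberately simplified form. Everything below (`ToyBuffer`, the single
fixup kind, the 1 MiB range) is invented for illustration; the real
`MachBuffer` carries a per-use `LabelUse` kind, rewrites branches in place,
and emits veneers and constants into the island, none of which is modeled
here.

    use std::collections::HashMap;

    /// Pretend every fixup is a 32-bit little-endian PC-relative word with a
    /// +/- 1 MiB reach, roughly like an AArch64 conditional branch.
    const MAX_RANGE: u32 = 1 << 20;

    #[derive(Default)]
    struct ToyBuffer {
        data: Vec<u8>,
        label_offsets: HashMap<u32, u32>, // label -> offset it was bound at
        fixups: Vec<(u32, u32)>,          // (use offset, label)
    }

    impl ToyBuffer {
        fn cur_offset(&self) -> u32 {
            self.data.len() as u32
        }
        fn put4(&mut self, word: u32) {
            self.data.extend_from_slice(&word.to_le_bytes());
        }
        fn bind_label(&mut self, label: u32) {
            self.label_offsets.insert(label, self.cur_offset());
        }
        fn use_label_at_offset(&mut self, offset: u32, label: u32) {
            self.fixups.push((offset, label));
        }
        /// The earliest pending use plus the maximum reach is the deadline; if
        /// the next instruction could cross it, an island must be emitted now.
        fn island_needed(&self, worst_case_next: u32) -> bool {
            self.fixups
                .iter()
                .map(|&(off, _)| off + MAX_RANGE)
                .min()
                .map_or(false, |deadline| self.cur_offset() + worst_case_next > deadline)
        }
        /// Resolve all fixups once every label has been bound.
        fn finish(mut self) -> Vec<u8> {
            for &(use_off, label) in &self.fixups {
                let target = self.label_offsets[&label];
                let rel = target.wrapping_sub(use_off); // two's-complement PC-rel
                self.data[use_off as usize..use_off as usize + 4]
                    .copy_from_slice(&rel.to_le_bytes());
            }
            self.data
        }
    }

    fn main() {
        let mut buf = ToyBuffer::default();
        let off = buf.cur_offset();
        buf.put4(0); // placeholder branch word; patched by the fixup below
        buf.use_label_at_offset(off, 0);
        buf.put4(0xd503_201f); // an unrelated instruction (AArch64 NOP encoding)
        assert!(!buf.island_needed(44)); // nowhere near the deadline yet
        buf.bind_label(0);
        let code = buf.finish();
        assert_eq!(&code[0..4], &8u32.to_le_bytes()); // label is 8 bytes past the use
    }
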
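Finally, a tiny illustration of the "three forms" idea from the last bullet.
The names here (`ToyInput`, `operand_for_add`) are invented; the shape is only
loosely modeled on the `get_input` result used in the lowering code below,
which carries a `constant`, a producing `inst`, and a `reg`.

    /// What a lowering context might hand back for one instruction input:
    /// possibly a known constant, possibly the producing instruction (if it
    /// can be merged), and a register that holds the value otherwise.
    #[derive(Clone, Copy, Debug)]
    struct ToyInput {
        constant: Option<u64>,
        producing_inst: Option<u32>,
        reg: u32, // virtual register number
    }

    #[derive(Debug, PartialEq, Eq)]
    enum AddOperand {
        Imm12(u16), // fits in a 12-bit immediate field
        Reg(u32),   // otherwise, force the value into a register
    }

    /// The backend asks for an input in the most mergeable form it can use:
    /// prefer a small immediate, fall back to the register. (A real backend
    /// would also inspect `producing_inst` to merge shifts, extends, etc.)
    fn operand_for_add(input: ToyInput) -> AddOperand {
        match input.constant {
            Some(c) if c < (1 << 12) => AddOperand::Imm12(c as u16),
            _ => AddOperand::Reg(input.reg),
        }
    }

    fn main() {
        let small = ToyInput { constant: Some(42), producing_inst: None, reg: 7 };
        let big = ToyInput { constant: Some(1 << 20), producing_inst: None, reg: 8 };
        assert_eq!(operand_for_add(small), AddOperand::Imm12(42));
        assert_eq!(operand_for_add(big), AddOperand::Reg(8));
    }
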
Overall, on `bz2.wasm`, the results are:

    wasmtime full run (compile + runtime) of bz2:

    baseline:   9774M insns, 9742M cycles, 3.918s
    w/ changes: 7012M insns, 6888M cycles, 2.958s  (24.5% faster, 28.3% fewer insns)

    clif-util wasm compile bz2:

    baseline:   2633M insns, 3278M cycles, 1.034s
    w/ changes: 2366M insns, 2920M cycles, 0.923s  (10.7% faster, 10.1% fewer insns)

    All numbers are averages of two runs on an Ampere eMAG.
Chris Fallin
2020-05-15 19:04:50 -07:00
parent 463734b002
commit 72e6be9342
27 changed files with 3021 additions and 2035 deletions


@@ -504,7 +504,7 @@ impl AArch64ABIBody {
rn: stack_reg(),
rm: stack_limit,
});
insts.push(Inst::CondBrLowered {
insts.push(Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(8),
// Here `Hs` == "higher or same" when interpreting the two
// operands as unsigned integers.


@@ -3,14 +3,14 @@
// Some variants are never constructed, but we still want them as options in the future.
#![allow(dead_code)]
use crate::binemit::CodeOffset;
use crate::ir::Type;
use crate::isa::aarch64::inst::*;
use crate::isa::aarch64::lower::ty_bits;
use crate::machinst::MachLabel;
use regalloc::{RealRegUniverse, Reg, Writable};
use core::convert::{Into, TryFrom};
use core::convert::Into;
use std::string::String;
/// A shift operator for a register or immediate.
@@ -303,78 +303,44 @@ impl CondBrKind {
/// A branch target. Either unresolved (basic-block index) or resolved (offset
/// from end of current instruction).
#[derive(Clone, Copy, Debug)]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum BranchTarget {
/// An unresolved reference to a BlockIndex, as passed into
/// An unresolved reference to a Label, as passed into
/// `lower_branch_group()`.
Block(BlockIndex),
/// A resolved reference to another instruction, after
/// `Inst::with_block_offsets()`.
Label(MachLabel),
/// A fixed PC offset.
ResolvedOffset(isize),
}
impl BranchTarget {
/// Lower the branch target given offsets of each block.
pub fn lower(&mut self, targets: &[CodeOffset], my_offset: CodeOffset) {
/// Return the target's label, if it is a label-based target.
pub fn as_label(self) -> Option<MachLabel> {
match self {
&mut BranchTarget::Block(bix) => {
let bix = usize::try_from(bix).unwrap();
assert!(bix < targets.len());
let block_offset_in_func = targets[bix];
let branch_offset = (block_offset_in_func as isize) - (my_offset as isize);
*self = BranchTarget::ResolvedOffset(branch_offset);
}
&mut BranchTarget::ResolvedOffset(..) => {}
}
}
/// Get the block index.
pub fn as_block_index(&self) -> Option<BlockIndex> {
match self {
&BranchTarget::Block(bix) => Some(bix),
BranchTarget::Label(l) => Some(l),
_ => None,
}
}
/// Get the offset as 4-byte words. Returns `0` if not
/// yet resolved (in that case, we're only computing
/// size and the offset doesn't matter).
pub fn as_offset_words(&self) -> isize {
match self {
&BranchTarget::ResolvedOffset(off) => off >> 2,
/// Return the target's offset, if specified, or zero if label-based.
pub fn as_offset19_or_zero(self) -> u32 {
let off = match self {
BranchTarget::ResolvedOffset(off) => off >> 2,
_ => 0,
}
};
assert!(off <= 0x3ffff);
assert!(off >= -0x40000);
(off as u32) & 0x7ffff
}
/// Get the offset as a 26-bit offset suitable for a 26-bit jump, or `None` if overflow.
pub fn as_off26(&self) -> Option<u32> {
let off = self.as_offset_words();
if (off < (1 << 25)) && (off >= -(1 << 25)) {
Some((off as u32) & ((1 << 26) - 1))
} else {
None
}
}
/// Get the offset as a 19-bit offset, or `None` if overflow.
pub fn as_off19(&self) -> Option<u32> {
let off = self.as_offset_words();
if (off < (1 << 18)) && (off >= -(1 << 18)) {
Some((off as u32) & ((1 << 19) - 1))
} else {
None
}
}
/// Map the block index given a transform map.
pub fn map(&mut self, block_index_map: &[BlockIndex]) {
match self {
&mut BranchTarget::Block(ref mut bix) => {
let n = block_index_map[usize::try_from(*bix).unwrap()];
*bix = n;
}
&mut BranchTarget::ResolvedOffset(_) => {}
}
/// Return the target's offset, if specified, or zero if label-based.
pub fn as_offset26_or_zero(self) -> u32 {
let off = match self {
BranchTarget::ResolvedOffset(off) => off >> 2,
_ => 0,
};
assert!(off <= 0x1ffffff);
assert!(off >= -0x2000000);
(off as u32) & 0x3ffffff
}
}
@@ -507,7 +473,7 @@ impl ShowWithRRU for Cond {
impl ShowWithRRU for BranchTarget {
fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
match self {
&BranchTarget::Block(block) => format!("block{}", block),
&BranchTarget::Label(label) => format!("label{:?}", label.get()),
&BranchTarget::ResolvedOffset(off) => format!("{}", off),
}
}


@@ -4,7 +4,7 @@ use crate::binemit::{CodeOffset, Reloc};
use crate::ir::constant::ConstantData;
use crate::ir::types::*;
use crate::ir::TrapCode;
use crate::isa::aarch64::{inst::regs::PINNED_REG, inst::*};
use crate::isa::aarch64::inst::*;
use regalloc::{Reg, RegClass, Writable};
@@ -149,6 +149,14 @@ fn enc_cbr(op_31_24: u32, off_18_0: u32, op_4: u32, cond: u32) -> u32 {
(op_31_24 << 24) | (off_18_0 << 5) | (op_4 << 4) | cond
}
fn enc_conditional_br(taken: BranchTarget, kind: CondBrKind) -> u32 {
match kind {
CondBrKind::Zero(reg) => enc_cmpbr(0b1_011010_0, taken.as_offset19_or_zero(), reg),
CondBrKind::NotZero(reg) => enc_cmpbr(0b1_011010_1, taken.as_offset19_or_zero(), reg),
CondBrKind::Cond(c) => enc_cbr(0b01010100, taken.as_offset19_or_zero(), 0b0, c.bits()),
}
}
const MOVE_WIDE_FIXED: u32 = 0x92800000;
#[repr(u32)]
@@ -340,10 +348,10 @@ pub struct EmitState {
virtual_sp_offset: i64,
}
impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
impl MachInstEmit for Inst {
type State = EmitState;
fn emit(&self, sink: &mut O, flags: &settings::Flags, state: &mut EmitState) {
fn emit(&self, sink: &mut MachBuffer<Inst>, flags: &settings::Flags, state: &mut EmitState) {
match self {
&Inst::AluRRR { alu_op, rd, rn, rm } => {
let top11 = match alu_op {
@@ -616,7 +624,7 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
ref mem,
srcloc,
} => {
let (mem_insts, mem) = mem_finalize(sink.cur_offset_from_start(), mem, state);
let (mem_insts, mem) = mem_finalize(sink.cur_offset(), mem, state);
for inst in mem_insts.into_iter() {
inst.emit(sink, flags, state);
@@ -759,7 +767,7 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
ref mem,
srcloc,
} => {
let (mem_insts, mem) = mem_finalize(sink.cur_offset_from_start(), mem, state);
let (mem_insts, mem) = mem_finalize(sink.cur_offset(), mem, state);
for inst in mem_insts.into_iter() {
inst.emit(sink, flags, state);
@@ -1147,10 +1155,18 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
panic!("Unsupported extend variant");
}
&Inst::Jump { ref dest } => {
// TODO: differentiate between as_off26() returning `None` for
// out-of-range vs. not-yet-finalized. The latter happens when we
// do early (fake) emission for size computation.
sink.put4(enc_jump26(0b000101, dest.as_off26().unwrap()));
let off = sink.cur_offset();
// Emit the jump itself.
sink.put4(enc_jump26(0b000101, dest.as_offset26_or_zero()));
// After the jump has been emitted, indicate that it uses a
// label, if so, so that a fixup can occur later. This happens
// after we emit the bytes because the fixup might occur right
// away (so the bytes must actually exist now).
if let Some(l) = dest.as_label() {
sink.use_label_at_offset(off, l, LabelUse::Branch26);
let cur_off = sink.cur_offset();
sink.add_uncond_branch(off, cur_off, l);
}
}
&Inst::Ret => {
sink.put4(0xd65f03c0);
@@ -1178,51 +1194,35 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
sink.add_call_site(loc, opcode);
}
}
&Inst::CondBr { .. } => panic!("Unlowered CondBr during binemit!"),
&Inst::CondBrLowered { target, kind } => match kind {
// TODO: handle >2^19 case by emitting a compound sequence with
// an unconditional (26-bit) branch. We need branch-relaxation
// adjustment machinery to enable this (because we don't want to
// always emit the long form).
CondBrKind::Zero(reg) => {
sink.put4(enc_cmpbr(0b1_011010_0, target.as_off19().unwrap(), reg));
}
CondBrKind::NotZero(reg) => {
sink.put4(enc_cmpbr(0b1_011010_1, target.as_off19().unwrap(), reg));
}
CondBrKind::Cond(c) => {
sink.put4(enc_cbr(
0b01010100,
target.as_off19().unwrap_or(0),
0b0,
c.bits(),
));
}
},
&Inst::CondBrLoweredCompound {
&Inst::CondBr {
taken,
not_taken,
kind,
} => {
// Conditional part first.
match kind {
CondBrKind::Zero(reg) => {
sink.put4(enc_cmpbr(0b1_011010_0, taken.as_off19().unwrap(), reg));
}
CondBrKind::NotZero(reg) => {
sink.put4(enc_cmpbr(0b1_011010_1, taken.as_off19().unwrap(), reg));
}
CondBrKind::Cond(c) => {
sink.put4(enc_cbr(
0b01010100,
taken.as_off19().unwrap_or(0),
0b0,
c.bits(),
));
}
let cond_off = sink.cur_offset();
sink.put4(enc_conditional_br(taken, kind));
if let Some(l) = taken.as_label() {
sink.use_label_at_offset(cond_off, l, LabelUse::Branch19);
let cur_off = sink.cur_offset();
let inverted = enc_conditional_br(taken, kind.invert()).to_le_bytes();
sink.add_cond_branch(cond_off, cur_off, l, &inverted[..]);
}
// Unconditional part.
sink.put4(enc_jump26(0b000101, not_taken.as_off26().unwrap_or(0)));
let uncond_off = sink.cur_offset();
sink.put4(enc_jump26(0b000101, not_taken.as_offset26_or_zero()));
if let Some(l) = not_taken.as_label() {
sink.use_label_at_offset(uncond_off, l, LabelUse::Branch26);
let cur_off = sink.cur_offset();
sink.add_uncond_branch(uncond_off, cur_off, l);
}
}
&Inst::OneWayCondBr { target, kind } => {
let off = sink.cur_offset();
sink.put4(enc_conditional_br(target, kind));
if let Some(l) = target.as_label() {
sink.use_label_at_offset(off, l, LabelUse::Branch19);
}
}
&Inst::IndirectBr { rn, .. } => {
sink.put4(enc_br(rn));
@@ -1239,8 +1239,7 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
sink.add_trap(srcloc, code);
sink.put4(0xd4a00000);
}
&Inst::Adr { rd, ref label } => {
let off = memlabel_finalize(sink.cur_offset_from_start(), label);
&Inst::Adr { rd, off } => {
assert!(off > -(1 << 20));
assert!(off < (1 << 20));
sink.put4(enc_adr(off, rd));
@@ -1261,19 +1260,13 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
// This sequence is *one* instruction in the vcode, and is expanded only here at
// emission time, because we cannot allow the regalloc to insert spills/reloads in
// the middle; we depend on hardcoded PC-rel addressing below.
//
// N.B.: if PC-rel addressing on ADR below is changed, also update
// `Inst::with_block_offsets()` in aarch64/inst/mod.rs.
// Save index in a tmp (the live range of ridx only goes to start of this
// sequence; rtmp1 or rtmp2 may overwrite it).
let inst = Inst::gen_move(rtmp2, ridx, I64);
inst.emit(sink, flags, state);
// Load address of jump table
let inst = Inst::Adr {
rd: rtmp1,
label: MemLabel::PCRel(16),
};
let inst = Inst::Adr { rd: rtmp1, off: 16 };
inst.emit(sink, flags, state);
// Load value out of jump table
let inst = Inst::SLoad32 {
@@ -1303,12 +1296,16 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
};
inst.emit(sink, flags, state);
// Emit jump table (table of 32-bit offsets).
for target in targets.iter() {
let off = target.as_offset_words() * 4;
let off = i32::try_from(off).unwrap();
// cast i32 to u32 (two's-complement)
let off = off as u32;
sink.put4(off);
let jt_off = sink.cur_offset();
for &target in targets.iter() {
let word_off = sink.cur_offset();
let off_into_table = word_off - jt_off;
sink.put4(off_into_table);
sink.use_label_at_offset(
word_off,
target.as_label().unwrap(),
LabelUse::PCRel32,
);
}
}
&Inst::LoadConst64 { rd, const_data } => {
@@ -1348,7 +1345,7 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
}
}
&Inst::LoadAddr { rd, ref mem } => {
let (mem_insts, mem) = mem_finalize(sink.cur_offset_from_start(), mem, state);
let (mem_insts, mem) = mem_finalize(sink.cur_offset(), mem, state);
for inst in mem_insts.into_iter() {
inst.emit(sink, flags, state);
}
@@ -1401,20 +1398,6 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
add.emit(sink, flags, state);
}
}
&Inst::GetPinnedReg { rd } => {
let inst = Inst::Mov {
rd,
rm: xreg(PINNED_REG),
};
inst.emit(sink, flags, state);
}
&Inst::SetPinnedReg { rm } => {
let inst = Inst::Mov {
rd: Writable::from_reg(xreg(PINNED_REG)),
rm,
};
inst.emit(sink, flags, state);
}
&Inst::VirtualSPOffsetAdj { offset } => {
debug!(
"virtual sp offset adjusted by {} -> {}",
@@ -1423,6 +1406,17 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
);
state.virtual_sp_offset += offset;
}
&Inst::EmitIsland { needed_space } => {
if sink.island_needed(needed_space + 4) {
let jump_around_label = sink.get_label();
let jmp = Inst::Jump {
dest: BranchTarget::Label(jump_around_label),
};
jmp.emit(sink, flags, state);
sink.emit_island();
sink.bind_label(jump_around_label);
}
}
}
}
}


@@ -1956,7 +1956,7 @@ fn test_aarch64_binemit() {
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Zero(xreg(8)),
},
@@ -1964,7 +1964,7 @@ fn test_aarch64_binemit() {
"cbz x8, 64",
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::NotZero(xreg(8)),
},
@@ -1972,7 +1972,7 @@ fn test_aarch64_binemit() {
"cbnz x8, 64",
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Cond(Cond::Eq),
},
@@ -1980,7 +1980,7 @@ fn test_aarch64_binemit() {
"b.eq 64",
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Cond(Cond::Ne),
},
@@ -1989,7 +1989,7 @@ fn test_aarch64_binemit() {
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Cond(Cond::Hs),
},
@@ -1997,7 +1997,7 @@ fn test_aarch64_binemit() {
"b.hs 64",
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Cond(Cond::Lo),
},
@@ -2005,7 +2005,7 @@ fn test_aarch64_binemit() {
"b.lo 64",
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Cond(Cond::Mi),
},
@@ -2013,7 +2013,7 @@ fn test_aarch64_binemit() {
"b.mi 64",
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Cond(Cond::Pl),
},
@@ -2021,7 +2021,7 @@ fn test_aarch64_binemit() {
"b.pl 64",
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Cond(Cond::Vs),
},
@@ -2029,7 +2029,7 @@ fn test_aarch64_binemit() {
"b.vs 64",
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Cond(Cond::Vc),
},
@@ -2037,7 +2037,7 @@ fn test_aarch64_binemit() {
"b.vc 64",
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Cond(Cond::Hi),
},
@@ -2045,7 +2045,7 @@ fn test_aarch64_binemit() {
"b.hi 64",
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Cond(Cond::Ls),
},
@@ -2053,7 +2053,7 @@ fn test_aarch64_binemit() {
"b.ls 64",
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Cond(Cond::Ge),
},
@@ -2061,7 +2061,7 @@ fn test_aarch64_binemit() {
"b.ge 64",
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Cond(Cond::Lt),
},
@@ -2069,7 +2069,7 @@ fn test_aarch64_binemit() {
"b.lt 64",
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Cond(Cond::Gt),
},
@@ -2077,7 +2077,7 @@ fn test_aarch64_binemit() {
"b.gt 64",
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Cond(Cond::Le),
},
@@ -2085,7 +2085,7 @@ fn test_aarch64_binemit() {
"b.le 64",
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Cond(Cond::Al),
},
@@ -2093,7 +2093,7 @@ fn test_aarch64_binemit() {
"b.al 64",
));
insns.push((
Inst::CondBrLowered {
Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(64),
kind: CondBrKind::Cond(Cond::Nv),
},
@@ -2102,7 +2102,7 @@ fn test_aarch64_binemit() {
));
insns.push((
Inst::CondBrLoweredCompound {
Inst::CondBr {
taken: BranchTarget::ResolvedOffset(64),
not_taken: BranchTarget::ResolvedOffset(128),
kind: CondBrKind::Cond(Cond::Le),
@@ -2138,7 +2138,7 @@ fn test_aarch64_binemit() {
insns.push((
Inst::IndirectBr {
rn: xreg(3),
targets: vec![1, 2, 3],
targets: vec![],
},
"60001FD6",
"br x3",
@@ -2149,7 +2149,7 @@ fn test_aarch64_binemit() {
insns.push((
Inst::Adr {
rd: writable_xreg(15),
label: MemLabel::PCRel((1 << 20) - 4),
off: (1 << 20) - 4,
},
"EFFF7F10",
"adr x15, pc+1048572",
@@ -2792,19 +2792,11 @@ fn test_aarch64_binemit() {
let actual_printing = insn.show_rru(Some(&rru));
assert_eq!(expected_printing, actual_printing);
// Check the encoding is as expected.
let text_size = {
let mut code_sec = MachSectionSize::new(0);
insn.emit(&mut code_sec, &flags, &mut Default::default());
code_sec.size()
};
let mut sink = test_utils::TestCodeSink::new();
let mut sections = MachSections::new();
let code_idx = sections.add_section(0, text_size);
let code_sec = sections.get_section(code_idx);
insn.emit(code_sec, &flags, &mut Default::default());
sections.emit(&mut sink);
let mut buffer = MachBuffer::new();
insn.emit(&mut buffer, &flags, &mut Default::default());
let buffer = buffer.finish();
buffer.emit(&mut sink);
let actual_encoding = &sink.stringify();
assert_eq!(expected_encoding, actual_encoding);
}


@@ -645,35 +645,28 @@ pub enum Inst {
dest: BranchTarget,
},
/// A conditional branch.
/// A conditional branch. Contains two targets; at emission time, both are emitted, but
/// the MachBuffer knows to truncate the trailing branch if fallthrough. We optimize the
/// choice of taken/not_taken (inverting the branch polarity as needed) based on the
/// fallthrough at the time of lowering.
CondBr {
taken: BranchTarget,
not_taken: BranchTarget,
kind: CondBrKind,
},
/// Lowered conditional branch: contains the original branch kind (or the
/// inverse), but only one BranchTarget is retained. The other is
/// implicitly the next instruction, given the final basic-block layout.
CondBrLowered {
/// A one-way conditional branch, invisible to the CFG processing; used *only* as part of
/// straight-line sequences in code to be emitted.
OneWayCondBr {
target: BranchTarget,
kind: CondBrKind,
},
/// As for `CondBrLowered`, but represents a condbr/uncond-br sequence (two
/// actual machine instructions). Needed when the final block layout implies
/// that neither arm of a conditional branch targets the fallthrough block.
CondBrLoweredCompound {
taken: BranchTarget,
not_taken: BranchTarget,
kind: CondBrKind,
},
/// An indirect branch through a register, augmented with set of all
/// possible successors.
IndirectBr {
rn: Reg,
targets: Vec<BlockIndex>,
targets: Vec<MachLabel>,
},
/// A "break" instruction, used for e.g. traps and debug breakpoints.
@@ -685,11 +678,14 @@ pub enum Inst {
trap_info: (SourceLoc, TrapCode),
},
/// Load the address (using a PC-relative offset) of a MemLabel, using the
/// `ADR` instruction.
/// Load the address (using a PC-relative offset) of a memory location, using the `ADR`
/// instruction. Note that we take a simple offset, not a `MemLabel`, here, because `Adr` is
/// only used for now in fixed lowering sequences with hardcoded offsets. In the future we may
/// need full `MemLabel` support.
Adr {
rd: Writable<Reg>,
label: MemLabel,
/// Offset in range -2^20 .. 2^20.
off: i32,
},
/// Raw 32-bit word, used for inline constants and jump-table entries.
@@ -706,7 +702,7 @@ pub enum Inst {
/// for rationale).
JTSequence {
targets: Box<[BranchTarget]>,
targets_for_term: Box<[BlockIndex]>, // needed for MachTerminator.
targets_for_term: Box<[MachLabel]>, // needed for MachTerminator.
ridx: Reg,
rtmp1: Writable<Reg>,
rtmp2: Writable<Reg>,
@@ -732,21 +728,19 @@ pub enum Inst {
mem: MemArg,
},
/// Sets the value of the pinned register to the given register target.
GetPinnedReg {
rd: Writable<Reg>,
},
/// Writes the value of the given source register to the pinned register.
SetPinnedReg {
rm: Reg,
},
/// Marker, no-op in generated code: SP "virtual offset" is adjusted. This
/// controls how MemArg::NominalSPOffset args are lowered.
VirtualSPOffsetAdj {
offset: i64,
},
/// Meta-insn, no-op in generated code: emit constant/branch veneer island at this point (with
/// a guard jump around it) if less than the needed space is available before the next branch
/// deadline.
EmitIsland {
/// The needed space before the next deadline.
needed_space: CodeOffset,
},
}
fn count_zero_half_words(mut value: u64) -> usize {
@@ -1111,9 +1105,7 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
collector.add_defs(&*defs);
collector.add_use(rn);
}
&Inst::CondBr { ref kind, .. }
| &Inst::CondBrLowered { ref kind, .. }
| &Inst::CondBrLoweredCompound { ref kind, .. } => match kind {
&Inst::CondBr { ref kind, .. } | &Inst::OneWayCondBr { ref kind, .. } => match kind {
CondBrKind::Zero(rt) | CondBrKind::NotZero(rt) => {
collector.add_use(*rt);
}
@@ -1142,13 +1134,8 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
&Inst::LoadAddr { rd, mem: _ } => {
collector.add_def(rd);
}
&Inst::GetPinnedReg { rd } => {
collector.add_def(rd);
}
&Inst::SetPinnedReg { rm } => {
collector.add_use(rm);
}
&Inst::VirtualSPOffsetAdj { .. } => {}
&Inst::EmitIsland { .. } => {}
}
}
@@ -1676,13 +1663,7 @@ fn aarch64_map_regs(inst: &mut Inst, mapper: &RegUsageMapper) {
*defs = Box::new(new_defs);
map_use(mapper, rn);
}
&mut Inst::CondBr { ref mut kind, .. } => {
map_br(mapper, kind);
}
&mut Inst::CondBrLowered { ref mut kind, .. } => {
map_br(mapper, kind);
}
&mut Inst::CondBrLoweredCompound { ref mut kind, .. } => {
&mut Inst::CondBr { ref mut kind, .. } | &mut Inst::OneWayCondBr { ref mut kind, .. } => {
map_br(mapper, kind);
}
&mut Inst::IndirectBr { ref mut rn, .. } => {
@@ -1716,13 +1697,8 @@ fn aarch64_map_regs(inst: &mut Inst, mapper: &RegUsageMapper) {
map_def(mapper, rd);
map_mem(mapper, mem);
}
&mut Inst::GetPinnedReg { ref mut rd } => {
map_def(mapper, rd);
}
&mut Inst::SetPinnedReg { ref mut rm } => {
map_use(mapper, rm);
}
&mut Inst::VirtualSPOffsetAdj { .. } => {}
&mut Inst::EmitIsland { .. } => {}
}
}
@@ -1730,6 +1706,8 @@ fn aarch64_map_regs(inst: &mut Inst, mapper: &RegUsageMapper) {
// Instructions: misc functions and external interface
impl MachInst for Inst {
type LabelUse = LabelUse;
fn get_regs(&self, collector: &mut RegUsageCollector) {
aarch64_get_regs(self, collector)
}
@@ -1757,24 +1735,14 @@ impl MachInst for Inst {
fn is_term<'a>(&'a self) -> MachTerminator<'a> {
match self {
&Inst::Ret | &Inst::EpiloguePlaceholder => MachTerminator::Ret,
&Inst::Jump { dest } => MachTerminator::Uncond(dest.as_block_index().unwrap()),
&Inst::Jump { dest } => MachTerminator::Uncond(dest.as_label().unwrap()),
&Inst::CondBr {
taken, not_taken, ..
} => MachTerminator::Cond(
taken.as_block_index().unwrap(),
not_taken.as_block_index().unwrap(),
),
&Inst::CondBrLowered { .. } => {
// When this is used prior to branch finalization for branches
// within an open-coded sequence, i.e. with ResolvedOffsets,
// do not consider it a terminator. From the point of view of CFG analysis,
// it is part of a black-box single-in single-out region, hence is not
// denoted a terminator.
} => MachTerminator::Cond(taken.as_label().unwrap(), not_taken.as_label().unwrap()),
&Inst::OneWayCondBr { .. } => {
// Explicitly invisible to CFG processing.
MachTerminator::None
}
&Inst::CondBrLoweredCompound { .. } => {
panic!("is_term() called after lowering branches");
}
&Inst::IndirectBr { ref targets, .. } => MachTerminator::Indirect(&targets[..]),
&Inst::JTSequence {
ref targets_for_term,
@@ -1789,6 +1757,23 @@ impl MachInst for Inst {
Inst::mov(to_reg, from_reg)
}
fn gen_constant(to_reg: Writable<Reg>, value: u64, ty: Type) -> SmallVec<[Inst; 4]> {
if ty == F64 {
let mut ret = SmallVec::new();
ret.push(Inst::load_fp_constant64(to_reg, f64::from_bits(value)));
ret
} else if ty == F32 {
let mut ret = SmallVec::new();
ret.push(Inst::load_fp_constant32(
to_reg,
f32::from_bits(value as u32),
));
ret
} else {
Inst::load_constant(to_reg, value)
}
}
fn gen_zero_len_nop() -> Inst {
Inst::Nop0
}
@@ -1815,101 +1800,25 @@ impl MachInst for Inst {
}
}
fn gen_jump(blockindex: BlockIndex) -> Inst {
fn gen_jump(target: MachLabel) -> Inst {
Inst::Jump {
dest: BranchTarget::Block(blockindex),
dest: BranchTarget::Label(target),
}
}
fn with_block_rewrites(&mut self, block_target_map: &[BlockIndex]) {
match self {
&mut Inst::Jump { ref mut dest } => {
dest.map(block_target_map);
}
&mut Inst::CondBr {
ref mut taken,
ref mut not_taken,
..
} => {
taken.map(block_target_map);
not_taken.map(block_target_map);
}
&mut Inst::CondBrLowered { .. } => {
// See note in `is_term()`: this is used in open-coded sequences
// within blocks and should be left alone.
}
&mut Inst::CondBrLoweredCompound { .. } => {
panic!("with_block_rewrites called after branch lowering!");
}
_ => {}
}
fn reg_universe(flags: &settings::Flags) -> RealRegUniverse {
create_reg_universe(flags)
}
fn with_fallthrough_block(&mut self, fallthrough: Option<BlockIndex>) {
match self {
&mut Inst::CondBr {
taken,
not_taken,
kind,
} => {
if taken.as_block_index() == fallthrough
&& not_taken.as_block_index() == fallthrough
{
*self = Inst::Nop0;
} else if taken.as_block_index() == fallthrough {
*self = Inst::CondBrLowered {
target: not_taken,
kind: kind.invert(),
};
} else if not_taken.as_block_index() == fallthrough {
*self = Inst::CondBrLowered {
target: taken,
kind,
};
} else {
// We need a compound sequence (condbr / uncond-br).
*self = Inst::CondBrLoweredCompound {
taken,
not_taken,
kind,
};
}
}
&mut Inst::Jump { dest } => {
if dest.as_block_index() == fallthrough {
*self = Inst::Nop0;
}
}
_ => {}
}
}
fn with_block_offsets(&mut self, my_offset: CodeOffset, targets: &[CodeOffset]) {
match self {
&mut Inst::CondBrLowered { ref mut target, .. } => {
target.lower(targets, my_offset);
}
&mut Inst::CondBrLoweredCompound {
ref mut taken,
ref mut not_taken,
..
} => {
taken.lower(targets, my_offset);
not_taken.lower(targets, my_offset + 4);
}
&mut Inst::Jump { ref mut dest } => {
dest.lower(targets, my_offset);
}
&mut Inst::JTSequence {
targets: ref mut t, ..
} => {
for target in t.iter_mut() {
// offset+20: jumptable is 20 bytes into compound sequence.
target.lower(targets, my_offset + 20);
}
}
_ => {}
}
fn worst_case_size() -> CodeOffset {
// The maximum size, in bytes, of any `Inst`'s emitted code. We have at least one case of
// an 8-instruction sequence (saturating int-to-float conversions) with three embedded
// 64-bit f64 constants.
//
// Note that inline jump-tables handle island/pool insertion separately, so we do not need
// to account for them here (otherwise the worst case would be 2^31 * 4, clearly not
// feasible for other reasons).
44
}
}
@@ -2550,12 +2459,12 @@ impl ShowWithRRU for Inst {
}
}
}
&Inst::CondBrLowered {
&Inst::OneWayCondBr {
ref target,
ref kind,
} => {
let target = target.show_rru(mb_rru);
match &kind {
match kind {
&CondBrKind::Zero(reg) => {
let reg = reg.show_rru(mb_rru);
format!("cbz {}, {}", reg, target)
@@ -2570,30 +2479,15 @@ impl ShowWithRRU for Inst {
}
}
}
&Inst::CondBrLoweredCompound {
ref taken,
ref not_taken,
ref kind,
} => {
let first = Inst::CondBrLowered {
target: taken.clone(),
kind: kind.clone(),
};
let second = Inst::Jump {
dest: not_taken.clone(),
};
first.show_rru(mb_rru) + " ; " + &second.show_rru(mb_rru)
}
&Inst::IndirectBr { rn, .. } => {
let rn = rn.show_rru(mb_rru);
format!("br {}", rn)
}
&Inst::Brk => "brk #0".to_string(),
&Inst::Udf { .. } => "udf".to_string(),
&Inst::Adr { rd, ref label } => {
&Inst::Adr { rd, off } => {
let rd = rd.show_rru(mb_rru);
let label = label.show_rru(mb_rru);
format!("adr {}, {}", rd, label)
format!("adr {}, pc+{}", rd, off)
}
&Inst::Word4 { data } => format!("data.i32 {}", data),
&Inst::Word8 { data } => format!("data.i64 {}", data),
@@ -2683,15 +2577,134 @@ impl ShowWithRRU for Inst {
}
ret
}
&Inst::GetPinnedReg { rd } => {
let rd = rd.show_rru(mb_rru);
format!("get_pinned_reg {}", rd)
}
&Inst::SetPinnedReg { rm } => {
let rm = rm.show_rru(mb_rru);
format!("set_pinned_reg {}", rm)
}
&Inst::VirtualSPOffsetAdj { offset } => format!("virtual_sp_offset_adjust {}", offset),
&Inst::EmitIsland { needed_space } => format!("emit_island {}", needed_space),
}
}
}
//=============================================================================
// Label fixups and jump veneers.
/// Different forms of label references for different instruction formats.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum LabelUse {
/// 19-bit branch offset (conditional branches). PC-rel, offset is imm << 2. Immediate is 19
/// signed bits, in bits 23:5. Used by cbz, cbnz, b.cond.
Branch19,
/// 26-bit branch offset (unconditional branches). PC-rel, offset is imm << 2. Immediate is 26
/// signed bits, in bits 25:0. Used by b, bl.
Branch26,
/// 19-bit offset for LDR (load literal). PC-rel, offset is imm << 2. Immediate is 19 signed bits,
/// in bits 23:5.
Ldr19,
/// 21-bit offset for ADR (get address of label). PC-rel, offset is not shifted. Immediate is
/// 21 signed bits, with high 19 bits in bits 23:5 and low 2 bits in bits 30:29.
Adr21,
/// 32-bit PC relative constant offset (from address of constant itself). Used in jump tables.
PCRel32,
}
impl MachInstLabelUse for LabelUse {
/// Alignment for veneer code. Every AArch64 instruction must be 4-byte-aligned.
const ALIGN: CodeOffset = 4;
/// Maximum PC-relative range (positive), inclusive.
fn max_pos_range(self) -> CodeOffset {
match self {
// 19-bit immediate, left-shifted by 2, for 21 bits of total range. Signed, so +2^20
// from zero. Likewise for two other shifted cases below.
LabelUse::Branch19 => (1 << 20) - 1,
LabelUse::Branch26 => (1 << 27) - 1,
LabelUse::Ldr19 => (1 << 20) - 1,
// Adr does not shift its immediate, so the 21-bit immediate gives 21 bits of total
// range.
LabelUse::Adr21 => (1 << 20) - 1,
LabelUse::PCRel32 => 0x7fffffff,
}
}
/// Maximum PC-relative range (negative).
fn max_neg_range(self) -> CodeOffset {
// All forms are twos-complement signed offsets, so negative limit is one more than
// positive limit.
self.max_pos_range() + 1
}
/// Size of window into code needed to do the patch.
fn patch_size(self) -> CodeOffset {
// Patch is on one instruction only for all of these label reference types.
4
}
/// Perform the patch.
fn patch(self, buffer: &mut [u8], use_offset: CodeOffset, label_offset: CodeOffset) {
let pc_rel = (label_offset as i64) - (use_offset as i64);
debug_assert!(pc_rel <= self.max_pos_range() as i64);
debug_assert!(pc_rel >= -(self.max_neg_range() as i64));
let pc_rel = pc_rel as u32;
let insn_word = u32::from_le_bytes([buffer[0], buffer[1], buffer[2], buffer[3]]);
let mask = match self {
LabelUse::Branch19 => 0x00ffffe0, // bits 23..5 inclusive
LabelUse::Branch26 => 0x03ffffff, // bits 25..0 inclusive
LabelUse::Ldr19 => 0x00ffffe0, // bits 23..5 inclusive
LabelUse::Adr21 => 0x60ffffe0, // bits 30..29, 25..5 inclusive
LabelUse::PCRel32 => 0xffffffff,
};
let pc_rel_shifted = match self {
LabelUse::Adr21 | LabelUse::PCRel32 => pc_rel,
_ => {
debug_assert!(pc_rel & 3 == 0);
pc_rel >> 2
}
};
let pc_rel_inserted = match self {
LabelUse::Branch19 | LabelUse::Ldr19 => (pc_rel_shifted & 0x7ffff) << 5,
LabelUse::Branch26 => pc_rel_shifted & 0x3ffffff,
LabelUse::Adr21 => (pc_rel_shifted & 0x7ffff) << 5 | (pc_rel_shifted & 0x180000) << 10,
LabelUse::PCRel32 => pc_rel_shifted,
};
let is_add = match self {
LabelUse::PCRel32 => true,
_ => false,
};
let insn_word = if is_add {
insn_word.wrapping_add(pc_rel_inserted)
} else {
(insn_word & !mask) | pc_rel_inserted
};
buffer[0..4].clone_from_slice(&u32::to_le_bytes(insn_word));
}
/// Is a veneer supported for this label reference type?
fn supports_veneer(self) -> bool {
match self {
LabelUse::Branch19 => true, // veneer is a Branch26
_ => false,
}
}
/// How large is the veneer, if supported?
fn veneer_size(self) -> CodeOffset {
4
}
/// Generate a veneer into the buffer, given that this veneer is at `veneer_offset`, and return
/// an offset and label-use for the veneer's use of the original label.
fn generate_veneer(
self,
buffer: &mut [u8],
veneer_offset: CodeOffset,
) -> (CodeOffset, LabelUse) {
match self {
LabelUse::Branch19 => {
// veneer is a Branch26 (unconditional branch). Just encode directly here -- don't
// bother with constructing an Inst.
let insn_word = 0b000101 << 26;
buffer[0..4].clone_from_slice(&u32::to_le_bytes(insn_word));
(veneer_offset, LabelUse::Branch26)
}
_ => panic!("Unsupported label-reference type for veneer generation!"),
}
}
}


@@ -14,12 +14,14 @@ use crate::ir::Inst as IRInst;
use crate::ir::{InstructionData, Opcode, TrapCode, Type};
use crate::machinst::lower::*;
use crate::machinst::*;
use crate::CodegenResult;
use crate::isa::aarch64::inst::*;
use crate::isa::aarch64::AArch64Backend;
use super::lower_inst;
use log::debug;
use regalloc::{Reg, RegClass, Writable};
//============================================================================
@@ -104,18 +106,11 @@ pub(crate) enum ResultRegImmShift {
}
//============================================================================
// Instruction input and output "slots".
// Instruction input "slots".
//
// We use these types to refer to operand numbers, and result numbers, together
// with the associated instruction, in a type-safe way.
/// Identifier for a particular output of an instruction.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) struct InsnOutput {
pub(crate) insn: IRInst,
pub(crate) output: usize,
}
/// Identifier for a particular input of an instruction.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) struct InsnInput {
@@ -123,93 +118,28 @@ pub(crate) struct InsnInput {
pub(crate) input: usize,
}
/// Producer of a value: either a previous instruction's output, or a register that will be
/// codegen'd separately.
/// Identifier for a particular output of an instruction.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) enum InsnInputSource {
Output(InsnOutput),
Reg(Reg),
}
impl InsnInputSource {
fn as_output(self) -> Option<InsnOutput> {
match self {
InsnInputSource::Output(o) => Some(o),
_ => None,
}
}
}
fn get_input<C: LowerCtx<I = Inst>>(ctx: &mut C, output: InsnOutput, num: usize) -> InsnInput {
assert!(num <= ctx.num_inputs(output.insn));
InsnInput {
insn: output.insn,
input: num,
}
}
/// Convert an instruction input to a producing instruction's output if possible (in same BB), or a
/// register otherwise.
fn input_source<C: LowerCtx<I = Inst>>(ctx: &mut C, input: InsnInput) -> InsnInputSource {
if let Some((input_inst, result_num)) = ctx.input_inst(input.insn, input.input) {
let out = InsnOutput {
insn: input_inst,
output: result_num,
};
InsnInputSource::Output(out)
} else {
let reg = ctx.input(input.insn, input.input);
InsnInputSource::Reg(reg)
}
pub(crate) struct InsnOutput {
pub(crate) insn: IRInst,
pub(crate) output: usize,
}
//============================================================================
// Lowering: convert instruction outputs to result types.
// Lowering: convert instruction inputs to forms that we can use.
/// Lower an instruction output to a 64-bit constant, if possible.
pub(crate) fn output_to_const<C: LowerCtx<I = Inst>>(ctx: &mut C, out: InsnOutput) -> Option<u64> {
if out.output > 0 {
None
} else {
let inst_data = ctx.data(out.insn);
if inst_data.opcode() == Opcode::Null {
Some(0)
} else {
match inst_data {
&InstructionData::UnaryImm { opcode: _, imm } => {
// Only has Into for i64; we use u64 elsewhere, so we cast.
let imm: i64 = imm.into();
Some(imm as u64)
}
&InstructionData::UnaryBool { opcode: _, imm } => Some(u64::from(imm)),
&InstructionData::UnaryIeee32 { opcode: _, imm } => Some(u64::from(imm.bits())),
&InstructionData::UnaryIeee64 { opcode: _, imm } => Some(imm.bits()),
_ => None,
}
}
}
/// Lower an instruction input to a 64-bit constant, if possible.
pub(crate) fn input_to_const<C: LowerCtx<I = Inst>>(ctx: &mut C, input: InsnInput) -> Option<u64> {
let input = ctx.get_input(input.insn, input.input);
input.constant
}
pub(crate) fn output_to_const_f32<C: LowerCtx<I = Inst>>(
/// Lower an instruction input to a constant register-shift amount, if possible.
pub(crate) fn input_to_shiftimm<C: LowerCtx<I = Inst>>(
ctx: &mut C,
out: InsnOutput,
) -> Option<f32> {
output_to_const(ctx, out).map(|value| f32::from_bits(value as u32))
}
pub(crate) fn output_to_const_f64<C: LowerCtx<I = Inst>>(
ctx: &mut C,
out: InsnOutput,
) -> Option<f64> {
output_to_const(ctx, out).map(|value| f64::from_bits(value))
}
/// Lower an instruction output to a constant register-shift amount, if possible.
pub(crate) fn output_to_shiftimm<C: LowerCtx<I = Inst>>(
ctx: &mut C,
out: InsnOutput,
input: InsnInput,
) -> Option<ShiftOpShiftImm> {
output_to_const(ctx, out).and_then(ShiftOpShiftImm::maybe_from_shift)
input_to_const(ctx, input).and_then(ShiftOpShiftImm::maybe_from_shift)
}
/// How to handle narrow values loaded into registers; see note on `narrow_mode`
@@ -237,9 +167,9 @@ impl NarrowValueMode {
}
}
/// Lower an instruction output to a reg.
/// Allocate a register for an instruction output and return it.
pub(crate) fn output_to_reg<C: LowerCtx<I = Inst>>(ctx: &mut C, out: InsnOutput) -> Writable<Reg> {
ctx.output(out.insn, out.output)
ctx.get_output(out.insn, out.output)
}
/// Lower an instruction input to a reg.
@@ -252,9 +182,22 @@ pub(crate) fn input_to_reg<C: LowerCtx<I = Inst>>(
input: InsnInput,
narrow_mode: NarrowValueMode,
) -> Reg {
debug!("input_to_reg: input {:?}", input);
let ty = ctx.input_ty(input.insn, input.input);
let from_bits = ty_bits(ty) as u8;
let in_reg = ctx.input(input.insn, input.input);
let inputs = ctx.get_input(input.insn, input.input);
let in_reg = if let Some(c) = inputs.constant {
// Generate constants fresh at each use to minimize long-range register pressure.
let to_reg = ctx.tmp(Inst::rc_for_type(ty).unwrap(), ty);
for inst in Inst::gen_constant(to_reg, c, ty).into_iter() {
ctx.emit(inst);
}
to_reg.to_reg()
} else {
ctx.use_input_reg(inputs);
inputs.reg
};
match (narrow_mode, from_bits) {
(NarrowValueMode::None, _) => in_reg,
(NarrowValueMode::ZeroExtend32, n) if n < 32 => {
@@ -282,15 +225,20 @@ pub(crate) fn input_to_reg<C: LowerCtx<I = Inst>>(
(NarrowValueMode::ZeroExtend32, 32) | (NarrowValueMode::SignExtend32, 32) => in_reg,
(NarrowValueMode::ZeroExtend64, n) if n < 64 => {
let tmp = ctx.tmp(RegClass::I64, I32);
ctx.emit(Inst::Extend {
rd: tmp,
rn: in_reg,
signed: false,
from_bits,
to_bits: 64,
});
tmp.to_reg()
if inputs.constant.is_some() {
// Constants are zero-extended to full 64-bit width on load already.
in_reg
} else {
let tmp = ctx.tmp(RegClass::I64, I32);
ctx.emit(Inst::Extend {
rd: tmp,
rn: in_reg,
signed: false,
from_bits,
to_bits: 64,
});
tmp.to_reg()
}
}
(NarrowValueMode::SignExtend64, n) if n < 64 => {
let tmp = ctx.tmp(RegClass::I64, I32);
@@ -313,8 +261,6 @@ pub(crate) fn input_to_reg<C: LowerCtx<I = Inst>>(
}
/// Lower an instruction input to a reg or reg/shift, or reg/extend operand.
/// This does not actually codegen the source instruction; it just uses the
/// vreg into which the source instruction will generate its value.
///
/// The `narrow_mode` flag indicates whether the consumer of this value needs
/// the high bits clear. For many operations, such as an add/sub/mul or any
@@ -330,23 +276,18 @@ fn input_to_rs<C: LowerCtx<I = Inst>>(
input: InsnInput,
narrow_mode: NarrowValueMode,
) -> ResultRS {
if let InsnInputSource::Output(out) = input_source(ctx, input) {
let insn = out.insn;
assert!(out.output <= ctx.num_outputs(insn));
let inputs = ctx.get_input(input.insn, input.input);
if let Some((insn, 0)) = inputs.inst {
let op = ctx.data(insn).opcode();
if op == Opcode::Ishl {
let shiftee = get_input(ctx, out, 0);
let shift_amt = get_input(ctx, out, 1);
let shiftee = InsnInput { insn, input: 0 };
let shift_amt = InsnInput { insn, input: 1 };
// Can we get the shift amount as an immediate?
if let Some(shift_amt_out) = input_source(ctx, shift_amt).as_output() {
if let Some(shiftimm) = output_to_shiftimm(ctx, shift_amt_out) {
let reg = input_to_reg(ctx, shiftee, narrow_mode);
ctx.merged(insn);
ctx.merged(shift_amt_out.insn);
return ResultRS::RegShift(reg, ShiftOpAndAmt::new(ShiftOp::LSL, shiftimm));
}
if let Some(shiftimm) = input_to_shiftimm(ctx, shift_amt) {
let reg = input_to_reg(ctx, shiftee, narrow_mode);
return ResultRS::RegShift(reg, ShiftOpAndAmt::new(ShiftOp::LSL, shiftimm));
}
}
}
@@ -364,11 +305,10 @@ fn input_to_rse<C: LowerCtx<I = Inst>>(
input: InsnInput,
narrow_mode: NarrowValueMode,
) -> ResultRSE {
if let InsnInputSource::Output(out) = input_source(ctx, input) {
let insn = out.insn;
assert!(out.output <= ctx.num_outputs(insn));
let inputs = ctx.get_input(input.insn, input.input);
if let Some((insn, 0)) = inputs.inst {
let op = ctx.data(insn).opcode();
let out_ty = ctx.output_ty(insn, out.output);
let out_ty = ctx.output_ty(insn, 0);
let out_bits = ty_bits(out_ty);
// If `out_ty` is smaller than 32 bits and we need to zero- or sign-extend,
@@ -378,7 +318,7 @@ fn input_to_rse<C: LowerCtx<I = Inst>>(
&& ((narrow_mode.is_32bit() && out_bits < 32)
|| (!narrow_mode.is_32bit() && out_bits < 64))
{
let reg = output_to_reg(ctx, out);
let reg = input_to_reg(ctx, InsnInput { insn, input: 0 }, NarrowValueMode::None);
let extendop = match (narrow_mode, out_bits) {
(NarrowValueMode::SignExtend32, 1) | (NarrowValueMode::SignExtend64, 1) => {
ExtendOp::SXTB
@@ -402,15 +342,14 @@ fn input_to_rse<C: LowerCtx<I = Inst>>(
(NarrowValueMode::ZeroExtend64, 32) => ExtendOp::UXTW,
_ => unreachable!(),
};
return ResultRSE::RegExtend(reg.to_reg(), extendop);
return ResultRSE::RegExtend(reg, extendop);
}
// Is this a zero-extend or sign-extend and can we handle that with a register-mode operator?
if op == Opcode::Uextend || op == Opcode::Sextend {
assert!(out_bits == 32 || out_bits == 64);
let sign_extend = op == Opcode::Sextend;
let extendee = get_input(ctx, out, 0);
let inner_ty = ctx.input_ty(extendee.insn, extendee.input);
let inner_ty = ctx.input_ty(insn, 0);
let inner_bits = ty_bits(inner_ty);
assert!(inner_bits < out_bits);
let extendop = match (sign_extend, inner_bits) {
@@ -424,8 +363,7 @@ fn input_to_rse<C: LowerCtx<I = Inst>>(
(false, 32) => ExtendOp::UXTW,
_ => unreachable!(),
};
let reg = input_to_reg(ctx, extendee, NarrowValueMode::None);
ctx.merged(insn);
let reg = input_to_reg(ctx, InsnInput { insn, input: 0 }, NarrowValueMode::None);
return ResultRSE::RegExtend(reg, extendop);
}
}
@@ -438,12 +376,9 @@ pub(crate) fn input_to_rse_imm12<C: LowerCtx<I = Inst>>(
input: InsnInput,
narrow_mode: NarrowValueMode,
) -> ResultRSEImm12 {
if let InsnInputSource::Output(out) = input_source(ctx, input) {
if let Some(imm_value) = output_to_const(ctx, out) {
if let Some(i) = Imm12::maybe_from_u64(imm_value) {
ctx.merged(out.insn);
return ResultRSEImm12::Imm12(i);
}
if let Some(imm_value) = input_to_const(ctx, input) {
if let Some(i) = Imm12::maybe_from_u64(imm_value) {
return ResultRSEImm12::Imm12(i);
}
}
@@ -455,14 +390,11 @@ pub(crate) fn input_to_rs_immlogic<C: LowerCtx<I = Inst>>(
input: InsnInput,
narrow_mode: NarrowValueMode,
) -> ResultRSImmLogic {
if let InsnInputSource::Output(out) = input_source(ctx, input) {
if let Some(imm_value) = output_to_const(ctx, out) {
let ty = ctx.output_ty(out.insn, out.output);
let ty = if ty_bits(ty) < 32 { I32 } else { ty };
if let Some(i) = ImmLogic::maybe_from_u64(imm_value, ty) {
ctx.merged(out.insn);
return ResultRSImmLogic::ImmLogic(i);
}
if let Some(imm_value) = input_to_const(ctx, input) {
let ty = ctx.input_ty(input.insn, input.input);
let ty = if ty_bits(ty) < 32 { I32 } else { ty };
if let Some(i) = ImmLogic::maybe_from_u64(imm_value, ty) {
return ResultRSImmLogic::ImmLogic(i);
}
}
@@ -473,12 +405,9 @@ pub(crate) fn input_to_reg_immshift<C: LowerCtx<I = Inst>>(
ctx: &mut C,
input: InsnInput,
) -> ResultRegImmShift {
if let InsnInputSource::Output(out) = input_source(ctx, input) {
if let Some(imm_value) = output_to_const(ctx, out) {
if let Some(immshift) = ImmShift::maybe_from_u64(imm_value) {
ctx.merged(out.insn);
return ResultRegImmShift::ImmShift(immshift);
}
if let Some(imm_value) = input_to_const(ctx, input) {
if let Some(immshift) = ImmShift::maybe_from_u64(imm_value) {
return ResultRegImmShift::ImmShift(immshift);
}
}
@@ -823,24 +752,29 @@ pub(crate) fn inst_trapcode(data: &InstructionData) -> Option<TrapCode> {
}
}
/// Checks for an instance of `op` feeding the given input. Marks as merged (decrementing refcount) if so.
/// Checks for an instance of `op` feeding the given input.
pub(crate) fn maybe_input_insn<C: LowerCtx<I = Inst>>(
c: &mut C,
input: InsnInput,
op: Opcode,
) -> Option<IRInst> {
if let InsnInputSource::Output(out) = input_source(c, input) {
let data = c.data(out.insn);
let inputs = c.get_input(input.insn, input.input);
debug!(
"maybe_input_insn: input {:?} has options {:?}; looking for op {:?}",
input, inputs, op
);
if let Some((src_inst, _)) = inputs.inst {
let data = c.data(src_inst);
debug!(" -> input inst {:?}", data);
if data.opcode() == op {
c.merged(out.insn);
return Some(out.insn);
return Some(src_inst);
}
}
None
}
/// Checks for an instance of `op` feeding the given input, possibly via a conversion `conv` (e.g.,
/// Bint or a bitcast). Marks one or both as merged if so, as appropriate.
/// Bint or a bitcast).
///
/// FIXME cfallin 2020-03-30: this is really ugly. Factor out tree-matching stuff and make it
/// a bit more generic.
@@ -850,21 +784,19 @@ pub(crate) fn maybe_input_insn_via_conv<C: LowerCtx<I = Inst>>(
op: Opcode,
conv: Opcode,
) -> Option<IRInst> {
if let Some(ret) = maybe_input_insn(c, input, op) {
return Some(ret);
}
if let InsnInputSource::Output(out) = input_source(c, input) {
let data = c.data(out.insn);
let inputs = c.get_input(input.insn, input.input);
if let Some((src_inst, _)) = inputs.inst {
let data = c.data(src_inst);
if data.opcode() == op {
return Some(src_inst);
}
if data.opcode() == conv {
let conv_insn = out.insn;
let conv_input = InsnInput {
insn: conv_insn,
input: 0,
};
if let Some(inner) = maybe_input_insn(c, conv_input, op) {
c.merged(conv_insn);
return Some(inner);
let inputs = c.get_input(src_inst, 0);
if let Some((src_inst, _)) = inputs.inst {
let data = c.data(src_inst);
if data.opcode() == op {
return Some(src_inst);
}
}
}
}
@@ -876,6 +808,7 @@ pub(crate) fn lower_icmp_or_ifcmp_to_flags<C: LowerCtx<I = Inst>>(
insn: IRInst,
is_signed: bool,
) {
debug!("lower_icmp_or_ifcmp_to_flags: insn {}", insn);
let ty = ctx.input_ty(insn, 0);
let bits = ty_bits(ty);
let narrow_mode = match (bits <= 32, is_signed) {
@@ -897,6 +830,7 @@ pub(crate) fn lower_icmp_or_ifcmp_to_flags<C: LowerCtx<I = Inst>>(
let ty = ctx.input_ty(insn, 0);
let rn = input_to_reg(ctx, inputs[0], narrow_mode);
let rm = input_to_rse_imm12(ctx, inputs[1], narrow_mode);
debug!("lower_icmp_or_ifcmp_to_flags: rn = {:?} rm = {:?}", rn, rm);
let alu_op = choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64);
let rd = writable_zero_reg();
ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
@@ -934,17 +868,21 @@ pub(crate) fn lower_fcmp_or_ffcmp_to_flags<C: LowerCtx<I = Inst>>(ctx: &mut C, i
impl LowerBackend for AArch64Backend {
type MInst = Inst;
fn lower<C: LowerCtx<I = Inst>>(&self, ctx: &mut C, ir_inst: IRInst) {
lower_inst::lower_insn_to_regs(ctx, ir_inst);
fn lower<C: LowerCtx<I = Inst>>(&self, ctx: &mut C, ir_inst: IRInst) -> CodegenResult<()> {
lower_inst::lower_insn_to_regs(ctx, ir_inst)
}
fn lower_branch_group<C: LowerCtx<I = Inst>>(
&self,
ctx: &mut C,
branches: &[IRInst],
targets: &[BlockIndex],
fallthrough: Option<BlockIndex>,
) {
targets: &[MachLabel],
fallthrough: Option<MachLabel>,
) -> CodegenResult<()> {
lower_inst::lower_branch(ctx, branches, targets, fallthrough)
}
fn maybe_pinned_reg(&self) -> Option<Reg> {
Some(xreg(PINNED_REG))
}
}


@@ -1,11 +1,13 @@
//! Lower a single Cranelift instruction into vcode.
use crate::binemit::CodeOffset;
use crate::ir::condcodes::FloatCC;
use crate::ir::types::*;
use crate::ir::Inst as IRInst;
use crate::ir::{InstructionData, Opcode, TrapCode};
use crate::machinst::lower::*;
use crate::machinst::*;
use crate::CodegenResult;
use crate::isa::aarch64::abi::*;
use crate::isa::aarch64::inst::*;
@@ -19,7 +21,10 @@ use smallvec::SmallVec;
use super::lower::*;
/// Actually codegen an instruction's results into registers.
pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRInst) {
pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
ctx: &mut C,
insn: IRInst,
) -> CodegenResult<()> {
let op = ctx.data(insn).opcode();
let inputs: SmallVec<[InsnInput; 4]> = (0..ctx.num_inputs(insn))
.map(|i| InsnInput { insn, input: i })
@@ -35,17 +40,17 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
match op {
Opcode::Iconst | Opcode::Bconst | Opcode::Null => {
let value = output_to_const(ctx, outputs[0]).unwrap();
let value = ctx.get_constant(insn).unwrap();
let rd = output_to_reg(ctx, outputs[0]);
lower_constant_u64(ctx, rd, value);
}
Opcode::F32const => {
let value = output_to_const_f32(ctx, outputs[0]).unwrap();
let value = f32::from_bits(ctx.get_constant(insn).unwrap() as u32);
let rd = output_to_reg(ctx, outputs[0]);
lower_constant_f32(ctx, rd, value);
}
Opcode::F64const => {
let value = output_to_const_f64(ctx, outputs[0]).unwrap();
let value = f64::from_bits(ctx.get_constant(insn).unwrap());
let rd = output_to_reg(ctx, outputs[0]);
lower_constant_f64(ctx, rd, value);
}
@@ -271,7 +276,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
// Check for divide by 0.
let branch_size = 8;
ctx.emit(Inst::CondBrLowered {
ctx.emit(Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(branch_size),
kind: CondBrKind::NotZero(rm),
});
@@ -297,7 +302,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
// Check for divide by 0.
let branch_size = 20;
ctx.emit(Inst::CondBrLowered {
ctx.emit(Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(branch_size),
kind: CondBrKind::Zero(rm),
});
@@ -324,7 +329,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
nzcv: NZCV::new(false, false, false, false),
cond: Cond::Eq,
});
ctx.emit(Inst::CondBrLowered {
ctx.emit(Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(12),
kind: CondBrKind::Cond(Cond::Vc),
});
@@ -337,7 +342,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
// Check for divide by 0.
let branch_size = 8;
ctx.emit(Inst::CondBrLowered {
ctx.emit(Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(branch_size),
kind: CondBrKind::NotZero(rm),
});
@@ -1211,7 +1216,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
// Branch around the break instruction with inverted cond. Go straight to lowered
// one-target form; this is logically part of a single-in single-out template lowering.
let cond = cond.invert();
ctx.emit(Inst::CondBrLowered {
ctx.emit(Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(8),
kind: CondBrKind::Cond(cond),
});
@@ -1301,11 +1306,12 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
Opcode::GetPinnedReg => {
let rd = output_to_reg(ctx, outputs[0]);
ctx.emit(Inst::GetPinnedReg { rd });
ctx.emit(Inst::mov(rd, xreg(PINNED_REG)));
}
Opcode::SetPinnedReg => {
let rm = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
ctx.emit(Inst::SetPinnedReg { rm });
ctx.emit(Inst::mov(writable_xreg(PINNED_REG), rm));
}
Opcode::Spill
@@ -1533,7 +1539,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
} else {
ctx.emit(Inst::FpuCmp64 { rn, rm: rn });
}
ctx.emit(Inst::CondBrLowered {
ctx.emit(Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(8),
kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::Ordered)),
});
@@ -1574,7 +1580,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
rn,
rm: tmp.to_reg(),
});
ctx.emit(Inst::CondBrLowered {
ctx.emit(Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(8),
kind: CondBrKind::Cond(lower_fp_condcode(low_cond)),
});
@@ -1587,7 +1593,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
rn,
rm: tmp.to_reg(),
});
ctx.emit(Inst::CondBrLowered {
ctx.emit(Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(8),
kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::LessThan)),
});
@@ -1617,7 +1623,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
rn,
rm: tmp.to_reg(),
});
ctx.emit(Inst::CondBrLowered {
ctx.emit(Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(8),
kind: CondBrKind::Cond(lower_fp_condcode(low_cond)),
});
@@ -1630,7 +1636,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
rn,
rm: tmp.to_reg(),
});
ctx.emit(Inst::CondBrLowered {
ctx.emit(Inst::OneWayCondBr {
target: BranchTarget::ResolvedOffset(8),
kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::LessThan)),
});
@@ -1862,14 +1868,16 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
Opcode::AvgRound => unimplemented!(),
Opcode::TlsValue => unimplemented!(),
}
Ok(())
}
pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
ctx: &mut C,
branches: &[IRInst],
targets: &[BlockIndex],
fallthrough: Option<BlockIndex>,
) {
targets: &[MachLabel],
fallthrough: Option<MachLabel>,
) -> CodegenResult<()> {
// A block should end with at most two branches. The first may be a
// conditional branch; a conditional branch can be followed only by an
// unconditional branch or fallthrough. Otherwise, if only one branch,
@@ -1883,18 +1891,14 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
let op0 = ctx.data(branches[0]).opcode();
let op1 = ctx.data(branches[1]).opcode();
//println!(
// "lowering two-branch group: opcodes are {:?} and {:?}",
// op0, op1
//);
assert!(op1 == Opcode::Jump || op1 == Opcode::Fallthrough);
let taken = BranchTarget::Block(targets[0]);
let taken = BranchTarget::Label(targets[0]);
let not_taken = match op1 {
Opcode::Jump => BranchTarget::Block(targets[1]),
Opcode::Fallthrough => BranchTarget::Block(fallthrough.unwrap()),
Opcode::Jump => BranchTarget::Label(targets[1]),
Opcode::Fallthrough => BranchTarget::Label(fallthrough.unwrap()),
_ => unreachable!(), // assert above.
};
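// `targets[0]` is always the conditional branch's taken target; the second
// branch (a jump or an explicit fallthrough) supplies the not-taken label,
// with `fallthrough` carrying the layout successor for the fallthrough case.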
match op0 {
Opcode::Brz | Opcode::Brnz => {
let flag_input = InsnInput {
@@ -1954,6 +1958,8 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
Opcode::BrIcmp => {
let condcode = inst_condcode(ctx.data(branches[0])).unwrap();
let cond = lower_condcode(condcode);
let kind = CondBrKind::Cond(cond);
let is_signed = condcode_is_signed(condcode);
let ty = ctx.input_ty(branches[0], 0);
let bits = ty_bits(ty);
@@ -1986,13 +1992,15 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
ctx.emit(Inst::CondBr {
taken,
not_taken,
kind: CondBrKind::Cond(cond),
kind,
});
}
Opcode::Brif => {
let condcode = inst_condcode(ctx.data(branches[0])).unwrap();
let cond = lower_condcode(condcode);
let kind = CondBrKind::Cond(cond);
let is_signed = condcode_is_signed(condcode);
let flag_input = InsnInput {
insn: branches[0],
@@ -2003,7 +2011,7 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
ctx.emit(Inst::CondBr {
taken,
not_taken,
kind: CondBrKind::Cond(cond),
kind,
});
} else {
// If the ifcmp result is actually placed in a
@@ -2013,7 +2021,7 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
ctx.emit(Inst::CondBr {
taken,
not_taken,
kind: CondBrKind::Cond(cond),
kind,
});
}
}
@@ -2021,6 +2029,7 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
Opcode::Brff => {
let condcode = inst_fp_condcode(ctx.data(branches[0])).unwrap();
let cond = lower_fp_condcode(condcode);
let kind = CondBrKind::Cond(cond);
let flag_input = InsnInput {
insn: branches[0],
input: 0,
@@ -2030,7 +2039,7 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
ctx.emit(Inst::CondBr {
taken,
not_taken,
kind: CondBrKind::Cond(cond),
kind,
});
} else {
// If the ffcmp result is actually placed in a
@@ -2040,7 +2049,7 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
ctx.emit(Inst::CondBr {
taken,
not_taken,
kind: CondBrKind::Cond(cond),
kind,
});
}
}
@@ -2057,12 +2066,13 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
// fills in `targets[0]` with our fallthrough block, so this
// is valid for both Jump and Fallthrough.
ctx.emit(Inst::Jump {
dest: BranchTarget::Block(targets[0]),
dest: BranchTarget::Label(targets[0]),
});
}
Opcode::BrTable => {
// Expand `br_table index, default, JT` to:
//
// (emit island with guard jump if needed)
// subs idx, #jt_size
// b.hs default
// adr vTmp1, PC+16
@@ -2072,6 +2082,11 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
// [jumptable offsets relative to JT base]
let jt_size = targets.len() - 1;
assert!(jt_size <= std::u32::MAX as usize);
ctx.emit(Inst::EmitIsland {
needed_space: 4 * (6 + jt_size) as CodeOffset,
});
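// The island guard reserves space for the whole expansion sketched above:
// roughly six fixed 4-byte instructions plus one 32-bit entry per
// jump-table target, hence 4 * (6 + jt_size) bytes.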
let ridx = input_to_reg(
ctx,
InsnInput {
@@ -2101,10 +2116,10 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
rm: rtmp1.to_reg(),
});
}
let default_target = BranchTarget::Block(targets[0]);
ctx.emit(Inst::CondBrLowered {
kind: CondBrKind::Cond(Cond::Hs), // unsigned >=
let default_target = BranchTarget::Label(targets[0]);
ctx.emit(Inst::OneWayCondBr {
target: default_target.clone(),
kind: CondBrKind::Cond(Cond::Hs), // unsigned >=
});
// Emit the compound instruction that does:
@@ -2125,9 +2140,9 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
let jt_targets: Vec<BranchTarget> = targets
.iter()
.skip(1)
.map(|bix| BranchTarget::Block(*bix))
.map(|bix| BranchTarget::Label(*bix))
.collect();
let targets_for_term: Vec<BlockIndex> = targets.to_vec();
let targets_for_term: Vec<MachLabel> = targets.to_vec();
ctx.emit(Inst::JTSequence {
ridx,
rtmp1,
@@ -2140,4 +2155,6 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
_ => panic!("Unknown branch type!"),
}
}
Ok(())
}
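
Note on the `BranchTarget` values used throughout this lowering: the uses above
suggest roughly the following shape. This is a sketch inferred from the diff,
not the actual definition (which presumably lives in the aarch64 `inst` module
and may differ in variant names or in the offset's integer type):

    // Sketch only: a target is either an unresolved MachLabel, fixed up
    // later by the MachBuffer, or a byte offset already known at lowering
    // time (e.g. "skip over the trap that follows").
    pub enum BranchTarget {
        Label(MachLabel),
        ResolvedOffset(isize),
    }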

View File

@@ -15,7 +15,7 @@ use target_lexicon::{Aarch64Architecture, Architecture, Triple};
// New backend:
mod abi;
mod inst;
pub(crate) mod inst;
mod lower;
mod lower_inst;
@@ -59,7 +59,7 @@ impl MachBackend for AArch64Backend {
) -> CodegenResult<MachCompileResult> {
let flags = self.flags();
let vcode = self.compile_vcode(func, flags.clone())?;
let sections = vcode.emit();
let buffer = vcode.emit();
let frame_size = vcode.frame_size();
let disasm = if want_disasm {
@@ -68,8 +68,10 @@ impl MachBackend for AArch64Backend {
None
};
let buffer = buffer.finish();
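// `finish()` consumes the `MachBuffer` and yields its finalized form, whose
// `data` field holds the emitted bytes; the golden tests below read it directly.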
Ok(MachCompileResult {
sections,
buffer,
frame_size,
disasm,
})
@@ -140,8 +142,8 @@ mod test {
Triple::from_str("aarch64").unwrap(),
settings::Flags::new(shared_flags),
);
let sections = backend.compile_function(&mut func, false).unwrap().sections;
let code = &sections.sections[0].data;
let buffer = backend.compile_function(&mut func, false).unwrap().buffer;
let code = &buffer.data[..];
// stp x29, x30, [sp, #-16]!
// mov x29, sp
@@ -155,7 +157,7 @@ mod test {
0x01, 0x0b, 0xbf, 0x03, 0x00, 0x91, 0xfd, 0x7b, 0xc1, 0xa8, 0xc0, 0x03, 0x5f, 0xd6,
];
assert_eq!(code, &golden);
assert_eq!(code, &golden[..]);
}
#[test]
@@ -198,34 +200,32 @@ mod test {
let result = backend
.compile_function(&mut func, /* want_disasm = */ false)
.unwrap();
let code = &result.sections.sections[0].data;
let code = &result.buffer.data[..];
// stp x29, x30, [sp, #-16]!
// mov x29, sp
// mov x1, x0
// mov x0, #0x1234
// add w1, w1, w0
// mov w2, w1
// cbz x2, ...
// mov w2, w1
// cbz x2, ...
// sub w0, w1, w0
// mov x1, #0x1234 // #4660
// add w0, w0, w1
// mov w1, w0
// cbnz x1, 0x28
// mov x1, #0x1234 // #4660
// add w1, w0, w1
// mov w1, w1
// cbnz x1, 0x18
// mov w1, w0
// cbnz x1, 0x18
// mov x1, #0x1234 // #4660
// sub w0, w0, w1
// mov sp, x29
// ldp x29, x30, [sp], #16
// ret
// add w2, w1, w0
// mov w2, w2
// cbnz x2, ... <---- compound branch (cond / uncond)
// b ... <----
let golden = vec![
0xfd, 0x7b, 0xbf, 0xa9, 0xfd, 0x03, 0x00, 0x91, 0xe1, 0x03, 0x00, 0xaa, 0x80, 0x46,
0x82, 0xd2, 0x21, 0x00, 0x00, 0x0b, 0xe2, 0x03, 0x01, 0x2a, 0xe2, 0x00, 0x00, 0xb4,
0xe2, 0x03, 0x01, 0x2a, 0xa2, 0x00, 0x00, 0xb5, 0x20, 0x00, 0x00, 0x4b, 0xbf, 0x03,
0x00, 0x91, 0xfd, 0x7b, 0xc1, 0xa8, 0xc0, 0x03, 0x5f, 0xd6, 0x22, 0x00, 0x00, 0x0b,
0xe2, 0x03, 0x02, 0x2a, 0xc2, 0xff, 0xff, 0xb5, 0xf7, 0xff, 0xff, 0x17,
253, 123, 191, 169, 253, 3, 0, 145, 129, 70, 130, 210, 0, 0, 1, 11, 225, 3, 0, 42, 161,
0, 0, 181, 129, 70, 130, 210, 1, 0, 1, 11, 225, 3, 1, 42, 161, 255, 255, 181, 225, 3,
0, 42, 97, 255, 255, 181, 129, 70, 130, 210, 0, 0, 1, 75, 191, 3, 0, 145, 253, 123,
193, 168, 192, 3, 95, 214,
];
assert_eq!(code, &golden);
assert_eq!(code, &golden[..]);
}
}

View File

@@ -77,14 +77,14 @@ mod riscv;
#[cfg(feature = "x86")]
mod x86;
#[cfg(feature = "x64")]
mod x64;
//#[cfg(feature = "x64")]
//mod x64;
#[cfg(feature = "arm32")]
mod arm32;
#[cfg(feature = "arm64")]
mod aarch64;
pub(crate) mod aarch64;
#[cfg(feature = "unwind")]
pub mod unwind;

View File

@@ -57,11 +57,11 @@ fn isa_constructor(
let isa_flags = settings::Flags::new(&shared_flags, builder);
if isa_flags.use_new_backend() {
#[cfg(not(feature = "x64"))]
//#[cfg(not(feature = "x64"))]
panic!("new backend x86 support not included by cargo features!");
#[cfg(feature = "x64")]
super::x64::isa_builder(triple).finish(shared_flags)
//#[cfg(feature = "x64")]
//super::x64::isa_builder(triple).finish(shared_flags)
} else {
Box::new(Isa {
triple,