Merge pull request #1494 from cfallin/arm64-merge

Add new `MachInst` backend and ARM64 support.
Chris Fallin
2020-04-16 10:02:02 -07:00
committed by GitHub
63 changed files with 16668 additions and 322 deletions

Cargo.lock (generated)

@@ -379,6 +379,7 @@ dependencies = [
"gimli",
"hashbrown 0.7.1",
"log",
"regalloc",
"serde",
"smallvec",
"target-lexicon",
@@ -432,6 +433,7 @@ dependencies = [
"memmap",
"num_cpus",
"region",
"target-lexicon",
]
[[package]]
@@ -1589,6 +1591,16 @@ dependencies = [
"rust-argon2",
]
[[package]]
name = "regalloc"
version = "0.0.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "89ce0cd835fa6e91bbf5d010beee19d0c2e97e4ad5e13c399a31122cfc83bdd6"
dependencies = [
"log",
"rustc-hash",
]
[[package]]
name = "regex"
version = "1.3.6"
@@ -1653,6 +1665,12 @@ version = "0.1.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c691c0e608126e00913e33f0ccf3727d5fc84573623b8d65b2df340b5201783"
[[package]]
name = "rustc-hash"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
[[package]]
name = "rustc_version"
version = "0.2.3"

@@ -24,6 +24,7 @@ gimli = { version = "0.20.0", default-features = false, features = ["write"], op
smallvec = { version = "1.0.0" }
thiserror = "1.0.4"
byteorder = { version = "1.3.2", default-features = false }
regalloc = "0.0.17"
# It is a goal of the cranelift-codegen crate to have minimal external dependencies.
# Please don't add any unless they are essential to the task of creating binary
# machine code. Integration tests that need external dependencies can be

@@ -54,7 +54,9 @@ pub(crate) fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa {
let mut a64 = CpuMode::new("A64");
// TODO refine these.
let expand_flags = shared_defs.transform_groups.by_name("expand_flags");
let narrow_flags = shared_defs.transform_groups.by_name("narrow_flags");
a64.legalize_monomorphic(expand_flags);
a64.legalize_default(narrow_flags);
let cpu_modes = vec![a64];

@@ -54,7 +54,9 @@ pub enum Reloc {
X86GOTPCRel4,
/// Arm32 call target
Arm32Call,
/// Arm64 call target
/// Arm64 call target. Encoded as bottom 26 bits of instruction. This
/// value is sign-extended, multiplied by 4, and added to the PC of
/// the call instruction to form the destination address.
Arm64Call,
/// RISC-V call target
RiscvCall,
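
As a minimal sketch of the rule above (illustration only, not part of this patch; the helper name is hypothetical), a loader would resolve an `Arm64Call` relocation against a `bl` instruction roughly like this:

fn apply_arm64_call(insn: u32, call_pc: u64, target: u64) -> u32 {
    // target = call_pc + sign_extend(imm26) * 4, so imm26 = (target - call_pc) / 4.
    let delta = (target as i64).wrapping_sub(call_pc as i64);
    assert_eq!(delta & 3, 0, "branch target must be 4-byte aligned");
    let words = delta >> 2;
    assert!(words >= -(1 << 25) && words < (1 << 25), "out of +/-128 MiB range");
    // Keep the opcode bits (31..26); splice the word offset into the low 26 bits.
    (insn & !0x03ff_ffff) | ((words as u32) & 0x03ff_ffff)
}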

@@ -19,8 +19,10 @@ use crate::flowgraph::ControlFlowGraph;
use crate::ir::Function;
use crate::isa::TargetIsa;
use crate::legalize_function;
use crate::legalizer::simple_legalize;
use crate::licm::do_licm;
use crate::loop_analysis::LoopAnalysis;
use crate::machinst::MachCompileResult;
use crate::nan_canonicalization::do_nan_canonicalization;
use crate::postopt::do_postopt;
use crate::redundant_reload_remover::RedundantReloadRemover;
@@ -55,6 +57,12 @@ pub struct Context {
/// Redundant-reload remover context.
pub redundant_reload_remover: RedundantReloadRemover,
/// Result of MachBackend compilation, if computed.
pub mach_compile_result: Option<MachCompileResult>,
/// Flag: do we want a disassembly with the MachCompileResult?
pub want_disasm: bool,
}
impl Context {
@@ -78,6 +86,8 @@ impl Context {
regalloc: regalloc::Context::new(),
loop_analysis: LoopAnalysis::new(),
redundant_reload_remover: RedundantReloadRemover::new(),
mach_compile_result: None,
want_disasm: false,
}
}
@@ -89,6 +99,14 @@ impl Context {
self.regalloc.clear();
self.loop_analysis.clear();
self.redundant_reload_remover.clear();
self.mach_compile_result = None;
self.want_disasm = false;
}
/// Set the flag to request a disassembly when compiling with a
/// `MachBackend` backend.
pub fn set_disasm(&mut self, val: bool) {
self.want_disasm = val;
}
/// Compile the function, and emit machine code into a `Vec<u8>`.
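
A sketch of how a consumer might use the new flag (illustration only; `compile_with_disasm` is a hypothetical helper, and `set_disasm`, `mach_compile_result`, and `get_mach_backend` are the additions in this patch):

use cranelift_codegen::{ir, isa::TargetIsa, CodegenResult, Context};

fn compile_with_disasm(isa: &dyn TargetIsa, func: ir::Function) -> CodegenResult<()> {
    let mut ctx = Context::for_function(func);
    // Ask a `MachBackend` (e.g. the new AArch64 backend) to retain a disassembly.
    ctx.set_disasm(true);
    let info = ctx.compile(isa)?;
    // When a MachBackend handled the function, its output lives on the context
    // rather than in the legacy per-instruction encodings.
    debug_assert!(ctx.mach_compile_result.is_some() || isa.get_mach_backend().is_none());
    println!("compiled {} bytes", info.total_size);
    Ok(())
}
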
@@ -130,9 +148,13 @@ impl Context {
pub fn compile(&mut self, isa: &dyn TargetIsa) -> CodegenResult<CodeInfo> {
let _tt = timing::compile();
self.verify_if(isa)?;
debug!("Compiling:\n{}", self.func.display(isa));
let opt_level = isa.flags().opt_level();
debug!(
"Compiling (opt level {:?}):\n{}",
opt_level,
self.func.display(isa)
);
self.compute_cfg();
if opt_level != OptLevel::None {
@@ -141,6 +163,7 @@ impl Context {
if isa.flags().enable_nan_canonicalization() {
self.canonicalize_nans(isa)?;
}
self.legalize(isa)?;
if opt_level != OptLevel::None {
self.postopt(isa)?;
@@ -149,23 +172,32 @@ impl Context {
self.licm(isa)?;
self.simple_gvn(isa)?;
}
self.compute_domtree();
self.eliminate_unreachable_code(isa)?;
if opt_level != OptLevel::None {
self.dce(isa)?;
}
self.regalloc(isa)?;
self.prologue_epilogue(isa)?;
if opt_level == OptLevel::Speed || opt_level == OptLevel::SpeedAndSize {
self.redundant_reload_remover(isa)?;
}
if opt_level == OptLevel::SpeedAndSize {
self.shrink_instructions(isa)?;
}
let result = self.relax_branches(isa);
debug!("Compiled:\n{}", self.func.display(isa));
result
if let Some(backend) = isa.get_mach_backend() {
let result = backend.compile_function(&mut self.func, self.want_disasm)?;
let info = result.code_info();
self.mach_compile_result = Some(result);
Ok(info)
} else {
self.regalloc(isa)?;
self.prologue_epilogue(isa)?;
if opt_level == OptLevel::Speed || opt_level == OptLevel::SpeedAndSize {
self.redundant_reload_remover(isa)?;
}
if opt_level == OptLevel::SpeedAndSize {
self.shrink_instructions(isa)?;
}
let result = self.relax_branches(isa);
debug!("Compiled:\n{}", self.func.display(isa));
result
}
}
/// Emit machine code directly into raw memory.
@@ -191,7 +223,11 @@ impl Context {
) -> CodeInfo {
let _tt = timing::binemit();
let mut sink = MemoryCodeSink::new(mem, relocs, traps, stackmaps);
isa.emit_function_to_memory(&self.func, &mut sink);
if let Some(ref result) = &self.mach_compile_result {
result.sections.emit(&mut sink);
} else {
isa.emit_function_to_memory(&self.func, &mut sink);
}
sink.info
}
@@ -279,9 +315,15 @@ impl Context {
// TODO: Avoid doing this when legalization doesn't actually mutate the CFG.
self.domtree.clear();
self.loop_analysis.clear();
legalize_function(&mut self.func, &mut self.cfg, isa);
debug!("Legalized:\n{}", self.func.display(isa));
self.verify_if(isa)
if isa.get_mach_backend().is_some() {
// Run some specific legalizations only.
simple_legalize(&mut self.func, &mut self.cfg, isa);
self.verify_if(isa)
} else {
legalize_function(&mut self.func, &mut self.cfg, isa);
debug!("Legalized:\n{}", self.func.display(isa));
self.verify_if(isa)
}
}
/// Perform post-legalization rewrites on the function.

@@ -6,40 +6,10 @@
use crate::cursor::{Cursor, FuncCursor};
use crate::dominator_tree::DominatorTree;
use crate::entity::EntityRef;
use crate::ir::instructions::InstructionData;
use crate::ir::{DataFlowGraph, Function, Inst, Opcode};
use crate::inst_predicates::{any_inst_results_used, has_side_effect};
use crate::ir::Function;
use crate::timing;
/// Test whether the given opcode is unsafe to even consider for DCE.
fn trivially_unsafe_for_dce(opcode: Opcode) -> bool {
opcode.is_call()
|| opcode.is_branch()
|| opcode.is_terminator()
|| opcode.is_return()
|| opcode.can_trap()
|| opcode.other_side_effects()
|| opcode.can_store()
}
/// Preserve instructions with used result values.
fn any_inst_results_used(inst: Inst, live: &[bool], dfg: &DataFlowGraph) -> bool {
dfg.inst_results(inst).iter().any(|v| live[v.index()])
}
/// Load instructions without the `notrap` flag are defined to trap when
/// operating on inaccessible memory, so we can't DCE them even if the
/// loaded value is unused.
fn is_load_with_defined_trapping(opcode: Opcode, data: &InstructionData) -> bool {
if !opcode.can_load() {
return false;
}
match *data {
InstructionData::StackLoad { .. } => false,
InstructionData::Load { flags, .. } => !flags.notrap(),
_ => true,
}
}
/// Perform DCE on `func`.
pub fn do_dce(func: &mut Function, domtree: &mut DominatorTree) {
let _tt = timing::dce();
@@ -50,10 +20,7 @@ pub fn do_dce(func: &mut Function, domtree: &mut DominatorTree) {
let mut pos = FuncCursor::new(func).at_bottom(block);
while let Some(inst) = pos.prev_inst() {
{
let data = &pos.func.dfg[inst];
let opcode = data.opcode();
if trivially_unsafe_for_dce(opcode)
|| is_load_with_defined_trapping(opcode, &data)
if has_side_effect(pos.func, inst)
|| any_inst_results_used(inst, &live, &pos.func.dfg)
{
for arg in pos.func.dfg.inst_args(inst) {

@@ -0,0 +1,42 @@
//! Instruction predicates/properties, shared by various analyses.
use crate::ir::{DataFlowGraph, Function, Inst, InstructionData, Opcode};
use cranelift_entity::EntityRef;
/// Preserve instructions with used result values.
pub fn any_inst_results_used(inst: Inst, live: &[bool], dfg: &DataFlowGraph) -> bool {
dfg.inst_results(inst).iter().any(|v| live[v.index()])
}
/// Test whether the given opcode is unsafe to even consider as side-effect-free.
fn trivially_has_side_effects(opcode: Opcode) -> bool {
opcode.is_call()
|| opcode.is_branch()
|| opcode.is_terminator()
|| opcode.is_return()
|| opcode.can_trap()
|| opcode.other_side_effects()
|| opcode.can_store()
}
/// Load instructions without the `notrap` flag are defined to trap when
/// operating on inaccessible memory, so we can't treat them as side-effect-free even if the loaded
/// value is unused.
fn is_load_with_defined_trapping(opcode: Opcode, data: &InstructionData) -> bool {
if !opcode.can_load() {
return false;
}
match *data {
InstructionData::StackLoad { .. } => false,
InstructionData::Load { flags, .. } => !flags.notrap(),
_ => true,
}
}
/// Does the given instruction have any side-effect that would preclude it from being removed when
/// its value is unused?
pub fn has_side_effect(func: &Function, inst: Inst) -> bool {
let data = &func.dfg[inst];
let opcode = data.opcode();
trivially_has_side_effects(opcode) || is_load_with_defined_trapping(opcode, data)
}

@@ -238,13 +238,21 @@ impl Function {
/// Wrapper around `encode` which assigns `inst` the resulting encoding.
pub fn update_encoding(&mut self, inst: ir::Inst, isa: &dyn TargetIsa) -> Result<(), Legalize> {
self.encode(inst, isa).map(|e| self.encodings[inst] = e)
if isa.get_mach_backend().is_some() {
Ok(())
} else {
self.encode(inst, isa).map(|e| self.encodings[inst] = e)
}
}
/// Wrapper around `TargetIsa::encode` for encoding an existing instruction
/// in the `Function`.
pub fn encode(&self, inst: ir::Inst, isa: &dyn TargetIsa) -> Result<Encoding, Legalize> {
isa.encode(&self, &self.dfg[inst], self.dfg.ctrl_typevar(inst))
if isa.get_mach_backend().is_some() {
Ok(Encoding::new(0, 0))
} else {
isa.encode(&self, &self.dfg[inst], self.dfg.ctrl_typevar(inst))
}
}
/// Starts collection of debug information.

@@ -57,6 +57,11 @@ impl Imm64 {
pub fn wrapping_neg(self) -> Self {
Self(self.0.wrapping_neg())
}
/// Return bits of this immediate.
pub fn bits(&self) -> i64 {
self.0
}
}
impl Into<i64> for Imm64 {

@@ -0,0 +1,885 @@
//! Implementation of the standard AArch64 ABI.
use crate::ir;
use crate::ir::types;
use crate::ir::types::*;
use crate::ir::StackSlot;
use crate::isa;
use crate::isa::aarch64::inst::*;
use crate::machinst::*;
use crate::settings;
use alloc::vec::Vec;
use regalloc::{RealReg, Reg, RegClass, Set, SpillSlot, Writable};
use log::debug;
/// A location for an argument or return value.
#[derive(Clone, Copy, Debug)]
enum ABIArg {
/// In a real register.
Reg(RealReg, ir::Type),
/// Arguments only: on stack, at given offset from SP at entry.
Stack(i64, ir::Type),
}
/// AArch64 ABI information shared between body (callee) and caller.
struct ABISig {
args: Vec<ABIArg>,
rets: Vec<ABIArg>,
stack_arg_space: i64,
call_conv: isa::CallConv,
}
// SpiderMonkey-specific ABI conventions.
/// This is SpiderMonkey's `WasmTableCallSigReg`.
static BALDRDASH_SIG_REG: u8 = 10;
/// This is SpiderMonkey's `WasmTlsReg`.
static BALDRDASH_TLS_REG: u8 = 23;
// These two lists represent the registers the JIT may *not* use at any point in generated code.
//
// So these are callee-preserved from the JIT's point of view, and every register not in this list
// has to be caller-preserved by definition.
//
// Keep these lists in sync with the NonAllocatableMask set in SpiderMonkey's
// Architecture-arm64.cpp.
// Indexed by physical register number.
#[rustfmt::skip]
static BALDRDASH_JIT_CALLEE_SAVED_GPR: &[bool] = &[
/* 0 = */ false, false, false, false, false, false, false, false,
/* 8 = */ false, false, false, false, false, false, false, false,
/* 16 = */ true /* x16 / ip1 */, true /* x17 / ip2 */, true /* x18 / TLS */, false,
/* 20 = */ false, false, false, false,
/* 24 = */ false, false, false, false,
// x28, the pseudo stack pointer, should also be in this list; however, the wasm
// stubs currently clobber it freely.
/* 28 = */ false, false, true /* x30 = FP */, true /* x31 = SP */
];
#[rustfmt::skip]
static BALDRDASH_JIT_CALLEE_SAVED_FPU: &[bool] = &[
/* 0 = */ false, false, false, false, false, false, false, false,
/* 8 = */ false, false, false, false, false, false, false, false,
/* 16 = */ false, false, false, false, false, false, false, false,
/* 24 = */ false, false, false, false, false, false, false, true /* v31 / d31 */
];
/// Try to fill a Baldrdash register, returning it if it was found.
fn try_fill_baldrdash_reg(call_conv: isa::CallConv, param: &ir::AbiParam) -> Option<ABIArg> {
if call_conv.extends_baldrdash() {
match &param.purpose {
&ir::ArgumentPurpose::VMContext => {
// This is SpiderMonkey's `WasmTlsReg`.
Some(ABIArg::Reg(
xreg(BALDRDASH_TLS_REG).to_real_reg(),
ir::types::I64,
))
}
&ir::ArgumentPurpose::SignatureId => {
// This is SpiderMonkey's `WasmTableCallSigReg`.
Some(ABIArg::Reg(
xreg(BALDRDASH_SIG_REG).to_real_reg(),
ir::types::I64,
))
}
_ => None,
}
} else {
None
}
}
/// Process a list of parameters or return values and allocate them to X-regs,
/// V-regs, and stack slots.
///
/// Returns the list of argument locations, and the stack-space used (rounded up
/// to a 16-byte-aligned boundary).
fn compute_arg_locs(call_conv: isa::CallConv, params: &[ir::AbiParam]) -> (Vec<ABIArg>, i64) {
// See the AArch64 ABI (https://c9x.me/compile/bib/abi-arm64.pdf), section 5.4.
let mut next_xreg = 0;
let mut next_vreg = 0;
let mut next_stack: u64 = 0;
let mut ret = vec![];
for param in params {
// Validate "purpose".
match &param.purpose {
&ir::ArgumentPurpose::VMContext
| &ir::ArgumentPurpose::Normal
| &ir::ArgumentPurpose::SignatureId => {}
_ => panic!(
"Unsupported argument purpose {:?} in signature: {:?}",
param.purpose, params
),
}
if in_int_reg(param.value_type) {
if let Some(param) = try_fill_baldrdash_reg(call_conv, param) {
ret.push(param);
} else if next_xreg < 8 {
ret.push(ABIArg::Reg(xreg(next_xreg).to_real_reg(), param.value_type));
next_xreg += 1;
} else {
ret.push(ABIArg::Stack(next_stack as i64, param.value_type));
next_stack += 8;
}
} else if in_vec_reg(param.value_type) {
if next_vreg < 8 {
ret.push(ABIArg::Reg(vreg(next_vreg).to_real_reg(), param.value_type));
next_vreg += 1;
} else {
let size: u64 = match param.value_type {
F32 | F64 => 8,
_ => panic!("Unsupported vector-reg argument type"),
};
// Align.
assert!(size.is_power_of_two());
next_stack = (next_stack + size - 1) & !(size - 1);
ret.push(ABIArg::Stack(next_stack as i64, param.value_type));
next_stack += size;
}
}
}
next_stack = (next_stack + 15) & !15;
(ret, next_stack as i64)
}
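// Illustration only (a hypothetical in-module test, not part of this patch):
// with the SystemV calling convention, the first eight integer arguments are
// assigned to x0..x7, the ninth spills to the stack at offset 0, and the
// returned stack space is rounded up to 16 bytes.
#[cfg(test)]
mod arg_locs_sketch {
    use super::*;
    #[test]
    fn nine_int_args() {
        let params: Vec<ir::AbiParam> = (0..9).map(|_| ir::AbiParam::new(types::I64)).collect();
        let (locs, stack_space) = compute_arg_locs(isa::CallConv::SystemV, &params);
        assert_eq!(locs.len(), 9);
        assert_eq!(stack_space, 16);
        match locs[8] {
            ABIArg::Stack(off, _) => assert_eq!(off, 0),
            _ => panic!("ninth I64 argument should be passed on the stack"),
        }
    }
}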
impl ABISig {
fn from_func_sig(sig: &ir::Signature) -> ABISig {
// Compute args and retvals from signature.
// TODO: pass in arg-mode or ret-mode. (Does not matter
// for the types of arguments/return values that we support.)
let (args, stack_arg_space) = compute_arg_locs(sig.call_conv, &sig.params);
let (rets, _) = compute_arg_locs(sig.call_conv, &sig.returns);
// Verify that there are no return values on the stack.
assert!(rets.iter().all(|a| match a {
&ABIArg::Stack(..) => false,
_ => true,
}));
ABISig {
args,
rets,
stack_arg_space,
call_conv: sig.call_conv,
}
}
}
/// AArch64 ABI object for a function body.
pub struct AArch64ABIBody {
/// signature: arg and retval regs
sig: ABISig,
/// offsets to each stackslot
stackslots: Vec<u32>,
/// total stack size of all stackslots
stackslots_size: u32,
/// clobbered registers, from regalloc.
clobbered: Set<Writable<RealReg>>,
/// total number of spillslots, from regalloc.
spillslots: Option<usize>,
/// Total frame size.
frame_size: Option<u32>,
/// Calling convention this function expects.
call_conv: isa::CallConv,
}
fn in_int_reg(ty: ir::Type) -> bool {
match ty {
types::I8 | types::I16 | types::I32 | types::I64 => true,
types::B1 | types::B8 | types::B16 | types::B32 | types::B64 => true,
_ => false,
}
}
fn in_vec_reg(ty: ir::Type) -> bool {
match ty {
types::F32 | types::F64 => true,
_ => false,
}
}
impl AArch64ABIBody {
/// Create a new body ABI instance.
pub fn new(f: &ir::Function) -> Self {
debug!("AArch64 ABI: func signature {:?}", f.signature);
let sig = ABISig::from_func_sig(&f.signature);
let call_conv = f.signature.call_conv;
// Only these calling conventions are supported.
assert!(
call_conv == isa::CallConv::SystemV
|| call_conv == isa::CallConv::Fast
|| call_conv == isa::CallConv::Cold
|| call_conv.extends_baldrdash(),
"Unsupported calling convention: {:?}",
call_conv
);
// Compute stackslot locations and total stackslot size.
let mut stack_offset: u32 = 0;
let mut stackslots = vec![];
for (stackslot, data) in f.stack_slots.iter() {
let off = stack_offset;
stack_offset += data.size;
stack_offset = (stack_offset + 7) & !7;
assert_eq!(stackslot.as_u32() as usize, stackslots.len());
stackslots.push(off);
}
Self {
sig,
stackslots,
stackslots_size: stack_offset,
clobbered: Set::empty(),
spillslots: None,
frame_size: None,
call_conv,
}
}
}
fn load_stack(fp_offset: i64, into_reg: Writable<Reg>, ty: Type) -> Inst {
let mem = MemArg::FPOffset(fp_offset);
match ty {
types::B1
| types::B8
| types::I8
| types::B16
| types::I16
| types::B32
| types::I32
| types::B64
| types::I64 => Inst::ULoad64 {
rd: into_reg,
mem,
srcloc: None,
},
types::F32 => Inst::FpuLoad32 {
rd: into_reg,
mem,
srcloc: None,
},
types::F64 => Inst::FpuLoad64 {
rd: into_reg,
mem,
srcloc: None,
},
_ => unimplemented!("load_stack({})", ty),
}
}
fn store_stack(fp_offset: i64, from_reg: Reg, ty: Type) -> Inst {
let mem = MemArg::FPOffset(fp_offset);
match ty {
types::B1
| types::B8
| types::I8
| types::B16
| types::I16
| types::B32
| types::I32
| types::B64
| types::I64 => Inst::Store64 {
rd: from_reg,
mem,
srcloc: None,
},
types::F32 => Inst::FpuStore32 {
rd: from_reg,
mem,
srcloc: None,
},
types::F64 => Inst::FpuStore64 {
rd: from_reg,
mem,
srcloc: None,
},
_ => unimplemented!("store_stack({})", ty),
}
}
fn is_callee_save(call_conv: isa::CallConv, r: RealReg) -> bool {
if call_conv.extends_baldrdash() {
match r.get_class() {
RegClass::I64 => {
let enc = r.get_hw_encoding();
if BALDRDASH_JIT_CALLEE_SAVED_GPR[enc] {
return true;
}
// Otherwise, fall through to preserve native ABI registers.
}
RegClass::V128 => {
let enc = r.get_hw_encoding();
if BALDRDASH_JIT_CALLEE_SAVED_FPU[enc] {
return true;
}
// Otherwise, fall through to preserve native ABI registers.
}
_ => unimplemented!("baldrdash callee saved on non-i64 reg classes"),
};
}
match r.get_class() {
RegClass::I64 => {
// x19 - x28 inclusive are callee-saves.
r.get_hw_encoding() >= 19 && r.get_hw_encoding() <= 28
}
RegClass::V128 => {
// v8 - v15 inclusive are callee-saves.
r.get_hw_encoding() >= 8 && r.get_hw_encoding() <= 15
}
_ => panic!("Unexpected RegClass"),
}
}
fn get_callee_saves(
call_conv: isa::CallConv,
regs: Vec<Writable<RealReg>>,
) -> (Vec<Writable<RealReg>>, Vec<Writable<RealReg>>) {
let mut int_saves = vec![];
let mut vec_saves = vec![];
for reg in regs.into_iter() {
if is_callee_save(call_conv, reg.to_reg()) {
match reg.to_reg().get_class() {
RegClass::I64 => int_saves.push(reg),
RegClass::V128 => vec_saves.push(reg),
_ => panic!("Unexpected RegClass"),
}
}
}
(int_saves, vec_saves)
}
fn is_caller_save(call_conv: isa::CallConv, r: RealReg) -> bool {
if call_conv.extends_baldrdash() {
match r.get_class() {
RegClass::I64 => {
let enc = r.get_hw_encoding();
if !BALDRDASH_JIT_CALLEE_SAVED_GPR[enc] {
return true;
}
// Otherwise, fall through to preserve native's ABI caller-saved.
}
RegClass::V128 => {
let enc = r.get_hw_encoding();
if !BALDRDASH_JIT_CALLEE_SAVED_FPU[enc] {
return true;
}
// Otherwise, fall through to preserve native's ABI caller-saved.
}
_ => unimplemented!("baldrdash callee saved on non-i64 reg classes"),
};
}
match r.get_class() {
RegClass::I64 => {
// x0 - x17 inclusive are caller-saves.
r.get_hw_encoding() <= 17
}
RegClass::V128 => {
// v0 - v7 inclusive and v16 - v31 inclusive are caller-saves.
r.get_hw_encoding() <= 7 || (r.get_hw_encoding() >= 16 && r.get_hw_encoding() <= 31)
}
_ => panic!("Unexpected RegClass"),
}
}
fn get_caller_saves_set(call_conv: isa::CallConv) -> Set<Writable<Reg>> {
let mut set = Set::empty();
for i in 0..29 {
let x = writable_xreg(i);
if is_caller_save(call_conv, x.to_reg().to_real_reg()) {
set.insert(x);
}
}
for i in 0..32 {
let v = writable_vreg(i);
if is_caller_save(call_conv, v.to_reg().to_real_reg()) {
set.insert(v);
}
}
set
}
impl ABIBody for AArch64ABIBody {
type I = Inst;
fn liveins(&self) -> Set<RealReg> {
let mut set: Set<RealReg> = Set::empty();
for &arg in &self.sig.args {
if let ABIArg::Reg(r, _) = arg {
set.insert(r);
}
}
set
}
fn liveouts(&self) -> Set<RealReg> {
let mut set: Set<RealReg> = Set::empty();
for &ret in &self.sig.rets {
if let ABIArg::Reg(r, _) = ret {
set.insert(r);
}
}
set
}
fn num_args(&self) -> usize {
self.sig.args.len()
}
fn num_retvals(&self) -> usize {
self.sig.rets.len()
}
fn num_stackslots(&self) -> usize {
self.stackslots.len()
}
fn gen_copy_arg_to_reg(&self, idx: usize, into_reg: Writable<Reg>) -> Inst {
match &self.sig.args[idx] {
&ABIArg::Reg(r, ty) => Inst::gen_move(into_reg, r.to_reg(), ty),
&ABIArg::Stack(off, ty) => load_stack(off + 16, into_reg, ty),
}
}
fn gen_copy_reg_to_retval(&self, idx: usize, from_reg: Reg) -> Inst {
match &self.sig.rets[idx] {
&ABIArg::Reg(r, ty) => Inst::gen_move(Writable::from_reg(r.to_reg()), from_reg, ty),
&ABIArg::Stack(off, ty) => store_stack(off + 16, from_reg, ty),
}
}
fn gen_ret(&self) -> Inst {
Inst::Ret {}
}
fn gen_epilogue_placeholder(&self) -> Inst {
Inst::EpiloguePlaceholder {}
}
fn set_num_spillslots(&mut self, slots: usize) {
self.spillslots = Some(slots);
}
fn set_clobbered(&mut self, clobbered: Set<Writable<RealReg>>) {
self.clobbered = clobbered;
}
fn load_stackslot(
&self,
slot: StackSlot,
offset: u32,
ty: Type,
into_reg: Writable<Reg>,
) -> Inst {
// Offset from beginning of stackslot area, which is at FP - stackslots_size.
let stack_off = self.stackslots[slot.as_u32() as usize] as i64;
let fp_off: i64 = -(self.stackslots_size as i64) + stack_off + (offset as i64);
load_stack(fp_off, into_reg, ty)
}
fn store_stackslot(&self, slot: StackSlot, offset: u32, ty: Type, from_reg: Reg) -> Inst {
// Offset from beginning of stackslot area, which is at FP - stackslots_size.
let stack_off = self.stackslots[slot.as_u32() as usize] as i64;
let fp_off: i64 = -(self.stackslots_size as i64) + stack_off + (offset as i64);
store_stack(fp_off, from_reg, ty)
}
// Load from a spillslot.
fn load_spillslot(&self, slot: SpillSlot, ty: Type, into_reg: Writable<Reg>) -> Inst {
// Note that when spills/fills are generated, we don't yet know how many
// spillslots there will be, so we allocate *downward* from the beginning
// of the stackslot area. Hence: FP - stackslot_size - 8*spillslot -
// sizeof(ty).
let islot = slot.get() as i64;
let ty_size = self.get_spillslot_size(into_reg.to_reg().get_class(), ty) * 8;
let fp_off: i64 = -(self.stackslots_size as i64) - (8 * islot) - ty_size as i64;
load_stack(fp_off, into_reg, ty)
}
// Store to a spillslot.
fn store_spillslot(&self, slot: SpillSlot, ty: Type, from_reg: Reg) -> Inst {
let islot = slot.get() as i64;
let ty_size = self.get_spillslot_size(from_reg.get_class(), ty) * 8;
let fp_off: i64 = -(self.stackslots_size as i64) - (8 * islot) - ty_size as i64;
store_stack(fp_off, from_reg, ty)
}
fn gen_prologue(&mut self, flags: &settings::Flags) -> Vec<Inst> {
let mut insts = vec![];
if !self.call_conv.extends_baldrdash() {
// stp fp (x29), lr (x30), [sp, #-16]!
insts.push(Inst::StoreP64 {
rt: fp_reg(),
rt2: link_reg(),
mem: PairMemArg::PreIndexed(
writable_stack_reg(),
SImm7Scaled::maybe_from_i64(-16, types::I64).unwrap(),
),
});
// mov fp (x29), sp. This uses the `ADD rd, rn, #0` form of `MOV` because
// the usual `ORR`-based encoding does not work with SP.
insts.push(Inst::AluRRImm12 {
alu_op: ALUOp::Add64,
rd: writable_fp_reg(),
rn: stack_reg(),
imm12: Imm12 {
bits: 0,
shift12: false,
},
});
}
let mut total_stacksize = self.stackslots_size + 8 * self.spillslots.unwrap() as u32;
if self.call_conv.extends_baldrdash() {
debug_assert!(
!flags.enable_probestack(),
"baldrdash does not expect cranelift to emit stack probes"
);
total_stacksize += flags.baldrdash_prologue_words() as u32 * 8;
}
let total_stacksize = (total_stacksize + 15) & !15; // 16-align the stack.
if !self.call_conv.extends_baldrdash() && total_stacksize > 0 {
// sub sp, sp, #total_stacksize
if let Some(imm12) = Imm12::maybe_from_u64(total_stacksize as u64) {
let sub_inst = Inst::AluRRImm12 {
alu_op: ALUOp::Sub64,
rd: writable_stack_reg(),
rn: stack_reg(),
imm12,
};
insts.push(sub_inst);
} else {
let tmp = writable_spilltmp_reg();
let const_inst = Inst::LoadConst64 {
rd: tmp,
const_data: total_stacksize as u64,
};
let sub_inst = Inst::AluRRRExtend {
alu_op: ALUOp::Sub64,
rd: writable_stack_reg(),
rn: stack_reg(),
rm: tmp.to_reg(),
extendop: ExtendOp::UXTX,
};
insts.push(const_inst);
insts.push(sub_inst);
}
}
// Save clobbered registers.
let (clobbered_int, clobbered_vec) =
get_callee_saves(self.call_conv, self.clobbered.to_vec());
for reg_pair in clobbered_int.chunks(2) {
let (r1, r2) = if reg_pair.len() == 2 {
// .to_reg().to_reg(): Writable<RealReg> --> RealReg --> Reg
(reg_pair[0].to_reg().to_reg(), reg_pair[1].to_reg().to_reg())
} else {
(reg_pair[0].to_reg().to_reg(), zero_reg())
};
debug_assert!(r1.get_class() == RegClass::I64);
debug_assert!(r2.get_class() == RegClass::I64);
// stp r1, r2, [sp, #-16]!
insts.push(Inst::StoreP64 {
rt: r1,
rt2: r2,
mem: PairMemArg::PreIndexed(
writable_stack_reg(),
SImm7Scaled::maybe_from_i64(-16, types::I64).unwrap(),
),
});
}
let vec_save_bytes = clobbered_vec.len() * 16;
if vec_save_bytes != 0 {
insts.push(Inst::AluRRImm12 {
alu_op: ALUOp::Sub64,
rd: writable_stack_reg(),
rn: stack_reg(),
imm12: Imm12::maybe_from_u64(vec_save_bytes as u64).unwrap(),
});
}
for (i, reg) in clobbered_vec.iter().enumerate() {
insts.push(Inst::FpuStore128 {
rd: reg.to_reg().to_reg(),
mem: MemArg::Unscaled(stack_reg(), SImm9::maybe_from_i64((i * 16) as i64).unwrap()),
srcloc: None,
});
}
self.frame_size = Some(total_stacksize);
insts
}
fn gen_epilogue(&self, _flags: &settings::Flags) -> Vec<Inst> {
let mut insts = vec![];
// Restore clobbered registers.
let (clobbered_int, clobbered_vec) =
get_callee_saves(self.call_conv, self.clobbered.to_vec());
for (i, reg) in clobbered_vec.iter().enumerate() {
insts.push(Inst::FpuLoad128 {
rd: Writable::from_reg(reg.to_reg().to_reg()),
mem: MemArg::Unscaled(stack_reg(), SImm9::maybe_from_i64((i * 16) as i64).unwrap()),
srcloc: None,
});
}
let vec_save_bytes = clobbered_vec.len() * 16;
if vec_save_bytes != 0 {
insts.push(Inst::AluRRImm12 {
alu_op: ALUOp::Add64,
rd: writable_stack_reg(),
rn: stack_reg(),
imm12: Imm12::maybe_from_u64(vec_save_bytes as u64).unwrap(),
});
}
for reg_pair in clobbered_int.chunks(2).rev() {
let (r1, r2) = if reg_pair.len() == 2 {
(
reg_pair[0].map(|r| r.to_reg()),
reg_pair[1].map(|r| r.to_reg()),
)
} else {
(reg_pair[0].map(|r| r.to_reg()), writable_zero_reg())
};
debug_assert!(r1.to_reg().get_class() == RegClass::I64);
debug_assert!(r2.to_reg().get_class() == RegClass::I64);
// ldp r1, r2, [sp], #16
insts.push(Inst::LoadP64 {
rt: r1,
rt2: r2,
mem: PairMemArg::PostIndexed(
writable_stack_reg(),
SImm7Scaled::maybe_from_i64(16, types::I64).unwrap(),
),
});
}
if !self.call_conv.extends_baldrdash() {
// The MOV (alias of ORR) interprets x31 as XZR, so use an ADD here.
// MOV to SP is an alias of ADD.
insts.push(Inst::AluRRImm12 {
alu_op: ALUOp::Add64,
rd: writable_stack_reg(),
rn: fp_reg(),
imm12: Imm12 {
bits: 0,
shift12: false,
},
});
insts.push(Inst::LoadP64 {
rt: writable_fp_reg(),
rt2: writable_link_reg(),
mem: PairMemArg::PostIndexed(
writable_stack_reg(),
SImm7Scaled::maybe_from_i64(16, types::I64).unwrap(),
),
});
insts.push(Inst::Ret {});
}
debug!("Epilogue: {:?}", insts);
insts
}
fn frame_size(&self) -> u32 {
self.frame_size
.expect("frame size not computed before prologue generation")
}
fn get_spillslot_size(&self, rc: RegClass, ty: Type) -> u32 {
// We allocate in terms of 8-byte slots.
match (rc, ty) {
(RegClass::I64, _) => 1,
(RegClass::V128, F32) | (RegClass::V128, F64) => 1,
(RegClass::V128, _) => 2,
_ => panic!("Unexpected register class!"),
}
}
fn gen_spill(&self, to_slot: SpillSlot, from_reg: RealReg, ty: Type) -> Inst {
self.store_spillslot(to_slot, ty, from_reg.to_reg())
}
fn gen_reload(&self, to_reg: Writable<RealReg>, from_slot: SpillSlot, ty: Type) -> Inst {
self.load_spillslot(from_slot, ty, to_reg.map(|r| r.to_reg()))
}
}
enum CallDest {
ExtName(ir::ExternalName),
Reg(Reg),
}
/// AArch64 ABI object for a function call.
pub struct AArch64ABICall {
sig: ABISig,
uses: Set<Reg>,
defs: Set<Writable<Reg>>,
dest: CallDest,
loc: ir::SourceLoc,
opcode: ir::Opcode,
}
fn abisig_to_uses_and_defs(sig: &ABISig) -> (Set<Reg>, Set<Writable<Reg>>) {
// Compute uses: all arg regs.
let mut uses = Set::empty();
for arg in &sig.args {
match arg {
&ABIArg::Reg(reg, _) => uses.insert(reg.to_reg()),
_ => {}
}
}
// Compute defs: all retval regs, and all caller-save (clobbered) regs.
let mut defs = get_caller_saves_set(sig.call_conv);
for ret in &sig.rets {
match ret {
&ABIArg::Reg(reg, _) => defs.insert(Writable::from_reg(reg.to_reg())),
_ => {}
}
}
(uses, defs)
}
impl AArch64ABICall {
/// Create a callsite ABI object for a call directly to the specified function.
pub fn from_func(
sig: &ir::Signature,
extname: &ir::ExternalName,
loc: ir::SourceLoc,
) -> AArch64ABICall {
let sig = ABISig::from_func_sig(sig);
let (uses, defs) = abisig_to_uses_and_defs(&sig);
AArch64ABICall {
sig,
uses,
defs,
dest: CallDest::ExtName(extname.clone()),
loc,
opcode: ir::Opcode::Call,
}
}
/// Create a callsite ABI object for a call to a function pointer with the
/// given signature.
pub fn from_ptr(
sig: &ir::Signature,
ptr: Reg,
loc: ir::SourceLoc,
opcode: ir::Opcode,
) -> AArch64ABICall {
let sig = ABISig::from_func_sig(sig);
let (uses, defs) = abisig_to_uses_and_defs(&sig);
AArch64ABICall {
sig,
uses,
defs,
dest: CallDest::Reg(ptr),
loc,
opcode,
}
}
}
fn adjust_stack(amt: u64, is_sub: bool) -> Vec<Inst> {
if amt > 0 {
let alu_op = if is_sub { ALUOp::Sub64 } else { ALUOp::Add64 };
if let Some(imm12) = Imm12::maybe_from_u64(amt) {
vec![Inst::AluRRImm12 {
alu_op,
rd: writable_stack_reg(),
rn: stack_reg(),
imm12,
}]
} else {
let const_load = Inst::LoadConst64 {
rd: writable_spilltmp_reg(),
const_data: amt,
};
let adj = Inst::AluRRRExtend {
alu_op,
rd: writable_stack_reg(),
rn: stack_reg(),
rm: spilltmp_reg(),
extendop: ExtendOp::UXTX,
};
vec![const_load, adj]
}
} else {
vec![]
}
}
impl ABICall for AArch64ABICall {
type I = Inst;
fn num_args(&self) -> usize {
self.sig.args.len()
}
fn gen_stack_pre_adjust(&self) -> Vec<Inst> {
adjust_stack(self.sig.stack_arg_space as u64, /* is_sub = */ true)
}
fn gen_stack_post_adjust(&self) -> Vec<Inst> {
adjust_stack(self.sig.stack_arg_space as u64, /* is_sub = */ false)
}
fn gen_copy_reg_to_arg(&self, idx: usize, from_reg: Reg) -> Inst {
match &self.sig.args[idx] {
&ABIArg::Reg(reg, ty) => Inst::gen_move(Writable::from_reg(reg.to_reg()), from_reg, ty),
&ABIArg::Stack(off, _) => Inst::Store64 {
rd: from_reg,
mem: MemArg::SPOffset(off),
srcloc: None,
},
}
}
fn gen_copy_retval_to_reg(&self, idx: usize, into_reg: Writable<Reg>) -> Inst {
match &self.sig.rets[idx] {
&ABIArg::Reg(reg, ty) => Inst::gen_move(into_reg, reg.to_reg(), ty),
_ => unimplemented!(),
}
}
fn gen_call(&self) -> Vec<Inst> {
let (uses, defs) = (self.uses.clone(), self.defs.clone());
match &self.dest {
&CallDest::ExtName(ref name) => vec![Inst::Call {
dest: name.clone(),
uses,
defs,
loc: self.loc,
opcode: self.opcode,
}],
&CallDest::Reg(reg) => vec![Inst::CallInd {
rn: reg,
uses,
defs,
loc: self.loc,
opcode: self.opcode,
}],
}
}
}

@@ -0,0 +1,528 @@
//! AArch64 ISA definitions: instruction arguments.
// Some variants are never constructed, but we still want them as options in the future.
#![allow(dead_code)]
use crate::binemit::CodeOffset;
use crate::ir::Type;
use crate::isa::aarch64::inst::*;
use regalloc::{RealRegUniverse, Reg, Writable};
use core::convert::{Into, TryFrom};
use std::string::String;
/// A shift operator for a register or immediate.
#[derive(Clone, Copy, Debug)]
#[repr(u8)]
pub enum ShiftOp {
LSL = 0b00,
LSR = 0b01,
ASR = 0b10,
ROR = 0b11,
}
impl ShiftOp {
/// Get the encoding of this shift op.
pub fn bits(self) -> u8 {
self as u8
}
}
/// A shift operator amount.
#[derive(Clone, Copy, Debug)]
pub struct ShiftOpShiftImm(u8);
impl ShiftOpShiftImm {
/// Maximum shift for shifted-register operands.
pub const MAX_SHIFT: u64 = 63;
/// Create a new shiftop shift amount, if possible.
pub fn maybe_from_shift(shift: u64) -> Option<ShiftOpShiftImm> {
if shift <= Self::MAX_SHIFT {
Some(ShiftOpShiftImm(shift as u8))
} else {
None
}
}
/// Return the shift amount.
pub fn value(self) -> u8 {
self.0
}
}
/// A shift operator with an amount, guaranteed to be within range.
#[derive(Clone, Debug)]
pub struct ShiftOpAndAmt {
op: ShiftOp,
shift: ShiftOpShiftImm,
}
impl ShiftOpAndAmt {
pub fn new(op: ShiftOp, shift: ShiftOpShiftImm) -> ShiftOpAndAmt {
ShiftOpAndAmt { op, shift }
}
/// Get the shift op.
pub fn op(&self) -> ShiftOp {
self.op
}
/// Get the shift amount.
pub fn amt(&self) -> ShiftOpShiftImm {
self.shift
}
}
/// An extend operator for a register.
#[derive(Clone, Copy, Debug)]
#[repr(u8)]
pub enum ExtendOp {
UXTB = 0b000,
UXTH = 0b001,
UXTW = 0b010,
UXTX = 0b011,
SXTB = 0b100,
SXTH = 0b101,
SXTW = 0b110,
SXTX = 0b111,
}
impl ExtendOp {
/// Encoding of this op.
pub fn bits(self) -> u8 {
self as u8
}
}
//=============================================================================
// Instruction sub-components (memory addresses): definitions
/// A reference to some memory address.
#[derive(Clone, Debug)]
pub enum MemLabel {
/// An address in the code, a constant pool or jumptable, with relative
/// offset from this instruction. This form must be used at emission time;
/// see `memlabel_finalize()` for how other forms are lowered to this one.
PCRel(i32),
}
/// A memory argument to load/store, encapsulating the possible addressing modes.
#[derive(Clone, Debug)]
pub enum MemArg {
Label(MemLabel),
/// "post-indexed" mode as per AArch64 docs: postincrement reg after address computation.
PostIndexed(Writable<Reg>, SImm9),
/// "pre-indexed" mode as per AArch64 docs: preincrement reg before address computation.
PreIndexed(Writable<Reg>, SImm9),
// N.B.: RegReg, RegScaled, and RegScaledExtended all correspond to
// what the ISA calls the "register offset" addressing mode. We split out
// several options here for more ergonomic codegen.
/// Register plus register offset.
RegReg(Reg, Reg),
/// Register plus register offset, scaled by type's size.
RegScaled(Reg, Reg, Type),
/// Register plus register offset, scaled by type's size, with index sign- or zero-extended
/// first.
RegScaledExtended(Reg, Reg, Type, ExtendOp),
/// Unscaled signed 9-bit immediate offset from reg.
Unscaled(Reg, SImm9),
/// Scaled (by size of a type) unsigned 12-bit immediate offset from reg.
UnsignedOffset(Reg, UImm12Scaled),
/// Offset from the stack pointer. Lowered into a real amode at emission.
SPOffset(i64),
/// Offset from the frame pointer. Lowered into a real amode at emission.
FPOffset(i64),
}
impl MemArg {
/// Memory reference using an address in a register.
pub fn reg(reg: Reg) -> MemArg {
// Use UnsignedOffset rather than Unscaled to use ldr rather than ldur.
// This also does not use PostIndexed / PreIndexed as they update the register.
MemArg::UnsignedOffset(reg, UImm12Scaled::zero(I64))
}
/// Memory reference using an address in a register and an offset, if possible.
pub fn reg_maybe_offset(reg: Reg, offset: i64, value_type: Type) -> Option<MemArg> {
if let Some(simm9) = SImm9::maybe_from_i64(offset) {
Some(MemArg::Unscaled(reg, simm9))
} else if let Some(uimm12s) = UImm12Scaled::maybe_from_i64(offset, value_type) {
Some(MemArg::UnsignedOffset(reg, uimm12s))
} else {
None
}
}
/// Memory reference using the sum of two registers as an address.
pub fn reg_plus_reg(reg1: Reg, reg2: Reg) -> MemArg {
MemArg::RegReg(reg1, reg2)
}
/// Memory reference using `reg1 + sizeof(ty) * reg2` as an address.
pub fn reg_plus_reg_scaled(reg1: Reg, reg2: Reg, ty: Type) -> MemArg {
MemArg::RegScaled(reg1, reg2, ty)
}
/// Memory reference using `reg1 + sizeof(ty) * reg2` as an address, with `reg2` sign- or
/// zero-extended as per `op`.
pub fn reg_plus_reg_scaled_extended(reg1: Reg, reg2: Reg, ty: Type, op: ExtendOp) -> MemArg {
MemArg::RegScaledExtended(reg1, reg2, ty, op)
}
/// Memory reference to a label: a global function or value, or data in the constant pool.
pub fn label(label: MemLabel) -> MemArg {
MemArg::Label(label)
}
}
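// Usage sketch (the helper name is hypothetical, not part of this patch):
// addressing an 8-byte element `base[index]`, where `index` is an unsigned
// 32-bit value held in a register, maps onto the register-offset form with a
// zero-extend and a scale, i.e. `[base, Windex, UXTW #3]`.
fn element_address(base: Reg, index: Reg) -> MemArg {
    MemArg::reg_plus_reg_scaled_extended(base, index, I64, ExtendOp::UXTW)
}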
/// A memory argument to a load/store-pair.
#[derive(Clone, Debug)]
pub enum PairMemArg {
SignedOffset(Reg, SImm7Scaled),
PreIndexed(Writable<Reg>, SImm7Scaled),
PostIndexed(Writable<Reg>, SImm7Scaled),
}
//=============================================================================
// Instruction sub-components (conditions, branches and branch targets):
// definitions
/// Condition for conditional branches.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
#[repr(u8)]
pub enum Cond {
Eq = 0,
Ne = 1,
Hs = 2,
Lo = 3,
Mi = 4,
Pl = 5,
Vs = 6,
Vc = 7,
Hi = 8,
Ls = 9,
Ge = 10,
Lt = 11,
Gt = 12,
Le = 13,
Al = 14,
Nv = 15,
}
impl Cond {
/// Return the inverted condition.
pub fn invert(self) -> Cond {
match self {
Cond::Eq => Cond::Ne,
Cond::Ne => Cond::Eq,
Cond::Hs => Cond::Lo,
Cond::Lo => Cond::Hs,
Cond::Mi => Cond::Pl,
Cond::Pl => Cond::Mi,
Cond::Vs => Cond::Vc,
Cond::Vc => Cond::Vs,
Cond::Hi => Cond::Ls,
Cond::Ls => Cond::Hi,
Cond::Ge => Cond::Lt,
Cond::Lt => Cond::Ge,
Cond::Gt => Cond::Le,
Cond::Le => Cond::Gt,
Cond::Al => Cond::Nv,
Cond::Nv => Cond::Al,
}
}
/// Return the machine encoding of this condition.
pub fn bits(self) -> u32 {
self as u32
}
}
/// The kind of conditional branch: the common-case-optimized "reg-is-zero" /
/// "reg-is-nonzero" variants, or the generic one that tests the machine
/// condition codes.
#[derive(Clone, Copy, Debug)]
pub enum CondBrKind {
/// Condition: given register is zero.
Zero(Reg),
/// Condition: given register is nonzero.
NotZero(Reg),
/// Condition: the given condition-code test is true.
Cond(Cond),
}
impl CondBrKind {
/// Return the inverted branch condition.
pub fn invert(self) -> CondBrKind {
match self {
CondBrKind::Zero(reg) => CondBrKind::NotZero(reg),
CondBrKind::NotZero(reg) => CondBrKind::Zero(reg),
CondBrKind::Cond(c) => CondBrKind::Cond(c.invert()),
}
}
}
/// A branch target. Either unresolved (basic-block index) or resolved (offset
/// from end of current instruction).
#[derive(Clone, Copy, Debug)]
pub enum BranchTarget {
/// An unresolved reference to a BlockIndex, as passed into
/// `lower_branch_group()`.
Block(BlockIndex),
/// A resolved reference to another instruction, after
/// `Inst::with_block_offsets()`.
ResolvedOffset(isize),
}
impl BranchTarget {
/// Lower the branch target given offsets of each block.
pub fn lower(&mut self, targets: &[CodeOffset], my_offset: CodeOffset) {
match self {
&mut BranchTarget::Block(bix) => {
let bix = usize::try_from(bix).unwrap();
assert!(bix < targets.len());
let block_offset_in_func = targets[bix];
let branch_offset = (block_offset_in_func as isize) - (my_offset as isize);
*self = BranchTarget::ResolvedOffset(branch_offset);
}
&mut BranchTarget::ResolvedOffset(..) => {}
}
}
/// Get the block index.
pub fn as_block_index(&self) -> Option<BlockIndex> {
match self {
&BranchTarget::Block(bix) => Some(bix),
_ => None,
}
}
/// Get the offset as 4-byte words. Returns `0` if not
/// yet resolved (in that case, we're only computing
/// size and the offset doesn't matter).
pub fn as_offset_words(&self) -> isize {
match self {
&BranchTarget::ResolvedOffset(off) => off >> 2,
_ => 0,
}
}
/// Get the offset as a 26-bit offset suitable for a 26-bit jump, or `None` if overflow.
pub fn as_off26(&self) -> Option<u32> {
let off = self.as_offset_words();
if (off < (1 << 25)) && (off >= -(1 << 25)) {
Some((off as u32) & ((1 << 26) - 1))
} else {
None
}
}
/// Get the offset as a 19-bit offset, or `None` if overflow.
pub fn as_off19(&self) -> Option<u32> {
let off = self.as_offset_words();
if (off < (1 << 18)) && (off >= -(1 << 18)) {
Some((off as u32) & ((1 << 19) - 1))
} else {
None
}
}
/// Map the block index given a transform map.
pub fn map(&mut self, block_index_map: &[BlockIndex]) {
match self {
&mut BranchTarget::Block(ref mut bix) => {
let n = block_index_map[usize::try_from(*bix).unwrap()];
*bix = n;
}
&mut BranchTarget::ResolvedOffset(_) => {}
}
}
}
impl ShowWithRRU for ShiftOpAndAmt {
fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
format!("{:?} {}", self.op(), self.amt().value())
}
}
impl ShowWithRRU for ExtendOp {
fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
format!("{:?}", self)
}
}
impl ShowWithRRU for MemLabel {
fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
match self {
&MemLabel::PCRel(off) => format!("pc+{}", off),
}
}
}
fn shift_for_type(ty: Type) -> usize {
match ty.bytes() {
1 => 0,
2 => 1,
4 => 2,
8 => 3,
16 => 4,
_ => panic!("unknown type: {}", ty),
}
}
impl ShowWithRRU for MemArg {
fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String {
match self {
&MemArg::Unscaled(reg, simm9) => {
if simm9.value != 0 {
format!("[{}, {}]", reg.show_rru(mb_rru), simm9.show_rru(mb_rru))
} else {
format!("[{}]", reg.show_rru(mb_rru))
}
}
&MemArg::UnsignedOffset(reg, uimm12) => {
if uimm12.value != 0 {
format!("[{}, {}]", reg.show_rru(mb_rru), uimm12.show_rru(mb_rru))
} else {
format!("[{}]", reg.show_rru(mb_rru))
}
}
&MemArg::RegReg(r1, r2) => {
format!("[{}, {}]", r1.show_rru(mb_rru), r2.show_rru(mb_rru),)
}
&MemArg::RegScaled(r1, r2, ty) => {
let shift = shift_for_type(ty);
format!(
"[{}, {}, LSL #{}]",
r1.show_rru(mb_rru),
r2.show_rru(mb_rru),
shift,
)
}
&MemArg::RegScaledExtended(r1, r2, ty, op) => {
let shift = shift_for_type(ty);
let size = match op {
ExtendOp::SXTW | ExtendOp::UXTW => InstSize::Size32,
_ => InstSize::Size64,
};
let op = op.show_rru(mb_rru);
format!(
"[{}, {}, {} #{}]",
r1.show_rru(mb_rru),
show_ireg_sized(r2, mb_rru, size),
op,
shift
)
}
&MemArg::Label(ref label) => label.show_rru(mb_rru),
&MemArg::PreIndexed(r, simm9) => format!(
"[{}, {}]!",
r.to_reg().show_rru(mb_rru),
simm9.show_rru(mb_rru)
),
&MemArg::PostIndexed(r, simm9) => format!(
"[{}], {}",
r.to_reg().show_rru(mb_rru),
simm9.show_rru(mb_rru)
),
// Eliminated by `mem_finalize()`.
&MemArg::SPOffset(..) | &MemArg::FPOffset(..) => {
panic!("Unexpected stack-offset mem-arg mode!")
}
}
}
}
impl ShowWithRRU for PairMemArg {
fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String {
match self {
&PairMemArg::SignedOffset(reg, simm7) => {
if simm7.value != 0 {
format!("[{}, {}]", reg.show_rru(mb_rru), simm7.show_rru(mb_rru))
} else {
format!("[{}]", reg.show_rru(mb_rru))
}
}
&PairMemArg::PreIndexed(reg, simm7) => format!(
"[{}, {}]!",
reg.to_reg().show_rru(mb_rru),
simm7.show_rru(mb_rru)
),
&PairMemArg::PostIndexed(reg, simm7) => format!(
"[{}], {}",
reg.to_reg().show_rru(mb_rru),
simm7.show_rru(mb_rru)
),
}
}
}
impl ShowWithRRU for Cond {
fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
let mut s = format!("{:?}", self);
s.make_ascii_lowercase();
s
}
}
impl ShowWithRRU for BranchTarget {
fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
match self {
&BranchTarget::Block(block) => format!("block{}", block),
&BranchTarget::ResolvedOffset(off) => format!("{}", off),
}
}
}
/// Type used to communicate the operand size of a machine instruction, as AArch64 has 32- and
/// 64-bit variants of many instructions (and integer registers).
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum InstSize {
Size32,
Size64,
}
impl InstSize {
/// 32-bit case?
pub fn is32(self) -> bool {
self == InstSize::Size32
}
/// 64-bit case?
pub fn is64(self) -> bool {
self == InstSize::Size64
}
/// Convert from an `is32` boolean flag to an `InstSize`.
pub fn from_is32(is32: bool) -> InstSize {
if is32 {
InstSize::Size32
} else {
InstSize::Size64
}
}
/// Convert from a needed width to the smallest size that fits.
pub fn from_bits<I: Into<usize>>(bits: I) -> InstSize {
let bits: usize = bits.into();
assert!(bits <= 64);
if bits <= 32 {
InstSize::Size32
} else {
InstSize::Size64
}
}
}

File diff suppressed because it is too large.

@@ -0,0 +1,752 @@
//! AArch64 ISA definitions: immediate constants.
// Some variants are never constructed, but we still want them as options in the future.
#[allow(dead_code)]
use crate::ir::types::*;
use crate::ir::Type;
use crate::machinst::*;
use regalloc::RealRegUniverse;
use core::convert::TryFrom;
use std::string::String;
/// A signed, scaled 7-bit offset.
#[derive(Clone, Copy, Debug)]
pub struct SImm7Scaled {
/// The value.
pub value: i16,
/// multiplied by the size of this type
pub scale_ty: Type,
}
impl SImm7Scaled {
/// Create a SImm7Scaled from a raw offset and the known scale type, if
/// possible.
pub fn maybe_from_i64(value: i64, scale_ty: Type) -> Option<SImm7Scaled> {
assert!(scale_ty == I64 || scale_ty == I32);
let scale = scale_ty.bytes();
assert!(scale.is_power_of_two());
let scale = i64::from(scale);
let upper_limit = 63 * scale;
let lower_limit = -(64 * scale);
if value >= lower_limit && value <= upper_limit && (value & (scale - 1)) == 0 {
Some(SImm7Scaled {
value: i16::try_from(value).unwrap(),
scale_ty,
})
} else {
None
}
}
/// Create a zero immediate of this format.
pub fn zero(scale_ty: Type) -> SImm7Scaled {
SImm7Scaled { value: 0, scale_ty }
}
/// Bits for encoding.
pub fn bits(&self) -> u32 {
let ty_bytes: i16 = self.scale_ty.bytes() as i16;
let scaled: i16 = self.value / ty_bytes;
assert!(scaled <= 63 && scaled >= -64);
let scaled: i8 = scaled as i8;
let encoded: u32 = scaled as u32;
encoded & 0x7f
}
}
/// A signed 9-bit offset.
#[derive(Clone, Copy, Debug)]
pub struct SImm9 {
/// The value.
pub value: i16,
}
impl SImm9 {
/// Create a signed 9-bit offset from a full-range value, if possible.
pub fn maybe_from_i64(value: i64) -> Option<SImm9> {
if value >= -256 && value <= 255 {
Some(SImm9 {
value: value as i16,
})
} else {
None
}
}
/// Create a zero immediate of this format.
pub fn zero() -> SImm9 {
SImm9 { value: 0 }
}
/// Bits for encoding.
pub fn bits(&self) -> u32 {
(self.value as u32) & 0x1ff
}
}
/// An unsigned, scaled 12-bit offset.
#[derive(Clone, Copy, Debug)]
pub struct UImm12Scaled {
/// The value.
pub value: u16,
/// multiplied by the size of this type
pub scale_ty: Type,
}
impl UImm12Scaled {
/// Create a UImm12Scaled from a raw offset and the known scale type, if
/// possible.
pub fn maybe_from_i64(value: i64, scale_ty: Type) -> Option<UImm12Scaled> {
let scale = scale_ty.bytes();
assert!(scale.is_power_of_two());
let scale = scale as i64;
let limit = 4095 * scale;
if value >= 0 && value <= limit && (value & (scale - 1)) == 0 {
Some(UImm12Scaled {
value: value as u16,
scale_ty,
})
} else {
None
}
}
/// Create a zero immediate of this format.
pub fn zero(scale_ty: Type) -> UImm12Scaled {
UImm12Scaled { value: 0, scale_ty }
}
/// Encoded bits.
pub fn bits(&self) -> u32 {
(self.value as u32 / self.scale_ty.bytes()) & 0xfff
}
}
/// A shifted immediate value in 'imm12' format: supports 12 bits, shifted
/// left by 0 or 12 places.
#[derive(Clone, Debug)]
pub struct Imm12 {
/// The immediate bits.
pub bits: u16,
/// Whether the immediate bits are shifted left by 12 or not.
pub shift12: bool,
}
impl Imm12 {
/// Compute a Imm12 from raw bits, if possible.
pub fn maybe_from_u64(val: u64) -> Option<Imm12> {
if val == 0 {
Some(Imm12 {
bits: 0,
shift12: false,
})
} else if val < 0xfff {
Some(Imm12 {
bits: val as u16,
shift12: false,
})
} else if val < 0xfff_000 && (val & 0xfff == 0) {
Some(Imm12 {
bits: (val >> 12) as u16,
shift12: true,
})
} else {
None
}
}
/// Bits for the 2-bit "shift" field in e.g. ADD (immediate).
pub fn shift_bits(&self) -> u32 {
if self.shift12 {
0b01
} else {
0b00
}
}
/// Bits for the 12-bit "imm" field in e.g. ADD (immediate).
pub fn imm_bits(&self) -> u32 {
self.bits as u32
}
}
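// Quick sanity check of the ranges above (hypothetical test, not part of this
// patch): 0x1000 is encodable as a 12-bit immediate shifted left by 12 places,
// while 0x1001 is not.
#[cfg(test)]
mod imm12_sketch {
    use super::*;
    #[test]
    fn shifted_and_unencodable() {
        let imm = Imm12::maybe_from_u64(0x1000).unwrap();
        assert_eq!(imm.bits, 0x1);
        assert!(imm.shift12);
        assert!(Imm12::maybe_from_u64(0x1001).is_none());
    }
}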
/// An immediate for logical instructions.
#[derive(Clone, Debug)]
#[cfg_attr(test, derive(PartialEq))]
pub struct ImmLogic {
/// The actual value.
value: u64,
/// `N` flag.
pub n: bool,
/// `R` field: rotate amount.
pub r: u8,
/// `S` field: element size and element bits.
pub s: u8,
}
impl ImmLogic {
/// Compute an ImmLogic from raw bits, if possible.
pub fn maybe_from_u64(value: u64, ty: Type) -> Option<ImmLogic> {
// Note: This function is a port of VIXL's Assembler::IsImmLogical.
if ty != I64 && ty != I32 {
return None;
}
let original_value = value;
let value = if ty == I32 {
// To handle 32-bit logical immediates, the very easiest thing is to repeat
// the input value twice to make a 64-bit word. The correct encoding of that
// as a logical immediate will also be the correct encoding of the 32-bit
// value.
// Avoid making the assumption that the most-significant 32 bits are zero by
// shifting the value left and duplicating it.
let value = value << 32;
value | value >> 32
} else {
value
};
// Logical immediates are encoded using parameters n, imm_s and imm_r using
// the following table:
//
// N imms immr size S R
// 1 ssssss rrrrrr 64 UInt(ssssss) UInt(rrrrrr)
// 0 0sssss xrrrrr 32 UInt(sssss) UInt(rrrrr)
// 0 10ssss xxrrrr 16 UInt(ssss) UInt(rrrr)
// 0 110sss xxxrrr 8 UInt(sss) UInt(rrr)
// 0 1110ss xxxxrr 4 UInt(ss) UInt(rr)
// 0 11110s xxxxxr 2 UInt(s) UInt(r)
// (s bits must not be all set)
//
// A pattern is constructed of size bits, where the least significant S+1 bits
// are set. The pattern is rotated right by R, and repeated across a 32 or
// 64-bit value, depending on destination register width.
//
// Put another way: the basic format of a logical immediate is a single
// contiguous stretch of 1 bits, repeated across the whole word at intervals
// given by a power of 2. To identify them quickly, we first locate the
// lowest stretch of 1 bits, then the next 1 bit above that; that combination
// is different for every logical immediate, so it gives us all the
// information we need to identify the only logical immediate that our input
// could be, and then we simply check if that's the value we actually have.
//
// (The rotation parameter does give the possibility of the stretch of 1 bits
// going 'round the end' of the word. To deal with that, we observe that in
// any situation where that happens the bitwise NOT of the value is also a
// valid logical immediate. So we simply invert the input whenever its low bit
// is set, and then we know that the rotated case can't arise.)
let (value, inverted) = if value & 1 == 1 {
(!value, true)
} else {
(value, false)
};
if value == 0 {
return None;
}
// The basic analysis idea: imagine our input word looks like this.
//
// 0011111000111110001111100011111000111110001111100011111000111110
// c b a
// |<--d-->|
//
// We find the lowest set bit (as an actual power-of-2 value, not its index)
// and call it a. Then we add a to our original number, which wipes out the
// bottommost stretch of set bits and replaces it with a 1 carried into the
// next zero bit. Then we look for the new lowest set bit, which is in
// position b, and subtract it, so now our number is just like the original
// but with the lowest stretch of set bits completely gone. Now we find the
// lowest set bit again, which is position c in the diagram above. Then we'll
// measure the distance d between bit positions a and c (using CLZ), and that
// tells us that the only valid logical immediate that could possibly be equal
// to this number is the one in which a stretch of bits running from a to just
// below b is replicated every d bits.
fn lowest_set_bit(value: u64) -> u64 {
let bit = value.trailing_zeros();
1u64.checked_shl(bit).unwrap_or(0)
}
let a = lowest_set_bit(value);
assert_ne!(0, a);
let value_plus_a = value.wrapping_add(a);
let b = lowest_set_bit(value_plus_a);
let value_plus_a_minus_b = value_plus_a - b;
let c = lowest_set_bit(value_plus_a_minus_b);
let (d, clz_a, out_n, mask) = if c != 0 {
// The general case, in which there is more than one stretch of set bits.
// Compute the repeat distance d, and set up a bitmask covering the basic
// unit of repetition (i.e. a word with the bottom d bits set). Also, in all
// of these cases the N bit of the output will be zero.
let clz_a = a.leading_zeros();
let clz_c = c.leading_zeros();
let d = clz_a - clz_c;
let mask = (1 << d) - 1;
(d, clz_a, 0, mask)
} else {
(64, a.leading_zeros(), 1, u64::max_value())
};
// If the repeat period d is not a power of two, it can't be encoded.
if !d.is_power_of_two() {
return None;
}
if ((b.wrapping_sub(a)) & !mask) != 0 {
// If the bit stretch (b - a) does not fit within the mask derived from the
// repeat period, then fail.
return None;
}
// The only possible option is b - a repeated every d bits. Now we're going to
// actually construct the valid logical immediate derived from that
// specification, and see if it equals our original input.
//
// To repeat a value every d bits, we multiply it by a number of the form
// (1 + 2^d + 2^(2d) + ...), i.e. 0x0001000100010001 or similar. These can
// be derived using a table lookup on CLZ(d).
const MULTIPLIERS: [u64; 6] = [
0x0000000000000001,
0x0000000100000001,
0x0001000100010001,
0x0101010101010101,
0x1111111111111111,
0x5555555555555555,
];
let multiplier = MULTIPLIERS[(u64::from(d).leading_zeros() - 57) as usize];
let candidate = b.wrapping_sub(a) * multiplier;
if value != candidate {
// The candidate pattern doesn't match our input value, so fail.
return None;
}
// We have a match! This is a valid logical immediate, so now we have to
// construct the bits and pieces of the instruction encoding that generates
// it.
// Count the set bits in our basic stretch. The special case of clz(0) == -1
// makes the answer come out right for stretches that reach the very top of
// the word (e.g. numbers like 0xffffc00000000000).
let clz_b = if b == 0 {
u32::max_value() // -1
} else {
b.leading_zeros()
};
let s = clz_a.wrapping_sub(clz_b);
// Decide how many bits to rotate right by, to put the low bit of that basic
// stretch in position a.
let (s, r) = if inverted {
// If we inverted the input right at the start of this function, here's
// where we compensate: the number of set bits becomes the number of clear
// bits, and the rotation count is based on position b rather than position
// a (since b is the location of the 'lowest' 1 bit after inversion).
// Need wrapping for when clz_b is max_value() (for when b == 0).
(d - s, clz_b.wrapping_add(1) & (d - 1))
} else {
(s, (clz_a + 1) & (d - 1))
};
// Now we're done, except for having to encode the S output in such a way that
// it gives both the number of set bits and the length of the repeated
// segment. The s field is encoded like this:
//
// imms size S
// ssssss 64 UInt(ssssss)
// 0sssss 32 UInt(sssss)
// 10ssss 16 UInt(ssss)
// 110sss 8 UInt(sss)
// 1110ss 4 UInt(ss)
// 11110s 2 UInt(s)
//
// So we 'or' (2 * -d) with our computed s to form imms.
let s = ((d * 2).wrapping_neg() | (s - 1)) & 0x3f;
debug_assert!(u8::try_from(r).is_ok());
debug_assert!(u8::try_from(s).is_ok());
Some(ImmLogic {
value: original_value,
n: out_n != 0,
r: r as u8,
s: s as u8,
})
}
pub fn from_raw(value: u64, n: bool, r: u8, s: u8) -> ImmLogic {
ImmLogic { n, r, s, value }
}
/// Returns bits ready for encoding: (N:1, R:6, S:6)
pub fn enc_bits(&self) -> u32 {
((self.n as u32) << 12) | ((self.r as u32) << 6) | (self.s as u32)
}
/// Returns the value that this immediate represents.
pub fn value(&self) -> u64 {
self.value
}
/// Return an immediate for the bitwise-inverted value.
pub fn invert(&self) -> ImmLogic {
// For every ImmLogical immediate, the inverse can also be encoded.
Self::maybe_from_u64(!self.value, I64).unwrap()
}
}
/// An immediate for shift instructions.
#[derive(Clone, Debug)]
pub struct ImmShift {
/// 6-bit shift amount.
pub imm: u8,
}
impl ImmShift {
/// Create an ImmShift from raw bits, if possible.
pub fn maybe_from_u64(val: u64) -> Option<ImmShift> {
if val < 64 {
Some(ImmShift { imm: val as u8 })
} else {
None
}
}
/// Get the immediate value.
pub fn value(&self) -> u8 {
self.imm
}
}
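// Hedged sketch (not part of the original patch): shift amounts 0..=63 round-trip
// through `ImmShift`; anything wider is rejected.
#[test]
fn imm_shift_sketch() {
    assert_eq!(ImmShift::maybe_from_u64(63).map(|s| s.value()), Some(63));
    assert!(ImmShift::maybe_from_u64(64).is_none());
}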
/// A 16-bit immediate for a MOVZ instruction, with a {0,16,32,48}-bit shift.
#[derive(Clone, Copy, Debug)]
pub struct MoveWideConst {
/// The value.
pub bits: u16,
/// Result is `bits` shifted 16*shift bits to the left.
pub shift: u8,
}
impl MoveWideConst {
/// Construct a MoveWideConst from an arbitrary 64-bit constant if possible.
pub fn maybe_from_u64(value: u64) -> Option<MoveWideConst> {
let mask0 = 0x0000_0000_0000_ffffu64;
let mask1 = 0x0000_0000_ffff_0000u64;
let mask2 = 0x0000_ffff_0000_0000u64;
let mask3 = 0xffff_0000_0000_0000u64;
if value == (value & mask0) {
return Some(MoveWideConst {
bits: (value & mask0) as u16,
shift: 0,
});
}
if value == (value & mask1) {
return Some(MoveWideConst {
bits: ((value >> 16) & mask0) as u16,
shift: 1,
});
}
if value == (value & mask2) {
return Some(MoveWideConst {
bits: ((value >> 32) & mask0) as u16,
shift: 2,
});
}
if value == (value & mask3) {
return Some(MoveWideConst {
bits: ((value >> 48) & mask0) as u16,
shift: 3,
});
}
None
}
/// Create a `MoveWideConst` from a 16-bit immediate and a left-shift amount expressed
/// in bits (a multiple of 16, at most 48); returns `None` if the shift is out of range.
pub fn maybe_with_shift(imm: u16, shift: u8) -> Option<MoveWideConst> {
let shift_enc = shift / 16;
if shift_enc > 3 {
None
} else {
Some(MoveWideConst {
bits: imm,
shift: shift_enc,
})
}
}
/// Returns the value that this constant represents.
pub fn value(&self) -> u64 {
(self.bits as u64) << (16 * self.shift)
}
}
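// Hedged sketch (not part of the original patch): a constant whose set bits all fall in
// one 16-bit lane is representable by a single MOVZ, while one that straddles two lanes
// is not.
#[test]
fn move_wide_const_sketch() {
    let c = MoveWideConst::maybe_from_u64(0x1234_0000_0000).unwrap();
    assert_eq!((c.bits, c.shift), (0x1234, 2));
    assert_eq!(c.value(), 0x1234_0000_0000);
    assert!(MoveWideConst::maybe_from_u64(0x1_0000_0001).is_none());
}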
impl ShowWithRRU for Imm12 {
fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
let shift = if self.shift12 { 12 } else { 0 };
let value = u32::from(self.bits) << shift;
format!("#{}", value)
}
}
impl ShowWithRRU for SImm7Scaled {
fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
format!("#{}", self.value)
}
}
impl ShowWithRRU for SImm9 {
fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
format!("#{}", self.value)
}
}
impl ShowWithRRU for UImm12Scaled {
fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
format!("#{}", self.value)
}
}
impl ShowWithRRU for ImmLogic {
fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
format!("#{}", self.value())
}
}
impl ShowWithRRU for ImmShift {
fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
format!("#{}", self.imm)
}
}
impl ShowWithRRU for MoveWideConst {
fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
if self.shift == 0 {
format!("#{}", self.bits)
} else {
format!("#{}, LSL #{}", self.bits, self.shift * 16)
}
}
}
#[cfg(test)]
mod test {
use super::*;
#[test]
fn imm_logical_test() {
assert_eq!(None, ImmLogic::maybe_from_u64(0, I64));
assert_eq!(None, ImmLogic::maybe_from_u64(u64::max_value(), I64));
assert_eq!(
Some(ImmLogic {
value: 1,
n: true,
r: 0,
s: 0
}),
ImmLogic::maybe_from_u64(1, I64)
);
assert_eq!(
Some(ImmLogic {
value: 2,
n: true,
r: 63,
s: 0
}),
ImmLogic::maybe_from_u64(2, I64)
);
assert_eq!(None, ImmLogic::maybe_from_u64(5, I64));
assert_eq!(None, ImmLogic::maybe_from_u64(11, I64));
assert_eq!(
Some(ImmLogic {
value: 248,
n: true,
r: 61,
s: 4
}),
ImmLogic::maybe_from_u64(248, I64)
);
assert_eq!(None, ImmLogic::maybe_from_u64(249, I64));
assert_eq!(
Some(ImmLogic {
value: 1920,
n: true,
r: 57,
s: 3
}),
ImmLogic::maybe_from_u64(1920, I64)
);
assert_eq!(
Some(ImmLogic {
value: 0x7ffe,
n: true,
r: 63,
s: 13
}),
ImmLogic::maybe_from_u64(0x7ffe, I64)
);
assert_eq!(
Some(ImmLogic {
value: 0x30000,
n: true,
r: 48,
s: 1
}),
ImmLogic::maybe_from_u64(0x30000, I64)
);
assert_eq!(
Some(ImmLogic {
value: 0x100000,
n: true,
r: 44,
s: 0
}),
ImmLogic::maybe_from_u64(0x100000, I64)
);
assert_eq!(
Some(ImmLogic {
value: u64::max_value() - 1,
n: true,
r: 63,
s: 62
}),
ImmLogic::maybe_from_u64(u64::max_value() - 1, I64)
);
assert_eq!(
Some(ImmLogic {
value: 0xaaaaaaaaaaaaaaaa,
n: false,
r: 1,
s: 60
}),
ImmLogic::maybe_from_u64(0xaaaaaaaaaaaaaaaa, I64)
);
assert_eq!(
Some(ImmLogic {
value: 0x8181818181818181,
n: false,
r: 1,
s: 49
}),
ImmLogic::maybe_from_u64(0x8181818181818181, I64)
);
assert_eq!(
Some(ImmLogic {
value: 0xffc3ffc3ffc3ffc3,
n: false,
r: 10,
s: 43
}),
ImmLogic::maybe_from_u64(0xffc3ffc3ffc3ffc3, I64)
);
assert_eq!(
Some(ImmLogic {
value: 0x100000001,
n: false,
r: 0,
s: 0
}),
ImmLogic::maybe_from_u64(0x100000001, I64)
);
assert_eq!(
Some(ImmLogic {
value: 0x1111111111111111,
n: false,
r: 0,
s: 56
}),
ImmLogic::maybe_from_u64(0x1111111111111111, I64)
);
for n in 0..2 {
let types = if n == 0 { vec![I64, I32] } else { vec![I64] };
for s in 0..64 {
for r in 0..64 {
let imm = get_logical_imm(n, s, r);
for &ty in &types {
match ImmLogic::maybe_from_u64(imm, ty) {
Some(ImmLogic { value, .. }) => {
assert_eq!(imm, value);
ImmLogic::maybe_from_u64(!value, ty).unwrap();
}
None => assert_eq!(0, imm),
};
}
}
}
}
}
// Repeat a value that has `width` bits, across a 64-bit value.
fn repeat(value: u64, width: u64) -> u64 {
let mut result = value & ((1 << width) - 1);
let mut i = width;
while i < 64 {
result |= result << i;
i *= 2;
}
result
}
// Get the logical immediate, from the encoding N/R/S bits.
fn get_logical_imm(n: u32, s: u32, r: u32) -> u64 {
// An integer is constructed from the n, imm_s and imm_r bits according to
// the following table:
//
// N imms immr size S R
// 1 ssssss rrrrrr 64 UInt(ssssss) UInt(rrrrrr)
// 0 0sssss xrrrrr 32 UInt(sssss) UInt(rrrrr)
// 0 10ssss xxrrrr 16 UInt(ssss) UInt(rrrr)
// 0 110sss xxxrrr 8 UInt(sss) UInt(rrr)
// 0 1110ss xxxxrr 4 UInt(ss) UInt(rr)
// 0 11110s xxxxxr 2 UInt(s) UInt(r)
// (s bits must not be all set)
//
// A pattern is constructed of size bits, where the least significant S+1
// bits are set. The pattern is rotated right by R, and repeated across a
// 64-bit value.
if n == 1 {
if s == 0x3f {
return 0;
}
let bits = (1u64 << (s + 1)) - 1;
bits.rotate_right(r)
} else {
if (s >> 1) == 0x1f {
return 0;
}
let mut width = 0x20;
while width >= 0x2 {
if (s & width) == 0 {
let mask = width - 1;
if (s & mask) == mask {
return 0;
}
let bits = (1u64 << ((s & mask) + 1)) - 1;
return repeat(bits.rotate_right(r & mask), width.into());
}
width >>= 1;
}
unreachable!();
}
}
}

File diff suppressed because it is too large

View File

@@ -0,0 +1,270 @@
//! AArch64 ISA definitions: registers.
use crate::isa::aarch64::inst::InstSize;
use crate::machinst::*;
use regalloc::{RealRegUniverse, Reg, RegClass, RegClassInfo, Writable, NUM_REG_CLASSES};
use std::string::{String, ToString};
//=============================================================================
// Registers, the Universe thereof, and printing
#[rustfmt::skip]
const XREG_INDICES: [u8; 31] = [
// X0 - X7
32, 33, 34, 35, 36, 37, 38, 39,
// X8 - X14
40, 41, 42, 43, 44, 45, 46,
// X15
59,
// X16, X17
47, 48,
// X18
60,
// X19 - X28
49, 50, 51, 52, 53, 54, 55, 56, 57, 58,
// X29
61,
// X30
62,
];
const ZERO_REG_INDEX: u8 = 63;
const SP_REG_INDEX: u8 = 64;
/// Get a reference to an X-register (integer register).
pub fn xreg(num: u8) -> Reg {
assert!(num < 31);
Reg::new_real(
RegClass::I64,
/* enc = */ num,
/* index = */ XREG_INDICES[num as usize],
)
}
/// Get a writable reference to an X-register.
pub fn writable_xreg(num: u8) -> Writable<Reg> {
Writable::from_reg(xreg(num))
}
/// Get a reference to a V-register (vector/FP register).
pub fn vreg(num: u8) -> Reg {
assert!(num < 32);
Reg::new_real(RegClass::V128, /* enc = */ num, /* index = */ num)
}
/// Get a writable reference to a V-register.
pub fn writable_vreg(num: u8) -> Writable<Reg> {
Writable::from_reg(vreg(num))
}
/// Get a reference to the zero-register.
pub fn zero_reg() -> Reg {
// This should be the same as what xreg(31) returns, except that
// we use the special index into the register index space.
Reg::new_real(
RegClass::I64,
/* enc = */ 31,
/* index = */ ZERO_REG_INDEX,
)
}
/// Get a writable reference to the zero-register (this discards a result).
pub fn writable_zero_reg() -> Writable<Reg> {
Writable::from_reg(zero_reg())
}
/// Get a reference to the stack-pointer register.
pub fn stack_reg() -> Reg {
// XSP (stack) and XZR (zero) are logically different registers which have
// the same hardware encoding, and whose meaning, in real aarch64
// instructions, is context-dependent. For convenience of
// universe-construction and for correct printing, we make them be two
// different real registers.
Reg::new_real(
RegClass::I64,
/* enc = */ 31,
/* index = */ SP_REG_INDEX,
)
}
/// Get a writable reference to the stack-pointer register.
pub fn writable_stack_reg() -> Writable<Reg> {
Writable::from_reg(stack_reg())
}
/// Get a reference to the link register (x30).
pub fn link_reg() -> Reg {
xreg(30)
}
/// Get a writable reference to the link register.
pub fn writable_link_reg() -> Writable<Reg> {
Writable::from_reg(link_reg())
}
/// Get a reference to the frame pointer (x29).
pub fn fp_reg() -> Reg {
xreg(29)
}
/// Get a writable reference to the frame pointer.
pub fn writable_fp_reg() -> Writable<Reg> {
Writable::from_reg(fp_reg())
}
/// Get a reference to the "spill temp" register. This register is used to
/// compute the address of a spill slot when a direct offset addressing mode from
/// FP is not sufficient (+/- 2^11 words). We exclude this register from regalloc
/// and reserve it for this purpose for simplicity; otherwise we need a
/// multi-stage analysis where we first determine how many spill slots we have,
/// then perhaps remove the reg from the pool and recompute regalloc.
pub fn spilltmp_reg() -> Reg {
xreg(15)
}
/// Get a writable reference to the spilltmp reg.
pub fn writable_spilltmp_reg() -> Writable<Reg> {
Writable::from_reg(spilltmp_reg())
}
/// Create the register universe for AArch64.
pub fn create_reg_universe() -> RealRegUniverse {
let mut regs = vec![];
let mut allocable_by_class = [None; NUM_REG_CLASSES];
// Numbering Scheme: we put V-regs first, then X-regs. The X-regs
// exclude several registers: x18 (globally reserved for platform-specific
// purposes), x29 (frame pointer), x30 (link register), x31 (stack pointer
// or zero register, depending on context).
let v_reg_base = 0u8; // in contiguous real-register index space
let v_reg_count = 32;
for i in 0u8..v_reg_count {
let reg = Reg::new_real(
RegClass::V128,
/* enc = */ i,
/* index = */ v_reg_base + i,
)
.to_real_reg();
let name = format!("v{}", i);
regs.push((reg, name));
}
let v_reg_last = v_reg_base + v_reg_count - 1;
// Add the X registers. N.B.: the order here must match the order implied
// by XREG_INDICES, ZERO_REG_INDEX, and SP_REG_INDEX above.
let x_reg_base = 32u8; // in contiguous real-register index space
let mut x_reg_count = 0;
for i in 0u8..32u8 {
// See above for excluded registers.
if i == 15 || i == 18 || i == 29 || i == 30 || i == 31 {
continue;
}
let reg = Reg::new_real(
RegClass::I64,
/* enc = */ i,
/* index = */ x_reg_base + x_reg_count,
)
.to_real_reg();
let name = format!("x{}", i);
regs.push((reg, name));
x_reg_count += 1;
}
let x_reg_last = x_reg_base + x_reg_count - 1;
allocable_by_class[RegClass::I64.rc_to_usize()] = Some(RegClassInfo {
first: x_reg_base as usize,
last: x_reg_last as usize,
suggested_scratch: Some(XREG_INDICES[13] as usize),
});
allocable_by_class[RegClass::V128.rc_to_usize()] = Some(RegClassInfo {
first: v_reg_base as usize,
last: v_reg_last as usize,
suggested_scratch: Some(/* V31: */ 31),
});
// Other regs, not available to the allocator.
let allocable = regs.len();
regs.push((xreg(15).to_real_reg(), "x15".to_string()));
regs.push((xreg(18).to_real_reg(), "x18".to_string()));
regs.push((fp_reg().to_real_reg(), "fp".to_string()));
regs.push((link_reg().to_real_reg(), "lr".to_string()));
regs.push((zero_reg().to_real_reg(), "xzr".to_string()));
regs.push((stack_reg().to_real_reg(), "sp".to_string()));
// FIXME JRS 2020Feb06: unfortunately this pushes the number of real regs
// to 65, which is potentially inconvenient from a compiler performance
// standpoint. We could possibly drop back to 64 by "losing" a vector
// register in future.
// Assert sanity: the indices in the register structs must match their
// actual indices in the array.
for (i, reg) in regs.iter().enumerate() {
assert_eq!(i, reg.0.get_index());
}
RealRegUniverse {
regs,
allocable,
allocable_by_class,
}
}
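// Hedged sanity sketch (not part of the original patch): spot-check that the universe
// built above agrees with the index constants at the top of this file. 32 V-regs plus
// 27 X-regs (x0-x30 minus x15, x18, x29, x30) are allocatable; the six special
// registers pushed afterwards are not, giving 65 real registers in total.
#[test]
fn reg_universe_layout_sketch() {
    let universe = create_reg_universe();
    assert_eq!(universe.allocable, 59);
    assert_eq!(universe.regs.len(), 65);
    assert_eq!(xreg(0).to_real_reg().get_index(), XREG_INDICES[0] as usize);
    assert_eq!(zero_reg().to_real_reg().get_index(), ZERO_REG_INDEX as usize);
    assert_eq!(stack_reg().to_real_reg().get_index(), SP_REG_INDEX as usize);
}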
/// If `ireg` denotes an I64-classed reg, make a best-effort attempt to show
/// its name at the 32-bit size.
pub fn show_ireg_sized(reg: Reg, mb_rru: Option<&RealRegUniverse>, size: InstSize) -> String {
let mut s = reg.show_rru(mb_rru);
if reg.get_class() != RegClass::I64 || !size.is32() {
// We can't do any better.
return s;
}
if reg.is_real() {
// Change (eg) "x42" into "w42" as appropriate
if reg.get_class() == RegClass::I64 && size.is32() && s.starts_with("x") {
s = "w".to_string() + &s[1..];
}
} else {
// Add a "w" suffix to RegClass::I64 vregs used in a 32-bit role
if reg.get_class() == RegClass::I64 && size.is32() {
s.push('w');
}
}
s
}
/// Show a vector register when its use as a 32-bit or 64-bit float is known.
pub fn show_freg_sized(reg: Reg, mb_rru: Option<&RealRegUniverse>, size: InstSize) -> String {
let mut s = reg.show_rru(mb_rru);
if reg.get_class() != RegClass::V128 {
return s;
}
let prefix = if size.is32() { "s" } else { "d" };
s.replace_range(0..1, prefix);
s
}
/// Show a vector register used in a scalar context.
pub fn show_vreg_scalar(reg: Reg, mb_rru: Option<&RealRegUniverse>) -> String {
let mut s = reg.show_rru(mb_rru);
if reg.get_class() != RegClass::V128 {
// We can't do any better.
return s;
}
if reg.is_real() {
// Change (eg) "v0" into "d0".
if reg.get_class() == RegClass::V128 && s.starts_with("v") {
s.replace_range(0..1, "d");
}
} else {
// Add a "d" suffix to RegClass::V128 vregs.
if reg.get_class() == RegClass::V128 {
s.push('d');
}
}
s
}

File diff suppressed because it is too large

View File

@@ -0,0 +1,220 @@
//! ARM 64-bit Instruction Set Architecture.
use crate::ir::Function;
use crate::isa::Builder as IsaBuilder;
use crate::machinst::{
compile, MachBackend, MachCompileResult, ShowWithRRU, TargetIsaAdapter, VCode,
};
use crate::result::CodegenResult;
use crate::settings;
use alloc::boxed::Box;
use regalloc::RealRegUniverse;
use target_lexicon::{Aarch64Architecture, Architecture, Triple};
// New backend:
mod abi;
mod inst;
mod lower;
use inst::create_reg_universe;
/// An AArch64 backend.
pub struct AArch64Backend {
triple: Triple,
flags: settings::Flags,
}
impl AArch64Backend {
/// Create a new AArch64 backend with the given (shared) flags.
pub fn new_with_flags(triple: Triple, flags: settings::Flags) -> AArch64Backend {
AArch64Backend { triple, flags }
}
fn compile_vcode(&self, func: &Function, flags: &settings::Flags) -> VCode<inst::Inst> {
// This performs lowering to VCode, register-allocates the code, computes
// block layout and finalizes branches. The result is ready for binary emission.
let abi = Box::new(abi::AArch64ABIBody::new(func));
compile::compile::<AArch64Backend>(func, self, abi, flags)
}
}
impl MachBackend for AArch64Backend {
fn compile_function(
&self,
func: &Function,
want_disasm: bool,
) -> CodegenResult<MachCompileResult> {
let flags = self.flags();
let vcode = self.compile_vcode(func, flags);
let sections = vcode.emit();
let frame_size = vcode.frame_size();
let disasm = if want_disasm {
Some(vcode.show_rru(Some(&create_reg_universe())))
} else {
None
};
Ok(MachCompileResult {
sections,
frame_size,
disasm,
})
}
fn name(&self) -> &'static str {
"aarch64"
}
fn triple(&self) -> Triple {
self.triple.clone()
}
fn flags(&self) -> &settings::Flags {
&self.flags
}
fn reg_universe(&self) -> RealRegUniverse {
create_reg_universe()
}
}
/// Create a new `isa::Builder`.
pub fn isa_builder(triple: Triple) -> IsaBuilder {
assert!(triple.architecture == Architecture::Aarch64(Aarch64Architecture::Aarch64));
IsaBuilder {
triple,
setup: settings::builder(),
constructor: |triple, shared_flags, _| {
let backend = AArch64Backend::new_with_flags(triple, shared_flags);
Box::new(TargetIsaAdapter::new(backend))
},
}
}
#[cfg(test)]
mod test {
use super::*;
use crate::cursor::{Cursor, FuncCursor};
use crate::ir::types::*;
use crate::ir::{AbiParam, ExternalName, Function, InstBuilder, Signature};
use crate::isa::CallConv;
use crate::settings;
use crate::settings::Configurable;
use core::str::FromStr;
use target_lexicon::Triple;
#[test]
fn test_compile_function() {
let name = ExternalName::testcase("test0");
let mut sig = Signature::new(CallConv::SystemV);
sig.params.push(AbiParam::new(I32));
sig.returns.push(AbiParam::new(I32));
let mut func = Function::with_name_signature(name, sig);
let bb0 = func.dfg.make_block();
let arg0 = func.dfg.append_block_param(bb0, I32);
let mut pos = FuncCursor::new(&mut func);
pos.insert_block(bb0);
let v0 = pos.ins().iconst(I32, 0x1234);
let v1 = pos.ins().iadd(arg0, v0);
pos.ins().return_(&[v1]);
let mut shared_flags = settings::builder();
shared_flags.set("opt_level", "none").unwrap();
let backend = AArch64Backend::new_with_flags(
Triple::from_str("aarch64").unwrap(),
settings::Flags::new(shared_flags),
);
let sections = backend.compile_function(&mut func, false).unwrap().sections;
let code = &sections.sections[0].data;
// stp x29, x30, [sp, #-16]!
// mov x29, sp
// mov x1, #0x1234
// add w0, w0, w1
// mov sp, x29
// ldp x29, x30, [sp], #16
// ret
let golden = vec![
0xfd, 0x7b, 0xbf, 0xa9, 0xfd, 0x03, 0x00, 0x91, 0x81, 0x46, 0x82, 0xd2, 0x00, 0x00,
0x01, 0x0b, 0xbf, 0x03, 0x00, 0x91, 0xfd, 0x7b, 0xc1, 0xa8, 0xc0, 0x03, 0x5f, 0xd6,
];
assert_eq!(code, &golden);
}
#[test]
fn test_branch_lowering() {
let name = ExternalName::testcase("test0");
let mut sig = Signature::new(CallConv::SystemV);
sig.params.push(AbiParam::new(I32));
sig.returns.push(AbiParam::new(I32));
let mut func = Function::with_name_signature(name, sig);
let bb0 = func.dfg.make_block();
let arg0 = func.dfg.append_block_param(bb0, I32);
let bb1 = func.dfg.make_block();
let bb2 = func.dfg.make_block();
let bb3 = func.dfg.make_block();
let mut pos = FuncCursor::new(&mut func);
pos.insert_block(bb0);
let v0 = pos.ins().iconst(I32, 0x1234);
let v1 = pos.ins().iadd(arg0, v0);
pos.ins().brnz(v1, bb1, &[]);
pos.ins().jump(bb2, &[]);
pos.insert_block(bb1);
pos.ins().brnz(v1, bb2, &[]);
pos.ins().jump(bb3, &[]);
pos.insert_block(bb2);
let v2 = pos.ins().iadd(v1, v0);
pos.ins().brnz(v2, bb2, &[]);
pos.ins().jump(bb1, &[]);
pos.insert_block(bb3);
let v3 = pos.ins().isub(v1, v0);
pos.ins().return_(&[v3]);
let mut shared_flags = settings::builder();
shared_flags.set("opt_level", "none").unwrap();
let backend = AArch64Backend::new_with_flags(
Triple::from_str("aarch64").unwrap(),
settings::Flags::new(shared_flags),
);
let result = backend
.compile_function(&mut func, /* want_disasm = */ false)
.unwrap();
let code = &result.sections.sections[0].data;
// stp x29, x30, [sp, #-16]!
// mov x29, sp
// mov x1, x0
// mov x0, #0x1234
// add w1, w1, w0
// mov w2, w1
// cbz x2, ...
// mov w2, w1
// cbz x2, ...
// sub w0, w1, w0
// mov sp, x29
// ldp x29, x30, [sp], #16
// ret
// add w2, w1, w0
// mov w2, w2
// cbnz x2, ... <---- compound branch (cond / uncond)
// b ... <----
let golden = vec![
0xfd, 0x7b, 0xbf, 0xa9, 0xfd, 0x03, 0x00, 0x91, 0xe1, 0x03, 0x00, 0xaa, 0x80, 0x46,
0x82, 0xd2, 0x21, 0x00, 0x00, 0x0b, 0xe2, 0x03, 0x01, 0x2a, 0xe2, 0x00, 0x00, 0xb4,
0xe2, 0x03, 0x01, 0x2a, 0xa2, 0x00, 0x00, 0xb5, 0x20, 0x00, 0x00, 0x4b, 0xbf, 0x03,
0x00, 0x91, 0xfd, 0x7b, 0xc1, 0xa8, 0xc0, 0x03, 0x5f, 0xd6, 0x22, 0x00, 0x00, 0x0b,
0xe2, 0x03, 0x02, 0x2a, 0xc2, 0xff, 0xff, 0xb5, 0xf7, 0xff, 0xff, 0x17,
];
assert_eq!(code, &golden);
}
}

View File

@@ -1,31 +0,0 @@
//! ARM 64 ABI implementation.
use super::registers::{FPR, GPR};
use crate::ir;
use crate::isa::RegClass;
use crate::regalloc::RegisterSet;
use crate::settings as shared_settings;
use alloc::borrow::Cow;
/// Legalize `sig`.
pub fn legalize_signature(
_sig: &mut Cow<ir::Signature>,
_flags: &shared_settings::Flags,
_current: bool,
) {
unimplemented!()
}
/// Get register class for a type appearing in a legalized signature.
pub fn regclass_for_abi_type(ty: ir::Type) -> RegClass {
if ty.is_int() {
GPR
} else {
FPR
}
}
/// Get the set of allocatable registers for `func`.
pub fn allocatable_registers(_func: &ir::Function) -> RegisterSet {
unimplemented!()
}

View File

@@ -1,8 +0,0 @@
//! Emitting binary ARM64 machine code.
use crate::binemit::{bad_encoding, CodeSink};
use crate::ir::{Function, Inst};
use crate::isa::TargetIsa;
use crate::regalloc::RegDiversions;
include!(concat!(env!("OUT_DIR"), "/binemit-arm64.rs"));

View File

@@ -1,10 +0,0 @@
//! Encoding tables for ARM64 ISA.
use crate::ir;
use crate::isa;
use crate::isa::constraints::*;
use crate::isa::enc_tables::*;
use crate::isa::encoding::RecipeSizing;
include!(concat!(env!("OUT_DIR"), "/encoding-arm64.rs"));
include!(concat!(env!("OUT_DIR"), "/legalize-arm64.rs"));

View File

@@ -1,132 +0,0 @@
//! ARM 64-bit Instruction Set Architecture.
mod abi;
mod binemit;
mod enc_tables;
mod registers;
pub mod settings;
use super::super::settings as shared_settings;
#[cfg(feature = "testing_hooks")]
use crate::binemit::CodeSink;
use crate::binemit::{emit_function, MemoryCodeSink};
use crate::ir;
use crate::isa::enc_tables::{lookup_enclist, Encodings};
use crate::isa::Builder as IsaBuilder;
use crate::isa::{EncInfo, RegClass, RegInfo, TargetIsa};
use crate::regalloc;
use alloc::borrow::Cow;
use alloc::boxed::Box;
use core::fmt;
use target_lexicon::Triple;
#[allow(dead_code)]
struct Isa {
triple: Triple,
shared_flags: shared_settings::Flags,
isa_flags: settings::Flags,
}
/// Get an ISA builder for creating ARM64 targets.
pub fn isa_builder(triple: Triple) -> IsaBuilder {
IsaBuilder {
triple,
setup: settings::builder(),
constructor: isa_constructor,
}
}
fn isa_constructor(
triple: Triple,
shared_flags: shared_settings::Flags,
builder: shared_settings::Builder,
) -> Box<dyn TargetIsa> {
Box::new(Isa {
triple,
isa_flags: settings::Flags::new(&shared_flags, builder),
shared_flags,
})
}
impl TargetIsa for Isa {
fn name(&self) -> &'static str {
"arm64"
}
fn triple(&self) -> &Triple {
&self.triple
}
fn flags(&self) -> &shared_settings::Flags {
&self.shared_flags
}
fn register_info(&self) -> RegInfo {
registers::INFO.clone()
}
fn encoding_info(&self) -> EncInfo {
enc_tables::INFO.clone()
}
fn legal_encodings<'a>(
&'a self,
func: &'a ir::Function,
inst: &'a ir::InstructionData,
ctrl_typevar: ir::Type,
) -> Encodings<'a> {
lookup_enclist(
ctrl_typevar,
inst,
func,
&enc_tables::LEVEL1_A64[..],
&enc_tables::LEVEL2[..],
&enc_tables::ENCLISTS[..],
&enc_tables::LEGALIZE_ACTIONS[..],
&enc_tables::RECIPE_PREDICATES[..],
&enc_tables::INST_PREDICATES[..],
self.isa_flags.predicate_view(),
)
}
fn legalize_signature(&self, sig: &mut Cow<ir::Signature>, current: bool) {
abi::legalize_signature(sig, &self.shared_flags, current)
}
fn regclass_for_abi_type(&self, ty: ir::Type) -> RegClass {
abi::regclass_for_abi_type(ty)
}
fn allocatable_registers(&self, func: &ir::Function) -> regalloc::RegisterSet {
abi::allocatable_registers(func)
}
#[cfg(feature = "testing_hooks")]
fn emit_inst(
&self,
func: &ir::Function,
inst: ir::Inst,
divert: &mut regalloc::RegDiversions,
sink: &mut dyn CodeSink,
) {
binemit::emit_inst(func, inst, divert, sink, self)
}
fn emit_function_to_memory(&self, func: &ir::Function, sink: &mut MemoryCodeSink) {
emit_function(func, binemit::emit_inst, sink, self)
}
fn unsigned_add_overflow_condition(&self) -> ir::condcodes::IntCC {
ir::condcodes::IntCC::UnsignedLessThan
}
fn unsigned_sub_overflow_condition(&self) -> ir::condcodes::IntCC {
ir::condcodes::IntCC::UnsignedGreaterThanOrEqual
}
}
impl fmt::Display for Isa {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{}\n{}", self.shared_flags, self.isa_flags)
}
}

View File

@@ -1,39 +0,0 @@
//! ARM64 register descriptions.
use crate::isa::registers::{RegBank, RegClass, RegClassData, RegInfo, RegUnit};
include!(concat!(env!("OUT_DIR"), "/registers-arm64.rs"));
#[cfg(test)]
mod tests {
use super::INFO;
use crate::isa::RegUnit;
use alloc::string::{String, ToString};
#[test]
fn unit_encodings() {
assert_eq!(INFO.parse_regunit("x0"), Some(0));
assert_eq!(INFO.parse_regunit("x31"), Some(31));
assert_eq!(INFO.parse_regunit("v0"), Some(32));
assert_eq!(INFO.parse_regunit("v31"), Some(63));
assert_eq!(INFO.parse_regunit("x32"), None);
assert_eq!(INFO.parse_regunit("v32"), None);
}
#[test]
fn unit_names() {
fn uname(ru: RegUnit) -> String {
INFO.display_regunit(ru).to_string()
}
assert_eq!(uname(0), "%x0");
assert_eq!(uname(1), "%x1");
assert_eq!(uname(31), "%x31");
assert_eq!(uname(32), "%v0");
assert_eq!(uname(33), "%v1");
assert_eq!(uname(63), "%v31");
assert_eq!(uname(64), "%nzcv");
assert_eq!(uname(65), "%INVALID65");
}
}

View File

@@ -1,9 +0,0 @@
//! ARM64 Settings.
use crate::settings::{self, detail, Builder};
use core::fmt;
// Include code generated by `cranelift-codegen/meta/src/gen_settings.rs`. This file contains a
// public `Flags` struct with an impl for all of the settings defined in
// `cranelift-codegen/meta/src/isa/arm64/mod.rs`.
include!(concat!(env!("OUT_DIR"), "/settings-arm64.rs"));

View File

@@ -48,6 +48,7 @@ pub use crate::isa::call_conv::CallConv;
pub use crate::isa::constraints::{
BranchRange, ConstraintKind, OperandConstraint, RecipeConstraints,
};
pub use crate::isa::enc_tables::Encodings;
pub use crate::isa::encoding::{base_size, EncInfo, Encoding};
pub use crate::isa::registers::{regs_overlap, RegClass, RegClassIndex, RegInfo, RegUnit};
pub use crate::isa::stack::{StackBase, StackBaseMask, StackRef};
@@ -55,9 +56,9 @@ pub use crate::isa::stack::{StackBase, StackBaseMask, StackRef};
use crate::binemit;
use crate::flowgraph;
use crate::ir;
use crate::isa::enc_tables::Encodings;
#[cfg(feature = "unwind")]
use crate::isa::fde::RegisterMappingError;
#[cfg(feature = "unwind")]
use crate::machinst::MachBackend;
use crate::regalloc;
use crate::result::CodegenResult;
use crate::settings;
@@ -83,7 +84,7 @@ pub mod fde;
mod arm32;
#[cfg(feature = "arm64")]
mod arm64;
mod aarch64;
mod call_conv;
mod constraints;
@@ -92,6 +93,9 @@ mod encoding;
pub mod registers;
mod stack;
#[cfg(test)]
mod test_utils;
/// Returns a builder that can create a corresponding `TargetIsa`
/// or `Err(LookupError::SupportDisabled)` if not enabled.
macro_rules! isa_builder {
@@ -116,7 +120,7 @@ pub fn lookup(triple: Triple) -> Result<Builder, LookupError> {
isa_builder!(x86, "x86", triple)
}
Architecture::Arm { .. } => isa_builder!(arm32, "arm32", triple),
Architecture::Aarch64 { .. } => isa_builder!(arm64, "arm64", triple),
Architecture::Aarch64 { .. } => isa_builder!(aarch64, "arm64", triple),
_ => Err(LookupError::Unsupported),
}
}
@@ -402,6 +406,11 @@ pub trait TargetIsa: fmt::Display + Send + Sync {
// No-op by default
Ok(())
}
/// Get the new-style MachBackend, if this is an adapter around one.
fn get_mach_backend(&self) -> Option<&dyn MachBackend> {
None
}
}
impl Debug for &dyn TargetIsa {

View File

@@ -0,0 +1,88 @@
// This is unused when no platforms with the new backend are enabled.
#![allow(dead_code)]
use crate::binemit::{Addend, CodeOffset, CodeSink, Reloc};
use crate::ir::Value;
use crate::ir::{ConstantOffset, ExternalName, Function, JumpTable, Opcode, SourceLoc, TrapCode};
use crate::isa::TargetIsa;
use alloc::vec::Vec;
use std::string::String;
pub struct TestCodeSink {
bytes: Vec<u8>,
}
impl TestCodeSink {
/// Create a new TestCodeSink.
pub fn new() -> TestCodeSink {
TestCodeSink { bytes: vec![] }
}
/// Return the code emitted to this sink as a hex string.
pub fn stringify(&self) -> String {
// This is pretty lame, but whatever ..
use std::fmt::Write;
let mut s = String::with_capacity(self.bytes.len() * 2);
for b in &self.bytes {
write!(&mut s, "{:02X}", b).unwrap();
}
s
}
}
impl CodeSink for TestCodeSink {
fn offset(&self) -> CodeOffset {
self.bytes.len() as CodeOffset
}
fn put1(&mut self, x: u8) {
self.bytes.push(x);
}
fn put2(&mut self, x: u16) {
self.bytes.push((x >> 0) as u8);
self.bytes.push((x >> 8) as u8);
}
fn put4(&mut self, mut x: u32) {
for _ in 0..4 {
self.bytes.push(x as u8);
x >>= 8;
}
}
fn put8(&mut self, mut x: u64) {
for _ in 0..8 {
self.bytes.push(x as u8);
x >>= 8;
}
}
fn reloc_block(&mut self, _rel: Reloc, _block_offset: CodeOffset) {}
fn reloc_external(
&mut self,
_srcloc: SourceLoc,
_rel: Reloc,
_name: &ExternalName,
_addend: Addend,
) {
}
fn reloc_constant(&mut self, _rel: Reloc, _constant_offset: ConstantOffset) {}
fn reloc_jt(&mut self, _rel: Reloc, _jt: JumpTable) {}
fn trap(&mut self, _code: TrapCode, _srcloc: SourceLoc) {}
fn begin_jumptables(&mut self) {}
fn begin_rodata(&mut self) {}
fn end_codegen(&mut self) {}
fn add_stackmap(&mut self, _val_list: &[Value], _func: &Function, _isa: &dyn TargetIsa) {}
fn add_call_site(&mut self, _opcode: Opcode, _srcloc: SourceLoc) {}
}
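// Hedged usage sketch (not part of the original patch): emit one AArch64 `ret`
// instruction word (0xd65f03c0) and read it back; bytes are stored little-endian and
// `stringify` renders them as uppercase hex in emission order.
#[test]
fn test_code_sink_sketch() {
    let mut sink = TestCodeSink::new();
    sink.put4(0xd65f03c0);
    assert_eq!(sink.offset(), 4);
    assert_eq!(sink.stringify(), "C0035FD6");
}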

View File

@@ -196,6 +196,55 @@ pub fn legalize_function(func: &mut ir::Function, cfg: &mut ControlFlowGraph, is
}
}
/// Perform a simple legalization by expansion of the function, without
/// platform-specific transforms.
pub fn simple_legalize(func: &mut ir::Function, cfg: &mut ControlFlowGraph, isa: &dyn TargetIsa) {
let mut pos = FuncCursor::new(func);
let func_begin = pos.position();
pos.set_position(func_begin);
while let Some(_block) = pos.next_block() {
let mut prev_pos = pos.position();
while let Some(inst) = pos.next_inst() {
let expanded = match pos.func.dfg[inst].opcode() {
ir::Opcode::BrIcmp
| ir::Opcode::GlobalValue
| ir::Opcode::HeapAddr
| ir::Opcode::StackLoad
| ir::Opcode::StackStore
| ir::Opcode::TableAddr
| ir::Opcode::Trapnz
| ir::Opcode::Trapz
| ir::Opcode::BandImm
| ir::Opcode::BorImm
| ir::Opcode::BxorImm
| ir::Opcode::IaddImm
| ir::Opcode::IfcmpImm
| ir::Opcode::ImulImm
| ir::Opcode::IrsubImm
| ir::Opcode::IshlImm
| ir::Opcode::RotlImm
| ir::Opcode::RotrImm
| ir::Opcode::SdivImm
| ir::Opcode::SremImm
| ir::Opcode::SshrImm
| ir::Opcode::UdivImm
| ir::Opcode::UremImm
| ir::Opcode::UshrImm
| ir::Opcode::IcmpImm => expand(inst, &mut pos.func, cfg, isa),
_ => false,
};
if expanded {
// Legalization implementations require fixpoint loop
// here. TODO: fix this.
pos.set_position(prev_pos);
} else {
prev_pos = pos.position();
}
}
}
}
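// Hedged standalone sketch (not part of the original patch) of the rewind-to-fixpoint
// idea used above: when an expansion fires, re-examine from the rewritten position so
// that newly introduced forms which themselves need expansion are not skipped. The
// rewrite rules here are arbitrary stand-ins for `expand`, operating on plain numbers.
#[test]
fn fixpoint_expansion_sketch() {
    fn rewrite(x: u32) -> Option<Vec<u32>> {
        match x {
            4 => Some(vec![2, 2]),
            2 => Some(vec![1, 1]),
            _ => None,
        }
    }
    let mut items = vec![4, 3];
    let mut i = 0;
    while i < items.len() {
        if let Some(replacement) = rewrite(items[i]) {
            // Do not advance `i`: re-check the newly inserted items, mirroring the
            // `set_position(prev_pos)` rewind in `simple_legalize` above.
            items.splice(i..=i, replacement);
        } else {
            i += 1;
        }
    }
    assert_eq!(items, vec![1, 1, 1, 1, 3]);
}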
// Include legalization patterns that were generated by `gen_legalizer.rs` from the
// `TransformGroup` in `cranelift-codegen/meta/shared/legalize.rs`.
//

View File

@@ -71,6 +71,7 @@ pub mod flowgraph;
pub mod ir;
pub mod isa;
pub mod loop_analysis;
pub mod machinst;
pub mod print_errors;
pub mod settings;
pub mod timing;
@@ -86,10 +87,12 @@ mod context;
mod dce;
mod divconst_magic_numbers;
mod fx;
mod inst_predicates;
mod iterators;
mod legalizer;
mod licm;
mod nan_canonicalization;
mod num_uses;
mod partition_slice;
mod postopt;
mod predicates;

View File

@@ -0,0 +1,149 @@
//! ABI definitions.
use crate::ir::StackSlot;
use crate::machinst::*;
use crate::settings;
use regalloc::{Reg, Set, SpillSlot, Writable};
/// Trait implemented by an object that tracks ABI-related state (e.g., stack
/// layout) and can generate code while emitting the *body* of a function.
pub trait ABIBody {
/// The instruction type for the ISA associated with this ABI.
type I: VCodeInst;
/// Get the liveins of the function.
fn liveins(&self) -> Set<RealReg>;
/// Get the liveouts of the function.
fn liveouts(&self) -> Set<RealReg>;
/// Number of arguments.
fn num_args(&self) -> usize;
/// Number of return values.
fn num_retvals(&self) -> usize;
/// Number of stack slots (not spill slots).
fn num_stackslots(&self) -> usize;
/// Generate an instruction which copies an argument to a destination
/// register.
fn gen_copy_arg_to_reg(&self, idx: usize, into_reg: Writable<Reg>) -> Self::I;
/// Generate an instruction which copies a source register to a return
/// value slot.
fn gen_copy_reg_to_retval(&self, idx: usize, from_reg: Reg) -> Self::I;
/// Generate a return instruction.
fn gen_ret(&self) -> Self::I;
/// Generate an epilogue placeholder. The returned instruction should return `true` from
/// `is_epilogue_placeholder()`; this is used to indicate to the lowering driver when
/// the epilogue should be inserted.
fn gen_epilogue_placeholder(&self) -> Self::I;
// -----------------------------------------------------------------
// Every function above this line may only be called pre-regalloc.
// Every function below this line may only be called post-regalloc.
// `spillslots()` must be called before any other post-regalloc
// function.
// ----------------------------------------------------------------
/// Update with the number of spillslots, post-regalloc.
fn set_num_spillslots(&mut self, slots: usize);
/// Update with the clobbered registers, post-regalloc.
fn set_clobbered(&mut self, clobbered: Set<Writable<RealReg>>);
/// Load from a stackslot.
fn load_stackslot(
&self,
slot: StackSlot,
offset: u32,
ty: Type,
into_reg: Writable<Reg>,
) -> Self::I;
/// Store to a stackslot.
fn store_stackslot(&self, slot: StackSlot, offset: u32, ty: Type, from_reg: Reg) -> Self::I;
/// Load from a spillslot.
fn load_spillslot(&self, slot: SpillSlot, ty: Type, into_reg: Writable<Reg>) -> Self::I;
/// Store to a spillslot.
fn store_spillslot(&self, slot: SpillSlot, ty: Type, from_reg: Reg) -> Self::I;
/// Generate a prologue, post-regalloc. This should include any stack
/// frame or other setup necessary to use the other methods (argument copies,
/// return-value copies, and stack-slot/spill-slot accesses). `self` is mutable so that we
/// can store information in it which will be useful when creating the
/// epilogue.
fn gen_prologue(&mut self, flags: &settings::Flags) -> Vec<Self::I>;
/// Generate an epilogue, post-regalloc. Note that this must generate the
/// actual return instruction (rather than emitting this in the lowering
/// logic), because the epilogue code comes before the return and the two are
/// likely closely related.
fn gen_epilogue(&self, flags: &settings::Flags) -> Vec<Self::I>;
/// Returns the full frame size for the given function, after prologue emission has run. This
/// comprises the spill space, incoming argument space, alignment padding, etc.
fn frame_size(&self) -> u32;
/// Get the spill-slot size.
fn get_spillslot_size(&self, rc: RegClass, ty: Type) -> u32;
/// Generate a spill.
fn gen_spill(&self, to_slot: SpillSlot, from_reg: RealReg, ty: Type) -> Self::I;
/// Generate a reload (fill).
fn gen_reload(&self, to_reg: Writable<RealReg>, from_slot: SpillSlot, ty: Type) -> Self::I;
}
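// Hedged sketch (not part of the original patch; all names below are illustrative) of
// the order in which the machine-independent driver is expected to consult an `ABIBody`
// once register allocation has finished. The real driver in `VCode` splices the
// prologue and epilogue around the epilogue placeholder; that detail is elided here.
#[allow(dead_code)] // illustration only
fn assemble_body_sketch<A: ABIBody>(
    abi: &mut A,
    spillslots: usize,
    clobbered: Set<Writable<RealReg>>,
    body: Vec<A::I>,
    flags: &settings::Flags,
) -> Vec<A::I> {
    abi.set_num_spillslots(spillslots);
    abi.set_clobbered(clobbered);
    let mut insts = abi.gen_prologue(flags);
    insts.extend(body);
    insts.extend(abi.gen_epilogue(flags));
    insts
}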
/// Trait implemented by an object that tracks ABI-related state and can
/// generate code while emitting a *call* to a function.
///
/// An instance of this trait returns information for a *particular*
/// callsite. It will usually be computed from the called function's
/// signature.
///
/// Unlike `ABIBody` above, methods on this trait are not invoked directly
/// by the machine-independent code. Rather, the machine-specific lowering
/// code will typically create an `ABICall` when creating machine instructions
/// for an IR call instruction inside `lower()`, directly emit the arg and
/// retval copies, and attach the register use/def info to the call.
///
/// This trait is thus provided for convenience to the backends.
pub trait ABICall {
/// The instruction type for the ISA associated with this ABI.
type I: VCodeInst;
/// Get the number of arguments expected.
fn num_args(&self) -> usize;
/// Copy an argument value from a source register, prior to the call.
fn gen_copy_reg_to_arg(&self, idx: usize, from_reg: Reg) -> Self::I;
/// Copy a return value into a destination register, after the call returns.
fn gen_copy_retval_to_reg(&self, idx: usize, into_reg: Writable<Reg>) -> Self::I;
/// Pre-adjust the stack, prior to argument copies and call.
fn gen_stack_pre_adjust(&self) -> Vec<Self::I>;
/// Post-adjust the stack, after call return and return-value copies.
fn gen_stack_post_adjust(&self) -> Vec<Self::I>;
/// Generate the call itself.
///
/// The returned instruction should have proper use- and def-sets according
/// to the argument registers, return-value registers, and clobbered
/// registers for this function signature in this ABI.
///
/// (Arg registers are uses, and retval registers are defs. Clobbered
/// registers are also logically defs, but should never be read; their
/// values are "defined" (to the regalloc) but "undefined" in every other
/// sense.)
fn gen_call(&self) -> Vec<Self::I>;
}
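// Hedged companion sketch (not part of the original patch): the call-site sequence the
// documentation above describes, as a backend's `lower()` might drive it. `LowerCtx` is
// the lowering-context trait from `lower.rs`; its visibility here, and all names below,
// are assumptions made for illustration.
#[allow(dead_code)] // illustration only
fn emit_call_sketch<C: LowerCtx<I = A::I>, A: ABICall>(
    ctx: &mut C,
    abi_call: &A,
    args: &[Reg],
    retvals: &[Writable<Reg>],
) {
    for inst in abi_call.gen_stack_pre_adjust() {
        ctx.emit(inst);
    }
    for (i, &arg) in args.iter().enumerate() {
        ctx.emit(abi_call.gen_copy_reg_to_arg(i, arg));
    }
    for inst in abi_call.gen_call() {
        ctx.emit(inst);
    }
    for (i, ret) in retvals.iter().cloned().enumerate() {
        ctx.emit(abi_call.gen_copy_retval_to_reg(i, ret));
    }
    for inst in abi_call.gen_stack_post_adjust() {
        ctx.emit(inst);
    }
}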

View File

@@ -0,0 +1,130 @@
//! Adapter for a `MachBackend` to implement the `TargetIsa` trait.
use crate::binemit;
use crate::ir;
use crate::isa::{EncInfo, Encoding, Encodings, Legalize, RegClass, RegInfo, TargetIsa};
use crate::machinst::*;
use crate::regalloc::RegisterSet;
use crate::settings::Flags;
#[cfg(feature = "testing_hooks")]
use crate::regalloc::RegDiversions;
use std::borrow::Cow;
use std::fmt;
use target_lexicon::Triple;
/// A wrapper around a `MachBackend` that provides a `TargetIsa` impl.
pub struct TargetIsaAdapter {
backend: Box<dyn MachBackend + Send + Sync + 'static>,
triple: Triple,
}
impl TargetIsaAdapter {
/// Create a new `TargetIsa` wrapper around a `MachBackend`.
pub fn new<B: MachBackend + Send + Sync + 'static>(backend: B) -> TargetIsaAdapter {
let triple = backend.triple();
TargetIsaAdapter {
backend: Box::new(backend),
triple,
}
}
}
impl fmt::Display for TargetIsaAdapter {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.debug_struct("MachBackend")
.field("name", &self.backend.name())
.field("triple", &self.backend.triple())
.field("flags", &format!("{}", self.backend.flags()))
.finish()
}
}
impl TargetIsa for TargetIsaAdapter {
fn name(&self) -> &'static str {
self.backend.name()
}
fn triple(&self) -> &Triple {
&self.triple
}
fn flags(&self) -> &Flags {
self.backend.flags()
}
fn register_info(&self) -> RegInfo {
// Called from function's Display impl, so we need a stub here.
RegInfo {
banks: &[],
classes: &[],
}
}
fn legal_encodings<'a>(
&'a self,
_func: &'a ir::Function,
_inst: &'a ir::InstructionData,
_ctrl_typevar: ir::Type,
) -> Encodings<'a> {
panic!("Should not be called when new-style backend is available!")
}
fn encode(
&self,
_func: &ir::Function,
_inst: &ir::InstructionData,
_ctrl_typevar: ir::Type,
) -> Result<Encoding, Legalize> {
panic!("Should not be called when new-style backend is available!")
}
fn encoding_info(&self) -> EncInfo {
panic!("Should not be called when new-style backend is available!")
}
fn legalize_signature(&self, _sig: &mut Cow<ir::Signature>, _current: bool) {
panic!("Should not be called when new-style backend is available!")
}
fn regclass_for_abi_type(&self, _ty: ir::Type) -> RegClass {
panic!("Should not be called when new-style backend is available!")
}
fn allocatable_registers(&self, _func: &ir::Function) -> RegisterSet {
panic!("Should not be called when new-style backend is available!")
}
fn prologue_epilogue(&self, _func: &mut ir::Function) -> CodegenResult<()> {
panic!("Should not be called when new-style backend is available!")
}
#[cfg(feature = "testing_hooks")]
fn emit_inst(
&self,
_func: &ir::Function,
_inst: ir::Inst,
_divert: &mut RegDiversions,
_sink: &mut dyn binemit::CodeSink,
) {
panic!("Should not be called when new-style backend is available!")
}
/// Emit a whole function into memory.
fn emit_function_to_memory(&self, _func: &ir::Function, _sink: &mut binemit::MemoryCodeSink) {
panic!("Should not be called when new-style backend is available!")
}
fn get_mach_backend(&self) -> Option<&dyn MachBackend> {
Some(&*self.backend)
}
fn unsigned_add_overflow_condition(&self) -> ir::condcodes::IntCC {
self.backend.unsigned_add_overflow_condition()
}
fn unsigned_sub_overflow_condition(&self) -> ir::condcodes::IntCC {
self.backend.unsigned_sub_overflow_condition()
}
}

View File

@@ -0,0 +1,59 @@
//! Computation of basic block order in emitted code.
use crate::machinst::*;
use regalloc::{BlockIx, Function};
/// Simple reverse postorder-based block order emission.
///
/// TODO: use a proper algorithm, such as the bottom-up straight-line-section
/// construction algorithm.
struct BlockRPO {
visited: Vec<bool>,
postorder: Vec<BlockIndex>,
deferred_last: Option<BlockIndex>,
}
impl BlockRPO {
fn new<I: VCodeInst>(vcode: &VCode<I>) -> BlockRPO {
BlockRPO {
visited: vec![false; vcode.num_blocks()],
postorder: vec![],
deferred_last: None,
}
}
fn visit<I: VCodeInst>(&mut self, vcode: &VCode<I>, block: BlockIndex) {
self.visited[block as usize] = true;
for succ in vcode.succs(block) {
if !self.visited[*succ as usize] {
self.visit(vcode, *succ);
}
}
for i in vcode.block_insns(BlockIx::new(block)) {
if vcode.get_insn(i).is_epilogue_placeholder() {
debug_assert!(self.deferred_last.is_none());
self.deferred_last = Some(block);
return;
}
}
self.postorder.push(block);
}
fn rpo(self) -> Vec<BlockIndex> {
let mut rpo = self.postorder;
rpo.reverse();
if let Some(block) = self.deferred_last {
rpo.push(block);
}
rpo
}
}
/// Compute the final block order.
pub fn compute_final_block_order<I: VCodeInst>(vcode: &VCode<I>) -> Vec<BlockIndex> {
let mut rpo = BlockRPO::new(vcode);
rpo.visit(vcode, vcode.entry());
rpo.rpo()
}
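// Hedged standalone sketch (not part of the original patch): the same reverse-postorder
// traversal that `BlockRPO` performs above, shown on a plain adjacency list so the
// resulting order is easy to see in isolation. The epilogue-deferral special case is
// omitted.
#[test]
fn rpo_sketch() {
    fn visit(succs: &[Vec<usize>], b: usize, seen: &mut Vec<bool>, post: &mut Vec<usize>) {
        seen[b] = true;
        for &s in &succs[b] {
            if !seen[s] {
                visit(succs, s, seen, post);
            }
        }
        post.push(b);
    }
    // Diamond CFG: 0 -> {1, 2}, 1 -> 3, 2 -> 3.
    let succs = vec![vec![1, 2], vec![3], vec![3], vec![]];
    let mut seen = vec![false; succs.len()];
    let mut post = vec![];
    visit(&succs, 0, &mut seen, &mut post);
    post.reverse();
    assert_eq!(post, vec![0, 2, 1, 3]);
}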

View File

@@ -0,0 +1,63 @@
//! Compilation backend pipeline: optimized IR to VCode / binemit.
use crate::ir::Function;
use crate::machinst::*;
use crate::settings;
use crate::timing;
use log::debug;
use regalloc::{allocate_registers, RegAllocAlgorithm};
/// Compile the given function down to VCode with allocated registers, ready
/// for binary emission.
pub fn compile<B: LowerBackend>(
f: &Function,
b: &B,
abi: Box<dyn ABIBody<I = B::MInst>>,
flags: &settings::Flags,
) -> VCode<B::MInst>
where
B::MInst: ShowWithRRU,
{
// This lowers the CL IR.
let mut vcode = Lower::new(f, abi).lower(b);
let universe = &B::MInst::reg_universe();
debug!("vcode from lowering: \n{}", vcode.show_rru(Some(universe)));
// Perform register allocation.
// TODO: select register allocation algorithm from flags.
let algorithm = RegAllocAlgorithm::Backtracking;
let result = {
let _tt = timing::regalloc();
allocate_registers(
&mut vcode, algorithm, universe, /*request_block_annotations=*/ false,
)
.map_err(|err| {
debug!(
"Register allocation error for vcode\n{}\nError: {:?}",
vcode.show_rru(Some(universe)),
err
);
err
})
.expect("register allocation")
};
// Reorder vcode into final order and copy out final instruction sequence
// all at once. This also inserts prologues/epilogues.
vcode.replace_insns_from_regalloc(result, flags);
vcode.remove_redundant_branches();
// Do final passes over code to finalize branches.
vcode.finalize_branches();
debug!(
"vcode after regalloc: final version:\n{}",
vcode.show_rru(Some(universe))
);
vcode
}

View File

@@ -0,0 +1,720 @@
//! This module implements lowering (instruction selection) from Cranelift IR
//! to machine instructions with virtual registers. This is *almost* the final
//! machine code, except for register allocation.
use crate::entity::SecondaryMap;
use crate::inst_predicates::has_side_effect;
use crate::ir::instructions::BranchInfo;
use crate::ir::{
Block, ExternalName, Function, GlobalValueData, Inst, InstructionData, MemFlags, Opcode,
Signature, SourceLoc, Type, Value, ValueDef,
};
use crate::machinst::{ABIBody, BlockIndex, VCode, VCodeBuilder, VCodeInst};
use crate::num_uses::NumUses;
use regalloc::{Reg, RegClass, Set, VirtualReg, Writable};
use alloc::boxed::Box;
use alloc::vec::Vec;
use log::debug;
use smallvec::SmallVec;
use std::collections::VecDeque;
/// A context that machine-specific lowering code can use to emit lowered instructions. This is the
/// view of the machine-independent per-function lowering context that is seen by the machine
/// backend.
pub trait LowerCtx {
/// The instruction type for which this lowering framework is instantiated.
type I;
/// Get the instdata for a given IR instruction.
fn data(&self, ir_inst: Inst) -> &InstructionData;
/// Get the controlling type for a polymorphic IR instruction.
fn ty(&self, ir_inst: Inst) -> Type;
/// Emit a machine instruction.
fn emit(&mut self, mach_inst: Self::I);
/// Indicate that an IR instruction has been merged, and so one of its
/// uses is gone (replaced by uses of the instruction's inputs). This
/// helps the lowering algorithm to perform on-the-fly DCE, skipping over
/// unused instructions (such as immediates incorporated directly).
fn merged(&mut self, from_inst: Inst);
/// Get the producing instruction, if any, and output number, for the `idx`th input to the
/// given IR instruction
fn input_inst(&self, ir_inst: Inst, idx: usize) -> Option<(Inst, usize)>;
/// Map a Value to its associated writable (probably virtual) Reg.
fn value_to_writable_reg(&self, val: Value) -> Writable<Reg>;
/// Map a Value to its associated (probably virtual) Reg.
fn value_to_reg(&self, val: Value) -> Reg;
/// Get the `idx`th input to the given IR instruction as a virtual register.
fn input(&self, ir_inst: Inst, idx: usize) -> Reg;
/// Get the `idx`th output of the given IR instruction as a virtual register.
fn output(&self, ir_inst: Inst, idx: usize) -> Writable<Reg>;
/// Get the number of inputs to the given IR instruction.
fn num_inputs(&self, ir_inst: Inst) -> usize;
/// Get the number of outputs to the given IR instruction.
fn num_outputs(&self, ir_inst: Inst) -> usize;
/// Get the type for an instruction's input.
fn input_ty(&self, ir_inst: Inst, idx: usize) -> Type;
/// Get the type for an instruction's output.
fn output_ty(&self, ir_inst: Inst, idx: usize) -> Type;
/// Get a new temp.
fn tmp(&mut self, rc: RegClass, ty: Type) -> Writable<Reg>;
/// Get the number of block params.
fn num_bb_params(&self, bb: Block) -> usize;
/// Get the register for a block param.
fn bb_param(&self, bb: Block, idx: usize) -> Reg;
/// Get the register for a return value.
fn retval(&self, idx: usize) -> Writable<Reg>;
/// Get the target for a call instruction, as an `ExternalName`.
fn call_target<'b>(&'b self, ir_inst: Inst) -> Option<&'b ExternalName>;
/// Get the signature for a call or call-indirect instruction.
fn call_sig<'b>(&'b self, ir_inst: Inst) -> Option<&'b Signature>;
/// Get the symbol name and offset for a symbol_value instruction.
fn symbol_value<'b>(&'b self, ir_inst: Inst) -> Option<(&'b ExternalName, i64)>;
/// Returns the memory flags of a given memory access.
fn memflags(&self, ir_inst: Inst) -> Option<MemFlags>;
/// Get the source location for a given instruction.
fn srcloc(&self, ir_inst: Inst) -> SourceLoc;
}
/// A machine backend.
pub trait LowerBackend {
/// The machine instruction type.
type MInst: VCodeInst;
/// Lower a single instruction. Instructions are lowered in reverse order.
/// This function need not handle branches; those are always passed to
/// `lower_branch_group` below.
fn lower<C: LowerCtx<I = Self::MInst>>(&self, ctx: &mut C, inst: Inst);
/// Lower a block-terminating group of branches (which together can be seen as one
/// N-way branch), given a vcode BlockIndex for each target.
fn lower_branch_group<C: LowerCtx<I = Self::MInst>>(
&self,
ctx: &mut C,
insts: &[Inst],
targets: &[BlockIndex],
fallthrough: Option<BlockIndex>,
);
}
/// Machine-independent lowering driver / machine-instruction container. Maintains a correspondence
/// from original Inst to MachInsts.
pub struct Lower<'a, I: VCodeInst> {
/// The function to lower.
f: &'a Function,
/// Lowered machine instructions.
vcode: VCodeBuilder<I>,
/// Number of active uses (minus `dec_use()` calls by backend) of each instruction.
num_uses: SecondaryMap<Inst, u32>,
/// Mapping from `Value` (SSA value in IR) to virtual register.
value_regs: SecondaryMap<Value, Reg>,
/// Return-value vregs.
retval_regs: Vec<Reg>,
/// Next virtual register number to allocate.
next_vreg: u32,
}
fn alloc_vreg(
value_regs: &mut SecondaryMap<Value, Reg>,
regclass: RegClass,
value: Value,
next_vreg: &mut u32,
) -> VirtualReg {
if value_regs[value].get_index() == 0 {
// default value in map.
let v = *next_vreg;
*next_vreg += 1;
value_regs[value] = Reg::new_virtual(regclass, v);
}
value_regs[value].as_virtual_reg().unwrap()
}
enum GenerateReturn {
Yes,
No,
}
impl<'a, I: VCodeInst> Lower<'a, I> {
/// Prepare a new lowering context for the given IR function.
pub fn new(f: &'a Function, abi: Box<dyn ABIBody<I = I>>) -> Lower<'a, I> {
let mut vcode = VCodeBuilder::new(abi);
let num_uses = NumUses::compute(f).take_uses();
let mut next_vreg: u32 = 1;
// Default register should never be seen, but the `value_regs` map needs a default and we
// don't want to push `Option` everywhere. All values will be assigned registers by the
// loops over block parameters and instruction results below.
//
// We do not use vreg 0 so that we can detect any unassigned register that leaks through.
let default_register = Reg::new_virtual(RegClass::I32, 0);
let mut value_regs = SecondaryMap::with_default(default_register);
// Assign a vreg to each value.
for bb in f.layout.blocks() {
for param in f.dfg.block_params(bb) {
let vreg = alloc_vreg(
&mut value_regs,
I::rc_for_type(f.dfg.value_type(*param)),
*param,
&mut next_vreg,
);
vcode.set_vreg_type(vreg, f.dfg.value_type(*param));
}
for inst in f.layout.block_insts(bb) {
for result in f.dfg.inst_results(inst) {
let vreg = alloc_vreg(
&mut value_regs,
I::rc_for_type(f.dfg.value_type(*result)),
*result,
&mut next_vreg,
);
vcode.set_vreg_type(vreg, f.dfg.value_type(*result));
}
}
}
// Assign a vreg to each return value.
let mut retval_regs = vec![];
for ret in &f.signature.returns {
let v = next_vreg;
next_vreg += 1;
let regclass = I::rc_for_type(ret.value_type);
let vreg = Reg::new_virtual(regclass, v);
retval_regs.push(vreg);
vcode.set_vreg_type(vreg.as_virtual_reg().unwrap(), ret.value_type);
}
Lower {
f,
vcode,
num_uses,
value_regs,
retval_regs,
next_vreg,
}
}
fn gen_arg_setup(&mut self) {
if let Some(entry_bb) = self.f.layout.entry_block() {
debug!(
"gen_arg_setup: entry BB {} args are:\n{:?}",
entry_bb,
self.f.dfg.block_params(entry_bb)
);
for (i, param) in self.f.dfg.block_params(entry_bb).iter().enumerate() {
let reg = Writable::from_reg(self.value_regs[*param]);
let insn = self.vcode.abi().gen_copy_arg_to_reg(i, reg);
self.vcode.push(insn);
}
}
}
fn gen_retval_setup(&mut self, gen_ret_inst: GenerateReturn) {
for (i, reg) in self.retval_regs.iter().enumerate() {
let insn = self.vcode.abi().gen_copy_reg_to_retval(i, *reg);
self.vcode.push(insn);
}
let inst = match gen_ret_inst {
GenerateReturn::Yes => self.vcode.abi().gen_ret(),
GenerateReturn::No => self.vcode.abi().gen_epilogue_placeholder(),
};
self.vcode.push(inst);
}
fn find_reachable_bbs(&self) -> SmallVec<[Block; 16]> {
if let Some(entry) = self.f.layout.entry_block() {
let mut ret = SmallVec::new();
let mut queue = VecDeque::new();
let mut visited = SecondaryMap::with_default(false);
queue.push_back(entry);
visited[entry] = true;
while !queue.is_empty() {
let b = queue.pop_front().unwrap();
ret.push(b);
let mut succs: SmallVec<[Block; 16]> = SmallVec::new();
for inst in self.f.layout.block_insts(b) {
if self.f.dfg[inst].opcode().is_branch() {
visit_branch_targets(self.f, b, inst, |succ| {
succs.push(succ);
});
}
}
for succ in succs.into_iter() {
if !visited[succ] {
queue.push_back(succ);
visited[succ] = true;
}
}
}
ret
} else {
SmallVec::new()
}
}
/// Lower the function.
pub fn lower<B: LowerBackend<MInst = I>>(mut self, backend: &B) -> VCode<I> {
// Find all reachable blocks.
let bbs = self.find_reachable_bbs();
// This records a Block-to-BlockIndex map so that branch targets can be resolved.
let mut next_bindex = self.vcode.init_bb_map(&bbs[..]);
// Allocate a separate BlockIndex for each control-flow instruction so that we can create
// the edge blocks later. Each entry for a control-flow inst is the edge block; the list
// has (control flow inst, edge block, orig block) tuples.
let mut edge_blocks_by_inst: SecondaryMap<Inst, Vec<BlockIndex>> =
SecondaryMap::with_default(vec![]);
let mut edge_blocks: Vec<(Inst, BlockIndex, Block)> = vec![];
debug!("about to lower function: {:?}", self.f);
debug!("bb map: {:?}", self.vcode.blocks_by_bb());
// Work backward (reverse block order, reverse through each block), skipping insns with zero
// uses.
for bb in bbs.iter().rev() {
for inst in self.f.layout.block_insts(*bb) {
let op = self.f.dfg[inst].opcode();
if op.is_branch() {
// Find the original target.
let mut add_succ = |next_bb| {
let edge_block = next_bindex;
next_bindex += 1;
edge_blocks_by_inst[inst].push(edge_block);
edge_blocks.push((inst, edge_block, next_bb));
};
visit_branch_targets(self.f, *bb, inst, |succ| {
add_succ(succ);
});
}
}
}
for bb in bbs.iter() {
debug!("lowering bb: {}", bb);
// If this is a return block, produce the return value setup. N.B.: this comes
// *before* the below because it must occur *after* any other instructions, and
// instructions are lowered in reverse order.
let last_insn = self.f.layout.block_insts(*bb).last().unwrap();
let last_insn_opcode = self.f.dfg[last_insn].opcode();
if last_insn_opcode.is_return() {
let gen_ret = if last_insn_opcode == Opcode::Return {
GenerateReturn::Yes
} else {
debug_assert!(last_insn_opcode == Opcode::FallthroughReturn);
GenerateReturn::No
};
self.gen_retval_setup(gen_ret);
self.vcode.end_ir_inst();
}
// Find the branches at the end first, and process those, if any.
let mut branches: SmallVec<[Inst; 2]> = SmallVec::new();
let mut targets: SmallVec<[BlockIndex; 2]> = SmallVec::new();
for inst in self.f.layout.block_insts(*bb).rev() {
debug!("lower: inst {}", inst);
if edge_blocks_by_inst[inst].len() > 0 {
branches.push(inst);
for target in edge_blocks_by_inst[inst].iter().rev().cloned() {
targets.push(target);
}
} else {
// We've reached the end of the branches -- process all as a group, first.
if branches.len() > 0 {
let fallthrough = self.f.layout.next_block(*bb);
let fallthrough = fallthrough.map(|bb| self.vcode.bb_to_bindex(bb));
branches.reverse();
targets.reverse();
debug!(
"lower_branch_group: targets = {:?} branches = {:?}",
targets, branches
);
backend.lower_branch_group(
&mut self,
&branches[..],
&targets[..],
fallthrough,
);
self.vcode.end_ir_inst();
branches.clear();
targets.clear();
}
// Only codegen an instruction if it either has a side
// effect, or has at least one use of one of its results.
let num_uses = self.num_uses[inst];
let side_effect = has_side_effect(self.f, inst);
if side_effect || num_uses > 0 {
backend.lower(&mut self, inst);
self.vcode.end_ir_inst();
} else {
// If we're skipping the instruction, we need to dec-ref
// its arguments.
for arg in self.f.dfg.inst_args(inst) {
let val = self.f.dfg.resolve_aliases(*arg);
match self.f.dfg.value_def(val) {
ValueDef::Result(src_inst, _) => {
self.dec_use(src_inst);
}
_ => {}
}
}
}
}
}
// There are possibly some branches left if the block contained only branches.
if branches.len() > 0 {
let fallthrough = self.f.layout.next_block(*bb);
let fallthrough = fallthrough.map(|bb| self.vcode.bb_to_bindex(bb));
branches.reverse();
targets.reverse();
debug!(
"lower_branch_group: targets = {:?} branches = {:?}",
targets, branches
);
backend.lower_branch_group(&mut self, &branches[..], &targets[..], fallthrough);
self.vcode.end_ir_inst();
branches.clear();
targets.clear();
}
// If this is the entry block, produce the argument setup.
if Some(*bb) == self.f.layout.entry_block() {
self.gen_arg_setup();
self.vcode.end_ir_inst();
}
let vcode_bb = self.vcode.end_bb();
debug!("finished building bb: BlockIndex {}", vcode_bb);
debug!("bb_to_bindex map says: {}", self.vcode.bb_to_bindex(*bb));
assert!(vcode_bb == self.vcode.bb_to_bindex(*bb));
if Some(*bb) == self.f.layout.entry_block() {
self.vcode.set_entry(vcode_bb);
}
}
// Now create the edge blocks, with phi lowering (block parameter copies).
for (inst, edge_block, orig_block) in edge_blocks.into_iter() {
debug!(
"creating edge block: inst {}, edge_block {}, orig_block {}",
inst, edge_block, orig_block
);
// Create a temporary for each block parameter.
let phi_classes: Vec<(Type, RegClass)> = self
.f
.dfg
.block_params(orig_block)
.iter()
.map(|p| self.f.dfg.value_type(*p))
.map(|ty| (ty, I::rc_for_type(ty)))
.collect();
// FIXME sewardj 2020Feb29: use SmallVec
let mut src_regs = vec![];
let mut dst_regs = vec![];
// Round up all the source (jump-arg) and destination (block-param) registers
// for the phi copies.
for (i, arg) in self.f.dfg.inst_variable_args(inst).iter().enumerate() {
let arg = self.f.dfg.resolve_aliases(*arg);
debug!("jump arg {} is {}", i, arg);
src_regs.push(self.value_regs[arg]);
}
for (i, param) in self.f.dfg.block_params(orig_block).iter().enumerate() {
debug!("bb arg {} is {}", i, param);
dst_regs.push(Writable::from_reg(self.value_regs[*param]));
}
debug_assert!(src_regs.len() == dst_regs.len());
debug_assert!(phi_classes.len() == dst_regs.len());
// If, as is mostly the case, the source and destination register
// sets are non-overlapping, then we can copy directly, so as to
// save the register allocator work.
if !Set::<Reg>::from_vec(src_regs.clone()).intersects(&Set::<Reg>::from_vec(
dst_regs.iter().map(|r| r.to_reg()).collect(),
)) {
for (dst_reg, (src_reg, (ty, _))) in
dst_regs.iter().zip(src_regs.iter().zip(phi_classes))
{
self.vcode.push(I::gen_move(*dst_reg, *src_reg, ty));
}
} else {
// There's some overlap, so play safe and copy via temps.
let tmp_regs: Vec<Writable<Reg>> = phi_classes
.iter()
.map(|&(ty, rc)| self.tmp(rc, ty)) // borrows `self` mutably.
.collect();
debug!("phi_temps = {:?}", tmp_regs);
debug_assert!(tmp_regs.len() == src_regs.len());
for (tmp_reg, (src_reg, &(ty, _))) in
tmp_regs.iter().zip(src_regs.iter().zip(phi_classes.iter()))
{
self.vcode.push(I::gen_move(*tmp_reg, *src_reg, ty));
}
for (dst_reg, (tmp_reg, &(ty, _))) in
dst_regs.iter().zip(tmp_regs.iter().zip(phi_classes.iter()))
{
self.vcode.push(I::gen_move(*dst_reg, tmp_reg.to_reg(), ty));
}
}
// Create the unconditional jump to the original target block.
self.vcode
.push(I::gen_jump(self.vcode.bb_to_bindex(orig_block)));
// End the IR inst and block. (We lower this as if it were one IR instruction so that
// we can emit machine instructions in forward order.)
self.vcode.end_ir_inst();
let blocknum = self.vcode.end_bb();
assert!(blocknum == edge_block);
}
// Now that we've emitted all instructions into the VCodeBuilder, let's build the VCode.
self.vcode.build()
}
/// Reduce the use-count of an IR instruction. Use this when, e.g., isel incorporates the
/// computation of an input instruction directly, so that the input instruction has one
/// fewer use.
fn dec_use(&mut self, ir_inst: Inst) {
assert!(self.num_uses[ir_inst] > 0);
self.num_uses[ir_inst] -= 1;
debug!(
"incref: ir_inst {} now has {} uses",
ir_inst, self.num_uses[ir_inst]
);
}
/// Increase the use-count of an IR instruction. Use this when, e.g., isel incorporates
/// the computation of an input instruction directly, so that input instruction's
/// inputs are now used directly by the merged instruction.
fn inc_use(&mut self, ir_inst: Inst) {
self.num_uses[ir_inst] += 1;
debug!(
"decref: ir_inst {} now has {} uses",
ir_inst, self.num_uses[ir_inst]
);
}
}
impl<'a, I: VCodeInst> LowerCtx for Lower<'a, I> {
type I = I;
/// Get the instdata for a given IR instruction.
fn data(&self, ir_inst: Inst) -> &InstructionData {
&self.f.dfg[ir_inst]
}
/// Get the controlling type for a polymorphic IR instruction.
fn ty(&self, ir_inst: Inst) -> Type {
self.f.dfg.ctrl_typevar(ir_inst)
}
/// Emit a machine instruction.
fn emit(&mut self, mach_inst: I) {
self.vcode.push(mach_inst);
}
/// Indicate that a merge has occurred.
fn merged(&mut self, from_inst: Inst) {
debug!("merged: inst {}", from_inst);
// First, inc-ref all inputs of `from_inst`, because they are now used
// directly by `into_inst`.
for arg in self.f.dfg.inst_args(from_inst) {
let arg = self.f.dfg.resolve_aliases(*arg);
match self.f.dfg.value_def(arg) {
ValueDef::Result(src_inst, _) => {
debug!(" -> inc-reffing src inst {}", src_inst);
self.inc_use(src_inst);
}
_ => {}
}
}
// Then, dec-ref the merged instruction itself. It still retains references
// to its arguments (inc-ref'd above). If its refcount has reached zero,
// it will be skipped during emission and its args will be dec-ref'd at that
// time.
self.dec_use(from_inst);
}
/// Get the producing instruction, if any, and output number, for the `idx`th input to the
/// given IR instruction.
fn input_inst(&self, ir_inst: Inst, idx: usize) -> Option<(Inst, usize)> {
let val = self.f.dfg.inst_args(ir_inst)[idx];
let val = self.f.dfg.resolve_aliases(val);
match self.f.dfg.value_def(val) {
ValueDef::Result(src_inst, result_idx) => Some((src_inst, result_idx)),
_ => None,
}
}
/// Map a Value to its associated writable (probably virtual) Reg.
fn value_to_writable_reg(&self, val: Value) -> Writable<Reg> {
let val = self.f.dfg.resolve_aliases(val);
Writable::from_reg(self.value_regs[val])
}
/// Map a Value to its associated (probably virtual) Reg.
fn value_to_reg(&self, val: Value) -> Reg {
let val = self.f.dfg.resolve_aliases(val);
self.value_regs[val]
}
/// Get the `idx`th input to the given IR instruction as a virtual register.
fn input(&self, ir_inst: Inst, idx: usize) -> Reg {
let val = self.f.dfg.inst_args(ir_inst)[idx];
let val = self.f.dfg.resolve_aliases(val);
self.value_to_reg(val)
}
/// Get the `idx`th output of the given IR instruction as a virtual register.
fn output(&self, ir_inst: Inst, idx: usize) -> Writable<Reg> {
let val = self.f.dfg.inst_results(ir_inst)[idx];
self.value_to_writable_reg(val)
}
/// Get a new temp.
fn tmp(&mut self, rc: RegClass, ty: Type) -> Writable<Reg> {
let v = self.next_vreg;
self.next_vreg += 1;
let vreg = Reg::new_virtual(rc, v);
self.vcode.set_vreg_type(vreg.as_virtual_reg().unwrap(), ty);
Writable::from_reg(vreg)
}
/// Get the number of inputs for the given IR instruction.
fn num_inputs(&self, ir_inst: Inst) -> usize {
self.f.dfg.inst_args(ir_inst).len()
}
/// Get the number of outputs for the given IR instruction.
fn num_outputs(&self, ir_inst: Inst) -> usize {
self.f.dfg.inst_results(ir_inst).len()
}
/// Get the type for an instruction's input.
fn input_ty(&self, ir_inst: Inst, idx: usize) -> Type {
let val = self.f.dfg.inst_args(ir_inst)[idx];
let val = self.f.dfg.resolve_aliases(val);
self.f.dfg.value_type(val)
}
/// Get the type for an instruction's output.
fn output_ty(&self, ir_inst: Inst, idx: usize) -> Type {
self.f.dfg.value_type(self.f.dfg.inst_results(ir_inst)[idx])
}
/// Get the number of block params.
fn num_bb_params(&self, bb: Block) -> usize {
self.f.dfg.block_params(bb).len()
}
/// Get the register for a block param.
fn bb_param(&self, bb: Block, idx: usize) -> Reg {
let val = self.f.dfg.block_params(bb)[idx];
self.value_regs[val]
}
/// Get the register for a return value.
fn retval(&self, idx: usize) -> Writable<Reg> {
Writable::from_reg(self.retval_regs[idx])
}
/// Get the target for a call instruction, as an `ExternalName`.
fn call_target<'b>(&'b self, ir_inst: Inst) -> Option<&'b ExternalName> {
match &self.f.dfg[ir_inst] {
&InstructionData::Call { func_ref, .. }
| &InstructionData::FuncAddr { func_ref, .. } => {
let funcdata = &self.f.dfg.ext_funcs[func_ref];
Some(&funcdata.name)
}
_ => None,
}
}
/// Get the signature for a call or call-indirect instruction.
fn call_sig<'b>(&'b self, ir_inst: Inst) -> Option<&'b Signature> {
match &self.f.dfg[ir_inst] {
&InstructionData::Call { func_ref, .. } => {
let funcdata = &self.f.dfg.ext_funcs[func_ref];
Some(&self.f.dfg.signatures[funcdata.signature])
}
&InstructionData::CallIndirect { sig_ref, .. } => Some(&self.f.dfg.signatures[sig_ref]),
_ => None,
}
}
/// Get the symbol name and offset for a symbol_value instruction.
fn symbol_value<'b>(&'b self, ir_inst: Inst) -> Option<(&'b ExternalName, i64)> {
match &self.f.dfg[ir_inst] {
&InstructionData::UnaryGlobalValue { global_value, .. } => {
let gvdata = &self.f.global_values[global_value];
match gvdata {
&GlobalValueData::Symbol {
ref name,
ref offset,
..
} => {
let offset = offset.bits();
Some((name, offset))
}
_ => None,
}
}
_ => None,
}
}
/// Returns the memory flags of a given memory access.
fn memflags(&self, ir_inst: Inst) -> Option<MemFlags> {
match &self.f.dfg[ir_inst] {
&InstructionData::Load { flags, .. }
| &InstructionData::LoadComplex { flags, .. }
| &InstructionData::Store { flags, .. }
| &InstructionData::StoreComplex { flags, .. } => Some(flags),
_ => None,
}
}
/// Get the source location for a given instruction.
fn srcloc(&self, ir_inst: Inst) -> SourceLoc {
self.f.srclocs[ir_inst]
}
}
fn visit_branch_targets<F: FnMut(Block)>(f: &Function, block: Block, inst: Inst, mut visit: F) {
if f.dfg[inst].opcode() == Opcode::Fallthrough {
visit(f.layout.next_block(block).unwrap());
} else {
match f.dfg[inst].analyze_branch(&f.dfg.value_lists) {
BranchInfo::NotABranch => {}
BranchInfo::SingleDest(dest, _) => {
visit(dest);
}
BranchInfo::Table(table, maybe_dest) => {
if let Some(dest) = maybe_dest {
visit(dest);
}
for &dest in f.jump_tables[table].as_slice() {
visit(dest);
}
}
}
}
}

View File

@@ -0,0 +1,280 @@
//! This module exposes the machine-specific backend definition pieces.
//!
//! The MachInst infrastructure is the compiler backend, from CLIF
//! (ir::Function) to machine code. The purpose of this infrastructure is, at a
//! high level, to do instruction selection/lowering (to machine instructions),
//! register allocation, and then perform all the fixups to branches, constant
//! data references, etc., needed to actually generate machine code.
//!
//! The container for machine instructions, at various stages of construction,
//! is the `VCode` struct. We refer to a sequence of machine instructions organized
//! into basic blocks as "vcode". This is short for "virtual-register code", though
//! it's a bit of a misnomer because near the end of the pipeline, vcode has all
//! real registers. Nevertheless, the name is catchy and we like it.
//!
//! The compilation pipeline, from an `ir::Function` (already optimized as much as
//! you like by machine-independent optimization passes) onward, is as follows.
//! (N.B.: though we show the VCode separately at each stage, the passes
//! mutate the VCode in place; these are not separate copies of the code.)
//!
//! ```plain
//!
//! ir::Function (SSA IR, machine-independent opcodes)
//! |
//! | [lower]
//! |
//! VCode<arch_backend::Inst> (machine instructions:
//! | - mostly virtual registers.
//! | - cond branches in two-target form.
//! | - branch targets are block indices.
//! | - in-memory constants held by insns,
//! | with unknown offsets.
//! | - critical edges (actually all edges)
//! | are split.)
//! | [regalloc]
//! |
//! VCode<arch_backend::Inst> (machine instructions:
//! | - all real registers.
//! | - new instruction sequence returned
//! | out-of-band in RegAllocResult.
//! | - instruction sequence has spills,
//! | reloads, and moves inserted.
//! | - other invariants same as above.)
//! |
//! | [preamble/postamble]
//! |
//! VCode<arch_backend::Inst> (machine instructions:
//! | - stack-frame size known.
//! | - out-of-band instruction sequence
//! | has preamble prepended to entry
//! | block, and postamble injected before
//! | every return instruction.
//! | - all symbolic stack references to
//! | stackslots and spillslots are resolved
//! | to concrete FP-offset mem addresses.)
//! | [block/insn ordering]
//! |
//! VCode<arch_backend::Inst> (machine instructions:
//! | - vcode.final_block_order is filled in.
//! | - new insn sequence from regalloc is
//! | placed back into vcode and block
//! | boundaries are updated.)
//! | [redundant branch/block
//! | removal]
//! |
//! VCode<arch_backend::Inst> (machine instructions:
//! | - all blocks that were just an
//! | unconditional branch are removed.)
//! |
//! | [branch finalization
//! | (fallthroughs)]
//! |
//! VCode<arch_backend::Inst> (machine instructions:
//! | - all branches are in lowered one-
//! | target form, but targets are still
//! | block indices.)
//! |
//! | [branch finalization
//! | (offsets)]
//! |
//! VCode<arch_backend::Inst> (machine instructions:
//! | - all branch offsets from start of
//! | function are known, and all branches
//! | have resolved-offset targets.)
//! |
//! | [MemArg finalization]
//! |
//! VCode<arch_backend::Inst> (machine instructions:
//! | - all MemArg references to the constant
//! | pool are replaced with offsets.
//! | - all constant-pool data is collected
//! | in the VCode.)
//! |
//! | [binary emission]
//! |
//! Vec<u8> (machine code!)
//!
//! ```
use crate::binemit::{CodeInfo, CodeOffset};
use crate::entity::SecondaryMap;
use crate::ir::condcodes::IntCC;
use crate::ir::{Function, Type};
use crate::result::CodegenResult;
use crate::settings::Flags;
use alloc::boxed::Box;
use alloc::vec::Vec;
use core::fmt::Debug;
use regalloc::Map as RegallocMap;
use regalloc::RegUsageCollector;
use regalloc::{RealReg, RealRegUniverse, Reg, RegClass, SpillSlot, VirtualReg, Writable};
use std::string::String;
use target_lexicon::Triple;
pub mod lower;
pub use lower::*;
pub mod vcode;
pub use vcode::*;
pub mod compile;
pub use compile::*;
pub mod blockorder;
pub use blockorder::*;
pub mod abi;
pub use abi::*;
pub mod pretty_print;
pub use pretty_print::*;
pub mod sections;
pub use sections::*;
pub mod adapter;
pub use adapter::*;
/// A machine instruction.
pub trait MachInst: Clone + Debug {
/// Return the registers referenced by this machine instruction along with
/// the modes of reference (use, def, modify).
fn get_regs(&self, collector: &mut RegUsageCollector);
/// Map virtual registers to physical registers using the given virt->phys
/// maps corresponding to the program points prior to, and after, this instruction.
fn map_regs(
&mut self,
pre_map: &RegallocMap<VirtualReg, RealReg>,
post_map: &RegallocMap<VirtualReg, RealReg>,
);
/// If this is a simple move, return the (source, destination) tuple of registers.
fn is_move(&self) -> Option<(Writable<Reg>, Reg)>;
/// Is this a terminator (branch or ret)? If so, return its type
/// (ret/uncond/cond) and target if applicable.
fn is_term<'a>(&'a self) -> MachTerminator<'a>;
/// Returns true if the instruction is an epilogue placeholder.
fn is_epilogue_placeholder(&self) -> bool;
/// Generate a move.
fn gen_move(to_reg: Writable<Reg>, from_reg: Reg, ty: Type) -> Self;
/// Generate a zero-length no-op.
fn gen_zero_len_nop() -> Self;
/// Possibly operate on a value directly in a spill-slot rather than a
/// register. Useful if the machine has register-memory instruction forms
/// (e.g., add directly from or directly to memory), like x86.
fn maybe_direct_reload(&self, reg: VirtualReg, slot: SpillSlot) -> Option<Self>;
/// Determine a register class to store the given Cranelift type.
fn rc_for_type(ty: Type) -> RegClass;
/// Generate a jump to another target. Used during lowering of
/// control flow.
fn gen_jump(target: BlockIndex) -> Self;
/// Generate a NOP. The `preferred_size` parameter allows the caller to
/// request a NOP of that size, or as close to it as possible. The machine
/// backend may return a NOP whose binary encoding is smaller than the
/// preferred size, but must not return a NOP that is larger. However,
/// the instruction must have a nonzero size.
fn gen_nop(preferred_size: usize) -> Self;
/// Rewrite block targets using the block-target map.
fn with_block_rewrites(&mut self, block_target_map: &[BlockIndex]);
/// Finalize branches once the block order (fallthrough) is known.
fn with_fallthrough_block(&mut self, fallthrough_block: Option<BlockIndex>);
/// Update instruction once block offsets are known. These offsets are
/// relative to the beginning of the function. `targets` is indexed by
/// BlockIndex.
fn with_block_offsets(&mut self, my_offset: CodeOffset, targets: &[CodeOffset]);
/// Get the register universe for this backend.
fn reg_universe() -> RealRegUniverse;
/// Align a basic block offset (from start of function). By default, no
/// alignment occurs.
fn align_basic_block(offset: CodeOffset) -> CodeOffset {
offset
}
}
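// Illustrative sketch, not part of this change: a backend wanting 16-byte
// basic-block alignment could implement `align_basic_block` roughly as below
// (shown as a free function here; a real backend would provide it in its
// `MachInst` impl).
#[allow(dead_code)]
fn example_align_basic_block_to_16(offset: CodeOffset) -> CodeOffset {
    // Round up to the next multiple of 16.
    (offset + 15) & !15
}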
/// Describes a block terminator (not call) in the vcode, when its branches
/// have not yet been finalized (so a branch may have two targets).
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum MachTerminator<'a> {
/// Not a terminator.
None,
/// A return instruction.
Ret,
/// An unconditional branch to another block.
Uncond(BlockIndex),
/// A conditional branch to one of two other blocks.
Cond(BlockIndex, BlockIndex),
/// An indirect branch with known possible targets.
Indirect(&'a [BlockIndex]),
}
/// A trait describing the ability to encode a MachInst into binary machine code.
pub trait MachInstEmit<O: MachSectionOutput> {
/// Emit the instruction.
fn emit(&self, code: &mut O);
}
/// The result of a `MachBackend::compile_function()` call. Contains machine
/// code (as bytes) and a disassembly, if requested.
pub struct MachCompileResult {
/// Machine code.
pub sections: MachSections,
/// Size of stack frame, in bytes.
pub frame_size: u32,
/// Disassembly, if requested.
pub disasm: Option<String>,
}
impl MachCompileResult {
/// Get a `CodeInfo` describing section sizes from this compilation result.
pub fn code_info(&self) -> CodeInfo {
let code_size = self.sections.total_size();
CodeInfo {
code_size,
jumptables_size: 0,
rodata_size: 0,
total_size: code_size,
}
}
}
/// Top-level machine backend trait, which wraps all monomorphized code and
/// allows a virtual call from the machine-independent `Function::compile()`.
pub trait MachBackend {
/// Compile the given function.
fn compile_function(
&self,
func: &Function,
want_disasm: bool,
) -> CodegenResult<MachCompileResult>;
/// Return flags for this backend.
fn flags(&self) -> &Flags;
/// Return triple for this backend.
fn triple(&self) -> Triple;
/// Return name for this backend.
fn name(&self) -> &'static str;
/// Return the register universe for this backend.
fn reg_universe(&self) -> RealRegUniverse;
/// Machine-specific condcode info needed by TargetIsa.
fn unsigned_add_overflow_condition(&self) -> IntCC {
// TODO: this is what x86 specifies. Is this right for arm64?
IntCC::UnsignedLessThan
}
/// Machine-specific condcode info needed by TargetIsa.
fn unsigned_sub_overflow_condition(&self) -> IntCC {
// TODO: this is what x86 specifies. Is this right for arm64?
IntCC::UnsignedLessThan
}
}
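// Illustrative sketch, not part of this change: how a caller might drive a
// `MachBackend`. The `backend` and `func` values are assumed to come from
// elsewhere; only `compile_function` and `code_info` from above are exercised.
#[allow(dead_code)]
fn example_compile_with_backend(
    backend: &dyn MachBackend,
    func: &Function,
) -> CodegenResult<(MachCompileResult, CodeInfo)> {
    // Request a disassembly alongside the emitted sections.
    let result = backend.compile_function(func, /* want_disasm = */ true)?;
    // Summarize section sizes for the caller.
    let info = result.code_info();
    Ok((result, info))
}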

View File

@@ -0,0 +1,66 @@
//! Pretty-printing for machine code (virtual-registerized or final).
use regalloc::{RealRegUniverse, Reg, Writable};
use std::fmt::Debug;
use std::hash::Hash;
use std::string::{String, ToString};
// FIXME: Should this go into regalloc.rs instead?
/// A trait for printing instruction bits and pieces, with the ability to
/// take a contextualising RealRegUniverse that is used to give proper names to
/// registers.
pub trait ShowWithRRU {
/// Return a string that shows the implementing object in context of the
/// given `RealRegUniverse`, if provided.
fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String;
/// The same as |show_rru|, but with an optional hint giving a size in
/// bytes. Its interpretation is object-dependent, and it is intended to
/// pass around enough information to facilitate printing sub-parts of
/// real registers correctly. Objects may ignore size hints that are
/// irrelevant to them.
fn show_rru_sized(&self, mb_rru: Option<&RealRegUniverse>, _size: u8) -> String {
// Default implementation is to ignore the hint.
self.show_rru(mb_rru)
}
}
impl ShowWithRRU for Reg {
fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String {
if self.is_real() {
if let Some(rru) = mb_rru {
let reg_ix = self.get_index();
if reg_ix < rru.regs.len() {
return rru.regs[reg_ix].1.to_string();
} else {
// We have a real reg which isn't listed in the universe.
// Per the regalloc.rs interface requirements, this is
// Totally Not Allowed. Print it generically anyway, so
// we have something to debug.
return format!("!!{:?}!!", self);
}
}
}
// The reg is virtual, or we have no universe. Be generic.
format!("%{:?}", self)
}
fn show_rru_sized(&self, _mb_rru: Option<&RealRegUniverse>, _size: u8) -> String {
// For the specific case of Reg, we demand not to have a size hint,
// since interpretation of the size is target specific, but this code
// is used by all targets.
panic!("Reg::show_rru_sized: impossible to implement");
}
}
impl<R: ShowWithRRU + Copy + Ord + Hash + Eq + Debug> ShowWithRRU for Writable<R> {
fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String {
self.to_reg().show_rru(mb_rru)
}
fn show_rru_sized(&self, mb_rru: Option<&RealRegUniverse>, size: u8) -> String {
self.to_reg().show_rru_sized(mb_rru, size)
}
}

View File

@@ -0,0 +1,354 @@
//! In-memory representation of compiled machine code, in multiple sections
//! (text, constant pool / rodata, etc). Emission occurs into multiple sections
//! simultaneously, so we buffer the result in memory and hand off to the
//! caller at the end of compilation.
use crate::binemit::{Addend, CodeOffset, CodeSink, Reloc};
use crate::ir::{ExternalName, Opcode, SourceLoc, TrapCode};
use alloc::vec::Vec;
/// A collection of sections with defined start-offsets.
pub struct MachSections {
/// Sections, in offset order.
pub sections: Vec<MachSection>,
}
impl MachSections {
/// New, empty set of sections.
pub fn new() -> MachSections {
MachSections { sections: vec![] }
}
/// Add a section with a known offset and size. Returns the index.
pub fn add_section(&mut self, start: CodeOffset, length: CodeOffset) -> usize {
let idx = self.sections.len();
self.sections.push(MachSection::new(start, length));
idx
}
/// Mutably borrow the given section by index.
pub fn get_section<'a>(&'a mut self, idx: usize) -> &'a mut MachSection {
&mut self.sections[idx]
}
/// Get mutable borrows of two sections simultaneously. Used during
/// instruction emission to provide references to the .text and .rodata
/// (constant pool) sections.
pub fn two_sections<'a>(
&'a mut self,
idx1: usize,
idx2: usize,
) -> (&'a mut MachSection, &'a mut MachSection) {
assert!(idx1 < idx2);
assert!(idx1 < self.sections.len());
assert!(idx2 < self.sections.len());
let (first, rest) = self.sections.split_at_mut(idx2);
(&mut first[idx1], &mut rest[0])
}
/// Emit this set of sections to a set of sinks for the code,
/// relocations, traps, and stackmap.
pub fn emit<CS: CodeSink>(&self, sink: &mut CS) {
// N.B.: we emit every section into the .text section as far as
// the `CodeSink` is concerned; we do not bother to segregate
// the contents into the actual program text, the jumptable and the
// rodata (constant pool). This allows us to generate code assuming
// that these will not be relocated relative to each other, and avoids
// having to designate each section as belonging in one of the three
// fixed categories defined by `CodeSink`. If this becomes a problem
// later (e.g. because of memory permissions or similar), we can
// add this designation and segregate the output; take care, however,
// to add the appropriate relocations in this case.
for section in &self.sections {
if section.data.len() > 0 {
while sink.offset() < section.start_offset {
sink.put1(0);
}
section.emit(sink);
}
}
sink.begin_jumptables();
sink.begin_rodata();
sink.end_codegen();
}
/// Get the total required size for these sections.
pub fn total_size(&self) -> CodeOffset {
if self.sections.len() == 0 {
0
} else {
// Find the last non-empty section.
self.sections
.iter()
.rev()
.find(|s| s.data.len() > 0)
.map(|s| s.cur_offset_from_start())
.unwrap_or(0)
}
}
}
/// An abstraction over MachSection and MachSectionSize: some
/// receiver of section data.
pub trait MachSectionOutput {
/// Get the current offset from the start of all sections.
fn cur_offset_from_start(&self) -> CodeOffset;
/// Get the start offset of this section.
fn start_offset(&self) -> CodeOffset;
/// Add 1 byte to the section.
fn put1(&mut self, _: u8);
/// Add 2 bytes to the section.
fn put2(&mut self, value: u16) {
let [b0, b1] = value.to_le_bytes();
self.put1(b0);
self.put1(b1);
}
/// Add 4 bytes to the section.
fn put4(&mut self, value: u32) {
let [b0, b1, b2, b3] = value.to_le_bytes();
self.put1(b0);
self.put1(b1);
self.put1(b2);
self.put1(b3);
}
/// Add 8 bytes to the section.
fn put8(&mut self, value: u64) {
let [b0, b1, b2, b3, b4, b5, b6, b7] = value.to_le_bytes();
self.put1(b0);
self.put1(b1);
self.put1(b2);
self.put1(b3);
self.put1(b4);
self.put1(b5);
self.put1(b6);
self.put1(b7);
}
/// Add a slice of bytes to the section.
fn put_data(&mut self, data: &[u8]);
/// Add a relocation at the current offset.
fn add_reloc(&mut self, loc: SourceLoc, kind: Reloc, name: &ExternalName, addend: Addend);
/// Add a trap record at the current offset.
fn add_trap(&mut self, loc: SourceLoc, code: TrapCode);
/// Add a call return address record at the current offset.
fn add_call_site(&mut self, loc: SourceLoc, opcode: Opcode);
/// Align up to the given alignment.
fn align_to(&mut self, align_to: CodeOffset) {
assert!(align_to.is_power_of_two());
while self.cur_offset_from_start() & (align_to - 1) != 0 {
self.put1(0);
}
}
}
/// A section of output to be emitted to a CodeSink / RelocSink in bulk.
/// Multiple sections may be created with known start offsets in advance; the
/// usual use-case is to create the .text (code) and .rodata (constant pool) at
/// once, after computing the length of the code, so that constant references
/// can use known offsets as instructions are emitted.
pub struct MachSection {
/// The starting offset of this section.
pub start_offset: CodeOffset,
/// The limit of this section, defined by the start of the next section.
pub length_limit: CodeOffset,
/// The section contents, as raw bytes.
pub data: Vec<u8>,
/// Any relocations referring to this section.
pub relocs: Vec<MachReloc>,
/// Any trap records referring to this section.
pub traps: Vec<MachTrap>,
/// Any call site record referring to this section.
pub call_sites: Vec<MachCallSite>,
}
impl MachSection {
/// Create a new section, known to start at `start_offset` and with a size limited to `length_limit`.
pub fn new(start_offset: CodeOffset, length_limit: CodeOffset) -> MachSection {
MachSection {
start_offset,
length_limit,
data: vec![],
relocs: vec![],
traps: vec![],
call_sites: vec![],
}
}
/// Emit this section to the CodeSink and other associated sinks. The
/// current offset of the CodeSink must match the starting offset of this
/// section.
pub fn emit<CS: CodeSink>(&self, sink: &mut CS) {
assert!(sink.offset() == self.start_offset);
let mut next_reloc = 0;
let mut next_trap = 0;
let mut next_call_site = 0;
for (idx, byte) in self.data.iter().enumerate() {
if next_reloc < self.relocs.len() {
let reloc = &self.relocs[next_reloc];
if reloc.offset == idx as CodeOffset {
sink.reloc_external(reloc.srcloc, reloc.kind, &reloc.name, reloc.addend);
next_reloc += 1;
}
}
if next_trap < self.traps.len() {
let trap = &self.traps[next_trap];
if trap.offset == idx as CodeOffset {
sink.trap(trap.code, trap.srcloc);
next_trap += 1;
}
}
if next_call_site < self.call_sites.len() {
let call_site = &self.call_sites[next_call_site];
if call_site.ret_addr == idx as CodeOffset {
sink.add_call_site(call_site.opcode, call_site.srcloc);
next_call_site += 1;
}
}
sink.put1(*byte);
}
}
}
impl MachSectionOutput for MachSection {
fn cur_offset_from_start(&self) -> CodeOffset {
self.start_offset + self.data.len() as CodeOffset
}
fn start_offset(&self) -> CodeOffset {
self.start_offset
}
fn put1(&mut self, value: u8) {
assert!(((self.data.len() + 1) as CodeOffset) <= self.length_limit);
self.data.push(value);
}
fn put_data(&mut self, data: &[u8]) {
assert!(((self.data.len() + data.len()) as CodeOffset) <= self.length_limit);
self.data.extend_from_slice(data);
}
fn add_reloc(&mut self, srcloc: SourceLoc, kind: Reloc, name: &ExternalName, addend: Addend) {
let name = name.clone();
self.relocs.push(MachReloc {
offset: self.data.len() as CodeOffset,
srcloc,
kind,
name,
addend,
});
}
fn add_trap(&mut self, srcloc: SourceLoc, code: TrapCode) {
self.traps.push(MachTrap {
offset: self.data.len() as CodeOffset,
srcloc,
code,
});
}
fn add_call_site(&mut self, srcloc: SourceLoc, opcode: Opcode) {
self.call_sites.push(MachCallSite {
ret_addr: self.data.len() as CodeOffset,
srcloc,
opcode,
});
}
}
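// Illustrative sketch, not part of this change: bytes are emitted
// little-endian and `align_to` pads with zeros. The start offset (0) and
// length limit (16) are arbitrary values chosen for the example.
#[allow(dead_code)]
fn example_mach_section_emission() {
    let mut sec = MachSection::new(0, 16);
    sec.put1(0xFF);
    // Pad with zero bytes up to the next 4-byte boundary.
    sec.align_to(4);
    // A u32 is stored as its little-endian bytes: DD CC BB AA.
    sec.put4(0xAABB_CCDD);
    assert_eq!(sec.cur_offset_from_start(), 8);
    assert_eq!(sec.data, vec![0xFF, 0, 0, 0, 0xDD, 0xCC, 0xBB, 0xAA]);
}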
/// A MachSectionOutput implementation that records only size.
pub struct MachSectionSize {
/// The starting offset of this section.
pub start_offset: CodeOffset,
/// The current offset of this section.
pub offset: CodeOffset,
}
impl MachSectionSize {
/// Create a new size-counting dummy section.
pub fn new(start_offset: CodeOffset) -> MachSectionSize {
MachSectionSize {
start_offset,
offset: start_offset,
}
}
/// Return the size this section would take if emitted with a real sink.
pub fn size(&self) -> CodeOffset {
self.offset - self.start_offset
}
}
impl MachSectionOutput for MachSectionSize {
fn cur_offset_from_start(&self) -> CodeOffset {
// Return the running offset; `size()` subtracts `start_offset`, so the
// absolute starting point doesn't affect the computed code size.
self.offset
}
fn start_offset(&self) -> CodeOffset {
self.start_offset
}
fn put1(&mut self, _: u8) {
self.offset += 1;
}
fn put_data(&mut self, data: &[u8]) {
self.offset += data.len() as CodeOffset;
}
fn add_reloc(&mut self, _: SourceLoc, _: Reloc, _: &ExternalName, _: Addend) {}
fn add_trap(&mut self, _: SourceLoc, _: TrapCode) {}
fn add_call_site(&mut self, _: SourceLoc, _: Opcode) {}
}
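// Illustrative sketch, not part of this change: `MachSectionSize` counts bytes
// without storing them, so it can serve as a first, size-only emission pass
// (as `VCode::finalize_branches` does) before real emission.
#[allow(dead_code)]
fn example_size_only_pass() {
    let mut size = MachSectionSize::new(0);
    size.put4(0xDEAD_BEEF);
    size.put_data(&[1, 2, 3]);
    assert_eq!(size.size(), 7);
}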
/// A relocation resulting from a compilation.
pub struct MachReloc {
/// The offset at which the relocation applies, *relative to the
/// containing section*.
pub offset: CodeOffset,
/// The original source location.
pub srcloc: SourceLoc,
/// The kind of relocation.
pub kind: Reloc,
/// The external symbol / name to which this relocation refers.
pub name: ExternalName,
/// The addend to add to the symbol value.
pub addend: i64,
}
/// A trap record resulting from a compilation.
pub struct MachTrap {
/// The offset at which the trap instruction occurs, *relative to the
/// containing section*.
pub offset: CodeOffset,
/// The original source location.
pub srcloc: SourceLoc,
/// The trap code.
pub code: TrapCode,
}
/// A call site record resulting from a compilation.
pub struct MachCallSite {
/// The offset of the call's return address, *relative to the containing section*.
pub ret_addr: CodeOffset,
/// The original source location.
pub srcloc: SourceLoc,
/// The call's opcode.
pub opcode: Opcode,
}

View File

@@ -0,0 +1,730 @@
//! This implements the VCode container: a CFG of Insts that have been lowered.
//!
//! VCode is virtual-register code. An instruction in VCode is almost a machine
//! instruction; however, its register slots can refer to virtual registers in
//! addition to real machine registers.
//!
//! VCode is structured with traditional basic blocks, and
//! each block must be terminated by an unconditional branch (one target), a
//! conditional branch (two targets), or a return (no targets). Note that this
//! slightly differs from the machine code of most ISAs: in most ISAs, a
//! conditional branch has one target (and the not-taken case falls through).
//! However, we expect that machine backends will elide branches to the following
//! block (i.e., zero-offset jumps), and will be able to codegen a branch-cond /
//! branch-uncond pair if *both* targets are not fallthrough. This allows us to
//! play with layout prior to final binary emission, as well, if we want.
//!
//! See the main module comment in `mod.rs` for more details on the VCode-based
//! backend pipeline.
use crate::ir;
use crate::machinst::*;
use crate::settings;
use regalloc::Function as RegallocFunction;
use regalloc::Set as RegallocSet;
use regalloc::{BlockIx, InstIx, Range, RegAllocResult, RegClass, RegUsageCollector};
use alloc::boxed::Box;
use alloc::vec::Vec;
use log::debug;
use smallvec::SmallVec;
use std::fmt;
use std::iter;
use std::string::String;
/// Index referring to an instruction in VCode.
pub type InsnIndex = u32;
/// Index referring to a basic block in VCode.
pub type BlockIndex = u32;
/// VCodeInst wraps all requirements for a MachInst to be in VCode: it must be
/// a `MachInst` and it must be able to emit itself to both a `MachSection` and a `MachSectionSize`.
pub trait VCodeInst: MachInst + MachInstEmit<MachSection> + MachInstEmit<MachSectionSize> {}
impl<I: MachInst + MachInstEmit<MachSection> + MachInstEmit<MachSectionSize>> VCodeInst for I {}
/// A function in "VCode" (virtualized-register code) form, after lowering.
/// This is essentially a standard CFG of basic blocks, where each basic block
/// consists of lowered instructions produced by the machine-specific backend.
pub struct VCode<I: VCodeInst> {
/// Function liveins.
liveins: RegallocSet<RealReg>,
/// Function liveouts.
liveouts: RegallocSet<RealReg>,
/// VReg IR-level types.
vreg_types: Vec<Type>,
/// Lowered machine instructions in order corresponding to the original IR.
insts: Vec<I>,
/// Entry block.
entry: BlockIndex,
/// Block instruction indices.
block_ranges: Vec<(InsnIndex, InsnIndex)>,
/// Block successors: index range in the successor-list below.
block_succ_range: Vec<(usize, usize)>,
/// Block successor lists, concatenated into one Vec. The `block_succ_range`
/// list of tuples above gives (start, end) ranges within this list that
/// correspond to each basic block's successors.
block_succs: Vec<BlockIndex>,
/// Block indices by IR block.
block_by_bb: SecondaryMap<ir::Block, BlockIndex>,
/// IR block for each VCode Block. The length of this Vec will likely be
/// less than the total number of Blocks, because new Blocks (for edge
/// splits, for example) are appended during lowering.
bb_by_block: Vec<ir::Block>,
/// Order of block IDs in final generated code.
final_block_order: Vec<BlockIndex>,
/// Final block offsets. Computed during branch finalization and used
/// during emission.
final_block_offsets: Vec<CodeOffset>,
/// Size of code, accounting for block layout / alignment.
code_size: CodeOffset,
/// ABI object.
abi: Box<dyn ABIBody<I = I>>,
}
/// A builder for a VCode function body. This builder is designed for the
/// lowering approach that we take: we traverse basic blocks in forward
/// (original IR) order, but within each basic block, we generate code from
/// bottom to top; and within each IR instruction that we visit in this reverse
/// order, we emit machine instructions in *forward* order again.
///
/// Hence, to produce the final instructions in proper order, we perform two
/// reversals. First, the machine instructions (`I` instances) are produced in
/// forward order for an individual IR instruction. Then these are *reversed*
/// and concatenated to `bb_insns` at the end of the IR instruction lowering.
/// The `bb_insns` vec will thus contain all machine instructions for a basic
/// block, in reverse order. Finally, when we're done with a basic block, we
/// reverse the whole block's vec of instructions again, and concatenate onto
/// the VCode's insts.
pub struct VCodeBuilder<I: VCodeInst> {
/// In-progress VCode.
vcode: VCode<I>,
/// Current basic block instructions, in reverse order (because blocks are
/// built bottom-to-top).
bb_insns: SmallVec<[I; 32]>,
/// Current IR-inst instructions, in forward order.
ir_inst_insns: SmallVec<[I; 4]>,
/// Start of succs for the current block in the concatenated succs list.
succ_start: usize,
}
impl<I: VCodeInst> VCodeBuilder<I> {
/// Create a new VCodeBuilder.
pub fn new(abi: Box<dyn ABIBody<I = I>>) -> VCodeBuilder<I> {
let vcode = VCode::new(abi);
VCodeBuilder {
vcode,
bb_insns: SmallVec::new(),
ir_inst_insns: SmallVec::new(),
succ_start: 0,
}
}
/// Access the ABI object.
pub fn abi(&mut self) -> &mut dyn ABIBody<I = I> {
&mut *self.vcode.abi
}
/// Set the type of a VReg.
pub fn set_vreg_type(&mut self, vreg: VirtualReg, ty: Type) {
while self.vcode.vreg_types.len() <= vreg.get_index() {
self.vcode.vreg_types.push(ir::types::I8); // Default type.
}
self.vcode.vreg_types[vreg.get_index()] = ty;
}
/// Return the underlying bb-to-BlockIndex map.
pub fn blocks_by_bb(&self) -> &SecondaryMap<ir::Block, BlockIndex> {
&self.vcode.block_by_bb
}
/// Initialize the bb-to-BlockIndex map. Returns the first free
/// BlockIndex.
pub fn init_bb_map(&mut self, blocks: &[ir::Block]) -> BlockIndex {
let mut bindex: BlockIndex = 0;
for bb in blocks.iter() {
self.vcode.block_by_bb[*bb] = bindex;
self.vcode.bb_by_block.push(*bb);
bindex += 1;
}
bindex
}
/// Get the BlockIndex for an IR block.
pub fn bb_to_bindex(&self, bb: ir::Block) -> BlockIndex {
self.vcode.block_by_bb[bb]
}
/// Set the current block as the entry block.
pub fn set_entry(&mut self, block: BlockIndex) {
self.vcode.entry = block;
}
/// End the current IR instruction. Must be called after pushing any
/// instructions and prior to ending the basic block.
pub fn end_ir_inst(&mut self) {
while let Some(i) = self.ir_inst_insns.pop() {
self.bb_insns.push(i);
}
}
/// End the current basic block. Must be called after emitting vcode insts
/// for IR insts and prior to ending the function (building the VCode).
pub fn end_bb(&mut self) -> BlockIndex {
assert!(self.ir_inst_insns.is_empty());
let block_num = self.vcode.block_ranges.len() as BlockIndex;
// Push the instructions.
let start_idx = self.vcode.insts.len() as InsnIndex;
while let Some(i) = self.bb_insns.pop() {
self.vcode.insts.push(i);
}
let end_idx = self.vcode.insts.len() as InsnIndex;
// Add the instruction index range to the list of blocks.
self.vcode.block_ranges.push((start_idx, end_idx));
// End the successors list.
let succ_end = self.vcode.block_succs.len();
self.vcode
.block_succ_range
.push((self.succ_start, succ_end));
self.succ_start = succ_end;
block_num
}
/// Push an instruction for the current BB and current IR inst within the BB.
pub fn push(&mut self, insn: I) {
match insn.is_term() {
MachTerminator::None | MachTerminator::Ret => {}
MachTerminator::Uncond(target) => {
self.vcode.block_succs.push(target);
}
MachTerminator::Cond(true_branch, false_branch) => {
self.vcode.block_succs.push(true_branch);
self.vcode.block_succs.push(false_branch);
}
MachTerminator::Indirect(targets) => {
for target in targets {
self.vcode.block_succs.push(*target);
}
}
}
self.ir_inst_insns.push(insn);
}
/// Build the final VCode.
pub fn build(self) -> VCode<I> {
assert!(self.ir_inst_insns.is_empty());
assert!(self.bb_insns.is_empty());
self.vcode
}
}
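// Illustrative sketch, not part of this change: the double-reverse ordering
// used by `VCodeBuilder`. Machine insts are generated in forward order per IR
// inst while IR insts are visited bottom-to-top; reversing once at
// `end_ir_inst` and again at `end_bb` restores overall forward order. The
// string labels stand in for machine instructions.
#[allow(dead_code)]
fn example_double_reverse_ordering() {
    // IR insts visited in reverse block order; each lowers to forward machine insts.
    let per_ir_inst = vec![vec!["c1", "c2"], vec!["b1"], vec!["a1", "a2"]];
    let mut bb_insns: Vec<&str> = vec![];
    for insns in per_ir_inst {
        // end_ir_inst(): pop the per-inst list onto the block list (reversal 1).
        for i in insns.into_iter().rev() {
            bb_insns.push(i);
        }
    }
    // end_bb(): pop the block list into final VCode order (reversal 2).
    let final_order: Vec<&str> = bb_insns.into_iter().rev().collect();
    assert_eq!(final_order, vec!["a1", "a2", "b1", "c1", "c2"]);
}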
fn block_ranges(indices: &[InstIx], len: usize) -> Vec<(usize, usize)> {
let v = indices
.iter()
.map(|iix| iix.get() as usize)
.chain(iter::once(len))
.collect::<Vec<usize>>();
v.windows(2).map(|p| (p[0], p[1])).collect()
}
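// Illustrative only, not part of this change: `block_ranges` turns per-block
// starting indices (plus the total instruction count) into half-open
// (start, end) ranges, one per block.
#[allow(dead_code)]
fn example_block_ranges() {
    let starts = [InstIx::new(0), InstIx::new(3), InstIx::new(5)];
    assert_eq!(block_ranges(&starts, 7), vec![(0, 3), (3, 5), (5, 7)]);
}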
fn is_redundant_move<I: VCodeInst>(insn: &I) -> bool {
if let Some((to, from)) = insn.is_move() {
to.to_reg() == from
} else {
false
}
}
fn is_trivial_jump_block<I: VCodeInst>(vcode: &VCode<I>, block: BlockIndex) -> Option<BlockIndex> {
let range = vcode.block_insns(BlockIx::new(block));
debug!(
"is_trivial_jump_block: block {} has len {}",
block,
range.len()
);
if range.len() != 1 {
return None;
}
let insn = range.first();
debug!(
" -> only insn is: {:?} with terminator {:?}",
vcode.get_insn(insn),
vcode.get_insn(insn).is_term()
);
match vcode.get_insn(insn).is_term() {
MachTerminator::Uncond(target) => Some(target),
_ => None,
}
}
impl<I: VCodeInst> VCode<I> {
/// New empty VCode.
fn new(abi: Box<dyn ABIBody<I = I>>) -> VCode<I> {
VCode {
liveins: abi.liveins(),
liveouts: abi.liveouts(),
vreg_types: vec![],
insts: vec![],
entry: 0,
block_ranges: vec![],
block_succ_range: vec![],
block_succs: vec![],
block_by_bb: SecondaryMap::with_default(0),
bb_by_block: vec![],
final_block_order: vec![],
final_block_offsets: vec![],
code_size: 0,
abi,
}
}
/// Get the IR-level type of a VReg.
pub fn vreg_type(&self, vreg: VirtualReg) -> Type {
self.vreg_types[vreg.get_index()]
}
/// Get the entry block.
pub fn entry(&self) -> BlockIndex {
self.entry
}
/// Get the number of blocks. Block indices will be in the range `0 ..
/// self.num_blocks()`.
pub fn num_blocks(&self) -> usize {
self.block_ranges.len()
}
/// Stack frame size for the full function's body.
pub fn frame_size(&self) -> u32 {
self.abi.frame_size()
}
/// Get the successors for a block.
pub fn succs(&self, block: BlockIndex) -> &[BlockIndex] {
let (start, end) = self.block_succ_range[block as usize];
&self.block_succs[start..end]
}
/// Take the results of register allocation, with a sequence of
/// instructions including spliced fill/reload/move instructions, and replace
/// the VCode with them.
pub fn replace_insns_from_regalloc(
&mut self,
result: RegAllocResult<Self>,
flags: &settings::Flags,
) {
self.final_block_order = compute_final_block_order(self);
// Record the spillslot count and clobbered registers for the ABI/stack
// setup code.
self.abi.set_num_spillslots(result.num_spill_slots as usize);
self.abi
.set_clobbered(result.clobbered_registers.map(|r| Writable::from_reg(*r)));
// We want to move instructions over in final block order, using the new
// block-start map given by the regalloc.
let block_ranges: Vec<(usize, usize)> =
block_ranges(result.target_map.elems(), result.insns.len());
let mut final_insns = vec![];
let mut final_block_ranges = vec![(0, 0); self.num_blocks()];
for block in &self.final_block_order {
let (start, end) = block_ranges[*block as usize];
let final_start = final_insns.len() as InsnIndex;
if *block == self.entry {
// Start with the prologue.
final_insns.extend(self.abi.gen_prologue(flags).into_iter());
}
for i in start..end {
let insn = &result.insns[i];
// Elide redundant moves at this point (we only know what is
// redundant once registers are allocated).
if is_redundant_move(insn) {
continue;
}
// Whenever encountering a return instruction, replace it
// with the epilogue.
let is_ret = insn.is_term() == MachTerminator::Ret;
if is_ret {
final_insns.extend(self.abi.gen_epilogue(flags).into_iter());
} else {
final_insns.push(insn.clone());
}
}
let final_end = final_insns.len() as InsnIndex;
final_block_ranges[*block as usize] = (final_start, final_end);
}
self.insts = final_insns;
self.block_ranges = final_block_ranges;
}
/// Removes redundant branches, rewriting targets to point directly to the
/// ultimate block at the end of a chain of trivial one-target jumps.
pub fn remove_redundant_branches(&mut self) {
// For each block, compute the actual target block, looking through up to one
// block with single-target jumps (this will remove empty edge blocks inserted
// by phi-lowering).
let block_rewrites: Vec<BlockIndex> = (0..self.num_blocks() as u32)
.map(|bix| is_trivial_jump_block(self, bix).unwrap_or(bix))
.collect();
let mut refcounts: Vec<usize> = vec![0; self.num_blocks()];
debug!(
"remove_redundant_branches: block_rewrites = {:?}",
block_rewrites
);
refcounts[self.entry as usize] = 1;
for block in 0..self.num_blocks() as u32 {
for insn in self.block_insns(BlockIx::new(block)) {
self.get_insn_mut(insn)
.with_block_rewrites(&block_rewrites[..]);
match self.get_insn(insn).is_term() {
MachTerminator::Uncond(bix) => {
refcounts[bix as usize] += 1;
}
MachTerminator::Cond(bix1, bix2) => {
refcounts[bix1 as usize] += 1;
refcounts[bix2 as usize] += 1;
}
MachTerminator::Indirect(blocks) => {
for block in blocks {
refcounts[*block as usize] += 1;
}
}
_ => {}
}
}
}
let deleted: Vec<bool> = refcounts.iter().map(|r| *r == 0).collect();
let block_order = std::mem::replace(&mut self.final_block_order, vec![]);
self.final_block_order = block_order
.into_iter()
.filter(|b| !deleted[*b as usize])
.collect();
// Rewrite successor information based on the block-rewrite map.
for succ in &mut self.block_succs {
let new_succ = block_rewrites[*succ as usize];
*succ = new_succ;
}
}
/// Mutate branch instructions to (i) lower two-way condbrs to one-way,
/// depending on fallthrough; and (ii) use concrete offsets.
pub fn finalize_branches(&mut self)
where
I: MachInstEmit<MachSectionSize>,
{
// Compute fallthrough block, indexed by block.
let num_final_blocks = self.final_block_order.len();
let mut block_fallthrough: Vec<Option<BlockIndex>> = vec![None; self.num_blocks()];
for i in 0..(num_final_blocks - 1) {
let from = self.final_block_order[i];
let to = self.final_block_order[i + 1];
block_fallthrough[from as usize] = Some(to);
}
// Pass over VCode instructions and finalize two-way branches into
// one-way branches with fallthrough.
for block in 0..self.num_blocks() {
let next_block = block_fallthrough[block];
let (start, end) = self.block_ranges[block];
for iix in start..end {
let insn = &mut self.insts[iix as usize];
insn.with_fallthrough_block(next_block);
}
}
// Compute block offsets.
let mut code_section = MachSectionSize::new(0);
let mut block_offsets = vec![0; self.num_blocks()];
for &block in &self.final_block_order {
code_section.offset = I::align_basic_block(code_section.offset);
block_offsets[block as usize] = code_section.offset;
let (start, end) = self.block_ranges[block as usize];
for iix in start..end {
self.insts[iix as usize].emit(&mut code_section);
}
}
// We now have the section layout.
self.final_block_offsets = block_offsets;
self.code_size = code_section.size();
// Update branches with known block offsets. This looks like the
// traversal above, but (i) does not update block_offsets, rather uses
// it (so forward references are now possible), and (ii) mutates the
// instructions.
let mut code_section = MachSectionSize::new(0);
for &block in &self.final_block_order {
code_section.offset = I::align_basic_block(code_section.offset);
let (start, end) = self.block_ranges[block as usize];
for iix in start..end {
self.insts[iix as usize]
.with_block_offsets(code_section.offset, &self.final_block_offsets[..]);
self.insts[iix as usize].emit(&mut code_section);
}
}
}
/// Emit the instructions to a list of sections.
pub fn emit(&self) -> MachSections
where
I: MachInstEmit<MachSection>,
{
let mut sections = MachSections::new();
let code_idx = sections.add_section(0, self.code_size);
let code_section = sections.get_section(code_idx);
for &block in &self.final_block_order {
let new_offset = I::align_basic_block(code_section.cur_offset_from_start());
while new_offset > code_section.cur_offset_from_start() {
// Pad with NOPs up to the aligned block offset.
let nop = I::gen_nop((new_offset - code_section.cur_offset_from_start()) as usize);
nop.emit(code_section);
}
assert_eq!(code_section.cur_offset_from_start(), new_offset);
let (start, end) = self.block_ranges[block as usize];
for iix in start..end {
self.insts[iix as usize].emit(code_section);
}
}
sections
}
/// Get the IR block for a BlockIndex, if one exists.
pub fn bindex_to_bb(&self, block: BlockIndex) -> Option<ir::Block> {
if (block as usize) < self.bb_by_block.len() {
Some(self.bb_by_block[block as usize])
} else {
None
}
}
}
impl<I: VCodeInst> RegallocFunction for VCode<I> {
type Inst = I;
fn insns(&self) -> &[I] {
&self.insts[..]
}
fn insns_mut(&mut self) -> &mut [I] {
&mut self.insts[..]
}
fn get_insn(&self, insn: InstIx) -> &I {
&self.insts[insn.get() as usize]
}
fn get_insn_mut(&mut self, insn: InstIx) -> &mut I {
&mut self.insts[insn.get() as usize]
}
fn blocks(&self) -> Range<BlockIx> {
Range::new(BlockIx::new(0), self.block_ranges.len())
}
fn entry_block(&self) -> BlockIx {
BlockIx::new(self.entry)
}
fn block_insns(&self, block: BlockIx) -> Range<InstIx> {
let (start, end) = self.block_ranges[block.get() as usize];
Range::new(InstIx::new(start), (end - start) as usize)
}
fn block_succs(&self, block: BlockIx) -> Vec<BlockIx> {
let (start, end) = self.block_succ_range[block.get() as usize];
self.block_succs[start..end]
.iter()
.cloned()
.map(BlockIx::new)
.collect()
}
fn is_ret(&self, insn: InstIx) -> bool {
match self.insts[insn.get() as usize].is_term() {
MachTerminator::Ret => true,
_ => false,
}
}
fn get_regs(insn: &I, collector: &mut RegUsageCollector) {
insn.get_regs(collector)
}
fn map_regs(
insn: &mut I,
pre_map: &RegallocMap<VirtualReg, RealReg>,
post_map: &RegallocMap<VirtualReg, RealReg>,
) {
insn.map_regs(pre_map, post_map);
}
fn is_move(&self, insn: &I) -> Option<(Writable<Reg>, Reg)> {
insn.is_move()
}
fn get_spillslot_size(&self, regclass: RegClass, vreg: VirtualReg) -> u32 {
let ty = self.vreg_type(vreg);
self.abi.get_spillslot_size(regclass, ty)
}
fn gen_spill(&self, to_slot: SpillSlot, from_reg: RealReg, vreg: VirtualReg) -> I {
let ty = self.vreg_type(vreg);
self.abi.gen_spill(to_slot, from_reg, ty)
}
fn gen_reload(&self, to_reg: Writable<RealReg>, from_slot: SpillSlot, vreg: VirtualReg) -> I {
let ty = self.vreg_type(vreg);
self.abi.gen_reload(to_reg, from_slot, ty)
}
fn gen_move(&self, to_reg: Writable<RealReg>, from_reg: RealReg, vreg: VirtualReg) -> I {
let ty = self.vreg_type(vreg);
I::gen_move(to_reg.map(|r| r.to_reg()), from_reg.to_reg(), ty)
}
fn gen_zero_len_nop(&self) -> I {
I::gen_zero_len_nop()
}
fn maybe_direct_reload(&self, insn: &I, reg: VirtualReg, slot: SpillSlot) -> Option<I> {
insn.maybe_direct_reload(reg, slot)
}
fn func_liveins(&self) -> RegallocSet<RealReg> {
self.liveins.clone()
}
fn func_liveouts(&self) -> RegallocSet<RealReg> {
self.liveouts.clone()
}
}
impl<I: VCodeInst> fmt::Debug for VCode<I> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
writeln!(f, "VCode_Debug {{")?;
writeln!(f, " Entry block: {}", self.entry)?;
writeln!(f, " Final block order: {:?}", self.final_block_order)?;
for block in 0..self.num_blocks() {
writeln!(f, "Block {}:", block,)?;
for succ in self.succs(block as BlockIndex) {
writeln!(f, " (successor: Block {})", succ)?;
}
let (start, end) = self.block_ranges[block];
writeln!(f, " (instruction range: {} .. {})", start, end)?;
for inst in start..end {
writeln!(f, " Inst {}: {:?}", inst, self.insts[inst as usize])?;
}
}
writeln!(f, "}}")?;
Ok(())
}
}
/// Pretty-printing with `RealRegUniverse` context.
impl<I: VCodeInst + ShowWithRRU> ShowWithRRU for VCode<I> {
fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String {
use std::fmt::Write;
// Calculate an order in which to display the blocks. This is the same
// as final_block_order, but also includes blocks which are in the
// representation but not in final_block_order.
let mut display_order = Vec::<usize>::new();
// First display blocks in `final_block_order`
for bix in &self.final_block_order {
assert!((*bix as usize) < self.num_blocks());
display_order.push(*bix as usize);
}
// Now also take care of those not listed in `final_block_order`.
// This is quadratic, but it's also debug-only code.
for bix in 0..self.num_blocks() {
if display_order.contains(&bix) {
continue;
}
display_order.push(bix);
}
let mut s = String::new();
write!(&mut s, "VCode_ShowWithRRU {{{{\n").unwrap();
write!(&mut s, " Entry block: {}\n", self.entry).unwrap();
write!(
&mut s,
" Final block order: {:?}\n",
self.final_block_order
)
.unwrap();
for i in 0..self.num_blocks() {
let block = display_order[i];
let omitted = if !self.final_block_order.is_empty() && i >= self.final_block_order.len()
{
"** OMITTED **"
} else {
""
};
write!(&mut s, "Block {}: {}\n", block, omitted).unwrap();
if let Some(bb) = self.bindex_to_bb(block as BlockIndex) {
write!(&mut s, " (original IR block: {})\n", bb).unwrap();
}
for succ in self.succs(block as BlockIndex) {
write!(&mut s, " (successor: Block {})\n", succ).unwrap();
}
let (start, end) = self.block_ranges[block];
write!(&mut s, " (instruction range: {} .. {})\n", start, end).unwrap();
for inst in start..end {
write!(
&mut s,
" Inst {}: {}\n",
inst,
self.insts[inst as usize].show_rru(mb_rru)
)
.unwrap();
}
}
write!(&mut s, "}}}}\n").unwrap();
s
}
}

View File

@@ -0,0 +1,52 @@
//! A pass that computes the number of uses of any given instruction.
use crate::entity::SecondaryMap;
use crate::ir::dfg::ValueDef;
use crate::ir::Value;
use crate::ir::{DataFlowGraph, Function, Inst};
/// Auxiliary data structure that counts the number of uses of any given
/// instruction in a Function. This is used during instruction selection
/// to essentially do incremental DCE: when an instruction is no longer
/// needed because its computation has been isel'd into another machine
/// instruction at every use site, we can skip it.
#[derive(Clone, Debug)]
pub struct NumUses {
uses: SecondaryMap<Inst, u32>,
}
impl NumUses {
fn new() -> NumUses {
NumUses {
uses: SecondaryMap::with_default(0),
}
}
/// Compute the NumUses analysis result for a function.
pub fn compute(func: &Function) -> NumUses {
let mut uses = NumUses::new();
for bb in func.layout.blocks() {
for inst in func.layout.block_insts(bb) {
for arg in func.dfg.inst_args(inst) {
let v = func.dfg.resolve_aliases(*arg);
uses.add_value(&func.dfg, v);
}
}
}
uses
}
fn add_value(&mut self, dfg: &DataFlowGraph, v: Value) {
match dfg.value_def(v) {
ValueDef::Result(inst, _) => {
self.uses[inst] += 1;
}
_ => {}
}
}
/// Take the complete uses map, consuming this analysis result.
pub fn take_uses(self) -> SecondaryMap<Inst, u32> {
self.uses
}
}

View File

@@ -360,10 +360,11 @@ fn optimize_complex_addresses(pos: &mut EncCursor, inst: Inst, isa: &dyn TargetI
pub fn do_postopt(func: &mut Function, isa: &dyn TargetIsa) {
let _tt = timing::postopt();
let mut pos = EncCursor::new(func, isa);
let is_mach_backend = isa.get_mach_backend().is_some();
while let Some(_block) = pos.next_block() {
let mut last_flags_clobber = None;
while let Some(inst) = pos.next_inst() {
if isa.uses_cpu_flags() {
if !is_mach_backend && isa.uses_cpu_flags() {
// Optimize instructions to make use of flags.
optimize_cpu_flags(&mut pos, inst, last_flags_clobber, isa);

View File

@@ -28,10 +28,15 @@ pub fn verify_flags(
errors: &mut VerifierErrors,
) -> VerifierStepResult<()> {
let _tt = timing::verify_flags();
let encinfo = if isa.is_none() || isa.unwrap().get_mach_backend().is_some() {
None
} else {
Some(isa.unwrap().encoding_info())
};
let mut verifier = FlagsVerifier {
func,
cfg,
encinfo: isa.map(|isa| isa.encoding_info()),
encinfo,
livein: SecondaryMap::new(),
};
verifier.check(errors)

View File

@@ -22,3 +22,4 @@ memmap = "0.7.0"
num_cpus = "1.8.0"
region = "2.1.2"
byteorder = { version = "1.3.2", default-features = false }
target-lexicon = "0.10"

View File

@@ -0,0 +1,243 @@
test vcode
target aarch64
function %f(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = iadd.i64 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: add x0, x0, x1
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = isub.i64 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: sub x0, x0, x1
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = imul.i64 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: madd x0, x0, x1, xzr
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = umulhi.i64 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: umulh x0, x0, x1
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = smulhi.i64 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: smulh x0, x0, x1
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = sdiv.i64 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: sdiv x0, x0, x1
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f(i64) -> i64 {
block0(v0: i64):
v1 = iconst.i64 2
v2 = sdiv.i64 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: movz x1, #2
; nextln: sdiv x0, x0, x1
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = udiv.i64 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: udiv x0, x0, x1
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f(i64) -> i64 {
block0(v0: i64):
v1 = iconst.i64 2
v2 = udiv.i64 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: movz x1, #2
; nextln: udiv x0, x0, x1
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = srem.i64 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: sdiv x2, x0, x1
; nextln: msub x0, x2, x1, x0
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = urem.i64 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: udiv x2, x0, x1
; nextln: msub x0, x2, x1, x0
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = band.i64 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: and x0, x0, x1
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = bor.i64 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: orr x0, x0, x1
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = bxor.i64 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: eor x0, x0, x1
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = band_not.i64 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: bic x0, x0, x1
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = bor_not.i64 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: orn x0, x0, x1
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = bxor_not.i64 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: eon x0, x0, x1
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = bnot.i64 v0
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: orn x0, xzr, x0
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret

View File

@@ -0,0 +1,14 @@
test vcode
target aarch64
function %f(i32, i32) -> i32 {
block0(v0: i32, v1: i32):
; check: stp fp, lr, [sp, #-16]!
; check: mov fp, sp
v2 = iadd v0, v1
; check: add w0, w0, w1
return v2
; check: mov sp, fp
; check: ldp fp, lr, [sp], #16
; check: ret
}

View File

@@ -0,0 +1,158 @@
test vcode
target aarch64
function %a(i32) -> i32 {
block0(v0: i32):
v1 = bitrev v0
return v1
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: rbit w0, w0
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %a(i64) -> i64 {
block0(v0: i64):
v1 = bitrev v0
return v1
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: rbit x0, x0
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %b(i32) -> i32 {
block0(v0: i32):
v1 = clz v0
return v1
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: clz w0, w0
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %b(i64) -> i64 {
block0(v0: i64):
v1 = clz v0
return v1
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: clz x0, x0
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %c(i32) -> i32 {
block0(v0: i32):
v1 = cls v0
return v1
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: cls w0, w0
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %c(i64) -> i64 {
block0(v0: i64):
v1 = cls v0
return v1
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: cls x0, x0
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %d(i32) -> i32 {
block0(v0: i32):
v1 = ctz v0
return v1
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: rbit w0, w0
; nextln: clz w0, w0
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %d(i64) -> i64 {
block0(v0: i64):
v1 = ctz v0
return v1
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: rbit x0, x0
; nextln: clz x0, x0
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %d(i64) -> i64 {
block0(v0: i64):
v1 = popcnt v0
return v1
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: lsr x1, x0, #1
; nextln: and x1, x1, #6148914691236517205
; nextln: sub x1, x0, x1
; nextln: and x0, x1, #3689348814741910323
; nextln: lsr x1, x1, #2
; nextln: and x1, x1, #3689348814741910323
; nextln: add x0, x1, x0
; nextln: add x0, x0, x0, LSR 4
; nextln: and x0, x0, #1085102592571150095
; nextln: add x0, x0, x0, LSL 8
; nextln: add x0, x0, x0, LSL 16
; nextln: add x0, x0, x0, LSL 32
; nextln: lsr x0, x0, #56
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %d(i32) -> i32 {
block0(v0: i32):
v1 = popcnt v0
return v1
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: lsr w1, w0, #1
; nextln: and x1, x1, #6148914691236517205
; nextln: sub x1, x0, x1
; nextln: and x0, x1, #3689348814741910323
; nextln: lsr x1, x1, #2
; nextln: and x1, x1, #3689348814741910323
; nextln: add x0, x1, x0
; nextln: add x0, x0, x0, LSR 4
; nextln: and x0, x0, #1085102592571150095
; nextln: add x0, x0, x0, LSL 8
; nextln: add x0, x0, x0, LSL 16
; nextln: add x0, x0, x0, LSL 32
; nextln: lsr x0, x0, #56
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
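The popcnt lowering above is the classic branch-free "SWAR" bit count; the and-immediates are 0x5555…, 0x3333… and 0x0f0f… repeated across the register, printed here in decimal. A Rust rendering of the same sequence, step for step (an illustration of the identity, not the backend's own code):

// Bit-parallel population count matching the lsr/and/sub/add sequence above.
fn popcnt64(x: u64) -> u64 {
    let x = x - ((x >> 1) & 0x5555_5555_5555_5555);                           // 2-bit sums
    let x = (x & 0x3333_3333_3333_3333) + ((x >> 2) & 0x3333_3333_3333_3333); // 4-bit sums
    let x = (x + (x >> 4)) & 0x0f0f_0f0f_0f0f_0f0f;                           // byte sums
    // Accumulate all bytes into the top byte via shifted adds, then shift down.
    let x = x.wrapping_add(x << 8);
    let x = x.wrapping_add(x << 16);
    let x = x.wrapping_add(x << 32);
    x >> 56
}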

View File

@@ -0,0 +1,16 @@
test vcode
target aarch64
function %f(i64, i64) -> i64 {
sig0 = (i64) -> i64
block0(v0: i64, v1: i64):
v2 = call_indirect.i64 sig0, v1(v0)
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: blr x1
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret

View File

@@ -0,0 +1,17 @@
test vcode
target aarch64
function %f(i64) -> i64 {
fn0 = %g(i64) -> i64
block0(v0: i64):
v1 = call fn0(v0)
return v1
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: bl 0
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret

View File

@@ -0,0 +1,66 @@
test vcode
target aarch64
function %f(i64, i64) -> b1 {
block0(v0: i64, v1: i64):
v2 = icmp eq v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: subs xzr, x0, x1
; nextln: cset x0, eq
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = ifcmp v0, v1
brif eq v2, block1
jump block2
block1:
v4 = iconst.i64 1
return v4
block2:
v5 = iconst.i64 2
return v5
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: subs xzr, x0, x1
; nextln: b.eq 20
; check: Block 2:
; check: movz x0, #2
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
; check: Block 1:
; check: movz x0, #1
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = ifcmp v0, v1
brif eq v2, block1
jump block1
block1:
v4 = iconst.i64 1
return v4
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: subs xzr, x0, x1
; check: Block 1:
; check: movz x0, #1
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret

View File

@@ -0,0 +1,43 @@
test vcode
target aarch64
function %f(i8, i64, i64) -> i64 {
block0(v0: i8, v1: i64, v2: i64):
v3 = iconst.i8 42
v4 = ifcmp v0, v3
v5 = selectif.i64 eq v4, v1, v2
return v5
}
; check: subs wzr
; check: csel x0, $(=x[0-9]+, x[0-9]+), eq
function %g(i8) -> b1 {
block0(v0: i8):
v3 = iconst.i8 42
v4 = ifcmp v0, v3
v5 = trueif eq v4
return v5
}
; check: subs wzr
; check: cset x0, eq
function %h(i8, i8, i8) -> i8 {
block0(v0: i8, v1: i8, v2: i8):
v3 = bitselect.i8 v0, v1, v2
return v3
}
; check: and
; nextln: bic
; nextln: orr
function %i(b1, i8, i8) -> i8 {
block0(v0: b1, v1: i8, v2: i8):
v3 = select.i8 v0, v1, v2
return v3
}
; check: subs wzr
; nextln: csel
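The bitselect expansion above follows the identity bitselect(c, x, y) = (x & c) | (y & !c), which maps onto exactly the and/bic/orr triple being checked. A small illustrative sketch:

// bitselect: take bits of x where the mask c is 1, bits of y where it is 0.
fn bitselect(c: u8, x: u8, y: u8) -> u8 {
    let from_x = x & c;  // and
    let from_y = y & !c; // bic (AND with complement)
    from_x | from_y      // orr
}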

View File

@@ -0,0 +1,176 @@
test vcode
target aarch64
function %f() -> i64 {
block0:
v0 = iconst.i64 0
return v0
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: movz x0, #0
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f() -> i64 {
block0:
v0 = iconst.i64 0xffff
return v0
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: movz x0, #65535
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f() -> i64 {
block0:
v0 = iconst.i64 0xffff0000
return v0
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: movz x0, #65535, LSL #16
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f() -> i64 {
block0:
v0 = iconst.i64 0xffff00000000
return v0
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: movz x0, #65535, LSL #32
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f() -> i64 {
block0:
v0 = iconst.i64 0xffff000000000000
return v0
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: movz x0, #65535, LSL #48
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f() -> i64 {
block0:
v0 = iconst.i64 0xffffffffffffffff
return v0
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: movn x0, #0
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f() -> i64 {
block0:
v0 = iconst.i64 0xffffffffffff0000
return v0
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: movn x0, #65535
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f() -> i64 {
block0:
v0 = iconst.i64 0xffffffff0000ffff
return v0
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: movn x0, #65535, LSL #16
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f() -> i64 {
block0:
v0 = iconst.i64 0xffff0000ffffffff
return v0
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: movn x0, #65535, LSL #32
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f() -> i64 {
block0:
v0 = iconst.i64 0x0000ffffffffffff
return v0
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: movn x0, #65535, LSL #48
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f() -> i64 {
block0:
v0 = iconst.i64 0xf34bf0a31212003a ; random digits
return v0
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: movz x0, #58
; nextln: movk x0, #4626, LSL #16
; nextln: movk x0, #61603, LSL #32
; nextln: movk x0, #62283, LSL #48
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f() -> i64 {
block0:
v0 = iconst.i64 0x12e900001ef40000 ; random digits with 2 clear half words
return v0
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: movz x0, #7924, LSL #16
; nextln: movk x0, #4841, LSL #48
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f() -> i64 {
block0:
v0 = iconst.i64 0x12e9ffff1ef4ffff ; random digits with 2 full half words
return v0
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: movn x0, #57611, LSL #16
; nextln: movk x0, #4841, LSL #48
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
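These constant tests exercise the movz/movn/movk materialization strategy: the 64-bit value is split into four 16-bit halfwords, the sequence starts with movz (unwritten halfwords default to zero) or movn (unwritten halfwords default to all-ones) depending on which default covers more of the value, and each remaining halfword is patched in with movk at the appropriate shift. A hedged sketch of the movz-versus-movn choice (an assumption about the heuristic that matches the cases above, not the backend's literal helper):

// Decide whether a movn-based sequence needs fewer instructions than movz.
fn prefer_movn(value: u64) -> bool {
    let mut zero_halfwords = 0;
    let mut ones_halfwords = 0;
    for shift in (0..64).step_by(16) {
        match (value >> shift) & 0xffff {
            0x0000 => zero_halfwords += 1,
            0xffff => ones_halfwords += 1,
            _ => {}
        }
    }
    // More all-ones halfwords than all-zero ones: start from !0 with movn.
    ones_halfwords > zero_halfwords
}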

View File

@@ -0,0 +1,18 @@
test vcode
target aarch64
function %f(i8) -> i64 {
block0(v0: i8):
v1 = sextend.i64 v0
v2 = iconst.i64 42
v3 = iadd.i64 v2, v1
return v3
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: movz x1, #42
; nextln: add x0, x1, x0, SXTB
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret

View File

@@ -0,0 +1,44 @@
test vcode
target aarch64
function %f(i64) -> i64 {
jt0 = jump_table [block1, block2, block3]
block0(v0: i64):
br_table v0, block4, jt0
block1:
v1 = iconst.i64 1
jump block5(v1)
block2:
v2 = iconst.i64 2
jump block5(v2)
block3:
v3 = iconst.i64 3
jump block5(v3)
block4:
v4 = iconst.i64 4
jump block5(v4)
block5(v5: i64):
v6 = iadd.i64 v0, v5
return v6
}
; check: subs wzr, w0, #3
; nextln: b.hs
; nextln: adr x2, pc+16 ; ldrsw x1, [x2, x0, LSL 2] ; add x2, x2, x1 ; br x2 ; jt_entries
; check: movz x1, #3
; nextln: b
; check: movz x1, #2
; nextln: b
; check: movz x1, #1
; check: add x0, x0, x1
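The br_table lowering bounds-checks the index against the table size (the subs/b.hs pair branching to the default block) and then performs a PC-relative lookup: adr takes the address of the inline jt_entries, ldrsw loads a sign-extended 32-bit offset scaled by the index, and the add/br pair jumps to table base plus offset. A sketch of the lookup arithmetic (illustrative; the names are made up):

// Emulates:  adr x2, <table> ; ldrsw x1, [x2, x0, LSL 2] ; add x2, x2, x1 ; br x2
fn jump_table_target(table_base: u64, entries: &[i32], index: u64) -> u64 {
    let offset = entries[index as usize] as i64; // 32-bit entry, sign-extended
    table_base.wrapping_add(offset as u64)       // branch destination
}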

View File

@@ -0,0 +1,69 @@
test vcode
target aarch64
function %add8(i8, i8) -> i8 {
block0(v0: i8, v1: i8):
v2 = iadd.i8 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: add w0, w0, w1
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %add16(i16, i16) -> i16 {
block0(v0: i16, v1: i16):
v2 = iadd.i16 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: add w0, w0, w1
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %add32(i32, i32) -> i32 {
block0(v0: i32, v1: i32):
v2 = iadd.i32 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: add w0, w0, w1
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %add32_8(i32, i8) -> i32 {
block0(v0: i32, v1: i8):
v2 = sextend.i32 v1
v3 = iadd.i32 v0, v2
return v3
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: add w0, w0, w1, SXTB
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %add64_32(i64, i32) -> i64 {
block0(v0: i64, v1: i32):
v2 = sextend.i64 v1
v3 = iadd.i64 v0, v2
return v3
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: add x0, x0, x1, SXTW
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret

View File

@@ -0,0 +1,36 @@
test vcode
target aarch64
function %uaddsat64(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = uadd_sat.i64 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: mov v0.d[0], x0
; nextln: mov v1.d[0], x1
; nextln: uqadd d0, d0, d1
; nextln: mov x0, v0.d[0]
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %uaddsat8(i8, i8) -> i8 {
block0(v0: i8, v1: i8):
v2 = uadd_sat.i8 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: uxtb x0, w0
; nextln: uxtb x1, w1
; nextln: mov v0.d[0], x0
; nextln: mov v1.d[0], x1
; nextln: uqadd d0, d0, d1
; nextln: mov x0, v0.d[0]
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
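There is no scalar saturating add on AArch64, so uadd_sat is routed through the vector unit: the operands are moved into the low lane of two vector registers, uqadd on the d registers performs the 64-bit saturating add, and the result is moved back; narrower inputs are zero-extended before the transfer. The scalar semantics of that uqadd are simply (illustrative sketch):

// Unsigned add that clamps at the maximum instead of wrapping.
fn uadd_sat64(a: u64, b: u64) -> u64 {
    a.checked_add(b).unwrap_or(u64::MAX) // equivalently: a.saturating_add(b)
}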

View File

@@ -0,0 +1,17 @@
test vcode
target aarch64
function %f(i64) -> i64 {
block0(v0: i64):
v1 = iconst.i64 3
v2 = ishl.i64 v0, v1
v3 = iadd.i64 v0, v2
return v3
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: add x0, x0, x0, LSL 3
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret

View File

@@ -0,0 +1,440 @@
test vcode
target aarch64
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; ROR, variable
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
function %f0(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = rotr.i64 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: ror x0, x0, x1
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f1(i32, i32) -> i32 {
block0(v0: i32, v1: i32):
v2 = rotr.i32 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: ror w0, w0, w1
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f2(i16, i16) -> i16 {
block0(v0: i16, v1: i16):
v2 = rotr.i16 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: uxth w0, w0
; nextln: sub w2, w1, #16
; nextln: sub w2, wzr, w2
; nextln: lsr w1, w0, w1
; nextln: lsl w0, w0, w2
; nextln: orr w0, w0, w1
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f3(i8, i8) -> i8 {
block0(v0: i8, v1: i8):
v2 = rotr.i8 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: uxtb w0, w0
; nextln: sub w2, w1, #8
; nextln: sub w2, wzr, w2
; nextln: lsr w1, w0, w1
; nextln: lsl w0, w0, w2
; nextln: orr w0, w0, w1
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; ROL, variable
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
function %f4(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = rotl.i64 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: sub w2, w1, #64
; nextln: sub w2, wzr, w2
; nextln: lsl x1, x0, x1
; nextln: lsr x0, x0, x2
; nextln: orr x0, x0, x1
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f5(i32, i32) -> i32 {
block0(v0: i32, v1: i32):
v2 = rotl.i32 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: sub w2, w1, #32
; nextln: sub w2, wzr, w2
; nextln: lsl w1, w0, w1
; nextln: lsr w0, w0, w2
; nextln: orr w0, w0, w1
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f6(i16, i16) -> i16 {
block0(v0: i16, v1: i16):
v2 = rotl.i16 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: uxth w0, w0
; nextln: sub w2, w1, #16
; nextln: sub w2, wzr, w2
; nextln: lsl w1, w0, w1
; nextln: lsr w0, w0, w2
; nextln: orr w0, w0, w1
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f7(i8, i8) -> i8 {
block0(v0: i8, v1: i8):
v2 = rotl.i8 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: uxtb w0, w0
; nextln: sub w2, w1, #8
; nextln: sub w2, wzr, w2
; nextln: lsl w1, w0, w1
; nextln: lsr w0, w0, w2
; nextln: orr w0, w0, w1
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
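AArch64 has a rotate-right instruction but no rotate-left, so rotr maps directly onto ror while rotl of an N-bit value is synthesized as (x << n) | (x >> (N - n)): the two sub instructions compute the complementary shift amount and the lsl/lsr/orr triple combines the halves, with narrow types zero-extended first. A sketch of the identity (illustrative; like the hardware, it takes shift amounts modulo the register width):

// Rotate-left via complementary shifts, mirroring the lsl/lsr/orr sequence above.
fn rotl64(x: u64, n: u32) -> u64 {
    let left = x.wrapping_shl(n);                      // lsl, amount mod 64
    let right = x.wrapping_shr(64u32.wrapping_sub(n)); // lsr, amount mod 64
    left | right                                       // orr
}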
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; LSR, variable
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
function %f8(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = ushr.i64 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: lsr x0, x0, x1
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f9(i32, i32) -> i32 {
block0(v0: i32, v1: i32):
v2 = ushr.i32 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: lsr w0, w0, w1
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f10(i16, i16) -> i16 {
block0(v0: i16, v1: i16):
v2 = ushr.i16 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: uxth w0, w0
; nextln: lsr w0, w0, w1
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f11(i8, i8) -> i8 {
block0(v0: i8, v1: i8):
v2 = ushr.i8 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: uxtb w0, w0
; nextln: lsr w0, w0, w1
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; LSL, variable
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
function %f12(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = ishl.i64 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: lsl x0, x0, x1
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f13(i32, i32) -> i32 {
block0(v0: i32, v1: i32):
v2 = ishl.i32 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: lsl w0, w0, w1
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f14(i16, i16) -> i16 {
block0(v0: i16, v1: i16):
v2 = ishl.i16 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: lsl w0, w0, w1
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f15(i8, i8) -> i8 {
block0(v0: i8, v1: i8):
v2 = ishl.i8 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: lsl w0, w0, w1
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; ASR, variable
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
function %f16(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = sshr.i64 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: asr x0, x0, x1
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f17(i32, i32) -> i32 {
block0(v0: i32, v1: i32):
v2 = sshr.i32 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: asr w0, w0, w1
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f18(i16, i16) -> i16 {
block0(v0: i16, v1: i16):
v2 = sshr.i16 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: sxth w0, w0
; nextln: asr w0, w0, w1
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f19(i8, i8) -> i8 {
block0(v0: i8, v1: i8):
v2 = sshr.i8 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: sxtb w0, w0
; nextln: asr w0, w0, w1
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; immediate forms
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
function %f20(i64) -> i64 {
block0(v0: i64):
v1 = iconst.i32 17
v2 = rotr.i64 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: ror x0, x0, #17
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f21(i64) -> i64 {
block0(v0: i64):
v1 = iconst.i32 17
v2 = rotl.i64 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: lsl x1, x0, #17
; nextln: lsr x0, x0, #47
; nextln: orr x0, x0, x1
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f22(i32) -> i32 {
block0(v0: i32):
v1 = iconst.i32 17
v2 = rotl.i32 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: lsl w1, w0, #17
; nextln: lsr w0, w0, #15
; nextln: orr w0, w0, w1
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f23(i16) -> i16 {
block0(v0: i16):
v1 = iconst.i32 10
v2 = rotl.i16 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: uxth w0, w0
; nextln: lsl w1, w0, #10
; nextln: lsr w0, w0, #6
; nextln: orr w0, w0, w1
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f24(i8) -> i8 {
block0(v0: i8):
v1 = iconst.i32 3
v2 = rotl.i8 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: uxtb w0, w0
; nextln: lsl w1, w0, #3
; nextln: lsr w0, w0, #5
; nextln: orr w0, w0, w1
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f25(i64) -> i64 {
block0(v0: i64):
v1 = iconst.i32 17
v2 = ushr.i64 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: lsr x0, x0, #17
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f26(i64) -> i64 {
block0(v0: i64):
v1 = iconst.i32 17
v2 = sshr.i64 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: asr x0, x0, #17
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f27(i64) -> i64 {
block0(v0: i64):
v1 = iconst.i32 17
v2 = ishl.i64 v0, v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: lsl x0, x0, #17
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret

View File

@@ -0,0 +1,17 @@
test vcode
target aarch64
function %f() -> i64 {
gv0 = symbol %my_global
block0:
v0 = symbol_value.i64 gv0
return v0
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: ldr x0, 8 ; b 12 ; data
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret

View File

@@ -0,0 +1,29 @@
test vcode
target aarch64
function %f() {
block0:
trap user0
}
; check: udf
function %g(i64) {
block0(v0: i64):
v1 = iconst.i64 42
v2 = ifcmp v0, v1
trapif eq v2, user0
return
}
; check: subs xzr, x0, #42
; nextln: b.ne 8
; nextln: udf
function %h() {
block0:
debugtrap
return
}
; check: brk #0

View File

@@ -0,0 +1,158 @@
test vcode
target aarch64
function %f_u_8_64(i8) -> i64 {
block0(v0: i8):
v1 = uextend.i64 v0
return v1
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: uxtb x0, w0
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f_u_8_32(i8) -> i32 {
block0(v0: i8):
v1 = uextend.i32 v0
return v1
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: uxtb w0, w0
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f_u_8_16(i8) -> i16 {
block0(v0: i8):
v1 = uextend.i16 v0
return v1
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: uxtb w0, w0
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f_s_8_64(i8) -> i64 {
block0(v0: i8):
v1 = sextend.i64 v0
return v1
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: sxtb x0, w0
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f_s_8_32(i8) -> i32 {
block0(v0: i8):
v1 = sextend.i32 v0
return v1
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: sxtb w0, w0
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f_s_8_16(i8) -> i16 {
block0(v0: i8):
v1 = sextend.i16 v0
return v1
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: sxtb w0, w0
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f_u_16_64(i16) -> i64 {
block0(v0: i16):
v1 = uextend.i64 v0
return v1
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: uxth x0, w0
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f_u_16_32(i16) -> i32 {
block0(v0: i16):
v1 = uextend.i32 v0
return v1
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: uxth w0, w0
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f_s_16_64(i16) -> i64 {
block0(v0: i16):
v1 = sextend.i64 v0
return v1
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: sxth x0, w0
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f_s_16_32(i16) -> i32 {
block0(v0: i16):
v1 = sextend.i32 v0
return v1
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: sxth w0, w0
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f_u_32_64(i32) -> i64 {
block0(v0: i32):
v1 = uextend.i64 v0
return v1
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: mov w0, w0
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f_s_32_64(i32) -> i64 {
block0(v0: i32):
v1 = sextend.i64 v0
return v1
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: sxtw x0, w0
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
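The one non-obvious case in this file is uextend.i64 of an i32 operand, which lowers to a plain mov w0, w0: writing any 32-bit W register zeroes the upper half of the underlying X register, so the move by itself performs the zero-extension. A trivial sketch of the equivalence (illustrative):

// What `mov w0, w0` achieves for a 32-to-64-bit zero-extension.
fn uextend_32_to_64(x: u32) -> u64 {
    x as u64 // upper 32 bits are zero
}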

View File

@@ -56,6 +56,7 @@ mod test_shrink;
mod test_simple_gvn;
mod test_simple_preopt;
mod test_unwind;
mod test_vcode;
mod test_verifier;
/// The result of running the test in a file.
@@ -134,6 +135,7 @@ fn new_subtest(parsed: &TestCommand) -> subtest::SubtestResult<Box<dyn subtest::
"run" => test_run::subtest(parsed),
"shrink" => test_shrink::subtest(parsed),
"simple-gvn" => test_simple_gvn::subtest(parsed),
"vcode" => test_vcode::subtest(parsed),
"verifier" => test_verifier::subtest(parsed),
"preopt" => test_preopt::subtest(parsed),
"safepoint" => test_safepoint::subtest(parsed),

View File

@@ -0,0 +1,67 @@
use crate::subtest::{run_filecheck, Context, SubTest, SubtestResult};
use cranelift_codegen::ir::Function;
use cranelift_codegen::isa::lookup;
use cranelift_codegen::settings;
use cranelift_codegen::Context as CodegenContext;
use cranelift_reader::{TestCommand, TestOption};
use log::info;
use std::borrow::Cow;
use std::string::String;
struct TestVCode {
arch: String,
}
pub fn subtest(parsed: &TestCommand) -> SubtestResult<Box<dyn SubTest>> {
assert_eq!(parsed.command, "vcode");
let mut arch = "arm64".to_string();
for option in &parsed.options {
match option {
TestOption::Value(k, v) if k == &"arch" => {
arch = v.to_string();
}
_ => {}
}
}
Ok(Box::new(TestVCode { arch }))
}
impl SubTest for TestVCode {
fn name(&self) -> &'static str {
"vcode"
}
fn is_mutating(&self) -> bool {
true
}
fn needs_isa(&self) -> bool {
true
}
fn run(&self, func: Cow<Function>, context: &Context) -> SubtestResult<()> {
let triple = context.isa.unwrap().triple().clone();
let func = func.into_owned();
let mut isa = lookup(triple)
.map_err(|_| format!("Could not look up backend for arch '{}'", self.arch))?
.finish(settings::Flags::new(settings::builder()));
let mut codectx = CodegenContext::for_function(func);
codectx.set_disasm(true);
codectx
.compile(&mut *isa)
.map_err(|e| format!("Could not compile with arch '{}': {:?}", self.arch, e))?;
let result = codectx.mach_compile_result.take().unwrap();
let text = result.disasm.unwrap();
info!("text input to filecheck is:\n{}\n", text);
run_filecheck(&text, context)
}
}

View File

@@ -49,42 +49,42 @@ fn handle_module(
// If we have an isa from the command-line, use that. Otherwise if the
// file contains a unique isa, use that.
let isa = if let Some(isa) = fisa.isa {
isa
} else if let Some(isa) = test_file.isa_spec.unique_isa() {
isa
} else {
let isa = fisa.isa.or(test_file.isa_spec.unique_isa());
if isa.is_none() {
return Err(String::from("compilation requires a target isa"));
};
for (func, _) in test_file.functions {
let mut context = Context::new();
context.func = func;
let mut relocs = PrintRelocs::new(flag_print);
let mut traps = PrintTraps::new(flag_print);
let mut stackmaps = PrintStackmaps::new(flag_print);
let mut mem = vec![];
// Compile and encode the result to machine code.
let code_info = context
.compile_and_emit(isa, &mut mem, &mut relocs, &mut traps, &mut stackmaps)
.map_err(|err| pretty_error(&context.func, Some(isa), err))?;
if let Some(isa) = isa {
let mut context = Context::new();
context.func = func;
let mut mem = vec![];
if flag_print {
println!("{}", context.func.display(isa));
}
// Compile and encode the result to machine code.
let code_info = context
.compile_and_emit(isa, &mut mem, &mut relocs, &mut traps, &mut stackmaps)
.map_err(|err| pretty_error(&context.func, Some(isa), err))?;
if flag_disasm {
print_all(
isa,
&mem,
code_info.code_size,
code_info.jumptables_size + code_info.rodata_size,
&relocs,
&traps,
&stackmaps,
)?;
if flag_print {
println!("{}", context.func.display(isa));
}
if flag_disasm {
print_all(
isa,
&mem,
code_info.code_size,
code_info.jumptables_size + code_info.rodata_size,
&relocs,
&traps,
&stackmaps,
)?;
}
}
}

View File

@@ -2,7 +2,7 @@
use crate::Compilation;
use cranelift_codegen::binemit::Reloc;
use std::ptr::write_unaligned;
use std::ptr::{read_unaligned, write_unaligned};
use wasmtime_environ::{Module, Relocation, RelocationTarget};
use wasmtime_runtime::libcalls;
use wasmtime_runtime::VMFunctionBody;
@@ -101,6 +101,23 @@ fn apply_reloc(
Reloc::X86PCRelRodata4 => {
// ignore
}
Reloc::Arm64Call => unsafe {
let reloc_address = body.add(r.offset as usize) as usize;
let reloc_addend = r.addend as isize;
let reloc_delta = (target_func_address as u64).wrapping_sub(reloc_address as u64);
// TODO: come up with a PLT-like solution for longer calls. We can't extend the
// code segment at this point, but we could conservatively allocate space at the
// end of the function during codegen, a fixed amount per call, to allow for
// potential branch islands.
assert!((reloc_delta as i64) < (1 << 27));
assert!((reloc_delta as i64) >= -(1 << 27));
let reloc_delta = reloc_delta as u32;
let reloc_delta = reloc_delta.wrapping_add(reloc_addend as u32);
let delta_bits = reloc_delta >> 2;
let insn = read_unaligned(reloc_address as *const u32);
let new_insn = (insn & 0xfc00_0000) | (delta_bits & 0x03ff_ffff);
write_unaligned(reloc_address as *mut u32, new_insn);
},
_ => panic!("unsupported reloc kind"),
}
}
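The Arm64Call arm above patches the 26-bit immediate of a bl instruction in place: it computes the byte distance from the call site to the target, asserts that it fits the ±128 MiB branch range, converts it to a word offset, and splices the low 26 bits into the existing opcode. A standalone sketch of the same bit manipulation (illustrative, separate from the function above):

// Patch the 26-bit signed word offset of an AArch64 BL/B instruction.
// `insn` is the original encoding; `delta` is target minus call-site address, in bytes.
fn patch_branch26(insn: u32, delta: i64) -> u32 {
    assert!(delta >= -(1 << 27) && delta < (1 << 27)); // +/-128 MiB reach
    let imm26 = ((delta >> 2) as u32) & 0x03ff_ffff;   // word offset, low 26 bits
    (insn & 0xfc00_0000) | imm26                       // keep opcode bits, splice offset
}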
@@ -108,14 +125,11 @@ fn apply_reloc(
// A declaration for the stack probe function in Rust's standard library, for
// catching callstack overflow.
cfg_if::cfg_if! {
if #[cfg(any(
target_arch="aarch64",
all(
if #[cfg(all(
target_os = "windows",
target_env = "msvc",
target_pointer_width = "64"
)
))] {
))] {
extern "C" {
pub fn __chkstk();
}
@@ -128,6 +142,13 @@ cfg_if::cfg_if! {
pub fn ___chkstk();
}
const PROBESTACK: unsafe extern "C" fn() = ___chkstk;
} else if #[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))] {
// As per
// https://github.com/rust-lang/compiler-builtins/blob/cae3e6ea23739166504f9f9fb50ec070097979d4/src/probestack.rs#L39,
// LLVM only has stack-probe support on x86-64 and x86. Thus, on any other CPU
// architecture, we simply use an empty stack-probe function.
extern "C" fn empty_probestack() {}
const PROBESTACK: unsafe extern "C" fn() = empty_probestack;
} else {
extern "C" {
pub fn __rust_probestack();

View File

@@ -26,3 +26,12 @@ void* GetPcFromUContext(ucontext_t *cx) {
return (void*) cx->uc_mcontext->__ss.__rip;
}
#endif
#if defined(__linux__) && defined(__aarch64__)
#include <sys/ucontext.h>
void* GetPcFromUContext(ucontext_t *cx) {
return (void*) cx->uc_mcontext.pc;
}
#endif // __linux__ && __aarch64__

View File

@@ -156,6 +156,12 @@ cfg_if::cfg_if! {
if #[cfg(all(target_os = "linux", target_arch = "x86_64"))] {
let cx = &*(cx as *const libc::ucontext_t);
cx.uc_mcontext.gregs[libc::REG_RIP as usize] as *const u8
} else if #[cfg(all(target_os = "linux", target_arch = "aarch64"))] {
// libc doesn't seem to support Linux/aarch64 at the moment?
extern "C" {
fn GetPcFromUContext(cx: *mut libc::c_void) -> *const u8;
}
GetPcFromUContext(cx)
} else if #[cfg(target_os = "macos")] {
// FIXME(rust-lang/libc#1702) - once that lands and is
// released we should inline the definition here