Merge pull request #1607 from cfallin/aarch64-stack-frame

Rework aarch64 stack frame implementation to use positive offsets.
Author: Chris Fallin
Date: 2020-05-06 10:29:30 -07:00 (committed via GitHub)
16 changed files with 496 additions and 320 deletions


@@ -1,4 +1,63 @@
//! Implementation of the standard AArch64 ABI. //! Implementation of the standard AArch64 ABI.
//!
//! We implement the standard AArch64 ABI, as documented by ARM. This ABI
//! specifies how arguments are passed (in registers or on the stack, as
//! appropriate), which registers are caller- and callee-saved, and how a
//! particular part of the stack frame (the FP/LR pair) must be linked through
//! the active stack frames.
//!
//! Note, however, that the exact stack layout is up to us. We settled on the
//! below design based on several requirements. In particular, we need to be
//! able to generate instructions (or instruction sequences) to access
//! arguments, stack slots, and spill slots before we know how many spill slots
//! or clobber-saves there will be, because of our pass structure. We also
//! prefer positive offsets to negative offsets because of an asymmetry in
//! AArch64 addressing modes (positive offsets have a larger possible range
//! without a long-form sequence to synthesize an arbitrary offset). Finally, it
//! is not allowed to access memory below the current SP value.
//!
//! As a result, we keep the FP/LR pair just below stack args so that we can
//! access these args at known offsets from FP, and we access on-stack storage
//! using positive offsets from SP. In order to allow codegen for the latter
//! before knowing how many clobber-saves we have, and also allow it while SP is
//! being adjusted to set up a call, we implement a "nominal SP" tracking
//! feature by which a fixup (distance between actual SP and a "nominal" SP) is
//! known at each instruction. See the documentation for
//! [MemArg::NominalSPOffset] for more on this.
//!
//! The stack looks like:
//!
//! ```plain
//! (high address)
//!
//! +---------------------------+
//! | ... |
//! | stack args |
//! | (accessed via FP) |
//! +---------------------------+
//! SP at function entry -----> | LR (pushed by prologue) |
//! +---------------------------+
//! FP after prologue --------> | FP (pushed by prologue) |
//! +---------------------------+
//! | ... |
//! | spill slots |
//! | (accessed via nominal-SP) |
//! | ... |
//! | stack slots |
//! | (accessed via nominal-SP) |
//! nominal SP ---------------> | (alloc'd by prologue) |
//! +---------------------------+
//! | ... |
//! | clobbered callee-saves |
//! SP at end of prologue ----> | (pushed by prologue) |
//! +---------------------------+
//! | ... |
//! | args for call |
//! SP before making a call --> | (pushed at callsite) |
//! +---------------------------+
//!
//! (low address)
//! ```
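
To make the nominal-SP bookkeeping above concrete, here is a minimal, self-contained sketch of the fixup in isolation. `Amode`, `EmitState`, and `finalize` are illustrative stand-ins for the real `MemArg::NominalSPOffset`, `EmitState`, and `mem_finalize` that appear later in this diff; only the arithmetic is meant to match.

```rust
/// Minimal model of the "nominal SP" fixup described above (illustrative only).
#[derive(Debug, Clone, Copy, PartialEq)]
enum Amode {
    /// Offset from the real SP.
    SPOffset(i64),
    /// Offset from nominal SP (real SP just after stack/spill slots are allocated).
    NominalSPOffset(i64),
}

#[derive(Default)]
struct EmitState {
    /// Running distance from real SP up to nominal SP, maintained by
    /// `VirtualSPOffsetAdj`-style meta-instructions during emission.
    virtual_sp_offset: i64,
}

/// Fold the running fixup into a nominal-SP reference, yielding a real-SP amode.
fn finalize(mem: Amode, state: &EmitState) -> Amode {
    match mem {
        Amode::NominalSPOffset(off) => Amode::SPOffset(off + state.virtual_sp_offset),
        other => other,
    }
}

fn main() {
    let mut state = EmitState::default();

    // Right after the slots are allocated, nominal SP == real SP: no fixup.
    assert_eq!(finalize(Amode::NominalSPOffset(8), &state), Amode::SPOffset(8));

    // The prologue then pushes 32 bytes of clobbered callee-saves and records
    // a +32 virtual-SP adjustment: real SP is now 32 bytes below nominal SP,
    // so the same slot is found at a larger offset from the real SP.
    state.virtual_sp_offset += 32;
    assert_eq!(finalize(Amode::NominalSPOffset(8), &state), Amode::SPOffset(40));
}
```
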
use crate::ir; use crate::ir;
use crate::ir::types; use crate::ir::types;
@@ -13,7 +72,7 @@ use alloc::vec::Vec;
use regalloc::{RealReg, Reg, RegClass, Set, SpillSlot, Writable}; use regalloc::{RealReg, Reg, RegClass, Set, SpillSlot, Writable};
use log::debug; use log::{debug, trace};
/// A location for an argument or return value. /// A location for an argument or return value.
#[derive(Clone, Copy, Debug)] #[derive(Clone, Copy, Debug)]
@@ -188,7 +247,7 @@ pub struct AArch64ABIBody {
/// Total number of spillslots, from regalloc. /// Total number of spillslots, from regalloc.
spillslots: Option<usize>, spillslots: Option<usize>,
/// Total frame size. /// Total frame size.
frame_size: Option<u32>, total_frame_size: Option<u32>,
/// Calling convention this function expects. /// Calling convention this function expects.
call_conv: isa::CallConv, call_conv: isa::CallConv,
/// The settings controlling this function's compilation. /// The settings controlling this function's compilation.
@@ -347,7 +406,7 @@ impl AArch64ABIBody {
stackslots_size: stack_offset, stackslots_size: stack_offset,
clobbered: Set::empty(), clobbered: Set::empty(),
spillslots: None, spillslots: None,
frame_size: None, total_frame_size: None,
call_conv, call_conv,
flags, flags,
is_leaf: f.is_leaf(), is_leaf: f.is_leaf(),
@@ -355,9 +414,9 @@ impl AArch64ABIBody {
} }
} }
/// Returns the size of a function call frame (including return address and FP) for this /// Returns the offset from FP to the argument area, i.e., jumping over the saved FP, return
/// function's body. /// address, and maybe other standard elements depending on ABI (e.g. Wasm TLS reg).
fn frame_size(&self) -> i64 { fn fp_to_arg_offset(&self) -> i64 {
if self.call_conv.extends_baldrdash() { if self.call_conv.extends_baldrdash() {
let num_words = self.flags.baldrdash_prologue_words() as i64; let num_words = self.flags.baldrdash_prologue_words() as i64;
debug_assert!(num_words > 0, "baldrdash must set baldrdash_prologue_words"); debug_assert!(num_words > 0, "baldrdash must set baldrdash_prologue_words");
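
As a rough numeric check of the non-Baldrdash case: assuming (per the diagram above) that only the saved FP/LR pair, 16 bytes, separates FP from the argument area, a stack argument at offset `off` is loaded from `[fp, #(16 + off)]`. The helper below is illustrative only:

```rust
/// Illustrative only: FP-relative offset of a stack argument, given the
/// ABI-dependent distance from FP to the argument area and the argument's
/// offset within that area.
fn stack_arg_fp_offset(fp_to_arg_offset: i64, arg_off: i64) -> i64 {
    fp_to_arg_offset + arg_off
}

fn main() {
    // Standard frame (saved FP + LR = 16 bytes, assumed): the second 8-byte
    // stack argument is loaded from [fp, #24].
    assert_eq!(stack_arg_fp_offset(16, 8), 24);
}
```
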
@@ -383,8 +442,8 @@ impl AArch64ABIBody {
/// happening so late in the pipeline (e.g. after register allocation). This /// happening so late in the pipeline (e.g. after register allocation). This
/// means that we need to do manual register allocation here and also be /// means that we need to do manual register allocation here and also be
/// careful to not clobber any callee-saved or argument registers. For now /// careful to not clobber any callee-saved or argument registers. For now
/// this routine makes do with the `writable_spilltmp_reg` as one temporary /// this routine makes do with the `spilltmp_reg` as one temporary
/// register, and a second register of `x16` which is caller-saved. This /// register, and a second register of `tmp2` which is caller-saved. This
/// should be fine for us since no spills should happen in this sequence of /// should be fine for us since no spills should happen in this sequence of
/// instructions, so our register won't get accidentally clobbered. /// instructions, so our register won't get accidentally clobbered.
/// ///
@@ -413,9 +472,9 @@ impl AArch64ABIBody {
// Note though that `stack_limit`'s register may be the same as // Note though that `stack_limit`'s register may be the same as
// `scratch`. If our stack size doesn't fit into an immediate this // `scratch`. If our stack size doesn't fit into an immediate this
// means we need a second scratch register for loading the stack size // means we need a second scratch register for loading the stack size
// into a register. We use `x16` here since it's caller-saved and we're // into a register.
// in the function prologue and nothing else is allocated to it yet.
let scratch = writable_spilltmp_reg(); let scratch = writable_spilltmp_reg();
let scratch2 = writable_tmp2_reg();
let stack_size = u64::from(stack_size); let stack_size = u64::from(stack_size);
if let Some(imm12) = Imm12::maybe_from_u64(stack_size) { if let Some(imm12) = Imm12::maybe_from_u64(stack_size) {
insts.push(Inst::AluRRImm12 { insts.push(Inst::AluRRImm12 {
@@ -425,16 +484,12 @@ impl AArch64ABIBody {
imm12, imm12,
}); });
} else { } else {
let scratch2 = 16; insts.extend(Inst::load_constant(scratch2, stack_size.into()));
insts.extend(Inst::load_constant(
Writable::from_reg(xreg(scratch2)),
stack_size.into(),
));
insts.push(Inst::AluRRRExtend { insts.push(Inst::AluRRRExtend {
alu_op: ALUOp::Add64, alu_op: ALUOp::Add64,
rd: scratch, rd: scratch,
rn: stack_limit, rn: stack_limit,
rm: xreg(scratch2), rm: scratch2.to_reg(),
extendop: ExtendOp::UXTX, extendop: ExtendOp::UXTX,
}); });
} }
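
Whether the immediate form or the two-scratch fallback is taken comes down to the AArch64 add/sub immediate encoding: a 12-bit value, optionally shifted left by 12 bits. A standalone restatement of that check (the function name is illustrative; the patch itself uses `Imm12::maybe_from_u64`):

```rust
/// Illustrative restatement of the AArch64 add/sub immediate rule used to
/// decide between the `Imm12` form and the load-constant-into-scratch form.
fn fits_in_imm12(value: u64) -> bool {
    // Unshifted: 0..=0xfff. Shifted-by-12: low 12 bits zero and the rest
    // still fits in 12 bits.
    value <= 0xfff || (value & 0xfff == 0 && (value >> 12) <= 0xfff)
}

fn main() {
    assert!(fits_in_imm12(4095));     // fits unshifted
    assert!(fits_in_imm12(0x7000));   // 4 KiB-aligned, fits shifted
    assert!(!fits_in_imm12(400_000)); // falls back to the second scratch (x17 after this patch)
}
```
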
@@ -460,8 +515,7 @@ impl AArch64ABIBody {
} }
} }
fn load_stack_from_fp(fp_offset: i64, into_reg: Writable<Reg>, ty: Type) -> Inst { fn load_stack(mem: MemArg, into_reg: Writable<Reg>, ty: Type) -> Inst {
let mem = MemArg::FPOffset(fp_offset);
match ty { match ty {
types::B1 types::B1
| types::B8 | types::B8
@@ -486,15 +540,11 @@ fn load_stack_from_fp(fp_offset: i64, into_reg: Writable<Reg>, ty: Type) -> Inst
mem, mem,
srcloc: None, srcloc: None,
}, },
_ => unimplemented!("load_stack_from_fp({})", ty), _ => unimplemented!("load_stack({})", ty),
} }
} }
fn store_stack(mem: MemArg, from_reg: Reg, ty: Type) -> Inst { fn store_stack(mem: MemArg, from_reg: Reg, ty: Type) -> Inst {
debug_assert!(match &mem {
MemArg::SPOffset(off) => SImm9::maybe_from_i64(*off).is_some(),
_ => true,
});
match ty { match ty {
types::B1 types::B1
| types::B8 | types::B8
@@ -523,50 +573,6 @@ fn store_stack(mem: MemArg, from_reg: Reg, ty: Type) -> Inst {
} }
} }
fn store_stack_fp(fp_offset: i64, from_reg: Reg, ty: Type) -> Inst {
store_stack(MemArg::FPOffset(fp_offset), from_reg, ty)
}
fn store_stack_sp<C: LowerCtx<I = Inst>>(
ctx: &mut C,
sp_offset: i64,
from_reg: Reg,
ty: Type,
) -> Vec<Inst> {
if SImm9::maybe_from_i64(sp_offset).is_some() {
vec![store_stack(MemArg::SPOffset(sp_offset), from_reg, ty)]
} else {
// mem_finalize will try to generate an add, but in an addition, x31 is the zero register,
// not sp! So we have to synthesize the full add here.
let tmp1 = ctx.tmp(RegClass::I64, I64);
let tmp2 = ctx.tmp(RegClass::I64, I64);
let mut result = Vec::new();
// tmp1 := sp
result.push(Inst::Mov {
rd: tmp1,
rm: stack_reg(),
});
// tmp2 := offset
for inst in Inst::load_constant(tmp2, sp_offset as u64) {
result.push(inst);
}
// tmp1 := add tmp1, tmp2
result.push(Inst::AluRRR {
alu_op: ALUOp::Add64,
rd: tmp1,
rn: tmp1.to_reg(),
rm: tmp2.to_reg(),
});
// Actual store.
result.push(store_stack(
MemArg::Unscaled(tmp1.to_reg(), SImm9::maybe_from_i64(0).unwrap()),
from_reg,
ty,
));
result
}
}
fn is_callee_save(call_conv: isa::CallConv, r: RealReg) -> bool { fn is_callee_save(call_conv: isa::CallConv, r: RealReg) -> bool {
if call_conv.extends_baldrdash() { if call_conv.extends_baldrdash() {
match r.get_class() { match r.get_class() {
@@ -706,7 +712,11 @@ impl ABIBody for AArch64ABIBody {
fn gen_copy_arg_to_reg(&self, idx: usize, into_reg: Writable<Reg>) -> Inst { fn gen_copy_arg_to_reg(&self, idx: usize, into_reg: Writable<Reg>) -> Inst {
match &self.sig.args[idx] { match &self.sig.args[idx] {
&ABIArg::Reg(r, ty) => Inst::gen_move(into_reg, r.to_reg(), ty), &ABIArg::Reg(r, ty) => Inst::gen_move(into_reg, r.to_reg(), ty),
&ABIArg::Stack(off, ty) => load_stack_from_fp(off + self.frame_size(), into_reg, ty), &ABIArg::Stack(off, ty) => load_stack(
MemArg::FPOffset(self.fp_to_arg_offset() + off),
into_reg,
ty,
),
} }
} }
@@ -767,8 +777,8 @@ impl ABIBody for AArch64ABIBody {
} }
_ => {} _ => {}
}; };
ret.push(store_stack_fp( ret.push(store_stack(
off + self.frame_size(), MemArg::FPOffset(self.fp_to_arg_offset() + off),
from_reg.to_reg(), from_reg.to_reg(),
ty, ty,
)) ))
@@ -793,6 +803,7 @@ impl ABIBody for AArch64ABIBody {
self.clobbered = clobbered; self.clobbered = clobbered;
} }
/// Load from a stackslot.
fn load_stackslot( fn load_stackslot(
&self, &self,
slot: StackSlot, slot: StackSlot,
@@ -800,47 +811,54 @@ impl ABIBody for AArch64ABIBody {
ty: Type, ty: Type,
into_reg: Writable<Reg>, into_reg: Writable<Reg>,
) -> Inst { ) -> Inst {
// Offset from beginning of stackslot area, which is at FP - stackslots_size. // Offset from beginning of stackslot area, which is at nominal-SP (see
// [MemArg::NominalSPOffset] for more details on nominal-SP tracking).
let stack_off = self.stackslots[slot.as_u32() as usize] as i64; let stack_off = self.stackslots[slot.as_u32() as usize] as i64;
let fp_off: i64 = -(self.stackslots_size as i64) + stack_off + (offset as i64); let sp_off: i64 = stack_off + (offset as i64);
load_stack_from_fp(fp_off, into_reg, ty) trace!("load_stackslot: slot {} -> sp_off {}", slot, sp_off);
load_stack(MemArg::NominalSPOffset(sp_off), into_reg, ty)
} }
/// Store to a stackslot.
fn store_stackslot(&self, slot: StackSlot, offset: u32, ty: Type, from_reg: Reg) -> Inst { fn store_stackslot(&self, slot: StackSlot, offset: u32, ty: Type, from_reg: Reg) -> Inst {
// Offset from beginning of stackslot area, which is at FP - stackslots_size. // Offset from beginning of stackslot area, which is at nominal-SP (see
// [MemArg::NominalSPOffset] for more details on nominal-SP tracking).
let stack_off = self.stackslots[slot.as_u32() as usize] as i64; let stack_off = self.stackslots[slot.as_u32() as usize] as i64;
let fp_off: i64 = -(self.stackslots_size as i64) + stack_off + (offset as i64); let sp_off: i64 = stack_off + (offset as i64);
store_stack_fp(fp_off, from_reg, ty) trace!("store_stackslot: slot {} -> sp_off {}", slot, sp_off);
store_stack(MemArg::NominalSPOffset(sp_off), from_reg, ty)
} }
/// Produce an instruction that computes a stackslot address.
fn stackslot_addr(&self, slot: StackSlot, offset: u32, into_reg: Writable<Reg>) -> Inst { fn stackslot_addr(&self, slot: StackSlot, offset: u32, into_reg: Writable<Reg>) -> Inst {
// Offset from beginning of stackslot area, which is at FP - stackslots_size. // Offset from beginning of stackslot area, which is at nominal-SP (see
// [MemArg::NominalSPOffset] for more details on nominal-SP tracking).
let stack_off = self.stackslots[slot.as_u32() as usize] as i64; let stack_off = self.stackslots[slot.as_u32() as usize] as i64;
let fp_off: i64 = -(self.stackslots_size as i64) + stack_off + (offset as i64); let sp_off: i64 = stack_off + (offset as i64);
Inst::LoadAddr { Inst::LoadAddr {
rd: into_reg, rd: into_reg,
mem: MemArg::FPOffset(fp_off), mem: MemArg::NominalSPOffset(sp_off),
} }
} }
// Load from a spillslot. /// Load from a spillslot.
fn load_spillslot(&self, slot: SpillSlot, ty: Type, into_reg: Writable<Reg>) -> Inst { fn load_spillslot(&self, slot: SpillSlot, ty: Type, into_reg: Writable<Reg>) -> Inst {
// Note that when spills/fills are generated, we don't yet know how many // Offset from beginning of spillslot area, which is at nominal-SP + stackslots_size.
// spillslots there will be, so we allocate *downward* from the beginning
// of the stackslot area. Hence: FP - stackslot_size - 8*spillslot -
// sizeof(ty).
let islot = slot.get() as i64; let islot = slot.get() as i64;
let ty_size = self.get_spillslot_size(into_reg.to_reg().get_class(), ty) * 8; let spill_off = islot * 8;
let fp_off: i64 = -(self.stackslots_size as i64) - (8 * islot) - ty_size as i64; let sp_off = self.stackslots_size as i64 + spill_off;
load_stack_from_fp(fp_off, into_reg, ty) trace!("load_spillslot: slot {:?} -> sp_off {}", slot, sp_off);
load_stack(MemArg::NominalSPOffset(sp_off), into_reg, ty)
} }
// Store to a spillslot. /// Store to a spillslot.
fn store_spillslot(&self, slot: SpillSlot, ty: Type, from_reg: Reg) -> Inst { fn store_spillslot(&self, slot: SpillSlot, ty: Type, from_reg: Reg) -> Inst {
// Offset from beginning of spillslot area, which is at nominal-SP + stackslots_size.
let islot = slot.get() as i64; let islot = slot.get() as i64;
let ty_size = self.get_spillslot_size(from_reg.get_class(), ty) * 8; let spill_off = islot * 8;
let fp_off: i64 = -(self.stackslots_size as i64) - (8 * islot) - ty_size as i64; let sp_off = self.stackslots_size as i64 + spill_off;
store_stack_fp(fp_off, from_reg, ty) trace!("store_spillslot: slot {:?} -> sp_off {}", slot, sp_off);
store_stack(MemArg::NominalSPOffset(sp_off), from_reg, ty)
} }
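
The new slot arithmetic is easy to state in one place: stack slots begin at nominal SP, and spill slots sit immediately above them at nominal SP + `stackslots_size`, eight bytes per slot. A tiny standalone check mirroring the arithmetic above (illustrative helpers only):

```rust
/// Nominal-SP offset of byte `offset` within a stack slot whose precomputed
/// offset in the stackslot area is `slot_off`.
fn stackslot_sp_off(slot_off: i64, offset: i64) -> i64 {
    slot_off + offset
}

/// Nominal-SP offset of spill slot `islot`: the spill area starts right above
/// the stackslot area, and each spill slot is 8 bytes.
fn spillslot_sp_off(stackslots_size: i64, islot: i64) -> i64 {
    stackslots_size + islot * 8
}

fn main() {
    // With 32 bytes of stack slots, spill slot 3 is at nominal-SP + 56.
    assert_eq!(spillslot_sp_off(32, 3), 56);
    // Byte 4 of a stack slot starting at offset 16 is at nominal-SP + 20.
    assert_eq!(stackslot_sp_off(16, 4), 20);
}
```
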
fn gen_prologue(&mut self) -> Vec<Inst> { fn gen_prologue(&mut self) -> Vec<Inst> {
@@ -916,9 +934,18 @@ impl ABIBody for AArch64ABIBody {
} }
} }
// N.B.: "nominal SP", which we use to refer to stackslots
// and spillslots, is *here* (the value of SP at this program point).
// If we push any clobbers below, we emit a virtual-SP adjustment
// meta-instruction so that the nominal-SP references behave as if SP
// were still at this point. See documentation for
// [crate::isa::aarch64::abi](this module) for more details on
// stackframe layout and nominal-SP maintenance.
// Save clobbered registers. // Save clobbered registers.
let (clobbered_int, clobbered_vec) = let (clobbered_int, clobbered_vec) =
get_callee_saves(self.call_conv, self.clobbered.to_vec()); get_callee_saves(self.call_conv, self.clobbered.to_vec());
let mut clobber_size = 0;
for reg_pair in clobbered_int.chunks(2) { for reg_pair in clobbered_int.chunks(2) {
let (r1, r2) = if reg_pair.len() == 2 { let (r1, r2) = if reg_pair.len() == 2 {
// .to_reg().to_reg(): Writable<RealReg> --> RealReg --> Reg // .to_reg().to_reg(): Writable<RealReg> --> RealReg --> Reg
@@ -939,6 +966,7 @@ impl ABIBody for AArch64ABIBody {
SImm7Scaled::maybe_from_i64(-16, types::I64).unwrap(), SImm7Scaled::maybe_from_i64(-16, types::I64).unwrap(),
), ),
}); });
clobber_size += 16;
} }
let vec_save_bytes = clobbered_vec.len() * 16; let vec_save_bytes = clobbered_vec.len() * 16;
if vec_save_bytes != 0 { if vec_save_bytes != 0 {
@@ -948,6 +976,7 @@ impl ABIBody for AArch64ABIBody {
rn: stack_reg(), rn: stack_reg(),
imm12: Imm12::maybe_from_u64(vec_save_bytes as u64).unwrap(), imm12: Imm12::maybe_from_u64(vec_save_bytes as u64).unwrap(),
}); });
clobber_size += vec_save_bytes;
} }
for (i, reg) in clobbered_vec.iter().enumerate() { for (i, reg) in clobbered_vec.iter().enumerate() {
insts.push(Inst::FpuStore128 { insts.push(Inst::FpuStore128 {
@@ -957,7 +986,13 @@ impl ABIBody for AArch64ABIBody {
}); });
} }
self.frame_size = Some(total_stacksize); if clobber_size > 0 {
insts.push(Inst::VirtualSPOffsetAdj {
offset: clobber_size as i64,
});
}
self.total_frame_size = Some(total_stacksize);
insts insts
} }
@@ -1009,6 +1044,12 @@ impl ABIBody for AArch64ABIBody {
}); });
} }
// N.B.: we do *not* emit a nominal-SP adjustment here, because (i) there will be no
// references to nominal-SP offsets before the return below, and (ii) the instruction
// emission tracks running SP offset linearly (in straight-line order), not according to
// the CFG, so early returns in the middle of function bodies would cause an incorrect
// offset for the rest of the body.
if !self.call_conv.extends_baldrdash() { if !self.call_conv.extends_baldrdash() {
// The MOV (alias of ORR) interprets x31 as XZR, so use an ADD here. // The MOV (alias of ORR) interprets x31 as XZR, so use an ADD here.
// MOV to SP is an alias of ADD. // MOV to SP is an alias of ADD.
@@ -1037,7 +1078,7 @@ impl ABIBody for AArch64ABIBody {
} }
fn frame_size(&self) -> u32 { fn frame_size(&self) -> u32 {
self.frame_size self.total_frame_size
.expect("frame size not computed before prologue generation") .expect("frame size not computed before prologue generation")
} }
@@ -1138,20 +1179,32 @@ impl AArch64ABICall {
} }
} }
fn adjust_stack(amt: u64, is_sub: bool) -> Vec<Inst> { fn adjust_stack(amount: u64, is_sub: bool) -> Vec<Inst> {
if amt > 0 { if amount > 0 {
let sp_adjustment = if is_sub {
amount as i64
} else {
-(amount as i64)
};
let adj_meta_insn = Inst::VirtualSPOffsetAdj {
offset: sp_adjustment,
};
let alu_op = if is_sub { ALUOp::Sub64 } else { ALUOp::Add64 }; let alu_op = if is_sub { ALUOp::Sub64 } else { ALUOp::Add64 };
if let Some(imm12) = Imm12::maybe_from_u64(amt) { if let Some(imm12) = Imm12::maybe_from_u64(amount) {
vec![Inst::AluRRImm12 { vec![
alu_op, adj_meta_insn,
rd: writable_stack_reg(), Inst::AluRRImm12 {
rn: stack_reg(), alu_op,
imm12, rd: writable_stack_reg(),
}] rn: stack_reg(),
imm12,
},
]
} else { } else {
let const_load = Inst::LoadConst64 { let const_load = Inst::LoadConst64 {
rd: writable_spilltmp_reg(), rd: writable_spilltmp_reg(),
const_data: amt, const_data: amount,
}; };
let adj = Inst::AluRRRExtend { let adj = Inst::AluRRRExtend {
alu_op, alu_op,
@@ -1160,7 +1213,7 @@ fn adjust_stack(amt: u64, is_sub: bool) -> Vec<Inst> {
rm: spilltmp_reg(), rm: spilltmp_reg(),
extendop: ExtendOp::UXTX, extendop: ExtendOp::UXTX,
}; };
vec![const_load, adj] vec![adj_meta_insn, const_load, adj]
} }
} else { } else {
vec![] vec![]
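
The sign convention here is worth pausing on: an SP decrease (`is_sub`) moves the real SP further below nominal SP, so the virtual-SP offset grows by the same amount, and the post-call cleanup undoes it. A toy model of the pairing (the names are illustrative, not the real `Inst` variants):

```rust
#[derive(Debug, PartialEq)]
enum Step {
    /// Meta-instruction: bump the emission-time virtual SP offset.
    VirtualSPOffsetAdj(i64),
    /// Real SP change, in bytes subtracted from SP (negative = added back).
    MoveSpDownBy(i64),
}

/// Toy version of `adjust_stack`: every real SP move is preceded by a
/// virtual-SP adjustment of the same magnitude and sign, so nominal-SP
/// references emitted around the callsite keep resolving correctly.
fn adjust_stack(amount: u64, is_sub: bool) -> Vec<Step> {
    if amount == 0 {
        return vec![];
    }
    let delta = if is_sub { amount as i64 } else { -(amount as i64) };
    vec![Step::VirtualSPOffsetAdj(delta), Step::MoveSpDownBy(delta)]
}

fn main() {
    // Allocating 32 bytes of call-argument space...
    assert_eq!(
        adjust_stack(32, true),
        vec![Step::VirtualSPOffsetAdj(32), Step::MoveSpDownBy(32)]
    );
    // ...and releasing it afterwards cancels the fixup.
    assert_eq!(
        adjust_stack(32, false),
        vec![Step::VirtualSPOffsetAdj(-32), Step::MoveSpDownBy(-32)]
    );
}
```
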
@@ -1182,19 +1235,14 @@ impl ABICall for AArch64ABICall {
adjust_stack(self.sig.stack_arg_space as u64, /* is_sub = */ false) adjust_stack(self.sig.stack_arg_space as u64, /* is_sub = */ false)
} }
fn gen_copy_reg_to_arg<C: LowerCtx<I = Self::I>>( fn gen_copy_reg_to_arg(&self, idx: usize, from_reg: Reg) -> Vec<Inst> {
&self,
ctx: &mut C,
idx: usize,
from_reg: Reg,
) -> Vec<Inst> {
match &self.sig.args[idx] { match &self.sig.args[idx] {
&ABIArg::Reg(reg, ty) => vec![Inst::gen_move( &ABIArg::Reg(reg, ty) => vec![Inst::gen_move(
Writable::from_reg(reg.to_reg()), Writable::from_reg(reg.to_reg()),
from_reg, from_reg,
ty, ty,
)], )],
&ABIArg::Stack(off, ty) => store_stack_sp(ctx, off, from_reg, ty), &ABIArg::Stack(off, ty) => vec![store_stack(MemArg::SPOffset(off), from_reg, ty)],
} }
} }


@@ -112,7 +112,9 @@ pub enum MemLabel {
/// A memory argument to load/store, encapsulating the possible addressing modes. /// A memory argument to load/store, encapsulating the possible addressing modes.
#[derive(Clone, Debug)] #[derive(Clone, Debug)]
pub enum MemArg { pub enum MemArg {
Label(MemLabel), //
// Real ARM64 addressing modes:
//
/// "post-indexed" mode as per AArch64 docs: postincrement reg after address computation. /// "post-indexed" mode as per AArch64 docs: postincrement reg after address computation.
PostIndexed(Writable<Reg>, SImm9), PostIndexed(Writable<Reg>, SImm9),
/// "pre-indexed" mode as per AArch64 docs: preincrement reg before address computation. /// "pre-indexed" mode as per AArch64 docs: preincrement reg before address computation.
@@ -137,11 +139,31 @@ pub enum MemArg {
/// Scaled (by size of a type) unsigned 12-bit immediate offset from reg. /// Scaled (by size of a type) unsigned 12-bit immediate offset from reg.
UnsignedOffset(Reg, UImm12Scaled), UnsignedOffset(Reg, UImm12Scaled),
/// Offset from the stack pointer. Lowered into a real amode at emission. //
// virtual addressing modes that are lowered at emission time:
//
/// Reference to a "label": e.g., a symbol.
Label(MemLabel),
/// Offset from the stack pointer.
SPOffset(i64), SPOffset(i64),
/// Offset from the frame pointer. Lowered into a real amode at emission. /// Offset from the frame pointer.
FPOffset(i64), FPOffset(i64),
/// Offset from the "nominal stack pointer", which is where the real SP is
/// just after stack and spill slots are allocated in the function prologue.
/// At emission time, this is converted to `SPOffset` with a fixup added to
/// the offset constant. The fixup is a running value that is tracked as
/// emission iterates through instructions in linear order, and can be
/// adjusted up and down with [Inst::VirtualSPOffsetAdj].
///
/// The standard ABI is in charge of handling this (by emitting the
/// adjustment meta-instructions). It maintains the invariant that "nominal
/// SP" is where the actual SP is after the function prologue and before
/// clobber pushes. See the diagram in the documentation for
/// [crate::isa::aarch64::abi](the ABI module) for more details.
NominalSPOffset(i64),
} }
impl MemArg { impl MemArg {
@@ -443,7 +465,7 @@ impl ShowWithRRU for MemArg {
simm9.show_rru(mb_rru) simm9.show_rru(mb_rru)
), ),
// Eliminated by `mem_finalize()`. // Eliminated by `mem_finalize()`.
&MemArg::SPOffset(..) | &MemArg::FPOffset(..) => { &MemArg::SPOffset(..) | &MemArg::FPOffset(..) | &MemArg::NominalSPOffset(..) => {
panic!("Unexpected stack-offset mem-arg mode!") panic!("Unexpected stack-offset mem-arg mode!")
} }
} }


@@ -10,6 +10,7 @@ use regalloc::{Reg, RegClass, Writable};
use alloc::vec::Vec; use alloc::vec::Vec;
use core::convert::TryFrom; use core::convert::TryFrom;
use log::debug;
/// Memory label/reference finalization: convert a MemLabel to a PC-relative /// Memory label/reference finalization: convert a MemLabel to a PC-relative
/// offset, possibly emitting relocation(s) as necessary. /// offset, possibly emitting relocation(s) as necessary.
@@ -23,33 +24,44 @@ pub fn memlabel_finalize(_insn_off: CodeOffset, label: &MemLabel) -> i32 {
/// generic arbitrary stack offset) into real addressing modes, possibly by /// generic arbitrary stack offset) into real addressing modes, possibly by
/// emitting some helper instructions that come immediately before the use /// emitting some helper instructions that come immediately before the use
/// of this amode. /// of this amode.
pub fn mem_finalize(insn_off: CodeOffset, mem: &MemArg) -> (Vec<Inst>, MemArg) { pub fn mem_finalize(insn_off: CodeOffset, mem: &MemArg, state: &EmitState) -> (Vec<Inst>, MemArg) {
match mem { match mem {
&MemArg::SPOffset(off) | &MemArg::FPOffset(off) => { &MemArg::SPOffset(off) | &MemArg::FPOffset(off) | &MemArg::NominalSPOffset(off) => {
let basereg = match mem { let basereg = match mem {
&MemArg::SPOffset(..) => stack_reg(), &MemArg::SPOffset(..) | &MemArg::NominalSPOffset(..) => stack_reg(),
&MemArg::FPOffset(..) => fp_reg(), &MemArg::FPOffset(..) => fp_reg(),
_ => unreachable!(), _ => unreachable!(),
}; };
let adj = match mem {
&MemArg::NominalSPOffset(..) => {
debug!(
"mem_finalize: nominal SP offset {} + adj {} -> {}",
off,
state.virtual_sp_offset,
off + state.virtual_sp_offset
);
state.virtual_sp_offset
}
_ => 0,
};
let off = off + adj;
if let Some(simm9) = SImm9::maybe_from_i64(off) { if let Some(simm9) = SImm9::maybe_from_i64(off) {
let mem = MemArg::Unscaled(basereg, simm9); let mem = MemArg::Unscaled(basereg, simm9);
(vec![], mem) (vec![], mem)
} else { } else {
// In an addition, x31 is the zero register, not sp; we have only one temporary
// so we can't do the proper add here.
debug_assert_ne!(
basereg,
stack_reg(),
"should have diverted SP before mem_finalize"
);
let tmp = writable_spilltmp_reg(); let tmp = writable_spilltmp_reg();
let mut const_insts = Inst::load_constant(tmp, off as u64); let mut const_insts = Inst::load_constant(tmp, off as u64);
let add_inst = Inst::AluRRR { // N.B.: we must use AluRRRExtend because AluRRR uses the "shifted register" form
// (AluRRRShift) instead, which interprets register 31 as the zero reg, not SP. SP
// is a valid base (for SPOffset) which we must handle here.
// Also, SP needs to be the first arg, not second.
let add_inst = Inst::AluRRRExtend {
alu_op: ALUOp::Add64, alu_op: ALUOp::Add64,
rd: tmp, rd: tmp,
rn: tmp.to_reg(), rn: basereg,
rm: basereg, rm: tmp.to_reg(),
extendop: ExtendOp::UXTX,
}; };
const_insts.push(add_inst); const_insts.push(add_inst);
(const_insts.to_vec(), MemArg::reg(tmp.to_reg())) (const_insts.to_vec(), MemArg::reg(tmp.to_reg()))
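
For a concrete picture of this fallback, the binemit tests later in this patch expect sequences of the shape `movz x16, #32768 ; add x16, fp, x16, UXTX ; ldr x1, [x16]`. A toy string-level rendering of that shape (purely illustrative; a single `movz` stands in for the full movz/movk/movn logic of `load_constant`):

```rust
/// Toy rendering of the large-offset path: load the offset into the scratch
/// register, then add it to the base with the extended-register ADD, keeping
/// the base in the first operand slot so that SP remains a legal base.
fn large_offset_load(base: &str, offset: u64, dst: &str) -> Vec<String> {
    let scratch = "x16"; // the spill temporary after this patch
    vec![
        format!("movz {}, #{}", scratch, offset),
        format!("add {}, {}, {}, UXTX", scratch, base, scratch),
        format!("ldr {}, [{}]", dst, scratch),
    ]
}

fn main() {
    assert_eq!(
        large_offset_load("fp", 32768, "x1"),
        vec![
            "movz x16, #32768".to_string(),
            "add x16, fp, x16, UXTX".to_string(),
            "ldr x1, [x16]".to_string(),
        ]
    );
}
```
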
@@ -322,8 +334,16 @@ fn enc_fround(top22: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
(top22 << 10) | (machreg_to_vec(rn) << 5) | machreg_to_vec(rd.to_reg()) (top22 << 10) | (machreg_to_vec(rn) << 5) | machreg_to_vec(rd.to_reg())
} }
/// State carried between emissions of a sequence of instructions.
#[derive(Default, Clone, Debug)]
pub struct EmitState {
virtual_sp_offset: i64,
}
impl<O: MachSectionOutput> MachInstEmit<O> for Inst { impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
fn emit(&self, sink: &mut O, flags: &settings::Flags) { type State = EmitState;
fn emit(&self, sink: &mut O, flags: &settings::Flags, state: &mut EmitState) {
match self { match self {
&Inst::AluRRR { alu_op, rd, rn, rm } => { &Inst::AluRRR { alu_op, rd, rn, rm } => {
let top11 = match alu_op { let top11 = match alu_op {
@@ -596,10 +616,10 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
ref mem, ref mem,
srcloc, srcloc,
} => { } => {
let (mem_insts, mem) = mem_finalize(sink.cur_offset_from_start(), mem); let (mem_insts, mem) = mem_finalize(sink.cur_offset_from_start(), mem, state);
for inst in mem_insts.into_iter() { for inst in mem_insts.into_iter() {
inst.emit(sink, flags); inst.emit(sink, flags, state);
} }
// ldst encoding helpers take Reg, not Writable<Reg>. // ldst encoding helpers take Reg, not Writable<Reg>.
@@ -697,9 +717,9 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
sink.put4(enc_ldst_simm9(op, simm9, 0b01, reg.to_reg(), rd)); sink.put4(enc_ldst_simm9(op, simm9, 0b01, reg.to_reg(), rd));
} }
// Eliminated by `mem_finalize()` above. // Eliminated by `mem_finalize()` above.
&MemArg::SPOffset(..) | &MemArg::FPOffset(..) => { &MemArg::SPOffset(..)
panic!("Should not see stack-offset here!") | &MemArg::FPOffset(..)
} | &MemArg::NominalSPOffset(..) => panic!("Should not see stack-offset here!"),
} }
} }
@@ -739,10 +759,10 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
ref mem, ref mem,
srcloc, srcloc,
} => { } => {
let (mem_insts, mem) = mem_finalize(sink.cur_offset_from_start(), mem); let (mem_insts, mem) = mem_finalize(sink.cur_offset_from_start(), mem, state);
for inst in mem_insts.into_iter() { for inst in mem_insts.into_iter() {
inst.emit(sink, flags); inst.emit(sink, flags, state);
} }
let op = match self { let op = match self {
@@ -794,9 +814,9 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
sink.put4(enc_ldst_simm9(op, simm9, 0b01, reg.to_reg(), rd)); sink.put4(enc_ldst_simm9(op, simm9, 0b01, reg.to_reg(), rd));
} }
// Eliminated by `mem_finalize()` above. // Eliminated by `mem_finalize()` above.
&MemArg::SPOffset(..) | &MemArg::FPOffset(..) => { &MemArg::SPOffset(..)
panic!("Should not see stack-offset here!") | &MemArg::FPOffset(..)
} | &MemArg::NominalSPOffset(..) => panic!("Should not see stack-offset here!"),
} }
} }
@@ -980,11 +1000,11 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
mem: MemArg::Label(MemLabel::PCRel(8)), mem: MemArg::Label(MemLabel::PCRel(8)),
srcloc: None, srcloc: None,
}; };
inst.emit(sink, flags); inst.emit(sink, flags, state);
let inst = Inst::Jump { let inst = Inst::Jump {
dest: BranchTarget::ResolvedOffset(8), dest: BranchTarget::ResolvedOffset(8),
}; };
inst.emit(sink, flags); inst.emit(sink, flags, state);
sink.put4(const_data.to_bits()); sink.put4(const_data.to_bits());
} }
&Inst::LoadFpuConst64 { rd, const_data } => { &Inst::LoadFpuConst64 { rd, const_data } => {
@@ -993,11 +1013,11 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
mem: MemArg::Label(MemLabel::PCRel(8)), mem: MemArg::Label(MemLabel::PCRel(8)),
srcloc: None, srcloc: None,
}; };
inst.emit(sink, flags); inst.emit(sink, flags, state);
let inst = Inst::Jump { let inst = Inst::Jump {
dest: BranchTarget::ResolvedOffset(12), dest: BranchTarget::ResolvedOffset(12),
}; };
inst.emit(sink, flags); inst.emit(sink, flags, state);
sink.put8(const_data.to_bits()); sink.put8(const_data.to_bits());
} }
&Inst::FpuCSel32 { rd, rn, rm, cond } => { &Inst::FpuCSel32 { rd, rn, rm, cond } => {
@@ -1084,7 +1104,7 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
if top22 != 0 { if top22 != 0 {
sink.put4(enc_extend(top22, rd, rn)); sink.put4(enc_extend(top22, rd, rn));
} else { } else {
Inst::mov32(rd, rn).emit(sink, flags); Inst::mov32(rd, rn).emit(sink, flags, state);
} }
} }
&Inst::Extend { &Inst::Extend {
@@ -1107,7 +1127,7 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
rn: zero_reg(), rn: zero_reg(),
rm: rd.to_reg(), rm: rd.to_reg(),
}; };
sub_inst.emit(sink, flags); sub_inst.emit(sink, flags, state);
} }
&Inst::Extend { &Inst::Extend {
rd, rd,
@@ -1248,13 +1268,13 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
// Save index in a tmp (the live range of ridx only goes to start of this // Save index in a tmp (the live range of ridx only goes to start of this
// sequence; rtmp1 or rtmp2 may overwrite it). // sequence; rtmp1 or rtmp2 may overwrite it).
let inst = Inst::gen_move(rtmp2, ridx, I64); let inst = Inst::gen_move(rtmp2, ridx, I64);
inst.emit(sink, flags); inst.emit(sink, flags, state);
// Load address of jump table // Load address of jump table
let inst = Inst::Adr { let inst = Inst::Adr {
rd: rtmp1, rd: rtmp1,
label: MemLabel::PCRel(16), label: MemLabel::PCRel(16),
}; };
inst.emit(sink, flags); inst.emit(sink, flags, state);
// Load value out of jump table // Load value out of jump table
let inst = Inst::SLoad32 { let inst = Inst::SLoad32 {
rd: rtmp2, rd: rtmp2,
@@ -1266,7 +1286,7 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
), ),
srcloc: None, // can't cause a user trap. srcloc: None, // can't cause a user trap.
}; };
inst.emit(sink, flags); inst.emit(sink, flags, state);
// Add base of jump table to jump-table-sourced block offset // Add base of jump table to jump-table-sourced block offset
let inst = Inst::AluRRR { let inst = Inst::AluRRR {
alu_op: ALUOp::Add64, alu_op: ALUOp::Add64,
@@ -1274,14 +1294,14 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
rn: rtmp1.to_reg(), rn: rtmp1.to_reg(),
rm: rtmp2.to_reg(), rm: rtmp2.to_reg(),
}; };
inst.emit(sink, flags); inst.emit(sink, flags, state);
// Branch to computed address. (`targets` here is only used for successor queries // Branch to computed address. (`targets` here is only used for successor queries
// and is not needed for emission.) // and is not needed for emission.)
let inst = Inst::IndirectBr { let inst = Inst::IndirectBr {
rn: rtmp1.to_reg(), rn: rtmp1.to_reg(),
targets: vec![], targets: vec![],
}; };
inst.emit(sink, flags); inst.emit(sink, flags, state);
// Emit jump table (table of 32-bit offsets). // Emit jump table (table of 32-bit offsets).
for target in targets { for target in targets {
let off = target.as_offset_words() * 4; let off = target.as_offset_words() * 4;
@@ -1297,11 +1317,11 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
mem: MemArg::Label(MemLabel::PCRel(8)), mem: MemArg::Label(MemLabel::PCRel(8)),
srcloc: None, // can't cause a user trap. srcloc: None, // can't cause a user trap.
}; };
inst.emit(sink, flags); inst.emit(sink, flags, state);
let inst = Inst::Jump { let inst = Inst::Jump {
dest: BranchTarget::ResolvedOffset(12), dest: BranchTarget::ResolvedOffset(12),
}; };
inst.emit(sink, flags); inst.emit(sink, flags, state);
sink.put8(const_data); sink.put8(const_data);
} }
&Inst::LoadExtName { &Inst::LoadExtName {
@@ -1315,11 +1335,11 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
mem: MemArg::Label(MemLabel::PCRel(8)), mem: MemArg::Label(MemLabel::PCRel(8)),
srcloc: None, // can't cause a user trap. srcloc: None, // can't cause a user trap.
}; };
inst.emit(sink, flags); inst.emit(sink, flags, state);
let inst = Inst::Jump { let inst = Inst::Jump {
dest: BranchTarget::ResolvedOffset(12), dest: BranchTarget::ResolvedOffset(12),
}; };
inst.emit(sink, flags); inst.emit(sink, flags, state);
sink.add_reloc(srcloc, Reloc::Abs8, name, offset); sink.add_reloc(srcloc, Reloc::Abs8, name, offset);
if flags.emit_all_ones_funcaddrs() { if flags.emit_all_ones_funcaddrs() {
sink.put8(u64::max_value()); sink.put8(u64::max_value());
@@ -1327,52 +1347,81 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
sink.put8(0); sink.put8(0);
} }
} }
&Inst::LoadAddr { rd, ref mem } => match *mem { &Inst::LoadAddr { rd, ref mem } => {
MemArg::FPOffset(fp_off) => { let (mem_insts, mem) = mem_finalize(sink.cur_offset_from_start(), mem, state);
let alu_op = if fp_off < 0 { for inst in mem_insts.into_iter() {
ALUOp::Sub64 inst.emit(sink, flags, state);
} else {
ALUOp::Add64
};
if let Some(imm12) = Imm12::maybe_from_u64(u64::try_from(fp_off.abs()).unwrap())
{
let inst = Inst::AluRRImm12 {
alu_op,
rd,
imm12,
rn: fp_reg(),
};
inst.emit(sink, flags);
} else {
let const_insts =
Inst::load_constant(rd, u64::try_from(fp_off.abs()).unwrap());
for inst in const_insts {
inst.emit(sink, flags);
}
let inst = Inst::AluRRR {
alu_op,
rd,
rn: fp_reg(),
rm: rd.to_reg(),
};
inst.emit(sink, flags);
}
} }
_ => unimplemented!("{:?}", mem),
}, let (reg, offset) = match mem {
MemArg::Unscaled(r, simm9) => (r, simm9.value()),
MemArg::UnsignedOffset(r, uimm12scaled) => (r, uimm12scaled.value() as i32),
_ => panic!("Unsupported case for LoadAddr: {:?}", mem),
};
let abs_offset = if offset < 0 {
-offset as u64
} else {
offset as u64
};
let alu_op = if offset < 0 {
ALUOp::Sub64
} else {
ALUOp::Add64
};
if offset == 0 {
let mov = Inst::mov(rd, reg);
mov.emit(sink, flags, state);
} else if let Some(imm12) = Imm12::maybe_from_u64(abs_offset) {
let add = Inst::AluRRImm12 {
alu_op,
rd,
rn: reg,
imm12,
};
add.emit(sink, flags, state);
} else {
// Use `tmp2` here: `reg` may be `spilltmp` if the `MemArg` on this instruction
// was initially an `SPOffset`. Assert that `tmp2` is truly free to use. Note
// that no other instructions will be inserted here (we're emitting directly),
// and a live range of `tmp2` should not span this instruction, so this use
// should otherwise be correct.
debug_assert!(rd.to_reg() != tmp2_reg());
debug_assert!(reg != tmp2_reg());
let tmp = writable_tmp2_reg();
for insn in Inst::load_constant(tmp, abs_offset).into_iter() {
insn.emit(sink, flags, state);
}
let add = Inst::AluRRR {
alu_op,
rd,
rn: reg,
rm: tmp.to_reg(),
};
add.emit(sink, flags, state);
}
}
&Inst::GetPinnedReg { rd } => { &Inst::GetPinnedReg { rd } => {
let inst = Inst::Mov { let inst = Inst::Mov {
rd, rd,
rm: xreg(PINNED_REG), rm: xreg(PINNED_REG),
}; };
inst.emit(sink, flags); inst.emit(sink, flags, state);
} }
&Inst::SetPinnedReg { rm } => { &Inst::SetPinnedReg { rm } => {
let inst = Inst::Mov { let inst = Inst::Mov {
rd: Writable::from_reg(xreg(PINNED_REG)), rd: Writable::from_reg(xreg(PINNED_REG)),
rm, rm,
}; };
inst.emit(sink, flags); inst.emit(sink, flags, state);
}
&Inst::VirtualSPOffsetAdj { offset } => {
debug!(
"virtual sp offset adjusted by {} -> {}",
offset,
state.virtual_sp_offset + offset
);
state.virtual_sp_offset += offset;
} }
} }
} }


@@ -1313,8 +1313,8 @@ fn test_aarch64_binemit() {
mem: MemArg::FPOffset(32768), mem: MemArg::FPOffset(32768),
srcloc: None, srcloc: None,
}, },
"0F0090D2EF011D8BE10140F9", "100090D2B063308B010240F9",
"movz x15, #32768 ; add x15, x15, fp ; ldr x1, [x15]", "movz x16, #32768 ; add x16, fp, x16, UXTX ; ldr x1, [x16]",
)); ));
insns.push(( insns.push((
Inst::ULoad64 { Inst::ULoad64 {
@@ -1322,8 +1322,8 @@ fn test_aarch64_binemit() {
mem: MemArg::FPOffset(-32768), mem: MemArg::FPOffset(-32768),
srcloc: None, srcloc: None,
}, },
"EFFF8F92EF011D8BE10140F9", "F0FF8F92B063308B010240F9",
"movn x15, #32767 ; add x15, x15, fp ; ldr x1, [x15]", "movn x16, #32767 ; add x16, fp, x16, UXTX ; ldr x1, [x16]",
)); ));
insns.push(( insns.push((
Inst::ULoad64 { Inst::ULoad64 {
@@ -1331,8 +1331,8 @@ fn test_aarch64_binemit() {
mem: MemArg::FPOffset(1048576), // 2^20 mem: MemArg::FPOffset(1048576), // 2^20
srcloc: None, srcloc: None,
}, },
"0F02A0D2EF011D8BE10140F9", "1002A0D2B063308B010240F9",
"movz x15, #16, LSL #16 ; add x15, x15, fp ; ldr x1, [x15]", "movz x16, #16, LSL #16 ; add x16, fp, x16, UXTX ; ldr x1, [x16]",
)); ));
insns.push(( insns.push((
Inst::ULoad64 { Inst::ULoad64 {
@@ -1340,8 +1340,8 @@ fn test_aarch64_binemit() {
mem: MemArg::FPOffset(1048576 + 1), // 2^20 + 1 mem: MemArg::FPOffset(1048576 + 1), // 2^20 + 1
srcloc: None, srcloc: None,
}, },
"2F0080D20F02A0F2EF011D8BE10140F9", "300080D21002A0F2B063308B010240F9",
"movz x15, #1 ; movk x15, #16, LSL #16 ; add x15, x15, fp ; ldr x1, [x15]", "movz x16, #1 ; movk x16, #16, LSL #16 ; add x16, fp, x16, UXTX ; ldr x1, [x16]",
)); ));
insns.push(( insns.push((
@@ -2794,7 +2794,7 @@ fn test_aarch64_binemit() {
// Check the encoding is as expected. // Check the encoding is as expected.
let text_size = { let text_size = {
let mut code_sec = MachSectionSize::new(0); let mut code_sec = MachSectionSize::new(0);
insn.emit(&mut code_sec, &flags); insn.emit(&mut code_sec, &flags, &mut Default::default());
code_sec.size() code_sec.size()
}; };
@@ -2802,7 +2802,7 @@ fn test_aarch64_binemit() {
let mut sections = MachSections::new(); let mut sections = MachSections::new();
let code_idx = sections.add_section(0, text_size); let code_idx = sections.add_section(0, text_size);
let code_sec = sections.get_section(code_idx); let code_sec = sections.get_section(code_idx);
insn.emit(code_sec, &flags); insn.emit(code_sec, &flags, &mut Default::default());
sections.emit(&mut sink); sections.emit(&mut sink);
let actual_encoding = &sink.stringify(); let actual_encoding = &sink.stringify();
assert_eq!(expected_encoding, actual_encoding); assert_eq!(expected_encoding, actual_encoding);


@@ -134,6 +134,11 @@ impl SImm9 {
pub fn bits(&self) -> u32 { pub fn bits(&self) -> u32 {
(self.value as u32) & 0x1ff (self.value as u32) & 0x1ff
} }
/// Signed value of immediate.
pub fn value(&self) -> i32 {
self.value as i32
}
} }
/// An unsigned, scaled 12-bit offset. /// An unsigned, scaled 12-bit offset.
@@ -172,6 +177,11 @@ impl UImm12Scaled {
pub fn bits(&self) -> u32 { pub fn bits(&self) -> u32 {
(self.value as u32 / self.scale_ty.bytes()) & 0xfff (self.value as u32 / self.scale_ty.bytes()) & 0xfff
} }
/// Value after scaling.
pub fn value(&self) -> u32 {
self.value as u32 * self.scale_ty.bytes()
}
} }
/// A shifted immediate value in 'imm12' format: supports 12 bits, shifted /// A shifted immediate value in 'imm12' format: supports 12 bits, shifted


@@ -13,7 +13,6 @@ use regalloc::{RealRegUniverse, Reg, RegClass, SpillSlot, VirtualReg, Writable};
use regalloc::{RegUsageCollector, RegUsageMapper, Set}; use regalloc::{RegUsageCollector, RegUsageMapper, Set};
use alloc::vec::Vec; use alloc::vec::Vec;
use core::convert::TryFrom;
use smallvec::{smallvec, SmallVec}; use smallvec::{smallvec, SmallVec};
use std::string::{String, ToString}; use std::string::{String, ToString};
@@ -741,6 +740,12 @@ pub enum Inst {
SetPinnedReg { SetPinnedReg {
rm: Reg, rm: Reg,
}, },
/// Marker, no-op in generated code: SP "virtual offset" is adjusted. This
/// controls how MemArg::NominalSPOffset args are lowered.
VirtualSPOffsetAdj {
offset: i64,
},
} }
fn count_zero_half_words(mut value: u64) -> usize { fn count_zero_half_words(mut value: u64) -> usize {
@@ -876,7 +881,7 @@ fn memarg_regs(memarg: &MemArg, collector: &mut RegUsageCollector) {
&MemArg::FPOffset(..) => { &MemArg::FPOffset(..) => {
collector.add_use(fp_reg()); collector.add_use(fp_reg());
} }
&MemArg::SPOffset(..) => { &MemArg::SPOffset(..) | &MemArg::NominalSPOffset(..) => {
collector.add_use(stack_reg()); collector.add_use(stack_reg());
} }
} }
@@ -1135,6 +1140,7 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
&Inst::SetPinnedReg { rm } => { &Inst::SetPinnedReg { rm } => {
collector.add_use(rm); collector.add_use(rm);
} }
&Inst::VirtualSPOffsetAdj { .. } => {}
} }
} }
@@ -1186,7 +1192,9 @@ fn aarch64_map_regs(inst: &mut Inst, mapper: &RegUsageMapper) {
&mut MemArg::Label(..) => {} &mut MemArg::Label(..) => {}
&mut MemArg::PreIndexed(ref mut r, ..) => map_mod(m, r), &mut MemArg::PreIndexed(ref mut r, ..) => map_mod(m, r),
&mut MemArg::PostIndexed(ref mut r, ..) => map_mod(m, r), &mut MemArg::PostIndexed(ref mut r, ..) => map_mod(m, r),
&mut MemArg::FPOffset(..) | &mut MemArg::SPOffset(..) => {} &mut MemArg::FPOffset(..)
| &mut MemArg::SPOffset(..)
| &mut MemArg::NominalSPOffset(..) => {}
}; };
} }
@@ -1706,6 +1714,7 @@ fn aarch64_map_regs(inst: &mut Inst, mapper: &RegUsageMapper) {
&mut Inst::SetPinnedReg { ref mut rm } => { &mut Inst::SetPinnedReg { ref mut rm } => {
map_use(mapper, rm); map_use(mapper, rm);
} }
&mut Inst::VirtualSPOffsetAdj { .. } => {}
} }
} }
@@ -1904,7 +1913,7 @@ impl MachInst for Inst {
// Pretty-printing of instructions. // Pretty-printing of instructions.
fn mem_finalize_for_show(mem: &MemArg, mb_rru: Option<&RealRegUniverse>) -> (String, MemArg) { fn mem_finalize_for_show(mem: &MemArg, mb_rru: Option<&RealRegUniverse>) -> (String, MemArg) {
let (mem_insts, mem) = mem_finalize(0, mem); let (mem_insts, mem) = mem_finalize(0, mem, &mut Default::default());
let mut mem_str = mem_insts let mut mem_str = mem_insts
.into_iter() .into_iter()
.map(|inst| inst.show_rru(mb_rru)) .map(|inst| inst.show_rru(mb_rru))
@@ -2618,42 +2627,58 @@ impl ShowWithRRU for Inst {
let rd = rd.show_rru(mb_rru); let rd = rd.show_rru(mb_rru);
format!("ldr {}, 8 ; b 12 ; data {:?} + {}", rd, name, offset) format!("ldr {}, 8 ; b 12 ; data {:?} + {}", rd, name, offset)
} }
&Inst::LoadAddr { rd, ref mem } => match *mem { &Inst::LoadAddr { rd, ref mem } => {
MemArg::FPOffset(fp_off) => { // TODO: we really should find a better way to avoid duplication of
let alu_op = if fp_off < 0 { // this logic between `emit()` and `show_rru()` -- a separate 1-to-N
ALUOp::Sub64 // expansion stage (i.e., legalization, but without the slow edit-in-place
} else { // of the existing legalization framework).
ALUOp::Add64 let (mem_insts, mem) = mem_finalize(0, mem, &EmitState::default());
}; let mut ret = String::new();
if let Some(imm12) = Imm12::maybe_from_u64(u64::try_from(fp_off.abs()).unwrap()) for inst in mem_insts.into_iter() {
{ ret.push_str(&inst.show_rru(mb_rru));
let inst = Inst::AluRRImm12 {
alu_op,
rd,
imm12,
rn: fp_reg(),
};
inst.show_rru(mb_rru)
} else {
let mut res = String::new();
let const_insts =
Inst::load_constant(rd, u64::try_from(fp_off.abs()).unwrap());
for inst in const_insts {
res.push_str(&inst.show_rru(mb_rru));
res.push_str("; ");
}
let inst = Inst::AluRRR {
alu_op,
rd,
rn: fp_reg(),
rm: rd.to_reg(),
};
res.push_str(&inst.show_rru(mb_rru));
res
}
} }
_ => unimplemented!("{:?}", mem), let (reg, offset) = match mem {
}, MemArg::Unscaled(r, simm9) => (r, simm9.value()),
MemArg::UnsignedOffset(r, uimm12scaled) => (r, uimm12scaled.value() as i32),
_ => panic!("Unsupported case for LoadAddr: {:?}", mem),
};
let abs_offset = if offset < 0 {
-offset as u64
} else {
offset as u64
};
let alu_op = if offset < 0 {
ALUOp::Sub64
} else {
ALUOp::Add64
};
if offset == 0 {
let mov = Inst::mov(rd, reg);
ret.push_str(&mov.show_rru(mb_rru));
} else if let Some(imm12) = Imm12::maybe_from_u64(abs_offset) {
let add = Inst::AluRRImm12 {
alu_op,
rd,
rn: reg,
imm12,
};
ret.push_str(&add.show_rru(mb_rru));
} else {
let tmp = writable_spilltmp_reg();
for inst in Inst::load_constant(tmp, abs_offset).into_iter() {
ret.push_str(&inst.show_rru(mb_rru));
}
let add = Inst::AluRRR {
alu_op,
rd,
rn: reg,
rm: tmp.to_reg(),
};
ret.push_str(&add.show_rru(mb_rru));
}
ret
}
&Inst::GetPinnedReg { rd } => { &Inst::GetPinnedReg { rd } => {
let rd = rd.show_rru(mb_rru); let rd = rd.show_rru(mb_rru);
format!("get_pinned_reg {}", rd) format!("get_pinned_reg {}", rd)
@@ -2662,6 +2687,7 @@ impl ShowWithRRU for Inst {
let rm = rm.show_rru(mb_rru); let rm = rm.show_rru(mb_rru);
format!("set_pinned_reg {}", rm) format!("set_pinned_reg {}", rm)
} }
&Inst::VirtualSPOffsetAdj { offset } => format!("virtual_sp_offset_adjust {}", offset),
} }
} }
} }


@@ -20,23 +20,21 @@ pub const PINNED_REG: u8 = 21;
const XREG_INDICES: [u8; 31] = [ const XREG_INDICES: [u8; 31] = [
// X0 - X7 // X0 - X7
32, 33, 34, 35, 36, 37, 38, 39, 32, 33, 34, 35, 36, 37, 38, 39,
// X8 - X14 // X8 - X15
40, 41, 42, 43, 44, 45, 46, 40, 41, 42, 43, 44, 45, 46, 47,
// X15
59,
// X16, X17 // X16, X17
47, 48, 58, 59,
// X18 // X18
60, 60,
// X19, X20 // X19, X20
49, 50, 48, 49,
// X21, put aside because it's the pinned register. // X21, put aside because it's the pinned register.
58, 57,
// X22 - X28 // X22 - X28
51, 52, 53, 54, 55, 56, 57, 50, 51, 52, 53, 54, 55, 56,
// X29 // X29 (FP)
61, 61,
// X30 // X30 (LR)
62, 62,
]; ];
@@ -125,14 +123,17 @@ pub fn writable_fp_reg() -> Writable<Reg> {
Writable::from_reg(fp_reg()) Writable::from_reg(fp_reg())
} }
/// Get a reference to the "spill temp" register. This register is used to /// Get a reference to the first temporary, sometimes "spill temporary", register. This register is
/// compute the address of a spill slot when a direct offset addressing mode from /// used to compute the address of a spill slot when a direct offset addressing mode from FP is not
/// FP is not sufficient (+/- 2^11 words). We exclude this register from regalloc /// sufficient (+/- 2^11 words). We exclude this register from regalloc and reserve it for this
/// and reserve it for this purpose for simplicity; otherwise we need a /// purpose for simplicity; otherwise we need a multi-stage analysis where we first determine how
/// multi-stage analysis where we first determine how many spill slots we have, /// many spill slots we have, then perhaps remove the reg from the pool and recompute regalloc.
/// then perhaps remove the reg from the pool and recompute regalloc. ///
/// We use x16 for this (aka IP0 in the AArch64 ABI) because it's a scratch register but is
/// slightly special (used for linker veneers). We're free to use it as long as we don't expect it
/// to live through call instructions.
pub fn spilltmp_reg() -> Reg { pub fn spilltmp_reg() -> Reg {
xreg(15) xreg(16)
} }
/// Get a writable reference to the spilltmp reg. /// Get a writable reference to the spilltmp reg.
@@ -140,6 +141,20 @@ pub fn writable_spilltmp_reg() -> Writable<Reg> {
Writable::from_reg(spilltmp_reg()) Writable::from_reg(spilltmp_reg())
} }
/// Get a reference to the second temp register. We need this in some edge cases
/// where we need both the spilltmp and another temporary.
///
/// We use x17 (aka IP1), the other "interprocedural"/linker-veneer scratch reg that is
/// free to use otherwise.
pub fn tmp2_reg() -> Reg {
xreg(17)
}
/// Get a writable reference to the tmp2 reg.
pub fn writable_tmp2_reg() -> Writable<Reg> {
Writable::from_reg(tmp2_reg())
}
/// Create the register universe for AArch64. /// Create the register universe for AArch64.
pub fn create_reg_universe(flags: &settings::Flags) -> RealRegUniverse { pub fn create_reg_universe(flags: &settings::Flags) -> RealRegUniverse {
let mut regs = vec![]; let mut regs = vec![];
@@ -173,7 +188,7 @@ pub fn create_reg_universe(flags: &settings::Flags) -> RealRegUniverse {
for i in 0u8..32u8 { for i in 0u8..32u8 {
// See above for excluded registers. // See above for excluded registers.
if i == 15 || i == 18 || i == 29 || i == 30 || i == 31 || i == PINNED_REG { if i == 16 || i == 17 || i == 18 || i == 29 || i == 30 || i == 31 || i == PINNED_REG {
continue; continue;
} }
let reg = Reg::new_real( let reg = Reg::new_real(
@@ -211,7 +226,8 @@ pub fn create_reg_universe(flags: &settings::Flags) -> RealRegUniverse {
regs.len() regs.len()
}; };
regs.push((xreg(15).to_real_reg(), "x15".to_string())); regs.push((xreg(16).to_real_reg(), "x16".to_string()));
regs.push((xreg(17).to_real_reg(), "x17".to_string()));
regs.push((xreg(18).to_real_reg(), "x18".to_string())); regs.push((xreg(18).to_real_reg(), "x18".to_string()));
regs.push((fp_reg().to_real_reg(), "fp".to_string())); regs.push((fp_reg().to_real_reg(), "fp".to_string()));
regs.push((link_reg().to_real_reg(), "lr".to_string())); regs.push((link_reg().to_real_reg(), "lr".to_string()));


@@ -1291,7 +1291,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRIns
assert!(inputs.len() == abi.num_args()); assert!(inputs.len() == abi.num_args());
for (i, input) in inputs.iter().enumerate() { for (i, input) in inputs.iter().enumerate() {
let arg_reg = input_to_reg(ctx, *input, NarrowValueMode::None); let arg_reg = input_to_reg(ctx, *input, NarrowValueMode::None);
for inst in abi.gen_copy_reg_to_arg(ctx, i, arg_reg) { for inst in abi.gen_copy_reg_to_arg(i, arg_reg) {
ctx.emit(inst); ctx.emit(inst);
} }
} }


@@ -2183,7 +2183,7 @@ fn test_x64_emit() {
// Check the encoding is as expected. // Check the encoding is as expected.
let text_size = { let text_size = {
let mut code_sec = MachSectionSize::new(0); let mut code_sec = MachSectionSize::new(0);
insn.emit(&mut code_sec, &flags); insn.emit(&mut code_sec, &flags, &mut Default::default());
code_sec.size() code_sec.size()
}; };
@@ -2191,7 +2191,7 @@ fn test_x64_emit() {
let mut sections = MachSections::new(); let mut sections = MachSections::new();
let code_idx = sections.add_section(0, text_size); let code_idx = sections.add_section(0, text_size);
let code_sec = sections.get_section(code_idx); let code_sec = sections.get_section(code_idx);
insn.emit(code_sec, &flags); insn.emit(code_sec, &flags, &mut Default::default());
sections.emit(&mut sink); sections.emit(&mut sink);
let actual_encoding = &sink.stringify(); let actual_encoding = &sink.stringify();
assert_eq!(expected_encoding, actual_encoding); assert_eq!(expected_encoding, actual_encoding);


@@ -950,7 +950,9 @@ impl MachInst for Inst {
} }
impl<O: MachSectionOutput> MachInstEmit<O> for Inst { impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
fn emit(&self, sink: &mut O, _flags: &settings::Flags) { type State = ();
fn emit(&self, sink: &mut O, _flags: &settings::Flags, _: &mut Self::State) {
emit::emit(self, sink); emit::emit(self, sink);
} }
} }


@@ -98,7 +98,10 @@ pub trait ABIBody {
fn gen_epilogue(&self) -> Vec<Self::I>; fn gen_epilogue(&self) -> Vec<Self::I>;
/// Returns the full frame size for the given function, after prologue emission has run. This /// Returns the full frame size for the given function, after prologue emission has run. This
/// comprises the spill space, incoming argument space, alignment padding, etc. /// comprises the spill slots and stack-storage slots (but not storage for clobbered callee-save
/// registers, arguments pushed at callsites within this function, or other ephemeral pushes).
/// This is used for ABI variants where the client generates prologue/epilogue code, as in
/// Baldrdash (SpiderMonkey integration).
fn frame_size(&self) -> u32; fn frame_size(&self) -> u32;
/// Get the spill-slot size. /// Get the spill-slot size.
@@ -133,12 +136,7 @@ pub trait ABICall {
fn num_args(&self) -> usize; fn num_args(&self) -> usize;
/// Copy an argument value from a source register, prior to the call. /// Copy an argument value from a source register, prior to the call.
fn gen_copy_reg_to_arg<C: LowerCtx<I = Self::I>>( fn gen_copy_reg_to_arg(&self, idx: usize, from_reg: Reg) -> Vec<Self::I>;
&self,
ctx: &mut C,
idx: usize,
from_reg: Reg,
) -> Vec<Self::I>;
/// Copy a return value into a destination register, after the call returns. /// Copy a return value into a destination register, after the call returns.
fn gen_copy_retval_to_reg(&self, idx: usize, into_reg: Writable<Reg>) -> Self::I; fn gen_copy_retval_to_reg(&self, idx: usize, into_reg: Writable<Reg>) -> Self::I;


@@ -214,8 +214,10 @@ pub enum MachTerminator<'a> {
/// A trait describing the ability to encode a MachInst into binary machine code. /// A trait describing the ability to encode a MachInst into binary machine code.
pub trait MachInstEmit<O: MachSectionOutput> { pub trait MachInstEmit<O: MachSectionOutput> {
/// Persistent state carried across `emit` invocations.
type State: Default + Clone + Debug;
/// Emit the instruction. /// Emit the instruction.
fn emit(&self, code: &mut O, flags: &Flags); fn emit(&self, code: &mut O, flags: &Flags, state: &mut Self::State);
} }
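
The trait change is the familiar pattern of threading mutable state through emission: each backend picks its own `State` type (aarch64's `EmitState` carries `virtual_sp_offset`; the x64 backend simply uses `()`). A self-contained sketch of the pattern with toy types, not the real `MachInstEmit`:

```rust
use std::fmt::Debug;

/// Toy analogue of the revised trait: emission threads per-function state
/// through instructions in linear (straight-line) emission order.
trait EmitWithState {
    type State: Default + Clone + Debug;
    fn emit(&self, out: &mut Vec<String>, state: &mut Self::State);
}

/// A backend that needs state: track a running SP fixup, aarch64-style.
enum ToyInst {
    VirtualSPOffsetAdj(i64),
    LoadNominalSP { off: i64 },
}

#[derive(Default, Clone, Debug)]
struct ToyState {
    virtual_sp_offset: i64,
}

impl EmitWithState for ToyInst {
    type State = ToyState;
    fn emit(&self, out: &mut Vec<String>, state: &mut ToyState) {
        match self {
            // The meta-instruction emits nothing; it only updates the state.
            ToyInst::VirtualSPOffsetAdj(delta) => state.virtual_sp_offset += delta,
            // A nominal-SP reference folds the running fixup into its offset.
            ToyInst::LoadNominalSP { off } => {
                out.push(format!("ldr x0, [sp, #{}]", off + state.virtual_sp_offset))
            }
        }
    }
}

fn main() {
    let insts = [
        ToyInst::LoadNominalSP { off: 8 },
        ToyInst::VirtualSPOffsetAdj(16),
        ToyInst::LoadNominalSP { off: 8 },
    ];
    let (mut out, mut state) = (Vec::new(), ToyState::default());
    for inst in &insts {
        inst.emit(&mut out, &mut state);
    }
    assert_eq!(out, ["ldr x0, [sp, #8]", "ldr x0, [sp, #24]"]);
    // A stateless backend (like x64 here) would just set `type State = ();`.
}
```
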
/// The result of a `MachBackend::compile_function()` call. Contains machine /// The result of a `MachBackend::compile_function()` call. Contains machine


@@ -526,12 +526,13 @@ impl<I: VCodeInst> VCode<I> {
// Compute block offsets. // Compute block offsets.
let mut code_section = MachSectionSize::new(0); let mut code_section = MachSectionSize::new(0);
let mut block_offsets = vec![0; self.num_blocks()]; let mut block_offsets = vec![0; self.num_blocks()];
let mut state = Default::default();
for &block in &self.final_block_order { for &block in &self.final_block_order {
code_section.offset = I::align_basic_block(code_section.offset); code_section.offset = I::align_basic_block(code_section.offset);
block_offsets[block as usize] = code_section.offset; block_offsets[block as usize] = code_section.offset;
let (start, end) = self.block_ranges[block as usize]; let (start, end) = self.block_ranges[block as usize];
for iix in start..end { for iix in start..end {
self.insts[iix as usize].emit(&mut code_section, flags); self.insts[iix as usize].emit(&mut code_section, flags, &mut state);
} }
} }
@@ -544,13 +545,14 @@ impl<I: VCodeInst> VCode<I> {
// it (so forward references are now possible), and (ii) mutates the // it (so forward references are now possible), and (ii) mutates the
// instructions. // instructions.
let mut code_section = MachSectionSize::new(0); let mut code_section = MachSectionSize::new(0);
let mut state = Default::default();
for &block in &self.final_block_order { for &block in &self.final_block_order {
code_section.offset = I::align_basic_block(code_section.offset); code_section.offset = I::align_basic_block(code_section.offset);
let (start, end) = self.block_ranges[block as usize]; let (start, end) = self.block_ranges[block as usize];
for iix in start..end { for iix in start..end {
self.insts[iix as usize] self.insts[iix as usize]
.with_block_offsets(code_section.offset, &self.final_block_offsets[..]); .with_block_offsets(code_section.offset, &self.final_block_offsets[..]);
self.insts[iix as usize].emit(&mut code_section, flags); self.insts[iix as usize].emit(&mut code_section, flags, &mut state);
} }
} }
} }
@@ -563,6 +565,7 @@ impl<I: VCodeInst> VCode<I> {
let mut sections = MachSections::new(); let mut sections = MachSections::new();
let code_idx = sections.add_section(0, self.code_size); let code_idx = sections.add_section(0, self.code_size);
let code_section = sections.get_section(code_idx); let code_section = sections.get_section(code_idx);
let mut state = Default::default();
let flags = self.abi.flags(); let flags = self.abi.flags();
let mut cur_srcloc = None; let mut cur_srcloc = None;
@@ -571,7 +574,7 @@ impl<I: VCodeInst> VCode<I> {
                 while new_offset > code_section.cur_offset_from_start() {
                     // Pad with NOPs up to the aligned block offset.
                     let nop = I::gen_nop((new_offset - code_section.cur_offset_from_start()) as usize);
-                    nop.emit(code_section, flags);
+                    nop.emit(code_section, flags, &mut Default::default());
                 }
                 assert_eq!(code_section.cur_offset_from_start(), new_offset);
@@ -586,7 +589,7 @@ impl<I: VCodeInst> VCode<I> {
                     cur_srcloc = Some(srcloc);
                 }
-                self.insts[iix as usize].emit(code_section, flags);
+                self.insts[iix as usize].emit(code_section, flags, &mut state);
             }
             if cur_srcloc.is_some() {
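Each of the walks over the instruction stream above (sizing, branch-offset resolution, final emission) starts from a fresh `Default::default()` state, so every pass observes the same sequence of state transitions and therefore agrees on any state-dependent encoding. A minimal standalone sketch of that pattern follows, with invented names (`Step`, `EmitState`, `encoded_size`) rather than the real `VCode` machinery.

```rust
#[derive(Default, Clone, Debug)]
struct EmitState {
    /// Bytes the real SP currently sits below the nominal SP.
    sp_adjust: i64,
}

enum Step {
    AdjustSpDown(i64),
    LoadNominalSp(i64),
}

/// Size of one step's encoding; it can depend on state accumulated so far
/// (a large real offset might need a longer instruction sequence).
fn encoded_size(step: &Step, state: &mut EmitState) -> u32 {
    match step {
        Step::AdjustSpDown(bytes) => {
            state.sp_adjust += bytes;
            4
        }
        Step::LoadNominalSp(offset) => {
            if offset + state.sp_adjust <= 0xfff {
                4
            } else {
                12 // e.g. materialize the large offset first
            }
        }
    }
}

/// Mirrors a sizing pass: fresh state, then walk every instruction.
fn total_size(steps: &[Step]) -> u32 {
    let mut state = EmitState::default();
    steps.iter().map(|s| encoded_size(s, &mut state)).sum()
}
```

Running `total_size` twice over the same `steps` gives the same answer precisely because the state is reset per pass; the emission passes above rely on the same property to keep measured and emitted sizes in agreement.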

View File

@@ -11,8 +11,8 @@ block0(v0: i64):
 ; check: stp fp, lr, [sp, #-16]!
 ; nextln: mov fp, sp
-; nextln: ldr x15, 8 ; b 12 ; data
-; nextln: blr x15
+; nextln: ldr x16, 8 ; b 12 ; data
+; nextln: blr x16
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret

View File

@@ -45,8 +45,8 @@ block0(v0: i64):
 ; nextln: subs xzr, sp, x0
 ; nextln: b.hs 8
 ; nextln: udf
-; nextln: ldr x15
-; nextln: blr x15
+; nextln: ldr x16
+; nextln: blr x16
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
@@ -64,13 +64,13 @@ block0(v0: i64):
 ; check: stp fp, lr, [sp, #-16]!
 ; nextln: mov fp, sp
-; nextln: ldr x15, [x0]
-; nextln: ldr x15, [x15, #4]
-; nextln: subs xzr, sp, x15
+; nextln: ldr x16, [x0]
+; nextln: ldr x16, [x16, #4]
+; nextln: subs xzr, sp, x16
 ; nextln: b.hs 8
 ; nextln: udf
-; nextln: ldr x15
-; nextln: blr x15
+; nextln: ldr x16
+; nextln: blr x16
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
@@ -84,8 +84,8 @@ block0(v0: i64):
 ; check: stp fp, lr, [sp, #-16]!
 ; nextln: mov fp, sp
-; nextln: add x15, x0, #176
-; nextln: subs xzr, sp, x15
+; nextln: add x16, x0, #176
+; nextln: subs xzr, sp, x16
 ; nextln: b.hs 8
 ; nextln: udf
 ; nextln: sub sp, sp, #176
@@ -104,14 +104,14 @@ block0(v0: i64):
 ; nextln: subs xzr, sp, x0
 ; nextln: b.hs 8
 ; nextln: udf
-; nextln: movz x16, #6784
-; nextln: movk x16, #6, LSL #16
-; nextln: add x15, x0, x16, UXTX
-; nextln: subs xzr, sp, x15
+; nextln: movz x17, #6784
+; nextln: movk x17, #6, LSL #16
+; nextln: add x16, x0, x17, UXTX
+; nextln: subs xzr, sp, x16
 ; nextln: b.hs 8
 ; nextln: udf
-; nextln: ldr x15, 8 ; b 12 ; data 400000
-; nextln: sub sp, sp, x15, UXTX
+; nextln: ldr x16, 8 ; b 12 ; data 400000
+; nextln: sub sp, sp, x16, UXTX
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
@@ -128,10 +128,10 @@ block0(v0: i64):
 ; check: stp fp, lr, [sp, #-16]!
 ; nextln: mov fp, sp
-; nextln: ldr x15, [x0]
-; nextln: ldr x15, [x15, #4]
-; nextln: add x15, x15, #32
-; nextln: subs xzr, sp, x15
+; nextln: ldr x16, [x0]
+; nextln: ldr x16, [x16, #4]
+; nextln: add x16, x16, #32
+; nextln: subs xzr, sp, x16
 ; nextln: b.hs 8
 ; nextln: udf
 ; nextln: sub sp, sp, #32
@@ -151,19 +151,19 @@ block0(v0: i64):
 ; check: stp fp, lr, [sp, #-16]!
 ; nextln: mov fp, sp
-; nextln: ldr x15, [x0]
-; nextln: ldr x15, [x15, #4]
-; nextln: subs xzr, sp, x15
+; nextln: ldr x16, [x0]
+; nextln: ldr x16, [x16, #4]
+; nextln: subs xzr, sp, x16
 ; nextln: b.hs 8
 ; nextln: udf
-; nextln: movz x16, #6784
-; nextln: movk x16, #6, LSL #16
-; nextln: add x15, x15, x16, UXTX
-; nextln: subs xzr, sp, x15
+; nextln: movz x17, #6784
+; nextln: movk x17, #6, LSL #16
+; nextln: add x16, x16, x17, UXTX
+; nextln: subs xzr, sp, x16
 ; nextln: b.hs 8
 ; nextln: udf
-; nextln: ldr x15, 8 ; b 12 ; data 400000
-; nextln: sub sp, sp, x15, UXTX
+; nextln: ldr x16, 8 ; b 12 ; data 400000
+; nextln: sub sp, sp, x16, UXTX
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
@@ -179,11 +179,11 @@ block0(v0: i64):
 ; check: stp fp, lr, [sp, #-16]!
 ; nextln: mov fp, sp
-; nextln: movz x15, #6784
-; nextln: movk x15, #6, LSL #16
-; nextln: ldr x15, [x0, x15]
-; nextln: add x15, x15, #32
-; nextln: subs xzr, sp, x15
+; nextln: movz x16, #6784
+; nextln: movk x16, #6, LSL #16
+; nextln: ldr x16, [x0, x16]
+; nextln: add x16, x16, #32
+; nextln: subs xzr, sp, x16
 ; nextln: b.hs 8
 ; nextln: udf
 ; nextln: sub sp, sp, #32
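The updated expectations above all revolve around one comparison pattern: `subs xzr, sp, xN; b.hs 8; udf`, which falls through only if SP is at or above the bound in `xN` and otherwise traps. Depending on the test, the bound is the stack limit itself, the limit plus the frame size (the `add x16, ..., #176` and extended-register `add` forms), or both in sequence. A rough model in plain Rust, purely illustrative (the names and the `Result` shape are not anything the backend emits):

```rust
/// Models `subs xzr, sp, xN; b.hs 8; udf`: continue only if sp >= bound.
fn check_or_trap(sp: u64, bound: u64) -> Result<(), &'static str> {
    if sp >= bound {
        Ok(())
    } else {
        Err("stack overflow trap (udf)")
    }
}

/// Models a prologue that first checks the raw limit, then the limit plus
/// the frame it is about to allocate (some cases above emit only one of the
/// two comparisons).
fn prologue_checks(sp: u64, stack_limit: u64, frame_size: u64) -> Result<(), &'static str> {
    check_or_trap(sp, stack_limit)?;
    check_or_trap(sp, stack_limit.wrapping_add(frame_size))
}
```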

View File

@@ -12,7 +12,7 @@ block0:
 ; check: stp fp, lr, [sp, #-16]!
 ; nextln: mov fp, sp
 ; nextln: sub sp, sp, #16
-; nextln: sub x0, fp, #8
+; nextln: mov x0, sp
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
@@ -29,9 +29,9 @@ block0:
 ; check: stp fp, lr, [sp, #-16]!
 ; nextln: mov fp, sp
-; nextln: ldr x15, 8 ; b 12 ; data 100016
-; nextln: sub sp, sp, x15, UXTX
-; nextln: movz x0, #34472; movk x0, #1, LSL #16; sub x0, fp, x0
+; nextln: ldr x16, 8 ; b 12 ; data 100016
+; nextln: sub sp, sp, x16, UXTX
+; nextln: mov x0, sp
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
@@ -50,7 +50,7 @@ block0:
 ; check: stp fp, lr, [sp, #-16]!
 ; nextln: mov fp, sp
 ; nextln: sub sp, sp, #16
-; nextln: sub x0, fp, #8
+; nextln: mov x0, sp
 ; nextln: ldur x0, [x0]
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
@@ -68,9 +68,9 @@ block0:
 ; check: stp fp, lr, [sp, #-16]!
 ; nextln: mov fp, sp
-; nextln: ldr x15, 8 ; b 12 ; data 100016
-; nextln: sub sp, sp, x15, UXTX
-; nextln: movz x0, #34472; movk x0, #1, LSL #16; sub x0, fp, x0
+; nextln: ldr x16, 8 ; b 12 ; data 100016
+; nextln: sub sp, sp, x16, UXTX
+; nextln: mov x0, sp
 ; nextln: ldur x0, [x0]
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
@@ -88,7 +88,7 @@ block0(v0: i64):
 ; check: stp fp, lr, [sp, #-16]!
 ; nextln: mov fp, sp
 ; nextln: sub sp, sp, #16
-; nextln: sub x1, fp, #8
+; nextln: mov x1, sp
 ; nextln: stur x0, [x1]
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
@@ -106,9 +106,9 @@ block0(v0: i64):
 ; check: stp fp, lr, [sp, #-16]!
 ; nextln: mov fp, sp
-; nextln: ldr x15, 8 ; b 12 ; data 100016
-; nextln: sub sp, sp, x15, UXTX
-; nextln: movz x1, #34472; movk x1, #1, LSL #16; sub x1, fp, x1
+; nextln: ldr x16, 8 ; b 12 ; data 100016
+; nextln: sub sp, sp, x16, UXTX
+; nextln: mov x1, sp
 ; nextln: stur x0, [x1]
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
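These `stack_addr` expectations change because stack slots are now reached at non-negative offsets from the nominal SP rather than negative offsets from FP: the first slot is simply `sp + 0` (`mov x0, sp`), even for the ~100 KB frame that previously needed a `movz`/`movk`/`sub` sequence to form `fp - offset`. A sketch of the arithmetic, with illustrative names only:

```rust
/// New scheme: slots grow upward from the nominal SP, so the address is a
/// small non-negative offset that fits simple addressing forms.
fn slot_addr_from_nominal_sp(nominal_sp: u64, slot_offset: u64) -> u64 {
    nominal_sp + slot_offset
}

/// Old scheme (for contrast): a negative offset from FP, which for a large
/// frame meant materializing a multi-instruction constant first.
fn slot_addr_from_fp(fp: u64, offset_below_fp: u64) -> u64 {
    fp - offset_below_fp
}
```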