diff --git a/cranelift/codegen/src/isa/aarch64/abi.rs b/cranelift/codegen/src/isa/aarch64/abi.rs index a2d2552d86..c71096d485 100644 --- a/cranelift/codegen/src/isa/aarch64/abi.rs +++ b/cranelift/codegen/src/isa/aarch64/abi.rs @@ -1,4 +1,63 @@ //! Implementation of the standard AArch64 ABI. +//! +//! We implement the standard AArch64 ABI, as documented by ARM. This ABI +//! specifies how arguments are passed (in registers or on the stack, as +//! appropriate), which registers are caller- and callee-saved, and how a +//! particular part of the stack frame (the FP/LR pair) must be linked through +//! the active stack frames. +//! +//! Note, however, that the exact stack layout is up to us. We settled on the +//! below design based on several requirements. In particular, we need to be +//! able to generate instructions (or instruction sequences) to access +//! arguments, stack slots, and spill slots before we know how many spill slots +//! or clobber-saves there will be, because of our pass structure. We also +//! prefer positive offsets to negative offsets because of an asymmetry in +//! AArch64 addressing modes (positive offsets have a larger possible range +//! without a long-form sequence to synthesize an arbitrary offset). Finally, it +//! is not allowed to access memory below the current SP value. +//! +//! As a result, we keep the FP/LR pair just below stack args so that we can +//! access these args at known offsets from FP, and we access on-stack storage +//! using positive offsets from SP. In order to allow codegen for the latter +//! before knowing how many clobber-saves we have, and also allow it while SP is +//! being adjusted to set up a call, we implement a "nominal SP" tracking +//! feature by which a fixup (distance between actual SP and a "nominal" SP) is +//! known at each instruction. See the documentation for +//! [MemArg::NominalSPOffset] for more on this. +//! +//! The stack looks like: +//! +//! ```plain +//! (high address) +//! +//! +---------------------------+ +//! | ... | +//! | stack args | +//! | (accessed via FP) | +//! +---------------------------+ +//! SP at function entry -----> | LR (pushed by prologue) | +//! +---------------------------+ +//! FP after prologue --------> | FP (pushed by prologue) | +//! +---------------------------+ +//! | ... | +//! | spill slots | +//! | (accessed via nominal-SP) | +//! | ... | +//! | stack slots | +//! | (accessed via nominal-SP) | +//! nominal SP ---------------> | (alloc'd by prologue) | +//! +---------------------------+ +//! | ... | +//! | clobbered callee-saves | +//! SP at end of prologue ----> | (pushed by prologue) | +//! +---------------------------+ +//! | ... | +//! | args for call | +//! SP before making a call --> | (pushed at callsite) | +//! +---------------------------+ +//! +//! (low address) +//! ``` use crate::ir; use crate::ir::types; @@ -13,7 +72,7 @@ use alloc::vec::Vec; use regalloc::{RealReg, Reg, RegClass, Set, SpillSlot, Writable}; -use log::debug; +use log::{debug, trace}; /// A location for an argument or return value. #[derive(Clone, Copy, Debug)] @@ -188,7 +247,7 @@ pub struct AArch64ABIBody { /// Total number of spillslots, from regalloc. spillslots: Option<u32>, /// Total frame size. - frame_size: Option<u32>, + total_frame_size: Option<u32>, /// Calling convention this function expects. call_conv: isa::CallConv, /// The settings controlling this function's compilation.
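To make the nominal-SP scheme described in the module doc concrete, here is a minimal editorial sketch (not part of the patch; the names are illustrative stand-ins for the patch's tracking value) of the fixup arithmetic that converts a nominal-SP-relative offset into a real-SP-relative one:

```rust
/// Minimal sketch of the nominal-SP fixup described in the module doc above.
/// `virtual_sp_offset` stands for the running distance by which the real SP
/// has dropped below nominal SP (clobber saves, call-arg pushes); it is an
/// assumed name mirroring the patch's tracking value, not the exact type.
fn resolve_nominal_sp_offset(nominal_offset: i64, virtual_sp_offset: i64) -> i64 {
    // The slot itself has not moved, but SP has: the real offset grows by
    // exactly the amount SP has dropped.
    nominal_offset + virtual_sp_offset
}

fn main() {
    // After the prologue pushes 16 bytes of clobbers, a slot at
    // nominal-SP+8 is addressed as [sp, #24].
    assert_eq!(resolve_nominal_sp_offset(8, 16), 24);
}
```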
@@ -347,7 +406,7 @@ impl AArch64ABIBody { stackslots_size: stack_offset, clobbered: Set::empty(), spillslots: None, - frame_size: None, + total_frame_size: None, call_conv, flags, is_leaf: f.is_leaf(), @@ -355,9 +414,9 @@ impl AArch64ABIBody { } } - /// Returns the size of a function call frame (including return address and FP) for this - /// function's body. - fn frame_size(&self) -> i64 { + /// Returns the offset from FP to the argument area, i.e., jumping over the saved FP, return + /// address, and maybe other standard elements depending on ABI (e.g. Wasm TLS reg). + fn fp_to_arg_offset(&self) -> i64 { if self.call_conv.extends_baldrdash() { let num_words = self.flags.baldrdash_prologue_words() as i64; debug_assert!(num_words > 0, "baldrdash must set baldrdash_prologue_words"); @@ -383,8 +442,8 @@ impl AArch64ABIBody { /// happening so late in the pipeline (e.g. after register allocation). This /// means that we need to do manual register allocation here and also be /// careful to not clobber any callee-saved or argument registers. For now - /// this routine makes do with the `writable_spilltmp_reg` as one temporary - /// register, and a second register of `x16` which is caller-saved. This + /// this routine makes do with the `spilltmp_reg` as one temporary + /// register, and a second register of `tmp2` which is caller-saved. This /// should be fine for us since no spills should happen in this sequence of /// instructions, so our register won't get accidentally clobbered. /// @@ -413,9 +472,9 @@ impl AArch64ABIBody { // Note though that `stack_limit`'s register may be the same as // `scratch`. If our stack size doesn't fit into an immediate this // means we need a second scratch register for loading the stack size - // into a register. We use `x16` here since it's caller-saved and we're - // in the function prologue and nothing else is allocated to it yet. + // into a register. 
let scratch = writable_spilltmp_reg(); + let scratch2 = writable_tmp2_reg(); let stack_size = u64::from(stack_size); if let Some(imm12) = Imm12::maybe_from_u64(stack_size) { insts.push(Inst::AluRRImm12 { @@ -425,16 +484,12 @@ impl AArch64ABIBody { imm12, }); } else { - let scratch2 = 16; - insts.extend(Inst::load_constant( - Writable::from_reg(xreg(scratch2)), - stack_size.into(), - )); + insts.extend(Inst::load_constant(scratch2, stack_size.into())); insts.push(Inst::AluRRRExtend { alu_op: ALUOp::Add64, rd: scratch, rn: stack_limit, - rm: xreg(scratch2), + rm: scratch2.to_reg(), extendop: ExtendOp::UXTX, }); } @@ -460,8 +515,7 @@ impl AArch64ABIBody { } } -fn load_stack_from_fp(fp_offset: i64, into_reg: Writable<Reg>, ty: Type) -> Inst { - let mem = MemArg::FPOffset(fp_offset); +fn load_stack(mem: MemArg, into_reg: Writable<Reg>, ty: Type) -> Inst { match ty { types::B1 | types::B8 @@ -486,15 +540,11 @@ fn load_stack_from_fp(fp_offset: i64, into_reg: Writable<Reg>, ty: Type) -> Inst mem, srcloc: None, }, - _ => unimplemented!("load_stack_from_fp({})", ty), + _ => unimplemented!("load_stack({})", ty), } } fn store_stack(mem: MemArg, from_reg: Reg, ty: Type) -> Inst { - debug_assert!(match &mem { - MemArg::SPOffset(off) => SImm9::maybe_from_i64(*off).is_some(), - _ => true, - }); match ty { types::B1 | types::B8 @@ -523,50 +573,6 @@ fn store_stack(mem: MemArg, from_reg: Reg, ty: Type) -> Inst { } } -fn store_stack_fp(fp_offset: i64, from_reg: Reg, ty: Type) -> Inst { - store_stack(MemArg::FPOffset(fp_offset), from_reg, ty) -} - -fn store_stack_sp<C: LowerCtx<Inst>>( - ctx: &mut C, - sp_offset: i64, - from_reg: Reg, - ty: Type, -) -> Vec<Inst> { - if SImm9::maybe_from_i64(sp_offset).is_some() { - vec![store_stack(MemArg::SPOffset(sp_offset), from_reg, ty)] - } else { - // mem_finalize will try to generate an add, but in an addition, x31 is the zero register, - // not sp! So we have to synthesize the full add here. - let tmp1 = ctx.tmp(RegClass::I64, I64); - let tmp2 = ctx.tmp(RegClass::I64, I64); - let mut result = Vec::new(); - // tmp1 := sp - result.push(Inst::Mov { - rd: tmp1, - rm: stack_reg(), - }); - // tmp2 := offset - for inst in Inst::load_constant(tmp2, sp_offset as u64) { - result.push(inst); - } - // tmp1 := add tmp1, tmp2 - result.push(Inst::AluRRR { - alu_op: ALUOp::Add64, - rd: tmp1, - rn: tmp1.to_reg(), - rm: tmp2.to_reg(), - }); - // Actual store. - result.push(store_stack( - MemArg::Unscaled(tmp1.to_reg(), SImm9::maybe_from_i64(0).unwrap()), - from_reg, - ty, - )); - result - } -} - fn is_callee_save(call_conv: isa::CallConv, r: RealReg) -> bool { if call_conv.extends_baldrdash() { match r.get_class() { @@ -706,7 +712,11 @@ impl ABIBody for AArch64ABIBody { fn gen_copy_arg_to_reg(&self, idx: usize, into_reg: Writable<Reg>) -> Inst { match &self.sig.args[idx] { &ABIArg::Reg(r, ty) => Inst::gen_move(into_reg, r.to_reg(), ty), - &ABIArg::Stack(off, ty) => load_stack_from_fp(off + self.frame_size(), into_reg, ty), + &ABIArg::Stack(off, ty) => load_stack( + MemArg::FPOffset(self.fp_to_arg_offset() + off), + into_reg, + ty, + ), } } @@ -767,8 +777,8 @@ impl ABIBody for AArch64ABIBody { } _ => {} }; - ret.push(store_stack_fp( - off + self.frame_size(), + ret.push(store_stack( + MemArg::FPOffset(self.fp_to_arg_offset() + off), from_reg.to_reg(), ty, )) @@ -793,6 +803,7 @@ impl ABIBody for AArch64ABIBody { self.clobbered = clobbered; } + /// Load from a stackslot.
fn load_stackslot( &self, slot: StackSlot, @@ -800,47 +811,54 @@ impl ABIBody for AArch64ABIBody { offset: u32, ty: Type, into_reg: Writable<Reg>, ) -> Inst { - // Offset from beginning of stackslot area, which is at FP - stackslots_size. + // Offset from beginning of stackslot area, which is at nominal-SP (see + // [MemArg::NominalSPOffset] for more details on nominal-SP tracking). let stack_off = self.stackslots[slot.as_u32() as usize] as i64; - let fp_off: i64 = -(self.stackslots_size as i64) + stack_off + (offset as i64); - load_stack_from_fp(fp_off, into_reg, ty) + let sp_off: i64 = stack_off + (offset as i64); + trace!("load_stackslot: slot {} -> sp_off {}", slot, sp_off); + load_stack(MemArg::NominalSPOffset(sp_off), into_reg, ty) } + /// Store to a stackslot. fn store_stackslot(&self, slot: StackSlot, offset: u32, ty: Type, from_reg: Reg) -> Inst { - // Offset from beginning of stackslot area, which is at FP - stackslots_size. + // Offset from beginning of stackslot area, which is at nominal-SP (see + // [MemArg::NominalSPOffset] for more details on nominal-SP tracking). let stack_off = self.stackslots[slot.as_u32() as usize] as i64; - let fp_off: i64 = -(self.stackslots_size as i64) + stack_off + (offset as i64); - store_stack_fp(fp_off, from_reg, ty) + let sp_off: i64 = stack_off + (offset as i64); + trace!("store_stackslot: slot {} -> sp_off {}", slot, sp_off); + store_stack(MemArg::NominalSPOffset(sp_off), from_reg, ty) } + /// Produce an instruction that computes a stackslot address. fn stackslot_addr(&self, slot: StackSlot, offset: u32, into_reg: Writable<Reg>) -> Inst { - // Offset from beginning of stackslot area, which is at FP - stackslots_size. + // Offset from beginning of stackslot area, which is at nominal-SP (see + // [MemArg::NominalSPOffset] for more details on nominal-SP tracking). let stack_off = self.stackslots[slot.as_u32() as usize] as i64; - let fp_off: i64 = -(self.stackslots_size as i64) + stack_off + (offset as i64); + let sp_off: i64 = stack_off + (offset as i64); Inst::LoadAddr { rd: into_reg, - mem: MemArg::FPOffset(fp_off), + mem: MemArg::NominalSPOffset(sp_off), } } - // Load from a spillslot. + /// Load from a spillslot. fn load_spillslot(&self, slot: SpillSlot, ty: Type, into_reg: Writable<Reg>) -> Inst { - // Note that when spills/fills are generated, we don't yet know how many - // spillslots there will be, so we allocate *downward* from the beginning - // of the stackslot area. Hence: FP - stackslot_size - 8*spillslot - - // sizeof(ty). + // Offset from beginning of spillslot area, which is at nominal-SP + stackslots_size. let islot = slot.get() as i64; - let ty_size = self.get_spillslot_size(into_reg.to_reg().get_class(), ty) * 8; - let fp_off: i64 = -(self.stackslots_size as i64) - (8 * islot) - ty_size as i64; - load_stack_from_fp(fp_off, into_reg, ty) + let spill_off = islot * 8; + let sp_off = self.stackslots_size as i64 + spill_off; + trace!("load_spillslot: slot {:?} -> sp_off {}", slot, sp_off); + load_stack(MemArg::NominalSPOffset(sp_off), into_reg, ty) } - // Store to a spillslot. + /// Store to a spillslot. fn store_spillslot(&self, slot: SpillSlot, ty: Type, from_reg: Reg) -> Inst { + // Offset from beginning of spillslot area, which is at nominal-SP + stackslots_size.
let islot = slot.get() as i64; - let ty_size = self.get_spillslot_size(from_reg.get_class(), ty) * 8; - let fp_off: i64 = -(self.stackslots_size as i64) - (8 * islot) - ty_size as i64; - store_stack_fp(fp_off, from_reg, ty) + let spill_off = islot * 8; + let sp_off = self.stackslots_size as i64 + spill_off; + trace!("store_spillslot: slot {:?} -> sp_off {}", slot, sp_off); + store_stack(MemArg::NominalSPOffset(sp_off), from_reg, ty) } fn gen_prologue(&mut self) -> Vec<Inst> { @@ -916,9 +934,18 @@ impl ABIBody for AArch64ABIBody { } } + // N.B.: "nominal SP", which we use to refer to stackslots + // and spillslots, is *here* (the value of SP at this program point). + // If we push any clobbers below, we emit a virtual-SP adjustment + // meta-instruction so that the nominal-SP references behave as if SP + // were still at this point. See documentation for + // [crate::isa::aarch64::abi](this module) for more details on + // stackframe layout and nominal-SP maintenance. + // Save clobbered registers. let (clobbered_int, clobbered_vec) = get_callee_saves(self.call_conv, self.clobbered.to_vec()); + let mut clobber_size = 0; for reg_pair in clobbered_int.chunks(2) { let (r1, r2) = if reg_pair.len() == 2 { // .to_reg().to_reg(): Writable<RealReg> --> RealReg --> Reg @@ -939,6 +966,7 @@ impl ABIBody for AArch64ABIBody { SImm7Scaled::maybe_from_i64(-16, types::I64).unwrap(), ), }); + clobber_size += 16; } let vec_save_bytes = clobbered_vec.len() * 16; if vec_save_bytes != 0 { @@ -948,6 +976,7 @@ impl ABIBody for AArch64ABIBody { rn: stack_reg(), imm12: Imm12::maybe_from_u64(vec_save_bytes as u64).unwrap(), }); + clobber_size += vec_save_bytes; } for (i, reg) in clobbered_vec.iter().enumerate() { insts.push(Inst::FpuStore128 { @@ -957,7 +986,13 @@ impl ABIBody for AArch64ABIBody { }); } - self.frame_size = Some(total_stacksize); + if clobber_size > 0 { + insts.push(Inst::VirtualSPOffsetAdj { + offset: clobber_size as i64, + }); + } + + self.total_frame_size = Some(total_stacksize); insts } @@ -1009,6 +1044,12 @@ impl ABIBody for AArch64ABIBody { }); } + // N.B.: we do *not* emit a nominal-SP adjustment here, because (i) there will be no + // references to nominal-SP offsets before the return below, and (ii) the instruction + // emission tracks running SP offset linearly (in straight-line order), not according to + // the CFG, so early returns in the middle of function bodies would cause an incorrect + // offset for the rest of the body. + if !self.call_conv.extends_baldrdash() { // The MOV (alias of ORR) interprets x31 as XZR, so use an ADD here. // MOV to SP is an alias of ADD.
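The stackslot/spillslot accessors and the prologue above combine into one simple invariant; a hedged sketch of it (illustrative names and sizes, not the patch's actual computation):

```rust
/// Hedged sketch of the offset rules used by the accessors above: stack
/// slots start at nominal SP, spill slots sit just above them at 8 bytes
/// per slot, and a prologue VirtualSPOffsetAdj keeps references valid once
/// clobbers are pushed. All names here are illustrative.
fn spillslot_nominal_sp_offset(stackslots_size: i64, spill_slot_index: i64) -> i64 {
    stackslots_size + 8 * spill_slot_index
}

fn main() {
    // With 32 bytes of stack slots, spill slot #2 is at nominal-SP + 48.
    let nominal = spillslot_nominal_sp_offset(32, 2);
    assert_eq!(nominal, 48);
    // After the prologue pushes 16 bytes of clobbers and emits a
    // VirtualSPOffsetAdj of +16, the same slot is addressed at SP + 64.
    assert_eq!(nominal + 16, 64);
}
```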
@@ -1037,7 +1078,7 @@ impl ABIBody for AArch64ABIBody { } fn frame_size(&self) -> u32 { - self.frame_size + self.total_frame_size .expect("frame size not computed before prologue generation") } @@ -1138,20 +1179,32 @@ impl AArch64ABICall { } } -fn adjust_stack(amt: u64, is_sub: bool) -> Vec<Inst> { - if amt > 0 { +fn adjust_stack(amount: u64, is_sub: bool) -> Vec<Inst> { + if amount > 0 { + let sp_adjustment = if is_sub { + amount as i64 + } else { + -(amount as i64) + }; + let adj_meta_insn = Inst::VirtualSPOffsetAdj { + offset: sp_adjustment, + }; + let alu_op = if is_sub { ALUOp::Sub64 } else { ALUOp::Add64 }; - if let Some(imm12) = Imm12::maybe_from_u64(amt) { - vec![Inst::AluRRImm12 { - alu_op, - rd: writable_stack_reg(), - rn: stack_reg(), - imm12, - }] + if let Some(imm12) = Imm12::maybe_from_u64(amount) { + vec![ + adj_meta_insn, + Inst::AluRRImm12 { + alu_op, + rd: writable_stack_reg(), + rn: stack_reg(), + imm12, + }, + ] } else { let const_load = Inst::LoadConst64 { rd: writable_spilltmp_reg(), - const_data: amt, + const_data: amount, }; let adj = Inst::AluRRRExtend { alu_op, @@ -1160,7 +1213,7 @@ fn adjust_stack(amt: u64, is_sub: bool) -> Vec<Inst> { rm: spilltmp_reg(), extendop: ExtendOp::UXTX, }; - vec![const_load, adj] + vec![adj_meta_insn, const_load, adj] } } else { vec![] } } @@ -1182,19 +1235,14 @@ impl ABICall for AArch64ABICall { adjust_stack(self.sig.stack_arg_space as u64, /* is_sub = */ false) } - fn gen_copy_reg_to_arg<C: LowerCtx<Inst>>( - &self, - ctx: &mut C, - idx: usize, - from_reg: Reg, - ) -> Vec<Inst> { + fn gen_copy_reg_to_arg(&self, idx: usize, from_reg: Reg) -> Vec<Inst> { match &self.sig.args[idx] { &ABIArg::Reg(reg, ty) => vec![Inst::gen_move( Writable::from_reg(reg.to_reg()), from_reg, ty, )], - &ABIArg::Stack(off, ty) => store_stack_sp(ctx, off, from_reg, ty), + &ABIArg::Stack(off, ty) => vec![store_stack(MemArg::SPOffset(off), from_reg, ty)], } } diff --git a/cranelift/codegen/src/isa/aarch64/inst/args.rs b/cranelift/codegen/src/isa/aarch64/inst/args.rs index db385cf5c6..8eb3b9b02a 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/args.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/args.rs @@ -112,7 +112,9 @@ pub enum MemLabel { /// A memory argument to load/store, encapsulating the possible addressing modes. #[derive(Clone, Debug)] pub enum MemArg { - Label(MemLabel), + // + // Real ARM64 addressing modes: + // /// "post-indexed" mode as per AArch64 docs: postincrement reg after address computation. PostIndexed(Writable<Reg>, SImm9), /// "pre-indexed" mode as per AArch64 docs: preincrement reg before address computation. @@ -137,11 +139,31 @@ pub enum MemArg { /// Scaled (by size of a type) unsigned 12-bit immediate offset from reg. UnsignedOffset(Reg, UImm12Scaled), - /// Offset from the stack pointer. Lowered into a real amode at emission. + // + // virtual addressing modes that are lowered at emission time: + // + /// Reference to a "label": e.g., a symbol. + Label(MemLabel), + + /// Offset from the stack pointer. SPOffset(i64), - /// Offset from the frame pointer. Lowered into a real amode at emission. + /// Offset from the frame pointer. FPOffset(i64), + + /// Offset from the "nominal stack pointer", which is where the real SP is + /// just after stack and spill slots are allocated in the function prologue. + /// At emission time, this is converted to `SPOffset` with a fixup added to + /// the offset constant. The fixup is a running value that is tracked as + /// emission iterates through instructions in linear order, and can be + /// adjusted up and down with [Inst::VirtualSPOffsetAdj].
+ /// + /// The standard ABI is in charge of handling this (by emitting the + /// adjustment meta-instructions). It maintains the invariant that "nominal + /// SP" is where the actual SP is after the function prologue and before + /// clobber pushes. See the diagram in the documentation for + /// [crate::isa::aarch64::abi](the ABI module) for more details. + NominalSPOffset(i64), } impl MemArg { @@ -443,7 +465,7 @@ impl ShowWithRRU for MemArg { simm9.show_rru(mb_rru) ), // Eliminated by `mem_finalize()`. - &MemArg::SPOffset(..) | &MemArg::FPOffset(..) => { + &MemArg::SPOffset(..) | &MemArg::FPOffset(..) | &MemArg::NominalSPOffset(..) => { panic!("Unexpected stack-offset mem-arg mode!") } } diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs index 2d8613b4b3..da7da92050 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs @@ -10,6 +10,7 @@ use regalloc::{Reg, RegClass, Writable}; use alloc::vec::Vec; use core::convert::TryFrom; +use log::debug; /// Memory label/reference finalization: convert a MemLabel to a PC-relative /// offset, possibly emitting relocation(s) as necessary. @@ -23,33 +24,44 @@ pub fn memlabel_finalize(_insn_off: CodeOffset, label: &MemLabel) -> i32 { /// generic arbitrary stack offset) into real addressing modes, possibly by /// emitting some helper instructions that come immediately before the use /// of this amode. -pub fn mem_finalize(insn_off: CodeOffset, mem: &MemArg) -> (Vec<Inst>, MemArg) { +pub fn mem_finalize(insn_off: CodeOffset, mem: &MemArg, state: &EmitState) -> (Vec<Inst>, MemArg) { match mem { - &MemArg::SPOffset(off) | &MemArg::FPOffset(off) => { + &MemArg::SPOffset(off) | &MemArg::FPOffset(off) | &MemArg::NominalSPOffset(off) => { let basereg = match mem { - &MemArg::SPOffset(..) => stack_reg(), + &MemArg::SPOffset(..) | &MemArg::NominalSPOffset(..) => stack_reg(), &MemArg::FPOffset(..) => fp_reg(), _ => unreachable!(), }; + let adj = match mem { + &MemArg::NominalSPOffset(..) => { + debug!( + "mem_finalize: nominal SP offset {} + adj {} -> {}", + off, + state.virtual_sp_offset, + off + state.virtual_sp_offset + ); + state.virtual_sp_offset + } + _ => 0, + }; + let off = off + adj; + if let Some(simm9) = SImm9::maybe_from_i64(off) { let mem = MemArg::Unscaled(basereg, simm9); (vec![], mem) } else { - // In an addition, x31 is the zero register, not sp; we have only one temporary - // so we can't do the proper add here. - debug_assert_ne!( - basereg, - stack_reg(), - "should have diverted SP before mem_finalize" - ); - let tmp = writable_spilltmp_reg(); let mut const_insts = Inst::load_constant(tmp, off as u64); - let add_inst = Inst::AluRRR { + // N.B.: we must use AluRRRExtend because AluRRR uses the "shifted register" form + // (AluRRRShift) instead, which interprets register 31 as the zero reg, not SP. SP + // is a valid base (for SPOffset) which we must handle here. + // Also, SP needs to be the first arg, not second. + let add_inst = Inst::AluRRRExtend { alu_op: ALUOp::Add64, rd: tmp, - rn: tmp.to_reg(), - rm: basereg, + rn: basereg, + rm: tmp.to_reg(), + extendop: ExtendOp::UXTX, }; const_insts.push(add_inst); (const_insts.to_vec(), MemArg::reg(tmp.to_reg())) @@ -322,8 +334,16 @@ fn enc_fround(top22: u32, rd: Writable<Reg>, rn: Reg) -> u32 { (top22 << 10) | (machreg_to_vec(rn) << 5) | machreg_to_vec(rd.to_reg()) } +/// State carried between emissions of a sequence of instructions.
+#[derive(Default, Clone, Debug)] +pub struct EmitState { + virtual_sp_offset: i64, +} + impl<O: MachSectionOutput> MachInstEmit<O> for Inst { - fn emit(&self, sink: &mut O, flags: &settings::Flags) { + type State = EmitState; + + fn emit(&self, sink: &mut O, flags: &settings::Flags, state: &mut EmitState) { match self { &Inst::AluRRR { alu_op, rd, rn, rm } => { let top11 = match alu_op { @@ -596,10 +616,10 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst { ref mem, srcloc, } => { - let (mem_insts, mem) = mem_finalize(sink.cur_offset_from_start(), mem); + let (mem_insts, mem) = mem_finalize(sink.cur_offset_from_start(), mem, state); for inst in mem_insts.into_iter() { - inst.emit(sink, flags); + inst.emit(sink, flags, state); } // ldst encoding helpers take Reg, not Writable<Reg>. @@ -697,9 +717,9 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst { sink.put4(enc_ldst_simm9(op, simm9, 0b01, reg.to_reg(), rd)); } // Eliminated by `mem_finalize()` above. - &MemArg::SPOffset(..) | &MemArg::FPOffset(..) => { - panic!("Should not see stack-offset here!") - } + &MemArg::SPOffset(..) + | &MemArg::FPOffset(..) + | &MemArg::NominalSPOffset(..) => panic!("Should not see stack-offset here!"), } } @@ -739,10 +759,10 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst { ref mem, srcloc, } => { - let (mem_insts, mem) = mem_finalize(sink.cur_offset_from_start(), mem); + let (mem_insts, mem) = mem_finalize(sink.cur_offset_from_start(), mem, state); for inst in mem_insts.into_iter() { - inst.emit(sink, flags); + inst.emit(sink, flags, state); } let op = match self { @@ -794,9 +814,9 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst { sink.put4(enc_ldst_simm9(op, simm9, 0b01, reg.to_reg(), rd)); } // Eliminated by `mem_finalize()` above. - &MemArg::SPOffset(..) | &MemArg::FPOffset(..) => { - panic!("Should not see stack-offset here!") - } + &MemArg::SPOffset(..) + | &MemArg::FPOffset(..) + | &MemArg::NominalSPOffset(..) => panic!("Should not see stack-offset here!"), } } @@ -980,11 +1000,11 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst { mem: MemArg::Label(MemLabel::PCRel(8)), srcloc: None, }; - inst.emit(sink, flags); + inst.emit(sink, flags, state); let inst = Inst::Jump { dest: BranchTarget::ResolvedOffset(8), }; - inst.emit(sink, flags); + inst.emit(sink, flags, state); sink.put4(const_data.to_bits()); } &Inst::LoadFpuConst64 { rd, const_data } => { @@ -993,11 +1013,11 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst { mem: MemArg::Label(MemLabel::PCRel(8)), srcloc: None, }; - inst.emit(sink, flags); + inst.emit(sink, flags, state); let inst = Inst::Jump { dest: BranchTarget::ResolvedOffset(12), }; - inst.emit(sink, flags); + inst.emit(sink, flags, state); sink.put8(const_data.to_bits()); } &Inst::FpuCSel32 { rd, rn, rm, cond } => { @@ -1084,7 +1104,7 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst { if top22 != 0 { sink.put4(enc_extend(top22, rd, rn)); } else { - Inst::mov32(rd, rn).emit(sink, flags); + Inst::mov32(rd, rn).emit(sink, flags, state); } } &Inst::Extend { @@ -1107,7 +1127,7 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst { rn: zero_reg(), rm: rd.to_reg(), }; - sub_inst.emit(sink, flags); + sub_inst.emit(sink, flags, state); } &Inst::Extend { rd, @@ -1248,13 +1268,13 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst { // Save index in a tmp (the live range of ridx only goes to start of this // sequence; rtmp1 or rtmp2 may overwrite it).
let inst = Inst::gen_move(rtmp2, ridx, I64); - inst.emit(sink, flags); + inst.emit(sink, flags, state); // Load address of jump table let inst = Inst::Adr { rd: rtmp1, label: MemLabel::PCRel(16), }; - inst.emit(sink, flags); + inst.emit(sink, flags, state); // Load value out of jump table let inst = Inst::SLoad32 { rd: rtmp2, @@ -1266,7 +1286,7 @@ impl MachInstEmit for Inst { ), srcloc: None, // can't cause a user trap. }; - inst.emit(sink, flags); + inst.emit(sink, flags, state); // Add base of jump table to jump-table-sourced block offset let inst = Inst::AluRRR { alu_op: ALUOp::Add64, @@ -1274,14 +1294,14 @@ impl MachInstEmit for Inst { rn: rtmp1.to_reg(), rm: rtmp2.to_reg(), }; - inst.emit(sink, flags); + inst.emit(sink, flags, state); // Branch to computed address. (`targets` here is only used for successor queries // and is not needed for emission.) let inst = Inst::IndirectBr { rn: rtmp1.to_reg(), targets: vec![], }; - inst.emit(sink, flags); + inst.emit(sink, flags, state); // Emit jump table (table of 32-bit offsets). for target in targets { let off = target.as_offset_words() * 4; @@ -1297,11 +1317,11 @@ impl MachInstEmit for Inst { mem: MemArg::Label(MemLabel::PCRel(8)), srcloc: None, // can't cause a user trap. }; - inst.emit(sink, flags); + inst.emit(sink, flags, state); let inst = Inst::Jump { dest: BranchTarget::ResolvedOffset(12), }; - inst.emit(sink, flags); + inst.emit(sink, flags, state); sink.put8(const_data); } &Inst::LoadExtName { @@ -1315,11 +1335,11 @@ impl MachInstEmit for Inst { mem: MemArg::Label(MemLabel::PCRel(8)), srcloc: None, // can't cause a user trap. }; - inst.emit(sink, flags); + inst.emit(sink, flags, state); let inst = Inst::Jump { dest: BranchTarget::ResolvedOffset(12), }; - inst.emit(sink, flags); + inst.emit(sink, flags, state); sink.add_reloc(srcloc, Reloc::Abs8, name, offset); if flags.emit_all_ones_funcaddrs() { sink.put8(u64::max_value()); @@ -1327,52 +1347,81 @@ impl MachInstEmit for Inst { sink.put8(0); } } - &Inst::LoadAddr { rd, ref mem } => match *mem { - MemArg::FPOffset(fp_off) => { - let alu_op = if fp_off < 0 { - ALUOp::Sub64 - } else { - ALUOp::Add64 - }; - if let Some(imm12) = Imm12::maybe_from_u64(u64::try_from(fp_off.abs()).unwrap()) - { - let inst = Inst::AluRRImm12 { - alu_op, - rd, - imm12, - rn: fp_reg(), - }; - inst.emit(sink, flags); - } else { - let const_insts = - Inst::load_constant(rd, u64::try_from(fp_off.abs()).unwrap()); - for inst in const_insts { - inst.emit(sink, flags); - } - let inst = Inst::AluRRR { - alu_op, - rd, - rn: fp_reg(), - rm: rd.to_reg(), - }; - inst.emit(sink, flags); - } + &Inst::LoadAddr { rd, ref mem } => { + let (mem_insts, mem) = mem_finalize(sink.cur_offset_from_start(), mem, state); + for inst in mem_insts.into_iter() { + inst.emit(sink, flags, state); } - _ => unimplemented!("{:?}", mem), - }, + + let (reg, offset) = match mem { + MemArg::Unscaled(r, simm9) => (r, simm9.value()), + MemArg::UnsignedOffset(r, uimm12scaled) => (r, uimm12scaled.value() as i32), + _ => panic!("Unsupported case for LoadAddr: {:?}", mem), + }; + let abs_offset = if offset < 0 { + -offset as u64 + } else { + offset as u64 + }; + let alu_op = if offset < 0 { + ALUOp::Sub64 + } else { + ALUOp::Add64 + }; + + if offset == 0 { + let mov = Inst::mov(rd, reg); + mov.emit(sink, flags, state); + } else if let Some(imm12) = Imm12::maybe_from_u64(abs_offset) { + let add = Inst::AluRRImm12 { + alu_op, + rd, + rn: reg, + imm12, + }; + add.emit(sink, flags, state); + } else { + // Use `tmp2` here: `reg` may be `spilltmp` if the 
`MemArg` on this instruction + // was initially an `SPOffset`. Assert that `tmp2` is truly free to use. Note + // that no other instructions will be inserted here (we're emitting directly), + // and a live range of `tmp2` should not span this instruction, so this use + // should otherwise be correct. + debug_assert!(rd.to_reg() != tmp2_reg()); + debug_assert!(reg != tmp2_reg()); + let tmp = writable_tmp2_reg(); + for insn in Inst::load_constant(tmp, abs_offset).into_iter() { + insn.emit(sink, flags, state); + } + let add = Inst::AluRRR { + alu_op, + rd, + rn: reg, + rm: tmp.to_reg(), + }; + add.emit(sink, flags, state); + } + } &Inst::GetPinnedReg { rd } => { let inst = Inst::Mov { rd, rm: xreg(PINNED_REG), }; - inst.emit(sink, flags); + inst.emit(sink, flags, state); } &Inst::SetPinnedReg { rm } => { let inst = Inst::Mov { rd: Writable::from_reg(xreg(PINNED_REG)), rm, }; - inst.emit(sink, flags); + inst.emit(sink, flags, state); + } + &Inst::VirtualSPOffsetAdj { offset } => { + debug!( + "virtual sp offset adjusted by {} -> {}", + offset, + state.virtual_sp_offset + offset + ); + state.virtual_sp_offset += offset; } } } diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs index 9ce622d74c..d9d2fe0fd3 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs @@ -1313,8 +1313,8 @@ fn test_aarch64_binemit() { mem: MemArg::FPOffset(32768), srcloc: None, }, - "0F0090D2EF011D8BE10140F9", - "movz x15, #32768 ; add x15, x15, fp ; ldr x1, [x15]", + "100090D2B063308B010240F9", + "movz x16, #32768 ; add x16, fp, x16, UXTX ; ldr x1, [x16]", )); insns.push(( Inst::ULoad64 { @@ -1322,8 +1322,8 @@ fn test_aarch64_binemit() { mem: MemArg::FPOffset(-32768), srcloc: None, }, - "EFFF8F92EF011D8BE10140F9", - "movn x15, #32767 ; add x15, x15, fp ; ldr x1, [x15]", + "F0FF8F92B063308B010240F9", + "movn x16, #32767 ; add x16, fp, x16, UXTX ; ldr x1, [x16]", )); insns.push(( Inst::ULoad64 { @@ -1331,8 +1331,8 @@ fn test_aarch64_binemit() { mem: MemArg::FPOffset(1048576), // 2^20 srcloc: None, }, - "0F02A0D2EF011D8BE10140F9", - "movz x15, #16, LSL #16 ; add x15, x15, fp ; ldr x1, [x15]", + "1002A0D2B063308B010240F9", + "movz x16, #16, LSL #16 ; add x16, fp, x16, UXTX ; ldr x1, [x16]", )); insns.push(( Inst::ULoad64 { @@ -1340,8 +1340,8 @@ fn test_aarch64_binemit() { mem: MemArg::FPOffset(1048576 + 1), // 2^20 + 1 srcloc: None, }, - "2F0080D20F02A0F2EF011D8BE10140F9", - "movz x15, #1 ; movk x15, #16, LSL #16 ; add x15, x15, fp ; ldr x1, [x15]", + "300080D21002A0F2B063308B010240F9", + "movz x16, #1 ; movk x16, #16, LSL #16 ; add x16, fp, x16, UXTX ; ldr x1, [x16]", )); insns.push(( @@ -2794,7 +2794,7 @@ fn test_aarch64_binemit() { // Check the encoding is as expected. 
let text_size = { let mut code_sec = MachSectionSize::new(0); - insn.emit(&mut code_sec, &flags); + insn.emit(&mut code_sec, &flags, &mut Default::default()); code_sec.size() }; @@ -2802,7 +2802,7 @@ fn test_aarch64_binemit() { let mut sections = MachSections::new(); let code_idx = sections.add_section(0, text_size); let code_sec = sections.get_section(code_idx); - insn.emit(code_sec, &flags); + insn.emit(code_sec, &flags, &mut Default::default()); sections.emit(&mut sink); let actual_encoding = &sink.stringify(); assert_eq!(expected_encoding, actual_encoding); diff --git a/cranelift/codegen/src/isa/aarch64/inst/imms.rs b/cranelift/codegen/src/isa/aarch64/inst/imms.rs index 08bde5c64b..b8e6bf65bf 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/imms.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/imms.rs @@ -134,6 +134,11 @@ impl SImm9 { pub fn bits(&self) -> u32 { (self.value as u32) & 0x1ff } + + /// Signed value of immediate. + pub fn value(&self) -> i32 { + self.value as i32 + } } /// An unsigned, scaled 12-bit offset. @@ -172,6 +177,11 @@ impl UImm12Scaled { pub fn bits(&self) -> u32 { (self.value as u32 / self.scale_ty.bytes()) & 0xfff } + + /// Value after scaling. + pub fn value(&self) -> u32 { + self.value as u32 * self.scale_ty.bytes() + } } /// A shifted immediate value in 'imm12' format: supports 12 bits, shifted diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs index 436c0f4b78..14a9a7b6bf 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs @@ -13,7 +13,6 @@ use regalloc::{RealRegUniverse, Reg, RegClass, SpillSlot, VirtualReg, Writable}; use regalloc::{RegUsageCollector, RegUsageMapper, Set}; use alloc::vec::Vec; -use core::convert::TryFrom; use smallvec::{smallvec, SmallVec}; use std::string::{String, ToString}; @@ -741,6 +740,12 @@ pub enum Inst { SetPinnedReg { rm: Reg, }, + + /// Marker, no-op in generated code: SP "virtual offset" is adjusted. This + /// controls how MemArg::NominalSPOffset args are lowered. + VirtualSPOffsetAdj { + offset: i64, + }, } fn count_zero_half_words(mut value: u64) -> usize { @@ -876,7 +881,7 @@ fn memarg_regs(memarg: &MemArg, collector: &mut RegUsageCollector) { &MemArg::FPOffset(..) => { collector.add_use(fp_reg()); } - &MemArg::SPOffset(..) => { + &MemArg::SPOffset(..) | &MemArg::NominalSPOffset(..) => { collector.add_use(stack_reg()); } } @@ -1135,6 +1140,7 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) { &Inst::SetPinnedReg { rm } => { collector.add_use(rm); } + &Inst::VirtualSPOffsetAdj { .. } => {} } } @@ -1186,7 +1192,9 @@ fn aarch64_map_regs(inst: &mut Inst, mapper: &RegUsageMapper) { &mut MemArg::Label(..) => {} &mut MemArg::PreIndexed(ref mut r, ..) => map_mod(m, r), &mut MemArg::PostIndexed(ref mut r, ..) => map_mod(m, r), - &mut MemArg::FPOffset(..) | &mut MemArg::SPOffset(..) => {} + &mut MemArg::FPOffset(..) + | &mut MemArg::SPOffset(..) + | &mut MemArg::NominalSPOffset(..) => {} }; } @@ -1706,6 +1714,7 @@ fn aarch64_map_regs(inst: &mut Inst, mapper: &RegUsageMapper) { &mut Inst::SetPinnedReg { ref mut rm } => { map_use(mapper, rm); } + &mut Inst::VirtualSPOffsetAdj { .. } => {} } } @@ -1904,7 +1913,7 @@ impl MachInst for Inst { // Pretty-printing of instructions.
fn mem_finalize_for_show(mem: &MemArg, mb_rru: Option<&RealRegUniverse>) -> (String, MemArg) { - let (mem_insts, mem) = mem_finalize(0, mem); + let (mem_insts, mem) = mem_finalize(0, mem, &mut Default::default()); let mut mem_str = mem_insts .into_iter() .map(|inst| inst.show_rru(mb_rru)) @@ -2618,42 +2627,58 @@ impl ShowWithRRU for Inst { let rd = rd.show_rru(mb_rru); format!("ldr {}, 8 ; b 12 ; data {:?} + {}", rd, name, offset) } - &Inst::LoadAddr { rd, ref mem } => match *mem { - MemArg::FPOffset(fp_off) => { - let alu_op = if fp_off < 0 { - ALUOp::Sub64 - } else { - ALUOp::Add64 - }; - if let Some(imm12) = Imm12::maybe_from_u64(u64::try_from(fp_off.abs()).unwrap()) - { - let inst = Inst::AluRRImm12 { - alu_op, - rd, - imm12, - rn: fp_reg(), - }; - inst.show_rru(mb_rru) - } else { - let mut res = String::new(); - let const_insts = - Inst::load_constant(rd, u64::try_from(fp_off.abs()).unwrap()); - for inst in const_insts { - res.push_str(&inst.show_rru(mb_rru)); - res.push_str("; "); - } - let inst = Inst::AluRRR { - alu_op, - rd, - rn: fp_reg(), - rm: rd.to_reg(), - }; - res.push_str(&inst.show_rru(mb_rru)); - res - } + &Inst::LoadAddr { rd, ref mem } => { + // TODO: we really should find a better way to avoid duplication of + // this logic between `emit()` and `show_rru()` -- a separate 1-to-N + // expansion stage (i.e., legalization, but without the slow edit-in-place + // of the existing legalization framework). + let (mem_insts, mem) = mem_finalize(0, mem, &EmitState::default()); + let mut ret = String::new(); + for inst in mem_insts.into_iter() { + ret.push_str(&inst.show_rru(mb_rru)); } - _ => unimplemented!("{:?}", mem), - }, + let (reg, offset) = match mem { + MemArg::Unscaled(r, simm9) => (r, simm9.value()), + MemArg::UnsignedOffset(r, uimm12scaled) => (r, uimm12scaled.value() as i32), + _ => panic!("Unsupported case for LoadAddr: {:?}", mem), + }; + let abs_offset = if offset < 0 { + -offset as u64 + } else { + offset as u64 + }; + let alu_op = if offset < 0 { + ALUOp::Sub64 + } else { + ALUOp::Add64 + }; + + if offset == 0 { + let mov = Inst::mov(rd, reg); + ret.push_str(&mov.show_rru(mb_rru)); + } else if let Some(imm12) = Imm12::maybe_from_u64(abs_offset) { + let add = Inst::AluRRImm12 { + alu_op, + rd, + rn: reg, + imm12, + }; + ret.push_str(&add.show_rru(mb_rru)); + } else { + let tmp = writable_spilltmp_reg(); + for inst in Inst::load_constant(tmp, abs_offset).into_iter() { + ret.push_str(&inst.show_rru(mb_rru)); + } + let add = Inst::AluRRR { + alu_op, + rd, + rn: reg, + rm: tmp.to_reg(), + }; + ret.push_str(&add.show_rru(mb_rru)); + } + ret + } &Inst::GetPinnedReg { rd } => { let rd = rd.show_rru(mb_rru); format!("get_pinned_reg {}", rd) @@ -2662,6 +2687,7 @@ impl ShowWithRRU for Inst { let rm = rm.show_rru(mb_rru); format!("set_pinned_reg {}", rm) } + &Inst::VirtualSPOffsetAdj { offset } => format!("virtual_sp_offset_adjust {}", offset), } } } diff --git a/cranelift/codegen/src/isa/aarch64/inst/regs.rs b/cranelift/codegen/src/isa/aarch64/inst/regs.rs index f4f19cf517..3a10231edf 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/regs.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/regs.rs @@ -20,23 +20,21 @@ pub const PINNED_REG: u8 = 21; const XREG_INDICES: [u8; 31] = [ // X0 - X7 32, 33, 34, 35, 36, 37, 38, 39, - // X8 - X14 - 40, 41, 42, 43, 44, 45, 46, - // X15 - 59, + // X8 - X15 + 40, 41, 42, 43, 44, 45, 46, 47, // X16, X17 - 47, 48, + 58, 59, // X18 60, // X19, X20 - 49, 50, + 48, 49, // X21, put aside because it's the pinned register. 
- 58, + 57, // X22 - X28 - 51, 52, 53, 54, 55, 56, 57, - // X29 + 50, 51, 52, 53, 54, 55, 56, + // X29 (FP) 61, - // X30 + // X30 (LR) 62, ]; @@ -125,14 +123,17 @@ pub fn writable_fp_reg() -> Writable<Reg> { Writable::from_reg(fp_reg()) } -/// Get a reference to the "spill temp" register. This register is used to -/// compute the address of a spill slot when a direct offset addressing mode from -/// FP is not sufficient (+/- 2^11 words). We exclude this register from regalloc -/// and reserve it for this purpose for simplicity; otherwise we need a -/// multi-stage analysis where we first determine how many spill slots we have, -/// then perhaps remove the reg from the pool and recompute regalloc. +/// Get a reference to the first temporary, sometimes "spill temporary", register. This register is +/// used to compute the address of a spill slot when a direct offset addressing mode from FP is not +/// sufficient (+/- 2^11 words). We exclude this register from regalloc and reserve it for this +/// purpose for simplicity; otherwise we need a multi-stage analysis where we first determine how +/// many spill slots we have, then perhaps remove the reg from the pool and recompute regalloc. +/// +/// We use x16 for this (aka IP0 in the AArch64 ABI) because it's a scratch register but is +/// slightly special (used for linker veneers). We're free to use it as long as we don't expect it +/// to live through call instructions. pub fn spilltmp_reg() -> Reg { - xreg(15) + xreg(16) } /// Get a writable reference to the spilltmp reg. @@ -140,6 +141,20 @@ pub fn writable_spilltmp_reg() -> Writable<Reg> { Writable::from_reg(spilltmp_reg()) } +/// Get a reference to the second temp register. We need this in some edge cases +/// where we need both the spilltmp and another temporary. +/// +/// We use x17 (aka IP1), the other "interprocedural"/linker-veneer scratch reg that is +/// free to use otherwise. +pub fn tmp2_reg() -> Reg { + xreg(17) +} + +/// Get a writable reference to the tmp2 reg. +pub fn writable_tmp2_reg() -> Writable<Reg> { + Writable::from_reg(tmp2_reg()) +} + /// Create the register universe for AArch64. pub fn create_reg_universe(flags: &settings::Flags) -> RealRegUniverse { let mut regs = vec![]; @@ -173,7 +188,7 @@ pub fn create_reg_universe(flags: &settings::Flags) -> RealRegUniverse { for i in 0u8..32u8 { // See above for excluded registers.
- if i == 15 || i == 18 || i == 29 || i == 30 || i == 31 || i == PINNED_REG { + if i == 16 || i == 17 || i == 18 || i == 29 || i == 30 || i == 31 || i == PINNED_REG { continue; } let reg = Reg::new_real( @@ -211,7 +226,8 @@ pub fn create_reg_universe(flags: &settings::Flags) -> RealRegUniverse { regs.len() }; - regs.push((xreg(15).to_real_reg(), "x15".to_string())); + regs.push((xreg(16).to_real_reg(), "x16".to_string())); + regs.push((xreg(17).to_real_reg(), "x17".to_string())); regs.push((xreg(18).to_real_reg(), "x18".to_string())); regs.push((fp_reg().to_real_reg(), "fp".to_string())); regs.push((link_reg().to_real_reg(), "lr".to_string())); diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs index f8741212a9..bc2944f2b9 100644 --- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs +++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs @@ -1291,7 +1291,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<Inst>>(ctx: &mut C, insn: IRIns assert!(inputs.len() == abi.num_args()); for (i, input) in inputs.iter().enumerate() { let arg_reg = input_to_reg(ctx, *input, NarrowValueMode::None); - for inst in abi.gen_copy_reg_to_arg(ctx, i, arg_reg) { + for inst in abi.gen_copy_reg_to_arg(i, arg_reg) { ctx.emit(inst); } } diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs index 1a6ab16f69..7c833a47c9 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs @@ -2183,7 +2183,7 @@ fn test_x64_emit() { // Check the encoding is as expected. let text_size = { let mut code_sec = MachSectionSize::new(0); - insn.emit(&mut code_sec, &flags); + insn.emit(&mut code_sec, &flags, &mut Default::default()); code_sec.size() }; @@ -2191,7 +2191,7 @@ fn test_x64_emit() { let mut sections = MachSections::new(); let code_idx = sections.add_section(0, text_size); let code_sec = sections.get_section(code_idx); - insn.emit(code_sec, &flags); + insn.emit(code_sec, &flags, &mut Default::default()); sections.emit(&mut sink); let actual_encoding = &sink.stringify(); assert_eq!(expected_encoding, actual_encoding); diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs index a18dcb31fd..29e75b21fe 100644 --- a/cranelift/codegen/src/isa/x64/inst/mod.rs +++ b/cranelift/codegen/src/isa/x64/inst/mod.rs @@ -950,7 +950,9 @@ impl MachInst for Inst { } impl<O: MachSectionOutput> MachInstEmit<O> for Inst { - fn emit(&self, sink: &mut O, _flags: &settings::Flags) { + type State = (); + + fn emit(&self, sink: &mut O, _flags: &settings::Flags, _: &mut Self::State) { emit::emit(self, sink); } } diff --git a/cranelift/codegen/src/machinst/abi.rs b/cranelift/codegen/src/machinst/abi.rs index 48278c537a..83aa158662 100644 --- a/cranelift/codegen/src/machinst/abi.rs +++ b/cranelift/codegen/src/machinst/abi.rs @@ -98,7 +98,10 @@ pub trait ABIBody { fn gen_epilogue(&self) -> Vec<Self::I>; /// Returns the full frame size for the given function, after prologue emission has run. This - /// comprises the spill space, incoming argument space, alignment padding, etc. + /// comprises the spill slots and stack-storage slots (but not storage for clobbered callee-save + /// registers, arguments pushed at callsites within this function, or other ephemeral pushes). + /// This is used for ABI variants where the client generates prologue/epilogue code, as in + /// Baldrdash (SpiderMonkey integration). fn frame_size(&self) -> u32; /// Get the spill-slot size.
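Given the sharpened `frame_size()` contract above, a worked sketch of what the returned value does and does not include (assumed sizes, ignoring any alignment padding the real prologue may apply):

```rust
/// Hedged sketch of the frame_size() contract documented above: stack-storage
/// slots plus spill slots only; clobber saves and callsite argument pushes
/// are ephemeral and excluded. The 8-bytes-per-spillslot figure matches the
/// aarch64 accessors earlier in this patch; alignment is ignored here.
fn frame_size(stackslots_size: u32, num_spillslots: u32) -> u32 {
    stackslots_size + num_spillslots * 8
}

fn main() {
    // 32 bytes of stack slots + 4 spill slots = 64 bytes, no matter how many
    // callee-saves the prologue later pushes.
    assert_eq!(frame_size(32, 4), 64);
}
```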
@@ -133,12 +136,7 @@ pub trait ABICall { fn num_args(&self) -> usize; /// Copy an argument value from a source register, prior to the call. - fn gen_copy_reg_to_arg<C: LowerCtx<Self::I>>( - &self, - ctx: &mut C, - idx: usize, - from_reg: Reg, - ) -> Vec<Self::I>; + fn gen_copy_reg_to_arg(&self, idx: usize, from_reg: Reg) -> Vec<Self::I>; /// Copy a return value into a destination register, after the call returns. fn gen_copy_retval_to_reg(&self, idx: usize, into_reg: Writable<Reg>) -> Self::I; diff --git a/cranelift/codegen/src/machinst/mod.rs b/cranelift/codegen/src/machinst/mod.rs index ccb62deb7e..697601c672 100644 --- a/cranelift/codegen/src/machinst/mod.rs +++ b/cranelift/codegen/src/machinst/mod.rs @@ -214,8 +214,10 @@ pub enum MachTerminator<'a> { /// A trait describing the ability to encode a MachInst into binary machine code. pub trait MachInstEmit<O: MachSectionOutput> { + /// Persistent state carried across `emit` invocations. + type State: Default + Clone + Debug; /// Emit the instruction. - fn emit(&self, code: &mut O, flags: &Flags); + fn emit(&self, code: &mut O, flags: &Flags, state: &mut Self::State); } /// The result of a `MachBackend::compile_function()` call. Contains machine diff --git a/cranelift/codegen/src/machinst/vcode.rs b/cranelift/codegen/src/machinst/vcode.rs index 836be33941..a4801bfe3e 100644 --- a/cranelift/codegen/src/machinst/vcode.rs +++ b/cranelift/codegen/src/machinst/vcode.rs @@ -526,12 +526,13 @@ impl<I: VCodeInst> VCode<I> { // Compute block offsets. let mut code_section = MachSectionSize::new(0); let mut block_offsets = vec![0; self.num_blocks()]; + let mut state = Default::default(); for &block in &self.final_block_order { code_section.offset = I::align_basic_block(code_section.offset); block_offsets[block as usize] = code_section.offset; let (start, end) = self.block_ranges[block as usize]; for iix in start..end { - self.insts[iix as usize].emit(&mut code_section, flags); + self.insts[iix as usize].emit(&mut code_section, flags, &mut state); } } @@ -544,13 +545,14 @@ impl<I: VCodeInst> VCode<I> { // it (so forward references are now possible), and (ii) mutates the // instructions. let mut code_section = MachSectionSize::new(0); + let mut state = Default::default(); for &block in &self.final_block_order { code_section.offset = I::align_basic_block(code_section.offset); let (start, end) = self.block_ranges[block as usize]; for iix in start..end { self.insts[iix as usize] .with_block_offsets(code_section.offset, &self.final_block_offsets[..]); - self.insts[iix as usize].emit(&mut code_section, flags); + self.insts[iix as usize].emit(&mut code_section, flags, &mut state); } } } @@ -563,6 +565,7 @@ impl<I: VCodeInst> VCode<I> { let mut sections = MachSections::new(); let code_idx = sections.add_section(0, self.code_size); let code_section = sections.get_section(code_idx); + let mut state = Default::default(); let flags = self.abi.flags(); let mut cur_srcloc = None; @@ -571,7 +574,7 @@ impl<I: VCodeInst> VCode<I> { while new_offset > code_section.cur_offset_from_start() { // Pad with NOPs up to the aligned block offset.
let nop = I::gen_nop((new_offset - code_section.cur_offset_from_start()) as usize); - nop.emit(code_section, flags); + nop.emit(code_section, flags, &mut Default::default()); } assert_eq!(code_section.cur_offset_from_start(), new_offset); @@ -586,7 +589,7 @@ impl VCode { cur_srcloc = Some(srcloc); } - self.insts[iix as usize].emit(code_section, flags); + self.insts[iix as usize].emit(code_section, flags, &mut state); } if cur_srcloc.is_some() { diff --git a/cranelift/filetests/filetests/vcode/aarch64/call.clif b/cranelift/filetests/filetests/vcode/aarch64/call.clif index d88a20ceab..4178a4c2f7 100644 --- a/cranelift/filetests/filetests/vcode/aarch64/call.clif +++ b/cranelift/filetests/filetests/vcode/aarch64/call.clif @@ -11,8 +11,8 @@ block0(v0: i64): ; check: stp fp, lr, [sp, #-16]! ; nextln: mov fp, sp -; nextln: ldr x15, 8 ; b 12 ; data -; nextln: blr x15 +; nextln: ldr x16, 8 ; b 12 ; data +; nextln: blr x16 ; nextln: mov sp, fp ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret diff --git a/cranelift/filetests/filetests/vcode/aarch64/stack-limit.clif b/cranelift/filetests/filetests/vcode/aarch64/stack-limit.clif index 13b431867e..c9734e7cdd 100644 --- a/cranelift/filetests/filetests/vcode/aarch64/stack-limit.clif +++ b/cranelift/filetests/filetests/vcode/aarch64/stack-limit.clif @@ -45,8 +45,8 @@ block0(v0: i64): ; nextln: subs xzr, sp, x0 ; nextln: b.hs 8 ; nextln: udf -; nextln: ldr x15 -; nextln: blr x15 +; nextln: ldr x16 +; nextln: blr x16 ; nextln: mov sp, fp ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret @@ -64,13 +64,13 @@ block0(v0: i64): ; check: stp fp, lr, [sp, #-16]! ; nextln: mov fp, sp -; nextln: ldr x15, [x0] -; nextln: ldr x15, [x15, #4] -; nextln: subs xzr, sp, x15 +; nextln: ldr x16, [x0] +; nextln: ldr x16, [x16, #4] +; nextln: subs xzr, sp, x16 ; nextln: b.hs 8 ; nextln: udf -; nextln: ldr x15 -; nextln: blr x15 +; nextln: ldr x16 +; nextln: blr x16 ; nextln: mov sp, fp ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret @@ -84,8 +84,8 @@ block0(v0: i64): ; check: stp fp, lr, [sp, #-16]! ; nextln: mov fp, sp -; nextln: add x15, x0, #176 -; nextln: subs xzr, sp, x15 +; nextln: add x16, x0, #176 +; nextln: subs xzr, sp, x16 ; nextln: b.hs 8 ; nextln: udf ; nextln: sub sp, sp, #176 @@ -104,14 +104,14 @@ block0(v0: i64): ; nextln: subs xzr, sp, x0 ; nextln: b.hs 8 ; nextln: udf -; nextln: movz x16, #6784 -; nextln: movk x16, #6, LSL #16 -; nextln: add x15, x0, x16, UXTX -; nextln: subs xzr, sp, x15 +; nextln: movz x17, #6784 +; nextln: movk x17, #6, LSL #16 +; nextln: add x16, x0, x17, UXTX +; nextln: subs xzr, sp, x16 ; nextln: b.hs 8 ; nextln: udf -; nextln: ldr x15, 8 ; b 12 ; data 400000 -; nextln: sub sp, sp, x15, UXTX +; nextln: ldr x16, 8 ; b 12 ; data 400000 +; nextln: sub sp, sp, x16, UXTX ; nextln: mov sp, fp ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret @@ -128,10 +128,10 @@ block0(v0: i64): ; check: stp fp, lr, [sp, #-16]! ; nextln: mov fp, sp -; nextln: ldr x15, [x0] -; nextln: ldr x15, [x15, #4] -; nextln: add x15, x15, #32 -; nextln: subs xzr, sp, x15 +; nextln: ldr x16, [x0] +; nextln: ldr x16, [x16, #4] +; nextln: add x16, x16, #32 +; nextln: subs xzr, sp, x16 ; nextln: b.hs 8 ; nextln: udf ; nextln: sub sp, sp, #32 @@ -151,19 +151,19 @@ block0(v0: i64): ; check: stp fp, lr, [sp, #-16]! 
; nextln: mov fp, sp -; nextln: ldr x15, [x0] -; nextln: ldr x15, [x15, #4] -; nextln: subs xzr, sp, x15 +; nextln: ldr x16, [x0] +; nextln: ldr x16, [x16, #4] +; nextln: subs xzr, sp, x16 ; nextln: b.hs 8 ; nextln: udf -; nextln: movz x16, #6784 -; nextln: movk x16, #6, LSL #16 -; nextln: add x15, x15, x16, UXTX -; nextln: subs xzr, sp, x15 +; nextln: movz x17, #6784 +; nextln: movk x17, #6, LSL #16 +; nextln: add x16, x16, x17, UXTX +; nextln: subs xzr, sp, x16 ; nextln: b.hs 8 ; nextln: udf -; nextln: ldr x15, 8 ; b 12 ; data 400000 -; nextln: sub sp, sp, x15, UXTX +; nextln: ldr x16, 8 ; b 12 ; data 400000 +; nextln: sub sp, sp, x16, UXTX ; nextln: mov sp, fp ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret @@ -179,11 +179,11 @@ block0(v0: i64): ; check: stp fp, lr, [sp, #-16]! ; nextln: mov fp, sp -; nextln: movz x15, #6784 -; nextln: movk x15, #6, LSL #16 -; nextln: ldr x15, [x0, x15] -; nextln: add x15, x15, #32 -; nextln: subs xzr, sp, x15 +; nextln: movz x16, #6784 +; nextln: movk x16, #6, LSL #16 +; nextln: ldr x16, [x0, x16] +; nextln: add x16, x16, #32 +; nextln: subs xzr, sp, x16 ; nextln: b.hs 8 ; nextln: udf ; nextln: sub sp, sp, #32 diff --git a/cranelift/filetests/filetests/vcode/aarch64/stack.clif b/cranelift/filetests/filetests/vcode/aarch64/stack.clif index 99d60d97ad..47c4b37a0f 100644 --- a/cranelift/filetests/filetests/vcode/aarch64/stack.clif +++ b/cranelift/filetests/filetests/vcode/aarch64/stack.clif @@ -12,7 +12,7 @@ block0: ; check: stp fp, lr, [sp, #-16]! ; nextln: mov fp, sp ; nextln: sub sp, sp, #16 -; nextln: sub x0, fp, #8 +; nextln: mov x0, sp ; nextln: mov sp, fp ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret @@ -29,9 +29,9 @@ block0: ; check: stp fp, lr, [sp, #-16]! ; nextln: mov fp, sp -; nextln: ldr x15, 8 ; b 12 ; data 100016 -; nextln: sub sp, sp, x15, UXTX -; nextln: movz x0, #34472; movk x0, #1, LSL #16; sub x0, fp, x0 +; nextln: ldr x16, 8 ; b 12 ; data 100016 +; nextln: sub sp, sp, x16, UXTX +; nextln: mov x0, sp ; nextln: mov sp, fp ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret @@ -50,7 +50,7 @@ block0: ; check: stp fp, lr, [sp, #-16]! ; nextln: mov fp, sp ; nextln: sub sp, sp, #16 -; nextln: sub x0, fp, #8 +; nextln: mov x0, sp ; nextln: ldur x0, [x0] ; nextln: mov sp, fp ; nextln: ldp fp, lr, [sp], #16 @@ -68,9 +68,9 @@ block0: ; check: stp fp, lr, [sp, #-16]! ; nextln: mov fp, sp -; nextln: ldr x15, 8 ; b 12 ; data 100016 -; nextln: sub sp, sp, x15, UXTX -; nextln: movz x0, #34472; movk x0, #1, LSL #16; sub x0, fp, x0 +; nextln: ldr x16, 8 ; b 12 ; data 100016 +; nextln: sub sp, sp, x16, UXTX +; nextln: mov x0, sp ; nextln: ldur x0, [x0] ; nextln: mov sp, fp ; nextln: ldp fp, lr, [sp], #16 @@ -88,7 +88,7 @@ block0(v0: i64): ; check: stp fp, lr, [sp, #-16]! ; nextln: mov fp, sp ; nextln: sub sp, sp, #16 -; nextln: sub x1, fp, #8 +; nextln: mov x1, sp ; nextln: stur x0, [x1] ; nextln: mov sp, fp ; nextln: ldp fp, lr, [sp], #16 @@ -106,9 +106,9 @@ block0(v0: i64): ; check: stp fp, lr, [sp, #-16]! ; nextln: mov fp, sp -; nextln: ldr x15, 8 ; b 12 ; data 100016 -; nextln: sub sp, sp, x15, UXTX -; nextln: movz x1, #34472; movk x1, #1, LSL #16; sub x1, fp, x1 +; nextln: ldr x16, 8 ; b 12 ; data 100016 +; nextln: sub sp, sp, x16, UXTX +; nextln: mov x1, sp ; nextln: stur x0, [x1] ; nextln: mov sp, fp ; nextln: ldp fp, lr, [sp], #16
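Finally, the `MachInstEmit` plumbing that threads through the tests and filetests above follows one pattern: a default-initialized state value per linear emission pass, mutated only by meta-instructions. A self-contained sketch of that pattern (simplified stand-in trait and types, not the real cranelift definitions):

```rust
/// Hedged sketch of the state-threading pattern introduced by this patch:
/// the emitter owns one state per pass; a meta-instruction such as
/// VirtualSPOffsetAdj mutates it and emits no bytes. Simplified stand-ins,
/// not the actual cranelift `MachInstEmit`/`Inst` types.
trait Emit {
    type State: Default;
    fn emit(&self, out: &mut Vec<u8>, state: &mut Self::State);
}

#[derive(Default)]
struct SpState {
    virtual_sp_offset: i64,
}

enum ToyInst {
    VirtualSpAdj(i64),
    Nop,
}

impl Emit for ToyInst {
    type State = SpState;
    fn emit(&self, out: &mut Vec<u8>, state: &mut Self::State) {
        match self {
            // Meta-instruction: adjust the running offset, emit nothing.
            ToyInst::VirtualSpAdj(off) => state.virtual_sp_offset += *off,
            // Placeholder encoding for a real instruction.
            ToyInst::Nop => out.push(0),
        }
    }
}

fn main() {
    let insts = [
        ToyInst::VirtualSpAdj(32),
        ToyInst::Nop,
        ToyInst::VirtualSpAdj(-32),
    ];
    let mut out = Vec::new();
    let mut state = SpState::default(); // fresh state per pass, as in vcode.rs
    for inst in &insts {
        inst.emit(&mut out, &mut state);
    }
    assert_eq!(state.virtual_sp_offset, 0);
    assert_eq!(out.len(), 1);
}
```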