Rework aarch64 stack frame implementation.

This PR changes the aarch64 ABI implementation to use positive offsets
from SP, rather than negative offsets from FP, to refer to spill slots
and stack-local storage. This allows better addressing-mode options, and
hence slightly better code: e.g., for an 8-byte access, the unsigned
scaled 12-bit offset mode can reach anywhere in a 32KB frame without
extra address-construction instructions, whereas negative offsets are
limited to the signed 9-bit unscaled mode, which reaches at most 256
bytes below the base register.
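
To make the reach difference concrete, here is a small sketch using the backend's immediate types (`SImm9::maybe_from_i64` appears in the diff below; `UImm12Scaled::maybe_from_i64` and the import paths are assumptions about crate-internal API):

```rust
use crate::ir::types::I64;
use crate::isa::aarch64::inst::imms::{SImm9, UImm12Scaled};

fn amode_reach_demo() {
    // 4 KiB above SP: the scaled unsigned 12-bit mode covers
    // [0, 4095 * 8] bytes for an 8-byte access, so this is a single
    // `ldr x0, [sp, #4096]` with no extra address construction.
    assert!(UImm12Scaled::maybe_from_i64(4096, I64).is_some());
    // 4 KiB below FP: the signed 9-bit unscaled mode covers only
    // [-256, 255] bytes, so helper instructions are required.
    assert!(SImm9::maybe_from_i64(-4096).is_none());
    assert!(SImm9::maybe_from_i64(-256).is_some());
}
```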

To enable this, the PR introduces a notion of "nominal SP offsets" as a
virtual addressing mode, lowered during the emission pass. The offsets
are relative to "SP after adjusting downward to allocate stack/spill
slots", but before pushing clobbers. This allows the addressing-mode
expressions to be generated before register allocation (or during it,
for spill/reload sequences).
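
For example (a sketch using the `MemArg` variant added in this PR), a spill slot that ends up 24 bytes past the allocated stack/spill area can be referenced before register allocation, without knowing what the prologue will later push:

```rust
// Virtual amode: resolved against "nominal SP" only at emission time.
let spill_amode = MemArg::NominalSPOffset(24);
// mem_finalize() later rewrites this into a real SP-relative amode at
// offset 24 + virtual_sp_offset (see the emission changes below).
```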

To convert these offsets into *true* offsets from SP, we need to track
how much further SP has moved downward and compensate for it. We do so
with "virtual SP offset adjustment" pseudo-instructions: the emission
pass sees these, emits no machine code for them (zero bytes of output),
and instead updates state that is now threaded through each instruction
emission in turn. This way we can, e.g., push stack args for a call and
adjust the virtual SP offset accordingly, so that reloads from
nominal-SP-relative spill slots remain correct while the argument setup
itself uses real SP offsets.
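
As an illustration of the intended pattern (a sketch of the mechanism, not the exact sequence the ABI implementation emits), suppose 16 bytes of stack arguments are pushed for a call:

```rust
// After the real `sub sp, sp, #16`, emit a zero-size marker telling the
// emission pass that real SP now sits 16 bytes below nominal SP:
let adj_down = Inst::VirtualSPOffsetAdj { offset: 16 };
// ... argument stores at real SPOffset(0) / SPOffset(8), then the call ...
// Once SP is restored, the adjustment is undone:
let adj_up = Inst::VirtualSPOffsetAdj { offset: -16 };
// Between the two markers, a reload from NominalSPOffset(8) lowers to
// [sp, #24], i.e. 8 + virtual_sp_offset.
```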
Commit a66724aafd (parent 176b3a8382) by Chris Fallin, 2020-04-24 22:32:35 -07:00.
16 changed files with 496 additions and 320 deletions.


@@ -112,7 +112,9 @@ pub enum MemLabel {
/// A memory argument to load/store, encapsulating the possible addressing modes.
#[derive(Clone, Debug)]
pub enum MemArg {
-Label(MemLabel),
+//
+// Real ARM64 addressing modes:
+//
/// "post-indexed" mode as per AArch64 docs: postincrement reg after address computation.
PostIndexed(Writable<Reg>, SImm9),
/// "pre-indexed" mode as per AArch64 docs: preincrement reg before address computation.
@@ -137,11 +139,31 @@ pub enum MemArg {
/// Scaled (by size of a type) unsigned 12-bit immediate offset from reg.
UnsignedOffset(Reg, UImm12Scaled),
-/// Offset from the stack pointer. Lowered into a real amode at emission.
+//
+// virtual addressing modes that are lowered at emission time:
+//
+/// Reference to a "label": e.g., a symbol.
+Label(MemLabel),
+/// Offset from the stack pointer.
SPOffset(i64),
-/// Offset from the frame pointer. Lowered into a real amode at emission.
+/// Offset from the frame pointer.
FPOffset(i64),
/// Offset from the "nominal stack pointer", which is where the real SP is
/// just after stack and spill slots are allocated in the function prologue.
/// At emission time, this is converted to `SPOffset` with a fixup added to
/// the offset constant. The fixup is a running value that is tracked as
/// emission iterates through instructions in linear order, and can be
/// adjusted up and down with [Inst::VirtualSPOffsetAdj].
///
/// The standard ABI is in charge of handling this (by emitting the
/// adjustment meta-instructions). It maintains the invariant that "nominal
/// SP" is where the actual SP is after the function prologue and before
/// clobber pushes. See the diagram in the documentation for
/// [crate::isa::aarch64::abi](the ABI module) for more details.
NominalSPOffset(i64),
}
impl MemArg {
@@ -443,7 +465,7 @@ impl ShowWithRRU for MemArg {
simm9.show_rru(mb_rru)
),
// Eliminated by `mem_finalize()`.
&MemArg::SPOffset(..) | &MemArg::FPOffset(..) => {
&MemArg::SPOffset(..) | &MemArg::FPOffset(..) | &MemArg::NominalSPOffset(..) => {
panic!("Unexpected stack-offset mem-arg mode!")
}
}


@@ -10,6 +10,7 @@ use regalloc::{Reg, RegClass, Writable};
use alloc::vec::Vec;
use core::convert::TryFrom;
+use log::debug;
/// Memory label/reference finalization: convert a MemLabel to a PC-relative
/// offset, possibly emitting relocation(s) as necessary.
@@ -23,33 +24,44 @@ pub fn memlabel_finalize(_insn_off: CodeOffset, label: &MemLabel) -> i32 {
/// generic arbitrary stack offset) into real addressing modes, possibly by
/// emitting some helper instructions that come immediately before the use
/// of this amode.
-pub fn mem_finalize(insn_off: CodeOffset, mem: &MemArg) -> (Vec<Inst>, MemArg) {
+pub fn mem_finalize(insn_off: CodeOffset, mem: &MemArg, state: &EmitState) -> (Vec<Inst>, MemArg) {
match mem {
&MemArg::SPOffset(off) | &MemArg::FPOffset(off) => {
&MemArg::SPOffset(off) | &MemArg::FPOffset(off) | &MemArg::NominalSPOffset(off) => {
let basereg = match mem {
&MemArg::SPOffset(..) => stack_reg(),
&MemArg::SPOffset(..) | &MemArg::NominalSPOffset(..) => stack_reg(),
&MemArg::FPOffset(..) => fp_reg(),
_ => unreachable!(),
};
+let adj = match mem {
+&MemArg::NominalSPOffset(..) => {
+debug!(
+"mem_finalize: nominal SP offset {} + adj {} -> {}",
+off,
+state.virtual_sp_offset,
+off + state.virtual_sp_offset
+);
+state.virtual_sp_offset
+}
+_ => 0,
+};
+let off = off + adj;
if let Some(simm9) = SImm9::maybe_from_i64(off) {
let mem = MemArg::Unscaled(basereg, simm9);
(vec![], mem)
} else {
-// In an addition, x31 is the zero register, not sp; we have only one temporary
-// so we can't do the proper add here.
-debug_assert_ne!(
-basereg,
-stack_reg(),
-"should have diverted SP before mem_finalize"
-);
let tmp = writable_spilltmp_reg();
let mut const_insts = Inst::load_constant(tmp, off as u64);
-let add_inst = Inst::AluRRR {
+// N.B.: we must use AluRRRExtend because AluRRR uses the "shifted register" form
+// (AluRRRShift) instead, which interprets register 31 as the zero reg, not SP. SP
+// is a valid base (for SPOffset) which we must handle here.
+// Also, SP needs to be the first arg, not second.
+let add_inst = Inst::AluRRRExtend {
alu_op: ALUOp::Add64,
rd: tmp,
-rn: tmp.to_reg(),
-rm: basereg,
+rn: basereg,
+rm: tmp.to_reg(),
+extendop: ExtendOp::UXTX,
};
const_insts.push(add_inst);
(const_insts.to_vec(), MemArg::reg(tmp.to_reg()))
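
A worked example of the fixup above (a sketch; constructing `EmitState` directly assumes module-internal access to its private field):

```rust
// 16 bytes into the nominal-SP region, with real SP currently 32 bytes
// below nominal SP: 16 + 32 = 48 fits in SImm9, so no helper
// instructions are emitted.
let state = EmitState { virtual_sp_offset: 32 };
let (insts, amode) = mem_finalize(0, &MemArg::NominalSPOffset(16), &state);
assert!(insts.is_empty());
// `amode` is now the real mode MemArg::Unscaled(stack_reg(), simm9),
// with simm9 holding 48 -- i.e. [sp, #48].
```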
@@ -322,8 +334,16 @@ fn enc_fround(top22: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
(top22 << 10) | (machreg_to_vec(rn) << 5) | machreg_to_vec(rd.to_reg())
}
+/// State carried between emissions of a sequence of instructions.
+#[derive(Default, Clone, Debug)]
+pub struct EmitState {
+virtual_sp_offset: i64,
+}
impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
fn emit(&self, sink: &mut O, flags: &settings::Flags) {
type State = EmitState;
fn emit(&self, sink: &mut O, flags: &settings::Flags, state: &mut EmitState) {
match self {
&Inst::AluRRR { alu_op, rd, rn, rm } => {
let top11 = match alu_op {
@@ -596,10 +616,10 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
ref mem,
srcloc,
} => {
-let (mem_insts, mem) = mem_finalize(sink.cur_offset_from_start(), mem);
+let (mem_insts, mem) = mem_finalize(sink.cur_offset_from_start(), mem, state);
for inst in mem_insts.into_iter() {
-inst.emit(sink, flags);
+inst.emit(sink, flags, state);
}
// ldst encoding helpers take Reg, not Writable<Reg>.
@@ -697,9 +717,9 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
sink.put4(enc_ldst_simm9(op, simm9, 0b01, reg.to_reg(), rd));
}
// Eliminated by `mem_finalize()` above.
&MemArg::SPOffset(..) | &MemArg::FPOffset(..) => {
panic!("Should not see stack-offset here!")
}
&MemArg::SPOffset(..)
| &MemArg::FPOffset(..)
| &MemArg::NominalSPOffset(..) => panic!("Should not see stack-offset here!"),
}
}
@@ -739,10 +759,10 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
ref mem,
srcloc,
} => {
-let (mem_insts, mem) = mem_finalize(sink.cur_offset_from_start(), mem);
+let (mem_insts, mem) = mem_finalize(sink.cur_offset_from_start(), mem, state);
for inst in mem_insts.into_iter() {
-inst.emit(sink, flags);
+inst.emit(sink, flags, state);
}
let op = match self {
@@ -794,9 +814,9 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
sink.put4(enc_ldst_simm9(op, simm9, 0b01, reg.to_reg(), rd));
}
// Eliminated by `mem_finalize()` above.
&MemArg::SPOffset(..) | &MemArg::FPOffset(..) => {
panic!("Should not see stack-offset here!")
}
&MemArg::SPOffset(..)
| &MemArg::FPOffset(..)
| &MemArg::NominalSPOffset(..) => panic!("Should not see stack-offset here!"),
}
}
@@ -980,11 +1000,11 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
mem: MemArg::Label(MemLabel::PCRel(8)),
srcloc: None,
};
-inst.emit(sink, flags);
+inst.emit(sink, flags, state);
let inst = Inst::Jump {
dest: BranchTarget::ResolvedOffset(8),
};
-inst.emit(sink, flags);
+inst.emit(sink, flags, state);
sink.put4(const_data.to_bits());
}
&Inst::LoadFpuConst64 { rd, const_data } => {
@@ -993,11 +1013,11 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
mem: MemArg::Label(MemLabel::PCRel(8)),
srcloc: None,
};
-inst.emit(sink, flags);
+inst.emit(sink, flags, state);
let inst = Inst::Jump {
dest: BranchTarget::ResolvedOffset(12),
};
-inst.emit(sink, flags);
+inst.emit(sink, flags, state);
sink.put8(const_data.to_bits());
}
&Inst::FpuCSel32 { rd, rn, rm, cond } => {
@@ -1084,7 +1104,7 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
if top22 != 0 {
sink.put4(enc_extend(top22, rd, rn));
} else {
-Inst::mov32(rd, rn).emit(sink, flags);
+Inst::mov32(rd, rn).emit(sink, flags, state);
}
}
&Inst::Extend {
@@ -1107,7 +1127,7 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
rn: zero_reg(),
rm: rd.to_reg(),
};
-sub_inst.emit(sink, flags);
+sub_inst.emit(sink, flags, state);
}
&Inst::Extend {
rd,
@@ -1248,13 +1268,13 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
// Save index in a tmp (the live range of ridx only goes to start of this
// sequence; rtmp1 or rtmp2 may overwrite it).
let inst = Inst::gen_move(rtmp2, ridx, I64);
-inst.emit(sink, flags);
+inst.emit(sink, flags, state);
// Load address of jump table
let inst = Inst::Adr {
rd: rtmp1,
label: MemLabel::PCRel(16),
};
-inst.emit(sink, flags);
+inst.emit(sink, flags, state);
// Load value out of jump table
let inst = Inst::SLoad32 {
rd: rtmp2,
@@ -1266,7 +1286,7 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
),
srcloc: None, // can't cause a user trap.
};
-inst.emit(sink, flags);
+inst.emit(sink, flags, state);
// Add base of jump table to jump-table-sourced block offset
let inst = Inst::AluRRR {
alu_op: ALUOp::Add64,
@@ -1274,14 +1294,14 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
rn: rtmp1.to_reg(),
rm: rtmp2.to_reg(),
};
-inst.emit(sink, flags);
+inst.emit(sink, flags, state);
// Branch to computed address. (`targets` here is only used for successor queries
// and is not needed for emission.)
let inst = Inst::IndirectBr {
rn: rtmp1.to_reg(),
targets: vec![],
};
-inst.emit(sink, flags);
+inst.emit(sink, flags, state);
// Emit jump table (table of 32-bit offsets).
for target in targets {
let off = target.as_offset_words() * 4;
@@ -1297,11 +1317,11 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
mem: MemArg::Label(MemLabel::PCRel(8)),
srcloc: None, // can't cause a user trap.
};
-inst.emit(sink, flags);
+inst.emit(sink, flags, state);
let inst = Inst::Jump {
dest: BranchTarget::ResolvedOffset(12),
};
-inst.emit(sink, flags);
+inst.emit(sink, flags, state);
sink.put8(const_data);
}
&Inst::LoadExtName {
@@ -1315,11 +1335,11 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
mem: MemArg::Label(MemLabel::PCRel(8)),
srcloc: None, // can't cause a user trap.
};
-inst.emit(sink, flags);
+inst.emit(sink, flags, state);
let inst = Inst::Jump {
dest: BranchTarget::ResolvedOffset(12),
};
-inst.emit(sink, flags);
+inst.emit(sink, flags, state);
sink.add_reloc(srcloc, Reloc::Abs8, name, offset);
if flags.emit_all_ones_funcaddrs() {
sink.put8(u64::max_value());
@@ -1327,52 +1347,81 @@ impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
sink.put8(0);
}
}
&Inst::LoadAddr { rd, ref mem } => match *mem {
MemArg::FPOffset(fp_off) => {
let alu_op = if fp_off < 0 {
ALUOp::Sub64
} else {
ALUOp::Add64
};
if let Some(imm12) = Imm12::maybe_from_u64(u64::try_from(fp_off.abs()).unwrap())
{
let inst = Inst::AluRRImm12 {
alu_op,
rd,
imm12,
rn: fp_reg(),
};
inst.emit(sink, flags);
} else {
let const_insts =
Inst::load_constant(rd, u64::try_from(fp_off.abs()).unwrap());
for inst in const_insts {
inst.emit(sink, flags);
}
let inst = Inst::AluRRR {
alu_op,
rd,
rn: fp_reg(),
rm: rd.to_reg(),
};
inst.emit(sink, flags);
}
&Inst::LoadAddr { rd, ref mem } => {
let (mem_insts, mem) = mem_finalize(sink.cur_offset_from_start(), mem, state);
for inst in mem_insts.into_iter() {
inst.emit(sink, flags, state);
}
_ => unimplemented!("{:?}", mem),
},
let (reg, offset) = match mem {
MemArg::Unscaled(r, simm9) => (r, simm9.value()),
MemArg::UnsignedOffset(r, uimm12scaled) => (r, uimm12scaled.value() as i32),
_ => panic!("Unsupported case for LoadAddr: {:?}", mem),
};
let abs_offset = if offset < 0 {
-offset as u64
} else {
offset as u64
};
let alu_op = if offset < 0 {
ALUOp::Sub64
} else {
ALUOp::Add64
};
if offset == 0 {
let mov = Inst::mov(rd, reg);
mov.emit(sink, flags, state);
} else if let Some(imm12) = Imm12::maybe_from_u64(abs_offset) {
let add = Inst::AluRRImm12 {
alu_op,
rd,
rn: reg,
imm12,
};
add.emit(sink, flags, state);
} else {
// Use `tmp2` here: `reg` may be `spilltmp` if the `MemArg` on this instruction
// was initially an `SPOffset`. Assert that `tmp2` is truly free to use. Note
// that no other instructions will be inserted here (we're emitting directly),
// and a live range of `tmp2` should not span this instruction, so this use
// should otherwise be correct.
debug_assert!(rd.to_reg() != tmp2_reg());
debug_assert!(reg != tmp2_reg());
let tmp = writable_tmp2_reg();
for insn in Inst::load_constant(tmp, abs_offset).into_iter() {
insn.emit(sink, flags, state);
}
let add = Inst::AluRRR {
alu_op,
rd,
rn: reg,
rm: tmp.to_reg(),
};
add.emit(sink, flags, state);
}
}
&Inst::GetPinnedReg { rd } => {
let inst = Inst::Mov {
rd,
rm: xreg(PINNED_REG),
};
-inst.emit(sink, flags);
+inst.emit(sink, flags, state);
}
&Inst::SetPinnedReg { rm } => {
let inst = Inst::Mov {
rd: Writable::from_reg(xreg(PINNED_REG)),
rm,
};
-inst.emit(sink, flags);
+inst.emit(sink, flags, state);
}
&Inst::VirtualSPOffsetAdj { offset } => {
debug!(
"virtual sp offset adjusted by {} -> {}",
offset,
state.virtual_sp_offset + offset
);
state.virtual_sp_offset += offset;
}
}
}
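
The emission loop that consumes this state might look like the following sketch (`vcode_insts`, `sink`, and `flags` stand in for the caller's instruction list, output section, and settings):

```rust
// One EmitState is threaded through all instructions in linear order, so
// each VirtualSPOffsetAdj marker (zero bytes of machine code) affects how
// every subsequent NominalSPOffset is lowered.
let mut state = EmitState::default();
for inst in &vcode_insts {
    inst.emit(&mut sink, &flags, &mut state);
}
```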


@@ -1313,8 +1313,8 @@ fn test_aarch64_binemit() {
mem: MemArg::FPOffset(32768),
srcloc: None,
},
"0F0090D2EF011D8BE10140F9",
"movz x15, #32768 ; add x15, x15, fp ; ldr x1, [x15]",
"100090D2B063308B010240F9",
"movz x16, #32768 ; add x16, fp, x16, UXTX ; ldr x1, [x16]",
));
insns.push((
Inst::ULoad64 {
@@ -1322,8 +1322,8 @@ fn test_aarch64_binemit() {
mem: MemArg::FPOffset(-32768),
srcloc: None,
},
"EFFF8F92EF011D8BE10140F9",
"movn x15, #32767 ; add x15, x15, fp ; ldr x1, [x15]",
"F0FF8F92B063308B010240F9",
"movn x16, #32767 ; add x16, fp, x16, UXTX ; ldr x1, [x16]",
));
insns.push((
Inst::ULoad64 {
@@ -1331,8 +1331,8 @@ fn test_aarch64_binemit() {
mem: MemArg::FPOffset(1048576), // 2^20
srcloc: None,
},
"0F02A0D2EF011D8BE10140F9",
"movz x15, #16, LSL #16 ; add x15, x15, fp ; ldr x1, [x15]",
"1002A0D2B063308B010240F9",
"movz x16, #16, LSL #16 ; add x16, fp, x16, UXTX ; ldr x1, [x16]",
));
insns.push((
Inst::ULoad64 {
@@ -1340,8 +1340,8 @@ fn test_aarch64_binemit() {
mem: MemArg::FPOffset(1048576 + 1), // 2^20 + 1
srcloc: None,
},
"2F0080D20F02A0F2EF011D8BE10140F9",
"movz x15, #1 ; movk x15, #16, LSL #16 ; add x15, x15, fp ; ldr x1, [x15]",
"300080D21002A0F2B063308B010240F9",
"movz x16, #1 ; movk x16, #16, LSL #16 ; add x16, fp, x16, UXTX ; ldr x1, [x16]",
));
insns.push((
@@ -2794,7 +2794,7 @@ fn test_aarch64_binemit() {
// Check the encoding is as expected.
let text_size = {
let mut code_sec = MachSectionSize::new(0);
insn.emit(&mut code_sec, &flags);
insn.emit(&mut code_sec, &flags, &mut Default::default());
code_sec.size()
};
@@ -2802,7 +2802,7 @@ fn test_aarch64_binemit() {
let mut sections = MachSections::new();
let code_idx = sections.add_section(0, text_size);
let code_sec = sections.get_section(code_idx);
insn.emit(code_sec, &flags);
insn.emit(code_sec, &flags, &mut Default::default());
sections.emit(&mut sink);
let actual_encoding = &sink.stringify();
assert_eq!(expected_encoding, actual_encoding);


@@ -134,6 +134,11 @@ impl SImm9 {
pub fn bits(&self) -> u32 {
(self.value as u32) & 0x1ff
}
+/// Signed value of immediate.
+pub fn value(&self) -> i32 {
+self.value as i32
+}
}
/// An unsigned, scaled 12-bit offset.
@@ -172,6 +177,11 @@ impl UImm12Scaled {
pub fn bits(&self) -> u32 {
(self.value as u32 / self.scale_ty.bytes()) & 0xfff
}
+/// Value after scaling.
+pub fn value(&self) -> u32 {
+self.value as u32 * self.scale_ty.bytes()
+}
}
/// A shifted immediate value in 'imm12' format: supports 12 bits, shifted


@@ -13,7 +13,6 @@ use regalloc::{RealRegUniverse, Reg, RegClass, SpillSlot, VirtualReg, Writable};
use regalloc::{RegUsageCollector, RegUsageMapper, Set};
use alloc::vec::Vec;
-use core::convert::TryFrom;
use smallvec::{smallvec, SmallVec};
use std::string::{String, ToString};
@@ -741,6 +740,12 @@ pub enum Inst {
SetPinnedReg {
rm: Reg,
},
+/// Marker, no-op in generated code: SP "virtual offset" is adjusted. This
+/// controls how MemArg::NominalSPOffset args are lowered.
+VirtualSPOffsetAdj {
+offset: i64,
+},
}
fn count_zero_half_words(mut value: u64) -> usize {
@@ -876,7 +881,7 @@ fn memarg_regs(memarg: &MemArg, collector: &mut RegUsageCollector) {
&MemArg::FPOffset(..) => {
collector.add_use(fp_reg());
}
&MemArg::SPOffset(..) => {
&MemArg::SPOffset(..) | &MemArg::NominalSPOffset(..) => {
collector.add_use(stack_reg());
}
}
@@ -1135,6 +1140,7 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
&Inst::SetPinnedReg { rm } => {
collector.add_use(rm);
}
&Inst::VirtualSPOffsetAdj { .. } => {}
}
}
@@ -1186,7 +1192,9 @@ fn aarch64_map_regs(inst: &mut Inst, mapper: &RegUsageMapper) {
&mut MemArg::Label(..) => {}
&mut MemArg::PreIndexed(ref mut r, ..) => map_mod(m, r),
&mut MemArg::PostIndexed(ref mut r, ..) => map_mod(m, r),
&mut MemArg::FPOffset(..) | &mut MemArg::SPOffset(..) => {}
&mut MemArg::FPOffset(..)
| &mut MemArg::SPOffset(..)
| &mut MemArg::NominalSPOffset(..) => {}
};
}
@@ -1706,6 +1714,7 @@ fn aarch64_map_regs(inst: &mut Inst, mapper: &RegUsageMapper) {
&mut Inst::SetPinnedReg { ref mut rm } => {
map_use(mapper, rm);
}
&mut Inst::VirtualSPOffsetAdj { .. } => {}
}
}
@@ -1904,7 +1913,7 @@ impl MachInst for Inst {
// Pretty-printing of instructions.
fn mem_finalize_for_show(mem: &MemArg, mb_rru: Option<&RealRegUniverse>) -> (String, MemArg) {
-let (mem_insts, mem) = mem_finalize(0, mem);
+let (mem_insts, mem) = mem_finalize(0, mem, &mut Default::default());
let mut mem_str = mem_insts
.into_iter()
.map(|inst| inst.show_rru(mb_rru))
@@ -2618,42 +2627,58 @@ impl ShowWithRRU for Inst {
let rd = rd.show_rru(mb_rru);
format!("ldr {}, 8 ; b 12 ; data {:?} + {}", rd, name, offset)
}
&Inst::LoadAddr { rd, ref mem } => match *mem {
MemArg::FPOffset(fp_off) => {
let alu_op = if fp_off < 0 {
ALUOp::Sub64
} else {
ALUOp::Add64
};
if let Some(imm12) = Imm12::maybe_from_u64(u64::try_from(fp_off.abs()).unwrap())
{
let inst = Inst::AluRRImm12 {
alu_op,
rd,
imm12,
rn: fp_reg(),
};
inst.show_rru(mb_rru)
} else {
let mut res = String::new();
let const_insts =
Inst::load_constant(rd, u64::try_from(fp_off.abs()).unwrap());
for inst in const_insts {
res.push_str(&inst.show_rru(mb_rru));
res.push_str("; ");
}
let inst = Inst::AluRRR {
alu_op,
rd,
rn: fp_reg(),
rm: rd.to_reg(),
};
res.push_str(&inst.show_rru(mb_rru));
res
}
&Inst::LoadAddr { rd, ref mem } => {
// TODO: we really should find a better way to avoid duplication of
// this logic between `emit()` and `show_rru()` -- a separate 1-to-N
// expansion stage (i.e., legalization, but without the slow edit-in-place
// of the existing legalization framework).
let (mem_insts, mem) = mem_finalize(0, mem, &EmitState::default());
let mut ret = String::new();
for inst in mem_insts.into_iter() {
ret.push_str(&inst.show_rru(mb_rru));
}
_ => unimplemented!("{:?}", mem),
},
let (reg, offset) = match mem {
MemArg::Unscaled(r, simm9) => (r, simm9.value()),
MemArg::UnsignedOffset(r, uimm12scaled) => (r, uimm12scaled.value() as i32),
_ => panic!("Unsupported case for LoadAddr: {:?}", mem),
};
let abs_offset = if offset < 0 {
-offset as u64
} else {
offset as u64
};
let alu_op = if offset < 0 {
ALUOp::Sub64
} else {
ALUOp::Add64
};
if offset == 0 {
let mov = Inst::mov(rd, reg);
ret.push_str(&mov.show_rru(mb_rru));
} else if let Some(imm12) = Imm12::maybe_from_u64(abs_offset) {
let add = Inst::AluRRImm12 {
alu_op,
rd,
rn: reg,
imm12,
};
ret.push_str(&add.show_rru(mb_rru));
} else {
let tmp = writable_spilltmp_reg();
for inst in Inst::load_constant(tmp, abs_offset).into_iter() {
ret.push_str(&inst.show_rru(mb_rru));
}
let add = Inst::AluRRR {
alu_op,
rd,
rn: reg,
rm: tmp.to_reg(),
};
ret.push_str(&add.show_rru(mb_rru));
}
ret
}
&Inst::GetPinnedReg { rd } => {
let rd = rd.show_rru(mb_rru);
format!("get_pinned_reg {}", rd)
@@ -2662,6 +2687,7 @@ impl ShowWithRRU for Inst {
let rm = rm.show_rru(mb_rru);
format!("set_pinned_reg {}", rm)
}
&Inst::VirtualSPOffsetAdj { offset } => format!("virtual_sp_offset_adjust {}", offset),
}
}
}


@@ -20,23 +20,21 @@ pub const PINNED_REG: u8 = 21;
const XREG_INDICES: [u8; 31] = [
// X0 - X7
32, 33, 34, 35, 36, 37, 38, 39,
-// X8 - X14
-40, 41, 42, 43, 44, 45, 46,
-// X15
-59,
+// X8 - X15
+40, 41, 42, 43, 44, 45, 46, 47,
// X16, X17
-47, 48,
+58, 59,
// X18
60,
// X19, X20
-49, 50,
+48, 49,
// X21, put aside because it's the pinned register.
-58,
+57,
// X22 - X28
-51, 52, 53, 54, 55, 56, 57,
-// X29
+50, 51, 52, 53, 54, 55, 56,
+// X29 (FP)
61,
-// X30
+// X30 (LR)
62,
];
@@ -125,14 +123,17 @@ pub fn writable_fp_reg() -> Writable<Reg> {
Writable::from_reg(fp_reg())
}
/// Get a reference to the "spill temp" register. This register is used to
/// compute the address of a spill slot when a direct offset addressing mode from
/// FP is not sufficient (+/- 2^11 words). We exclude this register from regalloc
/// and reserve it for this purpose for simplicity; otherwise we need a
/// multi-stage analysis where we first determine how many spill slots we have,
/// then perhaps remove the reg from the pool and recompute regalloc.
/// Get a reference to the first temporary, sometimes "spill temporary", register. This register is
/// used to compute the address of a spill slot when a direct offset addressing mode from FP is not
/// sufficient (+/- 2^11 words). We exclude this register from regalloc and reserve it for this
/// purpose for simplicity; otherwise we need a multi-stage analysis where we first determine how
/// many spill slots we have, then perhaps remove the reg from the pool and recompute regalloc.
///
/// We use x16 for this (aka IP0 in the AArch64 ABI) because it's a scratch register but is
/// slightly special (used for linker veneers). We're free to use it as long as we don't expect it
/// to live through call instructions.
pub fn spilltmp_reg() -> Reg {
-xreg(15)
+xreg(16)
}
/// Get a writable reference to the spilltmp reg.
@@ -140,6 +141,20 @@ pub fn writable_spilltmp_reg() -> Writable<Reg> {
Writable::from_reg(spilltmp_reg())
}
+/// Get a reference to the second temp register. We need this in some edge cases
+/// where we need both the spilltmp and another temporary.
+///
+/// We use x17 (aka IP1), the other "interprocedural"/linker-veneer scratch reg that is
+/// free to use otherwise.
+pub fn tmp2_reg() -> Reg {
+xreg(17)
+}
+/// Get a writable reference to the tmp2 reg.
+pub fn writable_tmp2_reg() -> Writable<Reg> {
+Writable::from_reg(tmp2_reg())
+}
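
A short sketch of why two reserved temporaries are needed (this mirrors the `LoadAddr` comment in the emission diff above): `mem_finalize` may already have materialized an address in the spill temporary, so `LoadAddr`'s constant-materialization fallback uses the second temporary to avoid clobbering it.

```rust
// x16 (IP0) and x17 (IP1) are both linker scratch registers and are
// excluded from register allocation, so they are always free here.
let t1 = writable_spilltmp_reg(); // x16: used by mem_finalize()
let t2 = writable_tmp2_reg();     // x17: used by LoadAddr's fallback
assert_ne!(t1.to_reg(), t2.to_reg());
```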
/// Create the register universe for AArch64.
pub fn create_reg_universe(flags: &settings::Flags) -> RealRegUniverse {
let mut regs = vec![];
@@ -173,7 +188,7 @@ pub fn create_reg_universe(flags: &settings::Flags) -> RealRegUniverse {
for i in 0u8..32u8 {
// See above for excluded registers.
-if i == 15 || i == 18 || i == 29 || i == 30 || i == 31 || i == PINNED_REG {
+if i == 16 || i == 17 || i == 18 || i == 29 || i == 30 || i == 31 || i == PINNED_REG {
continue;
}
let reg = Reg::new_real(
@@ -211,7 +226,8 @@ pub fn create_reg_universe(flags: &settings::Flags) -> RealRegUniverse {
regs.len()
};
-regs.push((xreg(15).to_real_reg(), "x15".to_string()));
+regs.push((xreg(16).to_real_reg(), "x16".to_string()));
+regs.push((xreg(17).to_real_reg(), "x17".to_string()));
regs.push((xreg(18).to_real_reg(), "x18".to_string()));
regs.push((fp_reg().to_real_reg(), "fp".to_string()));
regs.push((link_reg().to_real_reg(), "lr".to_string()));