Address review comments.

2020-06-02 16:57:50 -07:00
parent 615362068f
commit fe97659813
13 changed files with 224 additions and 169 deletions
--- a/cranelift/codegen/src/isa/aarch64/abi.rs
+++ b/cranelift/codegen/src/isa/aarch64/abi.rs
@@ -73,9 +73,9 @@
 //! https://searchfox.org/mozilla-central/rev/bc3600def806859c31b2c7ac06e3d69271052a89/js/src/wasm/WasmStubs.h#134
 //!
 //! In brief:
-//! - Returns are processed in *reverse* order.
-//! - The first return in this order (so the last return) goes into the ordinary
-//!   return register, X0.
+//! - Return values are processed in *reverse* order.
+//! - The first return value in this order (so the last return) goes into the
+//!   ordinary return register, X0.
 //! - Any further returns go in a struct-return area, allocated upwards (in
 //!   address order) during the reverse traversal.
 //! - This struct-return area is provided by the caller, and a pointer to its
@@ -98,6 +98,7 @@ use crate::isa;
 use crate::isa::aarch64::{inst::*, lower::ty_bits};
 use crate::machinst::*;
 use crate::settings;
+use crate::{CodegenError, CodegenResult};

 use alloc::boxed::Box;
 use alloc::vec::Vec;
@@ -134,6 +135,11 @@ struct ABISig {
    call_conv: isa::CallConv,
 }

+/// This is the limit for the size of argument and return-value areas on the
+/// stack. We place a reasonable limit here to avoid integer overflow issues
+/// with 32-bit arithmetic: for now, 128 MB.
+static STACK_ARG_RET_SIZE_LIMIT: u64 = 128 * 1024 * 1024;
+
 // Spidermonkey specific ABI convention.

 /// This is SpiderMonkey's `WasmTableCallSigReg`.
@@ -208,14 +214,15 @@ enum ArgsOrRets {
 /// Process a list of parameters or return values and allocate them to X-regs,
 /// V-regs, and stack slots.
 ///
-/// Returns the list of argument locations, and the stack-space used (rounded up
-/// to a 16-byte-aligned boundary).
+/// Returns the list of argument locations, the stack-space used (rounded up
+/// to a 16-byte-aligned boundary), and if `add_ret_area_ptr` was passed, the
+/// index of the extra synthetic arg that was added.
 fn compute_arg_locs(
    call_conv: isa::CallConv,
    params: &[ir::AbiParam],
    args_or_rets: ArgsOrRets,
    add_ret_area_ptr: bool,
-) -> (Vec<ABIArg>, i64) {
+) -> CodegenResult<(Vec<ABIArg>, i64, Option<usize>)> {
    let is_baldrdash = call_conv.extends_baldrdash();

    // See AArch64 ABI (https://c9x.me/compile/bib/abi-arm64.pdf), sections 5.4.
@@ -290,7 +297,7 @@ fn compute_arg_locs(
        ret.reverse();
    }

-    if add_ret_area_ptr {
+    let extra_arg = if add_ret_area_ptr {
        debug_assert!(args_or_rets == ArgsOrRets::Args);
        if next_xreg < max_reg_vals {
            ret.push(ABIArg::Reg(xreg(next_xreg).to_real_reg(), I64));
@@ -298,35 +305,39 @@ fn compute_arg_locs(
            ret.push(ABIArg::Stack(next_stack as i64, I64));
            next_stack += 8;
        }
-    }
+        Some(ret.len() - 1)
+    } else {
+        None
+    };

    next_stack = (next_stack + 15) & !15;

-    (ret, next_stack as i64)
+    // To avoid overflow issues, limit the arg/return size to something
+    // reasonable -- here, 128 MB.
+    if next_stack > STACK_ARG_RET_SIZE_LIMIT {
+        return Err(CodegenError::ImplLimitExceeded);
+    }
+
+    Ok((ret, next_stack as i64, extra_arg))
 }

 impl ABISig {
-    fn from_func_sig(sig: &ir::Signature) -> ABISig {
+    fn from_func_sig(sig: &ir::Signature) -> CodegenResult<ABISig> {
        // Compute args and retvals from signature. Handle retvals first,
        // because we may need to add a return-area arg to the args.
-        let (rets, stack_ret_space) = compute_arg_locs(
+        let (rets, stack_ret_space, _) = compute_arg_locs(
            sig.call_conv,
            &sig.returns,
            ArgsOrRets::Rets,
            /* extra ret-area ptr = */ false,
-        );
+        )?;
        let need_stack_return_area = stack_ret_space > 0;
-        let (args, stack_arg_space) = compute_arg_locs(
+        let (args, stack_arg_space, stack_ret_arg) = compute_arg_locs(
            sig.call_conv,
            &sig.params,
            ArgsOrRets::Args,
            need_stack_return_area,
-        );
-        let stack_ret_arg = if need_stack_return_area {
-            Some(args.len() - 1)
-        } else {
-            None
-        };
+        )?;

        trace!(
            "ABISig: sig {:?} => args = {:?} rets = {:?} arg stack = {} ret stack = {} stack_ret_arg = {:?}",
@@ -338,14 +349,14 @@ impl ABISig {
            stack_ret_arg
        );

-        ABISig {
+        Ok(ABISig {
            args,
            rets,
            stack_arg_space,
            stack_ret_space,
            stack_ret_arg,
            call_conv: sig.call_conv,
-        }
+        })
    }
 }

@@ -446,15 +457,7 @@ fn gen_stack_limit(f: &ir::Function, abi: &ABISig, gv: ir::GlobalValue) -> (Reg,
            } => {
                let base = generate_gv(f, abi, base, insts);
                let into_reg = writable_spilltmp_reg();
-                let mem = if let Some(offset) =
-                    UImm12Scaled::maybe_from_i64(offset.into(), ir::types::I8)
-                {
-                    MemArg::UnsignedOffset(base, offset)
-                } else {
-                    let offset: i64 = offset.into();
-                    insts.extend(Inst::load_constant(into_reg, offset as u64));
-                    MemArg::RegReg(base, into_reg.to_reg())
-                };
+                let mem = MemArg::RegOffset(base, offset.into(), I64);
                insts.push(Inst::ULoad64 {
                    rd: into_reg,
                    mem,
@@ -481,10 +484,10 @@ fn get_special_purpose_param_register(

 impl AArch64ABIBody {
    /// Create a new body ABI instance.
-    pub fn new(f: &ir::Function, flags: settings::Flags) -> Self {
+    pub fn new(f: &ir::Function, flags: settings::Flags) -> CodegenResult<Self> {
        debug!("AArch64 ABI: func signature {:?}", f.signature);

-        let sig = ABISig::from_func_sig(&f.signature);
+        let sig = ABISig::from_func_sig(&f.signature)?;

        let call_conv = f.signature.call_conv;
        // Only these calling conventions are supported.
@@ -517,7 +520,7 @@ impl AArch64ABIBody {
                .map(|reg| (reg, Vec::new()))
                .or_else(|| f.stack_limit.map(|gv| gen_stack_limit(f, &sig, gv)));

-        Self {
+        Ok(Self {
            sig,
            stackslots,
            stackslots_size: stack_offset,
@@ -529,7 +532,7 @@ impl AArch64ABIBody {
            flags,
            is_leaf: f.is_leaf(),
            stack_limit,
-        }
+        })
    }

    /// Returns the offset from FP to the argument area, i.e., jumping over the saved FP, return
@@ -635,15 +638,22 @@ impl AArch64ABIBody {

 fn load_stack(mem: MemArg, into_reg: Writable<Reg>, ty: Type) -> Inst {
    match ty {
-        types::B1
-        | types::B8
-        | types::I8
-        | types::B16
-        | types::I16
-        | types::B32
-        | types::I32
-        | types::B64
-        | types::I64 => Inst::ULoad64 {
+        types::B1 | types::B8 | types::I8 => Inst::ULoad8 {
+            rd: into_reg,
+            mem,
+            srcloc: None,
+        },
+        types::B16 | types::I16 => Inst::ULoad16 {
+            rd: into_reg,
+            mem,
+            srcloc: None,
+        },
+        types::B32 | types::I32 => Inst::ULoad32 {
+            rd: into_reg,
+            mem,
+            srcloc: None,
+        },
+        types::B64 | types::I64 => Inst::ULoad64 {
            rd: into_reg,
            mem,
            srcloc: None,
@@ -664,15 +674,22 @@ fn load_stack(mem: MemArg, into_reg: Writable<Reg>, ty: Type) -> Inst {

 fn store_stack(mem: MemArg, from_reg: Reg, ty: Type) -> Inst {
    match ty {
-        types::B1
-        | types::B8
-        | types::I8
-        | types::B16
-        | types::I16
-        | types::B32
-        | types::I32
-        | types::B64
-        | types::I64 => Inst::Store64 {
+        types::B1 | types::B8 | types::I8 => Inst::Store8 {
+            rd: from_reg,
+            mem,
+            srcloc: None,
+        },
+        types::B16 | types::I16 => Inst::Store16 {
+            rd: from_reg,
+            mem,
+            srcloc: None,
+        },
+        types::B32 | types::I32 => Inst::Store32 {
+            rd: from_reg,
+            mem,
+            srcloc: None,
+        },
+        types::B64 | types::I64 => Inst::Store64 {
            rd: from_reg,
            mem,
            srcloc: None,
@@ -791,17 +808,14 @@ fn get_caller_saves(call_conv: isa::CallConv) -> Vec<Writable<Reg>> {
 impl ABIBody for AArch64ABIBody {
    type I = Inst;

-    fn needed_tmps(&self) -> usize {
-        if self.sig.stack_ret_arg.is_some() {
-            1
-        } else {
-            0
-        }
+    fn temp_needed(&self) -> bool {
+        self.sig.stack_ret_arg.is_some()
    }

-    fn init_with_tmps(&mut self, tmps: &[Writable<Reg>]) {
+    fn init(&mut self, maybe_tmp: Option<Writable<Reg>>) {
        if self.sig.stack_ret_arg.is_some() {
-            self.ret_area_ptr = Some(tmps[0]);
+            assert!(maybe_tmp.is_some());
+            self.ret_area_ptr = maybe_tmp;
        }
    }

@@ -845,14 +859,14 @@ impl ABIBody for AArch64ABIBody {
        match &self.sig.args[idx] {
            &ABIArg::Reg(r, ty) => Inst::gen_move(into_reg, r.to_reg(), ty),
            &ABIArg::Stack(off, ty) => load_stack(
-                MemArg::FPOffset(self.fp_to_arg_offset() + off),
+                MemArg::FPOffset(self.fp_to_arg_offset() + off, ty),
                into_reg,
                ty,
            ),
        }
    }

-    fn gen_retval_area_setup(&self) -> Vec<Inst> {
+    fn gen_retval_area_setup(&self) -> Option<Inst> {
        if let Some(i) = self.sig.stack_ret_arg {
            let inst = self.gen_copy_arg_to_reg(i, self.ret_area_ptr.unwrap());
            trace!(
@@ -860,10 +874,10 @@ impl ABIBody for AArch64ABIBody {
                inst,
                self.ret_area_ptr.unwrap().to_reg()
            );
-            vec![inst]
+            Some(inst)
        } else {
            trace!("gen_retval_area_setup: not needed");
-            vec![]
+            None
        }
    }

@@ -924,8 +938,7 @@ impl ABIBody for AArch64ABIBody {
                    }
                    _ => {}
                };
-                let mem = MemArg::reg_maybe_offset(self.ret_area_ptr.unwrap().to_reg(), off, ty)
-                    .expect("Return-value area is too large");
+                let mem = MemArg::RegOffset(self.ret_area_ptr.unwrap().to_reg(), off, ty);
                ret.push(store_stack(mem, from_reg.to_reg(), ty))
            }
        }
@@ -961,7 +974,7 @@ impl ABIBody for AArch64ABIBody {
        let stack_off = self.stackslots[slot.as_u32() as usize] as i64;
        let sp_off: i64 = stack_off + (offset as i64);
        trace!("load_stackslot: slot {} -> sp_off {}", slot, sp_off);
-        load_stack(MemArg::NominalSPOffset(sp_off), into_reg, ty)
+        load_stack(MemArg::NominalSPOffset(sp_off, ty), into_reg, ty)
    }

    /// Store to a stackslot.
@@ -971,7 +984,7 @@ impl ABIBody for AArch64ABIBody {
        let stack_off = self.stackslots[slot.as_u32() as usize] as i64;
        let sp_off: i64 = stack_off + (offset as i64);
        trace!("store_stackslot: slot {} -> sp_off {}", slot, sp_off);
-        store_stack(MemArg::NominalSPOffset(sp_off), from_reg, ty)
+        store_stack(MemArg::NominalSPOffset(sp_off, ty), from_reg, ty)
    }

    /// Produce an instruction that computes a stackslot address.
@@ -982,7 +995,7 @@ impl ABIBody for AArch64ABIBody {
        let sp_off: i64 = stack_off + (offset as i64);
        Inst::LoadAddr {
            rd: into_reg,
-            mem: MemArg::NominalSPOffset(sp_off),
+            mem: MemArg::NominalSPOffset(sp_off, I8),
        }
    }

@@ -993,7 +1006,7 @@ impl ABIBody for AArch64ABIBody {
        let spill_off = islot * 8;
        let sp_off = self.stackslots_size as i64 + spill_off;
        trace!("load_spillslot: slot {:?} -> sp_off {}", slot, sp_off);
-        load_stack(MemArg::NominalSPOffset(sp_off), into_reg, ty)
+        load_stack(MemArg::NominalSPOffset(sp_off, ty), into_reg, ty)
    }

    /// Store to a spillslot.
@@ -1003,7 +1016,7 @@ impl ABIBody for AArch64ABIBody {
        let spill_off = islot * 8;
        let sp_off = self.stackslots_size as i64 + spill_off;
        trace!("store_spillslot: slot {:?} -> sp_off {}", slot, sp_off);
-        store_stack(MemArg::NominalSPOffset(sp_off), from_reg, ty)
+        store_stack(MemArg::NominalSPOffset(sp_off, ty), from_reg, ty)
    }

    fn gen_prologue(&mut self) -> Vec<Inst> {
@@ -1290,17 +1303,17 @@ impl AArch64ABICall {
        extname: &ir::ExternalName,
        dist: RelocDistance,
        loc: ir::SourceLoc,
-    ) -> AArch64ABICall {
-        let sig = ABISig::from_func_sig(sig);
+    ) -> CodegenResult<AArch64ABICall> {
+        let sig = ABISig::from_func_sig(sig)?;
        let (uses, defs) = abisig_to_uses_and_defs(&sig);
-        AArch64ABICall {
+        Ok(AArch64ABICall {
            sig,
            uses,
            defs,
            dest: CallDest::ExtName(extname.clone(), dist),
            loc,
            opcode: ir::Opcode::Call,
-        }
+        })
    }

    /// Create a callsite ABI object for a call to a function pointer with the
@@ -1310,17 +1323,17 @@ impl AArch64ABICall {
        ptr: Reg,
        loc: ir::SourceLoc,
        opcode: ir::Opcode,
-    ) -> AArch64ABICall {
-        let sig = ABISig::from_func_sig(sig);
+    ) -> CodegenResult<AArch64ABICall> {
+        let sig = ABISig::from_func_sig(sig)?;
        let (uses, defs) = abisig_to_uses_and_defs(&sig);
-        AArch64ABICall {
+        Ok(AArch64ABICall {
            sig,
            uses,
            defs,
            dest: CallDest::Reg(ptr),
            loc,
            opcode,
-        }
+        })
    }
 }

@@ -1394,7 +1407,9 @@ impl ABICall for AArch64ABICall {
                from_reg,
                ty,
            )),
-            &ABIArg::Stack(off, ty) => ctx.emit(store_stack(MemArg::SPOffset(off), from_reg, ty)),
+            &ABIArg::Stack(off, ty) => {
+                ctx.emit(store_stack(MemArg::SPOffset(off, ty), from_reg, ty))
+            }
        }
    }

@@ -1409,7 +1424,7 @@ impl ABICall for AArch64ABICall {
            &ABIArg::Stack(off, ty) => {
                let ret_area_base = self.sig.stack_arg_space;
                ctx.emit(load_stack(
-                    MemArg::SPOffset(off + ret_area_base),
+                    MemArg::SPOffset(off + ret_area_base, ty),
                    into_reg,
                    ty,
                ));
@@ -1427,7 +1442,7 @@ impl ABICall for AArch64ABICall {
            let ret_area_base = self.sig.stack_arg_space;
            ctx.emit(Inst::LoadAddr {
                rd,
-                mem: MemArg::SPOffset(ret_area_base),
+                mem: MemArg::SPOffset(ret_area_base, I8),
            });
            self.emit_copy_reg_to_arg(ctx, i, rd.to_reg());
        }
--- a/cranelift/codegen/src/isa/aarch64/inst/args.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/args.rs
@@ -145,11 +145,15 @@ pub enum MemArg {
    /// Reference to a "label": e.g., a symbol.
    Label(MemLabel),

+    /// Arbitrary offset from a register. Converted to generation of large
+    /// offsets with multiple instructions as necessary during code emission.
+    RegOffset(Reg, i64, Type),
+
    /// Offset from the stack pointer.
-    SPOffset(i64),
+    SPOffset(i64, Type),

    /// Offset from the frame pointer.
-    FPOffset(i64),
+    FPOffset(i64, Type),

    /// Offset from the "nominal stack pointer", which is where the real SP is
    /// just after stack and spill slots are allocated in the function prologue.
@@ -163,7 +167,7 @@ pub enum MemArg {
    /// SP" is where the actual SP is after the function prologue and before
    /// clobber pushes. See the diagram in the documentation for
    /// [crate::isa::aarch64::abi](the ABI module) for more details.
-    NominalSPOffset(i64),
+    NominalSPOffset(i64, Type),
 }

 impl MemArg {
@@ -174,17 +178,6 @@ impl MemArg {
        MemArg::UnsignedOffset(reg, UImm12Scaled::zero(I64))
    }

-    /// Memory reference using an address in a register and an offset, if possible.
-    pub fn reg_maybe_offset(reg: Reg, offset: i64, value_type: Type) -> Option<MemArg> {
-        if let Some(simm9) = SImm9::maybe_from_i64(offset) {
-            Some(MemArg::Unscaled(reg, simm9))
-        } else if let Some(uimm12s) = UImm12Scaled::maybe_from_i64(offset, value_type) {
-            Some(MemArg::UnsignedOffset(reg, uimm12s))
-        } else {
-            None
-        }
-    }
-
    /// Memory reference using the sum of two registers as an address.
    pub fn reg_plus_reg(reg1: Reg, reg2: Reg) -> MemArg {
        MemArg::RegReg(reg1, reg2)
@@ -431,8 +424,11 @@ impl ShowWithRRU for MemArg {
                simm9.show_rru(mb_rru)
            ),
            // Eliminated by `mem_finalize()`.
-            &MemArg::SPOffset(..) | &MemArg::FPOffset(..) | &MemArg::NominalSPOffset(..) => {
-                panic!("Unexpected stack-offset mem-arg mode!")
+            &MemArg::SPOffset(..)
+            | &MemArg::FPOffset(..)
+            | &MemArg::NominalSPOffset(..)
+            | &MemArg::RegOffset(..) => {
+                panic!("Unexpected pseudo mem-arg mode (stack-offset or generic reg-offset)!")
            }
        }
    }
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -5,6 +5,7 @@ use crate::ir::constant::ConstantData;
 use crate::ir::types::*;
 use crate::ir::TrapCode;
 use crate::isa::aarch64::inst::*;
+use crate::isa::aarch64::lower::ty_bits;

 use regalloc::{Reg, RegClass, Writable};

@@ -29,8 +30,12 @@ pub fn mem_finalize(
    state: &EmitState,
 ) -> (SmallVec<[Inst; 4]>, MemArg) {
    match mem {
-        &MemArg::SPOffset(off) | &MemArg::FPOffset(off) | &MemArg::NominalSPOffset(off) => {
+        &MemArg::RegOffset(_, off, ty)
+        | &MemArg::SPOffset(off, ty)
+        | &MemArg::FPOffset(off, ty)
+        | &MemArg::NominalSPOffset(off, ty) => {
            let basereg = match mem {
+                &MemArg::RegOffset(reg, _, _) => reg,
                &MemArg::SPOffset(..) | &MemArg::NominalSPOffset(..) => stack_reg(),
                &MemArg::FPOffset(..) => fp_reg(),
                _ => unreachable!(),
@@ -52,6 +57,9 @@ pub fn mem_finalize(
            if let Some(simm9) = SImm9::maybe_from_i64(off) {
                let mem = MemArg::Unscaled(basereg, simm9);
                (smallvec![], mem)
+            } else if let Some(uimm12s) = UImm12Scaled::maybe_from_i64(off, ty) {
+                let mem = MemArg::UnsignedOffset(basereg, uimm12s);
+                (smallvec![], mem)
            } else {
                let tmp = writable_spilltmp_reg();
                let mut const_insts = Inst::load_constant(tmp, off as u64);
@@ -654,17 +662,17 @@ impl MachInstEmit for Inst {
                // This is the base opcode (top 10 bits) for the "unscaled
                // immediate" form (Unscaled). Other addressing modes will OR in
                // other values for bits 24/25 (bits 1/2 of this constant).
-                let op = match self {
-                    &Inst::ULoad8 { .. } => 0b0011100001,
-                    &Inst::SLoad8 { .. } => 0b0011100010,
-                    &Inst::ULoad16 { .. } => 0b0111100001,
-                    &Inst::SLoad16 { .. } => 0b0111100010,
-                    &Inst::ULoad32 { .. } => 0b1011100001,
-                    &Inst::SLoad32 { .. } => 0b1011100010,
-                    &Inst::ULoad64 { .. } => 0b1111100001,
-                    &Inst::FpuLoad32 { .. } => 0b1011110001,
-                    &Inst::FpuLoad64 { .. } => 0b1111110001,
-                    &Inst::FpuLoad128 { .. } => 0b0011110011,
+                let (op, bits) = match self {
+                    &Inst::ULoad8 { .. } => (0b0011100001, 8),
+                    &Inst::SLoad8 { .. } => (0b0011100010, 8),
+                    &Inst::ULoad16 { .. } => (0b0111100001, 16),
+                    &Inst::SLoad16 { .. } => (0b0111100010, 16),
+                    &Inst::ULoad32 { .. } => (0b1011100001, 32),
+                    &Inst::SLoad32 { .. } => (0b1011100010, 32),
+                    &Inst::ULoad64 { .. } => (0b1111100001, 64),
+                    &Inst::FpuLoad32 { .. } => (0b1011110001, 32),
+                    &Inst::FpuLoad64 { .. } => (0b1111110001, 64),
+                    &Inst::FpuLoad128 { .. } => (0b0011110011, 128),
                    _ => unreachable!(),
                };

@@ -678,6 +686,9 @@ impl MachInstEmit for Inst {
                        sink.put4(enc_ldst_simm9(op, simm9, 0b00, reg, rd));
                    }
                    &MemArg::UnsignedOffset(reg, uimm12scaled) => {
+                        if uimm12scaled.value() != 0 {
+                            assert_eq!(bits, ty_bits(uimm12scaled.scale_ty()));
+                        }
                        sink.put4(enc_ldst_uimm12(op, uimm12scaled, reg, rd));
                    }
                    &MemArg::RegReg(r1, r2) => {
@@ -686,19 +697,7 @@ impl MachInstEmit for Inst {
                        ));
                    }
                    &MemArg::RegScaled(r1, r2, ty) | &MemArg::RegScaledExtended(r1, r2, ty, _) => {
-                        match (ty, self) {
-                            (I8, &Inst::ULoad8 { .. }) => {}
-                            (I8, &Inst::SLoad8 { .. }) => {}
-                            (I16, &Inst::ULoad16 { .. }) => {}
-                            (I16, &Inst::SLoad16 { .. }) => {}
-                            (I32, &Inst::ULoad32 { .. }) => {}
-                            (I32, &Inst::SLoad32 { .. }) => {}
-                            (I64, &Inst::ULoad64 { .. }) => {}
-                            (F32, &Inst::FpuLoad32 { .. }) => {}
-                            (F64, &Inst::FpuLoad64 { .. }) => {}
-                            (I128, &Inst::FpuLoad128 { .. }) => {}
-                            _ => panic!("Mismatching reg-scaling type in MemArg"),
-                        }
+                        assert_eq!(bits, ty_bits(ty));
                        let extendop = match &mem {
                            &MemArg::RegScaled(..) => None,
                            &MemArg::RegScaledExtended(_, _, _, op) => Some(op),
@@ -746,6 +745,7 @@ impl MachInstEmit for Inst {
                    &MemArg::SPOffset(..)
                    | &MemArg::FPOffset(..)
                    | &MemArg::NominalSPOffset(..) => panic!("Should not see stack-offset here!"),
+                    &MemArg::RegOffset(..) => panic!("SHould not see generic reg-offset here!"),
                }
            }

@@ -791,14 +791,14 @@ impl MachInstEmit for Inst {
                    inst.emit(sink, flags, state);
                }

-                let op = match self {
-                    &Inst::Store8 { .. } => 0b0011100000,
-                    &Inst::Store16 { .. } => 0b0111100000,
-                    &Inst::Store32 { .. } => 0b1011100000,
-                    &Inst::Store64 { .. } => 0b1111100000,
-                    &Inst::FpuStore32 { .. } => 0b1011110000,
-                    &Inst::FpuStore64 { .. } => 0b1111110000,
-                    &Inst::FpuStore128 { .. } => 0b0011110010,
+                let (op, bits) = match self {
+                    &Inst::Store8 { .. } => (0b0011100000, 8),
+                    &Inst::Store16 { .. } => (0b0111100000, 16),
+                    &Inst::Store32 { .. } => (0b1011100000, 32),
+                    &Inst::Store64 { .. } => (0b1111100000, 64),
+                    &Inst::FpuStore32 { .. } => (0b1011110000, 32),
+                    &Inst::FpuStore64 { .. } => (0b1111110000, 64),
+                    &Inst::FpuStore128 { .. } => (0b0011110010, 128),
                    _ => unreachable!(),
                };

@@ -812,6 +812,9 @@ impl MachInstEmit for Inst {
                        sink.put4(enc_ldst_simm9(op, simm9, 0b00, reg, rd));
                    }
                    &MemArg::UnsignedOffset(reg, uimm12scaled) => {
+                        if uimm12scaled.value() != 0 {
+                            assert_eq!(bits, ty_bits(uimm12scaled.scale_ty()));
+                        }
                        sink.put4(enc_ldst_uimm12(op, uimm12scaled, reg, rd));
                    }
                    &MemArg::RegReg(r1, r2) => {
@@ -843,6 +846,7 @@ impl MachInstEmit for Inst {
                    &MemArg::SPOffset(..)
                    | &MemArg::FPOffset(..)
                    | &MemArg::NominalSPOffset(..) => panic!("Should not see stack-offset here!"),
+                    &MemArg::RegOffset(..) => panic!("SHould not see generic reg-offset here!"),
                }
            }

--- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
@@ -1311,7 +1311,7 @@ fn test_aarch64_binemit() {
    insns.push((
        Inst::ULoad64 {
            rd: writable_xreg(1),
-            mem: MemArg::FPOffset(32768),
+            mem: MemArg::FPOffset(32768, I8),
            srcloc: None,
        },
        "100090D2B063308B010240F9",
@@ -1320,7 +1320,7 @@ fn test_aarch64_binemit() {
    insns.push((
        Inst::ULoad64 {
            rd: writable_xreg(1),
-            mem: MemArg::FPOffset(-32768),
+            mem: MemArg::FPOffset(-32768, I8),
            srcloc: None,
        },
        "F0FF8F92B063308B010240F9",
@@ -1329,7 +1329,7 @@ fn test_aarch64_binemit() {
    insns.push((
        Inst::ULoad64 {
            rd: writable_xreg(1),
-            mem: MemArg::FPOffset(1048576), // 2^20
+            mem: MemArg::FPOffset(1048576, I8), // 2^20
            srcloc: None,
        },
        "1002A0D2B063308B010240F9",
@@ -1338,13 +1338,43 @@ fn test_aarch64_binemit() {
    insns.push((
        Inst::ULoad64 {
            rd: writable_xreg(1),
-            mem: MemArg::FPOffset(1048576 + 1), // 2^20 + 1
+            mem: MemArg::FPOffset(1048576 + 1, I8), // 2^20 + 1
            srcloc: None,
        },
        "300080D21002A0F2B063308B010240F9",
        "movz x16, #1 ; movk x16, #16, LSL #16 ; add x16, fp, x16, UXTX ; ldr x1, [x16]",
    ));

+    insns.push((
+        Inst::ULoad64 {
+            rd: writable_xreg(1),
+            mem: MemArg::RegOffset(xreg(7), 8, I64),
+            srcloc: None,
+        },
+        "E18040F8",
+        "ldur x1, [x7, #8]",
+    ));
+
+    insns.push((
+        Inst::ULoad64 {
+            rd: writable_xreg(1),
+            mem: MemArg::RegOffset(xreg(7), 1024, I64),
+            srcloc: None,
+        },
+        "E10042F9",
+        "ldr x1, [x7, #1024]",
+    ));
+
+    insns.push((
+        Inst::ULoad64 {
+            rd: writable_xreg(1),
+            mem: MemArg::RegOffset(xreg(7), 1048576, I64),
+            srcloc: None,
+        },
+        "1002A0D2F060308B010240F9",
+        "movz x16, #16, LSL #16 ; add x16, x7, x16, UXTX ; ldr x1, [x16]",
+    ));
+
    insns.push((
        Inst::Store8 {
            rd: xreg(1),
--- a/cranelift/codegen/src/isa/aarch64/inst/imms.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/imms.rs
@@ -259,7 +259,12 @@ impl UImm12Scaled {

    /// Value after scaling.
    pub fn value(&self) -> u32 {
-        self.value as u32 * self.scale_ty.bytes()
+        self.value as u32
+    }
+
+    /// The value type which is the scaling base.
+    pub fn scale_ty(&self) -> Type {
+        self.scale_ty
    }
 }

--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -1004,6 +1004,9 @@ fn memarg_regs(memarg: &MemArg, collector: &mut RegUsageCollector) {
        &MemArg::SPOffset(..) | &MemArg::NominalSPOffset(..) => {
            collector.add_use(stack_reg());
        }
+        &MemArg::RegOffset(r, ..) => {
+            collector.add_use(r);
+        }
    }
 }

@@ -1318,6 +1321,7 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
            &mut MemArg::FPOffset(..)
            | &mut MemArg::SPOffset(..)
            | &mut MemArg::NominalSPOffset(..) => {}
+            &mut MemArg::RegOffset(ref mut r, ..) => map_use(m, r),
        };
    }

--- a/cranelift/codegen/src/isa/aarch64/lower.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower.rs
@@ -539,12 +539,10 @@ pub(crate) fn lower_address<C: LowerCtx<I = Inst>>(
    // TODO: support base_reg + scale * index_reg. For this, we would need to pattern-match shl or
    // mul instructions (Load/StoreComplex don't include scale factors).

-    // Handle one reg and offset that fits in immediate, if possible.
+    // Handle one reg and offset.
    if addends.len() == 1 {
        let reg = input_to_reg(ctx, addends[0], NarrowValueMode::ZeroExtend64);
-        if let Some(memarg) = MemArg::reg_maybe_offset(reg, offset as i64, elem_ty) {
-            return memarg;
-        }
+        return MemArg::RegOffset(reg, offset as i64, elem_ty);
    }

    // Handle two regs and a zero offset, if possible.
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -1335,7 +1335,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                    assert!(inputs.len() == sig.params.len());
                    assert!(outputs.len() == sig.returns.len());
                    (
-                        AArch64ABICall::from_func(sig, &extname, dist, loc),
+                        AArch64ABICall::from_func(sig, &extname, dist, loc)?,
                        &inputs[..],
                    )
                }
@@ -1344,7 +1344,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                    let sig = ctx.call_sig(insn).unwrap();
                    assert!(inputs.len() - 1 == sig.params.len());
                    assert!(outputs.len() == sig.returns.len());
-                    (AArch64ABICall::from_ptr(sig, ptr, loc, op), &inputs[1..])
+                    (AArch64ABICall::from_ptr(sig, ptr, loc, op)?, &inputs[1..])
                }
                _ => unreachable!(),
            };
--- a/cranelift/codegen/src/isa/aarch64/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/mod.rs
@@ -46,7 +46,7 @@ impl AArch64Backend {
        func: &Function,
        flags: settings::Flags,
    ) -> CodegenResult<VCode<inst::Inst>> {
-        let abi = Box::new(abi::AArch64ABIBody::new(func, flags));
+        let abi = Box::new(abi::AArch64ABIBody::new(func, flags)?);
        compile::compile::<AArch64Backend>(func, self, abi)
    }
 }
--- a/cranelift/codegen/src/isa/x64/abi.rs
+++ b/cranelift/codegen/src/isa/x64/abi.rs
@@ -184,11 +184,11 @@ impl X64ABIBody {
 impl ABIBody for X64ABIBody {
    type I = Inst;

-    fn needed_tmps(&self) -> usize {
-        0
+    fn temp_needed(&self) -> bool {
+        false
    }

-    fn init_with_tmps(&mut self, _: &[Writable<Reg>]) {}
+    fn init(&mut self, _: Option<Writable<Reg>>) {}

    fn flags(&self) -> &settings::Flags {
        &self.flags
@@ -239,8 +239,8 @@ impl ABIBody for X64ABIBody {
        }
    }

-    fn gen_retval_area_setup(&self) -> Vec<Inst> {
-        vec![]
+    fn gen_retval_area_setup(&self) -> Option<Inst> {
+        None
    }

    fn gen_copy_reg_to_retval(
--- a/cranelift/codegen/src/machinst/abi.rs
+++ b/cranelift/codegen/src/machinst/abi.rs
@@ -12,11 +12,14 @@ pub trait ABIBody {
    /// The instruction type for the ISA associated with this ABI.
    type I: VCodeInst;

-    /// How many temps are needed?
-    fn needed_tmps(&self) -> usize;
+    /// Does the ABI-body code need a temp reg? One will be provided to `init()`
+    /// as the `maybe_tmp` arg if so.
+    fn temp_needed(&self) -> bool;

-    /// Initialize, providing the requersted temps.
-    fn init_with_tmps(&mut self, tmps: &[Writable<Reg>]);
+    /// Initialize. This is called after the ABIBody is constructed because it
+    /// may be provided with a temp vreg, which can only be allocated once the
+    /// lowering context exists.
+    fn init(&mut self, maybe_tmp: Option<Writable<Reg>>);

    /// Get the settings controlling this function's compilation.
    fn flags(&self) -> &settings::Flags;
@@ -40,12 +43,12 @@ pub trait ABIBody {
    /// register.
    fn gen_copy_arg_to_reg(&self, idx: usize, into_reg: Writable<Reg>) -> Self::I;

-    /// Generate any setup instructions needed to save values to the
+    /// Generate any setup instruction needed to save values to the
    /// return-value area. This is usually used when there are multiple return
    /// values or an otherwise large return value that must be passed on the
    /// stack; typically the ABI specifies an extra hidden argument that is a
    /// pointer to that memory.
-    fn gen_retval_area_setup(&self) -> Vec<Self::I>;
+    fn gen_retval_area_setup(&self) -> Option<Self::I>;

    /// Generate an instruction which copies a source register to a return value slot.
    fn gen_copy_reg_to_retval(
--- a/cranelift/codegen/src/machinst/lower.rs
+++ b/cranelift/codegen/src/machinst/lower.rs
@@ -383,7 +383,7 @@ impl<'func, I: VCodeInst> Lower<'func, I> {
                let insn = self.vcode.abi().gen_copy_arg_to_reg(i, reg);
                self.emit(insn);
            }
-            for insn in self.vcode.abi().gen_retval_area_setup().into_iter() {
+            if let Some(insn) = self.vcode.abi().gen_retval_area_setup() {
                self.emit(insn);
            }
        }
@@ -652,11 +652,13 @@ impl<'func, I: VCodeInst> Lower<'func, I> {
    pub fn lower<B: LowerBackend<MInst = I>>(mut self, backend: &B) -> CodegenResult<VCode<I>> {
        debug!("about to lower function: {:?}", self.f);

-        // Initialize the ABI object with any temps it needs.
-        let tmps: SmallVec<[Writable<Reg>; 4]> = (0..self.vcode.abi().needed_tmps())
-            .map(|_| self.alloc_tmp(RegClass::I64, I64))
-            .collect();
-        self.vcode.abi().init_with_tmps(&tmps[..]);
+        // Initialize the ABI object, giving it a temp if requested.
+        let maybe_tmp = if self.vcode.abi().temp_needed() {
+            Some(self.alloc_tmp(RegClass::I64, I64))
+        } else {
+            None
+        };
+        self.vcode.abi().init(maybe_tmp);

        // Get the pinned reg here (we only parameterize this function on `B`,
        // not the whole `Lower` impl).
--- a/cranelift/filetests/filetests/vcode/aarch64/stack-limit.clif
+++ b/cranelift/filetests/filetests/vcode/aarch64/stack-limit.clif
@@ -64,8 +64,8 @@ block0(v0: i64):

 ; check:      stp fp, lr, [sp, #-16]!
 ; nextln:     mov fp, sp
-; nextln:     ldr x16, [x0]
-; nextln:     ldr x16, [x16, #4]
+; nextln:     ldur x16, [x0]
+; nextln:     ldur x16, [x16, #4]
 ; nextln:     subs xzr, sp, x16
 ; nextln:     b.hs 8
 ; nextln:     udf
@@ -128,8 +128,8 @@ block0(v0: i64):

 ; check:      stp fp, lr, [sp, #-16]!
 ; nextln:     mov fp, sp
-; nextln:     ldr x16, [x0]
-; nextln:     ldr x16, [x16, #4]
+; nextln:     ldur x16, [x0]
+; nextln:     ldur x16, [x16, #4]
 ; nextln:     add x16, x16, #32
 ; nextln:     subs xzr, sp, x16
 ; nextln:     b.hs 8
@@ -151,8 +151,8 @@ block0(v0: i64):

 ; check:      stp fp, lr, [sp, #-16]!
 ; nextln:     mov fp, sp
-; nextln:     ldr x16, [x0]
-; nextln:     ldr x16, [x16, #4]
+; nextln:     ldur x16, [x0]
+; nextln:     ldur x16, [x16, #4]
 ; nextln:     subs xzr, sp, x16
 ; nextln:     b.hs 8
 ; nextln:     udf
@@ -179,9 +179,7 @@ block0(v0: i64):

 ; check:      stp fp, lr, [sp, #-16]!
 ; nextln:     mov fp, sp
-; nextln:     movz x16, #6784
-; nextln:     movk x16, #6, LSL #16
-; nextln:     ldr x16, [x0, x16]
+; nextln:     movz x16, #6784 ; movk x16, #6, LSL #16 ; add x16, x0, x16, UXTX ; ldr x16, [x16]
 ; nextln:     add x16, x16, #32
 ; nextln:     subs xzr, sp, x16
 ; nextln:     b.hs 8