From 0b13d8c848aa40e2b643e8994e796ca47433a8ef Mon Sep 17 00:00:00 2001
From: Benjamin Bouvier
Date: Thu, 23 Apr 2020 19:23:16 +0200
Subject: [PATCH] aarch64: copy SP whenever it's involved in an address lowering with an explicit add;

---
 build.rs                                      |  6 +-
 cranelift/codegen/src/isa/aarch64/abi.rs      | 71 +++++++++++++++++--
 .../codegen/src/isa/aarch64/inst/emit.rs      | 34 +++++++--
 cranelift/codegen/src/isa/aarch64/inst/mod.rs |  3 +
 cranelift/codegen/src/isa/aarch64/lower.rs    | 20 +++++-
 cranelift/codegen/src/machinst/abi.rs         |  9 ++-
 6 files changed, 124 insertions(+), 19 deletions(-)

diff --git a/build.rs b/build.rs
index f94f0d3980..98a31a2c10 100644
--- a/build.rs
+++ b/build.rs
@@ -237,11 +237,7 @@ fn should_panic(testsuite: &str, testname: &str) -> bool {
     }
     match (testsuite, testname) {
         // FIXME(#1521)
-        ("misc_testsuite", "func_400_params")
-        | ("simd", _)
-        | ("multi_value", "call")
-        | ("spec_testsuite", "call") => true,
-
+        ("simd", _) => true,
         _ => false,
     }
 }
diff --git a/cranelift/codegen/src/isa/aarch64/abi.rs b/cranelift/codegen/src/isa/aarch64/abi.rs
index 1b314037f7..a8fc48d794 100644
--- a/cranelift/codegen/src/isa/aarch64/abi.rs
+++ b/cranelift/codegen/src/isa/aarch64/abi.rs
@@ -295,6 +295,10 @@ fn load_stack_from_fp(fp_offset: i64, into_reg: Writable<Reg>, ty: Type) -> Inst
 }
 
 fn store_stack(mem: MemArg, from_reg: Reg, ty: Type) -> Inst {
+    debug_assert!(match &mem {
+        MemArg::SPOffset(off) => SImm9::maybe_from_i64(*off).is_some(),
+        _ => true,
+    });
     match ty {
         types::B1
         | types::B8
@@ -323,6 +327,49 @@ fn store_stack(mem: MemArg, from_reg: Reg, ty: Type) -> Inst {
     }
 }
 
+fn store_stack_fp(fp_offset: i64, from_reg: Reg, ty: Type) -> Inst {
+    store_stack(MemArg::FPOffset(fp_offset), from_reg, ty)
+}
+
+fn store_stack_sp(
+    sp_offset: i64,
+    from_reg: Reg,
+    ty: Type,
+    tmp1: Writable<Reg>,
+    tmp2: Writable<Reg>,
+) -> Vec<Inst> {
+    if SImm9::maybe_from_i64(sp_offset).is_some() {
+        vec![store_stack(MemArg::SPOffset(sp_offset), from_reg, ty)]
+    } else {
+        // mem_finalize will try to generate an add, but in an addition, x31 is the zero register,
+        // not sp! So we have to synthesize the full add here.
+        let mut result = Vec::new();
+        // tmp1 := sp
+        result.push(Inst::Mov {
+            rd: tmp1,
+            rm: stack_reg(),
+        });
+        // tmp2 := offset
+        for inst in Inst::load_constant(tmp2, sp_offset as u64) {
+            result.push(inst);
+        }
+        // tmp1 := add tmp1, tmp2
+        result.push(Inst::AluRRR {
+            alu_op: ALUOp::Add64,
+            rd: tmp1,
+            rn: tmp1.to_reg(),
+            rm: tmp2.to_reg(),
+        });
+        // Actual store.
+        result.push(store_stack(
+            MemArg::Unscaled(tmp1.to_reg(), SImm9::maybe_from_i64(0).unwrap()),
+            from_reg,
+            ty,
+        ));
+        result
+    }
+}
+
 fn is_callee_save(call_conv: isa::CallConv, r: RealReg) -> bool {
     if call_conv.extends_baldrdash() {
         match r.get_class() {
@@ -523,8 +570,8 @@ impl ABIBody for AArch64ABIBody {
             }
             _ => {}
         };
-        ret.push(store_stack(
-            MemArg::FPOffset(off + self.frame_size()),
+        ret.push(store_stack_fp(
+            off + self.frame_size(),
             from_reg.to_reg(),
             ty,
         ))
@@ -566,7 +613,7 @@ impl ABIBody for AArch64ABIBody {
         // Offset from beginning of stackslot area, which is at FP - stackslots_size.
         let stack_off = self.stackslots[slot.as_u32() as usize] as i64;
         let fp_off: i64 = -(self.stackslots_size as i64) + stack_off + (offset as i64);
-        store_stack(MemArg::FPOffset(fp_off), from_reg, ty)
+        store_stack_fp(fp_off, from_reg, ty)
     }
 
     fn stackslot_addr(&self, slot: StackSlot, offset: u32, into_reg: Writable<Reg>) -> Inst {
@@ -596,7 +643,7 @@ impl ABIBody for AArch64ABIBody {
         let islot = slot.get() as i64;
         let ty_size = self.get_spillslot_size(from_reg.get_class(), ty) * 8;
         let fp_off: i64 = -(self.stackslots_size as i64) - (8 * islot) - ty_size as i64;
-        store_stack(MemArg::FPOffset(fp_off), from_reg, ty)
+        store_stack_fp(fp_off, from_reg, ty)
     }
 
     fn gen_prologue(&mut self) -> Vec<Inst> {
@@ -927,10 +974,20 @@ impl ABICall for AArch64ABICall {
         adjust_stack(self.sig.stack_arg_space as u64, /* is_sub = */ false)
     }
 
-    fn gen_copy_reg_to_arg(&self, idx: usize, from_reg: Reg) -> Inst {
+    fn gen_copy_reg_to_arg(
+        &self,
+        idx: usize,
+        from_reg: Reg,
+        tmp1: Writable<Reg>,
+        tmp2: Writable<Reg>,
+    ) -> Vec<Inst> {
         match &self.sig.args[idx] {
-            &ABIArg::Reg(reg, ty) => Inst::gen_move(Writable::from_reg(reg.to_reg()), from_reg, ty),
-            &ABIArg::Stack(off, ty) => store_stack(MemArg::SPOffset(off), from_reg, ty),
+            &ABIArg::Reg(reg, ty) => vec![Inst::gen_move(
+                Writable::from_reg(reg.to_reg()),
+                from_reg,
+                ty,
+            )],
+            &ABIArg::Stack(off, ty) => store_stack_sp(off, from_reg, ty, tmp1, tmp2),
         }
     }
 
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
index b137ad3017..d012521ecd 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -35,6 +35,14 @@ pub fn mem_finalize(insn_off: CodeOffset, mem: &MemArg) -> (Vec<Inst>, MemArg) {
             let mem = MemArg::Unscaled(basereg, simm9);
             (vec![], mem)
         } else {
+            // In an addition, x31 is the zero register, not sp; we have only one temporary
+            // so we can't do the proper add here.
+            debug_assert_ne!(
+                basereg,
+                stack_reg(),
+                "should have diverted SP before mem_finalize"
+            );
+
             let tmp = writable_spilltmp_reg();
             let mut const_insts = Inst::load_constant(tmp, off as u64);
             let add_inst = Inst::AluRRR {
@@ -363,7 +371,11 @@ impl MachInstEmit for Inst {
                     ALUOp::Lsl32 | ALUOp::Lsl64 => 0b001000,
                     _ => 0b000000,
                 };
-                assert_ne!(writable_stack_reg(), rd);
+                debug_assert_ne!(writable_stack_reg(), rd);
+                // The stack pointer is the zero register in this context, so this might be an
+                // indication that something is wrong.
+                debug_assert_ne!(stack_reg(), rn);
+                debug_assert_ne!(stack_reg(), rm);
                 sink.put4(enc_arith_rrr(top11, bit15_10, rd, rn, rm));
             }
             &Inst::AluRRRR {
@@ -818,11 +830,25 @@ impl MachInstEmit for Inst {
             &Inst::Mov { rd, rm } => {
                 assert!(rd.to_reg().get_class() == rm.get_class());
                 assert!(rm.get_class() == RegClass::I64);
+
                 // MOV to SP is interpreted as MOV to XZR instead. And our codegen
                 // should never MOV to XZR.
-                assert!(machreg_to_gpr(rd.to_reg()) != 31);
-                // Encoded as ORR rd, rm, zero.
-                sink.put4(enc_arith_rrr(0b10101010_000, 0b000_000, rd, zero_reg(), rm));
+                assert!(rd.to_reg() != stack_reg());
+
+                if rm == stack_reg() {
+                    // We can't use ORR here, so use an `add rd, sp, #0` instead.
+                    let imm12 = Imm12::maybe_from_u64(0).unwrap();
+                    sink.put4(enc_arith_rr_imm12(
+                        0b100_10001,
+                        imm12.shift_bits(),
+                        imm12.imm_bits(),
+                        rm,
+                        rd,
+                    ));
+                } else {
+                    // Encoded as ORR rd, rm, zero.
+                    sink.put4(enc_arith_rrr(0b10101010_000, 0b000_000, rd, zero_reg(), rm));
+                }
             }
             &Inst::Mov32 { rd, rm } => {
                 // MOV to SP is interpreted as MOV to XZR instead. And our codegen
diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
index 101e217ae7..52330e66e6 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -1719,6 +1719,9 @@ impl MachInst for Inst {
 
     fn is_move(&self) -> Option<(Writable<Reg>, Reg)> {
         match self {
+            // TODO a regalloc assertion is triggered if we don't have this, see also #1586 on
+            // wasmtime, as well as https://github.com/bytecodealliance/regalloc.rs/issues/52.
+            &Inst::Mov { rm, .. } if rm == stack_reg() => None,
             &Inst::Mov { rd, rm } => Some((rd, rm)),
             &Inst::FpuMove64 { rd, rn } => Some((rd, rn)),
             _ => None,
diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs
index e9e72dd6d5..66e708249d 100644
--- a/cranelift/codegen/src/isa/aarch64/lower.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower.rs
@@ -595,6 +595,20 @@ fn lower_address<C: LowerCtx<I = Inst>>(
     // Add each addend to the address.
     for addend in addends {
         let reg = input_to_reg(ctx, *addend, NarrowValueMode::ZeroExtend64);
+
+        // In an addition, the stack register is the zero register, so divert it to another
+        // register just before doing the actual add.
+        let reg = if reg == stack_reg() {
+            let tmp = ctx.tmp(RegClass::I64, I64);
+            ctx.emit(Inst::Mov {
+                rd: tmp,
+                rm: stack_reg(),
+            });
+            tmp.to_reg()
+        } else {
+            reg
+        };
+
         ctx.emit(Inst::AluRRR {
             alu_op: ALUOp::Add64,
             rd: addr.clone(),
@@ -1968,9 +1982,13 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRInst) {
                 ctx.emit(inst);
             }
             assert!(inputs.len() == abi.num_args());
+            let tmp1 = ctx.tmp(RegClass::I64, I64);
+            let tmp2 = ctx.tmp(RegClass::I64, I64);
             for (i, input) in inputs.iter().enumerate() {
                 let arg_reg = input_to_reg(ctx, *input, NarrowValueMode::None);
-                ctx.emit(abi.gen_copy_reg_to_arg(i, arg_reg));
+                for inst in abi.gen_copy_reg_to_arg(i, arg_reg, tmp1, tmp2) {
+                    ctx.emit(inst);
+                }
             }
             for inst in abi.gen_call().into_iter() {
                 ctx.emit(inst);
diff --git a/cranelift/codegen/src/machinst/abi.rs b/cranelift/codegen/src/machinst/abi.rs
index e22bc0bdc4..b5bbbb9cb5 100644
--- a/cranelift/codegen/src/machinst/abi.rs
+++ b/cranelift/codegen/src/machinst/abi.rs
@@ -132,9 +132,14 @@ pub trait ABICall {
     /// Get the number of arguments expected.
     fn num_args(&self) -> usize;
 
-    /// Save the clobbered registers.
     /// Copy an argument value from a source register, prior to the call.
-    fn gen_copy_reg_to_arg(&self, idx: usize, from_reg: Reg) -> Self::I;
+    fn gen_copy_reg_to_arg(
+        &self,
+        idx: usize,
+        from_reg: Reg,
+        tmp1: Writable<Reg>,
+        tmp2: Writable<Reg>,
+    ) -> Vec<Self::I>;
 
     /// Copy a return value into a destination register, after the call returns.
     fn gen_copy_retval_to_reg(&self, idx: usize, into_reg: Writable<Reg>) -> Self::I;
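
Reviewer's note on the encoding rule behind this patch, separated from the Cranelift plumbing: in base A64 integer instructions, register number 31 names SP in the immediate form of ADD and in load/store base operands, but names XZR in the three-register ALU forms and in ORR, which is what the plain `mov xd, xn` alias expands to. The only register-to-register "move" that still sees SP is `add xd, sp, #0`, which is why the emitter above switches to that form and why address lowering first copies SP into a scratch register. The sketch below is a minimal illustration of that rule, not Cranelift code: the `Reg`, `Inst`, `SP`, and `divert_sp` names, and the choice of x16 as a temporary, are made up for this example.

// Illustration only: simplified stand-ins, not the real Cranelift definitions.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
struct Reg(u8);

// Register number 31: SP in add-immediate and memory operands, XZR in
// three-register ALU operands.
const SP: Reg = Reg(31);

#[derive(Debug)]
enum Inst {
    // `add rd, rn, #imm12`: rn = 31 is read as SP here.
    AddImm { rd: Reg, rn: Reg, imm12: u16 },
    // `add rd, rn, rm`: rn/rm = 31 are read as XZR here, never SP.
    AddRRR { rd: Reg, rn: Reg, rm: Reg },
}

// If `reg` is SP, emit `add tmp, sp, #0` (the canonical `mov tmp, sp`) and use
// `tmp` instead, so a later three-register add never sees register 31.
fn divert_sp(reg: Reg, tmp: Reg, out: &mut Vec<Inst>) -> Reg {
    if reg == SP {
        out.push(Inst::AddImm { rd: tmp, rn: SP, imm12: 0 });
        tmp
    } else {
        reg
    }
}

fn main() {
    let mut insts = Vec::new();
    let x0 = Reg(0);
    let x16 = Reg(16); // stand-in for a temp handed out by the register allocator
    // Lowering `addr = sp + x0`: divert SP first, then do the register-register add.
    let base = divert_sp(SP, x16, &mut insts);
    insts.push(Inst::AddRRR { rd: Reg(1), rn: base, rm: x0 });
    for inst in &insts {
        println!("{:?}", inst);
    }
}

Compiled with rustc, this prints the SP copy followed by the register-register add. Skipping the `divert_sp` step is exactly the situation the new `debug_assert_ne!` in `mem_finalize` and the emit-time asserts on `AluRRR` operands are intended to catch.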