Implement inline stack probes for AArch64 (#5353)

* Turn off probestack by default in Cranelift

The probestack feature is not implemented for the aarch64 and s390x
backends, and its on-by-default status forces those backends to carry
stub implementations. Turning probestack off by default instead lets
the aarch64 and s390x backends panic with a clear error message rather
than provide a false sense of security. Once the option is implemented
for all backends it may be reasonable to turn it back on by default.
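
For embedders that still want probes, here is a minimal sketch (not
part of this change) of opting back in through Cranelift's shared
settings builder, assuming the existing `enable_probestack` flag:

    use cranelift_codegen::settings::{self, Configurable};

    // Sketch: opt back into stack probes now that the flag defaults to
    // off. A backend without support will report the failure loudly at
    // compile time instead of silently skipping the probes.
    fn flags_with_probestack() -> settings::Flags {
        let mut builder = settings::builder();
        builder
            .enable("enable_probestack")
            .expect("enable_probestack is a boolean flag");
        settings::Flags::new(builder)
    }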

* aarch64: Improve codegen for AMode fallback

Previously the final fallback for finalizing an `AMode` generated both
a constant-loading instruction and an `add` of the base register into
the same temporary. This commit improves the codegen by dropping the
explicit `add` and folding that addition into the finalized `AMode`
instead. The `extendop` used changes as a result, but both registers
are 64-bit, so the extending operation has no effect.
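
As an illustration (taken from the updated binemit test expectations
further down in this change), a `ULoad64` through `fp` with a large
offset now finalizes to

    movz x16, #32768 ; ldr x1, [fp, x16, SXTX]

instead of the previous three-instruction sequence

    movz x16, #32768 ; add x16, fp, x16, UXTX ; ldr x1, [x16]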

* aarch64: Implement inline stack probes

This commit implements inline stack probes for the aarch64 backend in
Cranelift. The support is modeled after the x64 backend, where probes
are unrolled up to a particular threshold and a loop is generated
beyond it. The instructions emitted are similar in spirit to x64's,
except that the unrolled sequence does not modify the stack pointer,
which avoids having to adjust it back up afterwards.
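
As a rough sketch of the unroll-versus-loop decision (the threshold
and rounding mirror the new aarch64 code below; the 4 KiB guard size
is only an assumed example value):

    // Sketch of the decision in gen_inline_probestack, not the code
    // itself: small frames get unrolled stores, larger frames get the
    // probe loop.
    fn uses_probe_loop(frame_size: u32, guard_size: u32) -> bool {
        const PROBE_MAX_UNROLL: u32 = 3;
        // Round `frame_size` up to a multiple of `guard_size`.
        let aligned = (frame_size + guard_size - 1) / guard_size * guard_size;
        let probe_count = aligned / guard_size;
        probe_count > PROBE_MAX_UNROLL
    }

For example, with a 4 KiB guard region a 10 KiB frame needs 3 probes
and is unrolled, while a 64 KiB frame needs 16 and emits the loop.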

* Enable inline probestack for AArch64 and Riscv64

This commit enables inline probestacks for the AArch64 and Riscv64
architectures in the same manner as x86_64. Some additional tests were
also added, since on Unix platforms we should now be guaranteed that
Rust's stack overflow message is printed as well.
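
The tests themselves are not shown in this excerpt; as a very rough
sketch of the kind of assertion involved (the helper name and the
subprocess strategy are illustrative, not the actual Wasmtime test
code):

    use std::process::Command;

    // Hypothetical check: run a child binary that is expected to
    // overflow the host stack and confirm Rust's standard diagnostic
    // reached stderr before the process aborted.
    fn assert_stack_overflow_message(child_binary: &str) {
        let output = Command::new(child_binary)
            .output()
            .expect("failed to spawn child process");
        assert!(!output.status.success(), "child should have aborted");
        let stderr = String::from_utf8_lossy(&output.stderr);
        assert!(
            stderr.contains("has overflowed its stack"),
            "expected Rust's stack overflow message, got: {stderr}"
        );
    }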

* Enable probestack for aarch64 in cranelift-fuzzgen

* Address review comments

* Remove implicit stack overflow traps from x64 backend

This commit removes the implicit `StackOverflow` traps the x64 backend
inserted for stack-based operations. These were historically required
when stack overflow was detected via page faults, but Wasmtime no
longer relies on that mechanism since it isn't suitable for wasm
modules that call host functions. Additionally, no other backend adds
implicit trap codes of this form, so this change synchronizes the
behavior of all the backends.

This also fixes a test added earlier for aarch64 so that it properly
aborts the process instead of being accidentally caught by Wasmtime.

* Fix a style issue

Author: Alex Crichton
Date: 2022-11-30 12:30:00 -06:00
Committed by: GitHub
Parent: 8bc7550211
Commit: 830885383f
22 changed files with 367 additions and 177 deletions


@@ -631,14 +631,56 @@ impl ABIMachineSpec for AArch64MachineDeps {
         insts
     }
-    fn gen_probestack(_: u32) -> SmallInstVec<Self::I> {
-        // TODO: implement if we ever require stack probes on an AArch64 host
-        // (unlikely unless Lucet is ported)
-        smallvec![]
+    fn gen_probestack(_insts: &mut SmallInstVec<Self::I>, _: u32) {
+        unimplemented!("Stack probing is unimplemented on AArch64");
     }
-    fn gen_inline_probestack(_frame_size: u32, _guard_size: u32) -> SmallInstVec<Self::I> {
-        unimplemented!("Inline stack probing is unimplemented on AArch64");
+    fn gen_inline_probestack(insts: &mut SmallInstVec<Self::I>, frame_size: u32, guard_size: u32) {
+        // The stack probe loop currently takes 6 instructions and each inline
+        // probe takes 2 (ish, these numbers sort of depend on the constants).
+        // Set this to 3 to keep the max size of the probe to 6 instructions.
+        const PROBE_MAX_UNROLL: u32 = 3;
+        let probe_count = align_to(frame_size, guard_size) / guard_size;
+        if probe_count <= PROBE_MAX_UNROLL {
+            // When manually unrolling stick an instruction that stores 0 at a
+            // constant offset relative to the stack pointer. This will
+            // turn into something like `movn tmp, #n ; stur xzr [sp, tmp]`.
+            //
+            // Note that this may actually store beyond the stack size for the
+            // last item but that's ok since it's unused stack space and if
+            // that faults accidentally we're so close to faulting it shouldn't
+            // make too much difference to fault there.
+            insts.reserve(probe_count as usize);
+            for i in 0..probe_count {
+                let offset = (guard_size * (i + 1)) as i64;
+                insts.push(Self::gen_store_stack(
+                    StackAMode::SPOffset(-offset, I8),
+                    zero_reg(),
+                    I32,
+                ));
+            }
+        } else {
+            // The non-unrolled version uses two temporary registers. The
+            // `start` contains the current offset from sp and counts downwards
+            // during the loop by increments of `guard_size`. The `end` is
+            // the size of the frame and where we stop.
+            //
+            // Note that this emission is all post-regalloc so it should be ok
+            // to use the temporary registers here as input/output as the loop
+            // itself is not allowed to use the registers.
+            let start = writable_spilltmp_reg();
+            let end = writable_tmp2_reg();
+            insts.extend(Inst::load_constant(start, 0));
+            insts.extend(Inst::load_constant(end, frame_size.into()));
+            insts.push(Inst::StackProbeLoop {
+                start,
+                end: end.to_reg(),
+                step: Imm12::maybe_from_u64(guard_size.into()).unwrap(),
+            });
+        }
+    }
     }
     // Returns stack bytes used as well as instructions. Does not adjust


@@ -935,7 +935,16 @@
     ;; A dummy use, useful to keep a value alive.
     (DummyUse
-      (reg Reg))))
+      (reg Reg))
+    ;; Emits an inline stack probe loop.
+    ;;
+    ;; Note that this is emitted post-regalloc so `start` and `end` can be
+    ;; temporary registers such as the spilltmp and tmp2 registers. This also
+    ;; means that the internal codegen can't use these registers.
+    (StackProbeLoop (start WritableReg)
+                    (end Reg)
+                    (step Imm12))))
 ;; An ALU operation. This can be paired with several instruction formats
 ;; below (see `Inst`) in any combination.
@@ -1134,8 +1143,7 @@
     ;; [crate::isa::aarch64::abi](the ABI module) for more details.
     (NominalSPOffset
       (off i64)
-      (ty Type))
-))
+      (ty Type))))
 (type PairAMode extern (enum))
 (type FPUOpRI extern (enum))
@@ -2242,7 +2250,7 @@
   (let ((dst WritableReg (temp_writable_reg $I64)))
     (ConsumesFlags.ConsumesFlagsReturnsResultWithProducer (MInst.CSet dst cond) dst)))
-;; Helper for constructing `csetm` instructions.
+;; Helper for constructing `csetm` instructions.
 (decl csetm (Cond) ConsumesFlags)
 (rule (csetm cond)
   (let ((dst WritableReg (temp_writable_reg $I64)))


@@ -63,21 +63,14 @@ pub fn mem_finalize(
                 (smallvec![], mem)
             } else {
                 let tmp = writable_spilltmp_reg();
-                let mut const_insts = Inst::load_constant(tmp, off as u64);
-                // N.B.: we must use AluRRRExtend because AluRRR uses the "shifted register" form
-                // (AluRRRShift) instead, which interprets register 31 as the zero reg, not SP. SP
-                // is a valid base (for SPOffset) which we must handle here.
-                // Also, SP needs to be the first arg, not second.
-                let add_inst = Inst::AluRRRExtend {
-                    alu_op: ALUOp::Add,
-                    size: OperandSize::Size64,
-                    rd: tmp,
-                    rn: basereg,
-                    rm: tmp.to_reg(),
-                    extendop: ExtendOp::UXTX,
-                };
-                const_insts.push(add_inst);
-                (const_insts, AMode::reg(tmp.to_reg()))
+                (
+                    Inst::load_constant(tmp, off as u64),
+                    AMode::RegExtended {
+                        rn: basereg,
+                        rm: tmp.to_reg(),
+                        extendop: ExtendOp::SXTX,
+                    },
+                )
             }
         }
@@ -3424,6 +3417,72 @@ impl MachInstEmit for Inst {
             }
             &Inst::DummyUse { .. } => {}
+            &Inst::StackProbeLoop { start, end, step } => {
+                assert!(emit_info.0.enable_probestack());
+                let start = allocs.next_writable(start);
+                let end = allocs.next(end);
+                // The loop generated here uses `start` as a counter register to
+                // count backwards until negating it exceeds `end`. In other
+                // words `start` is an offset from `sp` we're testing where
+                // `end` is the max size we need to test. The loop looks like:
+                //
+                //   loop_start:
+                //       sub start, start, #step
+                //       stur xzr, [sp, start]
+                //       cmn start, end
+                //       br.gt loop_start
+                //   loop_end:
+                //
+                // Note that this loop cannot use the spilltmp and tmp2
+                // registers as those are currently used as the input to this
+                // loop when generating the instruction. This means that some
+                // more flavorful address modes and lowerings need to be
+                // avoided.
+                //
+                // Perhaps someone more clever than I can figure out how to use
+                // `subs` or the like and skip the `cmn`, but I can't figure it
+                // out at this time.
+                let loop_start = sink.get_label();
+                sink.bind_label(loop_start);
+                Inst::AluRRImm12 {
+                    alu_op: ALUOp::Sub,
+                    size: OperandSize::Size64,
+                    rd: start,
+                    rn: start.to_reg(),
+                    imm12: step,
+                }
+                .emit(&[], sink, emit_info, state);
+                Inst::Store32 {
+                    rd: regs::zero_reg(),
+                    mem: AMode::RegReg {
+                        rn: regs::stack_reg(),
+                        rm: start.to_reg(),
+                    },
+                    flags: MemFlags::trusted(),
+                }
+                .emit(&[], sink, emit_info, state);
+                Inst::AluRRR {
+                    alu_op: ALUOp::AddS,
+                    size: OperandSize::Size64,
+                    rd: regs::writable_zero_reg(),
+                    rn: start.to_reg(),
+                    rm: end,
+                }
+                .emit(&[], sink, emit_info, state);
+                let loop_end = sink.get_label();
+                Inst::CondBr {
+                    taken: BranchTarget::Label(loop_start),
+                    not_taken: BranchTarget::Label(loop_end),
+                    kind: CondBrKind::Cond(Cond::Gt),
+                }
+                .emit(&[], sink, emit_info, state);
+                sink.bind_label(loop_end);
+            }
         }
         let end_off = sink.cur_offset();


@@ -1789,8 +1789,8 @@ fn test_aarch64_binemit() {
             mem: AMode::FPOffset { off: 32768, ty: I8 },
             flags: MemFlags::trusted(),
         },
-        "100090D2B063308B010240F9",
-        "movz x16, #32768 ; add x16, fp, x16, UXTX ; ldr x1, [x16]",
+        "100090D2A1EB70F8",
+        "movz x16, #32768 ; ldr x1, [fp, x16, SXTX]",
     ));
     insns.push((
         Inst::ULoad64 {
@@ -1801,8 +1801,8 @@
             },
             flags: MemFlags::trusted(),
         },
-        "F0FF8F92B063308B010240F9",
-        "movn x16, #32767 ; add x16, fp, x16, UXTX ; ldr x1, [x16]",
+        "F0FF8F92A1EB70F8",
+        "movn x16, #32767 ; ldr x1, [fp, x16, SXTX]",
     ));
     insns.push((
         Inst::ULoad64 {
@@ -1813,8 +1813,8 @@
             }, // 2^20
             flags: MemFlags::trusted(),
         },
-        "1002A0D2B063308B010240F9",
-        "movz x16, #16, LSL #16 ; add x16, fp, x16, UXTX ; ldr x1, [x16]",
+        "1002A0D2A1EB70F8",
+        "movz x16, #16, LSL #16 ; ldr x1, [fp, x16, SXTX]",
     ));
     insns.push((
         Inst::ULoad64 {
@@ -1825,8 +1825,8 @@
            }, // 2^20 + 1
            flags: MemFlags::trusted(),
        },
-        "300080521002A072B063308B010240F9",
-        "movz w16, #1 ; movk w16, w16, #16, LSL #16 ; add x16, fp, x16, UXTX ; ldr x1, [x16]",
+        "300080521002A072A1EB70F8",
+        "movz w16, #1 ; movk w16, w16, #16, LSL #16 ; ldr x1, [fp, x16, SXTX]",
     ));
     insns.push((
@@ -1867,8 +1867,8 @@
             },
             flags: MemFlags::trusted(),
         },
-        "1002A0D2F060308B010240F9",
-        "movz x16, #16, LSL #16 ; add x16, x7, x16, UXTX ; ldr x1, [x16]",
+        "1002A0D2E1E870F8",
+        "movz x16, #16, LSL #16 ; ldr x1, [x7, x16, SXTX]",
     ));
     insns.push((


@@ -1103,6 +1103,10 @@ fn aarch64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut Operan
         &Inst::DummyUse { reg } => {
             collector.reg_use(reg);
         }
+        &Inst::StackProbeLoop { start, end, .. } => {
+            collector.reg_early_def(start);
+            collector.reg_use(end);
+        }
     }
 }
@@ -2887,6 +2891,12 @@ impl Inst {
                 let reg = pretty_print_reg(reg, allocs);
                 format!("dummy_use {}", reg)
             }
+            &Inst::StackProbeLoop { start, end, step } => {
+                let start = pretty_print_reg(start.to_reg(), allocs);
+                let end = pretty_print_reg(end, allocs);
+                let step = step.pretty_print(0, allocs);
+                format!("stack_probe_loop {start}, {end}, {step}")
+            }
         }
     }
 }