cranelift: Add inline stack probing for x64 (#4747)
* cranelift: Add inline stack probe for x64
* cranelift: Clean up comments

Thanks @jameysharp!
@@ -284,6 +284,18 @@ pub(crate) fn define() -> SettingGroup {
        12,
    );

    settings.add_enum(
        "probestack_strategy",
        "Controls what kinds of stack probes are emitted.",
        r#"
            Supported strategies:

            - `outline`: Always emits stack probes as calls to a probe stack function.
            - `inline`: Always emits inline stack probes.
        "#,
        vec!["outline", "inline"],
    );

    // Jump table options.

    settings.add_bool(
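For context, these flags are driven through Cranelift's ordinary settings builder; a minimal sketch of turning the new strategy on (illustrative usage of the standard `cranelift_codegen::settings` API, not part of the patch):

```rust
use cranelift_codegen::settings::{self, Configurable};

/// Build a `Flags` value with inline stack probes enabled -- a usage sketch.
fn inline_probe_flags() -> settings::Flags {
    let mut builder = settings::builder();
    builder.set("enable_probestack", "true").unwrap();
    builder.set("probestack_strategy", "inline").unwrap();
    // Guard pages of 2^12 = 4096 bytes, the default.
    builder.set("probestack_size_log2", "12").unwrap();
    settings::Flags::new(builder)
}
```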
@@ -610,6 +610,10 @@ impl ABIMachineSpec for AArch64MachineDeps {
        smallvec![]
    }

    fn gen_inline_probestack(_frame_size: u32, _guard_size: u32) -> SmallInstVec<Self::I> {
        unimplemented!("Inline stack probing is unimplemented on AArch64");
    }

    // Returns stack bytes used as well as instructions. Does not adjust
    // nominal SP offset; abi generic code will do that.
    fn gen_clobber_save(
@@ -574,6 +574,10 @@ impl ABIMachineSpec for S390xMachineDeps {
        smallvec![]
    }

    fn gen_inline_probestack(_frame_size: u32, _guard_size: u32) -> SmallInstVec<Self::I> {
        unimplemented!("Inline stack probing is unimplemented on S390x");
    }

    // Returns stack bytes used as well as instructions. Does not adjust
    // nominal SP offset; abi generic code will do that.
    fn gen_clobber_save(
@@ -29,6 +29,42 @@ pub(crate) type X64Caller = Caller<X64ABIMachineSpec>;
/// Implementation of ABI primitives for x64.
pub struct X64ABIMachineSpec;

impl X64ABIMachineSpec {
    fn gen_probestack_unroll(guard_size: u32, probe_count: u32) -> SmallInstVec<Inst> {
        let mut insts = SmallVec::with_capacity(probe_count as usize);
        for i in 0..probe_count {
            let offset = (guard_size * (i + 1)) as i64;

            // TODO: It would be nice if we could store the imm 0, but we don't have insts for
            // that, so store the stack pointer instead. Any register will do, since the stack
            // is undefined at this point.
            insts.push(Self::gen_store_stack(
                StackAMode::SPOffset(-offset, I8),
                regs::rsp(),
                I32,
            ));
        }
        insts
    }

    fn gen_probestack_loop(frame_size: u32, guard_size: u32) -> SmallInstVec<Inst> {
        // We have to use a caller-saved register since clobbering only happens
        // after stack probing.
        //
        // R11 is caller-saved on both Fastcall and SystemV, and not used for argument
        // passing, so it's pretty much free. It is also not used by the stack limit mechanism.
        let tmp = regs::r11();
        debug_assert!({
            let real_reg = tmp.to_real_reg().unwrap();
            !is_callee_save_systemv(real_reg, false) && !is_callee_save_fastcall(real_reg, false)
        });

        smallvec![Inst::StackProbeLoop {
            tmp: Writable::from_reg(tmp),
            frame_size,
            guard_size,
        }]
    }
}

impl IsaFlags for x64_settings::Flags {}

impl ABIMachineSpec for X64ABIMachineSpec {
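To make the unrolled sequence concrete: each probe stores at `guard_size * (i + 1)` bytes below the incoming stack pointer. A quick illustrative check of the offsets (plain Rust, not Cranelift code):

```rust
fn main() {
    let guard_size: u32 = 4096;
    let probe_count: u32 = 3;
    // Mirrors the `guard_size * (i + 1)` computation in gen_probestack_unroll.
    let offsets: Vec<i64> = (0..probe_count)
        .map(|i| -((guard_size * (i + 1)) as i64))
        .collect();
    assert_eq!(offsets, vec![-4096, -8192, -12288]);
}
```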
@@ -398,6 +434,23 @@ impl ABIMachineSpec for X64ABIMachineSpec {
        insts
    }

    fn gen_inline_probestack(frame_size: u32, guard_size: u32) -> SmallInstVec<Self::I> {
        // Unroll at most n consecutive probes before falling back to using a loop.
        //
        // This number was picked because the loop version is 38 bytes long. We can fit
        // 5 inline probes in that space, so unroll if it's beneficial in terms of code size.
        const PROBE_MAX_UNROLL: u32 = 5;

        // Number of probes that we need to perform
        let probe_count = align_to(frame_size, guard_size) / guard_size;

        if probe_count <= PROBE_MAX_UNROLL {
            Self::gen_probestack_unroll(guard_size, probe_count)
        } else {
            Self::gen_probestack_loop(frame_size, guard_size)
        }
    }

    fn gen_clobber_save(
        _call_conv: isa::CallConv,
        setup_frame: bool,
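Here `align_to` rounds the frame size up to the next multiple of the guard size before dividing. A sketch of that computation (hypothetical standalone helper, not Cranelift's actual definition; assumes a power-of-two `guard_size`):

```rust
/// Round `x` up to the next multiple of `align`, where `align` is a power of two.
fn align_to(x: u32, align: u32) -> u32 {
    debug_assert!(align.is_power_of_two());
    (x + align - 1) & !(align - 1)
}

fn main() {
    let guard_size = 4096;
    // A 12288-byte frame needs exactly 3 probes: unrolled (3 <= PROBE_MAX_UNROLL).
    assert_eq!(align_to(12288, guard_size) / guard_size, 3);
    // A 100000-byte frame needs 25 probes: falls back to the loop.
    assert_eq!(align_to(100000, guard_size) / guard_size, 25);
}
```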
@@ -177,6 +177,11 @@
;; popq reg
(Pop64 (dst WritableGpr))

;; Emits an inline stack probe loop.
(StackProbeLoop (tmp WritableReg)
                (frame_size u32)
                (guard_size u32))

;; =========================================
;; Floating-point operations.
@@ -1234,6 +1234,109 @@ pub(crate) fn emit(
            sink.put1(0x58 + (enc_dst & 7));
        }

        Inst::StackProbeLoop {
            tmp,
            frame_size,
            guard_size,
        } => {
            assert!(info.flags.enable_probestack());
            assert!(guard_size.is_power_of_two());

            let tmp = allocs.next_writable(*tmp);

            // Number of probes that we need to perform
            let probe_count = align_to(*frame_size, *guard_size) / guard_size;

            // The inline stack probe loop has 3 phases:
            //
            // First we compute the "guard area" bound, which is essentially the frame_size
            // aligned to guard_size: we copy the stack pointer and subtract the guard area
            // from it. This gets us a register that we can compare against when looping.
            //
            // After that we emit the loop. Essentially we just move the stack pointer down
            // one guard_size-sized distance at a time and touch the stack by writing anything
            // to it, using the previously created "guard area" register to know when to stop
            // looping.
            //
            // When we have touched all the pages that we need, we restore the stack pointer
            // to where it was before.
            //
            // Generate the following code:
            //         mov  tmp_reg, rsp
            //         sub  tmp_reg, guard_size * probe_count
            // .loop_start:
            //         sub  rsp, guard_size
            //         mov  [rsp], rsp
            //         cmp  rsp, tmp_reg
            //         jne  .loop_start
            //         add  rsp, guard_size * probe_count

            // Create the guard bound register
            // mov tmp_reg, rsp
            let inst = Inst::gen_move(tmp, regs::rsp(), types::I64);
            inst.emit(&[], sink, info, state);

            // sub tmp_reg, GUARD_SIZE * probe_count
            let inst = Inst::alu_rmi_r(
                OperandSize::Size64,
                AluRmiROpcode::Sub,
                RegMemImm::imm(guard_size * probe_count),
                tmp,
            );
            inst.emit(&[], sink, info, state);

            // Emit the main loop!
            let loop_start = sink.get_label();
            sink.bind_label(loop_start);

            // sub rsp, GUARD_SIZE
            let inst = Inst::alu_rmi_r(
                OperandSize::Size64,
                AluRmiROpcode::Sub,
                RegMemImm::imm(*guard_size),
                Writable::from_reg(regs::rsp()),
            );
            inst.emit(&[], sink, info, state);

            // TODO: `mov [rsp], 0` would be better, but we don't have that instruction.
            // Probe the stack! We don't use Inst::gen_store_stack here because we need a
            // predictable instruction size.
            // mov [rsp], rsp
            let inst = Inst::mov_r_m(
                OperandSize::Size32, // Use Size32 since it saves us one byte
                regs::rsp(),
                SyntheticAmode::Real(Amode::imm_reg(0, regs::rsp())),
            );
            inst.emit(&[], sink, info, state);

            // Compare and jump if we are not done yet
            // cmp rsp, tmp_reg
            let inst = Inst::cmp_rmi_r(
                OperandSize::Size64,
                RegMemImm::reg(regs::rsp()),
                tmp.to_reg(),
            );
            inst.emit(&[], sink, info, state);

            // jne .loop_start
            // TODO: Encoding the JmpIf as a short jump would save us 4 bytes here.
            one_way_jmp(sink, CC::NZ, loop_start);

            // The regular prologue code is going to emit a `sub` after this, so we need to
            // reset the stack pointer.
            //
            // TODO: It would be better if we could avoid the `add` + `sub` pair that is
            // generated here and in the stack-adjustment portion of the prologue.
            //
            // add rsp, GUARD_SIZE * probe_count
            let inst = Inst::alu_rmi_r(
                OperandSize::Size64,
                AluRmiROpcode::Add,
                RegMemImm::imm(guard_size * probe_count),
                Writable::from_reg(regs::rsp()),
            );
            inst.emit(&[], sink, info, state);
        }

        Inst::CallKnown {
            dest,
            info: call_info,
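As a sanity check on the sequence above, here is a small Rust model of what the emitted loop does (purely illustrative; the `touched` vector stands in for the `mov [rsp], rsp` probe stores):

```rust
/// Illustrative model of the probe loop; `sp` plays the role of rsp.
fn probe_loop_model(sp: u64, frame_size: u32, guard_size: u32) -> Vec<u64> {
    // align_to(frame_size, guard_size) / guard_size
    let probe_count = (frame_size + guard_size - 1) / guard_size;
    // mov tmp_reg, rsp; sub tmp_reg, guard_size * probe_count
    let bound = sp - u64::from(guard_size) * u64::from(probe_count);
    let (mut cur, mut touched) = (sp, Vec::new());
    while cur != bound {
        cur -= u64::from(guard_size); // sub rsp, guard_size
        touched.push(cur);            // mov [rsp], rsp
    }
    // add rsp, guard_size * probe_count: back where we started.
    assert_eq!(cur + u64::from(guard_size) * u64::from(probe_count), sp);
    touched
}

fn main() {
    // A 100000-byte frame with 4096-byte guards is probed once per page: 25 stores.
    assert_eq!(probe_loop_model(1 << 20, 100000, 4096).len(), 25);
}
```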
@@ -100,6 +100,7 @@ impl Inst {
            | Inst::Nop { .. }
            | Inst::Pop64 { .. }
            | Inst::Push64 { .. }
            | Inst::StackProbeLoop { .. }
            | Inst::Ret { .. }
            | Inst::Setcc { .. }
            | Inst::ShiftR { .. }
@@ -1427,6 +1428,21 @@ impl PrettyPrint for Inst {
            format!("{} {}", ljustify("pushq".to_string()), src)
        }

        Inst::StackProbeLoop {
            tmp,
            frame_size,
            guard_size,
        } => {
            let tmp = pretty_print_reg(tmp.to_reg(), 8, allocs);
            format!(
                "{} {}, frame_size={}, guard_size={}",
                ljustify("stack_probe_loop".to_string()),
                tmp,
                frame_size,
                guard_size
            )
        }

        Inst::Pop64 { dst } => {
            let dst = pretty_print_reg(dst.to_reg().to_reg(), 8, allocs);
            format!("{} {}", ljustify("popq".to_string()), dst)
@@ -1946,6 +1962,9 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
        Inst::Pop64 { dst } => {
            collector.reg_def(dst.to_writable_reg());
        }
        Inst::StackProbeLoop { tmp, .. } => {
            collector.reg_early_def(*tmp);
        }

        Inst::CallKnown { ref info, .. } => {
            for &u in &info.uses {
@@ -108,6 +108,7 @@ use crate::ir::types::*;
use crate::ir::{ArgumentExtension, ArgumentPurpose, DynamicStackSlot, Signature, StackSlot};
use crate::isa::TargetIsa;
use crate::settings;
use crate::settings::ProbestackStrategy;
use crate::CodegenResult;
use crate::{ir, isa};
use crate::{machinst::*, trace};
@@ -430,6 +431,9 @@ pub trait ABIMachineSpec {
    /// Generate a probestack call.
    fn gen_probestack(_frame_size: u32) -> SmallInstVec<Self::I>;

    /// Generate an inline stack probe.
    fn gen_inline_probestack(_frame_size: u32, _guard_size: u32) -> SmallInstVec<Self::I>;

    /// Get all clobbered registers that are callee-saved according to the ABI; the result
    /// contains the registers in a sorted order.
    fn get_clobbered_callee_saves(
@@ -1660,10 +1664,20 @@ impl<M: ABIMachineSpec> Callee<M> {
            insts.extend(stack_limit_load.clone());
            self.insert_stack_check(*reg, total_stacksize, &mut insts);
        }
        if let Some(min_frame) = &self.probestack_min_frame {
            if total_stacksize >= *min_frame {
                insts.extend(M::gen_probestack(total_stacksize));
            }

        let needs_probestack = self
            .probestack_min_frame
            .map_or(false, |min_frame| total_stacksize >= min_frame);

        if needs_probestack {
            insts.extend(
                if self.flags.probestack_strategy() == ProbestackStrategy::Inline {
                    let guard_size = 1 << self.flags.probestack_size_log2();
                    M::gen_inline_probestack(total_stacksize, guard_size)
                } else {
                    M::gen_probestack(total_stacksize)
                },
            );
        }
    }
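Note that the `guard_size` handed to `gen_inline_probestack` comes straight from the `probestack_size_log2` flag; a trivial check of the two values the filetests below exercise:

```rust
fn main() {
    assert_eq!(1u32 << 12, 4096);  // probestack_size_log2 = 12, the default
    assert_eq!(1u32 << 16, 65536); // probestack_size_log2 = 16, 64 KiB pages
}
```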
@@ -525,6 +525,7 @@ opt_level = "none"
tls_model = "none"
libcall_call_conv = "isa_default"
probestack_size_log2 = 12
probestack_strategy = "outline"
regalloc_checker = false
regalloc_verbose_logs = false
enable_alias_analysis = true
@@ -0,0 +1,67 @@
test compile precise-output
set enable_probestack=true
; Test with the larger page size of 64k
set probestack_size_log2=16
set probestack_strategy=inline
target x86_64


; If the stack size is just one page, we can avoid the stack probe entirely
function %single_page() -> i64 system_v {
    ss0 = explicit_slot 8192

block0:
    v1 = stack_addr.i64 ss0
    return v1
}

; pushq %rbp
; movq %rsp, %rbp
; subq %rsp, $8192, %rsp
; block0:
; lea rsp(0 + virtual offset), %rax
; addq %rsp, $8192, %rsp
; movq %rbp, %rsp
; popq %rbp
; ret

function %unrolled() -> i64 system_v {
    ss0 = explicit_slot 196608

block0:
    v1 = stack_addr.i64 ss0
    return v1
}

; pushq %rbp
; movq %rsp, %rbp
; movl %esp, -65536(%rsp)
; movl %esp, -131072(%rsp)
; movl %esp, -196608(%rsp)
; subq %rsp, $196608, %rsp
; block0:
; lea rsp(0 + virtual offset), %rax
; addq %rsp, $196608, %rsp
; movq %rbp, %rsp
; popq %rbp
; ret

function %large() -> i64 system_v {
    ss0 = explicit_slot 2097152

block0:
    v1 = stack_addr.i64 ss0
    return v1
}

; pushq %rbp
; movq %rsp, %rbp
; stack_probe_loop %r11, frame_size=2097152, guard_size=65536
; subq %rsp, $2097152, %rsp
; block0:
; lea rsp(0 + virtual offset), %rax
; addq %rsp, $2097152, %rsp
; movq %rbp, %rsp
; popq %rbp
; ret
cranelift/filetests/filetests/isa/x64/inline-probestack.clif (new file, 66 lines)
@@ -0,0 +1,66 @@
test compile precise-output
set enable_probestack=true
set probestack_strategy=inline
; This is the default and is equivalent to a page size of 4096
set probestack_size_log2=12
target x86_64


; If the stack size is just one page, we can avoid the stack probe entirely
function %single_page() -> i64 system_v {
    ss0 = explicit_slot 2048

block0:
    v1 = stack_addr.i64 ss0
    return v1
}

; pushq %rbp
; movq %rsp, %rbp
; subq %rsp, $2048, %rsp
; block0:
; lea rsp(0 + virtual offset), %rax
; addq %rsp, $2048, %rsp
; movq %rbp, %rsp
; popq %rbp
; ret

function %unrolled() -> i64 system_v {
    ss0 = explicit_slot 12288

block0:
    v1 = stack_addr.i64 ss0
    return v1
}

; pushq %rbp
; movq %rsp, %rbp
; movl %esp, -4096(%rsp)
; movl %esp, -8192(%rsp)
; movl %esp, -12288(%rsp)
; subq %rsp, $12288, %rsp
; block0:
; lea rsp(0 + virtual offset), %rax
; addq %rsp, $12288, %rsp
; movq %rbp, %rsp
; popq %rbp
; ret

function %large() -> i64 system_v {
    ss0 = explicit_slot 100000

block0:
    v1 = stack_addr.i64 ss0
    return v1
}

; pushq %rbp
; movq %rsp, %rbp
; stack_probe_loop %r11, frame_size=100000, guard_size=4096
; subq %rsp, $100000, %rsp
; block0:
; lea rsp(0 + virtual offset), %rax
; addq %rsp, $100000, %rsp
; movq %rbp, %rsp
; popq %rbp
; ret
@@ -1,5 +1,7 @@
test run
set enable_llvm_abi_extensions=true
; Disable stack probes since these tests don't require them
set enable_probestack=false
target x86_64
target aarch64
target s390x
@@ -0,0 +1,37 @@
test interpret
test run
set enable_probestack=true
set probestack_strategy=inline

; This is the default and is equivalent to a page size of 4096
set probestack_size_log2=12
target x86_64
; Test also with 64k pages
set probestack_size_log2=16
target x86_64

; Create a huge stack slot (1MB), way larger than PAGE_SIZE, and touch the end of it.
; This guarantees that we bypass the guard page, causing a page fault the OS isn't expecting,
; which turns into a segfault if we haven't correctly implemented stack probing.

function %probe_loop(i64) -> i64 {
    ss0 = explicit_slot 1048576

block0(v0: i64):
    stack_store.i64 v0, ss0
    v1 = stack_load.i64 ss0
    return v1
}
; run: %probe_loop(1) == 1


; Tests the unrolled version of the stack probe
function %probe_unroll(i64) -> i64 {
    ss0 = explicit_slot 9000

block0(v0: i64):
    stack_store.i64 v0, ss0
    v1 = stack_load.i64 ss0
    return v1
}
; run: %probe_unroll(1) == 1
@@ -1,5 +1,7 @@
test interpret
test run
; Disable stack probes since these tests don't require them
set enable_probestack=false
target x86_64
target s390x
target aarch64
@@ -43,19 +43,6 @@ fn build_host_isa(
        builder.set(value.name, &value.value_string()).unwrap();
    }

    // We need to force disable stack probing, since we don't support it yet.
    let flags = {
        let mut flags_builder = settings::builder();

        // Copy all flags
        for flag in flags.iter() {
            flags_builder.set(flag.name, &flag.value_string()).unwrap();
        }

        flags_builder.set("enable_probestack", "false").unwrap();
        settings::Flags::new(flags_builder)
    };

    builder.finish(flags).unwrap()
}
@@ -354,6 +354,7 @@ impl Engine {
            "enable_llvm_abi_extensions" => *value == FlagValue::Bool(false),
            "enable_pinned_reg" => *value == FlagValue::Bool(false),
            "enable_probestack" => *value == FlagValue::Bool(false),
            "probestack_strategy" => *value == FlagValue::Enum("outline".into()),
            "use_colocated_libcalls" => *value == FlagValue::Bool(false),
            "use_pinned_reg_as_heap_base" => *value == FlagValue::Bool(false),