diff --git a/cranelift/codegen/meta/src/shared/settings.rs b/cranelift/codegen/meta/src/shared/settings.rs index 9cdfcc19a8..9b4b3656ec 100644 --- a/cranelift/codegen/meta/src/shared/settings.rs +++ b/cranelift/codegen/meta/src/shared/settings.rs @@ -284,6 +284,18 @@ pub(crate) fn define() -> SettingGroup { 12, ); + settings.add_enum( + "probestack_strategy", + "Controls what kinds of stack probes are emitted.", + r#" + Supported strategies: + + - `outline`: Always emits stack probes as calls to a probe stack function. + - `inline`: Always emits inline stack probes. + "#, + vec!["outline", "inline"], + ); + // Jump table options. settings.add_bool( diff --git a/cranelift/codegen/src/isa/aarch64/abi.rs b/cranelift/codegen/src/isa/aarch64/abi.rs index 3f5aeb784f..f34d73a4d0 100644 --- a/cranelift/codegen/src/isa/aarch64/abi.rs +++ b/cranelift/codegen/src/isa/aarch64/abi.rs @@ -610,6 +610,10 @@ impl ABIMachineSpec for AArch64MachineDeps { smallvec![] } + fn gen_inline_probestack(_frame_size: u32, _guard_size: u32) -> SmallInstVec { + unimplemented!("Inline stack probing is unimplemented on AArch64"); + } + // Returns stack bytes used as well as instructions. Does not adjust // nominal SP offset; abi generic code will do that. fn gen_clobber_save( diff --git a/cranelift/codegen/src/isa/s390x/abi.rs b/cranelift/codegen/src/isa/s390x/abi.rs index 41dc93a076..0055a3a37b 100644 --- a/cranelift/codegen/src/isa/s390x/abi.rs +++ b/cranelift/codegen/src/isa/s390x/abi.rs @@ -574,6 +574,10 @@ impl ABIMachineSpec for S390xMachineDeps { smallvec![] } + fn gen_inline_probestack(_frame_size: u32, _guard_size: u32) -> SmallInstVec { + unimplemented!("Inline stack probing is unimplemented on S390x"); + } + // Returns stack bytes used as well as instructions. Does not adjust // nominal SP offset; abi generic code will do that. 
fn gen_clobber_save( diff --git a/cranelift/codegen/src/isa/x64/abi.rs b/cranelift/codegen/src/isa/x64/abi.rs index 610ca101c7..69d0a4dc9b 100644 --- a/cranelift/codegen/src/isa/x64/abi.rs +++ b/cranelift/codegen/src/isa/x64/abi.rs @@ -29,6 +29,42 @@ pub(crate) type X64Caller = Caller; /// Implementation of ABI primitives for x64. pub struct X64ABIMachineSpec; +impl X64ABIMachineSpec { + fn gen_probestack_unroll(guard_size: u32, probe_count: u32) -> SmallInstVec { + let mut insts = SmallVec::with_capacity(probe_count as usize); + for i in 0..probe_count { + let offset = (guard_size * (i + 1)) as i64; + + // TODO: It would be nice if we could store the imm 0, but we don't have insts for those + // so store the stack pointer. Any register will do, since the stack is undefined at this point + insts.push(Self::gen_store_stack( + StackAMode::SPOffset(-offset, I8), + regs::rsp(), + I32, + )); + } + insts + } + fn gen_probestack_loop(frame_size: u32, guard_size: u32) -> SmallInstVec { + // We have to use a caller saved register since clobbering only happens + // after stack probing. + // + // R11 is caller saved on both Fastcall and SystemV, and not used for argument + // passing, so it's pretty much free. It is also not used by the stacklimit mechanism. + let tmp = regs::r11(); + debug_assert!({ + let real_reg = tmp.to_real_reg().unwrap(); + !is_callee_save_systemv(real_reg, false) && !is_callee_save_fastcall(real_reg, false) + }); + + smallvec![Inst::StackProbeLoop { + tmp: Writable::from_reg(tmp), + frame_size, + guard_size, + }] + } +} + impl IsaFlags for x64_settings::Flags {} impl ABIMachineSpec for X64ABIMachineSpec { @@ -398,6 +434,23 @@ impl ABIMachineSpec for X64ABIMachineSpec { insts } + fn gen_inline_probestack(frame_size: u32, guard_size: u32) -> SmallInstVec { + // Unroll at most n consecutive probes, before falling back to using a loop + // + // This number was picked because the loop version is 38 bytes long. 
We can fit + // 5 inline probes in that space, so unroll if it's beneficial in terms of code size. + const PROBE_MAX_UNROLL: u32 = 5; + + // Number of probes that we need to perform + let probe_count = align_to(frame_size, guard_size) / guard_size; + + if probe_count <= PROBE_MAX_UNROLL { + Self::gen_probestack_unroll(guard_size, probe_count) + } else { + Self::gen_probestack_loop(frame_size, guard_size) + } + } + fn gen_clobber_save( _call_conv: isa::CallConv, setup_frame: bool, diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle index 25d41ae483..fbe4e09ac9 100644 --- a/cranelift/codegen/src/isa/x64/inst.isle +++ b/cranelift/codegen/src/isa/x64/inst.isle @@ -177,6 +177,11 @@ ;; popq reg (Pop64 (dst WritableGpr)) + ;; Emits an inline stack probe loop. + (StackProbeLoop (tmp WritableReg) + (frame_size u32) + (guard_size u32)) + ;; ========================================= ;; Floating-point operations. diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index 77b0fad3d4..4a60b83c38 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -1234,6 +1234,109 @@ pub(crate) fn emit( sink.put1(0x58 + (enc_dst & 7)); } + Inst::StackProbeLoop { + tmp, + frame_size, + guard_size, + } => { + assert!(info.flags.enable_probestack()); + assert!(guard_size.is_power_of_two()); + + let tmp = allocs.next_writable(*tmp); + + // Number of probes that we need to perform + let probe_count = align_to(*frame_size, *guard_size) / guard_size; + + // The inline stack probe loop has 3 phases: + // + // We generate the "guard area" register which is essentially the frame_size aligned to + // guard_size. We copy the stack pointer and subtract the guard area from it. This + // gets us a register that we can use to compare when looping. + // + // After that we emit the loop. 
Essentially we just adjust the stack pointer one guard_size'd + // distance at a time and then touch the stack by writing anything to it. We use the previously + // created "guard area" register to know when to stop looping. + // + // When we have touched all the pages that we need, we have to restore the stack pointer + // to where it was before. + // + // Generate the following code: + // mov tmp_reg, rsp + // sub tmp_reg, guard_size * probe_count + // .loop_start: + // sub rsp, guard_size + // mov [rsp], rsp + // cmp rsp, tmp_reg + // jne .loop_start + // add rsp, guard_size * probe_count + + // Create the guard bound register + // mov tmp_reg, rsp + let inst = Inst::gen_move(tmp, regs::rsp(), types::I64); + inst.emit(&[], sink, info, state); + + // sub tmp_reg, GUARD_SIZE * probe_count + let inst = Inst::alu_rmi_r( + OperandSize::Size64, + AluRmiROpcode::Sub, + RegMemImm::imm(guard_size * probe_count), + tmp, + ); + inst.emit(&[], sink, info, state); + + // Emit the main loop! + let loop_start = sink.get_label(); + sink.bind_label(loop_start); + + // sub rsp, GUARD_SIZE + let inst = Inst::alu_rmi_r( + OperandSize::Size64, + AluRmiROpcode::Sub, + RegMemImm::imm(*guard_size), + Writable::from_reg(regs::rsp()), + ); + inst.emit(&[], sink, info, state); + + // TODO: `mov [rsp], 0` would be better, but we don't have that instruction + // Probe the stack! We don't use Inst::gen_store_stack here because we need a predictable + // instruction size. + // mov [rsp], rsp + let inst = Inst::mov_r_m( + OperandSize::Size32, // Use Size32 since it saves us one byte + regs::rsp(), + SyntheticAmode::Real(Amode::imm_reg(0, regs::rsp())), + ); + inst.emit(&[], sink, info, state); + + // Compare and jump if we are not done yet + // cmp rsp, tmp_reg + let inst = Inst::cmp_rmi_r( + OperandSize::Size64, + RegMemImm::reg(regs::rsp()), + tmp.to_reg(), + ); + inst.emit(&[], sink, info, state); + + // jne .loop_start + // TODO: Encoding the JmpIf as a short jump saves us 4 bytes here. 
+ one_way_jmp(sink, CC::NZ, loop_start); + + // The regular prologue code is going to emit a `sub` after this, so we need to + // reset the stack pointer + // + // TODO: It would be better if we could avoid the `add` + `sub` that is generated here + // and in the stack adj portion of the prologue + // + // add rsp, GUARD_SIZE * probe_count + let inst = Inst::alu_rmi_r( + OperandSize::Size64, + AluRmiROpcode::Add, + RegMemImm::imm(guard_size * probe_count), + Writable::from_reg(regs::rsp()), + ); + inst.emit(&[], sink, info, state); + } + Inst::CallKnown { dest, info: call_info, diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs index fe4f99d561..fd449fe56a 100644 --- a/cranelift/codegen/src/isa/x64/inst/mod.rs +++ b/cranelift/codegen/src/isa/x64/inst/mod.rs @@ -100,6 +100,7 @@ impl Inst { | Inst::Nop { .. } | Inst::Pop64 { .. } | Inst::Push64 { .. } + | Inst::StackProbeLoop { .. } | Inst::Ret { .. } | Inst::Setcc { .. } | Inst::ShiftR { .. } @@ -1427,6 +1428,21 @@ impl PrettyPrint for Inst { format!("{} {}", ljustify("pushq".to_string()), src) } + Inst::StackProbeLoop { + tmp, + frame_size, + guard_size, + } => { + let tmp = pretty_print_reg(tmp.to_reg(), 8, allocs); + format!( + "{} {}, frame_size={}, guard_size={}", + ljustify("stack_probe_loop".to_string()), + tmp, + frame_size, + guard_size + ) + } + Inst::Pop64 { dst } => { let dst = pretty_print_reg(dst.to_reg().to_reg(), 8, allocs); format!("{} {}", ljustify("popq".to_string()), dst) @@ -1946,6 +1962,9 @@ fn x64_get_operands VReg>(inst: &Inst, collector: &mut OperandCol Inst::Pop64 { dst } => { collector.reg_def(dst.to_writable_reg()); } + Inst::StackProbeLoop { tmp, .. } => { + collector.reg_early_def(*tmp); + } Inst::CallKnown { ref info, .. 
} => { for &u in &info.uses { diff --git a/cranelift/codegen/src/machinst/abi.rs b/cranelift/codegen/src/machinst/abi.rs index 92bc906a85..e205658365 100644 --- a/cranelift/codegen/src/machinst/abi.rs +++ b/cranelift/codegen/src/machinst/abi.rs @@ -108,6 +108,7 @@ use crate::ir::types::*; use crate::ir::{ArgumentExtension, ArgumentPurpose, DynamicStackSlot, Signature, StackSlot}; use crate::isa::TargetIsa; use crate::settings; +use crate::settings::ProbestackStrategy; use crate::CodegenResult; use crate::{ir, isa}; use crate::{machinst::*, trace}; @@ -430,6 +431,9 @@ pub trait ABIMachineSpec { /// Generate a probestack call. fn gen_probestack(_frame_size: u32) -> SmallInstVec; + /// Generate an inline stack probe. + fn gen_inline_probestack(_frame_size: u32, _guard_size: u32) -> SmallInstVec; + /// Get all clobbered registers that are callee-saved according to the ABI; the result /// contains the registers in a sorted order. fn get_clobbered_callee_saves( @@ -1660,10 +1664,20 @@ impl Callee { insts.extend(stack_limit_load.clone()); self.insert_stack_check(*reg, total_stacksize, &mut insts); } - if let Some(min_frame) = &self.probestack_min_frame { - if total_stacksize >= *min_frame { - insts.extend(M::gen_probestack(total_stacksize)); - } + + let needs_probestack = self + .probestack_min_frame + .map_or(false, |min_frame| total_stacksize >= min_frame); + + if needs_probestack { + insts.extend( + if self.flags.probestack_strategy() == ProbestackStrategy::Inline { + let guard_size = 1 << self.flags.probestack_size_log2(); + M::gen_inline_probestack(total_stacksize, guard_size) + } else { + M::gen_probestack(total_stacksize) + }, + ); } } diff --git a/cranelift/codegen/src/settings.rs b/cranelift/codegen/src/settings.rs index ef9556d021..29ad916097 100644 --- a/cranelift/codegen/src/settings.rs +++ b/cranelift/codegen/src/settings.rs @@ -525,6 +525,7 @@ opt_level = "none" tls_model = "none" libcall_call_conv = "isa_default" probestack_size_log2 = 12 
+probestack_strategy = "outline" regalloc_checker = false regalloc_verbose_logs = false enable_alias_analysis = true diff --git a/cranelift/filetests/filetests/isa/x64/inline-probestack-large.clif b/cranelift/filetests/filetests/isa/x64/inline-probestack-large.clif new file mode 100644 index 0000000000..b5b592c653 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/inline-probestack-large.clif @@ -0,0 +1,67 @@ +test compile precise-output +set enable_probestack=true +; Test with the larger size of 64k +set probestack_size_log2=16 +set probestack_strategy=inline +target x86_64 + + + +; If the stack size is just one page, we can avoid the stack probe entirely +function %single_page() -> i64 system_v { +ss0 = explicit_slot 8192 + +block0: + v1 = stack_addr.i64 ss0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; subq %rsp, $8192, %rsp +; block0: +; lea rsp(0 + virtual offset), %rax +; addq %rsp, $8192, %rsp +; movq %rbp, %rsp +; popq %rbp +; ret + +function %unrolled() -> i64 system_v { +ss0 = explicit_slot 196608 + +block0: + v1 = stack_addr.i64 ss0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; movl %esp, -65536(%rsp) +; movl %esp, -131072(%rsp) +; movl %esp, -196608(%rsp) +; subq %rsp, $196608, %rsp +; block0: +; lea rsp(0 + virtual offset), %rax +; addq %rsp, $196608, %rsp +; movq %rbp, %rsp +; popq %rbp +; ret + +function %large() -> i64 system_v { +ss0 = explicit_slot 2097152 + +block0: + v1 = stack_addr.i64 ss0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; stack_probe_loop %r11, frame_size=2097152, guard_size=65536 +; subq %rsp, $2097152, %rsp +; block0: +; lea rsp(0 + virtual offset), %rax +; addq %rsp, $2097152, %rsp +; movq %rbp, %rsp +; popq %rbp +; ret diff --git a/cranelift/filetests/filetests/isa/x64/inline-probestack.clif b/cranelift/filetests/filetests/isa/x64/inline-probestack.clif new file mode 100644 index 0000000000..aa37bede6f --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/inline-probestack.clif @@ -0,0 +1,66 @@ 
+test compile precise-output +set enable_probestack=true +set probestack_strategy=inline +; This is the default and is equivalent to a page size of 4096 +set probestack_size_log2=12 +target x86_64 + + +; If the stack size is just one page, we can avoid the stack probe entirely +function %single_page() -> i64 system_v { +ss0 = explicit_slot 2048 + +block0: + v1 = stack_addr.i64 ss0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; subq %rsp, $2048, %rsp +; block0: +; lea rsp(0 + virtual offset), %rax +; addq %rsp, $2048, %rsp +; movq %rbp, %rsp +; popq %rbp +; ret + +function %unrolled() -> i64 system_v { +ss0 = explicit_slot 12288 + +block0: + v1 = stack_addr.i64 ss0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; movl %esp, -4096(%rsp) +; movl %esp, -8192(%rsp) +; movl %esp, -12288(%rsp) +; subq %rsp, $12288, %rsp +; block0: +; lea rsp(0 + virtual offset), %rax +; addq %rsp, $12288, %rsp +; movq %rbp, %rsp +; popq %rbp +; ret + +function %large() -> i64 system_v { +ss0 = explicit_slot 100000 + +block0: + v1 = stack_addr.i64 ss0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; stack_probe_loop %r11, frame_size=100000, guard_size=4096 +; subq %rsp, $100000, %rsp +; block0: +; lea rsp(0 + virtual offset), %rax +; addq %rsp, $100000, %rsp +; movq %rbp, %rsp +; popq %rbp +; ret diff --git a/cranelift/filetests/filetests/runtests/i128-load-store.clif b/cranelift/filetests/filetests/runtests/i128-load-store.clif index d5da854969..9663ead067 100644 --- a/cranelift/filetests/filetests/runtests/i128-load-store.clif +++ b/cranelift/filetests/filetests/runtests/i128-load-store.clif @@ -1,5 +1,7 @@ test run set enable_llvm_abi_extensions=true +; Disable stack probes since these tests don't require them +set enable_probestack=false target x86_64 target aarch64 target s390x diff --git a/cranelift/filetests/filetests/runtests/inline-probestack.clif b/cranelift/filetests/filetests/runtests/inline-probestack.clif new file mode 100644 index 0000000000..21426137d8 --- 
/dev/null +++ b/cranelift/filetests/filetests/runtests/inline-probestack.clif @@ -0,0 +1,37 @@ +test interpret +test run +set enable_probestack=true +set probestack_strategy=inline + +; This is the default and is equivalent to a page size of 4096 +set probestack_size_log2=12 +target x86_64 +; Test also with 64k pages +set probestack_size_log2=16 +target x86_64 + +; Create a huge stack slot (1MB), way larger than PAGE_SIZE and touch the end of it. +; This guarantees that we bypass the guard page, cause a page fault the OS isn't expecting +; which turns into a segfault if we haven't correctly implemented stack probing. + +function %probe_loop(i64) -> i64 { + ss0 = explicit_slot 1048576 + +block0(v0: i64): + stack_store.i64 v0, ss0 + v1 = stack_load.i64 ss0 + return v1 +} +; run: %probe_loop(1) == 1 + + +; Tests the unrolled version of the stackprobe +function %probe_unroll(i64) -> i64 { + ss0 = explicit_slot 9000 + +block0(v0: i64): + stack_store.i64 v0, ss0 + v1 = stack_load.i64 ss0 + return v1 +} +; run: %probe_unroll(1) == 1 diff --git a/cranelift/filetests/filetests/runtests/stack.clif b/cranelift/filetests/filetests/runtests/stack.clif index 008572d51e..698dcd681c 100644 --- a/cranelift/filetests/filetests/runtests/stack.clif +++ b/cranelift/filetests/filetests/runtests/stack.clif @@ -1,5 +1,7 @@ test interpret test run +; Disable stack probes since these tests don't require them +set enable_probestack=false target x86_64 target s390x target aarch64 diff --git a/cranelift/filetests/src/test_run.rs b/cranelift/filetests/src/test_run.rs index f41f64ef9d..47f2a4a2ab 100644 --- a/cranelift/filetests/src/test_run.rs +++ b/cranelift/filetests/src/test_run.rs @@ -43,19 +43,6 @@ fn build_host_isa( builder.set(value.name, &value.value_string()).unwrap(); } - // We need to force disable stack probing, since we don't support it yet. 
- let flags = { - let mut flags_builder = settings::builder(); - - // Copy all flags - for flag in flags.iter() { - flags_builder.set(flag.name, &flag.value_string()).unwrap(); - } - - flags_builder.set("enable_probestack", "false").unwrap(); - settings::Flags::new(flags_builder) - }; - builder.finish(flags).unwrap() } diff --git a/crates/wasmtime/src/engine.rs b/crates/wasmtime/src/engine.rs index a77155532e..02747725eb 100644 --- a/crates/wasmtime/src/engine.rs +++ b/crates/wasmtime/src/engine.rs @@ -354,6 +354,7 @@ impl Engine { "enable_llvm_abi_extensions" => *value == FlagValue::Bool(false), "enable_pinned_reg" => *value == FlagValue::Bool(false), "enable_probestack" => *value == FlagValue::Bool(false), + "probestack_strategy" => *value == FlagValue::Enum("outline".into()), "use_colocated_libcalls" => *value == FlagValue::Bool(false), "use_pinned_reg_as_heap_base" => *value == FlagValue::Bool(false),