From 3a01d147129fa1a5273d39d57fc9303a632ba40e Mon Sep 17 00:00:00 2001 From: Chris Fallin Date: Mon, 7 Dec 2020 15:09:47 -0800 Subject: [PATCH] Two Lucet-related fixes to stack overflow handling. Lucet uses stack probes rather than explicit stack limit checks as Wasmtime does. In bytecodealliance/lucet#616, I have discovered that I previously was not running some Lucet runtime tests with the new backend, so was missing some test failures due to missing pieces in the new backend. This PR adds (i) calls to probestack, when enabled, in the prologue of every function with a stack frame larger than one page (configurable via flags); and (ii) trap metadata for every instruction on x86-64 that can access the stack, hence be the first point at which a stack overflow is detected when the stack pointer is decremented. --- .github/workflows/main.yml | 13 ++- cranelift/codegen/src/isa/aarch64/abi.rs | 8 +- cranelift/codegen/src/isa/arm32/abi.rs | 8 +- cranelift/codegen/src/isa/x64/abi.rs | 18 +++- cranelift/codegen/src/isa/x64/inst/emit.rs | 96 +++++++++++++++++-- cranelift/codegen/src/isa/x64/lower.rs | 4 +- cranelift/codegen/src/machinst/abi_impl.rs | 24 +++++ .../filetests/isa/x64/probestack.clif | 17 ++++ 8 files changed, 169 insertions(+), 19 deletions(-) create mode 100644 cranelift/filetests/filetests/isa/x64/probestack.clif diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 3bd17dbdca..5e80781daf 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -60,7 +60,10 @@ jobs: # nightly-only feature right now. - uses: ./.github/actions/install-rust with: - toolchain: nightly + # TODO (rust-lang/rust#79661): We are seeing an internal compiler error when + # building with the latest (2020-12-06) nightly; pin on a slightly older + # version for now. + toolchain: nightly-2020-11-29 - run: cargo doc --no-deps --all --exclude wasmtime-cli --exclude test-programs --exclude cranelift-codegen-meta - run: cargo doc --package cranelift-codegen-meta --document-private-items - uses: actions/upload-artifact@v1 @@ -145,7 +148,7 @@ jobs: # flags to rustc. - uses: ./.github/actions/install-rust with: - toolchain: nightly + toolchain: nightly-2020-11-29 - run: cargo install cargo-fuzz --vers "^0.8" - run: cargo fetch working-directory: ./fuzz @@ -202,7 +205,7 @@ jobs: rust: beta - build: nightly os: ubuntu-latest - rust: nightly + rust: nightly-2020-11-29 - build: macos os: macos-latest rust: stable @@ -292,7 +295,7 @@ jobs: submodules: true - uses: ./.github/actions/install-rust with: - toolchain: nightly + toolchain: nightly-2020-11-29 - uses: ./.github/actions/define-llvm-env # Install wasm32 targets in order to build various tests throughout the @@ -303,7 +306,7 @@ jobs: # Run the x64 CI script. - run: ./ci/run-experimental-x64-ci.sh env: - CARGO_VERSION: "+nightly" + CARGO_VERSION: "+nightly-2020-11-29" RUST_BACKTRACE: 1 # Build and test the wasi-nn module. diff --git a/cranelift/codegen/src/isa/aarch64/abi.rs b/cranelift/codegen/src/isa/aarch64/abi.rs index 8496e51711..69335c10cd 100644 --- a/cranelift/codegen/src/isa/aarch64/abi.rs +++ b/cranelift/codegen/src/isa/aarch64/abi.rs @@ -12,7 +12,7 @@ use crate::{CodegenError, CodegenResult}; use alloc::boxed::Box; use alloc::vec::Vec; use regalloc::{RealReg, Reg, RegClass, Set, Writable}; -use smallvec::SmallVec; +use smallvec::{smallvec, SmallVec}; // We use a generic implementation that factors out AArch64 and x64 ABI commonalities, because // these ABIs are very similar. @@ -508,6 +508,12 @@ impl ABIMachineSpec for AArch64MachineDeps { insts } + fn gen_probestack(_: u32) -> SmallVec<[Self::I; 2]> { + // TODO: implement if we ever require stack probes on an AArch64 host + // (unlikely unless Lucet is ported) + smallvec![] + } + // Returns stack bytes used as well as instructions. Does not adjust // nominal SP offset; abi_impl generic code will do that. fn gen_clobber_save( diff --git a/cranelift/codegen/src/isa/arm32/abi.rs b/cranelift/codegen/src/isa/arm32/abi.rs index 4b69e1cf43..f09dd7dced 100644 --- a/cranelift/codegen/src/isa/arm32/abi.rs +++ b/cranelift/codegen/src/isa/arm32/abi.rs @@ -10,7 +10,7 @@ use crate::{CodegenError, CodegenResult}; use alloc::boxed::Box; use alloc::vec::Vec; use regalloc::{RealReg, Reg, RegClass, Set, Writable}; -use smallvec::SmallVec; +use smallvec::{smallvec, SmallVec}; /// Support for the ARM ABI from the callee side (within a function body). pub(crate) type Arm32ABICallee = ABICalleeImpl; @@ -305,6 +305,12 @@ impl ABIMachineSpec for Arm32MachineDeps { ret } + fn gen_probestack(_: u32) -> SmallVec<[Self::I; 2]> { + // TODO: implement if we ever require stack probes on ARM32 (unlikely + // unless Lucet is ported) + smallvec![] + } + /// Returns stack bytes used as well as instructions. Does not adjust /// nominal SP offset; caller will do that. fn gen_clobber_save( diff --git a/cranelift/codegen/src/isa/x64/abi.rs b/cranelift/codegen/src/isa/x64/abi.rs index 4f84d75c1d..dc9592908b 100644 --- a/cranelift/codegen/src/isa/x64/abi.rs +++ b/cranelift/codegen/src/isa/x64/abi.rs @@ -1,7 +1,7 @@ //! Implementation of the standard x64 ABI. use crate::ir::types::*; -use crate::ir::{self, types, MemFlags, TrapCode, Type}; +use crate::ir::{self, types, ExternalName, LibCall, MemFlags, Opcode, TrapCode, Type}; use crate::isa; use crate::isa::{x64::inst::*, CallConv}; use crate::machinst::abi_impl::*; @@ -389,6 +389,22 @@ impl ABIMachineSpec for X64ABIMachineSpec { insts } + fn gen_probestack(frame_size: u32) -> SmallVec<[Self::I; 2]> { + let mut insts = SmallVec::new(); + insts.push(Inst::imm( + OperandSize::Size32, + frame_size as u64, + Writable::from_reg(regs::rax()), + )); + insts.push(Inst::CallKnown { + dest: ExternalName::LibCall(LibCall::Probestack), + uses: vec![regs::rax()], + defs: vec![], + opcode: Opcode::Call, + }); + insts + } + fn gen_clobber_save( call_conv: isa::CallConv, _: &settings::Flags, diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index 7d15063ad4..0289f2ea92 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -182,6 +182,7 @@ impl LegacyPrefixes { fn emit_std_enc_mem( sink: &mut MachBuffer, state: &EmitState, + info: &EmitInfo, prefixes: LegacyPrefixes, opcodes: u32, mut num_opcodes: usize, @@ -194,7 +195,8 @@ fn emit_std_enc_mem( // expression. But `enc_g` can be derived from a register of any class. let srcloc = state.cur_srcloc(); - if srcloc != SourceLoc::default() && mem_e.can_trap() { + let can_trap = mem_e.can_trap(); + if srcloc != SourceLoc::default() && can_trap { sink.add_trap(srcloc, TrapCode::HeapOutOfBounds); } @@ -202,6 +204,12 @@ fn emit_std_enc_mem( match mem_e { Amode::ImmReg { simm32, base, .. } => { + // If this is an access based off of RSP, it may trap with a stack overflow if it's the + // first touch of a new stack page. + if *base == regs::rsp() && !can_trap && info.flags().enable_probestack() { + sink.add_trap(srcloc, TrapCode::StackOverflow); + } + // First, the REX byte. let enc_e = int_reg_enc(*base); rex.emit_two_op(sink, enc_g, enc_e); @@ -262,6 +270,12 @@ fn emit_std_enc_mem( shift, .. } => { + // If this is an access based off of RSP, it may trap with a stack overflow if it's the + // first touch of a new stack page. + if *reg_base == regs::rsp() && !can_trap && info.flags().enable_probestack() { + sink.add_trap(srcloc, TrapCode::StackOverflow); + } + let enc_base = int_reg_enc(*reg_base); let enc_index = int_reg_enc(*reg_index); @@ -350,6 +364,7 @@ fn emit_std_enc_enc( fn emit_std_reg_mem( sink: &mut MachBuffer, state: &EmitState, + info: &EmitInfo, prefixes: LegacyPrefixes, opcodes: u32, num_opcodes: usize, @@ -361,6 +376,7 @@ fn emit_std_reg_mem( emit_std_enc_mem( sink, state, + info, prefixes, opcodes, num_opcodes, @@ -538,6 +554,7 @@ pub(crate) fn emit( emit_std_reg_mem( sink, state, + info, LegacyPrefixes::None, 0x0FAF, 2, @@ -597,6 +614,7 @@ pub(crate) fn emit( emit_std_reg_mem( sink, state, + info, LegacyPrefixes::None, opcode_m, 1, @@ -654,6 +672,7 @@ pub(crate) fn emit( emit_std_reg_mem( sink, state, + info, prefix, opcode, num_opcodes, @@ -717,7 +736,9 @@ pub(crate) fn emit( } RegMem::Mem { addr: src } => { let amode = src.finalize(state, sink); - emit_std_enc_mem(sink, state, prefix, opcode, 1, subopcode, &amode, rex_flags); + emit_std_enc_mem( + sink, state, info, prefix, opcode, 1, subopcode, &amode, rex_flags, + ); } } } @@ -738,7 +759,9 @@ pub(crate) fn emit( } RegMem::Mem { addr: src } => { let amode = src.finalize(state, sink); - emit_std_enc_mem(sink, state, prefix, 0xF7, 1, subopcode, &amode, rex_flags); + emit_std_enc_mem( + sink, state, info, prefix, 0xF7, 1, subopcode, &amode, rex_flags, + ); } } } @@ -987,6 +1010,7 @@ pub(crate) fn emit( emit_std_reg_mem( sink, state, + info, LegacyPrefixes::None, opcodes, num_opcodes, @@ -1004,6 +1028,7 @@ pub(crate) fn emit( emit_std_reg_mem( sink, state, + info, LegacyPrefixes::None, 0x8B, 1, @@ -1019,6 +1044,7 @@ pub(crate) fn emit( emit_std_reg_mem( sink, state, + info, LegacyPrefixes::None, 0x8D, 1, @@ -1081,6 +1107,7 @@ pub(crate) fn emit( emit_std_reg_mem( sink, state, + info, LegacyPrefixes::None, opcodes, num_opcodes, @@ -1108,7 +1135,17 @@ pub(crate) fn emit( }; // MOV r8, r/m8 is (REX.W==0) 88 /r - emit_std_reg_mem(sink, state, LegacyPrefixes::None, 0x88, 1, *src, dst, rex) + emit_std_reg_mem( + sink, + state, + info, + LegacyPrefixes::None, + 0x88, + 1, + *src, + dst, + rex, + ) } 2 => { @@ -1116,6 +1153,7 @@ pub(crate) fn emit( emit_std_reg_mem( sink, state, + info, LegacyPrefixes::_66, 0x89, 1, @@ -1130,6 +1168,7 @@ pub(crate) fn emit( emit_std_reg_mem( sink, state, + info, LegacyPrefixes::None, 0x89, 1, @@ -1144,6 +1183,7 @@ pub(crate) fn emit( emit_std_reg_mem( sink, state, + info, LegacyPrefixes::None, 0x89, 1, @@ -1253,6 +1293,7 @@ pub(crate) fn emit( emit_std_reg_mem( sink, state, + info, prefix, opcode_bytes, 2, @@ -1311,7 +1352,7 @@ pub(crate) fn emit( let addr = &addr.finalize(state, sink); // Whereas here we revert to the "normal" G-E ordering. let opcode = if *size == 1 { 0x3A } else { 0x3B }; - emit_std_reg_mem(sink, state, prefix, opcode, 1, *reg_g, addr, rex); + emit_std_reg_mem(sink, state, info, prefix, opcode, 1, *reg_g, addr, rex); } RegMemImm::Imm { simm32 } => { @@ -1372,6 +1413,7 @@ pub(crate) fn emit( emit_std_reg_mem( sink, state, + info, prefix, opcode, 2, @@ -1408,6 +1450,10 @@ pub(crate) fn emit( } Inst::Push64 { src } => { + if info.flags().enable_probestack() { + sink.add_trap(state.cur_srcloc(), TrapCode::StackOverflow); + } + match src { RegMemImm::Reg { reg } => { let enc_reg = int_reg_enc(*reg); @@ -1423,6 +1469,7 @@ pub(crate) fn emit( emit_std_enc_mem( sink, state, + info, LegacyPrefixes::None, 0xFF, 1, @@ -1454,6 +1501,9 @@ pub(crate) fn emit( } Inst::CallKnown { dest, opcode, .. } => { + if info.flags().enable_probestack() { + sink.add_trap(state.cur_srcloc(), TrapCode::StackOverflow); + } if let Some(s) = state.take_stack_map() { sink.add_stack_map(StackMapExtent::UpcomingBytes(5), s); } @@ -1469,6 +1519,9 @@ pub(crate) fn emit( } Inst::CallUnknown { dest, opcode, .. } => { + if info.flags().enable_probestack() { + sink.add_trap(state.cur_srcloc(), TrapCode::StackOverflow); + } let start_offset = sink.cur_offset(); match dest { RegMem::Reg { reg } => { @@ -1489,6 +1542,7 @@ pub(crate) fn emit( emit_std_enc_mem( sink, state, + info, LegacyPrefixes::None, 0xFF, 1, @@ -1587,6 +1641,7 @@ pub(crate) fn emit( emit_std_enc_mem( sink, state, + info, LegacyPrefixes::None, 0xFF, 1, @@ -1733,6 +1788,7 @@ pub(crate) fn emit( emit_std_reg_mem( sink, state, + info, prefix, opcode, num_opcodes, @@ -1848,6 +1904,7 @@ pub(crate) fn emit( emit_std_reg_mem( sink, state, + info, prefix, opcode, length, @@ -1994,7 +2051,17 @@ pub(crate) fn emit( !regs_swapped, "No existing way to encode a mem argument in the ModRM r/m field." ); - emit_std_reg_mem(sink, state, prefix, opcode, len, dst.to_reg(), addr, rex); + emit_std_reg_mem( + sink, + state, + info, + prefix, + opcode, + len, + dst.to_reg(), + addr, + rex, + ); } } sink.put1(*imm); @@ -2027,6 +2094,7 @@ pub(crate) fn emit( emit_std_reg_mem( sink, state, + info, prefix, opcode, 2, @@ -2091,7 +2159,17 @@ pub(crate) fn emit( } RegMem::Mem { addr } => { let addr = &addr.finalize(state, sink); - emit_std_reg_mem(sink, state, prefix, opcode, 2, reg_g.to_reg(), addr, rex); + emit_std_reg_mem( + sink, + state, + info, + prefix, + opcode, + 2, + reg_g.to_reg(), + addr, + rex, + ); } } } @@ -2111,7 +2189,7 @@ pub(crate) fn emit( } RegMem::Mem { addr } => { let addr = &addr.finalize(state, sink); - emit_std_reg_mem(sink, state, prefix, opcode, len, *dst, addr, rex); + emit_std_reg_mem(sink, state, info, prefix, opcode, len, *dst, addr, rex); } } } @@ -2625,7 +2703,7 @@ pub(crate) fn emit( _ => unreachable!(), }; let amode = dst.finalize(state, sink); - emit_std_reg_mem(sink, state, prefix, opcodes, 2, *src, &amode, rex); + emit_std_reg_mem(sink, state, info, prefix, opcodes, 2, *src, &amode, rex); } Inst::AtomicRmwSeq { ty, op } => { diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index bbe886c24b..538e0e6dac 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -4079,8 +4079,8 @@ impl LowerBackend for X64Backend { let ty = ctx.input_ty(ifcmp_sp, 0); ctx.emit(Inst::cmp_rmi_r( ty.bytes() as u8, - RegMemImm::reg(operand), - regs::rsp(), + RegMemImm::reg(regs::rsp()), + operand, )); let cond_code = ctx.data(branches[0]).cond_code().unwrap(); let cc = CC::from_intcc(cond_code); diff --git a/cranelift/codegen/src/machinst/abi_impl.rs b/cranelift/codegen/src/machinst/abi_impl.rs index b9774faba9..4b21c2d946 100644 --- a/cranelift/codegen/src/machinst/abi_impl.rs +++ b/cranelift/codegen/src/machinst/abi_impl.rs @@ -314,6 +314,9 @@ pub trait ABIMachineSpec { /// Generate the usual frame-restore sequence for this architecture. fn gen_epilogue_frame_restore() -> SmallVec<[Self::I; 2]>; + /// Generate a probestack call. + fn gen_probestack(_frame_size: u32) -> SmallVec<[Self::I; 2]>; + /// Generate a clobber-save sequence. This takes the list of *all* registers /// written/modified by the function body. The implementation here is /// responsible for determining which of these are callee-saved according to @@ -481,6 +484,9 @@ pub struct ABICalleeImpl { /// manually register-allocated and carefully only use caller-saved /// registers and keep nothing live after this sequence of instructions. stack_limit: Option<(Reg, Vec)>, + /// Are we to invoke the probestack function in the prologue? If so, + /// what is the minimum size at which we must invoke it? + probestack_min_frame: Option, _mach: PhantomData, } @@ -536,6 +542,18 @@ impl ABICalleeImpl { .map(|reg| (reg, Vec::new())) .or_else(|| f.stack_limit.map(|gv| gen_stack_limit::(f, &sig, gv))); + // Determine whether a probestack call is required for large enough + // frames (and the minimum frame size if so). + let probestack_min_frame = if flags.enable_probestack() { + assert!( + !flags.probestack_func_adjusts_sp(), + "SP-adjusting probestack not supported in new backends" + ); + Some(1 << flags.probestack_size_log2()) + } else { + None + }; + Ok(Self { sig, stackslots, @@ -550,6 +568,7 @@ impl ABICalleeImpl { flags, is_leaf: f.is_leaf(), stack_limit, + probestack_min_frame, _mach: PhantomData, }) } @@ -978,6 +997,11 @@ impl ABICallee for ABICalleeImpl { insts.extend_from_slice(stack_limit_load); self.insert_stack_check(*reg, total_stacksize, &mut insts); } + if let Some(min_frame) = &self.probestack_min_frame { + if total_stacksize >= *min_frame { + insts.extend(M::gen_probestack(total_stacksize)); + } + } } if total_stacksize > 0 { self.fixed_frame_storage_size += total_stacksize; diff --git a/cranelift/filetests/filetests/isa/x64/probestack.clif b/cranelift/filetests/filetests/isa/x64/probestack.clif new file mode 100644 index 0000000000..135587d355 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/probestack.clif @@ -0,0 +1,17 @@ +test compile +set enable_probestack=true +target x86_64 +feature "experimental_x64" + +function %f1() -> i64 { +ss0 = explicit_slot 100000 + +block0: + v1 = stack_addr.i64 ss0 + return v1 +} + +; check: pushq %rbp +; nextln: movq %rsp, %rbp +; nextln: movl $$100000, %eax +; nextln: call LibCall(Probestack)