From b73ac83c375f953e9433021343f3c85f15beff58 Mon Sep 17 00:00:00 2001
From: Chris Fallin
Date: Tue, 18 Jan 2022 16:42:24 -0800
Subject: [PATCH] Add a pooling allocator mode based on copy-on-write mappings of memfds.

As first suggested by Jan on the Zulip here [1], a cheap and effective way to obtain copy-on-write semantics of a "backing image" for a Wasm memory is to mmap a file with `MAP_PRIVATE`. The `memfd` mechanism provided by the Linux kernel allows us to create anonymous, in-memory-only files that we can use for this mapping, so we can construct the image contents on the fly and then create a CoW overlay on top of that image. Furthermore, and importantly, `madvise(MADV_DONTNEED, ...)` will discard the CoW overlay, returning the mapping to its original state.

By itself this is almost enough for a very fast instantiation-termination loop of the same image over and over, without changing the address-space mapping at all (which is expensive). The only missing bit is how to implement heap *growth*. But here memfds can help us again: if we create another anonymous file and map it where the extended parts of the heap would go, we can take advantage of the fact that an `mmap()` mapping can be *larger than the file itself*, with accesses beyond the end generating a `SIGBUS`, and of the fact that we can cheaply resize the file with `ftruncate`, even after a mapping exists. So we can map the "heap extension" file once with the maximum memory-slot size and grow the memfd itself as `memory.grow` operations occur.

Together, the CoW technique and the heap-growth technique give us a fast path consisting of only `madvise()` and `ftruncate()` when we re-instantiate the same module over and over, as long as we can reuse the same slot. This fast path avoids all whole-process address-space locks in the Linux kernel, which should mean it is highly scalable. It also avoids the cost of copying data when servicing page faults, as the `uffd` heap backend must do; the kernel's own optimized CoW logic (the same logic used by all file mmaps) is used instead.
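To make the mechanism concrete, here is a minimal, self-contained sketch of the same idea using raw `libc` calls. This is illustrative only: the patch's actual implementation lives in `crates/runtime/src/memfd.rs` and `crates/runtime/src/instance/allocator/memfd.rs` and uses the `memfd` and `rustix` crates; the sizes, names, and layout below are hypothetical, and error handling is elided.

```rust
// Minimal sketch of the scheme described above, using raw libc calls.
fn main() {
    unsafe {
        let page = 4096usize;
        let image_len = 4 * page; // hypothetical initial heap image size
        let max_heap = 64 * page; // hypothetical maximum heap size

        // 1. Build the heap image in an anonymous, in-memory-only file.
        let name = std::ffi::CString::new("wasm-memory-image").unwrap();
        let image_fd = libc::memfd_create(name.as_ptr(), libc::MFD_CLOEXEC);
        libc::ftruncate(image_fd, image_len as libc::off_t);
        let data = vec![0xAAu8; image_len];
        libc::pwrite(image_fd, data.as_ptr().cast(), image_len, 0);

        // 2. Reserve the whole slot with anonymous memory, then CoW-map the
        //    image over its start. MAP_PRIVATE keeps instance writes in a
        //    private overlay; they never reach the shared image file.
        let base = libc::mmap(
            std::ptr::null_mut(),
            max_heap,
            libc::PROT_READ | libc::PROT_WRITE,
            libc::MAP_PRIVATE | libc::MAP_ANONYMOUS,
            -1,
            0,
        );
        assert_ne!(base, libc::MAP_FAILED);
        libc::mmap(
            base,
            image_len,
            libc::PROT_READ | libc::PROT_WRITE,
            libc::MAP_PRIVATE | libc::MAP_FIXED,
            image_fd,
            0,
        );

        // 3. Map an initially-empty "extension" memfd over the growth region.
        //    The mapping is larger than the file, so touching it raises
        //    SIGBUS until the file is ftruncate'd larger.
        let ext_name = std::ffi::CString::new("wasm-heap-extension").unwrap();
        let ext_fd = libc::memfd_create(ext_name.as_ptr(), libc::MFD_CLOEXEC);
        libc::mmap(
            (base as usize + image_len) as *mut libc::c_void,
            max_heap - image_len,
            libc::PROT_READ | libc::PROT_WRITE,
            libc::MAP_PRIVATE | libc::MAP_FIXED,
            ext_fd,
            0,
        );

        // memory.grow fast path: resize the extension file; no mmap needed.
        libc::ftruncate(ext_fd, (2 * page) as libc::off_t);

        // Instance-termination fast path: discard all dirty CoW pages so the
        // slot reads the pristine image again, and shrink the growth region.
        libc::madvise(base, max_heap, libc::MADV_DONTNEED);
        libc::ftruncate(ext_fd, 0);
    }
}
```

Note that the reset and the growth steps change no mappings at all, which is the property the scalability argument above relies on.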
[1] https://bytecodealliance.zulipchat.com/#narrow/stream/206238-general/topic/Copy.20on.20write.20based.20instance.20reuse/near/266657772 --- .github/workflows/main.yml | 5 +- Cargo.lock | 10 + Cargo.toml | 2 + crates/environ/src/module.rs | 23 ++ crates/jit/src/instantiate.rs | 15 +- crates/runtime/Cargo.toml | 3 + crates/runtime/src/instance.rs | 23 ++ crates/runtime/src/instance/allocator.rs | 63 ++-- .../runtime/src/instance/allocator/memfd.rs | 290 ++++++++++++++++++ .../src/instance/allocator/memfd_disabled.rs | 49 +++ .../runtime/src/instance/allocator/pooling.rs | 260 ++++++++++------ .../src/instance/allocator/pooling/uffd.rs | 1 + crates/runtime/src/lib.rs | 44 +++ crates/runtime/src/memfd.rs | 236 ++++++++++++++ crates/runtime/src/memory.rs | 57 +++- crates/runtime/src/module_id.rs | 28 ++ crates/runtime/src/traphandlers/unix.rs | 14 +- crates/wasmtime/Cargo.toml | 2 + crates/wasmtime/src/engine.rs | 8 +- crates/wasmtime/src/instance.rs | 3 +- crates/wasmtime/src/module.rs | 33 +- crates/wasmtime/src/module/serialization.rs | 7 +- crates/wasmtime/src/store.rs | 2 + crates/wasmtime/src/trampoline.rs | 1 + crates/wasmtime/src/trampoline/func.rs | 1 + src/lib.rs | 25 ++ 26 files changed, 1070 insertions(+), 135 deletions(-) create mode 100644 crates/runtime/src/instance/allocator/memfd.rs create mode 100644 crates/runtime/src/instance/allocator/memfd_disabled.rs create mode 100644 crates/runtime/src/memfd.rs create mode 100644 crates/runtime/src/module_id.rs diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index e89e33e165..5e5e0c64d9 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -136,6 +136,7 @@ jobs: - run: cargo check -p wasmtime --no-default-features --features async - run: cargo check -p wasmtime --no-default-features --features uffd - run: cargo check -p wasmtime --no-default-features --features pooling-allocator + - run: cargo check -p wasmtime --no-default-features --features memfd-allocator - run: cargo check -p wasmtime --no-default-features --features cranelift - run: cargo check -p wasmtime --no-default-features --features cranelift,wat,async,cache @@ -310,11 +311,13 @@ jobs: env: RUST_BACKTRACE: 1 - # Test uffd functionality on Linux + # Test Linux-specific functionality - run: | cargo test --features uffd -p wasmtime-runtime instance::allocator::pooling cargo test --features uffd -p wasmtime-cli pooling_allocator cargo test --features uffd -p wasmtime-cli wast::Cranelift + cargo test --features memfd-allocator -p wasmtime-cli pooling_allocator + cargo test --features memfd-allocator -p wasmtime-cli wast::Cranelift if: matrix.os == 'ubuntu-latest' && matrix.target == '' env: RUST_BACKTRACE: 1 diff --git a/Cargo.lock b/Cargo.lock index 6362f3b413..dbe67fe9c7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1602,6 +1602,15 @@ version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a" +[[package]] +name = "memfd" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6627dc657574b49d6ad27105ed671822be56e0d2547d413bfbf3e8d8fa92e7a" +dependencies = [ + "libc", +] + [[package]] name = "memmap2" version = "0.2.3" @@ -3587,6 +3596,7 @@ dependencies = [ "libc", "log", "mach", + "memfd", "memoffset", "more-asserts", "rand 0.8.3", diff --git a/Cargo.toml b/Cargo.toml index 748cb801da..51c4843fcc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -95,6 +95,8 @@ vtune = ["wasmtime/vtune"] 
wasi-crypto = ["wasmtime-wasi-crypto"] wasi-nn = ["wasmtime-wasi-nn"] uffd = ["wasmtime/uffd"] +pooling-allocator = ["wasmtime/pooling-allocator"] +memfd-allocator = ["pooling-allocator", "wasmtime/memfd-allocator"] all-arch = ["wasmtime/all-arch"] posix-signals-on-macos = ["wasmtime/posix-signals-on-macos"] diff --git a/crates/environ/src/module.rs b/crates/environ/src/module.rs index 8b05e2eb1c..d941801658 100644 --- a/crates/environ/src/module.rs +++ b/crates/environ/src/module.rs @@ -95,6 +95,19 @@ impl MemoryPlan { }, } } + + /// Determine whether a data segment (memory initializer) is + /// possibly out-of-bounds. Returns `true` if the initializer has a + /// dynamic location and this question cannot be resolved + /// pre-instantiation; hence, this method's result should not be + /// used to signal an error, only to exit optimized/simple fastpaths. + pub fn initializer_possibly_out_of_bounds(&self, init: &MemoryInitializer) -> bool { + match init.end() { + // Not statically known, so possibly out of bounds (we can't guarantee in-bounds). + None => true, + Some(end) => end > self.memory.minimum * (WASM_PAGE_SIZE as u64), + } + } } /// A WebAssembly linear memory initializer. @@ -113,6 +126,16 @@ pub struct MemoryInitializer { pub data: Range, } +impl MemoryInitializer { + /// If this initializer has a definite, static, non-overflowed end address, return it. + pub fn end(&self) -> Option { + if self.base.is_some() { + return None; + } + self.offset.checked_add(self.data.len() as u64) + } +} + /// The type of WebAssembly linear memory initialization to use for a module. #[derive(Clone, Debug, Serialize, Deserialize)] pub enum MemoryInitialization { diff --git a/crates/jit/src/instantiate.rs b/crates/jit/src/instantiate.rs index cc6a3844d1..6a41160070 100644 --- a/crates/jit/src/instantiate.rs +++ b/crates/jit/src/instantiate.rs @@ -19,7 +19,10 @@ use wasmtime_environ::{ StackMapInformation, Trampoline, Tunables, WasmFuncType, ELF_WASMTIME_ADDRMAP, ELF_WASMTIME_TRAPS, }; -use wasmtime_runtime::{GdbJitImageRegistration, InstantiationError, VMFunctionBody, VMTrampoline}; +use wasmtime_runtime::{ + CompiledModuleId, CompiledModuleIdAllocator, GdbJitImageRegistration, InstantiationError, + VMFunctionBody, VMTrampoline, +}; /// This is the name of the section in the final ELF image which contains /// concatenated data segments from the original wasm module. @@ -248,6 +251,8 @@ pub struct CompiledModule { code: Range, code_memory: CodeMemory, dbg_jit_registration: Option, + /// A unique ID used to register this module with the engine. + unique_id: CompiledModuleId, } impl CompiledModule { @@ -271,6 +276,7 @@ impl CompiledModule { mmap: MmapVec, info: Option, profiler: &dyn ProfilingAgent, + id_allocator: &CompiledModuleIdAllocator, ) -> Result> { // Transfer ownership of `obj` to a `CodeMemory` object which will // manage permissions, such as the executable bit. Once it's located @@ -312,6 +318,7 @@ impl CompiledModule { dbg_jit_registration: None, code_memory, meta: info.meta, + unique_id: id_allocator.alloc(), }; ret.register_debug_and_profiling(profiler)?; @@ -333,6 +340,12 @@ impl CompiledModule { Ok(()) } + /// Get this module's unique ID. It is unique with respect to a + /// single allocator (which is ordinarily held on a Wasm engine). + pub fn unique_id(&self) -> CompiledModuleId { + self.unique_id + } + /// Returns the underlying memory which contains the compiled module's /// image. 
pub fn mmap(&self) -> &MmapVec { diff --git a/crates/runtime/Cargo.toml b/crates/runtime/Cargo.toml index 827439d1d5..aaef30f677 100644 --- a/crates/runtime/Cargo.toml +++ b/crates/runtime/Cargo.toml @@ -37,6 +37,7 @@ winapi = { version = "0.3.7", features = ["winbase", "memoryapi", "errhandlingap [target.'cfg(target_os = "linux")'.dependencies] userfaultfd = { version = "0.4.1", optional = true } +memfd = { version = "0.4.1", optional = true } [build-dependencies] cc = "1.0" @@ -59,3 +60,5 @@ uffd = ["userfaultfd", "pooling-allocator"] # It is useful for applications that do not bind their own exception ports and # need portable signal handling. posix-signals-on-macos = [] + +memfd-allocator = ["pooling-allocator", "memfd"] diff --git a/crates/runtime/src/instance.rs b/crates/runtime/src/instance.rs index 2c9487d75e..9c56dfb222 100644 --- a/crates/runtime/src/instance.rs +++ b/crates/runtime/src/instance.rs @@ -97,6 +97,29 @@ pub(crate) struct Instance { #[allow(clippy::cast_ptr_alignment)] impl Instance { + /// Helper for allocators; not a public API. + pub(crate) fn create_raw( + module: &Arc, + wasm_data: &'static [u8], + memories: PrimaryMap, + tables: PrimaryMap, + host_state: Box, + ) -> Instance { + Instance { + module: module.clone(), + offsets: VMOffsets::new(HostPtr, &module), + memories, + tables, + dropped_elements: EntitySet::with_capacity(module.passive_elements.len()), + dropped_data: EntitySet::with_capacity(module.passive_data_map.len()), + host_state, + wasm_data, + vmctx: VMContext { + _marker: std::marker::PhantomPinned, + }, + } + } + /// Helper function to access various locations offset from our `*mut /// VMContext` object. unsafe fn vmctx_plus_offset(&self, offset: u32) -> *mut T { diff --git a/crates/runtime/src/instance/allocator.rs b/crates/runtime/src/instance/allocator.rs index 82c1eec31e..12fa88ddc8 100644 --- a/crates/runtime/src/instance/allocator.rs +++ b/crates/runtime/src/instance/allocator.rs @@ -4,28 +4,37 @@ use crate::memory::{DefaultMemoryCreator, Memory}; use crate::table::Table; use crate::traphandlers::Trap; use crate::vmcontext::{ - VMBuiltinFunctionsArray, VMCallerCheckedAnyfunc, VMContext, VMGlobalDefinition, - VMSharedSignatureIndex, + VMBuiltinFunctionsArray, VMCallerCheckedAnyfunc, VMGlobalDefinition, VMSharedSignatureIndex, }; +use crate::ModuleMemFds; use crate::Store; use anyhow::Result; use std::alloc; use std::any::Any; use std::convert::TryFrom; -use std::marker; use std::ptr::{self, NonNull}; use std::slice; use std::sync::Arc; use thiserror::Error; use wasmtime_environ::{ - DefinedFuncIndex, DefinedMemoryIndex, DefinedTableIndex, EntityRef, EntitySet, FunctionInfo, - GlobalInit, HostPtr, MemoryInitialization, MemoryInitializer, Module, ModuleType, PrimaryMap, - SignatureIndex, TableInitializer, TrapCode, VMOffsets, WasmType, WASM_PAGE_SIZE, + DefinedFuncIndex, DefinedMemoryIndex, DefinedTableIndex, EntityRef, FunctionInfo, GlobalInit, + MemoryInitialization, MemoryInitializer, Module, ModuleType, PrimaryMap, SignatureIndex, + TableInitializer, TrapCode, WasmType, WASM_PAGE_SIZE, }; #[cfg(feature = "pooling-allocator")] mod pooling; +#[cfg(feature = "memfd-allocator")] +mod memfd; +#[cfg(feature = "memfd-allocator")] +pub use self::memfd::MemFdSlot; + +#[cfg(not(feature = "memfd-allocator"))] +mod memfd_disabled; +#[cfg(not(feature = "memfd-allocator"))] +pub use self::memfd_disabled::MemFdSlot; + #[cfg(feature = "pooling-allocator")] pub use self::pooling::{ InstanceLimits, ModuleLimits, PoolingAllocationStrategy, 
PoolingInstanceAllocator, @@ -39,6 +48,9 @@ pub struct InstanceAllocationRequest<'a> { /// The base address of where JIT functions are located. pub image_base: usize, + /// If using MemFD-based memories, the backing MemFDs. + pub memfds: Option>, + /// Descriptors about each compiled function, such as the offset from /// `image_base`. pub functions: &'a PrimaryMap, @@ -376,9 +388,23 @@ fn check_memory_init_bounds( fn initialize_memories( instance: &mut Instance, + module: &Module, initializers: &[MemoryInitializer], ) -> Result<(), InstantiationError> { for init in initializers { + // Check whether this is a MemFD memory; if so, we can skip + // all initializers. + let memory = init.memory_index; + if let Some(defined_index) = module.defined_memory_index(memory) { + // We can only skip if there is actually a MemFD image. In + // some situations the MemFD image creation code will bail + // (e.g. due to an out of bounds data segment) and so we + // need to fall back on the usual initialization below. + if instance.memories[defined_index].is_memfd_with_image() { + continue; + } + } + instance .memory_init_segment( init.memory_index, @@ -432,6 +458,14 @@ fn initialize_instance( match &module.memory_initialization { MemoryInitialization::Paged { map, out_of_bounds } => { for (index, pages) in map { + // We can only skip if there is actually a MemFD image. In + // some situations the MemFD image creation code will bail + // (e.g. due to an out of bounds data segment) and so we + // need to fall back on the usual initialization below. + if instance.memories[index].is_memfd_with_image() { + continue; + } + let memory = instance.memory(index); let slice = unsafe { slice::from_raw_parts_mut(memory.base, memory.current_length) }; @@ -453,7 +487,7 @@ fn initialize_instance( } } MemoryInitialization::Segmented(initializers) => { - initialize_memories(instance, initializers)?; + initialize_memories(instance, module, initializers)?; } } @@ -691,19 +725,8 @@ unsafe impl InstanceAllocator for OnDemandInstanceAllocator { let host_state = std::mem::replace(&mut req.host_state, Box::new(())); let mut handle = { - let instance = Instance { - module: req.module.clone(), - offsets: VMOffsets::new(HostPtr, &req.module), - memories, - tables, - dropped_elements: EntitySet::with_capacity(req.module.passive_elements.len()), - dropped_data: EntitySet::with_capacity(req.module.passive_data_map.len()), - host_state, - wasm_data: &*req.wasm_data, - vmctx: VMContext { - _marker: marker::PhantomPinned, - }, - }; + let instance = + Instance::create_raw(&req.module, &*req.wasm_data, memories, tables, host_state); let layout = instance.alloc_layout(); let instance_ptr = alloc::alloc(layout) as *mut Instance; if instance_ptr.is_null() { diff --git a/crates/runtime/src/instance/allocator/memfd.rs b/crates/runtime/src/instance/allocator/memfd.rs new file mode 100644 index 0000000000..8713794824 --- /dev/null +++ b/crates/runtime/src/instance/allocator/memfd.rs @@ -0,0 +1,290 @@ +//! memfd mapping logic for use by the pooling allocator. + +use crate::memfd::MemoryMemFd; +use crate::InstantiationError; +use anyhow::Result; +use libc::c_void; +use rustix::fd::AsRawFd; +use std::convert::TryFrom; +use std::fs::File; +use std::sync::Arc; + +/// A single slot handled by the memfd instance-heap mechanism. 
+/// +/// The mmap scheme is: +/// +/// base ==> (points here) +/// - (image.offset bytes) anonymous zero memory, pre-image +/// - (image.len bytes) CoW mapping of memfd heap image +/// - (up to extension_offset) anonymous zero memory, post-image +/// - (up to static_size) heap expansion region; CoW mapping of per-slot memfd +/// +/// The ordering of mmaps to set this up is: +/// +/// - once, when pooling allocator is created: +/// - one large mmap to create 8GiB * instances * memories slots +/// +/// - per instantiation of new image in a slot: +/// - mmap of anonymous zero memory, from 0 to initial heap size +/// - mmap of CoW'd memfd image, from `image.offset` to +/// `image.offset + image.len`. This overwrites part of the +/// anonymous zero memory, potentially splitting it into a pre- +/// and post-region. +/// - mmap of CoW'd extension file, past the initial heap size up to +/// the end of the max memory size (just before the +/// post-guard). This is always adjacent to the above mmaps, but +/// does not overlap/overwrite them. +#[derive(Debug)] +pub struct MemFdSlot { + /// The base of the actual heap memory. Bytes at this address are + /// what is seen by the Wasm guest code. + base: usize, + /// The maximum static memory size, plus post-guard. + static_size: usize, + /// The memfd image that backs this memory. May be `None`, in + /// which case the memory is all zeroes. + pub(crate) image: Option>, + /// The offset at which the "extension file", which is used to + /// allow for efficient heap growth, is mapped. This is always + /// immediately after the end of the initial memory size. + extension_offset: usize, + /// The anonymous memfd, owned by this slot, which we mmap in the + /// area where the heap may grow during runtime. We use the + /// ftruncate() syscall (invoked via `File::set_len()`) to set its + /// size. We never write any data to it -- we CoW-map it so we can + /// throw away dirty data on termination. Instead, we just use its + /// size as a "watermark" that delineates the boundary between + /// safe-to-access memory and SIGBUS-causing memory. (This works + /// because one can mmap a file beyond its end, and is good + /// because ftruncate does not take the process-wide lock that + /// mmap and mprotect do.) + extension_file: File, + /// Whether this slot may have "dirty" pages (pages written by an + /// instantiation). Set by `instantiate()` and cleared by + /// `clear_and_remain_ready()`, and used in assertions to ensure + /// those methods are called properly. + dirty: bool, +} + +impl MemFdSlot { + pub(crate) fn create( + base_addr: *mut c_void, + static_size: usize, + ) -> Result { + let base = base_addr as usize; + + // Create a MemFD for the memory growth first -- this covers + // extended heap beyond the initial image. + let extension_memfd = memfd::MemfdOptions::new() + .allow_sealing(true) + .create("wasm-anonymous-heap") + .map_err(|e| InstantiationError::Resource(e.into()))?; + // Seal the ability to write the extension file (make it + // permanently read-only). This is a defense-in-depth + // mitigation to make extra-sure that we don't leak + // information between instantiations. See note in `memfd.rs` + // for more about why we use seals. 
+ extension_memfd + .add_seal(memfd::FileSeal::SealWrite) + .map_err(|e| InstantiationError::Resource(e.into()))?; + extension_memfd + .add_seal(memfd::FileSeal::SealSeal) + .map_err(|e| InstantiationError::Resource(e.into()))?; + let extension_file = extension_memfd.into_file(); + extension_file + .set_len(0) + .map_err(|e| InstantiationError::Resource(e.into()))?; + + Ok(MemFdSlot { + base, + static_size, + image: None, + extension_file, + extension_offset: 0, + dirty: false, + }) + } + + pub(crate) fn set_heap_limit(&mut self, size_bytes: usize) -> Result<()> { + assert!(size_bytes >= self.extension_offset); + // This is all that is needed to make the new memory + // accessible; we don't need to mprotect anything. (The + // mapping itself is always R+W for the max possible heap + // size, and only the anonymous-backing file length catches + // out-of-bounds accesses.) + self.extension_file + .set_len(u64::try_from(size_bytes - self.extension_offset).unwrap())?; + Ok(()) + } + + pub(crate) fn instantiate( + &mut self, + initial_size_bytes: usize, + maybe_image: Option<&Arc>, + ) -> Result<(), InstantiationError> { + assert!(!self.dirty); + + if let Some(existing_image) = &self.image { + // Fast-path: previously instantiated with the same image, + // so the mappings are already correct; there is no need + // to mmap anything. Given that we asserted not-dirty + // above, any dirty pages will have already been thrown + // away by madvise() during the previous termination. + if let Some(image) = maybe_image { + if existing_image.fd.as_file().as_raw_fd() == image.fd.as_file().as_raw_fd() { + self.dirty = true; + return Ok(()); + } + } + } + + // Otherwise, we need to redo (i) the anonymous-mmap backing + // for the initial heap size, (ii) the extension-file backing, + // and (iii) the initial-heap-image mapping if present. + + // Security/audit note: we map all of these MAP_PRIVATE, so + // all instance data is local to the mapping, not propagated + // to the backing fd. We throw away this CoW overlay with + // madvise() below, from base up to extension_offset (which is + // at least initial_size_bytes, and extended when the + // extension file is, so it covers all three mappings) when + // terminating the instance. + + // Anonymous mapping behind the initial heap size: this gives + // zeroes for any "holes" in the initial heap image. Anonymous + // mmap memory is faster to fault in than a CoW of a file, + // even a file with zero holes, because the kernel's CoW path + // unconditionally copies *something* (even if just a page of + // zeroes). Anonymous zero pages are fast: the kernel + // pre-zeroes them, and even if it runs out of those, a memset + // is half as expensive as a memcpy (only writes, no reads). + if initial_size_bytes > 0 { + unsafe { + let ptr = rustix::io::mmap_anonymous( + self.base as *mut c_void, + initial_size_bytes, + rustix::io::ProtFlags::READ | rustix::io::ProtFlags::WRITE, + rustix::io::MapFlags::PRIVATE | rustix::io::MapFlags::FIXED, + ) + .map_err(|e| InstantiationError::Resource(e.into()))?; + assert_eq!(ptr as usize, self.base); + } + } + + // An "extension file": this allows us to grow the heap by + // doing just an ftruncate(), without changing any + // mappings. This is important to avoid the process-wide mmap + // lock on Linux. 
+ self.extension_offset = initial_size_bytes; + let extension_map_len = self.static_size - initial_size_bytes; + if extension_map_len > 0 { + unsafe { + let fd = rustix::fd::BorrowedFd::borrow_raw_fd(self.extension_file.as_raw_fd()); + let ptr = rustix::io::mmap( + (self.base + initial_size_bytes) as *mut c_void, + extension_map_len, + rustix::io::ProtFlags::READ | rustix::io::ProtFlags::WRITE, + rustix::io::MapFlags::PRIVATE | rustix::io::MapFlags::FIXED, + &fd, + 0, + ) + .map_err(|e| InstantiationError::Resource(e.into()))?; + assert_eq!(ptr as usize, self.base + initial_size_bytes); + } + } + + // Finally, the initial memory image. + if let Some(image) = maybe_image { + if image.len > 0 { + let image = image.clone(); + + unsafe { + let fd = rustix::fd::BorrowedFd::borrow_raw_fd(image.fd.as_file().as_raw_fd()); + let ptr = rustix::io::mmap( + (self.base + image.offset) as *mut c_void, + image.len, + rustix::io::ProtFlags::READ | rustix::io::ProtFlags::WRITE, + rustix::io::MapFlags::PRIVATE | rustix::io::MapFlags::FIXED, + &fd, + image.offset as u64, + ) + .map_err(|e| InstantiationError::Resource(e.into()))?; + assert_eq!(ptr as usize, self.base + image.offset); + } + + self.image = Some(image); + } + } + + self.dirty = true; + Ok(()) + } + + pub(crate) fn clear_and_remain_ready(&mut self) -> Result<()> { + assert!(self.dirty); + // madvise the image range; that's it! This will throw away + // dirty pages, which are CoW-private pages on top of the + // initial heap image memfd. + unsafe { + rustix::io::madvise( + self.base as *mut c_void, + self.extension_offset, + rustix::io::Advice::LinuxDontNeed, + )?; + } + + // truncate the extension file down to zero bytes to reset heap length. + self.extension_file + .set_len(0) + .map_err(|e| InstantiationError::Resource(e.into()))?; + self.dirty = false; + Ok(()) + } + + pub(crate) fn has_image(&self) -> bool { + self.image.is_some() + } + + pub(crate) fn is_dirty(&self) -> bool { + self.dirty + } +} + +#[cfg(feature = "memfd-allocator")] +impl Drop for MemFdSlot { + fn drop(&mut self) { + // The MemFdSlot may be dropped if there is an error during + // instantiation: for example, if a memory-growth limiter + // disallows a guest from having a memory of a certain size, + // after we've already initialized the MemFdSlot. + // + // We need to return this region of the large pool mmap to a + // safe state (with no module-specific mappings). The + // MemFdSlot will not be returned to the MemoryPool, so a new + // MemFdSlot will be created and overwrite the mappings anyway + // on the slot's next use; but for safety and to avoid + // resource leaks it's better not to have stale mappings to a + // possibly-otherwise-dead module's image. + // + // To "wipe the slate clean", let's do a mmap of anonymous + // memory over the whole region, with PROT_NONE. Note that we + // *can't* simply munmap, because that leaves a hole in the + // middle of the pooling allocator's big memory area that some + // other random mmap may swoop in and take, to be trampled + // over by the next MemFdSlot later. + // + // Since we're in drop(), we can't sanely return an error if + // this mmap fails. Let's ignore the failure if so; the next + // MemFdSlot to be created for this slot will try to overwrite + // the existing stale mappings, and return a failure properly + // if we still cannot map new memory. 
+ unsafe { + let _ = rustix::io::mmap_anonymous( + self.base as *mut _, + self.static_size, + rustix::io::ProtFlags::empty(), + rustix::io::MapFlags::FIXED | rustix::io::MapFlags::NORESERVE, + ); + } + } +} diff --git a/crates/runtime/src/instance/allocator/memfd_disabled.rs b/crates/runtime/src/instance/allocator/memfd_disabled.rs new file mode 100644 index 0000000000..9c87591bd5 --- /dev/null +++ b/crates/runtime/src/instance/allocator/memfd_disabled.rs @@ -0,0 +1,49 @@ +//! Shims for MemFdSlot when the memfd allocator is not +//! included. Enables unconditional use of the type and its methods +//! throughout higher-level code. + +use crate::InstantiationError; +use anyhow::Result; +use std::sync::Arc; + +/// A placeholder for MemFdSlot when we have not included the pooling +/// allocator. +/// +/// To allow MemFdSlot to be unconditionally passed around in various +/// places (e.g. a `Memory`), we define a zero-sized type when memfd is +/// not included in the build. +#[cfg(not(feature = "memfd-allocator"))] +#[derive(Debug)] +pub struct MemFdSlot; + +#[cfg(not(feature = "memfd-allocator"))] +#[allow(dead_code)] +impl MemFdSlot { + pub(crate) fn create(_: *mut libc::c_void, _: usize) -> Result { + panic!("create() on invalid MemFdSlot"); + } + + pub(crate) fn instantiate( + &mut self, + _: usize, + _: Option<&Arc>, + ) -> Result { + panic!("instantiate() on invalid MemFdSlot"); + } + + pub(crate) fn clear_and_remain_ready(&mut self) -> Result<()> { + Ok(()) + } + + pub(crate) fn has_image(&self) -> bool { + false + } + + pub(crate) fn is_dirty(&self) -> bool { + false + } + + pub(crate) fn set_heap_limit(&mut self, _: usize) -> Result<()> { + panic!("set_heap_limit on invalid MemFdSlot"); + } +} diff --git a/crates/runtime/src/instance/allocator/pooling.rs b/crates/runtime/src/instance/allocator/pooling.rs index 76614137d5..6aa291d7a9 100644 --- a/crates/runtime/src/instance/allocator/pooling.rs +++ b/crates/runtime/src/instance/allocator/pooling.rs @@ -7,19 +7,21 @@ //! Using the pooling instance allocator can speed up module instantiation //! when modules can be constrained based on configurable limits. 
+use super::MemFdSlot; use super::{ initialize_instance, initialize_vmcontext, InstanceAllocationRequest, InstanceAllocator, InstanceHandle, InstantiationError, }; -use crate::{instance::Instance, Memory, Mmap, Table, VMContext}; +use crate::{instance::Instance, Memory, Mmap, ModuleMemFds, Table}; use anyhow::{anyhow, bail, Context, Result}; +use libc::c_void; use rand::Rng; use std::convert::TryFrom; -use std::marker; use std::mem; -use std::sync::{Arc, Mutex}; +use std::sync::Arc; +use std::sync::Mutex; use wasmtime_environ::{ - EntitySet, HostPtr, MemoryStyle, Module, PrimaryMap, Tunables, VMOffsets, VMOffsetsFields, + HostPtr, MemoryIndex, MemoryStyle, Module, PrimaryMap, Tunables, VMOffsets, VMOffsetsFields, WASM_PAGE_SIZE, }; @@ -284,7 +286,6 @@ struct InstancePool { free_list: Mutex>, memories: MemoryPool, tables: TablePool, - empty_module: Arc, } impl InstancePool { @@ -332,14 +333,8 @@ impl InstancePool { free_list: Mutex::new((0..max_instances).collect()), memories: MemoryPool::new(module_limits, instance_limits, tunables)?, tables: TablePool::new(module_limits, instance_limits)?, - empty_module: Arc::new(Module::default()), }; - // Use a default module to initialize the instances to start - for i in 0..instance_limits.count as usize { - pool.initialize(module_limits, i); - } - Ok(pool) } @@ -348,41 +343,26 @@ impl InstancePool { &mut *(self.mapping.as_mut_ptr().add(index * self.instance_size) as *mut Instance) } - fn initialize(&self, limits: &ModuleLimits, index: usize) { - unsafe { - let instance = self.instance(index); - - // Write a default instance with preallocated memory/table map storage to the ptr - std::ptr::write( - instance as _, - Instance { - module: self.empty_module.clone(), - offsets: VMOffsets::new(HostPtr, &self.empty_module), - memories: PrimaryMap::with_capacity(limits.memories as usize), - tables: PrimaryMap::with_capacity(limits.tables as usize), - dropped_elements: EntitySet::new(), - dropped_data: EntitySet::new(), - host_state: Box::new(()), - wasm_data: &[], - vmctx: VMContext { - _marker: marker::PhantomPinned, - }, - }, - ); - } - } - unsafe fn setup_instance( &self, index: usize, mut req: InstanceAllocationRequest, ) -> Result { - let instance = self.instance(index); + let host_state = std::mem::replace(&mut req.host_state, Box::new(())); + let instance_data = Instance::create_raw( + &req.module, + &*req.wasm_data, + PrimaryMap::default(), + PrimaryMap::default(), + host_state, + ); - instance.module = req.module.clone(); - instance.offsets = VMOffsets::new(HostPtr, instance.module.as_ref()); - instance.host_state = std::mem::replace(&mut req.host_state, Box::new(())); - instance.wasm_data = &*req.wasm_data; + // Instances are uninitialized memory at first; we need to + // write an empty but initialized `Instance` struct into the + // chosen slot before we do anything else with it. (This is + // paired with a `drop_in_place` in deallocate below.) + let instance = self.instance(index); + std::ptr::write(instance as _, instance_data); // set_instance_memories and _tables will need the store before we can completely // initialize the vmcontext. 
@@ -391,8 +371,10 @@ impl InstancePool { } Self::set_instance_memories( + index, instance, - self.memories.get(index), + &self.memories, + &req.memfds, self.memories.max_wasm_pages, )?; @@ -448,20 +430,44 @@ impl InstancePool { let instance = unsafe { &mut *handle.instance }; // Decommit any linear memories that were used - for (memory, base) in instance.memories.values_mut().zip(self.memories.get(index)) { + for ((def_mem_idx, memory), base) in + instance.memories.iter_mut().zip(self.memories.get(index)) + { let mut memory = mem::take(memory); debug_assert!(memory.is_static()); - // Reset any faulted guard pages as the physical memory may be reused for another instance in the future - #[cfg(all(feature = "uffd", target_os = "linux"))] - memory - .reset_guard_pages() - .expect("failed to reset guard pages"); - drop(&mut memory); // require mutable on all platforms, not just uffd + match memory { + Memory::Static { + memfd_slot: Some(mut memfd_slot), + .. + } => { + let mem_idx = instance.module.memory_index(def_mem_idx); + // If there was any error clearing the memfd, just + // drop it here, and let the drop handler for the + // MemFdSlot unmap in a way that retains the + // address space reservation. + if memfd_slot.clear_and_remain_ready().is_ok() { + self.memories.return_memfd_slot(index, mem_idx, memfd_slot); + } + } - let size = memory.byte_size(); - drop(memory); - decommit_memory_pages(base, size).expect("failed to decommit linear memory pages"); + _ => { + // Reset any faulted guard pages as the physical + // memory may be reused for another instance in + // the future. + #[cfg(all(feature = "uffd", target_os = "linux"))] + memory + .reset_guard_pages() + .expect("failed to reset guard pages"); + // require mutable on all platforms, not just uffd + drop(&mut memory); + + let size = memory.byte_size(); + drop(memory); + decommit_memory_pages(base, size) + .expect("failed to decommit linear memory pages"); + } + } } instance.memories.clear(); @@ -481,50 +487,81 @@ impl InstancePool { decommit_table_pages(base, size).expect("failed to decommit table pages"); } - instance.tables.clear(); - instance.dropped_elements.clear(); - - // Drop all `global` values which need a destructor, such as externref - // values which now need their reference count dropped. - instance.drop_globals(); - - // Drop any host state - instance.host_state = Box::new(()); - - // And finally reset the module/offsets back to their original. This - // should put everything back in a relatively pristine state for each - // fresh allocation later on. - instance.module = self.empty_module.clone(); - instance.offsets = VMOffsets::new(HostPtr, &self.empty_module); - instance.wasm_data = &[]; + // We've now done all of the pooling-allocator-specific + // teardown, so we can drop the Instance and let destructors + // take care of any other fields (host state, globals, etc.). + unsafe { + std::ptr::drop_in_place(instance as *mut _); + } + // The instance is now uninitialized memory and cannot be + // touched again until we write a fresh Instance in-place with + // std::ptr::write in allocate() above. 
self.free_list.lock().unwrap().push(index); } fn set_instance_memories( + instance_idx: usize, instance: &mut Instance, - mut memories: impl Iterator, + memories: &MemoryPool, + maybe_memfds: &Option>, max_pages: u64, ) -> Result<(), InstantiationError> { let module = instance.module.as_ref(); debug_assert!(instance.memories.is_empty()); - for plan in - (&module.memory_plans.values().as_slice()[module.num_imported_memories..]).iter() + for (memory_index, plan) in module + .memory_plans + .iter() + .skip(module.num_imported_memories) { + let defined_index = module + .defined_memory_index(memory_index) + .expect("should be a defined memory since we skipped imported ones"); + let memory = unsafe { std::slice::from_raw_parts_mut( - memories.next().unwrap(), + memories.get_base(instance_idx, memory_index), (max_pages as usize) * (WASM_PAGE_SIZE as usize), ) }; - instance.memories.push( - Memory::new_static(plan, memory, commit_memory_pages, unsafe { - &mut *instance.store() - }) - .map_err(InstantiationError::Resource)?, - ); + + if let Some(memfds) = maybe_memfds { + let image = memfds.get_memory_image(defined_index); + let mut slot = memories.take_memfd_slot(instance_idx, memory_index)?; + let initial_size = plan.memory.minimum * WASM_PAGE_SIZE as u64; + + // If instantiation fails, we can propagate the error + // upward and drop the slot. This will cause the Drop + // handler to attempt to map the range with PROT_NONE + // memory, to reserve the space while releasing any + // stale mappings. The next use of this slot will then + // create a new MemFdSlot that will try to map over + // this, returning errors as well if the mapping + // errors persist. The unmap-on-drop is best effort; + // if it fails, then we can still soundly continue + // using the rest of the pool and allowing the rest of + // the process to continue, because we never perform a + // mmap that would leave an open space for someone + // else to come in and map something. + slot.instantiate(initial_size as usize, image) + .map_err(|e| InstantiationError::Resource(e.into()))?; + + instance.memories.push( + Memory::new_static(plan, memory, None, Some(slot), unsafe { + &mut *instance.store() + }) + .map_err(InstantiationError::Resource)?, + ); + } else { + instance.memories.push( + Memory::new_static(plan, memory, Some(commit_memory_pages), None, unsafe { + &mut *instance.store() + }) + .map_err(InstantiationError::Resource)?, + ); + } } debug_assert!(instance.dropped_data.is_empty()); @@ -566,17 +603,6 @@ impl InstancePool { } } -impl Drop for InstancePool { - fn drop(&mut self) { - unsafe { - for i in 0..self.max_instances { - let ptr = self.mapping.as_mut_ptr().add(i * self.instance_size) as *mut Instance; - std::ptr::drop_in_place(ptr); - } - } - } -} - /// Represents a pool of WebAssembly linear memories. /// /// A linear memory is divided into accessible pages and guard pages. @@ -589,6 +615,10 @@ impl Drop for InstancePool { #[derive(Debug)] struct MemoryPool { mapping: Mmap, + // If using the memfd allocation scheme, the MemFd slots. We + // dynamically transfer ownership of a slot to a Memory when in + // use. + memfd_slots: Vec>>, // The size, in bytes, of each linear memory's reservation plus the guard // region allocated for it. 
memory_size: usize, @@ -673,8 +703,18 @@ impl MemoryPool { let mapping = Mmap::accessible_reserved(0, allocation_size) .context("failed to create memory pool mapping")?; + let num_memfd_slots = if cfg!(feature = "memfd-allocator") { + max_instances * max_memories + } else { + 0 + }; + let memfd_slots: Vec<_> = std::iter::repeat_with(|| Mutex::new(None)) + .take(num_memfd_slots) + .collect(); + let pool = Self { mapping, + memfd_slots, memory_size, initial_memory_offset, max_memories, @@ -689,17 +729,43 @@ impl MemoryPool { Ok(pool) } - fn get(&self, instance_index: usize) -> impl Iterator { + fn get_base(&self, instance_index: usize, memory_index: MemoryIndex) -> *mut u8 { debug_assert!(instance_index < self.max_instances); + let memory_index = memory_index.as_u32() as usize; + debug_assert!(memory_index < self.max_memories); + let idx = instance_index * self.max_memories + memory_index; + let offset = self.initial_memory_offset + idx * self.memory_size; + unsafe { self.mapping.as_mut_ptr().offset(offset as isize) } + } - let base: *mut u8 = unsafe { - self.mapping.as_mut_ptr().add( - self.initial_memory_offset + instance_index * self.memory_size * self.max_memories, - ) as _ - }; + fn get<'a>(&'a self, instance_index: usize) -> impl Iterator + 'a { + (0..self.max_memories) + .map(move |i| self.get_base(instance_index, MemoryIndex::from_u32(i as u32))) + } - let size = self.memory_size; - (0..self.max_memories).map(move |i| unsafe { base.add(i * size) }) + /// Take ownership of the given memfd slot. Must be returned via + /// `return_memfd_slot` when the instance is done using it. + fn take_memfd_slot( + &self, + instance_index: usize, + memory_index: MemoryIndex, + ) -> Result { + let idx = instance_index * self.max_memories + (memory_index.as_u32() as usize); + let maybe_slot = self.memfd_slots[idx].lock().unwrap().take(); + + maybe_slot.map(|slot| Ok(slot)).unwrap_or_else(|| { + MemFdSlot::create( + self.get_base(instance_index, memory_index) as *mut c_void, + self.memory_size, + ) + }) + } + + /// Return ownership of the given memfd slot. 
+ fn return_memfd_slot(&self, instance_index: usize, memory_index: MemoryIndex, slot: MemFdSlot) { + assert!(!slot.is_dirty()); + let idx = instance_index * self.max_memories + (memory_index.as_u32() as usize); + *self.memfd_slots[idx].lock().unwrap() = Some(slot); } } @@ -1413,6 +1479,7 @@ mod test { host_state: Box::new(()), store: StorePtr::empty(), wasm_data: &[], + memfds: None, }, ) .expect("allocation should succeed"), @@ -1437,6 +1504,7 @@ mod test { host_state: Box::new(()), store: StorePtr::empty(), wasm_data: &[], + memfds: None, }, ) { Err(InstantiationError::Limit(3)) => {} diff --git a/crates/runtime/src/instance/allocator/pooling/uffd.rs b/crates/runtime/src/instance/allocator/pooling/uffd.rs index 55b4479fd1..87dd9a0c57 100644 --- a/crates/runtime/src/instance/allocator/pooling/uffd.rs +++ b/crates/runtime/src/instance/allocator/pooling/uffd.rs @@ -577,6 +577,7 @@ mod test { PoolingAllocationStrategy::Random, InstanceAllocationRequest { module: module.clone(), + memfds: None, image_base: 0, functions, imports: Imports { diff --git a/crates/runtime/src/lib.rs b/crates/runtime/src/lib.rs index f96e7d8dda..806c8c9c5c 100644 --- a/crates/runtime/src/lib.rs +++ b/crates/runtime/src/lib.rs @@ -19,6 +19,7 @@ clippy::use_self ) )] +#![cfg_attr(feature = "memfd-allocator", allow(dead_code))] use std::sync::atomic::AtomicU64; @@ -63,6 +64,49 @@ pub use crate::vmcontext::{ VMSharedSignatureIndex, VMTableDefinition, VMTableImport, VMTrampoline, ValRaw, }; +mod module_id; +pub use module_id::{CompiledModuleId, CompiledModuleIdAllocator}; + +#[cfg(feature = "memfd-allocator")] +mod memfd; + +/// When memfd support is not included, provide a shim type and +/// constructor instead so that higher-level code does not need +/// feature-conditional compilation. +#[cfg(not(feature = "memfd-allocator"))] +#[allow(dead_code)] +mod memfd { + use anyhow::Result; + use std::sync::Arc; + use wasmtime_environ::{DefinedMemoryIndex, Module}; + + /// A shim for the memfd image container when memfd support is not + /// included. + pub enum ModuleMemFds {} + + /// A shim for an individual memory image. + #[allow(dead_code)] + pub enum MemoryMemFd {} + + impl ModuleMemFds { + /// Construct a new set of memfd images. This variant is used + /// when memfd support is not included; it always returns no + /// images. + pub fn new(_: &Module, _: &[u8]) -> Result>> { + Ok(None) + } + + /// Get the memfd image for a particular memory. + pub(crate) fn get_memory_image(&self, _: DefinedMemoryIndex) -> Option<&Arc> { + // Should be unreachable because the `Self` type is + // uninhabitable. + match *self {} + } + } +} + +pub use crate::memfd::ModuleMemFds; + /// Version number of this crate. pub const VERSION: &str = env!("CARGO_PKG_VERSION"); diff --git a/crates/runtime/src/memfd.rs b/crates/runtime/src/memfd.rs new file mode 100644 index 0000000000..46ebc4e228 --- /dev/null +++ b/crates/runtime/src/memfd.rs @@ -0,0 +1,236 @@ +//! memfd support. + +use anyhow::Result; +use memfd::{Memfd, MemfdOptions}; +use rustix::fs::FileExt; +use std::convert::TryFrom; +use std::sync::Arc; +use wasmtime_environ::{ + DefinedMemoryIndex, MemoryInitialization, MemoryInitializer, MemoryPlan, Module, PrimaryMap, +}; + +/// MemFDs containing backing images for certain memories in a module. +/// +/// This is meant to be built once, when a module is first +/// loaded/constructed, and then used many times for instantiation. 
+pub struct ModuleMemFds { + memories: PrimaryMap>>, +} + +const MAX_MEMFD_IMAGE_SIZE: u64 = 1024 * 1024 * 1024; // limit to 1GiB. + +impl ModuleMemFds { + pub(crate) fn get_memory_image( + &self, + defined_index: DefinedMemoryIndex, + ) -> Option<&Arc> { + self.memories[defined_index].as_ref() + } +} + +/// One backing image for one memory. +#[derive(Debug)] +pub(crate) struct MemoryMemFd { + pub(crate) fd: Memfd, + /// Length of image. Note that initial memory size may be larger; + /// leading and trailing zeroes are truncated (handled by + /// anonymous backing memfd). + pub(crate) len: usize, + /// Image starts this many bytes into heap space. Note that the + /// memfd's offsets are always equal to the heap offsets, so we + /// map at an offset into the fd as well. (This simplifies + /// construction.) + pub(crate) offset: usize, +} + +fn unsupported_initializer(segment: &MemoryInitializer, plan: &MemoryPlan) -> bool { + // If the segment has a base that is dynamically determined + // (by a global value, which may be a function of an imported + // module, for example), then we cannot build a single static + // image that is used for every instantiation. So we skip this + // memory entirely. + let end = match segment.end() { + None => { + return true; + } + Some(end) => end, + }; + + // Cannot be out-of-bounds. If there is a *possibility* it may + // be, then we just fall back on ordinary initialization. + if plan.initializer_possibly_out_of_bounds(segment) { + return true; + } + + // Must fit in our max size. + if end > MAX_MEMFD_IMAGE_SIZE { + return true; + } + + false +} + +impl ModuleMemFds { + /// Create a new `ModuleMemFds` for the given module. This can be + /// passed in as part of a `InstanceAllocationRequest` to speed up + /// instantiation and execution by using memfd-backed memories. + pub fn new(module: &Module, wasm_data: &[u8]) -> Result>> { + let page_size = region::page::size() as u64; + let num_defined_memories = module.memory_plans.len() - module.num_imported_memories; + + // Allocate a memfd file initially for every memory. We'll + // release those and set `excluded_memories` for those that we + // determine during initializer processing we cannot support a + // static image (e.g. due to dynamically-located segments). + let mut memfds: PrimaryMap> = PrimaryMap::default(); + let mut sizes: PrimaryMap = PrimaryMap::default(); + let mut excluded_memories: PrimaryMap = PrimaryMap::new(); + + for _ in 0..num_defined_memories { + memfds.push(None); + sizes.push(0); + excluded_memories.push(false); + } + + fn create_memfd() -> Result { + // Create the memfd. It needs a name, but the + // documentation for `memfd_create()` says that names can + // be duplicated with no issues. 
+ MemfdOptions::new() + .allow_sealing(true) + .create("wasm-memory-image") + .map_err(|e| e.into()) + } + let round_up_page = |len: u64| (len + page_size - 1) & !(page_size - 1); + + match &module.memory_initialization { + &MemoryInitialization::Segmented(ref segments) => { + for (i, segment) in segments.iter().enumerate() { + let defined_memory = match module.defined_memory_index(segment.memory_index) { + Some(defined_memory) => defined_memory, + None => continue, + }; + if excluded_memories[defined_memory] { + continue; + } + + if unsupported_initializer(segment, &module.memory_plans[segment.memory_index]) + { + memfds[defined_memory] = None; + excluded_memories[defined_memory] = true; + continue; + } + + if memfds[defined_memory].is_none() { + memfds[defined_memory] = Some(create_memfd()?); + } + let memfd = memfds[defined_memory].as_mut().unwrap(); + + let end = round_up_page(segment.end().expect("must have statically-known end")); + if end > sizes[defined_memory] { + sizes[defined_memory] = end; + memfd.as_file().set_len(end)?; + } + + let base = segments[i].offset; + let data = &wasm_data[segment.data.start as usize..segment.data.end as usize]; + memfd.as_file().write_at(data, base)?; + } + } + &MemoryInitialization::Paged { ref map, .. } => { + for (defined_memory, pages) in map { + let top = pages + .iter() + .map(|(base, range)| *base + range.len() as u64) + .max() + .unwrap_or(0); + + let memfd = create_memfd()?; + memfd.as_file().set_len(top)?; + + for (base, range) in pages { + let data = &wasm_data[range.start as usize..range.end as usize]; + memfd.as_file().write_at(data, *base)?; + } + + memfds[defined_memory] = Some(memfd); + sizes[defined_memory] = top; + } + } + } + + // Now finalize each memory. + let mut memories: PrimaryMap>> = + PrimaryMap::default(); + for (defined_memory, maybe_memfd) in memfds { + let memfd = match maybe_memfd { + Some(memfd) => memfd, + None => { + memories.push(None); + continue; + } + }; + let size = sizes[defined_memory]; + + // Find leading and trailing zero data so that the mmap + // can precisely map only the nonzero data; anon-mmap zero + // memory is faster for anything that doesn't actually + // have content. + let mut page_data = vec![0; page_size as usize]; + let mut page_is_nonzero = |page| { + let offset = page_size * page; + memfd.as_file().read_at(&mut page_data[..], offset).unwrap(); + page_data.iter().any(|byte| *byte != 0) + }; + let n_pages = size / page_size; + + let mut offset = 0; + for page in 0..n_pages { + if page_is_nonzero(page) { + break; + } + offset += page_size; + } + let len = if offset == size { + 0 + } else { + let mut len = 0; + for page in (0..n_pages).rev() { + if page_is_nonzero(page) { + len = (page + 1) * page_size - offset; + break; + } + } + len + }; + + // Seal the memfd's data and length. + // + // This is a defense-in-depth security mitigation. The + // memfd will serve as the starting point for the heap of + // every instance of this module. If anything were to + // write to this, it could affect every execution. The + // memfd object itself is owned by the machinery here and + // not exposed elsewhere, but it is still an ambient open + // file descriptor at the syscall level, so some other + // vulnerability that allowed writes to arbitrary fds + // could modify it. Or we could have some issue with the + // way that we map it into each instance. 
To be + // extra-super-sure that it never changes, and because + // this costs very little, we use the kernel's "seal" API + // to make the memfd image permanently read-only. + memfd.add_seal(memfd::FileSeal::SealGrow)?; + memfd.add_seal(memfd::FileSeal::SealShrink)?; + memfd.add_seal(memfd::FileSeal::SealWrite)?; + memfd.add_seal(memfd::FileSeal::SealSeal)?; + + memories.push(Some(Arc::new(MemoryMemFd { + fd: memfd, + offset: usize::try_from(offset).unwrap(), + len: usize::try_from(len).unwrap(), + }))); + } + + Ok(Some(Arc::new(ModuleMemFds { memories }))) + } +} diff --git a/crates/runtime/src/memory.rs b/crates/runtime/src/memory.rs index 07c0c619cc..894a8afd96 100644 --- a/crates/runtime/src/memory.rs +++ b/crates/runtime/src/memory.rs @@ -2,6 +2,7 @@ //! //! `RuntimeLinearMemory` is to WebAssembly linear memories what `Table` is to WebAssembly tables. +use crate::instance::MemFdSlot; use crate::mmap::Mmap; use crate::vmcontext::VMMemoryDefinition; use crate::Store; @@ -208,7 +209,11 @@ pub enum Memory { /// A callback which makes portions of `base` accessible for when memory /// is grown. Otherwise it's expected that accesses to `base` will /// fault. - make_accessible: fn(*mut u8, usize) -> Result<()>, + make_accessible: Option Result<()>>, + + /// The MemFdSlot, if any, for this memory. Owned here and + /// returned to the pooling allocator when termination occurs. + memfd_slot: Option, /// Stores the pages in the linear memory that have faulted as guard pages when using the `uffd` feature. /// These pages need their protection level reset before the memory can grow. @@ -236,7 +241,8 @@ impl Memory { pub fn new_static( plan: &MemoryPlan, base: &'static mut [u8], - make_accessible: fn(*mut u8, usize) -> Result<()>, + make_accessible: Option Result<()>>, + memfd_slot: Option, store: &mut dyn Store, ) -> Result { let (minimum, maximum) = Self::limit_new(plan, store)?; @@ -246,14 +252,17 @@ impl Memory { _ => base, }; - if minimum > 0 { - make_accessible(base.as_mut_ptr(), minimum)?; + if let Some(make_accessible) = make_accessible { + if minimum > 0 { + make_accessible(base.as_mut_ptr(), minimum)?; + } } Ok(Memory::Static { base, size: minimum, make_accessible, + memfd_slot, #[cfg(all(feature = "uffd", target_os = "linux"))] guard_page_faults: Vec::new(), }) @@ -373,6 +382,22 @@ impl Memory { } } + /// Returns whether or not this memory is backed by a MemFD + /// image. Note that this is testing whether there is actually an + /// *image* mapped, not just whether the MemFD mechanism is being + /// used. The distinction is important because if we are not using + /// a prevalidated and prepared image, we need to fall back to + /// ordinary initialization code. + pub(crate) fn is_memfd_with_image(&self) -> bool { + match self { + Memory::Static { + memfd_slot: Some(ref slot), + .. + } => slot.has_image(), + _ => false, + } + } + /// Grow memory by the specified amount of wasm pages. /// /// Returns `None` if memory can't be grown by the specified amount @@ -443,12 +468,33 @@ impl Memory { } match self { + Memory::Static { + base, + size, + memfd_slot: Some(ref mut memfd_slot), + .. + } => { + // Never exceed static memory size + if new_byte_size > base.len() { + store.memory_grow_failed(&format_err!("static memory size exceeded")); + return Ok(None); + } + + if let Err(e) = memfd_slot.set_heap_limit(new_byte_size) { + store.memory_grow_failed(&e); + return Ok(None); + } + *size = new_byte_size; + } Memory::Static { base, size, make_accessible, .. 
} => { + let make_accessible = make_accessible + .expect("make_accessible must be Some if this is not a MemFD memory"); + // Never exceed static memory size if new_byte_size > base.len() { store.memory_grow_failed(&format_err!("static memory size exceeded")); @@ -540,7 +586,8 @@ impl Default for Memory { Memory::Static { base: &mut [], size: 0, - make_accessible: |_, _| unreachable!(), + make_accessible: Some(|_, _| unreachable!()), + memfd_slot: None, #[cfg(all(feature = "uffd", target_os = "linux"))] guard_page_faults: Vec::new(), } diff --git a/crates/runtime/src/module_id.rs b/crates/runtime/src/module_id.rs new file mode 100644 index 0000000000..481a63e0bd --- /dev/null +++ b/crates/runtime/src/module_id.rs @@ -0,0 +1,28 @@ +//! Unique IDs for modules in the runtime. + +use std::sync::atomic::{AtomicU64, Ordering}; + +/// A unique identifier (within an engine or similar) for a compiled +/// module. +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct CompiledModuleId(u64); + +/// An allocator for compiled module IDs. +pub struct CompiledModuleIdAllocator { + next: AtomicU64, +} + +impl CompiledModuleIdAllocator { + /// Create a compiled-module ID allocator. + pub fn new() -> Self { + Self { + next: AtomicU64::new(1), + } + } + + /// Allocate a new ID. + pub fn alloc(&self) -> CompiledModuleId { + let id = self.next.fetch_add(1, Ordering::Relaxed); + CompiledModuleId(id) + } +} diff --git a/crates/runtime/src/traphandlers/unix.rs b/crates/runtime/src/traphandlers/unix.rs index cf41176cb7..fd16bfcdd1 100644 --- a/crates/runtime/src/traphandlers/unix.rs +++ b/crates/runtime/src/traphandlers/unix.rs @@ -51,9 +51,17 @@ pub unsafe fn platform_init() { register(&mut PREV_SIGFPE, libc::SIGFPE); } - // On ARM, handle Unaligned Accesses. - // On Darwin, guard page accesses are raised as SIGBUS. - if cfg!(target_arch = "arm") || cfg!(target_os = "macos") || cfg!(target_os = "freebsd") { + // Sometimes we need to handle SIGBUS too: + // - On ARM, handle Unaligned Accesses. + // - On Darwin, guard page accesses are raised as SIGBUS. + // - With the MemFD allocator, heap growth is controlled by + // ftruncate'ing an mmap'd file, and so out-of-bounds accesses + // are raised as SIGBUS. + if cfg!(target_arch = "arm") + || cfg!(target_os = "macos") + || cfg!(target_os = "freebsd") + || cfg!(feature = "memfd-allocator") + { register(&mut PREV_SIGBUS, libc::SIGBUS); } } diff --git a/crates/wasmtime/Cargo.toml b/crates/wasmtime/Cargo.toml index b5912ceb83..c7b0037d0e 100644 --- a/crates/wasmtime/Cargo.toml +++ b/crates/wasmtime/Cargo.toml @@ -89,3 +89,5 @@ all-arch = ["wasmtime-cranelift/all-arch"] # It is useful for applications that do not bind their own exception ports and # need portable signal handling. posix-signals-on-macos = ["wasmtime-runtime/posix-signals-on-macos"] + +memfd-allocator = ["wasmtime-runtime/memfd-allocator", "pooling-allocator"] \ No newline at end of file diff --git a/crates/wasmtime/src/engine.rs b/crates/wasmtime/src/engine.rs index 8a419c5170..48420ff492 100644 --- a/crates/wasmtime/src/engine.rs +++ b/crates/wasmtime/src/engine.rs @@ -7,7 +7,7 @@ use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::Arc; #[cfg(feature = "cache")] use wasmtime_cache::CacheConfig; -use wasmtime_runtime::{debug_builtins, InstanceAllocator}; +use wasmtime_runtime::{debug_builtins, CompiledModuleIdAllocator, InstanceAllocator}; /// An `Engine` which is a global context for compilation and management of wasm /// modules. 
@@ -43,6 +43,7 @@ struct EngineInner { allocator: Box, signatures: SignatureRegistry, epoch: AtomicU64, + unique_id_allocator: CompiledModuleIdAllocator, } impl Engine { @@ -68,6 +69,7 @@ impl Engine { allocator, signatures: registry, epoch: AtomicU64::new(0), + unique_id_allocator: CompiledModuleIdAllocator::new(), }), }) } @@ -153,6 +155,10 @@ impl Engine { self.inner.epoch.fetch_add(1, Ordering::Relaxed); } + pub(crate) fn unique_id_allocator(&self) -> &CompiledModuleIdAllocator { + &self.inner.unique_id_allocator + } + /// Ahead-of-time (AOT) compiles a WebAssembly module. /// /// The `bytes` provided must be in one of two formats: diff --git a/crates/wasmtime/src/instance.rs b/crates/wasmtime/src/instance.rs index aec6c1ba06..7f5b5e823d 100644 --- a/crates/wasmtime/src/instance.rs +++ b/crates/wasmtime/src/instance.rs @@ -651,7 +651,7 @@ impl<'a> Instantiator<'a> { artifacts, modules, &self.cur.modules, - ); + )?; self.cur.modules.push(submodule); } @@ -707,6 +707,7 @@ impl<'a> Instantiator<'a> { .allocator() .allocate(InstanceAllocationRequest { module: compiled_module.module().clone(), + memfds: self.cur.module.memfds().clone(), image_base: compiled_module.code().as_ptr() as usize, functions: compiled_module.functions(), imports: self.cur.build(), diff --git a/crates/wasmtime/src/module.rs b/crates/wasmtime/src/module.rs index 04c695f214..09c2d3f485 100644 --- a/crates/wasmtime/src/module.rs +++ b/crates/wasmtime/src/module.rs @@ -11,6 +11,7 @@ use std::sync::Arc; use wasmparser::{Parser, ValidPayload, Validator}; use wasmtime_environ::{ModuleEnvironment, ModuleIndex, PrimaryMap}; use wasmtime_jit::{CompiledModule, CompiledModuleInfo, MmapVec, TypeTables}; +use wasmtime_runtime::ModuleMemFds; mod registry; mod serialization; @@ -107,6 +108,8 @@ struct ModuleInner { types: Arc, /// Registered shared signature for the module. signatures: Arc, + /// a set of memfd images for memories, if any. 
+ memfds: Option>, } impl Module { @@ -336,7 +339,12 @@ impl Module { }; let modules = engine.run_maybe_parallel(artifacts, |(a, b)| { - CompiledModule::from_artifacts(a, b, &*engine.config().profiler) + CompiledModule::from_artifacts( + a, + b, + &*engine.config().profiler, + engine.unique_id_allocator(), + ) })?; Self::from_parts(engine, modules, main_module, Arc::new(types), &[]) @@ -523,6 +531,8 @@ impl Module { }) .collect::>>()?; + let memfds = ModuleMemFds::new(module.module(), module.wasm_data())?; + return Ok(Self { inner: Arc::new(ModuleInner { engine: engine.clone(), @@ -531,6 +541,7 @@ impl Module { artifact_upvars: modules, module_upvars, signatures, + memfds, }), }); @@ -543,11 +554,14 @@ impl Module { module_upvars: &[serialization::SerializedModuleUpvar], signatures: &Arc, ) -> Result { + let module = artifacts[module_index].clone(); + let memfds = ModuleMemFds::new(module.module(), module.wasm_data())?; Ok(Module { inner: Arc::new(ModuleInner { engine: engine.clone(), types: types.clone(), - module: artifacts[module_index].clone(), + module, + memfds, artifact_upvars: artifact_upvars .iter() .map(|i| artifacts[*i].clone()) @@ -666,12 +680,15 @@ impl Module { artifact_upvars: &[usize], module_upvars: &[wasmtime_environ::ModuleUpvar], modules: &PrimaryMap, - ) -> Module { - Module { + ) -> Result { + let module = self.inner.artifact_upvars[artifact_index].clone(); + let memfds = ModuleMemFds::new(module.module(), module.wasm_data())?; + Ok(Module { inner: Arc::new(ModuleInner { types: self.inner.types.clone(), engine: self.inner.engine.clone(), - module: self.inner.artifact_upvars[artifact_index].clone(), + module, + memfds, artifact_upvars: artifact_upvars .iter() .map(|i| self.inner.artifact_upvars[*i].clone()) @@ -687,7 +704,7 @@ impl Module { .collect(), signatures: self.inner.signatures.clone(), }), - } + }) } pub(crate) fn compiled_module(&self) -> &Arc { @@ -706,6 +723,10 @@ impl Module { &self.inner.signatures } + pub(crate) fn memfds(&self) -> &Option> { + &self.inner.memfds + } + /// Looks up the module upvar value at the `index` specified. 
/// /// Note that this panics if `index` is out of bounds since this should diff --git a/crates/wasmtime/src/module/serialization.rs b/crates/wasmtime/src/module/serialization.rs index 740d1eab92..cb643d795d 100644 --- a/crates/wasmtime/src/module/serialization.rs +++ b/crates/wasmtime/src/module/serialization.rs @@ -274,7 +274,12 @@ impl<'a> SerializedModule<'a> { pub fn into_module(self, engine: &Engine) -> Result { let (main_module, modules, types, upvars) = self.into_parts(engine)?; let modules = engine.run_maybe_parallel(modules, |(i, m)| { - CompiledModule::from_artifacts(i, m, &*engine.config().profiler) + CompiledModule::from_artifacts( + i, + m, + &*engine.config().profiler, + engine.unique_id_allocator(), + ) })?; Module::from_parts(engine, modules, main_module, Arc::new(types), &upvars) diff --git a/crates/wasmtime/src/store.rs b/crates/wasmtime/src/store.rs index c6d7914e47..362fb59848 100644 --- a/crates/wasmtime/src/store.rs +++ b/crates/wasmtime/src/store.rs @@ -421,11 +421,13 @@ impl Store { shared_signatures: None.into(), imports: Default::default(), module: Arc::new(wasmtime_environ::Module::default()), + memfds: None, store: StorePtr::empty(), wasm_data: &[], }) .expect("failed to allocate default callee") }; + let mut inner = Box::new(StoreInner { inner: StoreOpaque { _marker: marker::PhantomPinned, diff --git a/crates/wasmtime/src/trampoline.rs b/crates/wasmtime/src/trampoline.rs index c1f8038a5a..790cbf9ef9 100644 --- a/crates/wasmtime/src/trampoline.rs +++ b/crates/wasmtime/src/trampoline.rs @@ -41,6 +41,7 @@ fn create_handle( let handle = OnDemandInstanceAllocator::new(config.mem_creator.clone(), 0).allocate( InstanceAllocationRequest { module: Arc::new(module), + memfds: None, functions, image_base: 0, imports, diff --git a/crates/wasmtime/src/trampoline/func.rs b/crates/wasmtime/src/trampoline/func.rs index 67d57fc334..47513f83cf 100644 --- a/crates/wasmtime/src/trampoline/func.rs +++ b/crates/wasmtime/src/trampoline/func.rs @@ -161,6 +161,7 @@ pub unsafe fn create_raw_function( Ok( OnDemandInstanceAllocator::default().allocate(InstanceAllocationRequest { module: Arc::new(module), + memfds: None, functions: &functions, image_base: (*func).as_ptr() as usize, imports: Imports::default(), diff --git a/src/lib.rs b/src/lib.rs index fb43affad1..b3cb8961f7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -100,6 +100,8 @@ use std::collections::HashMap; use std::path::PathBuf; use structopt::StructOpt; use wasmtime::{Config, ProfilingStrategy}; +#[cfg(feature = "pooling-allocator")] +use wasmtime::{InstanceLimits, ModuleLimits, PoolingAllocationStrategy}; fn pick_profiling_strategy(jitdump: bool, vtune: bool) -> Result { Ok(match (jitdump, vtune) { @@ -250,6 +252,12 @@ struct CommonOptions { /// the data segments specified in the original wasm module. #[structopt(long)] paged_memory_initialization: bool, + + /// Enables the pooling allocator, in place of the on-demand + /// allocator. 
+ #[cfg(feature = "pooling-allocator")] + #[structopt(long)] + pooling_allocator: bool, } impl CommonOptions { @@ -325,6 +333,23 @@ impl CommonOptions { config.generate_address_map(!self.disable_address_map); config.paged_memory_initialization(self.paged_memory_initialization); + #[cfg(feature = "pooling-allocator")] + { + if self.pooling_allocator { + let mut module_limits = ModuleLimits::default(); + module_limits.functions = 50000; + module_limits.types = 10000; + module_limits.globals = 1000; + module_limits.memory_pages = 2048; + let instance_limits = InstanceLimits::default(); + config.allocation_strategy(wasmtime::InstanceAllocationStrategy::Pooling { + strategy: PoolingAllocationStrategy::NextAvailable, + module_limits, + instance_limits, + }); + } + } + Ok(config) }