From b73ac83c375f953e9433021343f3c85f15beff58 Mon Sep 17 00:00:00 2001
From: Chris Fallin
Date: Tue, 18 Jan 2022 16:42:24 -0800
Subject: [PATCH 01/12] Add a pooling allocator mode based on copy-on-write mappings of memfds.

As first suggested by Jan on Zulip [1], a cheap and effective way to
obtain copy-on-write semantics of a "backing image" for a Wasm memory
is to mmap a file with `MAP_PRIVATE`. The `memfd` mechanism provided
by the Linux kernel allows us to create anonymous, in-memory-only
files for this mapping, so we can construct the image contents on the
fly and then effectively create a CoW overlay. Furthermore, and
importantly, `madvise(MADV_DONTNEED, ...)` discards the CoW overlay,
returning the mapping to its original state.

By itself this is almost enough for a very fast instantiate-terminate
loop of the same image over and over, without changing the
address-space mapping at all (which is expensive). The only missing
piece is heap *growth*. Here memfds help us again: if we create
another anonymous file and map it where the extended parts of the
heap would go, we can exploit two facts: an `mmap()` mapping may be
*larger than the file itself*, with accesses beyond the file's end
generating `SIGBUS`, and the file can be cheaply resized with
`ftruncate`, even after the mapping exists. So we map the "heap
extension" file once, at the maximum memory-slot size, and grow the
memfd itself as `memory.grow` operations occur.

Together, the CoW technique and the heap-growth technique give us a
fast path that needs only `madvise()` and `ftruncate()` when we
re-instantiate the same module over and over, as long as we can reuse
the same slot. This fast path avoids all whole-process address-space
locks in the Linux kernel, which should make it highly scalable. It
also avoids the cost of copying data when servicing page faults,
which the `uffd` heap backend pays; the kernel's own optimized CoW
logic (the same logic used by all file mmaps) is used instead.
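To make the core trick concrete, the following is a minimal,
standalone sketch (not part of this patch) of the mmap-beyond-EOF
growth and the madvise-based reset described above. It reuses the
same `memfd`, `rustix`, and `anyhow` crate APIs that the new
allocator code below uses; the slot size and the names here are
illustrative only.

    use rustix::fd::AsRawFd;

    fn main() -> anyhow::Result<()> {
        // Maximum size of one heap slot; illustrative value only.
        const SLOT_SIZE: usize = 1 << 30;

        // An anonymous, in-memory-only file; it starts at length 0.
        let memfd = memfd::MemfdOptions::new().create("demo-heap")?;
        let file = memfd.into_file();

        // One CoW (MAP_PRIVATE) mapping for the whole slot, larger than
        // the file itself. Accesses past the current file length SIGBUS.
        let base = unsafe {
            let fd = rustix::fd::BorrowedFd::borrow_raw_fd(file.as_raw_fd());
            rustix::io::mmap(
                std::ptr::null_mut(),
                SLOT_SIZE,
                rustix::io::ProtFlags::READ | rustix::io::ProtFlags::WRITE,
                rustix::io::MapFlags::PRIVATE,
                &fd,
                0,
            )?
        };

        // "Grow" the heap to 64 KiB: a single ftruncate, no mmap/mprotect.
        file.set_len(64 * 1024)?;
        unsafe { *(base as *mut u8) = 42 }; // now accessible, CoW-private

        // Reset for the next instantiation: throw away dirty CoW pages and
        // shrink the accessible region to zero; the mapping never changes.
        unsafe {
            rustix::io::madvise(base, SLOT_SIZE, rustix::io::Advice::LinuxDontNeed)?;
        }
        file.set_len(0)?;
        Ok(())
    }

The per-slot `MemFdSlot` added below layers one more mapping on top
of this scheme: a CoW view of a prepared memfd holding the module's
initial heap image.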
[1] https://bytecodealliance.zulipchat.com/#narrow/stream/206238-general/topic/Copy.20on.20write.20based.20instance.20reuse/near/266657772 --- .github/workflows/main.yml | 5 +- Cargo.lock | 10 + Cargo.toml | 2 + crates/environ/src/module.rs | 23 ++ crates/jit/src/instantiate.rs | 15 +- crates/runtime/Cargo.toml | 3 + crates/runtime/src/instance.rs | 23 ++ crates/runtime/src/instance/allocator.rs | 63 ++-- .../runtime/src/instance/allocator/memfd.rs | 290 ++++++++++++++++++ .../src/instance/allocator/memfd_disabled.rs | 49 +++ .../runtime/src/instance/allocator/pooling.rs | 260 ++++++++++------ .../src/instance/allocator/pooling/uffd.rs | 1 + crates/runtime/src/lib.rs | 44 +++ crates/runtime/src/memfd.rs | 236 ++++++++++++++ crates/runtime/src/memory.rs | 57 +++- crates/runtime/src/module_id.rs | 28 ++ crates/runtime/src/traphandlers/unix.rs | 14 +- crates/wasmtime/Cargo.toml | 2 + crates/wasmtime/src/engine.rs | 8 +- crates/wasmtime/src/instance.rs | 3 +- crates/wasmtime/src/module.rs | 33 +- crates/wasmtime/src/module/serialization.rs | 7 +- crates/wasmtime/src/store.rs | 2 + crates/wasmtime/src/trampoline.rs | 1 + crates/wasmtime/src/trampoline/func.rs | 1 + src/lib.rs | 25 ++ 26 files changed, 1070 insertions(+), 135 deletions(-) create mode 100644 crates/runtime/src/instance/allocator/memfd.rs create mode 100644 crates/runtime/src/instance/allocator/memfd_disabled.rs create mode 100644 crates/runtime/src/memfd.rs create mode 100644 crates/runtime/src/module_id.rs diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index e89e33e165..5e5e0c64d9 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -136,6 +136,7 @@ jobs: - run: cargo check -p wasmtime --no-default-features --features async - run: cargo check -p wasmtime --no-default-features --features uffd - run: cargo check -p wasmtime --no-default-features --features pooling-allocator + - run: cargo check -p wasmtime --no-default-features --features memfd-allocator - run: cargo check -p wasmtime --no-default-features --features cranelift - run: cargo check -p wasmtime --no-default-features --features cranelift,wat,async,cache @@ -310,11 +311,13 @@ jobs: env: RUST_BACKTRACE: 1 - # Test uffd functionality on Linux + # Test Linux-specific functionality - run: | cargo test --features uffd -p wasmtime-runtime instance::allocator::pooling cargo test --features uffd -p wasmtime-cli pooling_allocator cargo test --features uffd -p wasmtime-cli wast::Cranelift + cargo test --features memfd-allocator -p wasmtime-cli pooling_allocator + cargo test --features memfd-allocator -p wasmtime-cli wast::Cranelift if: matrix.os == 'ubuntu-latest' && matrix.target == '' env: RUST_BACKTRACE: 1 diff --git a/Cargo.lock b/Cargo.lock index 6362f3b413..dbe67fe9c7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1602,6 +1602,15 @@ version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a" +[[package]] +name = "memfd" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6627dc657574b49d6ad27105ed671822be56e0d2547d413bfbf3e8d8fa92e7a" +dependencies = [ + "libc", +] + [[package]] name = "memmap2" version = "0.2.3" @@ -3587,6 +3596,7 @@ dependencies = [ "libc", "log", "mach", + "memfd", "memoffset", "more-asserts", "rand 0.8.3", diff --git a/Cargo.toml b/Cargo.toml index 748cb801da..51c4843fcc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -95,6 +95,8 @@ vtune = ["wasmtime/vtune"] 
 wasi-crypto = ["wasmtime-wasi-crypto"]
 wasi-nn = ["wasmtime-wasi-nn"]
 uffd = ["wasmtime/uffd"]
+pooling-allocator = ["wasmtime/pooling-allocator"]
+memfd-allocator = ["pooling-allocator", "wasmtime/memfd-allocator"]
 all-arch = ["wasmtime/all-arch"]
 posix-signals-on-macos = ["wasmtime/posix-signals-on-macos"]
diff --git a/crates/environ/src/module.rs b/crates/environ/src/module.rs
index 8b05e2eb1c..d941801658 100644
--- a/crates/environ/src/module.rs
+++ b/crates/environ/src/module.rs
@@ -95,6 +95,19 @@ impl MemoryPlan {
             },
         }
     }
+
+    /// Determine whether a data segment (memory initializer) is
+    /// possibly out-of-bounds. Returns `true` if the initializer has a
+    /// dynamic location and this question cannot be resolved
+    /// pre-instantiation; hence, this method's result should not be
+    /// used to signal an error, only to exit optimized/simple fastpaths.
+    pub fn initializer_possibly_out_of_bounds(&self, init: &MemoryInitializer) -> bool {
+        match init.end() {
+            // Not statically known, so possibly out of bounds (we can't guarantee in-bounds).
+            None => true,
+            Some(end) => end > self.memory.minimum * (WASM_PAGE_SIZE as u64),
+        }
+    }
 }
 
 /// A WebAssembly linear memory initializer.
@@ -113,6 +126,16 @@ pub struct MemoryInitializer {
     pub data: Range<u32>,
 }
 
+impl MemoryInitializer {
+    /// If this initializer has a definite, static, non-overflowed end address, return it.
+    pub fn end(&self) -> Option<u64> {
+        if self.base.is_some() {
+            return None;
+        }
+        self.offset.checked_add(self.data.len() as u64)
+    }
+}
+
 /// The type of WebAssembly linear memory initialization to use for a module.
 #[derive(Clone, Debug, Serialize, Deserialize)]
 pub enum MemoryInitialization {
diff --git a/crates/jit/src/instantiate.rs b/crates/jit/src/instantiate.rs
index cc6a3844d1..6a41160070 100644
--- a/crates/jit/src/instantiate.rs
+++ b/crates/jit/src/instantiate.rs
@@ -19,7 +19,10 @@ use wasmtime_environ::{
     StackMapInformation, Trampoline, Tunables, WasmFuncType, ELF_WASMTIME_ADDRMAP,
     ELF_WASMTIME_TRAPS,
 };
-use wasmtime_runtime::{GdbJitImageRegistration, InstantiationError, VMFunctionBody, VMTrampoline};
+use wasmtime_runtime::{
+    CompiledModuleId, CompiledModuleIdAllocator, GdbJitImageRegistration, InstantiationError,
+    VMFunctionBody, VMTrampoline,
+};
 
 /// This is the name of the section in the final ELF image which contains
 /// concatenated data segments from the original wasm module.
@@ -248,6 +251,8 @@ pub struct CompiledModule {
     code: Range<usize>,
     code_memory: CodeMemory,
     dbg_jit_registration: Option<GdbJitImageRegistration>,
+    /// A unique ID used to register this module with the engine.
+    unique_id: CompiledModuleId,
 }
 
 impl CompiledModule {
@@ -271,6 +276,7 @@ impl CompiledModule {
         mmap: MmapVec,
         info: Option<CompiledModuleInfo>,
         profiler: &dyn ProfilingAgent,
+        id_allocator: &CompiledModuleIdAllocator,
     ) -> Result<Arc<Self>> {
         // Transfer ownership of `obj` to a `CodeMemory` object which will
         // manage permissions, such as the executable bit. Once it's located
@@ -312,6 +318,7 @@ impl CompiledModule {
             dbg_jit_registration: None,
             code_memory,
             meta: info.meta,
+            unique_id: id_allocator.alloc(),
         };
 
         ret.register_debug_and_profiling(profiler)?;
@@ -333,6 +340,12 @@ impl CompiledModule {
         Ok(())
     }
 
+    /// Get this module's unique ID. It is unique with respect to a
+    /// single allocator (which is ordinarily held on a Wasm engine).
+    pub fn unique_id(&self) -> CompiledModuleId {
+        self.unique_id
+    }
+
     /// Returns the underlying memory which contains the compiled module's
     /// image.
pub fn mmap(&self) -> &MmapVec { diff --git a/crates/runtime/Cargo.toml b/crates/runtime/Cargo.toml index 827439d1d5..aaef30f677 100644 --- a/crates/runtime/Cargo.toml +++ b/crates/runtime/Cargo.toml @@ -37,6 +37,7 @@ winapi = { version = "0.3.7", features = ["winbase", "memoryapi", "errhandlingap [target.'cfg(target_os = "linux")'.dependencies] userfaultfd = { version = "0.4.1", optional = true } +memfd = { version = "0.4.1", optional = true } [build-dependencies] cc = "1.0" @@ -59,3 +60,5 @@ uffd = ["userfaultfd", "pooling-allocator"] # It is useful for applications that do not bind their own exception ports and # need portable signal handling. posix-signals-on-macos = [] + +memfd-allocator = ["pooling-allocator", "memfd"] diff --git a/crates/runtime/src/instance.rs b/crates/runtime/src/instance.rs index 2c9487d75e..9c56dfb222 100644 --- a/crates/runtime/src/instance.rs +++ b/crates/runtime/src/instance.rs @@ -97,6 +97,29 @@ pub(crate) struct Instance { #[allow(clippy::cast_ptr_alignment)] impl Instance { + /// Helper for allocators; not a public API. + pub(crate) fn create_raw( + module: &Arc, + wasm_data: &'static [u8], + memories: PrimaryMap, + tables: PrimaryMap, + host_state: Box, + ) -> Instance { + Instance { + module: module.clone(), + offsets: VMOffsets::new(HostPtr, &module), + memories, + tables, + dropped_elements: EntitySet::with_capacity(module.passive_elements.len()), + dropped_data: EntitySet::with_capacity(module.passive_data_map.len()), + host_state, + wasm_data, + vmctx: VMContext { + _marker: std::marker::PhantomPinned, + }, + } + } + /// Helper function to access various locations offset from our `*mut /// VMContext` object. unsafe fn vmctx_plus_offset(&self, offset: u32) -> *mut T { diff --git a/crates/runtime/src/instance/allocator.rs b/crates/runtime/src/instance/allocator.rs index 82c1eec31e..12fa88ddc8 100644 --- a/crates/runtime/src/instance/allocator.rs +++ b/crates/runtime/src/instance/allocator.rs @@ -4,28 +4,37 @@ use crate::memory::{DefaultMemoryCreator, Memory}; use crate::table::Table; use crate::traphandlers::Trap; use crate::vmcontext::{ - VMBuiltinFunctionsArray, VMCallerCheckedAnyfunc, VMContext, VMGlobalDefinition, - VMSharedSignatureIndex, + VMBuiltinFunctionsArray, VMCallerCheckedAnyfunc, VMGlobalDefinition, VMSharedSignatureIndex, }; +use crate::ModuleMemFds; use crate::Store; use anyhow::Result; use std::alloc; use std::any::Any; use std::convert::TryFrom; -use std::marker; use std::ptr::{self, NonNull}; use std::slice; use std::sync::Arc; use thiserror::Error; use wasmtime_environ::{ - DefinedFuncIndex, DefinedMemoryIndex, DefinedTableIndex, EntityRef, EntitySet, FunctionInfo, - GlobalInit, HostPtr, MemoryInitialization, MemoryInitializer, Module, ModuleType, PrimaryMap, - SignatureIndex, TableInitializer, TrapCode, VMOffsets, WasmType, WASM_PAGE_SIZE, + DefinedFuncIndex, DefinedMemoryIndex, DefinedTableIndex, EntityRef, FunctionInfo, GlobalInit, + MemoryInitialization, MemoryInitializer, Module, ModuleType, PrimaryMap, SignatureIndex, + TableInitializer, TrapCode, WasmType, WASM_PAGE_SIZE, }; #[cfg(feature = "pooling-allocator")] mod pooling; +#[cfg(feature = "memfd-allocator")] +mod memfd; +#[cfg(feature = "memfd-allocator")] +pub use self::memfd::MemFdSlot; + +#[cfg(not(feature = "memfd-allocator"))] +mod memfd_disabled; +#[cfg(not(feature = "memfd-allocator"))] +pub use self::memfd_disabled::MemFdSlot; + #[cfg(feature = "pooling-allocator")] pub use self::pooling::{ InstanceLimits, ModuleLimits, PoolingAllocationStrategy, 
PoolingInstanceAllocator, @@ -39,6 +48,9 @@ pub struct InstanceAllocationRequest<'a> { /// The base address of where JIT functions are located. pub image_base: usize, + /// If using MemFD-based memories, the backing MemFDs. + pub memfds: Option>, + /// Descriptors about each compiled function, such as the offset from /// `image_base`. pub functions: &'a PrimaryMap, @@ -376,9 +388,23 @@ fn check_memory_init_bounds( fn initialize_memories( instance: &mut Instance, + module: &Module, initializers: &[MemoryInitializer], ) -> Result<(), InstantiationError> { for init in initializers { + // Check whether this is a MemFD memory; if so, we can skip + // all initializers. + let memory = init.memory_index; + if let Some(defined_index) = module.defined_memory_index(memory) { + // We can only skip if there is actually a MemFD image. In + // some situations the MemFD image creation code will bail + // (e.g. due to an out of bounds data segment) and so we + // need to fall back on the usual initialization below. + if instance.memories[defined_index].is_memfd_with_image() { + continue; + } + } + instance .memory_init_segment( init.memory_index, @@ -432,6 +458,14 @@ fn initialize_instance( match &module.memory_initialization { MemoryInitialization::Paged { map, out_of_bounds } => { for (index, pages) in map { + // We can only skip if there is actually a MemFD image. In + // some situations the MemFD image creation code will bail + // (e.g. due to an out of bounds data segment) and so we + // need to fall back on the usual initialization below. + if instance.memories[index].is_memfd_with_image() { + continue; + } + let memory = instance.memory(index); let slice = unsafe { slice::from_raw_parts_mut(memory.base, memory.current_length) }; @@ -453,7 +487,7 @@ fn initialize_instance( } } MemoryInitialization::Segmented(initializers) => { - initialize_memories(instance, initializers)?; + initialize_memories(instance, module, initializers)?; } } @@ -691,19 +725,8 @@ unsafe impl InstanceAllocator for OnDemandInstanceAllocator { let host_state = std::mem::replace(&mut req.host_state, Box::new(())); let mut handle = { - let instance = Instance { - module: req.module.clone(), - offsets: VMOffsets::new(HostPtr, &req.module), - memories, - tables, - dropped_elements: EntitySet::with_capacity(req.module.passive_elements.len()), - dropped_data: EntitySet::with_capacity(req.module.passive_data_map.len()), - host_state, - wasm_data: &*req.wasm_data, - vmctx: VMContext { - _marker: marker::PhantomPinned, - }, - }; + let instance = + Instance::create_raw(&req.module, &*req.wasm_data, memories, tables, host_state); let layout = instance.alloc_layout(); let instance_ptr = alloc::alloc(layout) as *mut Instance; if instance_ptr.is_null() { diff --git a/crates/runtime/src/instance/allocator/memfd.rs b/crates/runtime/src/instance/allocator/memfd.rs new file mode 100644 index 0000000000..8713794824 --- /dev/null +++ b/crates/runtime/src/instance/allocator/memfd.rs @@ -0,0 +1,290 @@ +//! memfd mapping logic for use by the pooling allocator. + +use crate::memfd::MemoryMemFd; +use crate::InstantiationError; +use anyhow::Result; +use libc::c_void; +use rustix::fd::AsRawFd; +use std::convert::TryFrom; +use std::fs::File; +use std::sync::Arc; + +/// A single slot handled by the memfd instance-heap mechanism. 
+/// +/// The mmap scheme is: +/// +/// base ==> (points here) +/// - (image.offset bytes) anonymous zero memory, pre-image +/// - (image.len bytes) CoW mapping of memfd heap image +/// - (up to extension_offset) anonymous zero memory, post-image +/// - (up to static_size) heap expansion region; CoW mapping of per-slot memfd +/// +/// The ordering of mmaps to set this up is: +/// +/// - once, when pooling allocator is created: +/// - one large mmap to create 8GiB * instances * memories slots +/// +/// - per instantiation of new image in a slot: +/// - mmap of anonymous zero memory, from 0 to initial heap size +/// - mmap of CoW'd memfd image, from `image.offset` to +/// `image.offset + image.len`. This overwrites part of the +/// anonymous zero memory, potentially splitting it into a pre- +/// and post-region. +/// - mmap of CoW'd extension file, past the initial heap size up to +/// the end of the max memory size (just before the +/// post-guard). This is always adjacent to the above mmaps, but +/// does not overlap/overwrite them. +#[derive(Debug)] +pub struct MemFdSlot { + /// The base of the actual heap memory. Bytes at this address are + /// what is seen by the Wasm guest code. + base: usize, + /// The maximum static memory size, plus post-guard. + static_size: usize, + /// The memfd image that backs this memory. May be `None`, in + /// which case the memory is all zeroes. + pub(crate) image: Option>, + /// The offset at which the "extension file", which is used to + /// allow for efficient heap growth, is mapped. This is always + /// immediately after the end of the initial memory size. + extension_offset: usize, + /// The anonymous memfd, owned by this slot, which we mmap in the + /// area where the heap may grow during runtime. We use the + /// ftruncate() syscall (invoked via `File::set_len()`) to set its + /// size. We never write any data to it -- we CoW-map it so we can + /// throw away dirty data on termination. Instead, we just use its + /// size as a "watermark" that delineates the boundary between + /// safe-to-access memory and SIGBUS-causing memory. (This works + /// because one can mmap a file beyond its end, and is good + /// because ftruncate does not take the process-wide lock that + /// mmap and mprotect do.) + extension_file: File, + /// Whether this slot may have "dirty" pages (pages written by an + /// instantiation). Set by `instantiate()` and cleared by + /// `clear_and_remain_ready()`, and used in assertions to ensure + /// those methods are called properly. + dirty: bool, +} + +impl MemFdSlot { + pub(crate) fn create( + base_addr: *mut c_void, + static_size: usize, + ) -> Result { + let base = base_addr as usize; + + // Create a MemFD for the memory growth first -- this covers + // extended heap beyond the initial image. + let extension_memfd = memfd::MemfdOptions::new() + .allow_sealing(true) + .create("wasm-anonymous-heap") + .map_err(|e| InstantiationError::Resource(e.into()))?; + // Seal the ability to write the extension file (make it + // permanently read-only). This is a defense-in-depth + // mitigation to make extra-sure that we don't leak + // information between instantiations. See note in `memfd.rs` + // for more about why we use seals. 
+ extension_memfd + .add_seal(memfd::FileSeal::SealWrite) + .map_err(|e| InstantiationError::Resource(e.into()))?; + extension_memfd + .add_seal(memfd::FileSeal::SealSeal) + .map_err(|e| InstantiationError::Resource(e.into()))?; + let extension_file = extension_memfd.into_file(); + extension_file + .set_len(0) + .map_err(|e| InstantiationError::Resource(e.into()))?; + + Ok(MemFdSlot { + base, + static_size, + image: None, + extension_file, + extension_offset: 0, + dirty: false, + }) + } + + pub(crate) fn set_heap_limit(&mut self, size_bytes: usize) -> Result<()> { + assert!(size_bytes >= self.extension_offset); + // This is all that is needed to make the new memory + // accessible; we don't need to mprotect anything. (The + // mapping itself is always R+W for the max possible heap + // size, and only the anonymous-backing file length catches + // out-of-bounds accesses.) + self.extension_file + .set_len(u64::try_from(size_bytes - self.extension_offset).unwrap())?; + Ok(()) + } + + pub(crate) fn instantiate( + &mut self, + initial_size_bytes: usize, + maybe_image: Option<&Arc>, + ) -> Result<(), InstantiationError> { + assert!(!self.dirty); + + if let Some(existing_image) = &self.image { + // Fast-path: previously instantiated with the same image, + // so the mappings are already correct; there is no need + // to mmap anything. Given that we asserted not-dirty + // above, any dirty pages will have already been thrown + // away by madvise() during the previous termination. + if let Some(image) = maybe_image { + if existing_image.fd.as_file().as_raw_fd() == image.fd.as_file().as_raw_fd() { + self.dirty = true; + return Ok(()); + } + } + } + + // Otherwise, we need to redo (i) the anonymous-mmap backing + // for the initial heap size, (ii) the extension-file backing, + // and (iii) the initial-heap-image mapping if present. + + // Security/audit note: we map all of these MAP_PRIVATE, so + // all instance data is local to the mapping, not propagated + // to the backing fd. We throw away this CoW overlay with + // madvise() below, from base up to extension_offset (which is + // at least initial_size_bytes, and extended when the + // extension file is, so it covers all three mappings) when + // terminating the instance. + + // Anonymous mapping behind the initial heap size: this gives + // zeroes for any "holes" in the initial heap image. Anonymous + // mmap memory is faster to fault in than a CoW of a file, + // even a file with zero holes, because the kernel's CoW path + // unconditionally copies *something* (even if just a page of + // zeroes). Anonymous zero pages are fast: the kernel + // pre-zeroes them, and even if it runs out of those, a memset + // is half as expensive as a memcpy (only writes, no reads). + if initial_size_bytes > 0 { + unsafe { + let ptr = rustix::io::mmap_anonymous( + self.base as *mut c_void, + initial_size_bytes, + rustix::io::ProtFlags::READ | rustix::io::ProtFlags::WRITE, + rustix::io::MapFlags::PRIVATE | rustix::io::MapFlags::FIXED, + ) + .map_err(|e| InstantiationError::Resource(e.into()))?; + assert_eq!(ptr as usize, self.base); + } + } + + // An "extension file": this allows us to grow the heap by + // doing just an ftruncate(), without changing any + // mappings. This is important to avoid the process-wide mmap + // lock on Linux. 
+ self.extension_offset = initial_size_bytes; + let extension_map_len = self.static_size - initial_size_bytes; + if extension_map_len > 0 { + unsafe { + let fd = rustix::fd::BorrowedFd::borrow_raw_fd(self.extension_file.as_raw_fd()); + let ptr = rustix::io::mmap( + (self.base + initial_size_bytes) as *mut c_void, + extension_map_len, + rustix::io::ProtFlags::READ | rustix::io::ProtFlags::WRITE, + rustix::io::MapFlags::PRIVATE | rustix::io::MapFlags::FIXED, + &fd, + 0, + ) + .map_err(|e| InstantiationError::Resource(e.into()))?; + assert_eq!(ptr as usize, self.base + initial_size_bytes); + } + } + + // Finally, the initial memory image. + if let Some(image) = maybe_image { + if image.len > 0 { + let image = image.clone(); + + unsafe { + let fd = rustix::fd::BorrowedFd::borrow_raw_fd(image.fd.as_file().as_raw_fd()); + let ptr = rustix::io::mmap( + (self.base + image.offset) as *mut c_void, + image.len, + rustix::io::ProtFlags::READ | rustix::io::ProtFlags::WRITE, + rustix::io::MapFlags::PRIVATE | rustix::io::MapFlags::FIXED, + &fd, + image.offset as u64, + ) + .map_err(|e| InstantiationError::Resource(e.into()))?; + assert_eq!(ptr as usize, self.base + image.offset); + } + + self.image = Some(image); + } + } + + self.dirty = true; + Ok(()) + } + + pub(crate) fn clear_and_remain_ready(&mut self) -> Result<()> { + assert!(self.dirty); + // madvise the image range; that's it! This will throw away + // dirty pages, which are CoW-private pages on top of the + // initial heap image memfd. + unsafe { + rustix::io::madvise( + self.base as *mut c_void, + self.extension_offset, + rustix::io::Advice::LinuxDontNeed, + )?; + } + + // truncate the extension file down to zero bytes to reset heap length. + self.extension_file + .set_len(0) + .map_err(|e| InstantiationError::Resource(e.into()))?; + self.dirty = false; + Ok(()) + } + + pub(crate) fn has_image(&self) -> bool { + self.image.is_some() + } + + pub(crate) fn is_dirty(&self) -> bool { + self.dirty + } +} + +#[cfg(feature = "memfd-allocator")] +impl Drop for MemFdSlot { + fn drop(&mut self) { + // The MemFdSlot may be dropped if there is an error during + // instantiation: for example, if a memory-growth limiter + // disallows a guest from having a memory of a certain size, + // after we've already initialized the MemFdSlot. + // + // We need to return this region of the large pool mmap to a + // safe state (with no module-specific mappings). The + // MemFdSlot will not be returned to the MemoryPool, so a new + // MemFdSlot will be created and overwrite the mappings anyway + // on the slot's next use; but for safety and to avoid + // resource leaks it's better not to have stale mappings to a + // possibly-otherwise-dead module's image. + // + // To "wipe the slate clean", let's do a mmap of anonymous + // memory over the whole region, with PROT_NONE. Note that we + // *can't* simply munmap, because that leaves a hole in the + // middle of the pooling allocator's big memory area that some + // other random mmap may swoop in and take, to be trampled + // over by the next MemFdSlot later. + // + // Since we're in drop(), we can't sanely return an error if + // this mmap fails. Let's ignore the failure if so; the next + // MemFdSlot to be created for this slot will try to overwrite + // the existing stale mappings, and return a failure properly + // if we still cannot map new memory. 
+ unsafe { + let _ = rustix::io::mmap_anonymous( + self.base as *mut _, + self.static_size, + rustix::io::ProtFlags::empty(), + rustix::io::MapFlags::FIXED | rustix::io::MapFlags::NORESERVE, + ); + } + } +} diff --git a/crates/runtime/src/instance/allocator/memfd_disabled.rs b/crates/runtime/src/instance/allocator/memfd_disabled.rs new file mode 100644 index 0000000000..9c87591bd5 --- /dev/null +++ b/crates/runtime/src/instance/allocator/memfd_disabled.rs @@ -0,0 +1,49 @@ +//! Shims for MemFdSlot when the memfd allocator is not +//! included. Enables unconditional use of the type and its methods +//! throughout higher-level code. + +use crate::InstantiationError; +use anyhow::Result; +use std::sync::Arc; + +/// A placeholder for MemFdSlot when we have not included the pooling +/// allocator. +/// +/// To allow MemFdSlot to be unconditionally passed around in various +/// places (e.g. a `Memory`), we define a zero-sized type when memfd is +/// not included in the build. +#[cfg(not(feature = "memfd-allocator"))] +#[derive(Debug)] +pub struct MemFdSlot; + +#[cfg(not(feature = "memfd-allocator"))] +#[allow(dead_code)] +impl MemFdSlot { + pub(crate) fn create(_: *mut libc::c_void, _: usize) -> Result { + panic!("create() on invalid MemFdSlot"); + } + + pub(crate) fn instantiate( + &mut self, + _: usize, + _: Option<&Arc>, + ) -> Result { + panic!("instantiate() on invalid MemFdSlot"); + } + + pub(crate) fn clear_and_remain_ready(&mut self) -> Result<()> { + Ok(()) + } + + pub(crate) fn has_image(&self) -> bool { + false + } + + pub(crate) fn is_dirty(&self) -> bool { + false + } + + pub(crate) fn set_heap_limit(&mut self, _: usize) -> Result<()> { + panic!("set_heap_limit on invalid MemFdSlot"); + } +} diff --git a/crates/runtime/src/instance/allocator/pooling.rs b/crates/runtime/src/instance/allocator/pooling.rs index 76614137d5..6aa291d7a9 100644 --- a/crates/runtime/src/instance/allocator/pooling.rs +++ b/crates/runtime/src/instance/allocator/pooling.rs @@ -7,19 +7,21 @@ //! Using the pooling instance allocator can speed up module instantiation //! when modules can be constrained based on configurable limits. 
+use super::MemFdSlot; use super::{ initialize_instance, initialize_vmcontext, InstanceAllocationRequest, InstanceAllocator, InstanceHandle, InstantiationError, }; -use crate::{instance::Instance, Memory, Mmap, Table, VMContext}; +use crate::{instance::Instance, Memory, Mmap, ModuleMemFds, Table}; use anyhow::{anyhow, bail, Context, Result}; +use libc::c_void; use rand::Rng; use std::convert::TryFrom; -use std::marker; use std::mem; -use std::sync::{Arc, Mutex}; +use std::sync::Arc; +use std::sync::Mutex; use wasmtime_environ::{ - EntitySet, HostPtr, MemoryStyle, Module, PrimaryMap, Tunables, VMOffsets, VMOffsetsFields, + HostPtr, MemoryIndex, MemoryStyle, Module, PrimaryMap, Tunables, VMOffsets, VMOffsetsFields, WASM_PAGE_SIZE, }; @@ -284,7 +286,6 @@ struct InstancePool { free_list: Mutex>, memories: MemoryPool, tables: TablePool, - empty_module: Arc, } impl InstancePool { @@ -332,14 +333,8 @@ impl InstancePool { free_list: Mutex::new((0..max_instances).collect()), memories: MemoryPool::new(module_limits, instance_limits, tunables)?, tables: TablePool::new(module_limits, instance_limits)?, - empty_module: Arc::new(Module::default()), }; - // Use a default module to initialize the instances to start - for i in 0..instance_limits.count as usize { - pool.initialize(module_limits, i); - } - Ok(pool) } @@ -348,41 +343,26 @@ impl InstancePool { &mut *(self.mapping.as_mut_ptr().add(index * self.instance_size) as *mut Instance) } - fn initialize(&self, limits: &ModuleLimits, index: usize) { - unsafe { - let instance = self.instance(index); - - // Write a default instance with preallocated memory/table map storage to the ptr - std::ptr::write( - instance as _, - Instance { - module: self.empty_module.clone(), - offsets: VMOffsets::new(HostPtr, &self.empty_module), - memories: PrimaryMap::with_capacity(limits.memories as usize), - tables: PrimaryMap::with_capacity(limits.tables as usize), - dropped_elements: EntitySet::new(), - dropped_data: EntitySet::new(), - host_state: Box::new(()), - wasm_data: &[], - vmctx: VMContext { - _marker: marker::PhantomPinned, - }, - }, - ); - } - } - unsafe fn setup_instance( &self, index: usize, mut req: InstanceAllocationRequest, ) -> Result { - let instance = self.instance(index); + let host_state = std::mem::replace(&mut req.host_state, Box::new(())); + let instance_data = Instance::create_raw( + &req.module, + &*req.wasm_data, + PrimaryMap::default(), + PrimaryMap::default(), + host_state, + ); - instance.module = req.module.clone(); - instance.offsets = VMOffsets::new(HostPtr, instance.module.as_ref()); - instance.host_state = std::mem::replace(&mut req.host_state, Box::new(())); - instance.wasm_data = &*req.wasm_data; + // Instances are uninitialized memory at first; we need to + // write an empty but initialized `Instance` struct into the + // chosen slot before we do anything else with it. (This is + // paired with a `drop_in_place` in deallocate below.) + let instance = self.instance(index); + std::ptr::write(instance as _, instance_data); // set_instance_memories and _tables will need the store before we can completely // initialize the vmcontext. 
@@ -391,8 +371,10 @@ impl InstancePool { } Self::set_instance_memories( + index, instance, - self.memories.get(index), + &self.memories, + &req.memfds, self.memories.max_wasm_pages, )?; @@ -448,20 +430,44 @@ impl InstancePool { let instance = unsafe { &mut *handle.instance }; // Decommit any linear memories that were used - for (memory, base) in instance.memories.values_mut().zip(self.memories.get(index)) { + for ((def_mem_idx, memory), base) in + instance.memories.iter_mut().zip(self.memories.get(index)) + { let mut memory = mem::take(memory); debug_assert!(memory.is_static()); - // Reset any faulted guard pages as the physical memory may be reused for another instance in the future - #[cfg(all(feature = "uffd", target_os = "linux"))] - memory - .reset_guard_pages() - .expect("failed to reset guard pages"); - drop(&mut memory); // require mutable on all platforms, not just uffd + match memory { + Memory::Static { + memfd_slot: Some(mut memfd_slot), + .. + } => { + let mem_idx = instance.module.memory_index(def_mem_idx); + // If there was any error clearing the memfd, just + // drop it here, and let the drop handler for the + // MemFdSlot unmap in a way that retains the + // address space reservation. + if memfd_slot.clear_and_remain_ready().is_ok() { + self.memories.return_memfd_slot(index, mem_idx, memfd_slot); + } + } - let size = memory.byte_size(); - drop(memory); - decommit_memory_pages(base, size).expect("failed to decommit linear memory pages"); + _ => { + // Reset any faulted guard pages as the physical + // memory may be reused for another instance in + // the future. + #[cfg(all(feature = "uffd", target_os = "linux"))] + memory + .reset_guard_pages() + .expect("failed to reset guard pages"); + // require mutable on all platforms, not just uffd + drop(&mut memory); + + let size = memory.byte_size(); + drop(memory); + decommit_memory_pages(base, size) + .expect("failed to decommit linear memory pages"); + } + } } instance.memories.clear(); @@ -481,50 +487,81 @@ impl InstancePool { decommit_table_pages(base, size).expect("failed to decommit table pages"); } - instance.tables.clear(); - instance.dropped_elements.clear(); - - // Drop all `global` values which need a destructor, such as externref - // values which now need their reference count dropped. - instance.drop_globals(); - - // Drop any host state - instance.host_state = Box::new(()); - - // And finally reset the module/offsets back to their original. This - // should put everything back in a relatively pristine state for each - // fresh allocation later on. - instance.module = self.empty_module.clone(); - instance.offsets = VMOffsets::new(HostPtr, &self.empty_module); - instance.wasm_data = &[]; + // We've now done all of the pooling-allocator-specific + // teardown, so we can drop the Instance and let destructors + // take care of any other fields (host state, globals, etc.). + unsafe { + std::ptr::drop_in_place(instance as *mut _); + } + // The instance is now uninitialized memory and cannot be + // touched again until we write a fresh Instance in-place with + // std::ptr::write in allocate() above. 
self.free_list.lock().unwrap().push(index); } fn set_instance_memories( + instance_idx: usize, instance: &mut Instance, - mut memories: impl Iterator, + memories: &MemoryPool, + maybe_memfds: &Option>, max_pages: u64, ) -> Result<(), InstantiationError> { let module = instance.module.as_ref(); debug_assert!(instance.memories.is_empty()); - for plan in - (&module.memory_plans.values().as_slice()[module.num_imported_memories..]).iter() + for (memory_index, plan) in module + .memory_plans + .iter() + .skip(module.num_imported_memories) { + let defined_index = module + .defined_memory_index(memory_index) + .expect("should be a defined memory since we skipped imported ones"); + let memory = unsafe { std::slice::from_raw_parts_mut( - memories.next().unwrap(), + memories.get_base(instance_idx, memory_index), (max_pages as usize) * (WASM_PAGE_SIZE as usize), ) }; - instance.memories.push( - Memory::new_static(plan, memory, commit_memory_pages, unsafe { - &mut *instance.store() - }) - .map_err(InstantiationError::Resource)?, - ); + + if let Some(memfds) = maybe_memfds { + let image = memfds.get_memory_image(defined_index); + let mut slot = memories.take_memfd_slot(instance_idx, memory_index)?; + let initial_size = plan.memory.minimum * WASM_PAGE_SIZE as u64; + + // If instantiation fails, we can propagate the error + // upward and drop the slot. This will cause the Drop + // handler to attempt to map the range with PROT_NONE + // memory, to reserve the space while releasing any + // stale mappings. The next use of this slot will then + // create a new MemFdSlot that will try to map over + // this, returning errors as well if the mapping + // errors persist. The unmap-on-drop is best effort; + // if it fails, then we can still soundly continue + // using the rest of the pool and allowing the rest of + // the process to continue, because we never perform a + // mmap that would leave an open space for someone + // else to come in and map something. + slot.instantiate(initial_size as usize, image) + .map_err(|e| InstantiationError::Resource(e.into()))?; + + instance.memories.push( + Memory::new_static(plan, memory, None, Some(slot), unsafe { + &mut *instance.store() + }) + .map_err(InstantiationError::Resource)?, + ); + } else { + instance.memories.push( + Memory::new_static(plan, memory, Some(commit_memory_pages), None, unsafe { + &mut *instance.store() + }) + .map_err(InstantiationError::Resource)?, + ); + } } debug_assert!(instance.dropped_data.is_empty()); @@ -566,17 +603,6 @@ impl InstancePool { } } -impl Drop for InstancePool { - fn drop(&mut self) { - unsafe { - for i in 0..self.max_instances { - let ptr = self.mapping.as_mut_ptr().add(i * self.instance_size) as *mut Instance; - std::ptr::drop_in_place(ptr); - } - } - } -} - /// Represents a pool of WebAssembly linear memories. /// /// A linear memory is divided into accessible pages and guard pages. @@ -589,6 +615,10 @@ impl Drop for InstancePool { #[derive(Debug)] struct MemoryPool { mapping: Mmap, + // If using the memfd allocation scheme, the MemFd slots. We + // dynamically transfer ownership of a slot to a Memory when in + // use. + memfd_slots: Vec>>, // The size, in bytes, of each linear memory's reservation plus the guard // region allocated for it. 
memory_size: usize, @@ -673,8 +703,18 @@ impl MemoryPool { let mapping = Mmap::accessible_reserved(0, allocation_size) .context("failed to create memory pool mapping")?; + let num_memfd_slots = if cfg!(feature = "memfd-allocator") { + max_instances * max_memories + } else { + 0 + }; + let memfd_slots: Vec<_> = std::iter::repeat_with(|| Mutex::new(None)) + .take(num_memfd_slots) + .collect(); + let pool = Self { mapping, + memfd_slots, memory_size, initial_memory_offset, max_memories, @@ -689,17 +729,43 @@ impl MemoryPool { Ok(pool) } - fn get(&self, instance_index: usize) -> impl Iterator { + fn get_base(&self, instance_index: usize, memory_index: MemoryIndex) -> *mut u8 { debug_assert!(instance_index < self.max_instances); + let memory_index = memory_index.as_u32() as usize; + debug_assert!(memory_index < self.max_memories); + let idx = instance_index * self.max_memories + memory_index; + let offset = self.initial_memory_offset + idx * self.memory_size; + unsafe { self.mapping.as_mut_ptr().offset(offset as isize) } + } - let base: *mut u8 = unsafe { - self.mapping.as_mut_ptr().add( - self.initial_memory_offset + instance_index * self.memory_size * self.max_memories, - ) as _ - }; + fn get<'a>(&'a self, instance_index: usize) -> impl Iterator + 'a { + (0..self.max_memories) + .map(move |i| self.get_base(instance_index, MemoryIndex::from_u32(i as u32))) + } - let size = self.memory_size; - (0..self.max_memories).map(move |i| unsafe { base.add(i * size) }) + /// Take ownership of the given memfd slot. Must be returned via + /// `return_memfd_slot` when the instance is done using it. + fn take_memfd_slot( + &self, + instance_index: usize, + memory_index: MemoryIndex, + ) -> Result { + let idx = instance_index * self.max_memories + (memory_index.as_u32() as usize); + let maybe_slot = self.memfd_slots[idx].lock().unwrap().take(); + + maybe_slot.map(|slot| Ok(slot)).unwrap_or_else(|| { + MemFdSlot::create( + self.get_base(instance_index, memory_index) as *mut c_void, + self.memory_size, + ) + }) + } + + /// Return ownership of the given memfd slot. 
+ fn return_memfd_slot(&self, instance_index: usize, memory_index: MemoryIndex, slot: MemFdSlot) { + assert!(!slot.is_dirty()); + let idx = instance_index * self.max_memories + (memory_index.as_u32() as usize); + *self.memfd_slots[idx].lock().unwrap() = Some(slot); } } @@ -1413,6 +1479,7 @@ mod test { host_state: Box::new(()), store: StorePtr::empty(), wasm_data: &[], + memfds: None, }, ) .expect("allocation should succeed"), @@ -1437,6 +1504,7 @@ mod test { host_state: Box::new(()), store: StorePtr::empty(), wasm_data: &[], + memfds: None, }, ) { Err(InstantiationError::Limit(3)) => {} diff --git a/crates/runtime/src/instance/allocator/pooling/uffd.rs b/crates/runtime/src/instance/allocator/pooling/uffd.rs index 55b4479fd1..87dd9a0c57 100644 --- a/crates/runtime/src/instance/allocator/pooling/uffd.rs +++ b/crates/runtime/src/instance/allocator/pooling/uffd.rs @@ -577,6 +577,7 @@ mod test { PoolingAllocationStrategy::Random, InstanceAllocationRequest { module: module.clone(), + memfds: None, image_base: 0, functions, imports: Imports { diff --git a/crates/runtime/src/lib.rs b/crates/runtime/src/lib.rs index f96e7d8dda..806c8c9c5c 100644 --- a/crates/runtime/src/lib.rs +++ b/crates/runtime/src/lib.rs @@ -19,6 +19,7 @@ clippy::use_self ) )] +#![cfg_attr(feature = "memfd-allocator", allow(dead_code))] use std::sync::atomic::AtomicU64; @@ -63,6 +64,49 @@ pub use crate::vmcontext::{ VMSharedSignatureIndex, VMTableDefinition, VMTableImport, VMTrampoline, ValRaw, }; +mod module_id; +pub use module_id::{CompiledModuleId, CompiledModuleIdAllocator}; + +#[cfg(feature = "memfd-allocator")] +mod memfd; + +/// When memfd support is not included, provide a shim type and +/// constructor instead so that higher-level code does not need +/// feature-conditional compilation. +#[cfg(not(feature = "memfd-allocator"))] +#[allow(dead_code)] +mod memfd { + use anyhow::Result; + use std::sync::Arc; + use wasmtime_environ::{DefinedMemoryIndex, Module}; + + /// A shim for the memfd image container when memfd support is not + /// included. + pub enum ModuleMemFds {} + + /// A shim for an individual memory image. + #[allow(dead_code)] + pub enum MemoryMemFd {} + + impl ModuleMemFds { + /// Construct a new set of memfd images. This variant is used + /// when memfd support is not included; it always returns no + /// images. + pub fn new(_: &Module, _: &[u8]) -> Result>> { + Ok(None) + } + + /// Get the memfd image for a particular memory. + pub(crate) fn get_memory_image(&self, _: DefinedMemoryIndex) -> Option<&Arc> { + // Should be unreachable because the `Self` type is + // uninhabitable. + match *self {} + } + } +} + +pub use crate::memfd::ModuleMemFds; + /// Version number of this crate. pub const VERSION: &str = env!("CARGO_PKG_VERSION"); diff --git a/crates/runtime/src/memfd.rs b/crates/runtime/src/memfd.rs new file mode 100644 index 0000000000..46ebc4e228 --- /dev/null +++ b/crates/runtime/src/memfd.rs @@ -0,0 +1,236 @@ +//! memfd support. + +use anyhow::Result; +use memfd::{Memfd, MemfdOptions}; +use rustix::fs::FileExt; +use std::convert::TryFrom; +use std::sync::Arc; +use wasmtime_environ::{ + DefinedMemoryIndex, MemoryInitialization, MemoryInitializer, MemoryPlan, Module, PrimaryMap, +}; + +/// MemFDs containing backing images for certain memories in a module. +/// +/// This is meant to be built once, when a module is first +/// loaded/constructed, and then used many times for instantiation. 
+pub struct ModuleMemFds {
+    memories: PrimaryMap<DefinedMemoryIndex, Option<Arc<MemoryMemFd>>>,
+}
+
+const MAX_MEMFD_IMAGE_SIZE: u64 = 1024 * 1024 * 1024; // limit to 1GiB.
+
+impl ModuleMemFds {
+    pub(crate) fn get_memory_image(
+        &self,
+        defined_index: DefinedMemoryIndex,
+    ) -> Option<&Arc<MemoryMemFd>> {
+        self.memories[defined_index].as_ref()
+    }
+}
+
+/// One backing image for one memory.
+#[derive(Debug)]
+pub(crate) struct MemoryMemFd {
+    pub(crate) fd: Memfd,
+    /// Length of image. Note that initial memory size may be larger;
+    /// leading and trailing zeroes are truncated (handled by
+    /// anonymous backing memfd).
+    pub(crate) len: usize,
+    /// Image starts this many bytes into heap space. Note that the
+    /// memfd's offsets are always equal to the heap offsets, so we
+    /// map at an offset into the fd as well. (This simplifies
+    /// construction.)
+    pub(crate) offset: usize,
+}
+
+fn unsupported_initializer(segment: &MemoryInitializer, plan: &MemoryPlan) -> bool {
+    // If the segment has a base that is dynamically determined
+    // (by a global value, which may be a function of an imported
+    // module, for example), then we cannot build a single static
+    // image that is used for every instantiation. So we skip this
+    // memory entirely.
+    let end = match segment.end() {
+        None => {
+            return true;
+        }
+        Some(end) => end,
+    };
+
+    // Cannot be out-of-bounds. If there is a *possibility* it may
+    // be, then we just fall back on ordinary initialization.
+    if plan.initializer_possibly_out_of_bounds(segment) {
+        return true;
+    }
+
+    // Must fit in our max size.
+    if end > MAX_MEMFD_IMAGE_SIZE {
+        return true;
+    }
+
+    false
+}
+
+impl ModuleMemFds {
+    /// Create a new `ModuleMemFds` for the given module. This can be
+    /// passed in as part of a `InstanceAllocationRequest` to speed up
+    /// instantiation and execution by using memfd-backed memories.
+    pub fn new(module: &Module, wasm_data: &[u8]) -> Result<Option<Arc<ModuleMemFds>>> {
+        let page_size = region::page::size() as u64;
+        let num_defined_memories = module.memory_plans.len() - module.num_imported_memories;
+
+        // Allocate a memfd file initially for every memory. We'll
+        // release those and set `excluded_memories` for those that we
+        // determine during initializer processing we cannot support a
+        // static image (e.g. due to dynamically-located segments).
+        let mut memfds: PrimaryMap<DefinedMemoryIndex, Option<Memfd>> = PrimaryMap::default();
+        let mut sizes: PrimaryMap<DefinedMemoryIndex, u64> = PrimaryMap::default();
+        let mut excluded_memories: PrimaryMap<DefinedMemoryIndex, bool> = PrimaryMap::new();
+
+        for _ in 0..num_defined_memories {
+            memfds.push(None);
+            sizes.push(0);
+            excluded_memories.push(false);
+        }
+
+        fn create_memfd() -> Result<Memfd> {
+            // Create the memfd. It needs a name, but the
+            // documentation for `memfd_create()` says that names can
+            // be duplicated with no issues.
+ MemfdOptions::new() + .allow_sealing(true) + .create("wasm-memory-image") + .map_err(|e| e.into()) + } + let round_up_page = |len: u64| (len + page_size - 1) & !(page_size - 1); + + match &module.memory_initialization { + &MemoryInitialization::Segmented(ref segments) => { + for (i, segment) in segments.iter().enumerate() { + let defined_memory = match module.defined_memory_index(segment.memory_index) { + Some(defined_memory) => defined_memory, + None => continue, + }; + if excluded_memories[defined_memory] { + continue; + } + + if unsupported_initializer(segment, &module.memory_plans[segment.memory_index]) + { + memfds[defined_memory] = None; + excluded_memories[defined_memory] = true; + continue; + } + + if memfds[defined_memory].is_none() { + memfds[defined_memory] = Some(create_memfd()?); + } + let memfd = memfds[defined_memory].as_mut().unwrap(); + + let end = round_up_page(segment.end().expect("must have statically-known end")); + if end > sizes[defined_memory] { + sizes[defined_memory] = end; + memfd.as_file().set_len(end)?; + } + + let base = segments[i].offset; + let data = &wasm_data[segment.data.start as usize..segment.data.end as usize]; + memfd.as_file().write_at(data, base)?; + } + } + &MemoryInitialization::Paged { ref map, .. } => { + for (defined_memory, pages) in map { + let top = pages + .iter() + .map(|(base, range)| *base + range.len() as u64) + .max() + .unwrap_or(0); + + let memfd = create_memfd()?; + memfd.as_file().set_len(top)?; + + for (base, range) in pages { + let data = &wasm_data[range.start as usize..range.end as usize]; + memfd.as_file().write_at(data, *base)?; + } + + memfds[defined_memory] = Some(memfd); + sizes[defined_memory] = top; + } + } + } + + // Now finalize each memory. + let mut memories: PrimaryMap>> = + PrimaryMap::default(); + for (defined_memory, maybe_memfd) in memfds { + let memfd = match maybe_memfd { + Some(memfd) => memfd, + None => { + memories.push(None); + continue; + } + }; + let size = sizes[defined_memory]; + + // Find leading and trailing zero data so that the mmap + // can precisely map only the nonzero data; anon-mmap zero + // memory is faster for anything that doesn't actually + // have content. + let mut page_data = vec![0; page_size as usize]; + let mut page_is_nonzero = |page| { + let offset = page_size * page; + memfd.as_file().read_at(&mut page_data[..], offset).unwrap(); + page_data.iter().any(|byte| *byte != 0) + }; + let n_pages = size / page_size; + + let mut offset = 0; + for page in 0..n_pages { + if page_is_nonzero(page) { + break; + } + offset += page_size; + } + let len = if offset == size { + 0 + } else { + let mut len = 0; + for page in (0..n_pages).rev() { + if page_is_nonzero(page) { + len = (page + 1) * page_size - offset; + break; + } + } + len + }; + + // Seal the memfd's data and length. + // + // This is a defense-in-depth security mitigation. The + // memfd will serve as the starting point for the heap of + // every instance of this module. If anything were to + // write to this, it could affect every execution. The + // memfd object itself is owned by the machinery here and + // not exposed elsewhere, but it is still an ambient open + // file descriptor at the syscall level, so some other + // vulnerability that allowed writes to arbitrary fds + // could modify it. Or we could have some issue with the + // way that we map it into each instance. 
To be
+            // extra-super-sure that it never changes, and because
+            // this costs very little, we use the kernel's "seal" API
+            // to make the memfd image permanently read-only.
+            memfd.add_seal(memfd::FileSeal::SealGrow)?;
+            memfd.add_seal(memfd::FileSeal::SealShrink)?;
+            memfd.add_seal(memfd::FileSeal::SealWrite)?;
+            memfd.add_seal(memfd::FileSeal::SealSeal)?;
+
+            memories.push(Some(Arc::new(MemoryMemFd {
+                fd: memfd,
+                offset: usize::try_from(offset).unwrap(),
+                len: usize::try_from(len).unwrap(),
+            })));
+        }
+
+        Ok(Some(Arc::new(ModuleMemFds { memories })))
+    }
+}
diff --git a/crates/runtime/src/memory.rs b/crates/runtime/src/memory.rs
index 07c0c619cc..894a8afd96 100644
--- a/crates/runtime/src/memory.rs
+++ b/crates/runtime/src/memory.rs
@@ -2,6 +2,7 @@
 //!
 //! `RuntimeLinearMemory` is to WebAssembly linear memories what `Table` is to WebAssembly tables.
 
+use crate::instance::MemFdSlot;
 use crate::mmap::Mmap;
 use crate::vmcontext::VMMemoryDefinition;
 use crate::Store;
@@ -208,7 +209,11 @@ pub enum Memory {
         /// A callback which makes portions of `base` accessible for when memory
         /// is grown. Otherwise it's expected that accesses to `base` will
         /// fault.
-        make_accessible: fn(*mut u8, usize) -> Result<()>,
+        make_accessible: Option<fn(*mut u8, usize) -> Result<()>>,
+
+        /// The MemFdSlot, if any, for this memory. Owned here and
+        /// returned to the pooling allocator when termination occurs.
+        memfd_slot: Option<MemFdSlot>,
 
         /// Stores the pages in the linear memory that have faulted as guard pages when using the `uffd` feature.
         /// These pages need their protection level reset before the memory can grow.
@@ -236,7 +241,8 @@ impl Memory {
     pub fn new_static(
         plan: &MemoryPlan,
         base: &'static mut [u8],
-        make_accessible: fn(*mut u8, usize) -> Result<()>,
+        make_accessible: Option<fn(*mut u8, usize) -> Result<()>>,
+        memfd_slot: Option<MemFdSlot>,
         store: &mut dyn Store,
     ) -> Result<Self> {
         let (minimum, maximum) = Self::limit_new(plan, store)?;
@@ -246,14 +252,17 @@ impl Memory {
             _ => base,
         };
 
-        if minimum > 0 {
-            make_accessible(base.as_mut_ptr(), minimum)?;
+        if let Some(make_accessible) = make_accessible {
+            if minimum > 0 {
+                make_accessible(base.as_mut_ptr(), minimum)?;
+            }
         }
 
         Ok(Memory::Static {
             base,
             size: minimum,
             make_accessible,
+            memfd_slot,
             #[cfg(all(feature = "uffd", target_os = "linux"))]
             guard_page_faults: Vec::new(),
         })
@@ -373,6 +382,22 @@ impl Memory {
         }
     }
 
+    /// Returns whether or not this memory is backed by a MemFD
+    /// image. Note that this is testing whether there is actually an
+    /// *image* mapped, not just whether the MemFD mechanism is being
+    /// used. The distinction is important because if we are not using
+    /// a prevalidated and prepared image, we need to fall back to
+    /// ordinary initialization code.
+    pub(crate) fn is_memfd_with_image(&self) -> bool {
+        match self {
+            Memory::Static {
+                memfd_slot: Some(ref slot),
+                ..
+            } => slot.has_image(),
+            _ => false,
+        }
+    }
+
     /// Grow memory by the specified amount of wasm pages.
     ///
     /// Returns `None` if memory can't be grown by the specified amount
@@ -443,12 +468,33 @@
         }
 
         match self {
+            Memory::Static {
+                base,
+                size,
+                memfd_slot: Some(ref mut memfd_slot),
+                ..
+            } => {
+                // Never exceed static memory size
+                if new_byte_size > base.len() {
+                    store.memory_grow_failed(&format_err!("static memory size exceeded"));
+                    return Ok(None);
+                }
+
+                if let Err(e) = memfd_slot.set_heap_limit(new_byte_size) {
+                    store.memory_grow_failed(&e);
+                    return Ok(None);
+                }
+                *size = new_byte_size;
+            }
             Memory::Static {
                 base,
                 size,
                 make_accessible,
                 ..
} => { + let make_accessible = make_accessible + .expect("make_accessible must be Some if this is not a MemFD memory"); + // Never exceed static memory size if new_byte_size > base.len() { store.memory_grow_failed(&format_err!("static memory size exceeded")); @@ -540,7 +586,8 @@ impl Default for Memory { Memory::Static { base: &mut [], size: 0, - make_accessible: |_, _| unreachable!(), + make_accessible: Some(|_, _| unreachable!()), + memfd_slot: None, #[cfg(all(feature = "uffd", target_os = "linux"))] guard_page_faults: Vec::new(), } diff --git a/crates/runtime/src/module_id.rs b/crates/runtime/src/module_id.rs new file mode 100644 index 0000000000..481a63e0bd --- /dev/null +++ b/crates/runtime/src/module_id.rs @@ -0,0 +1,28 @@ +//! Unique IDs for modules in the runtime. + +use std::sync::atomic::{AtomicU64, Ordering}; + +/// A unique identifier (within an engine or similar) for a compiled +/// module. +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct CompiledModuleId(u64); + +/// An allocator for compiled module IDs. +pub struct CompiledModuleIdAllocator { + next: AtomicU64, +} + +impl CompiledModuleIdAllocator { + /// Create a compiled-module ID allocator. + pub fn new() -> Self { + Self { + next: AtomicU64::new(1), + } + } + + /// Allocate a new ID. + pub fn alloc(&self) -> CompiledModuleId { + let id = self.next.fetch_add(1, Ordering::Relaxed); + CompiledModuleId(id) + } +} diff --git a/crates/runtime/src/traphandlers/unix.rs b/crates/runtime/src/traphandlers/unix.rs index cf41176cb7..fd16bfcdd1 100644 --- a/crates/runtime/src/traphandlers/unix.rs +++ b/crates/runtime/src/traphandlers/unix.rs @@ -51,9 +51,17 @@ pub unsafe fn platform_init() { register(&mut PREV_SIGFPE, libc::SIGFPE); } - // On ARM, handle Unaligned Accesses. - // On Darwin, guard page accesses are raised as SIGBUS. - if cfg!(target_arch = "arm") || cfg!(target_os = "macos") || cfg!(target_os = "freebsd") { + // Sometimes we need to handle SIGBUS too: + // - On ARM, handle Unaligned Accesses. + // - On Darwin, guard page accesses are raised as SIGBUS. + // - With the MemFD allocator, heap growth is controlled by + // ftruncate'ing an mmap'd file, and so out-of-bounds accesses + // are raised as SIGBUS. + if cfg!(target_arch = "arm") + || cfg!(target_os = "macos") + || cfg!(target_os = "freebsd") + || cfg!(feature = "memfd-allocator") + { register(&mut PREV_SIGBUS, libc::SIGBUS); } } diff --git a/crates/wasmtime/Cargo.toml b/crates/wasmtime/Cargo.toml index b5912ceb83..c7b0037d0e 100644 --- a/crates/wasmtime/Cargo.toml +++ b/crates/wasmtime/Cargo.toml @@ -89,3 +89,5 @@ all-arch = ["wasmtime-cranelift/all-arch"] # It is useful for applications that do not bind their own exception ports and # need portable signal handling. posix-signals-on-macos = ["wasmtime-runtime/posix-signals-on-macos"] + +memfd-allocator = ["wasmtime-runtime/memfd-allocator", "pooling-allocator"] \ No newline at end of file diff --git a/crates/wasmtime/src/engine.rs b/crates/wasmtime/src/engine.rs index 8a419c5170..48420ff492 100644 --- a/crates/wasmtime/src/engine.rs +++ b/crates/wasmtime/src/engine.rs @@ -7,7 +7,7 @@ use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::Arc; #[cfg(feature = "cache")] use wasmtime_cache::CacheConfig; -use wasmtime_runtime::{debug_builtins, InstanceAllocator}; +use wasmtime_runtime::{debug_builtins, CompiledModuleIdAllocator, InstanceAllocator}; /// An `Engine` which is a global context for compilation and management of wasm /// modules. 
@@ -43,6 +43,7 @@ struct EngineInner { allocator: Box, signatures: SignatureRegistry, epoch: AtomicU64, + unique_id_allocator: CompiledModuleIdAllocator, } impl Engine { @@ -68,6 +69,7 @@ impl Engine { allocator, signatures: registry, epoch: AtomicU64::new(0), + unique_id_allocator: CompiledModuleIdAllocator::new(), }), }) } @@ -153,6 +155,10 @@ impl Engine { self.inner.epoch.fetch_add(1, Ordering::Relaxed); } + pub(crate) fn unique_id_allocator(&self) -> &CompiledModuleIdAllocator { + &self.inner.unique_id_allocator + } + /// Ahead-of-time (AOT) compiles a WebAssembly module. /// /// The `bytes` provided must be in one of two formats: diff --git a/crates/wasmtime/src/instance.rs b/crates/wasmtime/src/instance.rs index aec6c1ba06..7f5b5e823d 100644 --- a/crates/wasmtime/src/instance.rs +++ b/crates/wasmtime/src/instance.rs @@ -651,7 +651,7 @@ impl<'a> Instantiator<'a> { artifacts, modules, &self.cur.modules, - ); + )?; self.cur.modules.push(submodule); } @@ -707,6 +707,7 @@ impl<'a> Instantiator<'a> { .allocator() .allocate(InstanceAllocationRequest { module: compiled_module.module().clone(), + memfds: self.cur.module.memfds().clone(), image_base: compiled_module.code().as_ptr() as usize, functions: compiled_module.functions(), imports: self.cur.build(), diff --git a/crates/wasmtime/src/module.rs b/crates/wasmtime/src/module.rs index 04c695f214..09c2d3f485 100644 --- a/crates/wasmtime/src/module.rs +++ b/crates/wasmtime/src/module.rs @@ -11,6 +11,7 @@ use std::sync::Arc; use wasmparser::{Parser, ValidPayload, Validator}; use wasmtime_environ::{ModuleEnvironment, ModuleIndex, PrimaryMap}; use wasmtime_jit::{CompiledModule, CompiledModuleInfo, MmapVec, TypeTables}; +use wasmtime_runtime::ModuleMemFds; mod registry; mod serialization; @@ -107,6 +108,8 @@ struct ModuleInner { types: Arc, /// Registered shared signature for the module. signatures: Arc, + /// a set of memfd images for memories, if any. 
+ memfds: Option>, } impl Module { @@ -336,7 +339,12 @@ impl Module { }; let modules = engine.run_maybe_parallel(artifacts, |(a, b)| { - CompiledModule::from_artifacts(a, b, &*engine.config().profiler) + CompiledModule::from_artifacts( + a, + b, + &*engine.config().profiler, + engine.unique_id_allocator(), + ) })?; Self::from_parts(engine, modules, main_module, Arc::new(types), &[]) @@ -523,6 +531,8 @@ impl Module { }) .collect::>>()?; + let memfds = ModuleMemFds::new(module.module(), module.wasm_data())?; + return Ok(Self { inner: Arc::new(ModuleInner { engine: engine.clone(), @@ -531,6 +541,7 @@ impl Module { artifact_upvars: modules, module_upvars, signatures, + memfds, }), }); @@ -543,11 +554,14 @@ impl Module { module_upvars: &[serialization::SerializedModuleUpvar], signatures: &Arc, ) -> Result { + let module = artifacts[module_index].clone(); + let memfds = ModuleMemFds::new(module.module(), module.wasm_data())?; Ok(Module { inner: Arc::new(ModuleInner { engine: engine.clone(), types: types.clone(), - module: artifacts[module_index].clone(), + module, + memfds, artifact_upvars: artifact_upvars .iter() .map(|i| artifacts[*i].clone()) @@ -666,12 +680,15 @@ impl Module { artifact_upvars: &[usize], module_upvars: &[wasmtime_environ::ModuleUpvar], modules: &PrimaryMap, - ) -> Module { - Module { + ) -> Result { + let module = self.inner.artifact_upvars[artifact_index].clone(); + let memfds = ModuleMemFds::new(module.module(), module.wasm_data())?; + Ok(Module { inner: Arc::new(ModuleInner { types: self.inner.types.clone(), engine: self.inner.engine.clone(), - module: self.inner.artifact_upvars[artifact_index].clone(), + module, + memfds, artifact_upvars: artifact_upvars .iter() .map(|i| self.inner.artifact_upvars[*i].clone()) @@ -687,7 +704,7 @@ impl Module { .collect(), signatures: self.inner.signatures.clone(), }), - } + }) } pub(crate) fn compiled_module(&self) -> &Arc { @@ -706,6 +723,10 @@ impl Module { &self.inner.signatures } + pub(crate) fn memfds(&self) -> &Option> { + &self.inner.memfds + } + /// Looks up the module upvar value at the `index` specified. 
/// /// Note that this panics if `index` is out of bounds since this should diff --git a/crates/wasmtime/src/module/serialization.rs b/crates/wasmtime/src/module/serialization.rs index 740d1eab92..cb643d795d 100644 --- a/crates/wasmtime/src/module/serialization.rs +++ b/crates/wasmtime/src/module/serialization.rs @@ -274,7 +274,12 @@ impl<'a> SerializedModule<'a> { pub fn into_module(self, engine: &Engine) -> Result { let (main_module, modules, types, upvars) = self.into_parts(engine)?; let modules = engine.run_maybe_parallel(modules, |(i, m)| { - CompiledModule::from_artifacts(i, m, &*engine.config().profiler) + CompiledModule::from_artifacts( + i, + m, + &*engine.config().profiler, + engine.unique_id_allocator(), + ) })?; Module::from_parts(engine, modules, main_module, Arc::new(types), &upvars) diff --git a/crates/wasmtime/src/store.rs b/crates/wasmtime/src/store.rs index c6d7914e47..362fb59848 100644 --- a/crates/wasmtime/src/store.rs +++ b/crates/wasmtime/src/store.rs @@ -421,11 +421,13 @@ impl Store { shared_signatures: None.into(), imports: Default::default(), module: Arc::new(wasmtime_environ::Module::default()), + memfds: None, store: StorePtr::empty(), wasm_data: &[], }) .expect("failed to allocate default callee") }; + let mut inner = Box::new(StoreInner { inner: StoreOpaque { _marker: marker::PhantomPinned, diff --git a/crates/wasmtime/src/trampoline.rs b/crates/wasmtime/src/trampoline.rs index c1f8038a5a..790cbf9ef9 100644 --- a/crates/wasmtime/src/trampoline.rs +++ b/crates/wasmtime/src/trampoline.rs @@ -41,6 +41,7 @@ fn create_handle( let handle = OnDemandInstanceAllocator::new(config.mem_creator.clone(), 0).allocate( InstanceAllocationRequest { module: Arc::new(module), + memfds: None, functions, image_base: 0, imports, diff --git a/crates/wasmtime/src/trampoline/func.rs b/crates/wasmtime/src/trampoline/func.rs index 67d57fc334..47513f83cf 100644 --- a/crates/wasmtime/src/trampoline/func.rs +++ b/crates/wasmtime/src/trampoline/func.rs @@ -161,6 +161,7 @@ pub unsafe fn create_raw_function( Ok( OnDemandInstanceAllocator::default().allocate(InstanceAllocationRequest { module: Arc::new(module), + memfds: None, functions: &functions, image_base: (*func).as_ptr() as usize, imports: Imports::default(), diff --git a/src/lib.rs b/src/lib.rs index fb43affad1..b3cb8961f7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -100,6 +100,8 @@ use std::collections::HashMap; use std::path::PathBuf; use structopt::StructOpt; use wasmtime::{Config, ProfilingStrategy}; +#[cfg(feature = "pooling-allocator")] +use wasmtime::{InstanceLimits, ModuleLimits, PoolingAllocationStrategy}; fn pick_profiling_strategy(jitdump: bool, vtune: bool) -> Result { Ok(match (jitdump, vtune) { @@ -250,6 +252,12 @@ struct CommonOptions { /// the data segments specified in the original wasm module. #[structopt(long)] paged_memory_initialization: bool, + + /// Enables the pooling allocator, in place of the on-demand + /// allocator. 
+ #[cfg(feature = "pooling-allocator")] + #[structopt(long)] + pooling_allocator: bool, } impl CommonOptions { @@ -325,6 +333,23 @@ impl CommonOptions { config.generate_address_map(!self.disable_address_map); config.paged_memory_initialization(self.paged_memory_initialization); + #[cfg(feature = "pooling-allocator")] + { + if self.pooling_allocator { + let mut module_limits = ModuleLimits::default(); + module_limits.functions = 50000; + module_limits.types = 10000; + module_limits.globals = 1000; + module_limits.memory_pages = 2048; + let instance_limits = InstanceLimits::default(); + config.allocation_strategy(wasmtime::InstanceAllocationStrategy::Pooling { + strategy: PoolingAllocationStrategy::NextAvailable, + module_limits, + instance_limits, + }); + } + } + Ok(config) } From 3702e81d30888875d468636daca9035af661114a Mon Sep 17 00:00:00 2001 From: Chris Fallin Date: Mon, 31 Jan 2022 11:13:43 -0800 Subject: [PATCH 02/12] Remove ftruncate-trick for heap growth with memfd backend. Testing so far with recent Wasmtime has not been able to show the need for avoiding the process-wide mmap lock in real-world use-cases. As such, the technique of using an anonymous file and ftruncate() to extend it seems unnecessary; instead, memfd can always use anonymous zeroed memory for heap backing where the CoW image is not present, and mprotect() to extend the heap limit by changing page protections. --- .../runtime/src/instance/allocator/memfd.rs | 194 ++++++++---------- .../src/instance/allocator/memfd_disabled.rs | 2 +- .../runtime/src/instance/allocator/pooling.rs | 10 +- 3 files changed, 85 insertions(+), 121 deletions(-) diff --git a/crates/runtime/src/instance/allocator/memfd.rs b/crates/runtime/src/instance/allocator/memfd.rs index 8713794824..67741f8bbd 100644 --- a/crates/runtime/src/instance/allocator/memfd.rs +++ b/crates/runtime/src/instance/allocator/memfd.rs @@ -5,8 +5,6 @@ use crate::InstantiationError; use anyhow::Result; use libc::c_void; use rustix::fd::AsRawFd; -use std::convert::TryFrom; -use std::fs::File; use std::sync::Arc; /// A single slot handled by the memfd instance-heap mechanism. @@ -16,8 +14,7 @@ use std::sync::Arc; /// base ==> (points here) /// - (image.offset bytes) anonymous zero memory, pre-image /// - (image.len bytes) CoW mapping of memfd heap image -/// - (up to extension_offset) anonymous zero memory, post-image -/// - (up to static_size) heap expansion region; CoW mapping of per-slot memfd +/// - (up to static_size) anonymous zero memory, post-image /// /// The ordering of mmaps to set this up is: /// @@ -25,15 +22,15 @@ use std::sync::Arc; /// - one large mmap to create 8GiB * instances * memories slots /// /// - per instantiation of new image in a slot: -/// - mmap of anonymous zero memory, from 0 to initial heap size +/// - mmap of anonymous zero memory, from 0 to max heap size +/// (static_size) /// - mmap of CoW'd memfd image, from `image.offset` to /// `image.offset + image.len`. This overwrites part of the /// anonymous zero memory, potentially splitting it into a pre- /// and post-region. -/// - mmap of CoW'd extension file, past the initial heap size up to -/// the end of the max memory size (just before the -/// post-guard). This is always adjacent to the above mmaps, but -/// does not overlap/overwrite them. +/// - mprotect(PROT_NONE) on the part of the heap beyond the initial +/// heap size; we re-mprotect it with R+W bits when the heap is +/// grown. #[derive(Debug)] pub struct MemFdSlot { /// The base of the actual heap memory. 
Bytes at this address are @@ -44,21 +41,11 @@ pub struct MemFdSlot { /// The memfd image that backs this memory. May be `None`, in /// which case the memory is all zeroes. pub(crate) image: Option>, - /// The offset at which the "extension file", which is used to - /// allow for efficient heap growth, is mapped. This is always - /// immediately after the end of the initial memory size. - extension_offset: usize, - /// The anonymous memfd, owned by this slot, which we mmap in the - /// area where the heap may grow during runtime. We use the - /// ftruncate() syscall (invoked via `File::set_len()`) to set its - /// size. We never write any data to it -- we CoW-map it so we can - /// throw away dirty data on termination. Instead, we just use its - /// size as a "watermark" that delineates the boundary between - /// safe-to-access memory and SIGBUS-causing memory. (This works - /// because one can mmap a file beyond its end, and is good - /// because ftruncate does not take the process-wide lock that - /// mmap and mprotect do.) - extension_file: File, + /// The initial heap size. + initial_size: usize, + /// The current heap size. All memory above `base + cur_size` + /// should be PROT_NONE (mapped inaccessible). + cur_size: usize, /// Whether this slot may have "dirty" pages (pages written by an /// instantiation). Set by `instantiate()` and cleared by /// `clear_and_remain_ready()`, and used in assertions to ensure @@ -67,53 +54,31 @@ pub struct MemFdSlot { } impl MemFdSlot { - pub(crate) fn create( - base_addr: *mut c_void, - static_size: usize, - ) -> Result { + pub(crate) fn create(base_addr: *mut c_void, static_size: usize) -> Self { let base = base_addr as usize; - - // Create a MemFD for the memory growth first -- this covers - // extended heap beyond the initial image. - let extension_memfd = memfd::MemfdOptions::new() - .allow_sealing(true) - .create("wasm-anonymous-heap") - .map_err(|e| InstantiationError::Resource(e.into()))?; - // Seal the ability to write the extension file (make it - // permanently read-only). This is a defense-in-depth - // mitigation to make extra-sure that we don't leak - // information between instantiations. See note in `memfd.rs` - // for more about why we use seals. - extension_memfd - .add_seal(memfd::FileSeal::SealWrite) - .map_err(|e| InstantiationError::Resource(e.into()))?; - extension_memfd - .add_seal(memfd::FileSeal::SealSeal) - .map_err(|e| InstantiationError::Resource(e.into()))?; - let extension_file = extension_memfd.into_file(); - extension_file - .set_len(0) - .map_err(|e| InstantiationError::Resource(e.into()))?; - - Ok(MemFdSlot { + MemFdSlot { base, static_size, + initial_size: 0, + cur_size: 0, image: None, - extension_file, - extension_offset: 0, dirty: false, - }) + } } pub(crate) fn set_heap_limit(&mut self, size_bytes: usize) -> Result<()> { - assert!(size_bytes >= self.extension_offset); - // This is all that is needed to make the new memory - // accessible; we don't need to mprotect anything. (The - // mapping itself is always R+W for the max possible heap - // size, and only the anonymous-backing file length catches - // out-of-bounds accesses.) - self.extension_file - .set_len(u64::try_from(size_bytes - self.extension_offset).unwrap())?; + assert!(size_bytes > self.cur_size); + // mprotect the relevant region. 
+ let start = self.base + self.cur_size; + let len = size_bytes - self.cur_size; + unsafe { + rustix::io::mprotect( + start as *mut _, + len, + rustix::io::MprotectFlags::READ | rustix::io::MprotectFlags::WRITE, + )?; + } + Ok(()) } @@ -124,31 +89,36 @@ impl MemFdSlot { ) -> Result<(), InstantiationError> { assert!(!self.dirty); - if let Some(existing_image) = &self.image { - // Fast-path: previously instantiated with the same image, - // so the mappings are already correct; there is no need - // to mmap anything. Given that we asserted not-dirty - // above, any dirty pages will have already been thrown - // away by madvise() during the previous termination. - if let Some(image) = maybe_image { - if existing_image.fd.as_file().as_raw_fd() == image.fd.as_file().as_raw_fd() { - self.dirty = true; - return Ok(()); - } - } + // Fast-path: previously instantiated with the same image, or + // no image but the same initial size, so the mappings are + // already correct; there is no need to mmap anything. Given + // that we asserted not-dirty above, any dirty pages will have + // already been thrown away by madvise() during the previous + // termination. The `clear_and_remain_ready()` path also + // mprotects memory above the initial heap size back to + // PROT_NONE, so we don't need to do that here. + if (self.image.is_none() + && maybe_image.is_none() + && self.initial_size == initial_size_bytes) + || (self.image.is_some() + && maybe_image.is_some() + && self.image.as_ref().unwrap().fd.as_file().as_raw_fd() + == maybe_image.as_ref().unwrap().fd.as_file().as_raw_fd()) + { + self.dirty = true; + return Ok(()); } // Otherwise, we need to redo (i) the anonymous-mmap backing - // for the initial heap size, (ii) the extension-file backing, - // and (iii) the initial-heap-image mapping if present. + // for the whole slot, (ii) the initial-heap-image mapping if + // present, and (iii) the mprotect(PROT_NONE) above the + // initial heap size. // Security/audit note: we map all of these MAP_PRIVATE, so // all instance data is local to the mapping, not propagated // to the backing fd. We throw away this CoW overlay with - // madvise() below, from base up to extension_offset (which is - // at least initial_size_bytes, and extended when the - // extension file is, so it covers all three mappings) when - // terminating the instance. + // madvise() below, from base up to static_size (which is the + // whole slot) when terminating the instance. // Anonymous mapping behind the initial heap size: this gives // zeroes for any "holes" in the initial heap image. Anonymous @@ -162,7 +132,7 @@ impl MemFdSlot { unsafe { let ptr = rustix::io::mmap_anonymous( self.base as *mut c_void, - initial_size_bytes, + self.static_size, rustix::io::ProtFlags::READ | rustix::io::ProtFlags::WRITE, rustix::io::MapFlags::PRIVATE | rustix::io::MapFlags::FIXED, ) @@ -171,29 +141,8 @@ impl MemFdSlot { } } - // An "extension file": this allows us to grow the heap by - // doing just an ftruncate(), without changing any - // mappings. This is important to avoid the process-wide mmap - // lock on Linux. 
- self.extension_offset = initial_size_bytes; - let extension_map_len = self.static_size - initial_size_bytes; - if extension_map_len > 0 { - unsafe { - let fd = rustix::fd::BorrowedFd::borrow_raw_fd(self.extension_file.as_raw_fd()); - let ptr = rustix::io::mmap( - (self.base + initial_size_bytes) as *mut c_void, - extension_map_len, - rustix::io::ProtFlags::READ | rustix::io::ProtFlags::WRITE, - rustix::io::MapFlags::PRIVATE | rustix::io::MapFlags::FIXED, - &fd, - 0, - ) - .map_err(|e| InstantiationError::Resource(e.into()))?; - assert_eq!(ptr as usize, self.base + initial_size_bytes); - } - } - - // Finally, the initial memory image. + // The initial memory image, if given. If not, we just get a + // memory filled with zeroes. if let Some(image) = maybe_image { if image.len > 0 { let image = image.clone(); @@ -216,31 +165,50 @@ impl MemFdSlot { } } + // mprotect above `initial_size_bytes`. + self.initial_size = initial_size_bytes; + self.protect_past_initial_size() + .map_err(|e| InstantiationError::Resource(e.into()))?; + self.dirty = true; Ok(()) } pub(crate) fn clear_and_remain_ready(&mut self) -> Result<()> { assert!(self.dirty); - // madvise the image range; that's it! This will throw away - // dirty pages, which are CoW-private pages on top of the - // initial heap image memfd. + // madvise the image range. This will throw away dirty pages, + // which are CoW-private pages on top of the initial heap + // image memfd. unsafe { rustix::io::madvise( self.base as *mut c_void, - self.extension_offset, + self.static_size, rustix::io::Advice::LinuxDontNeed, )?; } - // truncate the extension file down to zero bytes to reset heap length. - self.extension_file - .set_len(0) - .map_err(|e| InstantiationError::Resource(e.into()))?; + // mprotect the region beyond the initial heap size back to PROT_NONE. 
+ self.protect_past_initial_size()?; self.dirty = false; Ok(()) } + fn protect_past_initial_size(&self) -> Result<()> { + let mprotect_start = self.base + self.initial_size; + let mprotect_len = self.static_size - self.initial_size; + if mprotect_len > 0 { + unsafe { + rustix::io::mprotect( + mprotect_start as *mut _, + mprotect_len, + rustix::io::MprotectFlags::empty(), + )?; + } + } + + Ok(()) + } + pub(crate) fn has_image(&self) -> bool { self.image.is_some() } diff --git a/crates/runtime/src/instance/allocator/memfd_disabled.rs b/crates/runtime/src/instance/allocator/memfd_disabled.rs index 9c87591bd5..304dd3eebb 100644 --- a/crates/runtime/src/instance/allocator/memfd_disabled.rs +++ b/crates/runtime/src/instance/allocator/memfd_disabled.rs @@ -19,7 +19,7 @@ pub struct MemFdSlot; #[cfg(not(feature = "memfd-allocator"))] #[allow(dead_code)] impl MemFdSlot { - pub(crate) fn create(_: *mut libc::c_void, _: usize) -> Result { + pub(crate) fn create(_: *mut libc::c_void, _: usize) -> Self { panic!("create() on invalid MemFdSlot"); } diff --git a/crates/runtime/src/instance/allocator/pooling.rs b/crates/runtime/src/instance/allocator/pooling.rs index 6aa291d7a9..fb60ffc4b1 100644 --- a/crates/runtime/src/instance/allocator/pooling.rs +++ b/crates/runtime/src/instance/allocator/pooling.rs @@ -529,7 +529,7 @@ impl InstancePool { if let Some(memfds) = maybe_memfds { let image = memfds.get_memory_image(defined_index); - let mut slot = memories.take_memfd_slot(instance_idx, memory_index)?; + let mut slot = memories.take_memfd_slot(instance_idx, memory_index); let initial_size = plan.memory.minimum * WASM_PAGE_SIZE as u64; // If instantiation fails, we can propagate the error @@ -745,15 +745,11 @@ impl MemoryPool { /// Take ownership of the given memfd slot. Must be returned via /// `return_memfd_slot` when the instance is done using it. - fn take_memfd_slot( - &self, - instance_index: usize, - memory_index: MemoryIndex, - ) -> Result { + fn take_memfd_slot(&self, instance_index: usize, memory_index: MemoryIndex) -> MemFdSlot { let idx = instance_index * self.max_memories + (memory_index.as_u32() as usize); let maybe_slot = self.memfd_slots[idx].lock().unwrap().take(); - maybe_slot.map(|slot| Ok(slot)).unwrap_or_else(|| { + maybe_slot.unwrap_or_else(|| { MemFdSlot::create( self.get_base(instance_index, memory_index) as *mut c_void, self.memory_size, From 570dee63f321975a55b4c3a00347ed5738622a2d Mon Sep 17 00:00:00 2001 From: Chris Fallin Date: Mon, 31 Jan 2022 13:59:51 -0800 Subject: [PATCH 03/12] Use MemFdSlot in the on-demand allocator as well. --- crates/runtime/src/instance/allocator.rs | 43 ++++++++---- crates/runtime/src/lib.rs | 2 + crates/runtime/src/memfd.rs | 11 +-- crates/runtime/src/memory.rs | 87 ++++++++++++++++++++---- crates/wasmtime/src/trampoline/memory.rs | 9 ++- 5 files changed, 120 insertions(+), 32 deletions(-) diff --git a/crates/runtime/src/instance/allocator.rs b/crates/runtime/src/instance/allocator.rs index 12fa88ddc8..739313da72 100644 --- a/crates/runtime/src/instance/allocator.rs +++ b/crates/runtime/src/instance/allocator.rs @@ -392,15 +392,15 @@ fn initialize_memories( initializers: &[MemoryInitializer], ) -> Result<(), InstantiationError> { for init in initializers { - // Check whether this is a MemFD memory; if so, we can skip - // all initializers. + // Check whether we can skip all initializers (due to, e.g., + // memfd). 
let memory = init.memory_index; if let Some(defined_index) = module.defined_memory_index(memory) { // We can only skip if there is actually a MemFD image. In // some situations the MemFD image creation code will bail // (e.g. due to an out of bounds data segment) and so we // need to fall back on the usual initialization below. - if instance.memories[defined_index].is_memfd_with_image() { + if !instance.memories[defined_index].needs_init() { continue; } } @@ -458,11 +458,10 @@ fn initialize_instance( match &module.memory_initialization { MemoryInitialization::Paged { map, out_of_bounds } => { for (index, pages) in map { - // We can only skip if there is actually a MemFD image. In - // some situations the MemFD image creation code will bail - // (e.g. due to an out of bounds data segment) and so we - // need to fall back on the usual initialization below. - if instance.memories[index].is_memfd_with_image() { + // Check whether the memory actually needs + // initialization. It may not if we're using a CoW + // mechanism like memfd. + if !instance.memories[index].needs_init() { continue; } @@ -682,6 +681,7 @@ impl OnDemandInstanceAllocator { &self, module: &Module, store: &mut StorePtr, + memfds: &Option>, ) -> Result, InstantiationError> { let creator = self .mem_creator @@ -690,13 +690,26 @@ impl OnDemandInstanceAllocator { let num_imports = module.num_imported_memories; let mut memories: PrimaryMap = PrimaryMap::with_capacity(module.memory_plans.len() - num_imports); - for plan in &module.memory_plans.values().as_slice()[num_imports..] { + for (memory_idx, plan) in module.memory_plans.iter().skip(num_imports) { + // Create a MemFdSlot if there is an image for this memory. + let defined_memory_idx = module + .defined_memory_index(memory_idx) + .expect("Skipped imports, should never be None"); + let memfd_image = memfds + .as_ref() + .and_then(|memfds| memfds.get_memory_image(defined_memory_idx)); + memories.push( - Memory::new_dynamic(plan, creator, unsafe { - store - .get() - .expect("if module has memory plans, store is not empty") - }) + Memory::new_dynamic( + plan, + creator, + unsafe { + store + .get() + .expect("if module has memory plans, store is not empty") + }, + memfd_image, + ) .map_err(InstantiationError::Resource)?, ); } @@ -719,7 +732,7 @@ unsafe impl InstanceAllocator for OnDemandInstanceAllocator { &self, mut req: InstanceAllocationRequest, ) -> Result { - let memories = self.create_memories(&req.module, &mut req.store)?; + let memories = self.create_memories(&req.module, &mut req.store, &req.memfds)?; let tables = Self::create_tables(&req.module, &mut req.store)?; let host_state = std::mem::replace(&mut req.host_state, Box::new(())); diff --git a/crates/runtime/src/lib.rs b/crates/runtime/src/lib.rs index 806c8c9c5c..550480b3b4 100644 --- a/crates/runtime/src/lib.rs +++ b/crates/runtime/src/lib.rs @@ -70,6 +70,8 @@ pub use module_id::{CompiledModuleId, CompiledModuleIdAllocator}; #[cfg(feature = "memfd-allocator")] mod memfd; +pub use crate::memfd::MemoryMemFd; + /// When memfd support is not included, provide a shim type and /// constructor instead so that higher-level code does not need /// feature-conditional compilation. diff --git a/crates/runtime/src/memfd.rs b/crates/runtime/src/memfd.rs index 46ebc4e228..dc6e2ef815 100644 --- a/crates/runtime/src/memfd.rs +++ b/crates/runtime/src/memfd.rs @@ -30,17 +30,20 @@ impl ModuleMemFds { /// One backing image for one memory. 
#[derive(Debug)] -pub(crate) struct MemoryMemFd { - pub(crate) fd: Memfd, +pub struct MemoryMemFd { + /// The actual memfd image: an anonymous file in memory which we + /// use as the backing content for a copy-on-write (CoW) mapping + /// in the memory region. + pub fd: Memfd, /// Length of image. Note that initial memory size may be larger; /// leading and trailing zeroes are truncated (handled by /// anonymous backing memfd). - pub(crate) len: usize, + pub len: usize, /// Image starts this many bytes into heap space. Note that the /// memfd's offsets are always equal to the heap offsets, so we /// map at an offset into the fd as well. (This simplifies /// construction.) - pub(crate) offset: usize, + pub offset: usize, } fn unsupported_initializer(segment: &MemoryInitializer, plan: &MemoryPlan) -> bool { diff --git a/crates/runtime/src/memory.rs b/crates/runtime/src/memory.rs index 894a8afd96..71c77b43ca 100644 --- a/crates/runtime/src/memory.rs +++ b/crates/runtime/src/memory.rs @@ -3,6 +3,7 @@ //! `RuntimeLinearMemory` is to WebAssembly linear memories what `Table` is to WebAssembly tables. use crate::instance::MemFdSlot; +use crate::memfd::MemoryMemFd; use crate::mmap::Mmap; use crate::vmcontext::VMMemoryDefinition; use crate::Store; @@ -10,6 +11,7 @@ use anyhow::Error; use anyhow::{bail, format_err, Result}; use more_asserts::{assert_ge, assert_le}; use std::convert::TryFrom; +use std::sync::Arc; use wasmtime_environ::{MemoryPlan, MemoryStyle, WASM32_MAX_PAGES, WASM64_MAX_PAGES}; const WASM_PAGE_SIZE: usize = wasmtime_environ::WASM_PAGE_SIZE as usize; @@ -23,6 +25,8 @@ pub trait RuntimeMemoryCreator: Send + Sync { plan: &MemoryPlan, minimum: usize, maximum: Option, + // Optionally, a memfd image for CoW backing. + memfd_image: Option<&Arc>, ) -> Result>; } @@ -36,8 +40,14 @@ impl RuntimeMemoryCreator for DefaultMemoryCreator { plan: &MemoryPlan, minimum: usize, maximum: Option, + memfd_image: Option<&Arc>, ) -> Result> { - Ok(Box::new(MmapMemory::new(plan, minimum, maximum)?)) + Ok(Box::new(MmapMemory::new( + plan, + minimum, + maximum, + memfd_image, + )?)) } } @@ -59,6 +69,11 @@ pub trait RuntimeLinearMemory: Send + Sync { /// Return a `VMMemoryDefinition` for exposing the memory to compiled wasm /// code. fn vmmemory(&self) -> VMMemoryDefinition; + + /// Does this memory need initialization? It may not if it already + /// has initial contents courtesy of the `MemoryMemFd` passed to + /// `RuntimeMemoryCreator::new_memory()`. + fn needs_init(&self) -> bool; } /// A linear memory instance. @@ -87,11 +102,24 @@ pub struct MmapMemory { // optimize loads and stores with constant offsets. pre_guard_size: usize, offset_guard_size: usize, + + // A MemFd CoW mapping that provides the initial content of this + // MmapMemory, if mapped. + // + // N.B.: this comes after the `mmap` field above because it must + // be destructed first. It puts a placeholder mapping in place on + // drop, then the `mmap` above completely unmaps the region. + memfd: Option, } impl MmapMemory { /// Create a new linear memory instance with specified minimum and maximum number of wasm pages. - pub fn new(plan: &MemoryPlan, minimum: usize, mut maximum: Option) -> Result { + pub fn new( + plan: &MemoryPlan, + minimum: usize, + mut maximum: Option, + memfd_image: Option<&Arc>, + ) -> Result { // It's a programmer error for these two configuration values to exceed // the host available address space, so panic if such a configuration is // found (mostly an issue for hypothetical 32-bit hosts). 
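(To make the `offset` and `len` fields of `MemoryMemFd` above concrete — an illustrative example whose numbers are invented for this note, not taken from the patch: a module whose data segments only initialize bytes 0x1000..0x3000 of a memory with a 1 MiB minimum would get an image with `offset = 0x1000` and `len = 0x2000`. Because the memfd's file offsets mirror the heap offsets, the slot is backed by anonymous zero pages everywhere except that 8 KiB window, which is a private copy-on-write mapping of the memfd taken at file offset 0x1000.)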
@@ -127,6 +155,18 @@ impl MmapMemory { mmap.make_accessible(pre_guard_bytes, minimum)?; } + // If a memfd image was specified, try to create the MemFdSlot on top of our mmap. + let memfd = match memfd_image { + Some(image) => { + let base = unsafe { mmap.as_mut_ptr().offset(pre_guard_bytes as isize) }; + let len = request_bytes - pre_guard_bytes; + let mut memfd_slot = MemFdSlot::create(base as *mut _, len); + memfd_slot.instantiate(minimum, Some(image))?; + Some(memfd_slot) + } + None => None, + }; + Ok(Self { mmap, accessible: minimum, @@ -134,6 +174,7 @@ impl MmapMemory { pre_guard_size: pre_guard_bytes, offset_guard_size: offset_guard_bytes, extra_to_reserve_on_growth, + memfd, }) } } @@ -166,7 +207,19 @@ impl RuntimeLinearMemory for MmapMemory { new_mmap.as_mut_slice()[self.pre_guard_size..][..self.accessible] .copy_from_slice(&self.mmap.as_slice()[self.pre_guard_size..][..self.accessible]); + // Now drop the MemFdSlot, if any. We've lost the CoW + // advantages by explicitly copying all data, but we have + // preserved all of its content; so we no longer need the + // memfd mapping. We need to do this before we + // (implicitly) drop the `mmap` field by overwriting it + // below. + let _ = self.memfd.take(); + self.mmap = new_mmap; + } else if let Some(memfd) = self.memfd.as_mut() { + // MemFdSlot has its own growth mechanisms; defer to its + // implementation. + memfd.set_heap_limit(new_size)?; } else { // If the new size of this heap fits within the existing allocation // then all we need to do is to make the new pages accessible. This @@ -192,6 +245,12 @@ impl RuntimeLinearMemory for MmapMemory { current_length: self.accessible, } } + + fn needs_init(&self) -> bool { + // If we're using a memfd CoW mapping, then no initialization + // is needed. + self.memfd.is_none() + } } /// Representation of a runtime wasm linear memory. @@ -232,9 +291,15 @@ impl Memory { plan: &MemoryPlan, creator: &dyn RuntimeMemoryCreator, store: &mut dyn Store, + memfd_image: Option<&Arc>, ) -> Result { let (minimum, maximum) = Self::limit_new(plan, store)?; - Ok(Memory::Dynamic(creator.new_memory(plan, minimum, maximum)?)) + Ok(Memory::Dynamic(creator.new_memory( + plan, + minimum, + maximum, + memfd_image, + )?)) } /// Create a new static (immovable) memory instance for the specified plan. @@ -382,19 +447,17 @@ impl Memory { } } - /// Returns whether or not this memory is backed by a MemFD - /// image. Note that this is testing whether there is actually an - /// *image* mapped, not just whether the MemFD mechanism is being - /// used. The distinction is important because if we are not using - /// a prevalidated and prepared image, we need to fall back to - /// ordinary initialization code. - pub(crate) fn is_memfd_with_image(&self) -> bool { + /// Returns whether or not this memory needs initialization. It + /// may not if it already has initial content thanks to a CoW + /// mechanism like memfd. + pub(crate) fn needs_init(&self) -> bool { match self { Memory::Static { memfd_slot: Some(ref slot), .. 
- } => slot.has_image(), - _ => false, + } => !slot.has_image(), + Memory::Dynamic(mem) => mem.needs_init(), + _ => true, } } diff --git a/crates/wasmtime/src/trampoline/memory.rs b/crates/wasmtime/src/trampoline/memory.rs index 942cb6bd6f..bd47e45144 100644 --- a/crates/wasmtime/src/trampoline/memory.rs +++ b/crates/wasmtime/src/trampoline/memory.rs @@ -6,7 +6,9 @@ use anyhow::{anyhow, Result}; use std::convert::TryFrom; use std::sync::Arc; use wasmtime_environ::{EntityIndex, MemoryPlan, MemoryStyle, Module, WASM_PAGE_SIZE}; -use wasmtime_runtime::{RuntimeLinearMemory, RuntimeMemoryCreator, VMMemoryDefinition}; +use wasmtime_runtime::{ + MemoryMemFd, RuntimeLinearMemory, RuntimeMemoryCreator, VMMemoryDefinition, +}; pub fn create_memory(store: &mut StoreOpaque, memory: &MemoryType) -> Result { let mut module = Module::new(); @@ -46,6 +48,10 @@ impl RuntimeLinearMemory for LinearMemoryProxy { current_length: self.mem.byte_size(), } } + + fn needs_init(&self) -> bool { + true + } } #[derive(Clone)] @@ -57,6 +63,7 @@ impl RuntimeMemoryCreator for MemoryCreatorProxy { plan: &MemoryPlan, minimum: usize, maximum: Option, + _: Option<&Arc>, ) -> Result> { let ty = MemoryType::from_wasmtime_memory(&plan.memory); let reserved_size_in_bytes = match plan.style { From 982df2f2e5e7018ae8bdbf9b534743a816ab3c60 Mon Sep 17 00:00:00 2001 From: Chris Fallin Date: Mon, 31 Jan 2022 16:11:25 -0800 Subject: [PATCH 04/12] Review feedback. --- crates/runtime/src/instance/allocator.rs | 10 - .../runtime/src/instance/allocator/memfd.rs | 258 ----------------- .../runtime/src/instance/allocator/pooling.rs | 18 +- crates/runtime/src/lib.rs | 42 +-- crates/runtime/src/memfd.rs | 274 +++++++++++++++++- .../allocator => }/memfd_disabled.rs | 29 +- crates/runtime/src/memory.rs | 11 +- 7 files changed, 332 insertions(+), 310 deletions(-) delete mode 100644 crates/runtime/src/instance/allocator/memfd.rs rename crates/runtime/src/{instance/allocator => }/memfd_disabled.rs (59%) diff --git a/crates/runtime/src/instance/allocator.rs b/crates/runtime/src/instance/allocator.rs index 739313da72..1ee7a195fa 100644 --- a/crates/runtime/src/instance/allocator.rs +++ b/crates/runtime/src/instance/allocator.rs @@ -25,16 +25,6 @@ use wasmtime_environ::{ #[cfg(feature = "pooling-allocator")] mod pooling; -#[cfg(feature = "memfd-allocator")] -mod memfd; -#[cfg(feature = "memfd-allocator")] -pub use self::memfd::MemFdSlot; - -#[cfg(not(feature = "memfd-allocator"))] -mod memfd_disabled; -#[cfg(not(feature = "memfd-allocator"))] -pub use self::memfd_disabled::MemFdSlot; - #[cfg(feature = "pooling-allocator")] pub use self::pooling::{ InstanceLimits, ModuleLimits, PoolingAllocationStrategy, PoolingInstanceAllocator, diff --git a/crates/runtime/src/instance/allocator/memfd.rs b/crates/runtime/src/instance/allocator/memfd.rs deleted file mode 100644 index 67741f8bbd..0000000000 --- a/crates/runtime/src/instance/allocator/memfd.rs +++ /dev/null @@ -1,258 +0,0 @@ -//! memfd mapping logic for use by the pooling allocator. - -use crate::memfd::MemoryMemFd; -use crate::InstantiationError; -use anyhow::Result; -use libc::c_void; -use rustix::fd::AsRawFd; -use std::sync::Arc; - -/// A single slot handled by the memfd instance-heap mechanism. 
-/// -/// The mmap scheme is: -/// -/// base ==> (points here) -/// - (image.offset bytes) anonymous zero memory, pre-image -/// - (image.len bytes) CoW mapping of memfd heap image -/// - (up to static_size) anonymous zero memory, post-image -/// -/// The ordering of mmaps to set this up is: -/// -/// - once, when pooling allocator is created: -/// - one large mmap to create 8GiB * instances * memories slots -/// -/// - per instantiation of new image in a slot: -/// - mmap of anonymous zero memory, from 0 to max heap size -/// (static_size) -/// - mmap of CoW'd memfd image, from `image.offset` to -/// `image.offset + image.len`. This overwrites part of the -/// anonymous zero memory, potentially splitting it into a pre- -/// and post-region. -/// - mprotect(PROT_NONE) on the part of the heap beyond the initial -/// heap size; we re-mprotect it with R+W bits when the heap is -/// grown. -#[derive(Debug)] -pub struct MemFdSlot { - /// The base of the actual heap memory. Bytes at this address are - /// what is seen by the Wasm guest code. - base: usize, - /// The maximum static memory size, plus post-guard. - static_size: usize, - /// The memfd image that backs this memory. May be `None`, in - /// which case the memory is all zeroes. - pub(crate) image: Option>, - /// The initial heap size. - initial_size: usize, - /// The current heap size. All memory above `base + cur_size` - /// should be PROT_NONE (mapped inaccessible). - cur_size: usize, - /// Whether this slot may have "dirty" pages (pages written by an - /// instantiation). Set by `instantiate()` and cleared by - /// `clear_and_remain_ready()`, and used in assertions to ensure - /// those methods are called properly. - dirty: bool, -} - -impl MemFdSlot { - pub(crate) fn create(base_addr: *mut c_void, static_size: usize) -> Self { - let base = base_addr as usize; - MemFdSlot { - base, - static_size, - initial_size: 0, - cur_size: 0, - image: None, - dirty: false, - } - } - - pub(crate) fn set_heap_limit(&mut self, size_bytes: usize) -> Result<()> { - assert!(size_bytes > self.cur_size); - // mprotect the relevant region. - let start = self.base + self.cur_size; - let len = size_bytes - self.cur_size; - unsafe { - rustix::io::mprotect( - start as *mut _, - len, - rustix::io::MprotectFlags::READ | rustix::io::MprotectFlags::WRITE, - )?; - } - - Ok(()) - } - - pub(crate) fn instantiate( - &mut self, - initial_size_bytes: usize, - maybe_image: Option<&Arc>, - ) -> Result<(), InstantiationError> { - assert!(!self.dirty); - - // Fast-path: previously instantiated with the same image, or - // no image but the same initial size, so the mappings are - // already correct; there is no need to mmap anything. Given - // that we asserted not-dirty above, any dirty pages will have - // already been thrown away by madvise() during the previous - // termination. The `clear_and_remain_ready()` path also - // mprotects memory above the initial heap size back to - // PROT_NONE, so we don't need to do that here. - if (self.image.is_none() - && maybe_image.is_none() - && self.initial_size == initial_size_bytes) - || (self.image.is_some() - && maybe_image.is_some() - && self.image.as_ref().unwrap().fd.as_file().as_raw_fd() - == maybe_image.as_ref().unwrap().fd.as_file().as_raw_fd()) - { - self.dirty = true; - return Ok(()); - } - - // Otherwise, we need to redo (i) the anonymous-mmap backing - // for the whole slot, (ii) the initial-heap-image mapping if - // present, and (iii) the mprotect(PROT_NONE) above the - // initial heap size. 
- - // Security/audit note: we map all of these MAP_PRIVATE, so - // all instance data is local to the mapping, not propagated - // to the backing fd. We throw away this CoW overlay with - // madvise() below, from base up to static_size (which is the - // whole slot) when terminating the instance. - - // Anonymous mapping behind the initial heap size: this gives - // zeroes for any "holes" in the initial heap image. Anonymous - // mmap memory is faster to fault in than a CoW of a file, - // even a file with zero holes, because the kernel's CoW path - // unconditionally copies *something* (even if just a page of - // zeroes). Anonymous zero pages are fast: the kernel - // pre-zeroes them, and even if it runs out of those, a memset - // is half as expensive as a memcpy (only writes, no reads). - if initial_size_bytes > 0 { - unsafe { - let ptr = rustix::io::mmap_anonymous( - self.base as *mut c_void, - self.static_size, - rustix::io::ProtFlags::READ | rustix::io::ProtFlags::WRITE, - rustix::io::MapFlags::PRIVATE | rustix::io::MapFlags::FIXED, - ) - .map_err(|e| InstantiationError::Resource(e.into()))?; - assert_eq!(ptr as usize, self.base); - } - } - - // The initial memory image, if given. If not, we just get a - // memory filled with zeroes. - if let Some(image) = maybe_image { - if image.len > 0 { - let image = image.clone(); - - unsafe { - let fd = rustix::fd::BorrowedFd::borrow_raw_fd(image.fd.as_file().as_raw_fd()); - let ptr = rustix::io::mmap( - (self.base + image.offset) as *mut c_void, - image.len, - rustix::io::ProtFlags::READ | rustix::io::ProtFlags::WRITE, - rustix::io::MapFlags::PRIVATE | rustix::io::MapFlags::FIXED, - &fd, - image.offset as u64, - ) - .map_err(|e| InstantiationError::Resource(e.into()))?; - assert_eq!(ptr as usize, self.base + image.offset); - } - - self.image = Some(image); - } - } - - // mprotect above `initial_size_bytes`. - self.initial_size = initial_size_bytes; - self.protect_past_initial_size() - .map_err(|e| InstantiationError::Resource(e.into()))?; - - self.dirty = true; - Ok(()) - } - - pub(crate) fn clear_and_remain_ready(&mut self) -> Result<()> { - assert!(self.dirty); - // madvise the image range. This will throw away dirty pages, - // which are CoW-private pages on top of the initial heap - // image memfd. - unsafe { - rustix::io::madvise( - self.base as *mut c_void, - self.static_size, - rustix::io::Advice::LinuxDontNeed, - )?; - } - - // mprotect the region beyond the initial heap size back to PROT_NONE. - self.protect_past_initial_size()?; - self.dirty = false; - Ok(()) - } - - fn protect_past_initial_size(&self) -> Result<()> { - let mprotect_start = self.base + self.initial_size; - let mprotect_len = self.static_size - self.initial_size; - if mprotect_len > 0 { - unsafe { - rustix::io::mprotect( - mprotect_start as *mut _, - mprotect_len, - rustix::io::MprotectFlags::empty(), - )?; - } - } - - Ok(()) - } - - pub(crate) fn has_image(&self) -> bool { - self.image.is_some() - } - - pub(crate) fn is_dirty(&self) -> bool { - self.dirty - } -} - -#[cfg(feature = "memfd-allocator")] -impl Drop for MemFdSlot { - fn drop(&mut self) { - // The MemFdSlot may be dropped if there is an error during - // instantiation: for example, if a memory-growth limiter - // disallows a guest from having a memory of a certain size, - // after we've already initialized the MemFdSlot. - // - // We need to return this region of the large pool mmap to a - // safe state (with no module-specific mappings). 
The - // MemFdSlot will not be returned to the MemoryPool, so a new - // MemFdSlot will be created and overwrite the mappings anyway - // on the slot's next use; but for safety and to avoid - // resource leaks it's better not to have stale mappings to a - // possibly-otherwise-dead module's image. - // - // To "wipe the slate clean", let's do a mmap of anonymous - // memory over the whole region, with PROT_NONE. Note that we - // *can't* simply munmap, because that leaves a hole in the - // middle of the pooling allocator's big memory area that some - // other random mmap may swoop in and take, to be trampled - // over by the next MemFdSlot later. - // - // Since we're in drop(), we can't sanely return an error if - // this mmap fails. Let's ignore the failure if so; the next - // MemFdSlot to be created for this slot will try to overwrite - // the existing stale mappings, and return a failure properly - // if we still cannot map new memory. - unsafe { - let _ = rustix::io::mmap_anonymous( - self.base as *mut _, - self.static_size, - rustix::io::ProtFlags::empty(), - rustix::io::MapFlags::FIXED | rustix::io::MapFlags::NORESERVE, - ); - } - } -} diff --git a/crates/runtime/src/instance/allocator/pooling.rs b/crates/runtime/src/instance/allocator/pooling.rs index fb60ffc4b1..abf2683c45 100644 --- a/crates/runtime/src/instance/allocator/pooling.rs +++ b/crates/runtime/src/instance/allocator/pooling.rs @@ -7,11 +7,11 @@ //! Using the pooling instance allocator can speed up module instantiation //! when modules can be constrained based on configurable limits. -use super::MemFdSlot; use super::{ initialize_instance, initialize_vmcontext, InstanceAllocationRequest, InstanceAllocator, InstanceHandle, InstantiationError, }; +use crate::MemFdSlot; use crate::{instance::Instance, Memory, Mmap, ModuleMemFds, Table}; use anyhow::{anyhow, bail, Context, Result}; use libc::c_void; @@ -765,6 +765,22 @@ impl MemoryPool { } } +impl Drop for MemoryPool { + fn drop(&mut self) { + // Clear the `clear_no_drop` flag (i.e., ask to *not* clear on + // drop) for all MemFdSlots, and then drop them here. This is + // valid because the one `Mmap` that covers the whole region + // can just do its one munmap. + for memfd in std::mem::take(&mut self.memfd_slots) { + if let Some(memfd_slot) = memfd.lock().unwrap().as_mut() { + unsafe { + memfd_slot.no_clear_on_drop(); + } + } + } + } +} + /// Represents a pool of WebAssembly tables. /// /// Each instance index into the pool returns an iterator over the base addresses diff --git a/crates/runtime/src/lib.rs b/crates/runtime/src/lib.rs index 550480b3b4..822970727c 100644 --- a/crates/runtime/src/lib.rs +++ b/crates/runtime/src/lib.rs @@ -69,45 +69,13 @@ pub use module_id::{CompiledModuleId, CompiledModuleIdAllocator}; #[cfg(feature = "memfd-allocator")] mod memfd; +#[cfg(feature = "memfd-allocator")] +pub use crate::memfd::{MemFdSlot, MemoryMemFd, ModuleMemFds}; -pub use crate::memfd::MemoryMemFd; - -/// When memfd support is not included, provide a shim type and -/// constructor instead so that higher-level code does not need -/// feature-conditional compilation. #[cfg(not(feature = "memfd-allocator"))] -#[allow(dead_code)] -mod memfd { - use anyhow::Result; - use std::sync::Arc; - use wasmtime_environ::{DefinedMemoryIndex, Module}; - - /// A shim for the memfd image container when memfd support is not - /// included. - pub enum ModuleMemFds {} - - /// A shim for an individual memory image. 
- #[allow(dead_code)] - pub enum MemoryMemFd {} - - impl ModuleMemFds { - /// Construct a new set of memfd images. This variant is used - /// when memfd support is not included; it always returns no - /// images. - pub fn new(_: &Module, _: &[u8]) -> Result>> { - Ok(None) - } - - /// Get the memfd image for a particular memory. - pub(crate) fn get_memory_image(&self, _: DefinedMemoryIndex) -> Option<&Arc> { - // Should be unreachable because the `Self` type is - // uninhabitable. - match *self {} - } - } -} - -pub use crate::memfd::ModuleMemFds; +mod memfd_disabled; +#[cfg(not(feature = "memfd-allocator"))] +pub use crate::memfd_disabled::{MemFdSlot, MemoryMemFd, ModuleMemFds}; /// Version number of this crate. pub const VERSION: &str = env!("CARGO_PKG_VERSION"); diff --git a/crates/runtime/src/memfd.rs b/crates/runtime/src/memfd.rs index dc6e2ef815..1740aba324 100644 --- a/crates/runtime/src/memfd.rs +++ b/crates/runtime/src/memfd.rs @@ -1,7 +1,11 @@ -//! memfd support. +//! memfd support: creation of backing images for modules, and logic +//! to support mapping these backing images into memory. +use crate::InstantiationError; use anyhow::Result; +use libc::c_void; use memfd::{Memfd, MemfdOptions}; +use rustix::fd::AsRawFd; use rustix::fs::FileExt; use std::convert::TryFrom; use std::sync::Arc; @@ -237,3 +241,271 @@ impl ModuleMemFds { Ok(Some(Arc::new(ModuleMemFds { memories }))) } } + +/// A single slot handled by the memfd instance-heap mechanism. +/// +/// The mmap scheme is: +/// +/// base ==> (points here) +/// - (image.offset bytes) anonymous zero memory, pre-image +/// - (image.len bytes) CoW mapping of memfd heap image +/// - (up to static_size) anonymous zero memory, post-image +/// +/// The ordering of mmaps to set this up is: +/// +/// - once, when pooling allocator is created: +/// - one large mmap to create 8GiB * instances * memories slots +/// +/// - per instantiation of new image in a slot: +/// - mmap of anonymous zero memory, from 0 to max heap size +/// (static_size) +/// - mmap of CoW'd memfd image, from `image.offset` to +/// `image.offset + image.len`. This overwrites part of the +/// anonymous zero memory, potentially splitting it into a pre- +/// and post-region. +/// - mprotect(PROT_NONE) on the part of the heap beyond the initial +/// heap size; we re-mprotect it with R+W bits when the heap is +/// grown. +#[derive(Debug)] +pub struct MemFdSlot { + /// The base of the actual heap memory. Bytes at this address are + /// what is seen by the Wasm guest code. + base: usize, + /// The maximum static memory size, plus post-guard. + static_size: usize, + /// The memfd image that backs this memory. May be `None`, in + /// which case the memory is all zeroes. + pub(crate) image: Option>, + /// The initial heap size. + initial_size: usize, + /// The current heap size. All memory above `base + cur_size` + /// should be PROT_NONE (mapped inaccessible). + cur_size: usize, + /// Whether this slot may have "dirty" pages (pages written by an + /// instantiation). Set by `instantiate()` and cleared by + /// `clear_and_remain_ready()`, and used in assertions to ensure + /// those methods are called properly. + dirty: bool, + /// Whether this MemFdSlot is responsible for mapping anonymous + /// memory (to hold the reservation while overwriting mappings + /// specific to this slot) in place when it is dropped. Default + /// on, unless the caller knows what they are doing. 
+ clear_on_drop: bool, +} + +impl MemFdSlot { + pub(crate) fn create(base_addr: *mut c_void, static_size: usize) -> Self { + let base = base_addr as usize; + MemFdSlot { + base, + static_size, + initial_size: 0, + cur_size: 0, + image: None, + dirty: false, + clear_on_drop: true, + } + } + + /// Inform the MemFdSlot that it should *not* clear the underlying + /// address space when dropped. This should be used only when the + /// caller will clear or reuse the address space in some other + /// way. + pub(crate) unsafe fn no_clear_on_drop(&mut self) { + self.clear_on_drop = false; + } + + pub(crate) fn set_heap_limit(&mut self, size_bytes: usize) -> Result<()> { + assert!(size_bytes > self.cur_size); + // mprotect the relevant region. + let start = self.base + self.cur_size; + let len = size_bytes - self.cur_size; + unsafe { + rustix::io::mprotect( + start as *mut _, + len, + rustix::io::MprotectFlags::READ | rustix::io::MprotectFlags::WRITE, + )?; + } + + Ok(()) + } + + pub(crate) fn instantiate( + &mut self, + initial_size_bytes: usize, + maybe_image: Option<&Arc>, + ) -> Result<(), InstantiationError> { + assert!(!self.dirty); + + // Fast-path: previously instantiated with the same image, or + // no image but the same initial size, so the mappings are + // already correct; there is no need to mmap anything. Given + // that we asserted not-dirty above, any dirty pages will have + // already been thrown away by madvise() during the previous + // termination. The `clear_and_remain_ready()` path also + // mprotects memory above the initial heap size back to + // PROT_NONE, so we don't need to do that here. + if (self.image.is_none() + && maybe_image.is_none() + && self.initial_size == initial_size_bytes) + || (self.image.is_some() + && maybe_image.is_some() + && self.image.as_ref().unwrap().fd.as_file().as_raw_fd() + == maybe_image.as_ref().unwrap().fd.as_file().as_raw_fd()) + { + self.dirty = true; + return Ok(()); + } + + // Otherwise, we need to redo (i) the anonymous-mmap backing + // for the whole slot, (ii) the initial-heap-image mapping if + // present, and (iii) the mprotect(PROT_NONE) above the + // initial heap size. + + // Security/audit note: we map all of these MAP_PRIVATE, so + // all instance data is local to the mapping, not propagated + // to the backing fd. We throw away this CoW overlay with + // madvise() below, from base up to static_size (which is the + // whole slot) when terminating the instance. + + // Anonymous mapping behind the initial heap size: this gives + // zeroes for any "holes" in the initial heap image. Anonymous + // mmap memory is faster to fault in than a CoW of a file, + // even a file with zero holes, because the kernel's CoW path + // unconditionally copies *something* (even if just a page of + // zeroes). Anonymous zero pages are fast: the kernel + // pre-zeroes them, and even if it runs out of those, a memset + // is half as expensive as a memcpy (only writes, no reads). + self.map_anon_memory(rustix::io::ProtFlags::READ | rustix::io::ProtFlags::WRITE) + .map_err(|e| InstantiationError::Resource(e.into()))?; + + // The initial memory image, if given. If not, we just get a + // memory filled with zeroes. 
+ if let Some(image) = maybe_image { + if image.len > 0 { + let image = image.clone(); + + unsafe { + let ptr = rustix::io::mmap( + (self.base + image.offset) as *mut c_void, + image.len, + rustix::io::ProtFlags::READ | rustix::io::ProtFlags::WRITE, + rustix::io::MapFlags::PRIVATE | rustix::io::MapFlags::FIXED, + image.fd.as_file(), + image.offset as u64, + ) + .map_err(|e| InstantiationError::Resource(e.into()))?; + assert_eq!(ptr as usize, self.base + image.offset); + } + + self.image = Some(image); + } + } + + // mprotect above `initial_size_bytes`. + self.initial_size = initial_size_bytes; + self.protect_past_initial_size() + .map_err(|e| InstantiationError::Resource(e.into()))?; + + self.dirty = true; + Ok(()) + } + + pub(crate) fn clear_and_remain_ready(&mut self) -> Result<()> { + assert!(self.dirty); + // madvise the image range. This will throw away dirty pages, + // which are CoW-private pages on top of the initial heap + // image memfd. + unsafe { + rustix::io::madvise( + self.base as *mut c_void, + self.static_size, + rustix::io::Advice::LinuxDontNeed, + )?; + } + + // mprotect the region beyond the initial heap size back to PROT_NONE. + self.protect_past_initial_size()?; + self.dirty = false; + Ok(()) + } + + fn protect_past_initial_size(&self) -> Result<()> { + let mprotect_start = self.base + self.initial_size; + let mprotect_len = self.static_size - self.initial_size; + if mprotect_len > 0 { + unsafe { + rustix::io::mprotect( + mprotect_start as *mut _, + mprotect_len, + rustix::io::MprotectFlags::empty(), + )?; + } + } + + Ok(()) + } + + pub(crate) fn has_image(&self) -> bool { + self.image.is_some() + } + + pub(crate) fn is_dirty(&self) -> bool { + self.dirty + } + + /// Map anonymous zeroed memory across the whole slot, with the + /// given protections. Used both during instantiate and during + /// drop. + fn map_anon_memory(&self, prot: rustix::io::ProtFlags) -> Result<()> { + unsafe { + let ptr = rustix::io::mmap_anonymous( + self.base as *mut c_void, + self.static_size, + prot, + rustix::io::MapFlags::PRIVATE | rustix::io::MapFlags::FIXED, + )?; + assert_eq!(ptr as usize, self.base); + } + Ok(()) + } +} + +impl Drop for MemFdSlot { + fn drop(&mut self) { + // The MemFdSlot may be dropped if there is an error during + // instantiation: for example, if a memory-growth limiter + // disallows a guest from having a memory of a certain size, + // after we've already initialized the MemFdSlot. + // + // We need to return this region of the large pool mmap to a + // safe state (with no module-specific mappings). The + // MemFdSlot will not be returned to the MemoryPool, so a new + // MemFdSlot will be created and overwrite the mappings anyway + // on the slot's next use; but for safety and to avoid + // resource leaks it's better not to have stale mappings to a + // possibly-otherwise-dead module's image. + // + // To "wipe the slate clean", let's do a mmap of anonymous + // memory over the whole region, with PROT_NONE. Note that we + // *can't* simply munmap, because that leaves a hole in the + // middle of the pooling allocator's big memory area that some + // other random mmap may swoop in and take, to be trampled + // over by the next MemFdSlot later. + // + // Since we're in drop(), we can't sanely return an error if + // this mmap fails. Let's ignore the failure if so; the next + // MemFdSlot to be created for this slot will try to overwrite + // the existing stale mappings, and return a failure properly + // if we still cannot map new memory. 
+ // + // The exception to all of this is if the `unmap_on_drop` flag + // (which is set by default) is false. If so, the owner of + // this MemFdSlot has indicated that it will clean up in some + // other way. + if self.clear_on_drop { + let _ = self.map_anon_memory(rustix::io::ProtFlags::empty()); + } + } +} diff --git a/crates/runtime/src/instance/allocator/memfd_disabled.rs b/crates/runtime/src/memfd_disabled.rs similarity index 59% rename from crates/runtime/src/instance/allocator/memfd_disabled.rs rename to crates/runtime/src/memfd_disabled.rs index 304dd3eebb..30dfb5fa8f 100644 --- a/crates/runtime/src/instance/allocator/memfd_disabled.rs +++ b/crates/runtime/src/memfd_disabled.rs @@ -5,6 +5,31 @@ use crate::InstantiationError; use anyhow::Result; use std::sync::Arc; +use wasmtime_environ::{DefinedMemoryIndex, Module}; + +/// A shim for the memfd image container when memfd support is not +/// included. +pub enum ModuleMemFds {} + +/// A shim for an individual memory image. +#[allow(dead_code)] +pub enum MemoryMemFd {} + +impl ModuleMemFds { + /// Construct a new set of memfd images. This variant is used + /// when memfd support is not included; it always returns no + /// images. + pub fn new(_: &Module, _: &[u8]) -> Result>> { + Ok(None) + } + + /// Get the memfd image for a particular memory. + pub(crate) fn get_memory_image(&self, _: DefinedMemoryIndex) -> Option<&Arc> { + // Should be unreachable because the `Self` type is + // uninhabitable. + match *self {} + } +} /// A placeholder for MemFdSlot when we have not included the pooling /// allocator. @@ -26,11 +51,13 @@ impl MemFdSlot { pub(crate) fn instantiate( &mut self, _: usize, - _: Option<&Arc>, + _: Option<&Arc>, ) -> Result { panic!("instantiate() on invalid MemFdSlot"); } + pub(crate) unsafe fn no_clear_on_drop(&mut self) {} + pub(crate) fn clear_and_remain_ready(&mut self) -> Result<()> { Ok(()) } diff --git a/crates/runtime/src/memory.rs b/crates/runtime/src/memory.rs index 71c77b43ca..99447ed82b 100644 --- a/crates/runtime/src/memory.rs +++ b/crates/runtime/src/memory.rs @@ -2,10 +2,10 @@ //! //! `RuntimeLinearMemory` is to WebAssembly linear memories what `Table` is to WebAssembly tables. -use crate::instance::MemFdSlot; -use crate::memfd::MemoryMemFd; use crate::mmap::Mmap; use crate::vmcontext::VMMemoryDefinition; +use crate::MemFdSlot; +use crate::MemoryMemFd; use crate::Store; use anyhow::Error; use anyhow::{bail, format_err, Result}; @@ -162,6 +162,13 @@ impl MmapMemory { let len = request_bytes - pre_guard_bytes; let mut memfd_slot = MemFdSlot::create(base as *mut _, len); memfd_slot.instantiate(minimum, Some(image))?; + unsafe { + // On drop, we will unmap our mmap'd range that + // this memfd_slot was mapped on top of, so there + // is no need for the memfd_slot to wipe it with + // an anonymous mapping first. + memfd_slot.no_clear_on_drop(); + } Some(memfd_slot) } None => None, From ccfa245261bea22b2a5ef381a915606f6e1773d0 Mon Sep 17 00:00:00 2001 From: Chris Fallin Date: Mon, 31 Jan 2022 17:03:42 -0800 Subject: [PATCH 05/12] Optimization: only mprotect the *new* bit of heap, not all of it. (This was not a correctness bug, but is an obvious performance bug...) 
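Concretely (figures invented for illustration, not measured as part of this patch): before this change, `set_heap_limit()` re-applied read/write protection from the base of the heap all the way up to the new size on every growth, because `cur_size` was never advanced; growing a heap that had already reached 512 MiB by a single 64 KiB Wasm page therefore issued an mprotect() spanning roughly 512 MiB. With `cur_size` updated at instantiation and after each growth (as the hunk below shows), the same grow only changes protections on the new 64 KiB.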
--- crates/runtime/src/memfd.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/crates/runtime/src/memfd.rs b/crates/runtime/src/memfd.rs index 1740aba324..2a0aea640e 100644 --- a/crates/runtime/src/memfd.rs +++ b/crates/runtime/src/memfd.rs @@ -316,7 +316,12 @@ impl MemFdSlot { } pub(crate) fn set_heap_limit(&mut self, size_bytes: usize) -> Result<()> { - assert!(size_bytes > self.cur_size); + assert!( + size_bytes > self.cur_size, + "size_bytes = {} cur_size = {}", + size_bytes, + self.cur_size + ); // mprotect the relevant region. let start = self.base + self.cur_size; let len = size_bytes - self.cur_size; @@ -327,6 +332,7 @@ impl MemFdSlot { rustix::io::MprotectFlags::READ | rustix::io::MprotectFlags::WRITE, )?; } + self.cur_size = size_bytes; Ok(()) } @@ -355,6 +361,7 @@ impl MemFdSlot { == maybe_image.as_ref().unwrap().fd.as_file().as_raw_fd()) { self.dirty = true; + self.cur_size = initial_size_bytes; return Ok(()); } @@ -405,6 +412,7 @@ impl MemFdSlot { // mprotect above `initial_size_bytes`. self.initial_size = initial_size_bytes; + self.cur_size = initial_size_bytes; self.protect_past_initial_size() .map_err(|e| InstantiationError::Resource(e.into()))?; From 0ff8f6ab2020c721b0d8065cd503422e4d0dbf10 Mon Sep 17 00:00:00 2001 From: Chris Fallin Date: Mon, 31 Jan 2022 16:54:04 -0800 Subject: [PATCH 06/12] Make build-config magic use memfd by default. --- .github/workflows/main.yml | 3 --- Cargo.toml | 3 +-- crates/runtime/Cargo.toml | 4 +--- crates/runtime/build.rs | 11 +++++++++++ crates/runtime/src/instance/allocator/pooling.rs | 2 +- crates/runtime/src/lib.rs | 10 +++++----- crates/runtime/src/memfd_disabled.rs | 2 -- crates/runtime/src/traphandlers/unix.rs | 9 +-------- crates/wasmtime/Cargo.toml | 4 ++-- 9 files changed, 22 insertions(+), 26 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 5e5e0c64d9..8516d4ec0a 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -136,7 +136,6 @@ jobs: - run: cargo check -p wasmtime --no-default-features --features async - run: cargo check -p wasmtime --no-default-features --features uffd - run: cargo check -p wasmtime --no-default-features --features pooling-allocator - - run: cargo check -p wasmtime --no-default-features --features memfd-allocator - run: cargo check -p wasmtime --no-default-features --features cranelift - run: cargo check -p wasmtime --no-default-features --features cranelift,wat,async,cache @@ -316,8 +315,6 @@ jobs: cargo test --features uffd -p wasmtime-runtime instance::allocator::pooling cargo test --features uffd -p wasmtime-cli pooling_allocator cargo test --features uffd -p wasmtime-cli wast::Cranelift - cargo test --features memfd-allocator -p wasmtime-cli pooling_allocator - cargo test --features memfd-allocator -p wasmtime-cli wast::Cranelift if: matrix.os == 'ubuntu-latest' && matrix.target == '' env: RUST_BACKTRACE: 1 diff --git a/Cargo.toml b/Cargo.toml index 51c4843fcc..bc385ece76 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,7 +21,7 @@ path = "src/bin/wasmtime.rs" doc = false [dependencies] -wasmtime = { path = "crates/wasmtime", version = "0.33.0", default-features = false, features = ['cache', 'cranelift'] } +wasmtime = { path = "crates/wasmtime", version = "0.33.0", default-features = false, features = ['cache', 'cranelift', 'pooling-allocator', 'memfd'] } wasmtime-cache = { path = "crates/cache", version = "=0.33.0" } wasmtime-cranelift = { path = "crates/cranelift", version = "=0.33.0" } wasmtime-environ = { path = 
"crates/environ", version = "=0.33.0" } @@ -96,7 +96,6 @@ wasi-crypto = ["wasmtime-wasi-crypto"] wasi-nn = ["wasmtime-wasi-nn"] uffd = ["wasmtime/uffd"] pooling-allocator = ["wasmtime/pooling-allocator"] -memfd-allocator = ["pooling-allocator", "wasmtime/memfd-allocator"] all-arch = ["wasmtime/all-arch"] posix-signals-on-macos = ["wasmtime/posix-signals-on-macos"] diff --git a/crates/runtime/Cargo.toml b/crates/runtime/Cargo.toml index aaef30f677..a4b717fee6 100644 --- a/crates/runtime/Cargo.toml +++ b/crates/runtime/Cargo.toml @@ -25,6 +25,7 @@ backtrace = "0.3.61" lazy_static = "1.3.0" rand = "0.8.3" anyhow = "1.0.38" +memfd = { version = "0.4.1", optional = true } [target.'cfg(target_os = "macos")'.dependencies] mach = "0.3.2" @@ -37,7 +38,6 @@ winapi = { version = "0.3.7", features = ["winbase", "memoryapi", "errhandlingap [target.'cfg(target_os = "linux")'.dependencies] userfaultfd = { version = "0.4.1", optional = true } -memfd = { version = "0.4.1", optional = true } [build-dependencies] cc = "1.0" @@ -60,5 +60,3 @@ uffd = ["userfaultfd", "pooling-allocator"] # It is useful for applications that do not bind their own exception ports and # need portable signal handling. posix-signals-on-macos = [] - -memfd-allocator = ["pooling-allocator", "memfd"] diff --git a/crates/runtime/build.rs b/crates/runtime/build.rs index 6f112f25c9..9c2741714a 100644 --- a/crates/runtime/build.rs +++ b/crates/runtime/build.rs @@ -10,4 +10,15 @@ fn main() { ) .file("src/helpers.c") .compile("wasmtime-helpers"); + + // Check to see if we are on Linux and the `memfd` feature is + // active. If so, enable the `memfd` rustc cfg so `#[cfg(memfd)]` + // will work. + let os = env::var("CARGO_CFG_TARGET_OS").unwrap(); + let is_memfd = env::var("CARGO_FEATURE_MEMFD").is_ok(); + let is_pooling = env::var("CARGO_FEATURE_POOLING_ALLOCATOR").is_ok(); + let is_uffd = env::var("CARGO_FEATURE_UFFD").is_ok(); + if &os == "linux" && is_memfd && is_pooling && !is_uffd { + println!("cargo:rustc-cfg=memfd"); + } } diff --git a/crates/runtime/src/instance/allocator/pooling.rs b/crates/runtime/src/instance/allocator/pooling.rs index abf2683c45..a633bbac29 100644 --- a/crates/runtime/src/instance/allocator/pooling.rs +++ b/crates/runtime/src/instance/allocator/pooling.rs @@ -703,7 +703,7 @@ impl MemoryPool { let mapping = Mmap::accessible_reserved(0, allocation_size) .context("failed to create memory pool mapping")?; - let num_memfd_slots = if cfg!(feature = "memfd-allocator") { + let num_memfd_slots = if cfg!(memfd) { max_instances * max_memories } else { 0 diff --git a/crates/runtime/src/lib.rs b/crates/runtime/src/lib.rs index 822970727c..fb1ffee621 100644 --- a/crates/runtime/src/lib.rs +++ b/crates/runtime/src/lib.rs @@ -19,7 +19,7 @@ clippy::use_self ) )] -#![cfg_attr(feature = "memfd-allocator", allow(dead_code))] +#![cfg_attr(memfd, allow(dead_code))] use std::sync::atomic::AtomicU64; @@ -67,14 +67,14 @@ pub use crate::vmcontext::{ mod module_id; pub use module_id::{CompiledModuleId, CompiledModuleIdAllocator}; -#[cfg(feature = "memfd-allocator")] +#[cfg(memfd)] mod memfd; -#[cfg(feature = "memfd-allocator")] +#[cfg(memfd)] pub use crate::memfd::{MemFdSlot, MemoryMemFd, ModuleMemFds}; -#[cfg(not(feature = "memfd-allocator"))] +#[cfg(not(memfd))] mod memfd_disabled; -#[cfg(not(feature = "memfd-allocator"))] +#[cfg(not(memfd))] pub use crate::memfd_disabled::{MemFdSlot, MemoryMemFd, ModuleMemFds}; /// Version number of this crate. 
diff --git a/crates/runtime/src/memfd_disabled.rs b/crates/runtime/src/memfd_disabled.rs index 30dfb5fa8f..a0adf10d2d 100644 --- a/crates/runtime/src/memfd_disabled.rs +++ b/crates/runtime/src/memfd_disabled.rs @@ -37,11 +37,9 @@ impl ModuleMemFds { /// To allow MemFdSlot to be unconditionally passed around in various /// places (e.g. a `Memory`), we define a zero-sized type when memfd is /// not included in the build. -#[cfg(not(feature = "memfd-allocator"))] #[derive(Debug)] pub struct MemFdSlot; -#[cfg(not(feature = "memfd-allocator"))] #[allow(dead_code)] impl MemFdSlot { pub(crate) fn create(_: *mut libc::c_void, _: usize) -> Self { diff --git a/crates/runtime/src/traphandlers/unix.rs b/crates/runtime/src/traphandlers/unix.rs index fd16bfcdd1..30545914d8 100644 --- a/crates/runtime/src/traphandlers/unix.rs +++ b/crates/runtime/src/traphandlers/unix.rs @@ -54,14 +54,7 @@ pub unsafe fn platform_init() { // Sometimes we need to handle SIGBUS too: // - On ARM, handle Unaligned Accesses. // - On Darwin, guard page accesses are raised as SIGBUS. - // - With the MemFD allocator, heap growth is controlled by - // ftruncate'ing an mmap'd file, and so out-of-bounds accesses - // are raised as SIGBUS. - if cfg!(target_arch = "arm") - || cfg!(target_os = "macos") - || cfg!(target_os = "freebsd") - || cfg!(feature = "memfd-allocator") - { + if cfg!(target_arch = "arm") || cfg!(target_os = "macos") || cfg!(target_os = "freebsd") { register(&mut PREV_SIGBUS, libc::SIGBUS); } } diff --git a/crates/wasmtime/Cargo.toml b/crates/wasmtime/Cargo.toml index c7b0037d0e..3f2930b872 100644 --- a/crates/wasmtime/Cargo.toml +++ b/crates/wasmtime/Cargo.toml @@ -50,7 +50,7 @@ wasi-cap-std-sync = { path = "../wasi-common/cap-std-sync" } maintenance = { status = "actively-developed" } [features] -default = ['async', 'cache', 'wat', 'jitdump', 'parallel-compilation', 'cranelift', 'pooling-allocator'] +default = ['async', 'cache', 'wat', 'jitdump', 'parallel-compilation', 'cranelift', 'pooling-allocator', 'memfd'] # An on-by-default feature enabling runtime compilation of WebAssembly modules # with the Cranelift compiler. Cranelift is the default compilation backend of @@ -90,4 +90,4 @@ all-arch = ["wasmtime-cranelift/all-arch"] # need portable signal handling. posix-signals-on-macos = ["wasmtime-runtime/posix-signals-on-macos"] -memfd-allocator = ["wasmtime-runtime/memfd-allocator", "pooling-allocator"] \ No newline at end of file +memfd = ["wasmtime-runtime/memfd", "pooling-allocator"] From 01e6bb81fb6d2aab64e6d3c4de783042c2a4bf31 Mon Sep 17 00:00:00 2001 From: Chris Fallin Date: Tue, 1 Feb 2022 15:49:44 -0800 Subject: [PATCH 07/12] Review feedback. --- crates/runtime/build.rs | 3 +- .../runtime/src/instance/allocator/pooling.rs | 8 +-- crates/runtime/src/lib.rs | 1 - crates/runtime/src/memfd.rs | 66 ++++++++++++++----- crates/runtime/src/memfd_disabled.rs | 2 +- crates/runtime/src/memory.rs | 12 ++-- crates/wasmtime/Cargo.toml | 2 +- 7 files changed, 60 insertions(+), 34 deletions(-) diff --git a/crates/runtime/build.rs b/crates/runtime/build.rs index 9c2741714a..1b5d21a933 100644 --- a/crates/runtime/build.rs +++ b/crates/runtime/build.rs @@ -16,9 +16,8 @@ fn main() { // will work. 
let os = env::var("CARGO_CFG_TARGET_OS").unwrap(); let is_memfd = env::var("CARGO_FEATURE_MEMFD").is_ok(); - let is_pooling = env::var("CARGO_FEATURE_POOLING_ALLOCATOR").is_ok(); let is_uffd = env::var("CARGO_FEATURE_UFFD").is_ok(); - if &os == "linux" && is_memfd && is_pooling && !is_uffd { + if &os == "linux" && is_memfd && !is_uffd { println!("cargo:rustc-cfg=memfd"); } } diff --git a/crates/runtime/src/instance/allocator/pooling.rs b/crates/runtime/src/instance/allocator/pooling.rs index a633bbac29..72a21ff5ba 100644 --- a/crates/runtime/src/instance/allocator/pooling.rs +++ b/crates/runtime/src/instance/allocator/pooling.rs @@ -771,11 +771,9 @@ impl Drop for MemoryPool { // drop) for all MemFdSlots, and then drop them here. This is // valid because the one `Mmap` that covers the whole region // can just do its one munmap. - for memfd in std::mem::take(&mut self.memfd_slots) { - if let Some(memfd_slot) = memfd.lock().unwrap().as_mut() { - unsafe { - memfd_slot.no_clear_on_drop(); - } + for mut memfd in std::mem::take(&mut self.memfd_slots) { + if let Some(memfd_slot) = memfd.get_mut().unwrap() { + memfd_slot.no_clear_on_drop(); } } } diff --git a/crates/runtime/src/lib.rs b/crates/runtime/src/lib.rs index fb1ffee621..c2c74b566b 100644 --- a/crates/runtime/src/lib.rs +++ b/crates/runtime/src/lib.rs @@ -19,7 +19,6 @@ clippy::use_self ) )] -#![cfg_attr(memfd, allow(dead_code))] use std::sync::atomic::AtomicU64; diff --git a/crates/runtime/src/memfd.rs b/crates/runtime/src/memfd.rs index 2a0aea640e..da335a53f4 100644 --- a/crates/runtime/src/memfd.rs +++ b/crates/runtime/src/memfd.rs @@ -42,11 +42,15 @@ pub struct MemoryMemFd { /// Length of image. Note that initial memory size may be larger; /// leading and trailing zeroes are truncated (handled by /// anonymous backing memfd). + /// + /// Must be a multiple of the system page size. pub len: usize, /// Image starts this many bytes into heap space. Note that the /// memfd's offsets are always equal to the heap offsets, so we /// map at an offset into the fd as well. (This simplifies /// construction.) + /// + /// Must be a multiple of the system page size. pub offset: usize, } @@ -231,6 +235,9 @@ impl ModuleMemFds { memfd.add_seal(memfd::FileSeal::SealWrite)?; memfd.add_seal(memfd::FileSeal::SealSeal)?; + assert_eq!(offset % page_size, 0); + assert_eq!(len % page_size, 0); + memories.push(Some(Arc::new(MemoryMemFd { fd: memfd, offset: usize::try_from(offset).unwrap(), @@ -294,6 +301,8 @@ pub struct MemFdSlot { } impl MemFdSlot { + /// Create a new MemFdSlot. Assumes that there is an anonymous + /// mmap backing in the given range to start. pub(crate) fn create(base_addr: *mut c_void, static_size: usize) -> Self { let base = base_addr as usize; MemFdSlot { @@ -311,7 +320,7 @@ impl MemFdSlot { /// address space when dropped. This should be used only when the /// caller will clear or reuse the address space in some other /// way. - pub(crate) unsafe fn no_clear_on_drop(&mut self) { + pub(crate) fn no_clear_on_drop(&mut self) { self.clear_on_drop = false; } @@ -384,8 +393,22 @@ impl MemFdSlot { // zeroes). Anonymous zero pages are fast: the kernel // pre-zeroes them, and even if it runs out of those, a memset // is half as expensive as a memcpy (only writes, no reads). - self.map_anon_memory(rustix::io::ProtFlags::READ | rustix::io::ProtFlags::WRITE) - .map_err(|e| InstantiationError::Resource(e.into()))?; + // + // We map these inaccessible at first then mprotect() the + // whole of the initial heap size to R+W below. 
+ // + // Special case: we can skip if the last instantiation had no + // image. This means that the whole slot is filled with an + // anonymous mmap backing (and it will have already been + // cleared by the madvise). This also lets us skip an mmap the + // first time a MemFdSlot is used, because we require the + // caller to give us a fixed address in an + // already-mmaped-with-anon-memory region. This is important + // for the on-demand allocator. + if self.image.is_some() { + self.map_anon_memory(rustix::io::ProtFlags::empty()) + .map_err(|e| InstantiationError::Resource(e.into()))?; + } // The initial memory image, if given. If not, we just get a // memory filled with zeroes. @@ -410,11 +433,15 @@ impl MemFdSlot { } } - // mprotect above `initial_size_bytes`. + // mprotect the initial `initial_size_bytes` to be accessible. self.initial_size = initial_size_bytes; self.cur_size = initial_size_bytes; - self.protect_past_initial_size() - .map_err(|e| InstantiationError::Resource(e.into()))?; + self.set_protection( + 0, + initial_size_bytes, + rustix::io::MprotectFlags::READ | rustix::io::MprotectFlags::WRITE, + ) + .map_err(|e| InstantiationError::Resource(e.into()))?; self.dirty = true; Ok(()) @@ -433,22 +460,27 @@ impl MemFdSlot { )?; } - // mprotect the region beyond the initial heap size back to PROT_NONE. - self.protect_past_initial_size()?; + // mprotect the initial heap region beyond the initial heap size back to PROT_NONE. + self.set_protection( + self.initial_size, + self.static_size - self.initial_size, + rustix::io::MprotectFlags::empty(), + )?; self.dirty = false; Ok(()) } - fn protect_past_initial_size(&self) -> Result<()> { - let mprotect_start = self.base + self.initial_size; - let mprotect_len = self.static_size - self.initial_size; - if mprotect_len > 0 { + fn set_protection( + &self, + start: usize, + len: usize, + flags: rustix::io::MprotectFlags, + ) -> Result<()> { + assert!(start.checked_add(len).unwrap() <= self.static_size); + let mprotect_start = self.base.checked_add(start).unwrap(); + if len > 0 { unsafe { - rustix::io::mprotect( - mprotect_start as *mut _, - mprotect_len, - rustix::io::MprotectFlags::empty(), - )?; + rustix::io::mprotect(mprotect_start as *mut _, len, flags)?; } } diff --git a/crates/runtime/src/memfd_disabled.rs b/crates/runtime/src/memfd_disabled.rs index a0adf10d2d..6aee1a38df 100644 --- a/crates/runtime/src/memfd_disabled.rs +++ b/crates/runtime/src/memfd_disabled.rs @@ -54,7 +54,7 @@ impl MemFdSlot { panic!("instantiate() on invalid MemFdSlot"); } - pub(crate) unsafe fn no_clear_on_drop(&mut self) {} + pub(crate) fn no_clear_on_drop(&mut self) {} pub(crate) fn clear_and_remain_ready(&mut self) -> Result<()> { Ok(()) diff --git a/crates/runtime/src/memory.rs b/crates/runtime/src/memory.rs index 99447ed82b..e36c521846 100644 --- a/crates/runtime/src/memory.rs +++ b/crates/runtime/src/memory.rs @@ -162,13 +162,11 @@ impl MmapMemory { let len = request_bytes - pre_guard_bytes; let mut memfd_slot = MemFdSlot::create(base as *mut _, len); memfd_slot.instantiate(minimum, Some(image))?; - unsafe { - // On drop, we will unmap our mmap'd range that - // this memfd_slot was mapped on top of, so there - // is no need for the memfd_slot to wipe it with - // an anonymous mapping first. - memfd_slot.no_clear_on_drop(); - } + // On drop, we will unmap our mmap'd range that this + // memfd_slot was mapped on top of, so there is no + // need for the memfd_slot to wipe it with an + // anonymous mapping first. 
+ memfd_slot.no_clear_on_drop(); Some(memfd_slot) } None => None, diff --git a/crates/wasmtime/Cargo.toml b/crates/wasmtime/Cargo.toml index 3f2930b872..00cb081dec 100644 --- a/crates/wasmtime/Cargo.toml +++ b/crates/wasmtime/Cargo.toml @@ -90,4 +90,4 @@ all-arch = ["wasmtime-cranelift/all-arch"] # need portable signal handling. posix-signals-on-macos = ["wasmtime-runtime/posix-signals-on-macos"] -memfd = ["wasmtime-runtime/memfd", "pooling-allocator"] +memfd = ["wasmtime-runtime/memfd"] From 84a8368e88ceee972348f5d4dfe0b4b1f81beece Mon Sep 17 00:00:00 2001 From: Chris Fallin Date: Tue, 1 Feb 2022 16:34:06 -0800 Subject: [PATCH 08/12] Fix to the optimization: mprotect(NONE) sometimes needed after skipping the initial mmap. --- crates/runtime/src/memfd.rs | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/crates/runtime/src/memfd.rs b/crates/runtime/src/memfd.rs index da335a53f4..e94d17fe22 100644 --- a/crates/runtime/src/memfd.rs +++ b/crates/runtime/src/memfd.rs @@ -292,6 +292,12 @@ pub struct MemFdSlot { /// instantiation). Set by `instantiate()` and cleared by /// `clear_and_remain_ready()`, and used in assertions to ensure /// those methods are called properly. + /// + /// Invariant: if !dirty, then this memory slot contains a clean + /// CoW mapping of `image`, if `Some(..)`, and anonymous-zero + /// memory beyond the image up to `static_size`. The addresses + /// from offset 0 to `initial_size` are accessible R+W and the + /// rest of the slot is inaccessible. dirty: bool, /// Whether this MemFdSlot is responsible for mapping anonymous /// memory (to hold the reservation while overwriting mappings @@ -400,14 +406,23 @@ impl MemFdSlot { // Special case: we can skip if the last instantiation had no // image. This means that the whole slot is filled with an // anonymous mmap backing (and it will have already been - // cleared by the madvise). This also lets us skip an mmap the - // first time a MemFdSlot is used, because we require the - // caller to give us a fixed address in an + // cleared by the madvise). We may however need to + // mprotect(NONE) the space above `initial_size_bytes` if the + // last use of this slot left it larger. This also lets us + // skip an mmap the first time a MemFdSlot is used, because we + // require the caller to give us a fixed address in an // already-mmaped-with-anon-memory region. This is important // for the on-demand allocator. if self.image.is_some() { self.map_anon_memory(rustix::io::ProtFlags::empty()) .map_err(|e| InstantiationError::Resource(e.into()))?; + } else if initial_size_bytes < self.initial_size { + self.set_protection( + initial_size_bytes, + self.initial_size, + rustix::io::MprotectFlags::empty(), + ) + .map_err(|e| InstantiationError::Resource(e.into()))?; } // The initial memory image, if given. If not, we just get a From 94410a8d4bed9423bfc09de9abc5b0401f4d0bef Mon Sep 17 00:00:00 2001 From: Chris Fallin Date: Wed, 2 Feb 2022 10:03:31 -0800 Subject: [PATCH 09/12] Review comments. --- crates/runtime/src/memfd.rs | 45 ++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 18 deletions(-) diff --git a/crates/runtime/src/memfd.rs b/crates/runtime/src/memfd.rs index e94d17fe22..9b8c25a698 100644 --- a/crates/runtime/src/memfd.rs +++ b/crates/runtime/src/memfd.rs @@ -402,21 +402,30 @@ impl MemFdSlot { // // We map these inaccessible at first then mprotect() the // whole of the initial heap size to R+W below. 
- // - // Special case: we can skip if the last instantiation had no - // image. This means that the whole slot is filled with an - // anonymous mmap backing (and it will have already been - // cleared by the madvise). We may however need to - // mprotect(NONE) the space above `initial_size_bytes` if the - // last use of this slot left it larger. This also lets us - // skip an mmap the first time a MemFdSlot is used, because we - // require the caller to give us a fixed address in an - // already-mmaped-with-anon-memory region. This is important - // for the on-demand allocator. if self.image.is_some() { - self.map_anon_memory(rustix::io::ProtFlags::empty()) + self.reset_with_anon_memory() .map_err(|e| InstantiationError::Resource(e.into()))?; } else if initial_size_bytes < self.initial_size { + // Special case: we can skip if the last instantiation had + // no image. This means that the whole slot is filled with + // an anonymous mmap backing (and it will have already + // been cleared by the madvise). We may however need to + // mprotect(NONE) the space above `initial_size_bytes` if + // the last use of this slot left it larger. This also + // lets us skip an mmap the first time a MemFdSlot is + // used, because we require the caller to give us a fixed + // address in an already-mmaped-with-anon-memory + // region. This is important for the on-demand allocator. + // + // So we come in with: + // - anon-zero memory, R+W, [0, self.initial_size) + // - anon-zero memory, none, [self.initial_size, self.static_size) + // and we want: + // - anon-zero memory, R+W, [0, initial_size_bytes) + // - anon-zero memory, none, [initial_size_bytes, self.static_size) + // + // so given initial_size_bytes < self.initial_size we + // mprotect(NONE) the zone from the first to the second. self.set_protection( initial_size_bytes, self.initial_size, @@ -428,6 +437,7 @@ impl MemFdSlot { // The initial memory image, if given. If not, we just get a // memory filled with zeroes. if let Some(image) = maybe_image { + assert!(image.offset.checked_add(image.len).unwrap() <= initial_size_bytes); if image.len > 0 { let image = image.clone(); @@ -510,15 +520,14 @@ impl MemFdSlot { self.dirty } - /// Map anonymous zeroed memory across the whole slot, with the - /// given protections. Used both during instantiate and during - /// drop. - fn map_anon_memory(&self, prot: rustix::io::ProtFlags) -> Result<()> { + /// Map anonymous zeroed memory across the whole slot, + /// inaccessible. Used both during instantiate and during drop. + fn reset_with_anon_memory(&self) -> Result<()> { unsafe { let ptr = rustix::io::mmap_anonymous( self.base as *mut c_void, self.static_size, - prot, + rustix::io::ProtFlags::empty(), rustix::io::MapFlags::PRIVATE | rustix::io::MapFlags::FIXED, )?; assert_eq!(ptr as usize, self.base); @@ -560,7 +569,7 @@ impl Drop for MemFdSlot { // this MemFdSlot has indicated that it will clean up in some // other way. if self.clear_on_drop { - let _ = self.map_anon_memory(rustix::io::ProtFlags::empty()); + let _ = self.reset_with_anon_memory(); } } } From 0ec45d3ae439b9ce8aa314a50b0b19480ef914b7 Mon Sep 17 00:00:00 2001 From: Chris Fallin Date: Wed, 2 Feb 2022 11:28:47 -0800 Subject: [PATCH 10/12] Add additional tests for MemFdSlot. 
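Before the test diff: the new tests exercise this behavior through MemFdSlot, but the underlying kernel property can be shown on its own. The following standalone sketch (not part of the patch; it assumes Linux and reuses the `memfd`, `rustix`, `region`, and `anyhow` crates this series already depends on) demonstrates that dirty CoW pages written over a MAP_PRIVATE memfd mapping are discarded by madvise(DONTNEED), snapping the mapping back to the original image, which is exactly the reset that `clear_and_remain_ready` relies on.

use memfd::MemfdOptions;
use rustix::fs::FileExt;

fn main() -> anyhow::Result<()> {
    let page = region::page::size();

    // Build a one-page "image" in an anonymous, in-memory file.
    let memfd = MemfdOptions::new().create("cow-demo")?;
    memfd.as_file().set_len(page as u64)?;
    memfd.as_file().write_at(&[1, 2, 3, 4], 0)?;

    // Map it privately: writes go to CoW copies, not back to the file.
    let ptr = unsafe {
        rustix::io::mmap(
            std::ptr::null_mut(),
            page,
            rustix::io::ProtFlags::READ | rustix::io::ProtFlags::WRITE,
            rustix::io::MapFlags::PRIVATE,
            memfd.as_file(),
            0,
        )?
    };
    let mem = unsafe { std::slice::from_raw_parts_mut(ptr as *mut u8, page) };

    assert_eq!(&[1, 2, 3, 4], &mem[0..4]); // image visible through the mapping
    mem[0] = 42; // dirty the page: the kernel gives us a private CoW copy

    // Discard the CoW copy; the next access faults the original page back in.
    unsafe { rustix::io::madvise(ptr, page, rustix::io::Advice::LinuxDontNeed)? };
    assert_eq!(&[1, 2, 3, 4], &mem[0..4]); // back to the pristine image

    Ok(())
}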
--- crates/runtime/src/memfd.rs | 131 ++++++++++++++++++++++++++++++++---- 1 file changed, 118 insertions(+), 13 deletions(-) diff --git a/crates/runtime/src/memfd.rs b/crates/runtime/src/memfd.rs index 9b8c25a698..8ccea69072 100644 --- a/crates/runtime/src/memfd.rs +++ b/crates/runtime/src/memfd.rs @@ -81,6 +81,16 @@ fn unsupported_initializer(segment: &MemoryInitializer, plan: &MemoryPlan) -> bo false } +fn create_memfd() -> Result { + // Create the memfd. It needs a name, but the + // documentation for `memfd_create()` says that names can + // be duplicated with no issues. + MemfdOptions::new() + .allow_sealing(true) + .create("wasm-memory-image") + .map_err(|e| e.into()) +} + impl ModuleMemFds { /// Create a new `ModuleMemFds` for the given module. This can be /// passed in as part of a `InstanceAllocationRequest` to speed up @@ -103,15 +113,6 @@ impl ModuleMemFds { excluded_memories.push(false); } - fn create_memfd() -> Result { - // Create the memfd. It needs a name, but the - // documentation for `memfd_create()` says that names can - // be duplicated with no issues. - MemfdOptions::new() - .allow_sealing(true) - .create("wasm-memory-image") - .map_err(|e| e.into()) - } let round_up_page = |len: u64| (len + page_size - 1) & !(page_size - 1); match &module.memory_initialization { @@ -439,8 +440,6 @@ impl MemFdSlot { if let Some(image) = maybe_image { assert!(image.offset.checked_add(image.len).unwrap() <= initial_size_bytes); if image.len > 0 { - let image = image.clone(); - unsafe { let ptr = rustix::io::mmap( (self.base + image.offset) as *mut c_void, @@ -453,11 +452,11 @@ impl MemFdSlot { .map_err(|e| InstantiationError::Resource(e.into()))?; assert_eq!(ptr as usize, self.base + image.offset); } - - self.image = Some(image); } } + self.image = maybe_image.cloned(); + // mprotect the initial `initial_size_bytes` to be accessible. self.initial_size = initial_size_bytes; self.cur_size = initial_size_bytes; @@ -573,3 +572,109 @@ impl Drop for MemFdSlot { } } } + +#[cfg(test)] +mod test { + use std::sync::Arc; + + use super::create_memfd; + use super::MemFdSlot; + use super::MemoryMemFd; + use crate::mmap::Mmap; + use anyhow::Result; + use rustix::fs::FileExt; + + fn create_memfd_with_data(offset: usize, data: &[u8]) -> Result { + let page_size = region::page::size(); + let memfd = create_memfd()?; + // Offset and length have to be page-aligned. + assert_eq!(offset & (page_size - 1), 0); + let image_len = offset + data.len(); + let image_len = (image_len + page_size - 1) & !(page_size - 1); + memfd.as_file().set_len(image_len as u64)?; + memfd.as_file().write_at(data, offset as u64)?; + Ok(MemoryMemFd { + fd: memfd, + len: image_len, + offset, + }) + } + + #[test] + fn instantiate_no_image() { + // 4 MiB mmap'd area, not accessible + let mut mmap = Mmap::accessible_reserved(0, 4 << 20).unwrap(); + // Create a MemFdSlot on top of it + let mut memfd = MemFdSlot::create(mmap.as_mut_ptr() as *mut _, 4 << 20); + memfd.no_clear_on_drop(); + assert!(!memfd.is_dirty()); + // instantiate with 64 KiB initial size + memfd.instantiate(64 << 10, None).unwrap(); + assert!(memfd.is_dirty()); + // We should be able to access this 64 KiB (try both ends) and + // it should consist of zeroes. 
+ let slice = mmap.as_mut_slice(); + assert_eq!(0, slice[0]); + assert_eq!(0, slice[65535]); + slice[1024] = 42; + assert_eq!(42, slice[1024]); + // grow the heap + memfd.set_heap_limit(128 << 10).unwrap(); + let slice = mmap.as_slice(); + assert_eq!(42, slice[1024]); + assert_eq!(0, slice[131071]); + // instantiate again; we should see zeroes, even as the + // reuse-anon-mmap-opt kicks in + memfd.clear_and_remain_ready().unwrap(); + assert!(!memfd.is_dirty()); + memfd.instantiate(64 << 10, None).unwrap(); + let slice = mmap.as_slice(); + assert_eq!(0, slice[1024]); + } + + #[test] + fn instantiate_image() { + // 4 MiB mmap'd area, not accessible + let mut mmap = Mmap::accessible_reserved(0, 4 << 20).unwrap(); + // Create a MemFdSlot on top of it + let mut memfd = MemFdSlot::create(mmap.as_mut_ptr() as *mut _, 4 << 20); + memfd.no_clear_on_drop(); + // Create an image with some data. + let image = Arc::new(create_memfd_with_data(4096, &[1, 2, 3, 4]).unwrap()); + // Instantiate with this image + memfd.instantiate(64 << 10, Some(&image)).unwrap(); + assert!(memfd.has_image()); + let slice = mmap.as_mut_slice(); + assert_eq!(&[1, 2, 3, 4], &slice[4096..4100]); + slice[4096] = 5; + // Clear and re-instantiate same image + memfd.clear_and_remain_ready().unwrap(); + memfd.instantiate(64 << 10, Some(&image)).unwrap(); + let slice = mmap.as_slice(); + // Should not see mutation from above + assert_eq!(&[1, 2, 3, 4], &slice[4096..4100]); + // Clear and re-instantiate no image + memfd.clear_and_remain_ready().unwrap(); + memfd.instantiate(64 << 10, None).unwrap(); + assert!(!memfd.has_image()); + let slice = mmap.as_slice(); + assert_eq!(&[0, 0, 0, 0], &slice[4096..4100]); + // Clear and re-instantiate image again + memfd.clear_and_remain_ready().unwrap(); + memfd.instantiate(64 << 10, Some(&image)).unwrap(); + let slice = mmap.as_slice(); + assert_eq!(&[1, 2, 3, 4], &slice[4096..4100]); + // Create another image with different data. + let image2 = Arc::new(create_memfd_with_data(4096, &[10, 11, 12, 13]).unwrap()); + memfd.clear_and_remain_ready().unwrap(); + memfd.instantiate(128 << 10, Some(&image2)).unwrap(); + let slice = mmap.as_slice(); + assert_eq!(&[10, 11, 12, 13], &slice[4096..4100]); + // Instantiate the original image again; we should notice it's + // a different image and not reuse the mappings. + memfd.clear_and_remain_ready().unwrap(); + memfd.instantiate(64 << 10, Some(&image)).unwrap(); + let slice = mmap.as_slice(); + assert_eq!(&[1, 2, 3, 4], &slice[4096..4100]); + } +} From d7b04f5ced25a1adf31aadfb919f7d15d672cc83 Mon Sep 17 00:00:00 2001 From: Chris Fallin Date: Wed, 2 Feb 2022 11:41:31 -0800 Subject: [PATCH 11/12] Review comments. --- crates/runtime/src/memfd.rs | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/crates/runtime/src/memfd.rs b/crates/runtime/src/memfd.rs index 8ccea69072..16d41d1fb8 100644 --- a/crates/runtime/src/memfd.rs +++ b/crates/runtime/src/memfd.rs @@ -7,8 +7,8 @@ use libc::c_void; use memfd::{Memfd, MemfdOptions}; use rustix::fd::AsRawFd; use rustix::fs::FileExt; -use std::convert::TryFrom; use std::sync::Arc; +use std::{convert::TryFrom, ops::Range}; use wasmtime_environ::{ DefinedMemoryIndex, MemoryInitialization, MemoryInitializer, MemoryPlan, Module, PrimaryMap, }; @@ -428,8 +428,7 @@ impl MemFdSlot { // so given initial_size_bytes < self.initial_size we // mprotect(NONE) the zone from the first to the second. 
self.set_protection( - initial_size_bytes, - self.initial_size, + initial_size_bytes..self.initial_size, rustix::io::MprotectFlags::empty(), ) .map_err(|e| InstantiationError::Resource(e.into()))?; @@ -461,8 +460,7 @@ impl MemFdSlot { self.initial_size = initial_size_bytes; self.cur_size = initial_size_bytes; self.set_protection( - 0, - initial_size_bytes, + 0..initial_size_bytes, rustix::io::MprotectFlags::READ | rustix::io::MprotectFlags::WRITE, ) .map_err(|e| InstantiationError::Resource(e.into()))?; @@ -486,25 +484,19 @@ impl MemFdSlot { // mprotect the initial heap region beyond the initial heap size back to PROT_NONE. self.set_protection( - self.initial_size, - self.static_size - self.initial_size, + self.initial_size..self.static_size, rustix::io::MprotectFlags::empty(), )?; self.dirty = false; Ok(()) } - fn set_protection( - &self, - start: usize, - len: usize, - flags: rustix::io::MprotectFlags, - ) -> Result<()> { - assert!(start.checked_add(len).unwrap() <= self.static_size); - let mprotect_start = self.base.checked_add(start).unwrap(); - if len > 0 { + fn set_protection(&self, range: Range, flags: rustix::io::MprotectFlags) -> Result<()> { + assert!(range.end <= self.static_size); + let mprotect_start = self.base.checked_add(range.start).unwrap(); + if range.len() > 0 { unsafe { - rustix::io::mprotect(mprotect_start as *mut _, len, flags)?; + rustix::io::mprotect(mprotect_start as *mut _, range.len(), flags)?; } } From 9880eba2a8514da3cfd1daa45949ba9b257f9eb6 Mon Sep 17 00:00:00 2001 From: Chris Fallin Date: Wed, 2 Feb 2022 12:25:20 -0800 Subject: [PATCH 12/12] Skip memfd tests when on qemu, due to differing madvise semantics. --- crates/runtime/src/memfd.rs | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/crates/runtime/src/memfd.rs b/crates/runtime/src/memfd.rs index 16d41d1fb8..cf26e9b765 100644 --- a/crates/runtime/src/memfd.rs +++ b/crates/runtime/src/memfd.rs @@ -594,6 +594,10 @@ mod test { #[test] fn instantiate_no_image() { + if skip_tests_due_to_qemu_madvise_semantics() { + return; + } + // 4 MiB mmap'd area, not accessible let mut mmap = Mmap::accessible_reserved(0, 4 << 20).unwrap(); // Create a MemFdSlot on top of it @@ -626,6 +630,10 @@ mod test { #[test] fn instantiate_image() { + if skip_tests_due_to_qemu_madvise_semantics() { + return; + } + // 4 MiB mmap'd area, not accessible let mut mmap = Mmap::accessible_reserved(0, 4 << 20).unwrap(); // Create a MemFdSlot on top of it @@ -669,4 +677,19 @@ mod test { let slice = mmap.as_slice(); assert_eq!(&[1, 2, 3, 4], &slice[4096..4100]); } + + /// qemu's madvise implementation does not implement the + /// "flash-reset back to zero or CoW backing" semantics that Linux + /// does. Our CI setup uses qemu (in usermode-binary mode, not + /// whole-system mode) to run tests on aarch64 and s390x. We want + /// to skip these tests when under qemu, but not when someone is + /// developing natively on one of these architectures. So instead, + /// we dynamically detect an environment variable that our CI + /// setup sets. + /// + /// See `skip_pooling_allocator_tests()` in `tests/all/main.rs` + /// for more. + fn skip_tests_due_to_qemu_madvise_semantics() -> bool { + std::env::var("WASMTIME_TEST_NO_HOG_MEMORY").is_ok() + } }
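Finally, a note on what all of this buys at the embedding level. The sketch below is not part of the patch series, and the pooling-allocator configuration types it names (`InstanceAllocationStrategy::Pooling`, `PoolingAllocationStrategy`, `ModuleLimits`, `InstanceLimits`) are assumptions about the 0.33-era wasmtime API rather than anything shown above; it is only meant to illustrate the instantiate-and-drop loop whose fast path these commits reduce to madvise and mprotect calls on a reused slot.

use wasmtime::*;

fn main() -> anyhow::Result<()> {
    let mut config = Config::new();
    // Assumed 0.33-era pooling configuration; exact fields may differ.
    config.allocation_strategy(InstanceAllocationStrategy::Pooling {
        strategy: PoolingAllocationStrategy::NextAvailable,
        module_limits: ModuleLimits::default(),
        instance_limits: InstanceLimits::default(),
    });
    let engine = Engine::new(&config)?;
    let module = Module::new(&engine, r#"(module (memory 1) (data (i32.const 0) "hi"))"#)?;

    // Each iteration re-instantiates the same module; with memfd enabled the
    // memory slot is reset in place rather than re-mapped and re-initialized.
    for _ in 0..1_000 {
        let mut store = Store::new(&engine, ());
        let _instance = Instance::new(&mut store, &module, &[])?;
    }
    Ok(())
}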