diff --git a/crates/runtime/src/instance/allocator/memfd.rs b/crates/runtime/src/instance/allocator/memfd.rs
index 8713794824..67741f8bbd 100644
--- a/crates/runtime/src/instance/allocator/memfd.rs
+++ b/crates/runtime/src/instance/allocator/memfd.rs
@@ -5,8 +5,6 @@ use crate::InstantiationError;
 use anyhow::Result;
 use libc::c_void;
 use rustix::fd::AsRawFd;
-use std::convert::TryFrom;
-use std::fs::File;
 use std::sync::Arc;
 
 /// A single slot handled by the memfd instance-heap mechanism.
@@ -16,8 +14,7 @@ use std::sync::Arc;
 /// base ==> (points here)
 /// - (image.offset bytes) anonymous zero memory, pre-image
 /// - (image.len bytes) CoW mapping of memfd heap image
-/// - (up to extension_offset) anonymous zero memory, post-image
-/// - (up to static_size) heap expansion region; CoW mapping of per-slot memfd
+/// - (up to static_size) anonymous zero memory, post-image
 ///
 /// The ordering of mmaps to set this up is:
 ///
@@ -25,15 +22,15 @@ use std::sync::Arc;
 /// - one large mmap to create 8GiB * instances * memories slots
 ///
 /// - per instantiation of new image in a slot:
-///   - mmap of anonymous zero memory, from 0 to initial heap size
+///   - mmap of anonymous zero memory, from 0 to max heap size
+///     (static_size)
 ///   - mmap of CoW'd memfd image, from `image.offset` to
 ///     `image.offset + image.len`. This overwrites part of the
 ///     anonymous zero memory, potentially splitting it into a pre-
 ///     and post-region.
-///   - mmap of CoW'd extension file, past the initial heap size up to
-///     the end of the max memory size (just before the
-///     post-guard). This is always adjacent to the above mmaps, but
-///     does not overlap/overwrite them.
+///   - mprotect(PROT_NONE) on the part of the heap beyond the initial
+///     heap size; we re-mprotect it with R+W bits when the heap is
+///     grown.
 #[derive(Debug)]
 pub struct MemFdSlot {
     /// The base of the actual heap memory. Bytes at this address are
@@ -44,21 +41,11 @@ pub struct MemFdSlot {
     /// The memfd image that backs this memory. May be `None`, in
     /// which case the memory is all zeroes.
     pub(crate) image: Option<Arc<MemoryMemFd>>,
-    /// The offset at which the "extension file", which is used to
-    /// allow for efficient heap growth, is mapped. This is always
-    /// immediately after the end of the initial memory size.
-    extension_offset: usize,
-    /// The anonymous memfd, owned by this slot, which we mmap in the
-    /// area where the heap may grow during runtime. We use the
-    /// ftruncate() syscall (invoked via `File::set_len()`) to set its
-    /// size. We never write any data to it -- we CoW-map it so we can
-    /// throw away dirty data on termination. Instead, we just use its
-    /// size as a "watermark" that delineates the boundary between
-    /// safe-to-access memory and SIGBUS-causing memory. (This works
-    /// because one can mmap a file beyond its end, and is good
-    /// because ftruncate does not take the process-wide lock that
-    /// mmap and mprotect do.)
-    extension_file: File,
+    /// The initial heap size.
+    initial_size: usize,
+    /// The current heap size. All memory above `base + cur_size`
+    /// should be PROT_NONE (mapped inaccessible).
+    cur_size: usize,
     /// Whether this slot may have "dirty" pages (pages written by an
     /// instantiation). Set by `instantiate()` and cleared by
     /// `clear_and_remain_ready()`, and used in assertions to ensure
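The new scheme replaces the ftruncate-watermark design with plain virtual-memory protection: one anonymous R+W mapping covers the whole slot once, and everything past the initial heap size is fenced off with mprotect(PROT_NONE). Below is a minimal self-contained sketch of that layout; it uses raw libc calls rather than the rustix wrappers in this patch, and the sizes are illustrative stand-ins, not wasmtime constants.

```rust
// Sketch only: map a whole slot R+W, then fence the tail with
// PROT_NONE, mirroring what instantiate() does in the diff above.
use std::io::Error;

const SLOT_SIZE: usize = 1 << 20; // stand-in for `static_size`
const INITIAL_SIZE: usize = 64 * 1024; // stand-in for `initial_size`

fn main() -> Result<(), Error> {
    unsafe {
        // One anonymous R+W mapping covering the whole slot.
        let base = libc::mmap(
            std::ptr::null_mut(),
            SLOT_SIZE,
            libc::PROT_READ | libc::PROT_WRITE,
            libc::MAP_PRIVATE | libc::MAP_ANONYMOUS,
            -1,
            0,
        );
        if base == libc::MAP_FAILED {
            return Err(Error::last_os_error());
        }

        // Everything past the initial heap size becomes inaccessible,
        // mirroring protect_past_initial_size().
        if libc::mprotect(
            (base as usize + INITIAL_SIZE) as *mut libc::c_void,
            SLOT_SIZE - INITIAL_SIZE,
            libc::PROT_NONE,
        ) != 0
        {
            return Err(Error::last_os_error());
        }

        // The first INITIAL_SIZE bytes are usable; touching the tail
        // would fault until it is re-protected R+W.
        *(base as *mut u8) = 42;
        assert_eq!(*(base as *const u8), 42);

        libc::munmap(base, SLOT_SIZE);
    }
    Ok(())
}
```

The trade-off is the one the deleted field docs call out: mprotect() takes the process-wide mmap lock that the old ftruncate() watermark avoided, in exchange for a much simpler slot with no extension memfd to create, seal, and truncate.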
@@ -67,53 +54,31 @@
 }
 
 impl MemFdSlot {
-    pub(crate) fn create(
-        base_addr: *mut c_void,
-        static_size: usize,
-    ) -> Result<Self, InstantiationError> {
+    pub(crate) fn create(base_addr: *mut c_void, static_size: usize) -> Self {
         let base = base_addr as usize;
-
-        // Create a MemFD for the memory growth first -- this covers
-        // extended heap beyond the initial image.
-        let extension_memfd = memfd::MemfdOptions::new()
-            .allow_sealing(true)
-            .create("wasm-anonymous-heap")
-            .map_err(|e| InstantiationError::Resource(e.into()))?;
-        // Seal the ability to write the extension file (make it
-        // permanently read-only). This is a defense-in-depth
-        // mitigation to make extra-sure that we don't leak
-        // information between instantiations. See note in `memfd.rs`
-        // for more about why we use seals.
-        extension_memfd
-            .add_seal(memfd::FileSeal::SealWrite)
-            .map_err(|e| InstantiationError::Resource(e.into()))?;
-        extension_memfd
-            .add_seal(memfd::FileSeal::SealSeal)
-            .map_err(|e| InstantiationError::Resource(e.into()))?;
-        let extension_file = extension_memfd.into_file();
-        extension_file
-            .set_len(0)
-            .map_err(|e| InstantiationError::Resource(e.into()))?;
-
-        Ok(MemFdSlot {
+        MemFdSlot {
             base,
             static_size,
+            initial_size: 0,
+            cur_size: 0,
             image: None,
-            extension_file,
-            extension_offset: 0,
             dirty: false,
-        })
+        }
     }
 
     pub(crate) fn set_heap_limit(&mut self, size_bytes: usize) -> Result<()> {
-        assert!(size_bytes >= self.extension_offset);
-        // This is all that is needed to make the new memory
-        // accessible; we don't need to mprotect anything. (The
-        // mapping itself is always R+W for the max possible heap
-        // size, and only the anonymous-backing file length catches
-        // out-of-bounds accesses.)
-        self.extension_file
-            .set_len(u64::try_from(size_bytes - self.extension_offset).unwrap())?;
+        assert!(size_bytes > self.cur_size);
+        // mprotect the relevant region.
+        let start = self.base + self.cur_size;
+        let len = size_bytes - self.cur_size;
+        unsafe {
+            rustix::io::mprotect(
+                start as *mut _,
+                len,
+                rustix::io::MprotectFlags::READ | rustix::io::MprotectFlags::WRITE,
+            )?;
+        }
+
         Ok(())
     }
 
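Growing the heap is now a single mprotect() on the already-reserved tail instead of an ftruncate() on a side file. One detail worth flagging: `set_heap_limit()` above asserts against `cur_size` but never advances it, so the invariant documented on the field only holds if that bookkeeping is added. A hedged sketch of the grow path with the update included follows; `Slot` and `grow` are illustrative names, not wasmtime's API.

```rust
use std::io::{Error, Result};

struct Slot {
    base: usize,
    static_size: usize,
    cur_size: usize,
}

impl Slot {
    /// Make `[base + cur_size, base + size_bytes)` readable and writable.
    fn grow(&mut self, size_bytes: usize) -> Result<()> {
        assert!(size_bytes > self.cur_size && size_bytes <= self.static_size);
        let rc = unsafe {
            libc::mprotect(
                (self.base + self.cur_size) as *mut libc::c_void,
                size_bytes - self.cur_size,
                libc::PROT_READ | libc::PROT_WRITE,
            )
        };
        if rc != 0 {
            return Err(Error::last_os_error());
        }
        // The bookkeeping the diff omits: without this, the assert
        // above compares against a stale size forever.
        self.cur_size = size_bytes;
        Ok(())
    }
}

fn main() -> Result<()> {
    const STATIC_SIZE: usize = 1 << 20;
    // Reserve the whole slot inaccessible, then grow it in two steps.
    let base = unsafe {
        libc::mmap(
            std::ptr::null_mut(),
            STATIC_SIZE,
            libc::PROT_NONE,
            libc::MAP_PRIVATE | libc::MAP_ANONYMOUS,
            -1,
            0,
        )
    };
    assert_ne!(base, libc::MAP_FAILED);
    let mut slot = Slot {
        base: base as usize,
        static_size: STATIC_SIZE,
        cur_size: 0,
    };
    slot.grow(64 * 1024)?;
    slot.grow(128 * 1024)?;
    unsafe { *(slot.base as *mut u8) = 1 }; // now accessible
    Ok(())
}
```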
@@ -124,31 +89,36 @@
     ) -> Result<(), InstantiationError> {
         assert!(!self.dirty);
 
-        if let Some(existing_image) = &self.image {
-            // Fast-path: previously instantiated with the same image,
-            // so the mappings are already correct; there is no need
-            // to mmap anything. Given that we asserted not-dirty
-            // above, any dirty pages will have already been thrown
-            // away by madvise() during the previous termination.
-            if let Some(image) = maybe_image {
-                if existing_image.fd.as_file().as_raw_fd() == image.fd.as_file().as_raw_fd() {
-                    self.dirty = true;
-                    return Ok(());
-                }
-            }
-        }
+        // Fast-path: previously instantiated with the same image, or
+        // no image but the same initial size, so the mappings are
+        // already correct; there is no need to mmap anything. Given
+        // that we asserted not-dirty above, any dirty pages will have
+        // already been thrown away by madvise() during the previous
+        // termination. The `clear_and_remain_ready()` path also
+        // mprotects memory above the initial heap size back to
+        // PROT_NONE, so we don't need to do that here.
+        if (self.image.is_none()
+            && maybe_image.is_none()
+            && self.initial_size == initial_size_bytes)
+            || (self.image.is_some()
+                && maybe_image.is_some()
+                && self.image.as_ref().unwrap().fd.as_file().as_raw_fd()
+                    == maybe_image.as_ref().unwrap().fd.as_file().as_raw_fd())
+        {
+            self.dirty = true;
+            return Ok(());
+        }
 
         // Otherwise, we need to redo (i) the anonymous-mmap backing
-        // for the initial heap size, (ii) the extension-file backing,
-        // and (iii) the initial-heap-image mapping if present.
+        // for the whole slot, (ii) the initial-heap-image mapping if
+        // present, and (iii) the mprotect(PROT_NONE) above the
+        // initial heap size.
 
         // Security/audit note: we map all of these MAP_PRIVATE, so
         // all instance data is local to the mapping, not propagated
         // to the backing fd. We throw away this CoW overlay with
-        // madvise() below, from base up to extension_offset (which is
-        // at least initial_size_bytes, and extended when the
-        // extension file is, so it covers all three mappings) when
-        // terminating the instance.
+        // madvise() below, from base up to static_size (which is the
+        // whole slot) when terminating the instance.
 
         // Anonymous mapping behind the initial heap size: this gives
         // zeroes for any "holes" in the initial heap image. Anonymous
@@ -162,7 +132,7 @@ impl MemFdSlot {
             unsafe {
                 let ptr = rustix::io::mmap_anonymous(
                     self.base as *mut c_void,
-                    initial_size_bytes,
+                    self.static_size,
                     rustix::io::ProtFlags::READ | rustix::io::ProtFlags::WRITE,
                     rustix::io::MapFlags::PRIVATE | rustix::io::MapFlags::FIXED,
                 )
@@ -171,29 +141,8 @@ impl MemFdSlot {
             }
         }
 
-        // An "extension file": this allows us to grow the heap by
-        // doing just an ftruncate(), without changing any
-        // mappings. This is important to avoid the process-wide mmap
-        // lock on Linux.
-        self.extension_offset = initial_size_bytes;
-        let extension_map_len = self.static_size - initial_size_bytes;
-        if extension_map_len > 0 {
-            unsafe {
-                let fd = rustix::fd::BorrowedFd::borrow_raw_fd(self.extension_file.as_raw_fd());
-                let ptr = rustix::io::mmap(
-                    (self.base + initial_size_bytes) as *mut c_void,
-                    extension_map_len,
-                    rustix::io::ProtFlags::READ | rustix::io::ProtFlags::WRITE,
-                    rustix::io::MapFlags::PRIVATE | rustix::io::MapFlags::FIXED,
-                    &fd,
-                    0,
-                )
-                .map_err(|e| InstantiationError::Resource(e.into()))?;
-                assert_eq!(ptr as usize, self.base + initial_size_bytes);
-            }
-        }
-
-        // Finally, the initial memory image.
+        // The initial memory image, if given. If not, we just get a
+        // memory filled with zeroes.
         if let Some(image) = maybe_image {
             if image.len > 0 {
                 let image = image.clone();
@@ -216,31 +165,50 @@
             }
         }
+        // mprotect above `initial_size_bytes`.
+        self.initial_size = initial_size_bytes;
+        self.protect_past_initial_size()
+            .map_err(|e| InstantiationError::Resource(e.into()))?;
+
         self.dirty = true;
         Ok(())
     }
 
     pub(crate) fn clear_and_remain_ready(&mut self) -> Result<()> {
         assert!(self.dirty);
-        // madvise the image range; that's it! This will throw away
-        // dirty pages, which are CoW-private pages on top of the
-        // initial heap image memfd.
+        // madvise the image range. This will throw away dirty pages,
+        // which are CoW-private pages on top of the initial heap
+        // image memfd.
         unsafe {
             rustix::io::madvise(
                 self.base as *mut c_void,
-                self.extension_offset,
+                self.static_size,
                 rustix::io::Advice::LinuxDontNeed,
             )?;
         }
 
-        // truncate the extension file down to zero bytes to reset heap length.
-        self.extension_file
-            .set_len(0)
-            .map_err(|e| InstantiationError::Resource(e.into()))?;
+        // mprotect the region beyond the initial heap size back to PROT_NONE.
+        self.protect_past_initial_size()?;
 
         self.dirty = false;
         Ok(())
     }
 
+    fn protect_past_initial_size(&self) -> Result<()> {
+        let mprotect_start = self.base + self.initial_size;
+        let mprotect_len = self.static_size - self.initial_size;
+        if mprotect_len > 0 {
+            unsafe {
+                rustix::io::mprotect(
+                    mprotect_start as *mut _,
+                    mprotect_len,
+                    rustix::io::MprotectFlags::empty(),
+                )?;
+            }
+        }
+
+        Ok(())
+    }
+
     pub(crate) fn has_image(&self) -> bool {
         self.image.is_some()
    }
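The reset in `clear_and_remain_ready()` leans on a property of MAP_PRIVATE mappings: madvise(MADV_DONTNEED) discards the CoW-private pages, so the next touch sees the pristine backing again (zeroes for anonymous memory, the image bytes for the memfd mapping) without any re-mmap. A small Linux-only demonstration of that behavior with anonymous memory, independent of this patch:

```rust
// Demonstrates why madvise(MADV_DONTNEED) works as the "throw away
// dirty pages" primitive on a private mapping.
use std::io::Error;

fn main() -> Result<(), Error> {
    const LEN: usize = 64 * 1024;
    unsafe {
        let p = libc::mmap(
            std::ptr::null_mut(),
            LEN,
            libc::PROT_READ | libc::PROT_WRITE,
            libc::MAP_PRIVATE | libc::MAP_ANONYMOUS,
            -1,
            0,
        );
        if p == libc::MAP_FAILED {
            return Err(Error::last_os_error());
        }

        // Dirty a page, as an instance would during execution.
        *(p as *mut u8) = 0xAA;
        assert_eq!(*(p as *const u8), 0xAA);

        // "Terminate" the instance: throw the dirty page away.
        if libc::madvise(p, LEN, libc::MADV_DONTNEED) != 0 {
            return Err(Error::last_os_error());
        }

        // The page reads as zero again, with no new mmap call.
        assert_eq!(*(p as *const u8), 0);

        libc::munmap(p, LEN);
    }
    Ok(())
}
```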
diff --git a/crates/runtime/src/instance/allocator/memfd_disabled.rs b/crates/runtime/src/instance/allocator/memfd_disabled.rs
index 9c87591bd5..304dd3eebb 100644
--- a/crates/runtime/src/instance/allocator/memfd_disabled.rs
+++ b/crates/runtime/src/instance/allocator/memfd_disabled.rs
@@ -19,7 +19,7 @@ pub struct MemFdSlot;
 #[cfg(not(feature = "memfd-allocator"))]
 #[allow(dead_code)]
 impl MemFdSlot {
-    pub(crate) fn create(_: *mut libc::c_void, _: usize) -> Result<Self, InstantiationError> {
+    pub(crate) fn create(_: *mut libc::c_void, _: usize) -> Self {
         panic!("create() on invalid MemFdSlot");
     }
 
diff --git a/crates/runtime/src/instance/allocator/pooling.rs b/crates/runtime/src/instance/allocator/pooling.rs
index 6aa291d7a9..fb60ffc4b1 100644
--- a/crates/runtime/src/instance/allocator/pooling.rs
+++ b/crates/runtime/src/instance/allocator/pooling.rs
@@ -529,7 +529,7 @@ impl InstancePool {
         if let Some(memfds) = maybe_memfds {
             let image = memfds.get_memory_image(defined_index);
 
-            let mut slot = memories.take_memfd_slot(instance_idx, memory_index)?;
+            let mut slot = memories.take_memfd_slot(instance_idx, memory_index);
             let initial_size = plan.memory.minimum * WASM_PAGE_SIZE as u64;
 
             // If instantiation fails, we can propagate the error
@@ -745,15 +745,11 @@
 
     /// Take ownership of the given memfd slot. Must be returned via
     /// `return_memfd_slot` when the instance is done using it.
-    fn take_memfd_slot(
-        &self,
-        instance_index: usize,
-        memory_index: MemoryIndex,
-    ) -> Result<MemFdSlot, InstantiationError> {
+    fn take_memfd_slot(&self, instance_index: usize, memory_index: MemoryIndex) -> MemFdSlot {
         let idx = instance_index * self.max_memories
             + (memory_index.as_u32() as usize);
         let maybe_slot = self.memfd_slots[idx].lock().unwrap().take();
-        maybe_slot.map(|slot| Ok(slot)).unwrap_or_else(|| {
+        maybe_slot.unwrap_or_else(|| {
             MemFdSlot::create(
                 self.get_base(instance_index, memory_index) as *mut c_void,
                 self.memory_size,
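On the pooling.rs side, the slot hand-off is a take-or-create pattern over a `Mutex<Option<...>>`: a slot is lazily created on first use and returned to its cell when the instance dies. Because `create()` is now infallible, the `Result` plumbing disappears from `take_memfd_slot()` and its call site. A hedged sketch of that pattern with stand-in types (`SlotPool` and `DummySlot` are not wasmtime names):

```rust
use std::sync::Mutex;

#[derive(Debug)]
struct DummySlot {
    index: usize,
}

struct SlotPool {
    slots: Vec<Mutex<Option<DummySlot>>>,
}

impl SlotPool {
    fn new(n: usize) -> Self {
        SlotPool {
            slots: (0..n).map(|_| Mutex::new(None)).collect(),
        }
    }

    /// Take ownership of slot `idx`, creating it on first use. Note
    /// that creation is infallible here, which is what lets the diff
    /// drop the `Result` from this path.
    fn take(&self, idx: usize) -> DummySlot {
        self.slots[idx]
            .lock()
            .unwrap()
            .take()
            .unwrap_or_else(|| DummySlot { index: idx })
    }

    /// Return a slot so a later instantiation can reuse its mappings.
    fn put_back(&self, idx: usize, slot: DummySlot) {
        *self.slots[idx].lock().unwrap() = Some(slot);
    }
}

fn main() {
    let pool = SlotPool::new(4);
    let slot = pool.take(0);
    println!("got {:?}", slot);
    pool.put_back(0, slot);
}
```

Keeping the slot (rather than unmapping it) is the whole point of the pool: a subsequent instantiation with the same image hits the fast path in `instantiate()` and reuses the existing mappings untouched.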