Remove ftruncate-trick for heap growth with memfd backend.

Testing so far with recent Wasmtime has not shown a need to avoid the
process-wide mmap lock in real-world use cases. As such, the technique
of backing heap growth with an anonymous file that we extend via
ftruncate() seems unnecessary; instead, the memfd backend can always use
anonymous zeroed memory to back the heap wherever the CoW image is not
present, and extend the heap limit with mprotect() by changing page
protections.
Chris Fallin
2022-01-31 11:13:43 -08:00
parent b73ac83c37
commit 3702e81d30
3 changed files with 85 additions and 121 deletions
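Concretely, the new scheme keeps the entire slot mapped as anonymous zeroed memory (with the CoW image overlaid where present), leaves everything beyond the current heap limit PROT_NONE, and grows the heap by flipping page protections. The sketch below illustrates that scheme in isolation, using the same rustix calls that appear in the patch; it is not code from the patch, and the names `HeapSlot`, `reserve`, and `grow` are illustrative only.

use libc::c_void;

/// A stand-alone toy version of the mprotect-based growth scheme.
struct HeapSlot {
    /// Start of the reserved region.
    base: usize,
    /// Maximum (reserved) size of the region.
    static_size: usize,
    /// Currently accessible prefix; everything above it is PROT_NONE.
    cur_size: usize,
}

impl HeapSlot {
    /// Reserve `static_size` bytes of anonymous zeroed memory, none of
    /// it accessible yet.
    fn reserve(static_size: usize) -> anyhow::Result<Self> {
        let base = unsafe {
            rustix::io::mmap_anonymous(
                std::ptr::null_mut(),
                static_size,
                rustix::io::ProtFlags::empty(),
                rustix::io::MapFlags::PRIVATE,
            )?
        };
        Ok(HeapSlot {
            base: base as usize,
            static_size,
            cur_size: 0,
        })
    }

    /// Grow the accessible prefix to `new_size` bytes by changing page
    /// protections; no new mapping is created.
    fn grow(&mut self, new_size: usize) -> anyhow::Result<()> {
        assert!(new_size > self.cur_size && new_size <= self.static_size);
        unsafe {
            rustix::io::mprotect(
                (self.base + self.cur_size) as *mut c_void,
                new_size - self.cur_size,
                rustix::io::MprotectFlags::READ | rustix::io::MprotectFlags::WRITE,
            )?;
        }
        self.cur_size = new_size;
        Ok(())
    }
}

Note that mprotect() does take the process-wide mmap lock that the ftruncate() trick was designed to avoid (as the removed comments below explain); the commit's rationale is that this cost has not been measurable in practice, and dropping the per-slot anonymous memfd simplifies the code.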


@@ -5,8 +5,6 @@ use crate::InstantiationError;
 use anyhow::Result;
 use libc::c_void;
 use rustix::fd::AsRawFd;
-use std::convert::TryFrom;
-use std::fs::File;
 use std::sync::Arc;
 
 /// A single slot handled by the memfd instance-heap mechanism.
@@ -16,8 +14,7 @@ use std::sync::Arc;
 /// base ==> (points here)
 ///  - (image.offset bytes) anonymous zero memory, pre-image
 ///  - (image.len bytes) CoW mapping of memfd heap image
-///  - (up to extension_offset) anonymous zero memory, post-image
-///  - (up to static_size) heap expansion region; CoW mapping of per-slot memfd
+///  - (up to static_size) anonymous zero memory, post-image
 ///
 /// The ordering of mmaps to set this up is:
 ///
@@ -25,15 +22,15 @@ use std::sync::Arc;
 ///  - one large mmap to create 8GiB * instances * memories slots
 ///
 ///  - per instantiation of new image in a slot:
-///    - mmap of anonymous zero memory, from 0 to initial heap size
+///    - mmap of anonymous zero memory, from 0 to max heap size
+///      (static_size)
 ///    - mmap of CoW'd memfd image, from `image.offset` to
 ///      `image.offset + image.len`. This overwrites part of the
 ///      anonymous zero memory, potentially splitting it into a pre-
 ///      and post-region.
-///    - mmap of CoW'd extension file, past the initial heap size up to
-///      the end of the max memory size (just before the
-///      post-guard). This is always adjacent to the above mmaps, but
-///      does not overlap/overwrite them.
+///    - mprotect(PROT_NONE) on the part of the heap beyond the initial
+///      heap size; we re-mprotect it with R+W bits when the heap is
+///      grown.
 #[derive(Debug)]
 pub struct MemFdSlot {
     /// The base of the actual heap memory. Bytes at this address are
@@ -44,21 +41,11 @@ pub struct MemFdSlot {
     /// The memfd image that backs this memory. May be `None`, in
     /// which case the memory is all zeroes.
     pub(crate) image: Option<Arc<MemoryMemFd>>,
-    /// The offset at which the "extension file", which is used to
-    /// allow for efficient heap growth, is mapped. This is always
-    /// immediately after the end of the initial memory size.
-    extension_offset: usize,
-    /// The anonymous memfd, owned by this slot, which we mmap in the
-    /// area where the heap may grow during runtime. We use the
-    /// ftruncate() syscall (invoked via `File::set_len()`) to set its
-    /// size. We never write any data to it -- we CoW-map it so we can
-    /// throw away dirty data on termination. Instead, we just use its
-    /// size as a "watermark" that delineates the boundary between
-    /// safe-to-access memory and SIGBUS-causing memory. (This works
-    /// because one can mmap a file beyond its end, and is good
-    /// because ftruncate does not take the process-wide lock that
-    /// mmap and mprotect do.)
-    extension_file: File,
+    /// The initial heap size.
+    initial_size: usize,
+    /// The current heap size. All memory above `base + cur_size`
+    /// should be PROT_NONE (mapped inaccessible).
+    cur_size: usize,
     /// Whether this slot may have "dirty" pages (pages written by an
     /// instantiation). Set by `instantiate()` and cleared by
     /// `clear_and_remain_ready()`, and used in assertions to ensure
@@ -67,53 +54,31 @@ pub struct MemFdSlot {
 }
 
 impl MemFdSlot {
-    pub(crate) fn create(
-        base_addr: *mut c_void,
-        static_size: usize,
-    ) -> Result<Self, InstantiationError> {
+    pub(crate) fn create(base_addr: *mut c_void, static_size: usize) -> Self {
         let base = base_addr as usize;
-
-        // Create a MemFD for the memory growth first -- this covers
-        // extended heap beyond the initial image.
-        let extension_memfd = memfd::MemfdOptions::new()
-            .allow_sealing(true)
-            .create("wasm-anonymous-heap")
-            .map_err(|e| InstantiationError::Resource(e.into()))?;
-        // Seal the ability to write the extension file (make it
-        // permanently read-only). This is a defense-in-depth
-        // mitigation to make extra-sure that we don't leak
-        // information between instantiations. See note in `memfd.rs`
-        // for more about why we use seals.
-        extension_memfd
-            .add_seal(memfd::FileSeal::SealWrite)
-            .map_err(|e| InstantiationError::Resource(e.into()))?;
-        extension_memfd
-            .add_seal(memfd::FileSeal::SealSeal)
-            .map_err(|e| InstantiationError::Resource(e.into()))?;
-        let extension_file = extension_memfd.into_file();
-        extension_file
-            .set_len(0)
-            .map_err(|e| InstantiationError::Resource(e.into()))?;
-
-        Ok(MemFdSlot {
+        MemFdSlot {
             base,
             static_size,
+            initial_size: 0,
+            cur_size: 0,
             image: None,
-            extension_file,
-            extension_offset: 0,
             dirty: false,
-        })
+        }
     }
 
     pub(crate) fn set_heap_limit(&mut self, size_bytes: usize) -> Result<()> {
-        assert!(size_bytes >= self.extension_offset);
-        // This is all that is needed to make the new memory
-        // accessible; we don't need to mprotect anything. (The
-        // mapping itself is always R+W for the max possible heap
-        // size, and only the anonymous-backing file length catches
-        // out-of-bounds accesses.)
-        self.extension_file
-            .set_len(u64::try_from(size_bytes - self.extension_offset).unwrap())?;
+        assert!(size_bytes > self.cur_size);
+        // mprotect the relevant region.
+        let start = self.base + self.cur_size;
+        let len = size_bytes - self.cur_size;
+        unsafe {
+            rustix::io::mprotect(
+                start as *mut _,
+                len,
+                rustix::io::MprotectFlags::READ | rustix::io::MprotectFlags::WRITE,
+            )?;
+        }
         Ok(())
     }
@@ -124,31 +89,36 @@ impl MemFdSlot {
     ) -> Result<(), InstantiationError> {
         assert!(!self.dirty);
-        if let Some(existing_image) = &self.image {
-            // Fast-path: previously instantiated with the same image,
-            // so the mappings are already correct; there is no need
-            // to mmap anything. Given that we asserted not-dirty
-            // above, any dirty pages will have already been thrown
-            // away by madvise() during the previous termination.
-            if let Some(image) = maybe_image {
-                if existing_image.fd.as_file().as_raw_fd() == image.fd.as_file().as_raw_fd() {
-                    self.dirty = true;
-                    return Ok(());
-                }
-            }
-        }
+        // Fast-path: previously instantiated with the same image, or
+        // no image but the same initial size, so the mappings are
+        // already correct; there is no need to mmap anything. Given
+        // that we asserted not-dirty above, any dirty pages will have
+        // already been thrown away by madvise() during the previous
+        // termination. The `clear_and_remain_ready()` path also
+        // mprotects memory above the initial heap size back to
+        // PROT_NONE, so we don't need to do that here.
+        if (self.image.is_none()
+            && maybe_image.is_none()
+            && self.initial_size == initial_size_bytes)
+            || (self.image.is_some()
+                && maybe_image.is_some()
+                && self.image.as_ref().unwrap().fd.as_file().as_raw_fd()
+                    == maybe_image.as_ref().unwrap().fd.as_file().as_raw_fd())
+        {
+            self.dirty = true;
+            return Ok(());
+        }
 
         // Otherwise, we need to redo (i) the anonymous-mmap backing
-        // for the initial heap size, (ii) the extension-file backing,
-        // and (iii) the initial-heap-image mapping if present.
+        // for the whole slot, (ii) the initial-heap-image mapping if
+        // present, and (iii) the mprotect(PROT_NONE) above the
+        // initial heap size.
         // Security/audit note: we map all of these MAP_PRIVATE, so
         // all instance data is local to the mapping, not propagated
         // to the backing fd. We throw away this CoW overlay with
-        // madvise() below, from base up to extension_offset (which is
-        // at least initial_size_bytes, and extended when the
-        // extension file is, so it covers all three mappings) when
-        // terminating the instance.
+        // madvise() below, from base up to static_size (which is the
+        // whole slot) when terminating the instance.
 
         // Anonymous mapping behind the initial heap size: this gives
         // zeroes for any "holes" in the initial heap image. Anonymous
@@ -162,7 +132,7 @@ impl MemFdSlot {
         unsafe {
             let ptr = rustix::io::mmap_anonymous(
                 self.base as *mut c_void,
-                initial_size_bytes,
+                self.static_size,
                 rustix::io::ProtFlags::READ | rustix::io::ProtFlags::WRITE,
                 rustix::io::MapFlags::PRIVATE | rustix::io::MapFlags::FIXED,
             )
@@ -171,29 +141,8 @@ impl MemFdSlot {
             }
         }
 
-        // An "extension file": this allows us to grow the heap by
-        // doing just an ftruncate(), without changing any
-        // mappings. This is important to avoid the process-wide mmap
-        // lock on Linux.
-        self.extension_offset = initial_size_bytes;
-        let extension_map_len = self.static_size - initial_size_bytes;
-        if extension_map_len > 0 {
-            unsafe {
-                let fd = rustix::fd::BorrowedFd::borrow_raw_fd(self.extension_file.as_raw_fd());
-                let ptr = rustix::io::mmap(
-                    (self.base + initial_size_bytes) as *mut c_void,
-                    extension_map_len,
-                    rustix::io::ProtFlags::READ | rustix::io::ProtFlags::WRITE,
-                    rustix::io::MapFlags::PRIVATE | rustix::io::MapFlags::FIXED,
-                    &fd,
-                    0,
-                )
-                .map_err(|e| InstantiationError::Resource(e.into()))?;
-                assert_eq!(ptr as usize, self.base + initial_size_bytes);
-            }
-        }
-
-        // Finally, the initial memory image.
+        // The initial memory image, if given. If not, we just get a
+        // memory filled with zeroes.
         if let Some(image) = maybe_image {
             if image.len > 0 {
                 let image = image.clone();
@@ -216,31 +165,50 @@ impl MemFdSlot {
             }
         }
 
+        // mprotect above `initial_size_bytes`.
+        self.initial_size = initial_size_bytes;
+        self.protect_past_initial_size()
+            .map_err(|e| InstantiationError::Resource(e.into()))?;
+
         self.dirty = true;
         Ok(())
     }
 
     pub(crate) fn clear_and_remain_ready(&mut self) -> Result<()> {
         assert!(self.dirty);
-        // madvise the image range; that's it! This will throw away
-        // dirty pages, which are CoW-private pages on top of the
-        // initial heap image memfd.
+        // madvise the image range. This will throw away dirty pages,
+        // which are CoW-private pages on top of the initial heap
+        // image memfd.
         unsafe {
             rustix::io::madvise(
                 self.base as *mut c_void,
-                self.extension_offset,
+                self.static_size,
                 rustix::io::Advice::LinuxDontNeed,
             )?;
         }
-        // truncate the extension file down to zero bytes to reset heap length.
-        self.extension_file
-            .set_len(0)
-            .map_err(|e| InstantiationError::Resource(e.into()))?;
+        // mprotect the region beyond the initial heap size back to PROT_NONE.
+        self.protect_past_initial_size()?;
         self.dirty = false;
         Ok(())
     }
 
+    fn protect_past_initial_size(&self) -> Result<()> {
+        let mprotect_start = self.base + self.initial_size;
+        let mprotect_len = self.static_size - self.initial_size;
+        if mprotect_len > 0 {
+            unsafe {
+                rustix::io::mprotect(
+                    mprotect_start as *mut _,
+                    mprotect_len,
+                    rustix::io::MprotectFlags::empty(),
+                )?;
+            }
+        }
+        Ok(())
+    }
+
     pub(crate) fn has_image(&self) -> bool {
         self.image.is_some()
     }


@@ -19,7 +19,7 @@ pub struct MemFdSlot;
 #[cfg(not(feature = "memfd-allocator"))]
 #[allow(dead_code)]
 impl MemFdSlot {
-    pub(crate) fn create(_: *mut libc::c_void, _: usize) -> Result<Self, InstantiationError> {
+    pub(crate) fn create(_: *mut libc::c_void, _: usize) -> Self {
         panic!("create() on invalid MemFdSlot");
     }


@@ -529,7 +529,7 @@ impl InstancePool {
         if let Some(memfds) = maybe_memfds {
             let image = memfds.get_memory_image(defined_index);
-            let mut slot = memories.take_memfd_slot(instance_idx, memory_index)?;
+            let mut slot = memories.take_memfd_slot(instance_idx, memory_index);
             let initial_size = plan.memory.minimum * WASM_PAGE_SIZE as u64;
 
             // If instantiation fails, we can propagate the error
@@ -745,15 +745,11 @@ impl MemoryPool {
     /// Take ownership of the given memfd slot. Must be returned via
     /// `return_memfd_slot` when the instance is done using it.
-    fn take_memfd_slot(
-        &self,
-        instance_index: usize,
-        memory_index: MemoryIndex,
-    ) -> Result<MemFdSlot, InstantiationError> {
+    fn take_memfd_slot(&self, instance_index: usize, memory_index: MemoryIndex) -> MemFdSlot {
         let idx = instance_index * self.max_memories + (memory_index.as_u32() as usize);
 
         let maybe_slot = self.memfd_slots[idx].lock().unwrap().take();
-        maybe_slot.map(|slot| Ok(slot)).unwrap_or_else(|| {
+        maybe_slot.unwrap_or_else(|| {
             MemFdSlot::create(
                 self.get_base(instance_index, memory_index) as *mut c_void,
                 self.memory_size,