Remove ftruncate-trick for heap growth with memfd backend.

Testing so far with recent Wasmtime has not shown any need to avoid
the process-wide mmap lock in real-world use-cases. As such, the
technique of using an anonymous file and ftruncate() to extend it
appears unnecessary; instead, memfd can always use anonymous zeroed
memory to back the heap wherever the CoW image is not present, and
mprotect() to extend the heap limit by changing page protections.
Author: Chris Fallin
Date:   2022-01-31 11:13:43 -08:00
Parent: b73ac83c37
Commit: 3702e81d30

3 changed files with 85 additions and 121 deletions
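
To make the new scheme concrete before reading the diff: each slot becomes one fixed-size anonymous R+W mapping whose tail past the current heap limit is kept PROT_NONE, and growing the heap is a single mprotect() call. The standalone sketch below uses the same rustix calls that appear in the diff (rustix::io::mmap_anonymous and friends, from the 0.3x rustix line Wasmtime used at the time; these functions later moved to rustix::mm). The sizes, the anyhow error plumbing, and the missing CoW-image step are simplifications for illustration, not Wasmtime's actual code.

    use rustix::io::{mmap_anonymous, mprotect, MapFlags, MprotectFlags, ProtFlags};

    fn main() -> anyhow::Result<()> {
        let static_size = 1 << 20; // whole slot: maximum heap size (1 MiB here)
        let initial_size = 64 * 1024; // initial heap size (one wasm page)

        unsafe {
            // One mapping covers the whole slot with zeroed memory...
            let base = mmap_anonymous(
                std::ptr::null_mut(),
                static_size,
                ProtFlags::READ | ProtFlags::WRITE,
                MapFlags::PRIVATE,
            )?;

            // ...then access past the initial heap size is revoked, so
            // out-of-bounds accesses trap rather than read zeroes.
            mprotect(
                (base as usize + initial_size) as *mut _,
                static_size - initial_size,
                MprotectFlags::empty(), // PROT_NONE
            )?;

            // Heap growth is now a single mprotect() back to R+W. No
            // mapping is created or destroyed, and no file is truncated.
            let new_size = 128 * 1024;
            mprotect(
                (base as usize + initial_size) as *mut _,
                new_size - initial_size,
                MprotectFlags::READ | MprotectFlags::WRITE,
            )?;
        }
        Ok(())
    }

Note that mprotect() still takes the same process-wide mmap lock that motivated the original ftruncate() trick; the message above is saying that measurements did not show that cost to matter, so the simpler scheme wins.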

@@ -5,8 +5,6 @@ use crate::InstantiationError;
 use anyhow::Result;
 use libc::c_void;
 use rustix::fd::AsRawFd;
-use std::convert::TryFrom;
-use std::fs::File;
 use std::sync::Arc;

 /// A single slot handled by the memfd instance-heap mechanism.
@@ -16,8 +14,7 @@ use std::sync::Arc;
 /// base ==> (points here)
 /// - (image.offset bytes) anonymous zero memory, pre-image
 /// - (image.len bytes) CoW mapping of memfd heap image
-/// - (up to extension_offset) anonymous zero memory, post-image
-/// - (up to static_size) heap expansion region; CoW mapping of per-slot memfd
+/// - (up to static_size) anonymous zero memory, post-image
 ///
 /// The ordering of mmaps to set this up is:
 ///
@@ -25,15 +22,15 @@ use std::sync::Arc;
 /// - one large mmap to create 8GiB * instances * memories slots
 ///
 /// - per instantiation of new image in a slot:
-///   - mmap of anonymous zero memory, from 0 to initial heap size
+///   - mmap of anonymous zero memory, from 0 to max heap size
+///     (static_size)
 ///   - mmap of CoW'd memfd image, from `image.offset` to
 ///     `image.offset + image.len`. This overwrites part of the
 ///     anonymous zero memory, potentially splitting it into a pre-
 ///     and post-region.
-///   - mmap of CoW'd extension file, past the initial heap size up to
-///     the end of the max memory size (just before the
-///     post-guard). This is always adjacent to the above mmaps, but
-///     does not overlap/overwrite them.
+///   - mprotect(PROT_NONE) on the part of the heap beyond the initial
+///     heap size; we re-mprotect it with R+W bits when the heap is
+///     grown.
 #[derive(Debug)]
 pub struct MemFdSlot {
     /// The base of the actual heap memory. Bytes at this address are
@@ -44,21 +41,11 @@ pub struct MemFdSlot {
     /// The memfd image that backs this memory. May be `None`, in
     /// which case the memory is all zeroes.
     pub(crate) image: Option<Arc<MemoryMemFd>>,
-    /// The offset at which the "extension file", which is used to
-    /// allow for efficient heap growth, is mapped. This is always
-    /// immediately after the end of the initial memory size.
-    extension_offset: usize,
-    /// The anonymous memfd, owned by this slot, which we mmap in the
-    /// area where the heap may grow during runtime. We use the
-    /// ftruncate() syscall (invoked via `File::set_len()`) to set its
-    /// size. We never write any data to it -- we CoW-map it so we can
-    /// throw away dirty data on termination. Instead, we just use its
-    /// size as a "watermark" that delineates the boundary between
-    /// safe-to-access memory and SIGBUS-causing memory. (This works
-    /// because one can mmap a file beyond its end, and is good
-    /// because ftruncate does not take the process-wide lock that
-    /// mmap and mprotect do.)
-    extension_file: File,
+    /// The initial heap size.
+    initial_size: usize,
+    /// The current heap size. All memory above `base + cur_size`
+    /// should be PROT_NONE (mapped inaccessible).
+    cur_size: usize,
     /// Whether this slot may have "dirty" pages (pages written by an
     /// instantiation). Set by `instantiate()` and cleared by
     /// `clear_and_remain_ready()`, and used in assertions to ensure
@@ -67,53 +54,31 @@ pub struct MemFdSlot {
 }

 impl MemFdSlot {
-    pub(crate) fn create(
-        base_addr: *mut c_void,
-        static_size: usize,
-    ) -> Result<Self, InstantiationError> {
+    pub(crate) fn create(base_addr: *mut c_void, static_size: usize) -> Self {
         let base = base_addr as usize;
-        // Create a MemFD for the memory growth first -- this covers
-        // extended heap beyond the initial image.
-        let extension_memfd = memfd::MemfdOptions::new()
-            .allow_sealing(true)
-            .create("wasm-anonymous-heap")
-            .map_err(|e| InstantiationError::Resource(e.into()))?;
-        // Seal the ability to write the extension file (make it
-        // permanently read-only). This is a defense-in-depth
-        // mitigation to make extra-sure that we don't leak
-        // information between instantiations. See note in `memfd.rs`
-        // for more about why we use seals.
-        extension_memfd
-            .add_seal(memfd::FileSeal::SealWrite)
-            .map_err(|e| InstantiationError::Resource(e.into()))?;
-        extension_memfd
-            .add_seal(memfd::FileSeal::SealSeal)
-            .map_err(|e| InstantiationError::Resource(e.into()))?;
-        let extension_file = extension_memfd.into_file();
-        extension_file
-            .set_len(0)
-            .map_err(|e| InstantiationError::Resource(e.into()))?;
-        Ok(MemFdSlot {
+        MemFdSlot {
             base,
             static_size,
+            initial_size: 0,
+            cur_size: 0,
             image: None,
-            extension_file,
-            extension_offset: 0,
             dirty: false,
-        })
+        }
     }

     pub(crate) fn set_heap_limit(&mut self, size_bytes: usize) -> Result<()> {
-        assert!(size_bytes >= self.extension_offset);
-        // This is all that is needed to make the new memory
-        // accessible; we don't need to mprotect anything. (The
-        // mapping itself is always R+W for the max possible heap
-        // size, and only the anonymous-backing file length catches
-        // out-of-bounds accesses.)
-        self.extension_file
-            .set_len(u64::try_from(size_bytes - self.extension_offset).unwrap())?;
+        assert!(size_bytes > self.cur_size);
+        // mprotect the relevant region.
+        let start = self.base + self.cur_size;
+        let len = size_bytes - self.cur_size;
+        unsafe {
+            rustix::io::mprotect(
+                start as *mut _,
+                len,
+                rustix::io::MprotectFlags::READ | rustix::io::MprotectFlags::WRITE,
+            )?;
+        }
+        self.cur_size = size_bytes;
         Ok(())
     }
@@ -124,31 +89,36 @@ impl MemFdSlot {
     ) -> Result<(), InstantiationError> {
         assert!(!self.dirty);
-        if let Some(existing_image) = &self.image {
-            // Fast-path: previously instantiated with the same image,
-            // so the mappings are already correct; there is no need
-            // to mmap anything. Given that we asserted not-dirty
-            // above, any dirty pages will have already been thrown
-            // away by madvise() during the previous termination.
-            if let Some(image) = maybe_image {
-                if existing_image.fd.as_file().as_raw_fd() == image.fd.as_file().as_raw_fd() {
-                    self.dirty = true;
-                    return Ok(());
-                }
-            }
-        }
+        // Fast-path: previously instantiated with the same image, or
+        // no image but the same initial size, so the mappings are
+        // already correct; there is no need to mmap anything. Given
+        // that we asserted not-dirty above, any dirty pages will have
+        // already been thrown away by madvise() during the previous
+        // termination. The `clear_and_remain_ready()` path also
+        // mprotects memory above the initial heap size back to
+        // PROT_NONE, so we don't need to do that here.
+        if (self.image.is_none()
+            && maybe_image.is_none()
+            && self.initial_size == initial_size_bytes)
+            || (self.image.is_some()
+                && maybe_image.is_some()
+                && self.image.as_ref().unwrap().fd.as_file().as_raw_fd()
+                    == maybe_image.as_ref().unwrap().fd.as_file().as_raw_fd())
+        {
+            self.dirty = true;
+            return Ok(());
+        }

         // Otherwise, we need to redo (i) the anonymous-mmap backing
-        // for the initial heap size, (ii) the extension-file backing,
-        // and (iii) the initial-heap-image mapping if present.
+        // for the whole slot, (ii) the initial-heap-image mapping if
+        // present, and (iii) the mprotect(PROT_NONE) above the
+        // initial heap size.

         // Security/audit note: we map all of these MAP_PRIVATE, so
         // all instance data is local to the mapping, not propagated
         // to the backing fd. We throw away this CoW overlay with
-        // madvise() below, from base up to extension_offset (which is
-        // at least initial_size_bytes, and extended when the
-        // extension file is, so it covers all three mappings) when
-        // terminating the instance.
+        // madvise() below, from base up to static_size (which is the
+        // whole slot) when terminating the instance.

         // Anonymous mapping behind the initial heap size: this gives
         // zeroes for any "holes" in the initial heap image. Anonymous
@@ -162,7 +132,7 @@ impl MemFdSlot {
         unsafe {
             let ptr = rustix::io::mmap_anonymous(
                 self.base as *mut c_void,
-                initial_size_bytes,
+                self.static_size,
                 rustix::io::ProtFlags::READ | rustix::io::ProtFlags::WRITE,
                 rustix::io::MapFlags::PRIVATE | rustix::io::MapFlags::FIXED,
             )
@@ -171,29 +141,8 @@ impl MemFdSlot {
             }
         }

-        // An "extension file": this allows us to grow the heap by
-        // doing just an ftruncate(), without changing any
-        // mappings. This is important to avoid the process-wide mmap
-        // lock on Linux.
-        self.extension_offset = initial_size_bytes;
-        let extension_map_len = self.static_size - initial_size_bytes;
-        if extension_map_len > 0 {
-            unsafe {
-                let fd = rustix::fd::BorrowedFd::borrow_raw_fd(self.extension_file.as_raw_fd());
-                let ptr = rustix::io::mmap(
-                    (self.base + initial_size_bytes) as *mut c_void,
-                    extension_map_len,
-                    rustix::io::ProtFlags::READ | rustix::io::ProtFlags::WRITE,
-                    rustix::io::MapFlags::PRIVATE | rustix::io::MapFlags::FIXED,
-                    &fd,
-                    0,
-                )
-                .map_err(|e| InstantiationError::Resource(e.into()))?;
-                assert_eq!(ptr as usize, self.base + initial_size_bytes);
-            }
-        }
-
-        // Finally, the initial memory image.
+        // The initial memory image, if given. If not, we just get a
+        // memory filled with zeroes.
         if let Some(image) = maybe_image {
             if image.len > 0 {
                 let image = image.clone();
@@ -216,31 +165,50 @@ impl MemFdSlot {
             }
         }

+        // mprotect above `initial_size_bytes`.
+        self.initial_size = initial_size_bytes;
+        self.protect_past_initial_size()
+            .map_err(|e| InstantiationError::Resource(e.into()))?;
+
         self.dirty = true;
         Ok(())
     }

     pub(crate) fn clear_and_remain_ready(&mut self) -> Result<()> {
         assert!(self.dirty);
-        // madvise the image range; that's it! This will throw away
-        // dirty pages, which are CoW-private pages on top of the
-        // initial heap image memfd.
+        // madvise the image range. This will throw away dirty pages,
+        // which are CoW-private pages on top of the initial heap
+        // image memfd.
         unsafe {
             rustix::io::madvise(
                 self.base as *mut c_void,
-                self.extension_offset,
+                self.static_size,
                 rustix::io::Advice::LinuxDontNeed,
             )?;
         }
-        // truncate the extension file down to zero bytes to reset heap length.
-        self.extension_file
-            .set_len(0)
-            .map_err(|e| InstantiationError::Resource(e.into()))?;
+
+        // mprotect the region beyond the initial heap size back to PROT_NONE.
+        self.protect_past_initial_size()?;
+
         self.dirty = false;
         Ok(())
     }

+    fn protect_past_initial_size(&self) -> Result<()> {
+        let mprotect_start = self.base + self.initial_size;
+        let mprotect_len = self.static_size - self.initial_size;
+        if mprotect_len > 0 {
+            unsafe {
+                rustix::io::mprotect(
+                    mprotect_start as *mut _,
+                    mprotect_len,
+                    rustix::io::MprotectFlags::empty(),
+                )?;
+            }
+        }
+        Ok(())
+    }
+
     pub(crate) fn has_image(&self) -> bool {
         self.image.is_some()
     }
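
To round out the lifecycle in this file: reusing a slot is madvise(MADV_DONTNEED) over the whole slot to drop the CoW-private dirty pages, then re-protecting the tail, exactly as `clear_and_remain_ready()` and `protect_past_initial_size()` do above. Below is a minimal sketch of that reset step, under the same assumptions as the earlier sketch; `reset_slot` is a made-up helper for illustration, not Wasmtime's API.

    use libc::c_void;
    use rustix::io::{madvise, mprotect, Advice, MprotectFlags};

    /// Reset a slot for reuse. `base`, `static_size`, and `initial_size`
    /// play the same roles as the fields of those names in the diff above.
    unsafe fn reset_slot(
        base: *mut c_void,
        static_size: usize,
        initial_size: usize,
    ) -> anyhow::Result<()> {
        // Drop all CoW-private (dirty) pages; subsequent reads see the
        // original backing again (zero pages, or the memfd image if any).
        madvise(base, static_size, Advice::LinuxDontNeed)?;

        // Re-arm the PROT_NONE tail so the next instance starts from a
        // clean heap limit and must grow through mprotect() again.
        mprotect(
            (base as usize + initial_size) as *mut _,
            static_size - initial_size,
            MprotectFlags::empty(),
        )?;
        Ok(())
    }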

@@ -19,7 +19,7 @@ pub struct MemFdSlot;
 #[cfg(not(feature = "memfd-allocator"))]
 #[allow(dead_code)]
 impl MemFdSlot {
-    pub(crate) fn create(_: *mut libc::c_void, _: usize) -> Result<Self, InstantiationError> {
+    pub(crate) fn create(_: *mut libc::c_void, _: usize) -> Self {
         panic!("create() on invalid MemFdSlot");
     }
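
The file above is the stub compiled when the memfd allocator feature is off; it must mirror every signature change made to the real type, which is why its create() also loses the Result here. A hypothetical, minimal illustration of that cfg-gated pattern (feature and type names invented, not Wasmtime's):

    pub struct Slot;

    #[cfg(feature = "fast-alloc")]
    impl Slot {
        pub fn create(size: usize) -> Self {
            // The real build would set up mappings for `size` bytes here.
            let _ = size;
            Slot
        }
    }

    #[cfg(not(feature = "fast-alloc"))]
    #[allow(dead_code)]
    impl Slot {
        pub fn create(_: usize) -> Self {
            // Call sites are themselves feature-gated, so this stub only
            // exists to keep the build compiling; it is never reached.
            panic!("create() on invalid Slot");
        }
    }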

@@ -529,7 +529,7 @@ impl InstancePool {
         if let Some(memfds) = maybe_memfds {
             let image = memfds.get_memory_image(defined_index);
-            let mut slot = memories.take_memfd_slot(instance_idx, memory_index)?;
+            let mut slot = memories.take_memfd_slot(instance_idx, memory_index);
             let initial_size = plan.memory.minimum * WASM_PAGE_SIZE as u64;

             // If instantiation fails, we can propagate the error
@@ -745,15 +745,11 @@ impl MemoryPool {
     /// Take ownership of the given memfd slot. Must be returned via
     /// `return_memfd_slot` when the instance is done using it.
-    fn take_memfd_slot(
-        &self,
-        instance_index: usize,
-        memory_index: MemoryIndex,
-    ) -> Result<MemFdSlot, InstantiationError> {
+    fn take_memfd_slot(&self, instance_index: usize, memory_index: MemoryIndex) -> MemFdSlot {
         let idx = instance_index * self.max_memories + (memory_index.as_u32() as usize);
         let maybe_slot = self.memfd_slots[idx].lock().unwrap().take();
-        maybe_slot.map(|slot| Ok(slot)).unwrap_or_else(|| {
+        maybe_slot.unwrap_or_else(|| {
             MemFdSlot::create(
                 self.get_base(instance_index, memory_index) as *mut c_void,
                 self.memory_size,