Add a pooling allocator mode based on copy-on-write mappings of memfds.
As first suggested by Jan on the Zulip here [1], a cheap and effective way to obtain copy-on-write semantics of a "backing image" for a Wasm memory is to mmap a file with `MAP_PRIVATE`. The `memfd` mechanism provided by the Linux kernel allows us to create anonymous, in-memory-only files that we can use for this mapping, so we can construct the image contents on-the-fly and then effectively create a CoW overlay. Furthermore, and importantly, `madvise(MADV_DONTNEED, ...)` will discard the CoW overlay, returning the mapping to its original state.

By itself this is almost enough for a very fast instantiation-termination loop of the same image over and over, without changing the address-space mapping at all (which is expensive). The only missing bit is how to implement heap *growth*. But here memfds can help us again: if we create another anonymous file and map it where the extended parts of the heap would go, we can take advantage of the fact that an `mmap()` mapping can be *larger than the file itself*, with accesses beyond the end generating a `SIGBUS`, and the fact that we can cheaply resize the file with `ftruncate()`, even after a mapping exists. So we can map the "heap extension" file once with the maximum memory-slot size and grow the memfd itself as `memory.grow` operations occur.

The above CoW technique and heap-growth technique together give us a fastpath of `madvise()` and `ftruncate()` only when we re-instantiate the same module over and over, as long as we can reuse the same slot. This fastpath avoids all whole-process address-space locks in the Linux kernel, which should mean it is highly scalable. It also avoids the cost of copying data on read, as the `uffd` heap backend does when servicing pagefaults; the kernel's own optimized CoW logic (same as used by all file mmaps) is used instead.

[1] https://bytecodealliance.zulipchat.com/#narrow/stream/206238-general/topic/Copy.20on.20write.20based.20instance.20reuse/near/266657772
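To make the scheme concrete, here is a hypothetical, minimal sketch of the fastpath described above, written against raw libc for clarity. The patch itself goes through the `rustix` and `memfd` crates, places each mapping at a fixed address inside a pre-reserved slot, and checks every return value; all of that is elided in this sketch.

    // Illustrative only; not the patch's code. Error handling omitted.
    use std::fs::File;
    use std::os::unix::io::FromRawFd;

    unsafe fn cow_memfd_fastpath() -> std::io::Result<()> {
        const WASM_PAGE: usize = 64 * 1024;

        // Build the initial heap image in an anonymous, in-memory-only file.
        let image_fd = libc::memfd_create(b"wasm-heap-image\0".as_ptr().cast(), 0);
        libc::ftruncate(image_fd, WASM_PAGE as libc::off_t);
        libc::pwrite(image_fd, b"data".as_ptr().cast(), 4, 0);

        // MAP_PRIVATE gives a copy-on-write overlay: guest writes stay
        // local to this mapping and never reach the memfd itself.
        let base = libc::mmap(
            std::ptr::null_mut(),
            WASM_PAGE,
            libc::PROT_READ | libc::PROT_WRITE,
            libc::MAP_PRIVATE,
            image_fd,
            0,
        );

        // Heap growth: map an empty "extension" memfd *larger than the
        // file itself*. Accesses past EOF raise SIGBUS, so the file
        // length acts as the heap limit, with no mprotect involved.
        let ext_fd = libc::memfd_create(b"wasm-anonymous-heap\0".as_ptr().cast(), 0);
        let ext = File::from_raw_fd(ext_fd);
        let _ext_base = libc::mmap(
            std::ptr::null_mut(), // the patch uses MAP_FIXED just past the image
            1 << 30,              // maximum memory-slot size
            libc::PROT_READ | libc::PROT_WRITE,
            libc::MAP_PRIVATE,
            ext_fd,
            0,
        );

        // memory.grow is now a single ftruncate (File::set_len), which
        // takes no whole-process address-space lock.
        ext.set_len((10 * WASM_PAGE) as u64)?;

        // Instance teardown: discard the dirty CoW overlay and reset the
        // heap limit. The slot is pristine again, with the address-space
        // mapping untouched.
        libc::madvise(base, WASM_PAGE, libc::MADV_DONTNEED);
        ext.set_len(0)?;
        Ok(())
    }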
crates/runtime/src/instance/allocator.rs
@@ -4,28 +4,37 @@ use crate::memory::{DefaultMemoryCreator, Memory};
 use crate::table::Table;
 use crate::traphandlers::Trap;
 use crate::vmcontext::{
-    VMBuiltinFunctionsArray, VMCallerCheckedAnyfunc, VMContext, VMGlobalDefinition,
-    VMSharedSignatureIndex,
+    VMBuiltinFunctionsArray, VMCallerCheckedAnyfunc, VMGlobalDefinition, VMSharedSignatureIndex,
 };
+use crate::ModuleMemFds;
 use crate::Store;
 use anyhow::Result;
 use std::alloc;
 use std::any::Any;
 use std::convert::TryFrom;
-use std::marker;
 use std::ptr::{self, NonNull};
 use std::slice;
 use std::sync::Arc;
 use thiserror::Error;
 use wasmtime_environ::{
-    DefinedFuncIndex, DefinedMemoryIndex, DefinedTableIndex, EntityRef, EntitySet, FunctionInfo,
-    GlobalInit, HostPtr, MemoryInitialization, MemoryInitializer, Module, ModuleType, PrimaryMap,
-    SignatureIndex, TableInitializer, TrapCode, VMOffsets, WasmType, WASM_PAGE_SIZE,
+    DefinedFuncIndex, DefinedMemoryIndex, DefinedTableIndex, EntityRef, FunctionInfo, GlobalInit,
+    MemoryInitialization, MemoryInitializer, Module, ModuleType, PrimaryMap, SignatureIndex,
+    TableInitializer, TrapCode, WasmType, WASM_PAGE_SIZE,
 };

 #[cfg(feature = "pooling-allocator")]
 mod pooling;

+#[cfg(feature = "memfd-allocator")]
+mod memfd;
+#[cfg(feature = "memfd-allocator")]
+pub use self::memfd::MemFdSlot;
+
+#[cfg(not(feature = "memfd-allocator"))]
+mod memfd_disabled;
+#[cfg(not(feature = "memfd-allocator"))]
+pub use self::memfd_disabled::MemFdSlot;
+
 #[cfg(feature = "pooling-allocator")]
 pub use self::pooling::{
     InstanceLimits, ModuleLimits, PoolingAllocationStrategy, PoolingInstanceAllocator,
@@ -39,6 +48,9 @@ pub struct InstanceAllocationRequest<'a> {
     /// The base address of where JIT functions are located.
     pub image_base: usize,

+    /// If using MemFD-based memories, the backing MemFDs.
+    pub memfds: Option<Arc<ModuleMemFds>>,
+
     /// Descriptors about each compiled function, such as the offset from
     /// `image_base`.
     pub functions: &'a PrimaryMap<DefinedFuncIndex, FunctionInfo>,
@@ -376,9 +388,23 @@ fn check_memory_init_bounds(

 fn initialize_memories(
     instance: &mut Instance,
+    module: &Module,
     initializers: &[MemoryInitializer],
 ) -> Result<(), InstantiationError> {
     for init in initializers {
+        // Check whether this is a MemFD memory; if so, we can skip
+        // all initializers.
+        let memory = init.memory_index;
+        if let Some(defined_index) = module.defined_memory_index(memory) {
+            // We can only skip if there is actually a MemFD image. In
+            // some situations the MemFD image creation code will bail
+            // (e.g. due to an out of bounds data segment) and so we
+            // need to fall back on the usual initialization below.
+            if instance.memories[defined_index].is_memfd_with_image() {
+                continue;
+            }
+        }
+
         instance
             .memory_init_segment(
                 init.memory_index,
@@ -432,6 +458,14 @@ fn initialize_instance(
     match &module.memory_initialization {
         MemoryInitialization::Paged { map, out_of_bounds } => {
             for (index, pages) in map {
+                // We can only skip if there is actually a MemFD image. In
+                // some situations the MemFD image creation code will bail
+                // (e.g. due to an out of bounds data segment) and so we
+                // need to fall back on the usual initialization below.
+                if instance.memories[index].is_memfd_with_image() {
+                    continue;
+                }
+
                 let memory = instance.memory(index);
                 let slice =
                     unsafe { slice::from_raw_parts_mut(memory.base, memory.current_length) };
@@ -453,7 +487,7 @@ fn initialize_instance(
             }
         }
         MemoryInitialization::Segmented(initializers) => {
-            initialize_memories(instance, initializers)?;
+            initialize_memories(instance, module, initializers)?;
         }
     }

@@ -691,19 +725,8 @@ unsafe impl InstanceAllocator for OnDemandInstanceAllocator {
         let host_state = std::mem::replace(&mut req.host_state, Box::new(()));

         let mut handle = {
-            let instance = Instance {
-                module: req.module.clone(),
-                offsets: VMOffsets::new(HostPtr, &req.module),
-                memories,
-                tables,
-                dropped_elements: EntitySet::with_capacity(req.module.passive_elements.len()),
-                dropped_data: EntitySet::with_capacity(req.module.passive_data_map.len()),
-                host_state,
-                wasm_data: &*req.wasm_data,
-                vmctx: VMContext {
-                    _marker: marker::PhantomPinned,
-                },
-            };
+            let instance =
+                Instance::create_raw(&req.module, &*req.wasm_data, memories, tables, host_state);
             let layout = instance.alloc_layout();
             let instance_ptr = alloc::alloc(layout) as *mut Instance;
             if instance_ptr.is_null() {

crates/runtime/src/instance/allocator/memfd.rs (new file, 290 lines)
@@ -0,0 +1,290 @@
//! memfd mapping logic for use by the pooling allocator.

use crate::memfd::MemoryMemFd;
use crate::InstantiationError;
use anyhow::Result;
use libc::c_void;
use rustix::fd::AsRawFd;
use std::convert::TryFrom;
use std::fs::File;
use std::sync::Arc;

/// A single slot handled by the memfd instance-heap mechanism.
///
/// The mmap scheme is:
///
/// base ==> (points here)
/// - (image.offset bytes)      anonymous zero memory, pre-image
/// - (image.len bytes)         CoW mapping of memfd heap image
/// - (up to extension_offset)  anonymous zero memory, post-image
/// - (up to static_size)       heap expansion region; CoW mapping of per-slot memfd
///
/// The ordering of mmaps to set this up is:
///
/// - once, when pooling allocator is created:
///   - one large mmap to create 8GiB * instances * memories slots
///
/// - per instantiation of new image in a slot:
///   - mmap of anonymous zero memory, from 0 to initial heap size
///   - mmap of CoW'd memfd image, from `image.offset` to
///     `image.offset + image.len`. This overwrites part of the
///     anonymous zero memory, potentially splitting it into a pre-
///     and post-region.
///   - mmap of CoW'd extension file, past the initial heap size up to
///     the end of the max memory size (just before the
///     post-guard). This is always adjacent to the above mmaps, but
///     does not overlap/overwrite them.
#[derive(Debug)]
pub struct MemFdSlot {
    /// The base of the actual heap memory. Bytes at this address are
    /// what is seen by the Wasm guest code.
    base: usize,
    /// The maximum static memory size, plus post-guard.
    static_size: usize,
    /// The memfd image that backs this memory. May be `None`, in
    /// which case the memory is all zeroes.
    pub(crate) image: Option<Arc<MemoryMemFd>>,
    /// The offset at which the "extension file", which is used to
    /// allow for efficient heap growth, is mapped. This is always
    /// immediately after the end of the initial memory size.
    extension_offset: usize,
    /// The anonymous memfd, owned by this slot, which we mmap in the
    /// area where the heap may grow during runtime. We use the
    /// ftruncate() syscall (invoked via `File::set_len()`) to set its
    /// size. We never write any data to it -- we CoW-map it so we can
    /// throw away dirty data on termination. Instead, we just use its
    /// size as a "watermark" that delineates the boundary between
    /// safe-to-access memory and SIGBUS-causing memory. (This works
    /// because one can mmap a file beyond its end, and is good
    /// because ftruncate does not take the process-wide lock that
    /// mmap and mprotect do.)
    extension_file: File,
    /// Whether this slot may have "dirty" pages (pages written by an
    /// instantiation). Set by `instantiate()` and cleared by
    /// `clear_and_remain_ready()`, and used in assertions to ensure
    /// those methods are called properly.
    dirty: bool,
}

impl MemFdSlot {
    pub(crate) fn create(
        base_addr: *mut c_void,
        static_size: usize,
    ) -> Result<Self, InstantiationError> {
        let base = base_addr as usize;

        // Create a MemFD for the memory growth first -- this covers
        // extended heap beyond the initial image.
        let extension_memfd = memfd::MemfdOptions::new()
            .allow_sealing(true)
            .create("wasm-anonymous-heap")
            .map_err(|e| InstantiationError::Resource(e.into()))?;
        // Seal the ability to write the extension file (make it
        // permanently read-only). This is a defense-in-depth
        // mitigation to make extra-sure that we don't leak
        // information between instantiations. See note in `memfd.rs`
        // for more about why we use seals.
        extension_memfd
            .add_seal(memfd::FileSeal::SealWrite)
            .map_err(|e| InstantiationError::Resource(e.into()))?;
        extension_memfd
            .add_seal(memfd::FileSeal::SealSeal)
            .map_err(|e| InstantiationError::Resource(e.into()))?;
        let extension_file = extension_memfd.into_file();
        extension_file
            .set_len(0)
            .map_err(|e| InstantiationError::Resource(e.into()))?;

        Ok(MemFdSlot {
            base,
            static_size,
            image: None,
            extension_file,
            extension_offset: 0,
            dirty: false,
        })
    }

    pub(crate) fn set_heap_limit(&mut self, size_bytes: usize) -> Result<()> {
        assert!(size_bytes >= self.extension_offset);
        // This is all that is needed to make the new memory
        // accessible; we don't need to mprotect anything. (The
        // mapping itself is always R+W for the max possible heap
        // size, and only the anonymous-backing file length catches
        // out-of-bounds accesses.)
        self.extension_file
            .set_len(u64::try_from(size_bytes - self.extension_offset).unwrap())?;
        Ok(())
    }

    pub(crate) fn instantiate(
        &mut self,
        initial_size_bytes: usize,
        maybe_image: Option<&Arc<MemoryMemFd>>,
    ) -> Result<(), InstantiationError> {
        assert!(!self.dirty);

        if let Some(existing_image) = &self.image {
            // Fast-path: previously instantiated with the same image,
            // so the mappings are already correct; there is no need
            // to mmap anything. Given that we asserted not-dirty
            // above, any dirty pages will have already been thrown
            // away by madvise() during the previous termination.
            if let Some(image) = maybe_image {
                if existing_image.fd.as_file().as_raw_fd() == image.fd.as_file().as_raw_fd() {
                    self.dirty = true;
                    return Ok(());
                }
            }
        }

        // Otherwise, we need to redo (i) the anonymous-mmap backing
        // for the initial heap size, (ii) the extension-file backing,
        // and (iii) the initial-heap-image mapping if present.

        // Security/audit note: we map all of these MAP_PRIVATE, so
        // all instance data is local to the mapping, not propagated
        // to the backing fd. We throw away this CoW overlay with
        // madvise() below, from base up to extension_offset (which is
        // at least initial_size_bytes, and extended when the
        // extension file is, so it covers all three mappings) when
        // terminating the instance.

        // Anonymous mapping behind the initial heap size: this gives
        // zeroes for any "holes" in the initial heap image. Anonymous
        // mmap memory is faster to fault in than a CoW of a file,
        // even a file with zero holes, because the kernel's CoW path
        // unconditionally copies *something* (even if just a page of
        // zeroes). Anonymous zero pages are fast: the kernel
        // pre-zeroes them, and even if it runs out of those, a memset
        // is half as expensive as a memcpy (only writes, no reads).
        if initial_size_bytes > 0 {
            unsafe {
                let ptr = rustix::io::mmap_anonymous(
                    self.base as *mut c_void,
                    initial_size_bytes,
                    rustix::io::ProtFlags::READ | rustix::io::ProtFlags::WRITE,
                    rustix::io::MapFlags::PRIVATE | rustix::io::MapFlags::FIXED,
                )
                .map_err(|e| InstantiationError::Resource(e.into()))?;
                assert_eq!(ptr as usize, self.base);
            }
        }

        // An "extension file": this allows us to grow the heap by
        // doing just an ftruncate(), without changing any
        // mappings. This is important to avoid the process-wide mmap
        // lock on Linux.
        self.extension_offset = initial_size_bytes;
        let extension_map_len = self.static_size - initial_size_bytes;
        if extension_map_len > 0 {
            unsafe {
                let fd = rustix::fd::BorrowedFd::borrow_raw_fd(self.extension_file.as_raw_fd());
                let ptr = rustix::io::mmap(
                    (self.base + initial_size_bytes) as *mut c_void,
                    extension_map_len,
                    rustix::io::ProtFlags::READ | rustix::io::ProtFlags::WRITE,
                    rustix::io::MapFlags::PRIVATE | rustix::io::MapFlags::FIXED,
                    &fd,
                    0,
                )
                .map_err(|e| InstantiationError::Resource(e.into()))?;
                assert_eq!(ptr as usize, self.base + initial_size_bytes);
            }
        }

        // Finally, the initial memory image.
        if let Some(image) = maybe_image {
            if image.len > 0 {
                let image = image.clone();

                unsafe {
                    let fd = rustix::fd::BorrowedFd::borrow_raw_fd(image.fd.as_file().as_raw_fd());
                    let ptr = rustix::io::mmap(
                        (self.base + image.offset) as *mut c_void,
                        image.len,
                        rustix::io::ProtFlags::READ | rustix::io::ProtFlags::WRITE,
                        rustix::io::MapFlags::PRIVATE | rustix::io::MapFlags::FIXED,
                        &fd,
                        image.offset as u64,
                    )
                    .map_err(|e| InstantiationError::Resource(e.into()))?;
                    assert_eq!(ptr as usize, self.base + image.offset);
                }

                self.image = Some(image);
            }
        }

        self.dirty = true;
        Ok(())
    }

    pub(crate) fn clear_and_remain_ready(&mut self) -> Result<()> {
        assert!(self.dirty);
        // madvise the image range; that's it! This will throw away
        // dirty pages, which are CoW-private pages on top of the
        // initial heap image memfd.
        unsafe {
            rustix::io::madvise(
                self.base as *mut c_void,
                self.extension_offset,
                rustix::io::Advice::LinuxDontNeed,
            )?;
        }

        // truncate the extension file down to zero bytes to reset heap length.
        self.extension_file
            .set_len(0)
            .map_err(|e| InstantiationError::Resource(e.into()))?;
        self.dirty = false;
        Ok(())
    }

    pub(crate) fn has_image(&self) -> bool {
        self.image.is_some()
    }

    pub(crate) fn is_dirty(&self) -> bool {
        self.dirty
    }
}

#[cfg(feature = "memfd-allocator")]
impl Drop for MemFdSlot {
    fn drop(&mut self) {
        // The MemFdSlot may be dropped if there is an error during
        // instantiation: for example, if a memory-growth limiter
        // disallows a guest from having a memory of a certain size,
        // after we've already initialized the MemFdSlot.
        //
        // We need to return this region of the large pool mmap to a
        // safe state (with no module-specific mappings). The
        // MemFdSlot will not be returned to the MemoryPool, so a new
        // MemFdSlot will be created and overwrite the mappings anyway
        // on the slot's next use; but for safety and to avoid
        // resource leaks it's better not to have stale mappings to a
        // possibly-otherwise-dead module's image.
        //
        // To "wipe the slate clean", let's do a mmap of anonymous
        // memory over the whole region, with PROT_NONE. Note that we
        // *can't* simply munmap, because that leaves a hole in the
        // middle of the pooling allocator's big memory area that some
        // other random mmap may swoop in and take, to be trampled
        // over by the next MemFdSlot later.
        //
        // Since we're in drop(), we can't sanely return an error if
        // this mmap fails. Let's ignore the failure if so; the next
        // MemFdSlot to be created for this slot will try to overwrite
        // the existing stale mappings, and return a failure properly
        // if we still cannot map new memory.
        unsafe {
            let _ = rustix::io::mmap_anonymous(
                self.base as *mut _,
                self.static_size,
                rustix::io::ProtFlags::empty(),
                rustix::io::MapFlags::FIXED | rustix::io::MapFlags::NORESERVE,
            );
        }
    }
}
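
Taken together, the methods above give each slot a lifecycle of create, instantiate, grow, clear, repeat. The following hypothetical driver loop is illustrative only and is not part of this commit; the real callers are `take_memfd_slot()` and `return_memfd_slot()` in the pooling allocator further below:

    // Illustrative sketch: reuse one slot for the same image over and
    // over. After the first iteration, each round trip costs one
    // madvise() plus two ftruncate() calls and no mmap at all.
    fn reuse_slot(
        base: *mut libc::c_void,
        static_size: usize,
        image: &std::sync::Arc<crate::memfd::MemoryMemFd>,
        initial_size: usize,
    ) -> anyhow::Result<()> {
        let mut slot = MemFdSlot::create(base, static_size)?;
        loop {
            // First pass performs the three mmaps; subsequent passes
            // hit the same-image fast path and skip them entirely.
            slot.instantiate(initial_size, Some(image))?;
            // memory.grow: one ftruncate on the extension memfd.
            slot.set_heap_limit(initial_size + 65536)?;
            // Termination: madvise(MADV_DONTNEED) + ftruncate(0).
            slot.clear_and_remain_ready()?;
        }
    }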

crates/runtime/src/instance/allocator/memfd_disabled.rs (new file, 49 lines)
@@ -0,0 +1,49 @@
//! Shims for MemFdSlot when the memfd allocator is not
//! included. Enables unconditional use of the type and its methods
//! throughout higher-level code.

use crate::InstantiationError;
use anyhow::Result;
use std::sync::Arc;

/// A placeholder for MemFdSlot when we have not included the pooling
/// allocator.
///
/// To allow MemFdSlot to be unconditionally passed around in various
/// places (e.g. a `Memory`), we define a zero-sized type when memfd is
/// not included in the build.
#[cfg(not(feature = "memfd-allocator"))]
#[derive(Debug)]
pub struct MemFdSlot;

#[cfg(not(feature = "memfd-allocator"))]
#[allow(dead_code)]
impl MemFdSlot {
    pub(crate) fn create(_: *mut libc::c_void, _: usize) -> Result<Self, InstantiationError> {
        panic!("create() on invalid MemFdSlot");
    }

    pub(crate) fn instantiate(
        &mut self,
        _: usize,
        _: Option<&Arc<crate::memfd::MemoryMemFd>>,
    ) -> Result<Self, InstantiationError> {
        panic!("instantiate() on invalid MemFdSlot");
    }

    pub(crate) fn clear_and_remain_ready(&mut self) -> Result<()> {
        Ok(())
    }

    pub(crate) fn has_image(&self) -> bool {
        false
    }

    pub(crate) fn is_dirty(&self) -> bool {
        false
    }

    pub(crate) fn set_heap_limit(&mut self, _: usize) -> Result<()> {
        panic!("set_heap_limit on invalid MemFdSlot");
    }
}

crates/runtime/src/instance/allocator/pooling.rs
@@ -7,19 +7,21 @@
 //! Using the pooling instance allocator can speed up module instantiation
 //! when modules can be constrained based on configurable limits.

+use super::MemFdSlot;
 use super::{
     initialize_instance, initialize_vmcontext, InstanceAllocationRequest, InstanceAllocator,
     InstanceHandle, InstantiationError,
 };
-use crate::{instance::Instance, Memory, Mmap, Table, VMContext};
+use crate::{instance::Instance, Memory, Mmap, ModuleMemFds, Table};
 use anyhow::{anyhow, bail, Context, Result};
+use libc::c_void;
 use rand::Rng;
 use std::convert::TryFrom;
-use std::marker;
 use std::mem;
-use std::sync::{Arc, Mutex};
+use std::sync::Arc;
+use std::sync::Mutex;
 use wasmtime_environ::{
-    EntitySet, HostPtr, MemoryStyle, Module, PrimaryMap, Tunables, VMOffsets, VMOffsetsFields,
+    HostPtr, MemoryIndex, MemoryStyle, Module, PrimaryMap, Tunables, VMOffsets, VMOffsetsFields,
     WASM_PAGE_SIZE,
 };

@@ -284,7 +286,6 @@ struct InstancePool {
     free_list: Mutex<Vec<usize>>,
     memories: MemoryPool,
     tables: TablePool,
-    empty_module: Arc<Module>,
 }

 impl InstancePool {
@@ -332,14 +333,8 @@ impl InstancePool {
             free_list: Mutex::new((0..max_instances).collect()),
             memories: MemoryPool::new(module_limits, instance_limits, tunables)?,
             tables: TablePool::new(module_limits, instance_limits)?,
-            empty_module: Arc::new(Module::default()),
         };

-        // Use a default module to initialize the instances to start
-        for i in 0..instance_limits.count as usize {
-            pool.initialize(module_limits, i);
-        }
-
         Ok(pool)
     }

@@ -348,41 +343,26 @@ impl InstancePool {
         &mut *(self.mapping.as_mut_ptr().add(index * self.instance_size) as *mut Instance)
     }

-    fn initialize(&self, limits: &ModuleLimits, index: usize) {
-        unsafe {
-            let instance = self.instance(index);
-
-            // Write a default instance with preallocated memory/table map storage to the ptr
-            std::ptr::write(
-                instance as _,
-                Instance {
-                    module: self.empty_module.clone(),
-                    offsets: VMOffsets::new(HostPtr, &self.empty_module),
-                    memories: PrimaryMap::with_capacity(limits.memories as usize),
-                    tables: PrimaryMap::with_capacity(limits.tables as usize),
-                    dropped_elements: EntitySet::new(),
-                    dropped_data: EntitySet::new(),
-                    host_state: Box::new(()),
-                    wasm_data: &[],
-                    vmctx: VMContext {
-                        _marker: marker::PhantomPinned,
-                    },
-                },
-            );
-        }
-    }
-
     unsafe fn setup_instance(
         &self,
         index: usize,
         mut req: InstanceAllocationRequest,
     ) -> Result<InstanceHandle, InstantiationError> {
-        let instance = self.instance(index);
+        let host_state = std::mem::replace(&mut req.host_state, Box::new(()));
+        let instance_data = Instance::create_raw(
+            &req.module,
+            &*req.wasm_data,
+            PrimaryMap::default(),
+            PrimaryMap::default(),
+            host_state,
+        );

-        instance.module = req.module.clone();
-        instance.offsets = VMOffsets::new(HostPtr, instance.module.as_ref());
-        instance.host_state = std::mem::replace(&mut req.host_state, Box::new(()));
-        instance.wasm_data = &*req.wasm_data;
+        // Instances are uninitialized memory at first; we need to
+        // write an empty but initialized `Instance` struct into the
+        // chosen slot before we do anything else with it. (This is
+        // paired with a `drop_in_place` in deallocate below.)
+        let instance = self.instance(index);
+        std::ptr::write(instance as _, instance_data);

         // set_instance_memories and _tables will need the store before we can completely
         // initialize the vmcontext.
@@ -391,8 +371,10 @@ impl InstancePool {
         }

         Self::set_instance_memories(
+            index,
             instance,
-            self.memories.get(index),
+            &self.memories,
+            &req.memfds,
             self.memories.max_wasm_pages,
         )?;

@@ -448,20 +430,44 @@ impl InstancePool {
         let instance = unsafe { &mut *handle.instance };

         // Decommit any linear memories that were used
-        for (memory, base) in instance.memories.values_mut().zip(self.memories.get(index)) {
+        for ((def_mem_idx, memory), base) in
+            instance.memories.iter_mut().zip(self.memories.get(index))
+        {
             let mut memory = mem::take(memory);
             debug_assert!(memory.is_static());

-            // Reset any faulted guard pages as the physical memory may be reused for another instance in the future
-            #[cfg(all(feature = "uffd", target_os = "linux"))]
-            memory
-                .reset_guard_pages()
-                .expect("failed to reset guard pages");
-            drop(&mut memory); // require mutable on all platforms, not just uffd
+            match memory {
+                Memory::Static {
+                    memfd_slot: Some(mut memfd_slot),
+                    ..
+                } => {
+                    let mem_idx = instance.module.memory_index(def_mem_idx);
+                    // If there was any error clearing the memfd, just
+                    // drop it here, and let the drop handler for the
+                    // MemFdSlot unmap in a way that retains the
+                    // address space reservation.
+                    if memfd_slot.clear_and_remain_ready().is_ok() {
+                        self.memories.return_memfd_slot(index, mem_idx, memfd_slot);
+                    }
+                }

-            let size = memory.byte_size();
-            drop(memory);
-            decommit_memory_pages(base, size).expect("failed to decommit linear memory pages");
+                _ => {
+                    // Reset any faulted guard pages as the physical
+                    // memory may be reused for another instance in
+                    // the future.
+                    #[cfg(all(feature = "uffd", target_os = "linux"))]
+                    memory
+                        .reset_guard_pages()
+                        .expect("failed to reset guard pages");
+                    // require mutable on all platforms, not just uffd
+                    drop(&mut memory);
+
+                    let size = memory.byte_size();
+                    drop(memory);
+                    decommit_memory_pages(base, size)
+                        .expect("failed to decommit linear memory pages");
+                }
+            }
         }

         instance.memories.clear();
@@ -481,50 +487,81 @@ impl InstancePool {
             decommit_table_pages(base, size).expect("failed to decommit table pages");
         }

         instance.tables.clear();
-        instance.dropped_elements.clear();
-
-        // Drop all `global` values which need a destructor, such as externref
-        // values which now need their reference count dropped.
-        instance.drop_globals();
-
-        // Drop any host state
-        instance.host_state = Box::new(());
-
-        // And finally reset the module/offsets back to their original. This
-        // should put everything back in a relatively pristine state for each
-        // fresh allocation later on.
-        instance.module = self.empty_module.clone();
-        instance.offsets = VMOffsets::new(HostPtr, &self.empty_module);
-        instance.wasm_data = &[];
+        // We've now done all of the pooling-allocator-specific
+        // teardown, so we can drop the Instance and let destructors
+        // take care of any other fields (host state, globals, etc.).
+        unsafe {
+            std::ptr::drop_in_place(instance as *mut _);
+        }
+        // The instance is now uninitialized memory and cannot be
+        // touched again until we write a fresh Instance in-place with
+        // std::ptr::write in allocate() above.

         self.free_list.lock().unwrap().push(index);
     }

     fn set_instance_memories(
+        instance_idx: usize,
         instance: &mut Instance,
-        mut memories: impl Iterator<Item = *mut u8>,
+        memories: &MemoryPool,
+        maybe_memfds: &Option<Arc<ModuleMemFds>>,
         max_pages: u64,
     ) -> Result<(), InstantiationError> {
         let module = instance.module.as_ref();

         debug_assert!(instance.memories.is_empty());

-        for plan in
-            (&module.memory_plans.values().as_slice()[module.num_imported_memories..]).iter()
+        for (memory_index, plan) in module
+            .memory_plans
+            .iter()
+            .skip(module.num_imported_memories)
         {
+            let defined_index = module
+                .defined_memory_index(memory_index)
+                .expect("should be a defined memory since we skipped imported ones");
+
             let memory = unsafe {
                 std::slice::from_raw_parts_mut(
-                    memories.next().unwrap(),
+                    memories.get_base(instance_idx, memory_index),
                     (max_pages as usize) * (WASM_PAGE_SIZE as usize),
                 )
             };
-            instance.memories.push(
-                Memory::new_static(plan, memory, commit_memory_pages, unsafe {
-                    &mut *instance.store()
-                })
-                .map_err(InstantiationError::Resource)?,
-            );
+
+            if let Some(memfds) = maybe_memfds {
+                let image = memfds.get_memory_image(defined_index);
+                let mut slot = memories.take_memfd_slot(instance_idx, memory_index)?;
+                let initial_size = plan.memory.minimum * WASM_PAGE_SIZE as u64;
+
+                // If instantiation fails, we can propagate the error
+                // upward and drop the slot. This will cause the Drop
+                // handler to attempt to map the range with PROT_NONE
+                // memory, to reserve the space while releasing any
+                // stale mappings. The next use of this slot will then
+                // create a new MemFdSlot that will try to map over
+                // this, returning errors as well if the mapping
+                // errors persist. The unmap-on-drop is best effort;
+                // if it fails, then we can still soundly continue
+                // using the rest of the pool and allowing the rest of
+                // the process to continue, because we never perform a
+                // mmap that would leave an open space for someone
+                // else to come in and map something.
+                slot.instantiate(initial_size as usize, image)
+                    .map_err(|e| InstantiationError::Resource(e.into()))?;
+
+                instance.memories.push(
+                    Memory::new_static(plan, memory, None, Some(slot), unsafe {
+                        &mut *instance.store()
+                    })
+                    .map_err(InstantiationError::Resource)?,
+                );
+            } else {
+                instance.memories.push(
+                    Memory::new_static(plan, memory, Some(commit_memory_pages), None, unsafe {
+                        &mut *instance.store()
+                    })
+                    .map_err(InstantiationError::Resource)?,
+                );
+            }
         }

         debug_assert!(instance.dropped_data.is_empty());
@@ -566,17 +603,6 @@ impl InstancePool {
         }
     }

-impl Drop for InstancePool {
-    fn drop(&mut self) {
-        unsafe {
-            for i in 0..self.max_instances {
-                let ptr = self.mapping.as_mut_ptr().add(i * self.instance_size) as *mut Instance;
-                std::ptr::drop_in_place(ptr);
-            }
-        }
-    }
-}
-
 /// Represents a pool of WebAssembly linear memories.
 ///
 /// A linear memory is divided into accessible pages and guard pages.
@@ -589,6 +615,10 @@ impl Drop for InstancePool {
 #[derive(Debug)]
 struct MemoryPool {
     mapping: Mmap,
+    // If using the memfd allocation scheme, the MemFd slots. We
+    // dynamically transfer ownership of a slot to a Memory when in
+    // use.
+    memfd_slots: Vec<Mutex<Option<MemFdSlot>>>,
     // The size, in bytes, of each linear memory's reservation plus the guard
     // region allocated for it.
     memory_size: usize,
@@ -673,8 +703,18 @@ impl MemoryPool {
         let mapping = Mmap::accessible_reserved(0, allocation_size)
            .context("failed to create memory pool mapping")?;

+        let num_memfd_slots = if cfg!(feature = "memfd-allocator") {
+            max_instances * max_memories
+        } else {
+            0
+        };
+        let memfd_slots: Vec<_> = std::iter::repeat_with(|| Mutex::new(None))
+            .take(num_memfd_slots)
+            .collect();
+
         let pool = Self {
             mapping,
+            memfd_slots,
             memory_size,
             initial_memory_offset,
             max_memories,
@@ -689,17 +729,43 @@ impl MemoryPool {
         Ok(pool)
     }

-    fn get(&self, instance_index: usize) -> impl Iterator<Item = *mut u8> {
+    fn get_base(&self, instance_index: usize, memory_index: MemoryIndex) -> *mut u8 {
         debug_assert!(instance_index < self.max_instances);
+        let memory_index = memory_index.as_u32() as usize;
+        debug_assert!(memory_index < self.max_memories);
+        let idx = instance_index * self.max_memories + memory_index;
+        let offset = self.initial_memory_offset + idx * self.memory_size;
+        unsafe { self.mapping.as_mut_ptr().offset(offset as isize) }
+    }

-        let base: *mut u8 = unsafe {
-            self.mapping.as_mut_ptr().add(
-                self.initial_memory_offset + instance_index * self.memory_size * self.max_memories,
-            ) as _
-        };
+    fn get<'a>(&'a self, instance_index: usize) -> impl Iterator<Item = *mut u8> + 'a {
+        (0..self.max_memories)
+            .map(move |i| self.get_base(instance_index, MemoryIndex::from_u32(i as u32)))
+    }

-        let size = self.memory_size;
-        (0..self.max_memories).map(move |i| unsafe { base.add(i * size) })
-    }
+    /// Take ownership of the given memfd slot. Must be returned via
+    /// `return_memfd_slot` when the instance is done using it.
+    fn take_memfd_slot(
+        &self,
+        instance_index: usize,
+        memory_index: MemoryIndex,
+    ) -> Result<MemFdSlot, InstantiationError> {
+        let idx = instance_index * self.max_memories + (memory_index.as_u32() as usize);
+        let maybe_slot = self.memfd_slots[idx].lock().unwrap().take();
+
+        maybe_slot.map(|slot| Ok(slot)).unwrap_or_else(|| {
+            MemFdSlot::create(
+                self.get_base(instance_index, memory_index) as *mut c_void,
+                self.memory_size,
+            )
+        })
+    }
+
+    /// Return ownership of the given memfd slot.
+    fn return_memfd_slot(&self, instance_index: usize, memory_index: MemoryIndex, slot: MemFdSlot) {
+        assert!(!slot.is_dirty());
+        let idx = instance_index * self.max_memories + (memory_index.as_u32() as usize);
+        *self.memfd_slots[idx].lock().unwrap() = Some(slot);
+    }
 }

@@ -1413,6 +1479,7 @@ mod test {
                     host_state: Box::new(()),
                     store: StorePtr::empty(),
                     wasm_data: &[],
+                    memfds: None,
                 },
             )
             .expect("allocation should succeed"),
@@ -1437,6 +1504,7 @@ mod test {
                     host_state: Box::new(()),
                     store: StorePtr::empty(),
                     wasm_data: &[],
+                    memfds: None,
                 },
             ) {
                 Err(InstantiationError::Limit(3)) => {}

@@ -577,6 +577,7 @@ mod test {
             PoolingAllocationStrategy::Random,
             InstanceAllocationRequest {
                 module: module.clone(),
+                memfds: None,
                 image_base: 0,
                 functions,
                 imports: Imports {