Add a pooling allocator mode based on copy-on-write mappings of memfds.

As first suggested by Jan on the Zulip here [1], a cheap and effective way to obtain copy-on-write semantics of a "backing image" for a Wasm memory is to mmap a file with `MAP_PRIVATE`. The `memfd` mechanism provided by the Linux kernel allows us to create anonymous, in-memory-only files that we can use for this mapping, so we can construct the image contents on-the-fly then effectively create a CoW overlay. Furthermore, and importantly, `madvise(MADV_DONTNEED, ...)` will discard the CoW overlay, returning the mapping to its original state. By itself this is almost enough for a very fast instantiation-termination loop of the same image over and over, without changing the address space mapping at all (which is expensive). The only missing bit is how to implement heap *growth*. But here memfds can help us again: if we create another anonymous file and map it where the extended parts of the heap would go, we can take advantage of the fact that a `mmap()` mapping can be *larger than the file itself*, with accesses beyond the end generating a `SIGBUS`, and the fact that we can cheaply resize the file with `ftruncate`, even after a mapping exists. So we can map the "heap extension" file once with the maximum memory-slot size and grow the memfd itself as `memory.grow` operations occur. The above CoW technique and heap-growth technique together allow us a fastpath of `madvise()` and `ftruncate()` only when we re-instantiate the same module over and over, as long as we can reuse the same slot. This fastpath avoids all whole-process address-space locks in the Linux kernel, which should mean it is highly scalable. It also avoids the cost of copying data on read, as the `uffd` heap backend does when servicing pagefaults; the kernel's own optimized CoW logic (same as used by all file mmaps) is used instead. [1] https://bytecodealliance.zulipchat.com/#narrow/stream/206238-general/topic/Copy.20on.20write.20based.20instance.20reuse/near/266657772
2022-01-18 16:42:24 -08:00
parent 90e7cef56c
commit b73ac83c37
26 changed files with 1070 additions and 135 deletions
--- a/crates/runtime/src/instance/allocator/pooling.rs
+++ b/crates/runtime/src/instance/allocator/pooling.rs
@@ -7,19 +7,21 @@
 //! Using the pooling instance allocator can speed up module instantiation
 //! when modules can be constrained based on configurable limits.

+use super::MemFdSlot;
 use super::{
    initialize_instance, initialize_vmcontext, InstanceAllocationRequest, InstanceAllocator,
    InstanceHandle, InstantiationError,
 };
-use crate::{instance::Instance, Memory, Mmap, Table, VMContext};
+use crate::{instance::Instance, Memory, Mmap, ModuleMemFds, Table};
 use anyhow::{anyhow, bail, Context, Result};
+use libc::c_void;
 use rand::Rng;
 use std::convert::TryFrom;
-use std::marker;
 use std::mem;
-use std::sync::{Arc, Mutex};
+use std::sync::Arc;
+use std::sync::Mutex;
 use wasmtime_environ::{
-    EntitySet, HostPtr, MemoryStyle, Module, PrimaryMap, Tunables, VMOffsets, VMOffsetsFields,
+    HostPtr, MemoryIndex, MemoryStyle, Module, PrimaryMap, Tunables, VMOffsets, VMOffsetsFields,
    WASM_PAGE_SIZE,
 };

@@ -284,7 +286,6 @@ struct InstancePool {
    free_list: Mutex<Vec<usize>>,
    memories: MemoryPool,
    tables: TablePool,
-    empty_module: Arc<Module>,
 }

 impl InstancePool {
@@ -332,14 +333,8 @@ impl InstancePool {
            free_list: Mutex::new((0..max_instances).collect()),
            memories: MemoryPool::new(module_limits, instance_limits, tunables)?,
            tables: TablePool::new(module_limits, instance_limits)?,
-            empty_module: Arc::new(Module::default()),
        };

-        // Use a default module to initialize the instances to start
-        for i in 0..instance_limits.count as usize {
-            pool.initialize(module_limits, i);
-        }
-
        Ok(pool)
    }

@@ -348,41 +343,26 @@ impl InstancePool {
        &mut *(self.mapping.as_mut_ptr().add(index * self.instance_size) as *mut Instance)
    }

-    fn initialize(&self, limits: &ModuleLimits, index: usize) {
-        unsafe {
-            let instance = self.instance(index);
-
-            // Write a default instance with preallocated memory/table map storage to the ptr
-            std::ptr::write(
-                instance as _,
-                Instance {
-                    module: self.empty_module.clone(),
-                    offsets: VMOffsets::new(HostPtr, &self.empty_module),
-                    memories: PrimaryMap::with_capacity(limits.memories as usize),
-                    tables: PrimaryMap::with_capacity(limits.tables as usize),
-                    dropped_elements: EntitySet::new(),
-                    dropped_data: EntitySet::new(),
-                    host_state: Box::new(()),
-                    wasm_data: &[],
-                    vmctx: VMContext {
-                        _marker: marker::PhantomPinned,
-                    },
-                },
-            );
-        }
-    }
-
    unsafe fn setup_instance(
        &self,
        index: usize,
        mut req: InstanceAllocationRequest,
    ) -> Result<InstanceHandle, InstantiationError> {
-        let instance = self.instance(index);
+        let host_state = std::mem::replace(&mut req.host_state, Box::new(()));
+        let instance_data = Instance::create_raw(
+            &req.module,
+            &*req.wasm_data,
+            PrimaryMap::default(),
+            PrimaryMap::default(),
+            host_state,
+        );

-        instance.module = req.module.clone();
-        instance.offsets = VMOffsets::new(HostPtr, instance.module.as_ref());
-        instance.host_state = std::mem::replace(&mut req.host_state, Box::new(()));
-        instance.wasm_data = &*req.wasm_data;
+        // Instances are uninitialized memory at first; we need to
+        // write an empty but initialized `Instance` struct into the
+        // chosen slot before we do anything else with it. (This is
+        // paired with a `drop_in_place` in deallocate below.)
+        let instance = self.instance(index);
+        std::ptr::write(instance as _, instance_data);

        // set_instance_memories and _tables will need the store before we can completely
        // initialize the vmcontext.
@@ -391,8 +371,10 @@ impl InstancePool {
        }

        Self::set_instance_memories(
+            index,
            instance,
-            self.memories.get(index),
+            &self.memories,
+            &req.memfds,
            self.memories.max_wasm_pages,
        )?;

@@ -448,20 +430,44 @@ impl InstancePool {
        let instance = unsafe { &mut *handle.instance };

        // Decommit any linear memories that were used
-        for (memory, base) in instance.memories.values_mut().zip(self.memories.get(index)) {
+        for ((def_mem_idx, memory), base) in
+            instance.memories.iter_mut().zip(self.memories.get(index))
+        {
            let mut memory = mem::take(memory);
            debug_assert!(memory.is_static());

-            // Reset any faulted guard pages as the physical memory may be reused for another instance in the future
-            #[cfg(all(feature = "uffd", target_os = "linux"))]
-            memory
-                .reset_guard_pages()
-                .expect("failed to reset guard pages");
-            drop(&mut memory); // require mutable on all platforms, not just uffd
+            match memory {
+                Memory::Static {
+                    memfd_slot: Some(mut memfd_slot),
+                    ..
+                } => {
+                    let mem_idx = instance.module.memory_index(def_mem_idx);
+                    // If there was any error clearing the memfd, just
+                    // drop it here, and let the drop handler for the
+                    // MemFdSlot unmap in a way that retains the
+                    // address space reservation.
+                    if memfd_slot.clear_and_remain_ready().is_ok() {
+                        self.memories.return_memfd_slot(index, mem_idx, memfd_slot);
+                    }
+                }

-            let size = memory.byte_size();
-            drop(memory);
-            decommit_memory_pages(base, size).expect("failed to decommit linear memory pages");
+                _ => {
+                    // Reset any faulted guard pages as the physical
+                    // memory may be reused for another instance in
+                    // the future.
+                    #[cfg(all(feature = "uffd", target_os = "linux"))]
+                    memory
+                        .reset_guard_pages()
+                        .expect("failed to reset guard pages");
+                    // require mutable on all platforms, not just uffd
+                    drop(&mut memory);
+
+                    let size = memory.byte_size();
+                    drop(memory);
+                    decommit_memory_pages(base, size)
+                        .expect("failed to decommit linear memory pages");
+                }
+            }
        }

        instance.memories.clear();
@@ -481,50 +487,81 @@ impl InstancePool {
            decommit_table_pages(base, size).expect("failed to decommit table pages");
        }

-        instance.tables.clear();
-        instance.dropped_elements.clear();
-
-        // Drop all `global` values which need a destructor, such as externref
-        // values which now need their reference count dropped.
-        instance.drop_globals();
-
-        // Drop any host state
-        instance.host_state = Box::new(());
-
-        // And finally reset the module/offsets back to their original. This
-        // should put everything back in a relatively pristine state for each
-        // fresh allocation later on.
-        instance.module = self.empty_module.clone();
-        instance.offsets = VMOffsets::new(HostPtr, &self.empty_module);
-        instance.wasm_data = &[];
+        // We've now done all of the pooling-allocator-specific
+        // teardown, so we can drop the Instance and let destructors
+        // take care of any other fields (host state, globals, etc.).
+        unsafe {
+            std::ptr::drop_in_place(instance as *mut _);
+        }
+        // The instance is now uninitialized memory and cannot be
+        // touched again until we write a fresh Instance in-place with
+        // std::ptr::write in allocate() above.

        self.free_list.lock().unwrap().push(index);
    }

    fn set_instance_memories(
+        instance_idx: usize,
        instance: &mut Instance,
-        mut memories: impl Iterator<Item = *mut u8>,
+        memories: &MemoryPool,
+        maybe_memfds: &Option<Arc<ModuleMemFds>>,
        max_pages: u64,
    ) -> Result<(), InstantiationError> {
        let module = instance.module.as_ref();

        debug_assert!(instance.memories.is_empty());

-        for plan in
-            (&module.memory_plans.values().as_slice()[module.num_imported_memories..]).iter()
+        for (memory_index, plan) in module
+            .memory_plans
+            .iter()
+            .skip(module.num_imported_memories)
        {
+            let defined_index = module
+                .defined_memory_index(memory_index)
+                .expect("should be a defined memory since we skipped imported ones");
+
            let memory = unsafe {
                std::slice::from_raw_parts_mut(
-                    memories.next().unwrap(),
+                    memories.get_base(instance_idx, memory_index),
                    (max_pages as usize) * (WASM_PAGE_SIZE as usize),
                )
            };
-            instance.memories.push(
-                Memory::new_static(plan, memory, commit_memory_pages, unsafe {
-                    &mut *instance.store()
-                })
-                .map_err(InstantiationError::Resource)?,
-            );
+
+            if let Some(memfds) = maybe_memfds {
+                let image = memfds.get_memory_image(defined_index);
+                let mut slot = memories.take_memfd_slot(instance_idx, memory_index)?;
+                let initial_size = plan.memory.minimum * WASM_PAGE_SIZE as u64;
+
+                // If instantiation fails, we can propagate the error
+                // upward and drop the slot. This will cause the Drop
+                // handler to attempt to map the range with PROT_NONE
+                // memory, to reserve the space while releasing any
+                // stale mappings. The next use of this slot will then
+                // create a new MemFdSlot that will try to map over
+                // this, returning errors as well if the mapping
+                // errors persist. The unmap-on-drop is best effort;
+                // if it fails, then we can still soundly continue
+                // using the rest of the pool and allowing the rest of
+                // the process to continue, because we never perform a
+                // mmap that would leave an open space for someone
+                // else to come in and map something.
+                slot.instantiate(initial_size as usize, image)
+                    .map_err(|e| InstantiationError::Resource(e.into()))?;
+
+                instance.memories.push(
+                    Memory::new_static(plan, memory, None, Some(slot), unsafe {
+                        &mut *instance.store()
+                    })
+                    .map_err(InstantiationError::Resource)?,
+                );
+            } else {
+                instance.memories.push(
+                    Memory::new_static(plan, memory, Some(commit_memory_pages), None, unsafe {
+                        &mut *instance.store()
+                    })
+                    .map_err(InstantiationError::Resource)?,
+                );
+            }
        }

        debug_assert!(instance.dropped_data.is_empty());
@@ -566,17 +603,6 @@ impl InstancePool {
    }
 }

-impl Drop for InstancePool {
-    fn drop(&mut self) {
-        unsafe {
-            for i in 0..self.max_instances {
-                let ptr = self.mapping.as_mut_ptr().add(i * self.instance_size) as *mut Instance;
-                std::ptr::drop_in_place(ptr);
-            }
-        }
-    }
-}
-
 /// Represents a pool of WebAssembly linear memories.
 ///
 /// A linear memory is divided into accessible pages and guard pages.
@@ -589,6 +615,10 @@ impl Drop for InstancePool {
 #[derive(Debug)]
 struct MemoryPool {
    mapping: Mmap,
+    // If using the memfd allocation scheme, the MemFd slots. We
+    // dynamically transfer ownership of a slot to a Memory when in
+    // use.
+    memfd_slots: Vec<Mutex<Option<MemFdSlot>>>,
    // The size, in bytes, of each linear memory's reservation plus the guard
    // region allocated for it.
    memory_size: usize,
@@ -673,8 +703,18 @@ impl MemoryPool {
        let mapping = Mmap::accessible_reserved(0, allocation_size)
            .context("failed to create memory pool mapping")?;

+        let num_memfd_slots = if cfg!(feature = "memfd-allocator") {
+            max_instances * max_memories
+        } else {
+            0
+        };
+        let memfd_slots: Vec<_> = std::iter::repeat_with(|| Mutex::new(None))
+            .take(num_memfd_slots)
+            .collect();
+
        let pool = Self {
            mapping,
+            memfd_slots,
            memory_size,
            initial_memory_offset,
            max_memories,
@@ -689,17 +729,43 @@ impl MemoryPool {
        Ok(pool)
    }

-    fn get(&self, instance_index: usize) -> impl Iterator<Item = *mut u8> {
+    fn get_base(&self, instance_index: usize, memory_index: MemoryIndex) -> *mut u8 {
        debug_assert!(instance_index < self.max_instances);
+        let memory_index = memory_index.as_u32() as usize;
+        debug_assert!(memory_index < self.max_memories);
+        let idx = instance_index * self.max_memories + memory_index;
+        let offset = self.initial_memory_offset + idx * self.memory_size;
+        unsafe { self.mapping.as_mut_ptr().offset(offset as isize) }
+    }

-        let base: *mut u8 = unsafe {
-            self.mapping.as_mut_ptr().add(
-                self.initial_memory_offset + instance_index * self.memory_size * self.max_memories,
-            ) as _
-        };
+    fn get<'a>(&'a self, instance_index: usize) -> impl Iterator<Item = *mut u8> + 'a {
+        (0..self.max_memories)
+            .map(move |i| self.get_base(instance_index, MemoryIndex::from_u32(i as u32)))
+    }

-        let size = self.memory_size;
-        (0..self.max_memories).map(move |i| unsafe { base.add(i * size) })
+    /// Take ownership of the given memfd slot. Must be returned via
+    /// `return_memfd_slot` when the instance is done using it.
+    fn take_memfd_slot(
+        &self,
+        instance_index: usize,
+        memory_index: MemoryIndex,
+    ) -> Result<MemFdSlot, InstantiationError> {
+        let idx = instance_index * self.max_memories + (memory_index.as_u32() as usize);
+        let maybe_slot = self.memfd_slots[idx].lock().unwrap().take();
+
+        maybe_slot.map(|slot| Ok(slot)).unwrap_or_else(|| {
+            MemFdSlot::create(
+                self.get_base(instance_index, memory_index) as *mut c_void,
+                self.memory_size,
+            )
+        })
+    }
+
+    /// Return ownership of the given memfd slot.
+    fn return_memfd_slot(&self, instance_index: usize, memory_index: MemoryIndex, slot: MemFdSlot) {
+        assert!(!slot.is_dirty());
+        let idx = instance_index * self.max_memories + (memory_index.as_u32() as usize);
+        *self.memfd_slots[idx].lock().unwrap() = Some(slot);
    }
 }

@@ -1413,6 +1479,7 @@ mod test {
                            host_state: Box::new(()),
                            store: StorePtr::empty(),
                            wasm_data: &[],
+                            memfds: None,
                        },
                    )
                    .expect("allocation should succeed"),
@@ -1437,6 +1504,7 @@ mod test {
                host_state: Box::new(()),
                store: StorePtr::empty(),
                wasm_data: &[],
+                memfds: None,
            },
        ) {
            Err(InstantiationError::Limit(3)) => {}