Remove ftruncate-trick for heap growth with memfd backend.

Testing so far with recent Wasmtime has not shown any need to avoid
the process-wide mmap lock in real-world use-cases. As such, the
technique of using an anonymous file and ftruncate() to extend it
appears unnecessary; instead, memfd can always use anonymous zeroed
memory to back the heap wherever the CoW image is not present, and
mprotect() to extend the heap limit by changing page protections.
Author: Chris Fallin
Date:   2022-01-31 11:13:43 -08:00
Parent: b73ac83c37
Commit: 3702e81d30

3 changed files with 85 additions and 121 deletions
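
To make the new scheme concrete before reading the diff: each slot becomes one fixed-size anonymous R+W mapping whose tail past the current heap limit is kept PROT_NONE, and growing the heap is a single mprotect() call. The standalone sketch below uses the same rustix calls that appear in the diff (rustix::io::mmap_anonymous and friends, from the 0.3x rustix line Wasmtime used at the time; these functions later moved to rustix::mm). The sizes, the anyhow error plumbing, and the missing CoW-image step are simplifications for illustration, not Wasmtime's actual code.

    use rustix::io::{mmap_anonymous, mprotect, MapFlags, MprotectFlags, ProtFlags};

    fn main() -> anyhow::Result<()> {
        let static_size = 1 << 20; // whole slot: maximum heap size (1 MiB here)
        let initial_size = 64 * 1024; // initial heap size (one wasm page)

        unsafe {
            // One mapping covers the whole slot with zeroed memory...
            let base = mmap_anonymous(
                std::ptr::null_mut(),
                static_size,
                ProtFlags::READ | ProtFlags::WRITE,
                MapFlags::PRIVATE,
            )?;

            // ...then access past the initial heap size is revoked, so
            // out-of-bounds accesses trap rather than read zeroes.
            mprotect(
                (base as usize + initial_size) as *mut _,
                static_size - initial_size,
                MprotectFlags::empty(), // PROT_NONE
            )?;

            // Heap growth is now a single mprotect() back to R+W. No
            // mapping is created or destroyed, and no file is truncated.
            let new_size = 128 * 1024;
            mprotect(
                (base as usize + initial_size) as *mut _,
                new_size - initial_size,
                MprotectFlags::READ | MprotectFlags::WRITE,
            )?;
        }
        Ok(())
    }

Note that mprotect() still takes the same process-wide mmap lock that motivated the original ftruncate() trick; the message above is saying that measurements did not show that cost to matter, so the simpler scheme wins.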

@@ -5,8 +5,6 @@ use crate::InstantiationError;
 use anyhow::Result;
 use libc::c_void;
 use rustix::fd::AsRawFd;
-use std::convert::TryFrom;
-use std::fs::File;
 use std::sync::Arc;

 /// A single slot handled by the memfd instance-heap mechanism.
@@ -16,8 +14,7 @@ use std::sync::Arc;
 /// base ==> (points here)
 /// - (image.offset bytes) anonymous zero memory, pre-image
 /// - (image.len bytes) CoW mapping of memfd heap image
-/// - (up to extension_offset) anonymous zero memory, post-image
-/// - (up to static_size) heap expansion region; CoW mapping of per-slot memfd
+/// - (up to static_size) anonymous zero memory, post-image
 ///
 /// The ordering of mmaps to set this up is:
 ///
@@ -25,15 +22,15 @@ use std::sync::Arc;
 /// - one large mmap to create 8GiB * instances * memories slots
 ///
 /// - per instantiation of new image in a slot:
-///   - mmap of anonymous zero memory, from 0 to initial heap size
+///   - mmap of anonymous zero memory, from 0 to max heap size
+///     (static_size)
 ///   - mmap of CoW'd memfd image, from `image.offset` to
 ///     `image.offset + image.len`. This overwrites part of the
 ///     anonymous zero memory, potentially splitting it into a pre-
 ///     and post-region.
-///   - mmap of CoW'd extension file, past the initial heap size up to
-///     the end of the max memory size (just before the
-///     post-guard). This is always adjacent to the above mmaps, but
-///     does not overlap/overwrite them.
+///   - mprotect(PROT_NONE) on the part of the heap beyond the initial
+///     heap size; we re-mprotect it with R+W bits when the heap is
+///     grown.
 #[derive(Debug)]
 pub struct MemFdSlot {
     /// The base of the actual heap memory. Bytes at this address are
@@ -44,21 +41,11 @@ pub struct MemFdSlot {
     /// The memfd image that backs this memory. May be `None`, in
     /// which case the memory is all zeroes.
     pub(crate) image: Option<Arc<MemoryMemFd>>,
-    /// The offset at which the "extension file", which is used to
-    /// allow for efficient heap growth, is mapped. This is always
-    /// immediately after the end of the initial memory size.
-    extension_offset: usize,
-    /// The anonymous memfd, owned by this slot, which we mmap in the
-    /// area where the heap may grow during runtime. We use the
-    /// ftruncate() syscall (invoked via `File::set_len()`) to set its
-    /// size. We never write any data to it -- we CoW-map it so we can
-    /// throw away dirty data on termination. Instead, we just use its
-    /// size as a "watermark" that delineates the boundary between
-    /// safe-to-access memory and SIGBUS-causing memory. (This works
-    /// because one can mmap a file beyond its end, and is good
-    /// because ftruncate does not take the process-wide lock that
-    /// mmap and mprotect do.)
-    extension_file: File,
+    /// The initial heap size.
+    initial_size: usize,
+    /// The current heap size. All memory above `base + cur_size`
+    /// should be PROT_NONE (mapped inaccessible).
+    cur_size: usize,
     /// Whether this slot may have "dirty" pages (pages written by an
     /// instantiation). Set by `instantiate()` and cleared by
     /// `clear_and_remain_ready()`, and used in assertions to ensure
@@ -67,53 +54,31 @@ pub struct MemFdSlot {
 }

 impl MemFdSlot {
-    pub(crate) fn create(
-        base_addr: *mut c_void,
-        static_size: usize,
-    ) -> Result<Self, InstantiationError> {
+    pub(crate) fn create(base_addr: *mut c_void, static_size: usize) -> Self {
         let base = base_addr as usize;
-        // Create a MemFD for the memory growth first -- this covers
-        // extended heap beyond the initial image.
-        let extension_memfd = memfd::MemfdOptions::new()
-            .allow_sealing(true)
-            .create("wasm-anonymous-heap")
-            .map_err(|e| InstantiationError::Resource(e.into()))?;
-        // Seal the ability to write the extension file (make it
-        // permanently read-only). This is a defense-in-depth
-        // mitigation to make extra-sure that we don't leak
-        // information between instantiations. See note in `memfd.rs`
-        // for more about why we use seals.
-        extension_memfd
-            .add_seal(memfd::FileSeal::SealWrite)
-            .map_err(|e| InstantiationError::Resource(e.into()))?;
-        extension_memfd
-            .add_seal(memfd::FileSeal::SealSeal)
-            .map_err(|e| InstantiationError::Resource(e.into()))?;
-        let extension_file = extension_memfd.into_file();
-        extension_file
-            .set_len(0)
-            .map_err(|e| InstantiationError::Resource(e.into()))?;
-        Ok(MemFdSlot {
+        MemFdSlot {
             base,
             static_size,
+            initial_size: 0,
+            cur_size: 0,
             image: None,
-            extension_file,
-            extension_offset: 0,
             dirty: false,
-        })
+        }
     }

     pub(crate) fn set_heap_limit(&mut self, size_bytes: usize) -> Result<()> {
-        assert!(size_bytes >= self.extension_offset);
-        // This is all that is needed to make the new memory
-        // accessible; we don't need to mprotect anything. (The
-        // mapping itself is always R+W for the max possible heap
-        // size, and only the anonymous-backing file length catches
-        // out-of-bounds accesses.)
-        self.extension_file
-            .set_len(u64::try_from(size_bytes - self.extension_offset).unwrap())?;
+        assert!(size_bytes > self.cur_size);
+        // mprotect the relevant region.
+        let start = self.base + self.cur_size;
+        let len = size_bytes - self.cur_size;
+        unsafe {
+            rustix::io::mprotect(
+                start as *mut _,
+                len,
+                rustix::io::MprotectFlags::READ | rustix::io::MprotectFlags::WRITE,
+            )?;
+        }
+        self.cur_size = size_bytes;
         Ok(())
     }
@@ -124,31 +89,36 @@ impl MemFdSlot {
     ) -> Result<(), InstantiationError> {
         assert!(!self.dirty);
-        if let Some(existing_image) = &self.image {
-            // Fast-path: previously instantiated with the same image,
-            // so the mappings are already correct; there is no need
-            // to mmap anything. Given that we asserted not-dirty
-            // above, any dirty pages will have already been thrown
-            // away by madvise() during the previous termination.
-            if let Some(image) = maybe_image {
-                if existing_image.fd.as_file().as_raw_fd() == image.fd.as_file().as_raw_fd() {
-                    self.dirty = true;
-                    return Ok(());
-                }
-            }
-        }
+        // Fast-path: previously instantiated with the same image, or
+        // no image but the same initial size, so the mappings are
+        // already correct; there is no need to mmap anything. Given
+        // that we asserted not-dirty above, any dirty pages will have
+        // already been thrown away by madvise() during the previous
+        // termination. The `clear_and_remain_ready()` path also
+        // mprotects memory above the initial heap size back to
+        // PROT_NONE, so we don't need to do that here.
+        if (self.image.is_none()
+            && maybe_image.is_none()
+            && self.initial_size == initial_size_bytes)
+            || (self.image.is_some()
+                && maybe_image.is_some()
+                && self.image.as_ref().unwrap().fd.as_file().as_raw_fd()
+                    == maybe_image.as_ref().unwrap().fd.as_file().as_raw_fd())
+        {
+            self.dirty = true;
+            return Ok(());
+        }

         // Otherwise, we need to redo (i) the anonymous-mmap backing
-        // for the initial heap size, (ii) the extension-file backing,
-        // and (iii) the initial-heap-image mapping if present.
+        // for the whole slot, (ii) the initial-heap-image mapping if
+        // present, and (iii) the mprotect(PROT_NONE) above the
+        // initial heap size.

         // Security/audit note: we map all of these MAP_PRIVATE, so
         // all instance data is local to the mapping, not propagated
         // to the backing fd. We throw away this CoW overlay with
-        // madvise() below, from base up to extension_offset (which is
-        // at least initial_size_bytes, and extended when the
-        // extension file is, so it covers all three mappings) when
-        // terminating the instance.
+        // madvise() below, from base up to static_size (which is the
+        // whole slot) when terminating the instance.

         // Anonymous mapping behind the initial heap size: this gives
         // zeroes for any "holes" in the initial heap image. Anonymous
@@ -162,7 +132,7 @@ impl MemFdSlot {
         unsafe {
             let ptr = rustix::io::mmap_anonymous(
                 self.base as *mut c_void,
-                initial_size_bytes,
+                self.static_size,
                 rustix::io::ProtFlags::READ | rustix::io::ProtFlags::WRITE,
                 rustix::io::MapFlags::PRIVATE | rustix::io::MapFlags::FIXED,
             )
@@ -171,29 +141,8 @@ impl MemFdSlot {
             }
         }

-        // An "extension file": this allows us to grow the heap by
-        // doing just an ftruncate(), without changing any
-        // mappings. This is important to avoid the process-wide mmap
-        // lock on Linux.
-        self.extension_offset = initial_size_bytes;
-        let extension_map_len = self.static_size - initial_size_bytes;
-        if extension_map_len > 0 {
-            unsafe {
-                let fd = rustix::fd::BorrowedFd::borrow_raw_fd(self.extension_file.as_raw_fd());
-                let ptr = rustix::io::mmap(
-                    (self.base + initial_size_bytes) as *mut c_void,
-                    extension_map_len,
-                    rustix::io::ProtFlags::READ | rustix::io::ProtFlags::WRITE,
-                    rustix::io::MapFlags::PRIVATE | rustix::io::MapFlags::FIXED,
-                    &fd,
-                    0,
-                )
-                .map_err(|e| InstantiationError::Resource(e.into()))?;
-                assert_eq!(ptr as usize, self.base + initial_size_bytes);
-            }
-        }
-
-        // Finally, the initial memory image.
+        // The initial memory image, if given. If not, we just get a
+        // memory filled with zeroes.
         if let Some(image) = maybe_image {
             if image.len > 0 {
                 let image = image.clone();
@@ -216,31 +165,50 @@ impl MemFdSlot {
             }
         }

+        // mprotect above `initial_size_bytes`.
+        self.initial_size = initial_size_bytes;
+        self.protect_past_initial_size()
+            .map_err(|e| InstantiationError::Resource(e.into()))?;
+
         self.dirty = true;
         Ok(())
     }

     pub(crate) fn clear_and_remain_ready(&mut self) -> Result<()> {
         assert!(self.dirty);
-        // madvise the image range; that's it! This will throw away
-        // dirty pages, which are CoW-private pages on top of the
-        // initial heap image memfd.
+        // madvise the image range. This will throw away dirty pages,
+        // which are CoW-private pages on top of the initial heap
+        // image memfd.
         unsafe {
             rustix::io::madvise(
                 self.base as *mut c_void,
-                self.extension_offset,
+                self.static_size,
                 rustix::io::Advice::LinuxDontNeed,
             )?;
         }
-        // truncate the extension file down to zero bytes to reset heap length.
-        self.extension_file
-            .set_len(0)
-            .map_err(|e| InstantiationError::Resource(e.into()))?;
+
+        // mprotect the region beyond the initial heap size back to PROT_NONE.
+        self.protect_past_initial_size()?;
+
         self.dirty = false;
         Ok(())
     }

+    fn protect_past_initial_size(&self) -> Result<()> {
+        let mprotect_start = self.base + self.initial_size;
+        let mprotect_len = self.static_size - self.initial_size;
+        if mprotect_len > 0 {
+            unsafe {
+                rustix::io::mprotect(
+                    mprotect_start as *mut _,
+                    mprotect_len,
+                    rustix::io::MprotectFlags::empty(),
+                )?;
+            }
+        }
+        Ok(())
+    }
+
     pub(crate) fn has_image(&self) -> bool {
         self.image.is_some()
     }
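
To round out the lifecycle in this file: reusing a slot is madvise(MADV_DONTNEED) over the whole slot to drop the CoW-private dirty pages, then re-protecting the tail, exactly as `clear_and_remain_ready()` and `protect_past_initial_size()` do above. Below is a minimal sketch of that reset step, under the same assumptions as the earlier sketch; `reset_slot` is a made-up helper for illustration, not Wasmtime's API.

    use libc::c_void;
    use rustix::io::{madvise, mprotect, Advice, MprotectFlags};

    /// Reset a slot for reuse. `base`, `static_size`, and `initial_size`
    /// play the same roles as the fields of those names in the diff above.
    unsafe fn reset_slot(
        base: *mut c_void,
        static_size: usize,
        initial_size: usize,
    ) -> anyhow::Result<()> {
        // Drop all CoW-private (dirty) pages; subsequent reads see the
        // original backing again (zero pages, or the memfd image if any).
        madvise(base, static_size, Advice::LinuxDontNeed)?;

        // Re-arm the PROT_NONE tail so the next instance starts from a
        // clean heap limit and must grow through mprotect() again.
        mprotect(
            (base as usize + initial_size) as *mut _,
            static_size - initial_size,
            MprotectFlags::empty(),
        )?;
        Ok(())
    }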

@@ -19,7 +19,7 @@ pub struct MemFdSlot;
 #[cfg(not(feature = "memfd-allocator"))]
 #[allow(dead_code)]
 impl MemFdSlot {
-    pub(crate) fn create(_: *mut libc::c_void, _: usize) -> Result<Self, InstantiationError> {
+    pub(crate) fn create(_: *mut libc::c_void, _: usize) -> Self {
         panic!("create() on invalid MemFdSlot");
     }
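
The file above is the stub compiled when the memfd allocator feature is off; it must mirror every signature change made to the real type, which is why its create() also loses the Result here. A hypothetical, minimal illustration of that cfg-gated pattern (feature and type names invented, not Wasmtime's):

    pub struct Slot;

    #[cfg(feature = "fast-alloc")]
    impl Slot {
        pub fn create(size: usize) -> Self {
            // The real build would set up mappings for `size` bytes here.
            let _ = size;
            Slot
        }
    }

    #[cfg(not(feature = "fast-alloc"))]
    #[allow(dead_code)]
    impl Slot {
        pub fn create(_: usize) -> Self {
            // Call sites are themselves feature-gated, so this stub only
            // exists to keep the build compiling; it is never reached.
            panic!("create() on invalid Slot");
        }
    }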

@@ -529,7 +529,7 @@ impl InstancePool {
         if let Some(memfds) = maybe_memfds {
             let image = memfds.get_memory_image(defined_index);
-            let mut slot = memories.take_memfd_slot(instance_idx, memory_index)?;
+            let mut slot = memories.take_memfd_slot(instance_idx, memory_index);
             let initial_size = plan.memory.minimum * WASM_PAGE_SIZE as u64;

             // If instantiation fails, we can propagate the error
@@ -745,15 +745,11 @@ impl MemoryPool {
     /// Take ownership of the given memfd slot. Must be returned via
     /// `return_memfd_slot` when the instance is done using it.
-    fn take_memfd_slot(
-        &self,
-        instance_index: usize,
-        memory_index: MemoryIndex,
-    ) -> Result<MemFdSlot, InstantiationError> {
+    fn take_memfd_slot(&self, instance_index: usize, memory_index: MemoryIndex) -> MemFdSlot {
         let idx = instance_index * self.max_memories + (memory_index.as_u32() as usize);
         let maybe_slot = self.memfd_slots[idx].lock().unwrap().take();
-        maybe_slot.map(|slot| Ok(slot)).unwrap_or_else(|| {
+        maybe_slot.unwrap_or_else(|| {
             MemFdSlot::create(
                 self.get_base(instance_index, memory_index) as *mut c_void,
                 self.memory_size,