memfd: Reduce some syscalls in the on-demand case (#3757)
* memfd: Reduce some syscalls in the on-demand case This tweaks the internal organization of the `MemFdSlot` to avoid some syscalls in the default case as well as opportunistically in the pooling case. The two cases added here are: * A `MemFdSlot` is now created with a specified initial size. For pooling this is 0 but for the on-demand case this can be non-zero. * When `instantiate` is called with no prior image and the sizes match (as will be the case for on-demand allocation) then `mprotect` is skipped entirely. * In the `clear_and_remain-ready` case the `mprotect` is skipped if the heap wasn't grown at all. This should avoid ever using `mprotect` unnecessarily and makes the ranges we `mprotect` a bit smaller as well. * Review comments * Tweak allow to apply to whole crate
This commit is contained in:
@@ -754,6 +754,7 @@ impl MemoryPool {
|
|||||||
maybe_slot.unwrap_or_else(|| {
|
maybe_slot.unwrap_or_else(|| {
|
||||||
MemFdSlot::create(
|
MemFdSlot::create(
|
||||||
self.get_base(instance_index, memory_index) as *mut c_void,
|
self.get_base(instance_index, memory_index) as *mut c_void,
|
||||||
|
0,
|
||||||
self.memory_size,
|
self.memory_size,
|
||||||
)
|
)
|
||||||
})
|
})
|
||||||
|
|||||||
@@ -19,6 +19,7 @@
|
|||||||
clippy::use_self
|
clippy::use_self
|
||||||
)
|
)
|
||||||
)]
|
)]
|
||||||
|
#![cfg_attr(not(memfd), allow(unused_variables, unreachable_code))]
|
||||||
|
|
||||||
use std::sync::atomic::AtomicU64;
|
use std::sync::atomic::AtomicU64;
|
||||||
|
|
||||||
|
|||||||
@@ -310,13 +310,13 @@ pub struct MemFdSlot {
|
|||||||
impl MemFdSlot {
|
impl MemFdSlot {
|
||||||
/// Create a new MemFdSlot. Assumes that there is an anonymous
|
/// Create a new MemFdSlot. Assumes that there is an anonymous
|
||||||
/// mmap backing in the given range to start.
|
/// mmap backing in the given range to start.
|
||||||
pub(crate) fn create(base_addr: *mut c_void, static_size: usize) -> Self {
|
pub(crate) fn create(base_addr: *mut c_void, initial_size: usize, static_size: usize) -> Self {
|
||||||
let base = base_addr as usize;
|
let base = base_addr as usize;
|
||||||
MemFdSlot {
|
MemFdSlot {
|
||||||
base,
|
base,
|
||||||
static_size,
|
static_size,
|
||||||
initial_size: 0,
|
initial_size,
|
||||||
cur_size: 0,
|
cur_size: initial_size,
|
||||||
image: None,
|
image: None,
|
||||||
dirty: false,
|
dirty: false,
|
||||||
clear_on_drop: true,
|
clear_on_drop: true,
|
||||||
@@ -332,22 +332,11 @@ impl MemFdSlot {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn set_heap_limit(&mut self, size_bytes: usize) -> Result<()> {
|
pub(crate) fn set_heap_limit(&mut self, size_bytes: usize) -> Result<()> {
|
||||||
assert!(
|
|
||||||
size_bytes > self.cur_size,
|
|
||||||
"size_bytes = {} cur_size = {}",
|
|
||||||
size_bytes,
|
|
||||||
self.cur_size
|
|
||||||
);
|
|
||||||
// mprotect the relevant region.
|
// mprotect the relevant region.
|
||||||
let start = self.base + self.cur_size;
|
self.set_protection(
|
||||||
let len = size_bytes - self.cur_size;
|
self.cur_size..size_bytes,
|
||||||
unsafe {
|
rustix::io::MprotectFlags::READ | rustix::io::MprotectFlags::WRITE,
|
||||||
rustix::io::mprotect(
|
)?;
|
||||||
start as *mut _,
|
|
||||||
len,
|
|
||||||
rustix::io::MprotectFlags::READ | rustix::io::MprotectFlags::WRITE,
|
|
||||||
)?;
|
|
||||||
}
|
|
||||||
self.cur_size = size_bytes;
|
self.cur_size = size_bytes;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
@@ -359,13 +348,14 @@ impl MemFdSlot {
|
|||||||
maybe_image: Option<&Arc<MemoryMemFd>>,
|
maybe_image: Option<&Arc<MemoryMemFd>>,
|
||||||
) -> Result<(), InstantiationError> {
|
) -> Result<(), InstantiationError> {
|
||||||
assert!(!self.dirty);
|
assert!(!self.dirty);
|
||||||
|
assert_eq!(self.cur_size, self.initial_size);
|
||||||
|
|
||||||
// Fast-path: previously instantiated with the same image, or
|
// Fast-path: previously instantiated with the same image, or
|
||||||
// no image but the same initial size, so the mappings are
|
// no image but the same initial size, so the mappings are
|
||||||
// already correct; there is no need to mmap anything. Given
|
// already correct; there is no need to mmap anything. Given
|
||||||
// that we asserted not-dirty above, any dirty pages will have
|
// that we asserted not-dirty above, any dirty pages will have
|
||||||
// already been thrown away by madvise() during the previous
|
// already been thrown away by madvise() during the previous
|
||||||
// termination. The `clear_and_remain_ready()` path also
|
// termination. The `clear_and_remain_ready()` path also
|
||||||
// mprotects memory above the initial heap size back to
|
// mprotects memory above the initial heap size back to
|
||||||
// PROT_NONE, so we don't need to do that here.
|
// PROT_NONE, so we don't need to do that here.
|
||||||
if (self.image.is_none()
|
if (self.image.is_none()
|
||||||
@@ -377,46 +367,45 @@ impl MemFdSlot {
|
|||||||
== maybe_image.as_ref().unwrap().fd.as_file().as_raw_fd())
|
== maybe_image.as_ref().unwrap().fd.as_file().as_raw_fd())
|
||||||
{
|
{
|
||||||
self.dirty = true;
|
self.dirty = true;
|
||||||
self.cur_size = initial_size_bytes;
|
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
|
// Otherwise, we need to transition from the previous state to the
|
||||||
// Otherwise, we need to redo (i) the anonymous-mmap backing
|
// state now requested. An attempt is made here to minimize syscalls to
|
||||||
// for the whole slot, (ii) the initial-heap-image mapping if
|
// the kernel to ideally reduce the overhead of this as it's fairly
|
||||||
// present, and (iii) the mprotect(PROT_NONE) above the
|
// performance sensitive with memories. Note that the "previous state"
|
||||||
// initial heap size.
|
// is assumed to be post-initialization (e.g. after an mmap on-demand
|
||||||
|
// memory was created) or after `clear_and_remain_ready` was called
|
||||||
|
// which notably means that `madvise` has reset all the memory back to
|
||||||
|
// its original state.
|
||||||
|
//
|
||||||
// Security/audit note: we map all of these MAP_PRIVATE, so
|
// Security/audit note: we map all of these MAP_PRIVATE, so
|
||||||
// all instance data is local to the mapping, not propagated
|
// all instance data is local to the mapping, not propagated
|
||||||
// to the backing fd. We throw away this CoW overlay with
|
// to the backing fd. We throw away this CoW overlay with
|
||||||
// madvise() below, from base up to static_size (which is the
|
// madvise() below, from base up to static_size (which is the
|
||||||
// whole slot) when terminating the instance.
|
// whole slot) when terminating the instance.
|
||||||
|
|
||||||
// Anonymous mapping behind the initial heap size: this gives
|
|
||||||
// zeroes for any "holes" in the initial heap image. Anonymous
|
|
||||||
// mmap memory is faster to fault in than a CoW of a file,
|
|
||||||
// even a file with zero holes, because the kernel's CoW path
|
|
||||||
// unconditionally copies *something* (even if just a page of
|
|
||||||
// zeroes). Anonymous zero pages are fast: the kernel
|
|
||||||
// pre-zeroes them, and even if it runs out of those, a memset
|
|
||||||
// is half as expensive as a memcpy (only writes, no reads).
|
|
||||||
//
|
|
||||||
// We map these inaccessible at first then mprotect() the
|
|
||||||
// whole of the initial heap size to R+W below.
|
|
||||||
if self.image.is_some() {
|
if self.image.is_some() {
|
||||||
|
// In this case the state of memory at this time is that the memory
|
||||||
|
// from `0..self.initial_size` is reset back to its original state,
|
||||||
|
// but this memory contians a CoW image that is different from the
|
||||||
|
// one specified here. To reset state we first reset the mapping
|
||||||
|
// of memory to anonymous PROT_NONE memory, and then afterwards the
|
||||||
|
// heap is made visible with an mprotect.
|
||||||
self.reset_with_anon_memory()
|
self.reset_with_anon_memory()
|
||||||
.map_err(|e| InstantiationError::Resource(e.into()))?;
|
.map_err(|e| InstantiationError::Resource(e.into()))?;
|
||||||
|
self.set_protection(
|
||||||
|
0..initial_size_bytes,
|
||||||
|
rustix::io::MprotectFlags::READ | rustix::io::MprotectFlags::WRITE,
|
||||||
|
)
|
||||||
|
.map_err(|e| InstantiationError::Resource(e.into()))?;
|
||||||
} else if initial_size_bytes < self.initial_size {
|
} else if initial_size_bytes < self.initial_size {
|
||||||
// Special case: we can skip if the last instantiation had
|
// In this case the previous module had now CoW image which means
|
||||||
// no image. This means that the whole slot is filled with
|
// that the memory at `0..self.initial_size` is all zeros and
|
||||||
// an anonymous mmap backing (and it will have already
|
// read-write, everything afterwards being PROT_NONE.
|
||||||
// been cleared by the madvise). We may however need to
|
//
|
||||||
// mprotect(NONE) the space above `initial_size_bytes` if
|
// Our requested heap size is smaller than the previous heap size
|
||||||
// the last use of this slot left it larger. This also
|
// so all that's needed now is to shrink the heap further to
|
||||||
// lets us skip an mmap the first time a MemFdSlot is
|
// `initial_size_bytes`.
|
||||||
// used, because we require the caller to give us a fixed
|
|
||||||
// address in an already-mmaped-with-anon-memory
|
|
||||||
// region. This is important for the on-demand allocator.
|
|
||||||
//
|
//
|
||||||
// So we come in with:
|
// So we come in with:
|
||||||
// - anon-zero memory, R+W, [0, self.initial_size)
|
// - anon-zero memory, R+W, [0, self.initial_size)
|
||||||
@@ -432,8 +421,30 @@ impl MemFdSlot {
|
|||||||
rustix::io::MprotectFlags::empty(),
|
rustix::io::MprotectFlags::empty(),
|
||||||
)
|
)
|
||||||
.map_err(|e| InstantiationError::Resource(e.into()))?;
|
.map_err(|e| InstantiationError::Resource(e.into()))?;
|
||||||
|
} else if initial_size_bytes > self.initial_size {
|
||||||
|
// In this case, like the previous one, the previous module had no
|
||||||
|
// CoW image but had a smaller heap than desired for this module.
|
||||||
|
// That means that here `mprotect` is used to make the new pages
|
||||||
|
// read/write, and since they're all reset from before they'll be
|
||||||
|
// made visible as zeros.
|
||||||
|
self.set_protection(
|
||||||
|
self.initial_size..initial_size_bytes,
|
||||||
|
rustix::io::MprotectFlags::READ | rustix::io::MprotectFlags::WRITE,
|
||||||
|
)
|
||||||
|
.map_err(|e| InstantiationError::Resource(e.into()))?;
|
||||||
|
} else {
|
||||||
|
// The final case here is that the previous module has no CoW image
|
||||||
|
// so the previous heap is all zeros. The previous heap is the exact
|
||||||
|
// same size as the requested heap, so no syscalls are needed to do
|
||||||
|
// anything else.
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// The memory image, at this point, should have `initial_size_bytes` of
|
||||||
|
// zeros starting at `self.base` followed by inaccessible memory to
|
||||||
|
// `self.static_size`. Update sizing fields to reflect this.
|
||||||
|
self.initial_size = initial_size_bytes;
|
||||||
|
self.cur_size = initial_size_bytes;
|
||||||
|
|
||||||
// The initial memory image, if given. If not, we just get a
|
// The initial memory image, if given. If not, we just get a
|
||||||
// memory filled with zeroes.
|
// memory filled with zeroes.
|
||||||
if let Some(image) = maybe_image {
|
if let Some(image) = maybe_image {
|
||||||
@@ -455,17 +466,8 @@ impl MemFdSlot {
|
|||||||
}
|
}
|
||||||
|
|
||||||
self.image = maybe_image.cloned();
|
self.image = maybe_image.cloned();
|
||||||
|
|
||||||
// mprotect the initial `initial_size_bytes` to be accessible.
|
|
||||||
self.initial_size = initial_size_bytes;
|
|
||||||
self.cur_size = initial_size_bytes;
|
|
||||||
self.set_protection(
|
|
||||||
0..initial_size_bytes,
|
|
||||||
rustix::io::MprotectFlags::READ | rustix::io::MprotectFlags::WRITE,
|
|
||||||
)
|
|
||||||
.map_err(|e| InstantiationError::Resource(e.into()))?;
|
|
||||||
|
|
||||||
self.dirty = true;
|
self.dirty = true;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -477,21 +479,23 @@ impl MemFdSlot {
|
|||||||
unsafe {
|
unsafe {
|
||||||
rustix::io::madvise(
|
rustix::io::madvise(
|
||||||
self.base as *mut c_void,
|
self.base as *mut c_void,
|
||||||
self.static_size,
|
self.cur_size,
|
||||||
rustix::io::Advice::LinuxDontNeed,
|
rustix::io::Advice::LinuxDontNeed,
|
||||||
)?;
|
)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
// mprotect the initial heap region beyond the initial heap size back to PROT_NONE.
|
// mprotect the initial heap region beyond the initial heap size back to PROT_NONE.
|
||||||
self.set_protection(
|
self.set_protection(
|
||||||
self.initial_size..self.static_size,
|
self.initial_size..self.cur_size,
|
||||||
rustix::io::MprotectFlags::empty(),
|
rustix::io::MprotectFlags::empty(),
|
||||||
)?;
|
)?;
|
||||||
|
self.cur_size = self.initial_size;
|
||||||
self.dirty = false;
|
self.dirty = false;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn set_protection(&self, range: Range<usize>, flags: rustix::io::MprotectFlags) -> Result<()> {
|
fn set_protection(&self, range: Range<usize>, flags: rustix::io::MprotectFlags) -> Result<()> {
|
||||||
|
assert!(range.start <= range.end);
|
||||||
assert!(range.end <= self.static_size);
|
assert!(range.end <= self.static_size);
|
||||||
let mprotect_start = self.base.checked_add(range.start).unwrap();
|
let mprotect_start = self.base.checked_add(range.start).unwrap();
|
||||||
if range.len() > 0 {
|
if range.len() > 0 {
|
||||||
@@ -601,7 +605,7 @@ mod test {
|
|||||||
// 4 MiB mmap'd area, not accessible
|
// 4 MiB mmap'd area, not accessible
|
||||||
let mut mmap = Mmap::accessible_reserved(0, 4 << 20).unwrap();
|
let mut mmap = Mmap::accessible_reserved(0, 4 << 20).unwrap();
|
||||||
// Create a MemFdSlot on top of it
|
// Create a MemFdSlot on top of it
|
||||||
let mut memfd = MemFdSlot::create(mmap.as_mut_ptr() as *mut _, 4 << 20);
|
let mut memfd = MemFdSlot::create(mmap.as_mut_ptr() as *mut _, 0, 4 << 20);
|
||||||
memfd.no_clear_on_drop();
|
memfd.no_clear_on_drop();
|
||||||
assert!(!memfd.is_dirty());
|
assert!(!memfd.is_dirty());
|
||||||
// instantiate with 64 KiB initial size
|
// instantiate with 64 KiB initial size
|
||||||
@@ -637,7 +641,7 @@ mod test {
|
|||||||
// 4 MiB mmap'd area, not accessible
|
// 4 MiB mmap'd area, not accessible
|
||||||
let mut mmap = Mmap::accessible_reserved(0, 4 << 20).unwrap();
|
let mut mmap = Mmap::accessible_reserved(0, 4 << 20).unwrap();
|
||||||
// Create a MemFdSlot on top of it
|
// Create a MemFdSlot on top of it
|
||||||
let mut memfd = MemFdSlot::create(mmap.as_mut_ptr() as *mut _, 4 << 20);
|
let mut memfd = MemFdSlot::create(mmap.as_mut_ptr() as *mut _, 0, 4 << 20);
|
||||||
memfd.no_clear_on_drop();
|
memfd.no_clear_on_drop();
|
||||||
// Create an image with some data.
|
// Create an image with some data.
|
||||||
let image = Arc::new(create_memfd_with_data(4096, &[1, 2, 3, 4]).unwrap());
|
let image = Arc::new(create_memfd_with_data(4096, &[1, 2, 3, 4]).unwrap());
|
||||||
|
|||||||
@@ -38,11 +38,11 @@ impl ModuleMemFds {
|
|||||||
/// places (e.g. a `Memory`), we define a zero-sized type when memfd is
|
/// places (e.g. a `Memory`), we define a zero-sized type when memfd is
|
||||||
/// not included in the build.
|
/// not included in the build.
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub struct MemFdSlot;
|
pub enum MemFdSlot {}
|
||||||
|
|
||||||
#[allow(dead_code)]
|
#[allow(dead_code)]
|
||||||
impl MemFdSlot {
|
impl MemFdSlot {
|
||||||
pub(crate) fn create(_: *mut libc::c_void, _: usize) -> Self {
|
pub(crate) fn create(_: *mut libc::c_void, _: usize, _: usize) -> Self {
|
||||||
panic!("create() on invalid MemFdSlot");
|
panic!("create() on invalid MemFdSlot");
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -51,24 +51,26 @@ impl MemFdSlot {
|
|||||||
_: usize,
|
_: usize,
|
||||||
_: Option<&Arc<MemoryMemFd>>,
|
_: Option<&Arc<MemoryMemFd>>,
|
||||||
) -> Result<Self, InstantiationError> {
|
) -> Result<Self, InstantiationError> {
|
||||||
panic!("instantiate() on invalid MemFdSlot");
|
match *self {}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn no_clear_on_drop(&mut self) {}
|
pub(crate) fn no_clear_on_drop(&mut self) {
|
||||||
|
match *self {}
|
||||||
|
}
|
||||||
|
|
||||||
pub(crate) fn clear_and_remain_ready(&mut self) -> Result<()> {
|
pub(crate) fn clear_and_remain_ready(&mut self) -> Result<()> {
|
||||||
Ok(())
|
match *self {}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn has_image(&self) -> bool {
|
pub(crate) fn has_image(&self) -> bool {
|
||||||
false
|
match *self {}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn is_dirty(&self) -> bool {
|
pub(crate) fn is_dirty(&self) -> bool {
|
||||||
false
|
match *self {}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn set_heap_limit(&mut self, _: usize) -> Result<()> {
|
pub(crate) fn set_heap_limit(&mut self, _: usize) -> Result<()> {
|
||||||
panic!("set_heap_limit on invalid MemFdSlot");
|
match *self {}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -158,9 +158,8 @@ impl MmapMemory {
|
|||||||
// If a memfd image was specified, try to create the MemFdSlot on top of our mmap.
|
// If a memfd image was specified, try to create the MemFdSlot on top of our mmap.
|
||||||
let memfd = match memfd_image {
|
let memfd = match memfd_image {
|
||||||
Some(image) => {
|
Some(image) => {
|
||||||
let base = unsafe { mmap.as_mut_ptr().offset(pre_guard_bytes as isize) };
|
let base = unsafe { mmap.as_mut_ptr().add(pre_guard_bytes) };
|
||||||
let len = request_bytes - pre_guard_bytes;
|
let mut memfd_slot = MemFdSlot::create(base.cast(), minimum, alloc_bytes);
|
||||||
let mut memfd_slot = MemFdSlot::create(base as *mut _, len);
|
|
||||||
memfd_slot.instantiate(minimum, Some(image))?;
|
memfd_slot.instantiate(minimum, Some(image))?;
|
||||||
// On drop, we will unmap our mmap'd range that this
|
// On drop, we will unmap our mmap'd range that this
|
||||||
// memfd_slot was mapped on top of, so there is no
|
// memfd_slot was mapped on top of, so there is no
|
||||||
|
|||||||
Reference in New Issue
Block a user