diff --git a/crates/fuzzing/src/generators/pooling_config.rs b/crates/fuzzing/src/generators/pooling_config.rs
index f6a49a5e7b..f0cfef18bd 100644
--- a/crates/fuzzing/src/generators/pooling_config.rs
+++ b/crates/fuzzing/src/generators/pooling_config.rs
@@ -14,6 +14,9 @@ pub struct PoolingAllocationConfig {
     pub instance_table_elements: u32,
     pub instance_size: usize,
     pub async_stack_zeroing: bool,
+    pub async_stack_keep_resident: usize,
+    pub linear_memory_keep_resident: usize,
+    pub table_keep_resident: usize,
 }
 
 impl PoolingAllocationConfig {
@@ -28,7 +31,10 @@ impl PoolingAllocationConfig {
             .instance_memory_pages(self.instance_memory_pages)
             .instance_table_elements(self.instance_table_elements)
             .instance_size(self.instance_size)
-            .async_stack_zeroing(self.async_stack_zeroing);
+            .async_stack_zeroing(self.async_stack_zeroing)
+            .async_stack_keep_resident(self.async_stack_keep_resident)
+            .linear_memory_keep_resident(self.linear_memory_keep_resident)
+            .table_keep_resident(self.table_keep_resident);
         cfg
     }
 }
@@ -51,6 +57,9 @@ impl<'a> Arbitrary<'a> for PoolingAllocationConfig {
             instance_count: u.int_in_range(1..=MAX_COUNT)?,
             instance_size: u.int_in_range(0..=MAX_SIZE)?,
             async_stack_zeroing: u.arbitrary()?,
+            async_stack_keep_resident: u.int_in_range(0..=1 << 20)?,
+            linear_memory_keep_resident: u.int_in_range(0..=1 << 20)?,
+            table_keep_resident: u.int_in_range(0..=1 << 20)?,
         })
     }
 }
diff --git a/crates/runtime/src/cow.rs b/crates/runtime/src/cow.rs
index a4364e6f36..f40edc72e8 100644
--- a/crates/runtime/src/cow.rs
+++ b/crates/runtime/src/cow.rs
@@ -466,39 +466,23 @@ impl MemoryImageSlot {
         Ok(())
     }
 
+    /// Resets this linear memory slot back to a "pristine state".
+    ///
+    /// This will reset the memory back to its original contents on Linux or
+    /// reset the contents back to zero on other platforms. The `keep_resident`
+    /// argument is the maximum amount of memory to keep resident in this
+    /// process's memory on Linux. Up to that much memory will be `memset` to
+    /// zero, while the rest of it will be reset or released with `madvise`.
     #[allow(dead_code)] // ignore warnings as this is only used in some cfgs
-    pub(crate) fn clear_and_remain_ready(&mut self) -> Result<()> {
+    pub(crate) fn clear_and_remain_ready(&mut self, keep_resident: usize) -> Result<()> {
         assert!(self.dirty);
-        cfg_if::cfg_if! {
-            if #[cfg(target_os = "linux")] {
-                // On Linux we can use `madvise` to reset the virtual memory
-                // back to its original state. This means back to all zeros for
-                // anonymous-backed pages and back to the original contents for
-                // CoW memory (the initial heap image). This has the precise
-                // semantics we want for reuse between instances, so it's all we
-                // need to do.
-                unsafe {
-                    rustix::mm::madvise(
-                        self.base as *mut c_void,
-                        self.cur_size,
-                        rustix::mm::Advice::LinuxDontNeed,
-                    )?;
-                }
-            } else {
-                // If we're not on Linux, however, then there's no generic
-                // platform way to reset memory back to its original state, so
-                // instead this is "feigned" by resetting memory back to
-                // entirely zeros with an anonymous backing.
-                //
-                // Additionally the previous image, if any, is dropped here
-                // since it's no longer applicable to this mapping.
-                self.reset_with_anon_memory()?;
-                self.image = None;
-            }
+        unsafe {
+            self.reset_all_memory_contents(keep_resident)?;
         }
 
-        // mprotect the initial heap region beyond the initial heap size back to PROT_NONE.
+        // mprotect the initial heap region beyond the initial heap size back
+        // to PROT_NONE.
         self.set_protection(
             self.initial_size..self.cur_size,
             rustix::mm::MprotectFlags::empty(),
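
The hunk above replaces the unconditional whole-slot `madvise` with a split reset: `memset` a hot prefix so those pages stay resident, then `madvise` the cold remainder back to the kernel. Below is a minimal standalone sketch of that split — illustrative only, not code from the patch — assuming Linux and the `libc` crate; `reset_region` is a made-up name (the real code lives in `MemoryImageSlot` and uses `rustix`):

```rust
use std::ptr;

// Hypothetical helper mirroring the new reset split: zero the first
// `keep_resident` bytes by hand, hand the rest back to the kernel.
unsafe fn reset_region(base: *mut u8, len: usize, keep_resident: usize) {
    // Zeroing the hot prefix manually keeps those pages resident, so the
    // next instance takes no page faults on them.
    let memset_len = keep_resident.min(len);
    ptr::write_bytes(base, 0, memset_len);

    // For anonymous memory MADV_DONTNEED leaves zero-fill-on-demand pages
    // behind; for private file-backed (CoW) mappings it restores the
    // original file contents on the next fault.
    let rest = len - memset_len;
    if rest != 0 {
        let rc = libc::madvise(base.add(memset_len).cast(), rest, libc::MADV_DONTNEED);
        assert_eq!(rc, 0);
    }
}

fn main() {
    unsafe {
        // An anonymous demo mapping standing in for a pooled memory slot.
        let len = 1 << 20;
        let ret = libc::mmap(
            ptr::null_mut(),
            len,
            libc::PROT_READ | libc::PROT_WRITE,
            libc::MAP_PRIVATE | libc::MAP_ANONYMOUS,
            -1,
            0,
        );
        assert_ne!(ret, libc::MAP_FAILED);
        let base = ret as *mut u8;

        base.write_bytes(0xff, len); // dirty the whole slot
        reset_region(base, len, 64 << 10); // keep 64 KiB resident
        assert_eq!(*base, 0); // memset portion is zero
        assert_eq!(*base.add(len - 1), 0); // madvised portion reads back zero
        libc::munmap(ret, len);
    }
}
```
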
@@ -508,6 +492,136 @@ impl MemoryImageSlot {
         Ok(())
     }
 
+    #[allow(dead_code)] // ignore warnings as this is only used in some cfgs
+    unsafe fn reset_all_memory_contents(&mut self, keep_resident: usize) -> Result<()> {
+        if !cfg!(target_os = "linux") {
+            // If we're not on Linux then there's no generic platform way to
+            // reset memory back to its original state, so instead reset memory
+            // back to entirely zeros with an anonymous backing.
+            //
+            // Additionally the previous image, if any, is dropped here
+            // since it's no longer applicable to this mapping.
+            return self.reset_with_anon_memory();
+        }
+
+        match &self.image {
+            Some(image) => {
+                assert!(self.cur_size >= image.linear_memory_offset + image.len);
+                if image.linear_memory_offset < keep_resident {
+                    // If the image starts below the `keep_resident` threshold
+                    // then memory looks something like this:
+                    //
+                    //               up to `keep_resident` bytes
+                    //                          |
+                    //          +--------------------------+  remaining_memset
+                    //          |                          | /
+                    //          <--------------------------> <-------->
+                    //
+                    //                              image_end
+                    // 0        linear_memory_offset |               cur_size
+                    // |                |            |                  |
+                    // +----------------+------------+---------+--------+
+                    // |  dirty memory  |   image    |   dirty memory   |
+                    // +----------------+------------+---------+--------+
+                    //
+                    // <------+-------> <-----+----> <---+---> <---+--->
+                    //        |               |          |         |
+                    //        |               |          |         |
+                    //   memset (1)      madvise (2) memset (3) madvise (4)
+                    //
+                    // In this situation there are two disjoint regions that
+                    // are `memset` manually to zero. Note that `memset (3)`
+                    // may be zero bytes large. Furthermore `madvise (4)` may
+                    // also be zero bytes large.
+
+                    let image_end = image.linear_memory_offset + image.len;
+                    let mem_after_image = self.cur_size - image_end;
+                    let remaining_memset =
+                        (keep_resident - image.linear_memory_offset).min(mem_after_image);
+
+                    // This is memset (1)
+                    std::ptr::write_bytes(self.base as *mut u8, 0u8, image.linear_memory_offset);
+
+                    // This is madvise (2)
+                    self.madvise_reset(image.linear_memory_offset, image.len)?;
+
+                    // This is memset (3)
+                    std::ptr::write_bytes(
+                        (self.base + image_end) as *mut u8,
+                        0u8,
+                        remaining_memset,
+                    );
+
+                    // This is madvise (4)
+                    self.madvise_reset(
+                        image_end + remaining_memset,
+                        mem_after_image - remaining_memset,
+                    )?;
+                } else {
+                    // If the image starts after the `keep_resident` threshold
+                    // then we memset the start of linear memory and then use
+                    // madvise below for the rest of it, including the image.
+                    //
+                    // 0          keep_resident                     cur_size
+                    // |                |                              |
+                    // +----------------+---+----------+--------------+
+                    // |     dirty memory   |  image   | dirty memory |
+                    // +----------------+---+----------+--------------+
+                    //
+                    // <------+-------> <-------------+-------------->
+                    //        |                       |
+                    //        |                       |
+                    //   memset (1)              madvise (2)
+                    //
+                    // Here only a single memset is necessary since the image
+                    // started after the threshold which we're keeping resident.
+                    // Note that the memset may be zero bytes here.
 
+                    // This is memset (1)
+                    std::ptr::write_bytes(self.base as *mut u8, 0u8, keep_resident);
+
+                    // This is madvise (2)
+                    self.madvise_reset(keep_resident, self.cur_size - keep_resident)?;
+                }
+            }
+
+            // If there's no memory image for this slot then memset the first
+            // bytes in the memory back to zero while using `madvise` to purge
+            // the rest.
+            None => {
+                let size_to_memset = keep_resident.min(self.cur_size);
+                std::ptr::write_bytes(self.base as *mut u8, 0u8, size_to_memset);
+                self.madvise_reset(size_to_memset, self.cur_size - size_to_memset)?;
+            }
+        }
+
+        Ok(())
+    }
+
+    #[allow(dead_code)] // ignore warnings as this is only used in some cfgs
+    unsafe fn madvise_reset(&self, base: usize, len: usize) -> Result<()> {
+        assert!(base + len <= self.cur_size);
+        if len == 0 {
+            return Ok(());
+        }
+        cfg_if::cfg_if! {
+            if #[cfg(target_os = "linux")] {
+                rustix::mm::madvise(
+                    (self.base + base) as *mut c_void,
+                    len,
+                    rustix::mm::Advice::LinuxDontNeed,
+                )?;
+                Ok(())
+            } else {
+                unreachable!();
+            }
+        }
+    }
+
     fn set_protection(&self, range: Range<usize>, flags: rustix::mm::MprotectFlags) -> Result<()> {
         assert!(range.start <= range.end);
         assert!(range.end <= self.static_size);
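
The arithmetic behind the two diagrams above can be restated as plain range computations. `regions` below is a hypothetical helper (not in the patch) returning the `memset (1)`, `madvise (2)`, `memset (3)`, and `madvise (4)` byte ranges for the first branch, where the image starts below the `keep_resident` threshold:

```rust
use std::ops::Range;

fn regions(
    image_start: usize, // image.linear_memory_offset
    image_len: usize,
    cur_size: usize,
    keep_resident: usize,
) -> [Range<usize>; 4] {
    assert!(image_start < keep_resident);
    assert!(image_start + image_len <= cur_size);
    let image_end = image_start + image_len;
    let mem_after_image = cur_size - image_end;
    // The image itself is madvised rather than memset, so it doesn't consume
    // the `keep_resident` budget; only the dirty bytes in front of it do.
    let remaining_memset = (keep_resident - image_start).min(mem_after_image);
    [
        0..image_start,                          // memset (1)
        image_start..image_end,                  // madvise (2): restores image
        image_end..image_end + remaining_memset, // memset (3)
        image_end + remaining_memset..cur_size,  // madvise (4)
    ]
}

fn main() {
    // A 4 KiB image at offset 4 KiB in a 64 KiB slot, keeping 8 KiB resident.
    let [m1, a2, m3, a4] = regions(4096, 4096, 64 << 10, 8 << 10);
    assert_eq!(m1, 0..4096); // dirty bytes before the image
    assert_eq!(a2, 4096..8192); // the image itself
    assert_eq!(m3, 8192..12288); // 8 KiB budget minus the 4 KiB used by (1)
    assert_eq!(a4, 12288..65536); // everything else is released
}
```

Note that the four ranges always tile `0..cur_size`, so every byte of the slot is either zeroed in place or handed back to the kernel.
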
@@ -532,7 +646,7 @@ impl MemoryImageSlot {
 
     /// Map anonymous zeroed memory across the whole slot,
     /// inaccessible. Used both during instantiate and during drop.
-    fn reset_with_anon_memory(&self) -> Result<()> {
+    fn reset_with_anon_memory(&mut self) -> Result<()> {
         unsafe {
             let ptr = rustix::mm::mmap_anonymous(
                 self.base as *mut c_void,
@@ -542,6 +656,11 @@ impl MemoryImageSlot {
             )?;
             assert_eq!(ptr as usize, self.base);
         }
+
+        self.image = None;
+        self.cur_size = 0;
+        self.initial_size = 0;
+
         Ok(())
     }
 }
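
Both `madvise_reset` and the `reset_with_anon_memory` fallback above leave the slot "pristine"; the Linux path is cheaper because `MADV_DONTNEED` has exactly the right per-mapping-type semantics. A small Linux-only demo of the property the patch relies on — illustrative, assuming the `libc` crate, not code from the patch:

```rust
use std::ptr;

fn main() {
    unsafe {
        // A memfd holding 4 KiB of "image" contents, standing in for an
        // initial heap image.
        let fd = libc::memfd_create(b"image\0".as_ptr().cast(), 0);
        assert!(fd >= 0);
        let page: usize = 4096;
        assert_eq!(libc::ftruncate(fd, page as libc::off_t), 0);
        let data = [1u8, 2, 3, 4];
        assert_eq!(
            libc::pwrite(fd, data.as_ptr().cast(), data.len(), 0),
            data.len() as isize
        );

        // Map it privately (copy-on-write), as the pooling allocator does.
        let ret = libc::mmap(
            ptr::null_mut(),
            page,
            libc::PROT_READ | libc::PROT_WRITE,
            libc::MAP_PRIVATE,
            fd,
            0,
        );
        assert_ne!(ret, libc::MAP_FAILED);
        let base = ret as *mut u8;

        // Dirty the CoW page, as a guest instance would.
        *base = 5;
        assert_eq!(*base, 5);

        // "Reset" the slot: the private dirty copy is discarded...
        assert_eq!(libc::madvise(ret, page, libc::MADV_DONTNEED), 0);

        // ...and the original image contents are visible again, with no
        // explicit re-mapping or copying.
        assert_eq!(std::slice::from_raw_parts(base, 4), &[1, 2, 3, 4]);

        libc::munmap(ret, page);
        libc::close(fd);
    }
}
```
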
@@ -638,7 +757,7 @@ mod test {
         assert_eq!(0, slice[131071]);
         // instantiate again; we should see zeroes, even as the
         // reuse-anon-mmap-opt kicks in
-        memfd.clear_and_remain_ready().unwrap();
+        memfd.clear_and_remain_ready(0).unwrap();
         assert!(!memfd.is_dirty());
         memfd.instantiate(64 << 10, None).unwrap();
         let slice = mmap.as_slice();
@@ -661,33 +780,69 @@ mod test {
         assert_eq!(&[1, 2, 3, 4], &slice[4096..4100]);
         slice[4096] = 5;
         // Clear and re-instantiate same image
-        memfd.clear_and_remain_ready().unwrap();
+        memfd.clear_and_remain_ready(0).unwrap();
         memfd.instantiate(64 << 10, Some(&image)).unwrap();
         let slice = mmap.as_slice();
         // Should not see mutation from above
         assert_eq!(&[1, 2, 3, 4], &slice[4096..4100]);
         // Clear and re-instantiate no image
-        memfd.clear_and_remain_ready().unwrap();
+        memfd.clear_and_remain_ready(0).unwrap();
         memfd.instantiate(64 << 10, None).unwrap();
         assert!(!memfd.has_image());
         let slice = mmap.as_slice();
         assert_eq!(&[0, 0, 0, 0], &slice[4096..4100]);
         // Clear and re-instantiate image again
-        memfd.clear_and_remain_ready().unwrap();
+        memfd.clear_and_remain_ready(0).unwrap();
         memfd.instantiate(64 << 10, Some(&image)).unwrap();
         let slice = mmap.as_slice();
         assert_eq!(&[1, 2, 3, 4], &slice[4096..4100]);
         // Create another image with different data.
         let image2 = Arc::new(create_memfd_with_data(4096, &[10, 11, 12, 13]).unwrap());
-        memfd.clear_and_remain_ready().unwrap();
+        memfd.clear_and_remain_ready(0).unwrap();
         memfd.instantiate(128 << 10, Some(&image2)).unwrap();
         let slice = mmap.as_slice();
         assert_eq!(&[10, 11, 12, 13], &slice[4096..4100]);
         // Instantiate the original image again; we should notice it's
         // a different image and not reuse the mappings.
-        memfd.clear_and_remain_ready().unwrap();
+        memfd.clear_and_remain_ready(0).unwrap();
         memfd.instantiate(64 << 10, Some(&image)).unwrap();
         let slice = mmap.as_slice();
         assert_eq!(&[1, 2, 3, 4], &slice[4096..4100]);
     }
+
+    #[test]
+    #[cfg(target_os = "linux")]
+    fn memset_instead_of_madvise() {
+        let mut mmap = Mmap::accessible_reserved(0, 4 << 20).unwrap();
+        let mut memfd = MemoryImageSlot::create(mmap.as_mut_ptr() as *mut _, 0, 4 << 20);
+        memfd.no_clear_on_drop();
+
+        // Test basics with the image
+        for image_off in [0, 4096, 8 << 10] {
+            let image = Arc::new(create_memfd_with_data(image_off, &[1, 2, 3, 4]).unwrap());
+            for amt_to_memset in [0, 4096, 10 << 12, 1 << 20, 10 << 20] {
+                memfd.instantiate(64 << 10, Some(&image)).unwrap();
+                assert!(memfd.has_image());
+                let slice = mmap.as_mut_slice();
+                if image_off > 0 {
+                    assert_eq!(slice[image_off - 1], 0);
+                }
+                assert_eq!(slice[image_off + 5], 0);
+                assert_eq!(&[1, 2, 3, 4], &slice[image_off..][..4]);
+                slice[image_off] = 5;
+                assert_eq!(&[5, 2, 3, 4], &slice[image_off..][..4]);
+                memfd.clear_and_remain_ready(amt_to_memset).unwrap();
+            }
+        }
+
+        // Test without an image
+        for amt_to_memset in [0, 4096, 10 << 12, 1 << 20, 10 << 20] {
+            memfd.instantiate(64 << 10, None).unwrap();
+            for chunk in mmap.as_mut_slice()[..64 << 10].chunks_mut(1024) {
+                assert_eq!(chunk[0], 0);
+                chunk[0] = 5;
+            }
+            memfd.clear_and_remain_ready(amt_to_memset).unwrap();
+        }
+    }
 }
diff --git a/crates/runtime/src/cow_disabled.rs b/crates/runtime/src/cow_disabled.rs
index be06a9f4a1..63a92bd0ce 100644
--- a/crates/runtime/src/cow_disabled.rs
+++ b/crates/runtime/src/cow_disabled.rs
@@ -57,7 +57,7 @@ impl MemoryImageSlot {
         unreachable!();
     }
 
-    pub(crate) fn clear_and_remain_ready(&mut self) -> Result<()> {
+    pub(crate) fn clear_and_remain_ready(&mut self, _keep_resident: usize) -> Result<()> {
         unreachable!();
     }
diff --git a/crates/runtime/src/instance/allocator/pooling.rs b/crates/runtime/src/instance/allocator/pooling.rs
index 424708d5b0..d938cd295f 100644
--- a/crates/runtime/src/instance/allocator/pooling.rs
+++ b/crates/runtime/src/instance/allocator/pooling.rs
@@ -126,6 +126,8 @@ struct InstancePool {
     index_allocator: Mutex<PoolingAllocationState>,
     memories: MemoryPool,
     tables: TablePool,
+    linear_memory_keep_resident: usize,
+    table_keep_resident: usize,
 }
 
 impl InstancePool {
@@ -156,6 +158,8 @@ impl InstancePool {
             )),
             memories: MemoryPool::new(&config.limits, tunables)?,
             tables: TablePool::new(&config.limits)?,
+            linear_memory_keep_resident: config.linear_memory_keep_resident,
+            table_keep_resident: config.table_keep_resident,
         };
 
         Ok(pool)
     }
@@ -373,7 +377,10 @@ impl InstancePool {
                 // image, just drop it here, and let the drop handler for the
                 // slot unmap in a way that retains the address space
                 // reservation.
-            if image.clear_and_remain_ready().is_ok() {
+            if image
+                .clear_and_remain_ready(self.linear_memory_keep_resident)
+                .is_ok()
+            {
                 self.memories
                     .return_memory_image_slot(instance_index, def_mem_idx, image);
             }
@@ -437,10 +444,20 @@ impl InstancePool {
             );
 
             drop(table);
-            decommit_table_pages(base, size).expect("failed to decommit table pages");
+            self.reset_table_pages_to_zero(base, size)
+                .expect("failed to decommit table pages");
         }
     }
 
+    fn reset_table_pages_to_zero(&self, base: *mut u8, size: usize) -> Result<()> {
+        let size_to_memset = size.min(self.table_keep_resident);
+        unsafe {
+            std::ptr::write_bytes(base, 0, size_to_memset);
+            decommit_table_pages(base.add(size_to_memset), size - size_to_memset)?;
+        }
+        Ok(())
+    }
+
     fn validate_table_plans(&self, module: &Module) -> Result<()> {
         let tables = module.table_plans.len() - module.num_imported_tables;
         if tables > self.tables.max_tables {
@@ -807,6 +824,7 @@ struct StackPool {
     page_size: usize,
     index_allocator: Mutex<PoolingAllocationState>,
     async_stack_zeroing: bool,
+    async_stack_keep_resident: usize,
 }
 
 #[cfg(all(feature = "async", unix))]
 impl StackPool {
@@ -852,6 +870,7 @@ impl StackPool {
             max_instances,
             page_size,
             async_stack_zeroing: config.async_stack_zeroing,
+            async_stack_keep_resident: config.async_stack_keep_resident,
             // We always use a `NextAvailable` strategy for stack
             // allocation. We don't want or need an affinity policy
             // here: stacks do not benefit from being allocated to the
@@ -919,11 +938,32 @@ impl StackPool {
         assert!(index < self.max_instances);
 
         if self.async_stack_zeroing {
-            reset_stack_pages_to_zero(bottom_of_stack as _, stack_size).unwrap();
+            self.zero_stack(bottom_of_stack, stack_size);
         }
 
         self.index_allocator.lock().unwrap().free(SlotId(index));
     }
+
+    fn zero_stack(&self, bottom: usize, size: usize) {
+        // Manually zero the top of the stack to keep the pages resident in
+        // memory and avoid future page faults. Use the system to deallocate
+        // pages past this. This hopefully strikes a reasonable balance between:
+        //
+        // * memset for the whole range is probably expensive
+        // * madvise for the whole range incurs expensive future page faults
+        // * most threads probably don't use most of the stack anyway
+        let size_to_memset = size.min(self.async_stack_keep_resident);
+        unsafe {
+            std::ptr::write_bytes(
+                (bottom + size - size_to_memset) as *mut u8,
+                0,
+                size_to_memset,
+            );
+        }
+
+        // Use the system to reset remaining stack pages to zero.
+        reset_stack_pages_to_zero(bottom as _, size - size_to_memset).unwrap();
+    }
 }
 
 /// Configuration options for the pooling instance allocator supplied at
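
For stacks the split is inverted relative to linear memories: stacks grow downward, so the recently used (hot) pages sit at the highest addresses of the allocation, and that is the end `zero_stack` above memsets. A sketch of the range arithmetic (`stack_ranges` is a hypothetical name, not part of the patch):

```rust
use std::ops::Range;

// Returns (memset range, decommit range) for a stack allocation, mirroring
// the arithmetic in `zero_stack`.
fn stack_ranges(bottom: usize, size: usize, keep_resident: usize) -> (Range<usize>, Range<usize>) {
    let size_to_memset = size.min(keep_resident);
    let memset = (bottom + size - size_to_memset)..(bottom + size); // hot top of stack
    let decommit = bottom..(bottom + size - size_to_memset); // cold deep-stack pages
    (memset, decommit)
}

fn main() {
    // A 2 MiB stack at a made-up base address, keeping 64 KiB resident.
    let (memset, decommit) = stack_ranges(0x7000_0000, 2 << 20, 64 << 10);
    assert_eq!(memset, 0x701F_0000..0x7020_0000); // top 64 KiB, zeroed in place
    assert_eq!(decommit, 0x7000_0000..0x701F_0000); // handed back to the kernel
}
```
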
@@ -940,6 +980,22 @@ pub struct PoolingInstanceAllocatorConfig {
     pub limits: InstanceLimits,
     /// Whether or not async stacks are zeroed after use.
     pub async_stack_zeroing: bool,
+    /// If async stack zeroing is enabled and the host platform is Linux,
+    /// this is how much memory to zero out with `memset`.
+    ///
+    /// The rest of memory will be zeroed out with `madvise`.
+    pub async_stack_keep_resident: usize,
+    /// How much linear memory, in bytes, to keep resident after resetting for
+    /// use with the next instance. This much memory will be `memset` to zero
+    /// when a linear memory is deallocated.
+    ///
+    /// Memory exceeding this amount in the wasm linear memory will be released
+    /// with `madvise` back to the kernel.
+    ///
+    /// Only applicable on Linux.
+    pub linear_memory_keep_resident: usize,
+    /// Same as `linear_memory_keep_resident` but for tables.
+    pub table_keep_resident: usize,
 }
 
 impl Default for PoolingInstanceAllocatorConfig {
@@ -949,6 +1005,9 @@ impl Default for PoolingInstanceAllocatorConfig {
             stack_size: 2 << 20,
             limits: InstanceLimits::default(),
             async_stack_zeroing: false,
+            async_stack_keep_resident: 0,
+            linear_memory_keep_resident: 0,
+            table_keep_resident: 0,
         }
     }
 }
diff --git a/crates/wasmtime/src/config.rs b/crates/wasmtime/src/config.rs
index 836f942c96..7b51361c60 100644
--- a/crates/wasmtime/src/config.rs
+++ b/crates/wasmtime/src/config.rs
@@ -1664,14 +1664,11 @@ pub enum WasmBacktraceDetails {
     Environment,
 }
 
-/// Global configuration options used to create an [`Engine`](crate::Engine)
-/// and customize its behavior.
+/// Configuration options used with [`InstanceAllocationStrategy::Pooling`] to
+/// change the behavior of the pooling instance allocator.
 ///
-/// This structure exposed a builder-like interface and is primarily consumed by
-/// [`Engine::new()`](crate::Engine::new).
-///
-/// The validation of `Config` is deferred until the engine is being built, thus
-/// a problematic config may cause `Engine::new` to fail.
+/// This structure has a builder-style API in the same manner as [`Config`] and
+/// is configured with [`Config::allocation_strategy`].
 #[cfg(feature = "pooling-allocator")]
 #[derive(Debug, Clone, Default)]
 pub struct PoolingAllocationConfig {
@@ -1703,11 +1700,8 @@ impl PoolingAllocationConfig {
     /// Wasmtime and the [`call_async`] variant
     /// of calling WebAssembly is used then Wasmtime will create a separate
     /// runtime execution stack for each future produced by [`call_async`].
-    /// When using the pooling instance allocator
-    /// ([`InstanceAllocationStrategy::Pooling`]) this allocation will happen
-    /// from a pool of stacks and additionally deallocation will simply release
-    /// the stack back to the pool. During the deallocation process Wasmtime
-    /// won't by default reset the contents of the stack back to zero.
+    /// During the deallocation process Wasmtime won't by default reset the
+    /// contents of the stack back to zero.
 
     /// When this option is enabled it can be seen as a defense-in-depth
     /// mechanism to reset a stack back to zero. This is not required for
@@ -1725,6 +1719,57 @@ impl PoolingAllocationConfig {
         self
     }
 
+    /// How much memory, in bytes, to keep resident for async stacks allocated
+    /// with the pooling allocator.
+    ///
+    /// When [`PoolingAllocationConfig::async_stack_zeroing`] is enabled then
+    /// Wasmtime will reset the contents of async stacks back to zero upon
+    /// deallocation. This option can be used to perform the zeroing operation
+    /// with `memset` up to a certain threshold of bytes instead of using
+    /// system calls to reset the stack to zero.
+    ///
+    /// Note that stack memory zeroed this way is kept resident rather than
+    /// being decommitted back to the system.
+    #[cfg(feature = "async")]
+    #[cfg_attr(nightlydoc, doc(cfg(feature = "async")))]
+    pub fn async_stack_keep_resident(&mut self, size: usize) -> &mut Self {
+        let size = round_up_to_pages(size as u64) as usize;
+        self.config.async_stack_keep_resident = size;
+        self
+    }
+
+    /// How much memory, in bytes, to keep resident for each linear memory
+    /// after deallocation.
+    ///
+    /// This option is only applicable on Linux and has no effect on other
+    /// platforms.
+    ///
+    /// By default Wasmtime will use `madvise` to reset the entire contents of
+    /// linear memory back to zero when a linear memory is deallocated.
+    /// This option can be used to instead `memset` part of that memory back
+    /// to zero, which can, in some configurations, reduce the number of page
+    /// faults taken when a slot is reused.
+    pub fn linear_memory_keep_resident(&mut self, size: usize) -> &mut Self {
+        let size = round_up_to_pages(size as u64) as usize;
+        self.config.linear_memory_keep_resident = size;
+        self
+    }
+
+    /// How much memory, in bytes, to keep resident for each table after
+    /// deallocation.
+    ///
+    /// This option is only applicable on Linux and has no effect on other
+    /// platforms.
+    ///
+    /// This option is the same as
+    /// [`PoolingAllocationConfig::linear_memory_keep_resident`] except that it
+    /// is applicable to tables instead.
+    pub fn table_keep_resident(&mut self, size: usize) -> &mut Self {
+        let size = round_up_to_pages(size as u64) as usize;
+        self.config.table_keep_resident = size;
+        self
+    }
+
     /// The maximum number of concurrent instances supported (default is 1000).
     ///
     /// This value has a direct impact on the amount of memory allocated by the pooling
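
For reference, this is roughly how an embedder would opt into the new knobs. Illustrative only: it assumes Wasmtime is built with the `pooling-allocator` and `async` features and that `InstanceAllocationStrategy::Pooling` accepts a `PoolingAllocationConfig`, as in this version of the API. The sizes are arbitrary and are rounded up to the host page size by the builder methods:

```rust
use wasmtime::{Config, Engine, InstanceAllocationStrategy, PoolingAllocationConfig};

fn main() {
    let mut pool = PoolingAllocationConfig::default();
    pool.async_stack_zeroing(true);
    pool.async_stack_keep_resident(64 << 10); // memset the top 64 KiB of each stack
    pool.linear_memory_keep_resident(1 << 20); // memset the first 1 MiB of each memory
    pool.table_keep_resident(64 << 10); // memset the first 64 KiB of each table

    let mut config = Config::new();
    config.async_support(true);
    config.allocation_strategy(InstanceAllocationStrategy::Pooling(pool));
    let _engine = Engine::new(&config).unwrap();
}
```
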