Reimplement the pooling instance allocation strategy (#5661)

* Reimplement the pooling instance allocation strategy

This commit is a reimplementation of the strategy by which the pooling
instance allocator selects a slot for a module. Previously there was a
choice amongst three different algorithms: "reuse affinity", "next
available", and "random". The default was "reuse affinity" but some new
data has come to light which shows that this may not always be a good
default.

Notably the pooling allocator retains some memory per slot, for
example instance data or memory data if so configured. This means that
a currently unused, but previously used, slot can contribute to the RSS
usage of a program using Wasmtime. Consequently the RSS impact here is
O(max slots), which can be counter-intuitive for embedders. This
particularly affects "reuse affinity" because its algorithm for picking
a slot when there are no affine slots is "pick a random slot", which
means eventually all slots will get used.

In discussions about possible ways to tackle this, an alternative to
"pick a strategy" arose and is now implemented in this commit.
Concretely, the new allocation algorithm for a slot is:

* First pick the most recently used affine slot, if one exists.
* Otherwise, if the number of slots affine to other modules is above
  some threshold N, then pick the least-recently used affine slot.
* Otherwise pick a slot that's affine to nothing.

The "N" in this algorithm is configurable and setting it to 0 is the
same as the old "next available" strategy while setting it to infinity
is the same as the "reuse affinity" algorithm. Setting it to something
in the middle provides a knob to allow a modest "cache" of affine slots
while not allowing the total set of slots used to grow too much beyond
the maximal concurrent set of modules. The "random" strategy is now no
longer possible and was removed to help simplify the allocator.
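
To make the selection order concrete, below is a small self-contained
sketch of the policy described above. This is a simplified model, not
the code in this commit: the real `IndexAllocator` in the diff tracks
the same states with intrusive linked lists and a per-module affinity
map for O(1) updates, while this sketch uses plain `Vec`s and a
stand-in `ModuleId` type.

```rust
type ModuleId = u64; // stand-in for wasmtime's `CompiledModuleId`
type SlotId = u32;

struct Allocator {
    /// The configurable threshold "N" described above.
    max_unused_warm_slots: usize,
    /// Unused-but-previously-used slots and their affinity, ordered from
    /// least recently used (front) to most recently used (back).
    warm: Vec<(SlotId, Option<ModuleId>)>,
    /// The next slot index that has never been handed out.
    next_cold: SlotId,
    /// Total number of slots in the pool.
    total_slots: SlotId,
}

impl Allocator {
    fn alloc(&mut self, module: Option<ModuleId>) -> Option<SlotId> {
        // 1. Prefer the most-recently-used warm slot affine to `module`.
        if let Some(m) = module {
            if let Some(pos) = self.warm.iter().rposition(|&(_, aff)| aff == Some(m)) {
                return Some(self.warm.remove(pos).0);
            }
        }
        if self.warm.len() < self.max_unused_warm_slots {
            // 2. Under the threshold N: take a cold slot first, leaving warm
            //    slots (and their affinities) intact for later reuse.
            self.pick_cold().or_else(|| self.pick_warm())
        } else {
            // 3. At or above N: evict the least-recently-used warm slot,
            //    falling back to a cold slot only when N == 0.
            self.pick_warm().or_else(|| self.pick_cold())
        }
    }

    /// The least-recently-used warm slot, if any (the front of the list).
    fn pick_warm(&mut self) -> Option<SlotId> {
        if self.warm.is_empty() {
            None
        } else {
            Some(self.warm.remove(0).0)
        }
    }

    /// A slot that has never been used before, if any remain.
    fn pick_cold(&mut self) -> Option<SlotId> {
        if self.next_cold == self.total_slots {
            None
        } else {
            self.next_cold += 1;
            Some(self.next_cold - 1)
        }
    }

    /// Freed slots rejoin the warm list at the most-recently-used end,
    /// which is what makes step 1 prefer the most recently used slot.
    fn free(&mut self, slot: SlotId, module: Option<ModuleId>) {
        self.warm.push((slot, module));
    }
}
```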

* Resolve rustdoc warnings in `wasmtime-runtime` crate

* Remove `max_cold` as it duplicates the `slot_state.len()`

* More descriptive names

* Add a comment and debug assertion

* Add some list assertions

Commit 8ffbb9cfd7 (parent cb3b6c621f) authored by Alex Crichton on 2023-02-01 11:43:51 -06:00, committed by GitHub.
7 changed files with 444 additions and 440 deletions.

@@ -6,7 +6,7 @@ use arbitrary::{Arbitrary, Unstructured};
#[derive(Debug, Clone, Eq, PartialEq, Hash)]
#[allow(missing_docs)]
pub struct PoolingAllocationConfig {
pub strategy: PoolingAllocationStrategy,
pub max_unused_warm_slots: u32,
pub instance_count: u32,
pub instance_memories: u32,
pub instance_tables: u32,
@@ -24,7 +24,7 @@ impl PoolingAllocationConfig {
pub fn to_wasmtime(&self) -> wasmtime::PoolingAllocationConfig {
let mut cfg = wasmtime::PoolingAllocationConfig::default();
cfg.strategy(self.strategy.to_wasmtime())
cfg.max_unused_warm_slots(self.max_unused_warm_slots)
.instance_count(self.instance_count)
.instance_memories(self.instance_memories)
.instance_tables(self.instance_tables)
@@ -48,13 +48,15 @@ impl<'a> Arbitrary<'a> for PoolingAllocationConfig {
const MAX_MEMORY_PAGES: u64 = 160; // 10 MiB
const MAX_SIZE: usize = 1 << 20; // 1 MiB
let instance_count = u.int_in_range(1..=MAX_COUNT)?;
Ok(Self {
strategy: u.arbitrary()?,
max_unused_warm_slots: u.int_in_range(0..=instance_count + 10)?,
instance_tables: u.int_in_range(0..=MAX_TABLES)?,
instance_memories: u.int_in_range(0..=MAX_MEMORIES)?,
instance_table_elements: u.int_in_range(0..=MAX_ELEMENTS)?,
instance_memory_pages: u.int_in_range(0..=MAX_MEMORY_PAGES)?,
instance_count: u.int_in_range(1..=MAX_COUNT)?,
instance_count,
instance_size: u.int_in_range(0..=MAX_SIZE)?,
async_stack_zeroing: u.arbitrary()?,
async_stack_keep_resident: u.int_in_range(0..=1 << 20)?,
@@ -63,28 +65,3 @@ impl<'a> Arbitrary<'a> for PoolingAllocationConfig {
})
}
}
/// Configuration for `wasmtime::PoolingAllocationStrategy`.
#[derive(Arbitrary, Clone, Debug, PartialEq, Eq, Hash)]
pub enum PoolingAllocationStrategy {
/// Use next available instance slot.
NextAvailable,
/// Use random instance slot.
Random,
/// Use an affinity-based strategy.
ReuseAffinity,
}
impl PoolingAllocationStrategy {
fn to_wasmtime(&self) -> wasmtime::PoolingAllocationStrategy {
match self {
PoolingAllocationStrategy::NextAvailable => {
wasmtime::PoolingAllocationStrategy::NextAvailable
}
PoolingAllocationStrategy::Random => wasmtime::PoolingAllocationStrategy::Random,
PoolingAllocationStrategy::ReuseAffinity => {
wasmtime::PoolingAllocationStrategy::ReuseAffinity
}
}
}
}

@@ -305,7 +305,7 @@ impl ModuleMemoryImages {
/// middle of it. Pictorially this data structure manages a virtual memory
/// region that looks like:
///
/// ```ignore
/// ```text
/// +--------------------+-------------------+--------------+--------------+
/// | anonymous | optional | anonymous | PROT_NONE |
/// | zero | memory | zero | memory |
@@ -333,7 +333,7 @@ impl ModuleMemoryImages {
/// `accessible` limits are. Initially there is assumed to be no image in linear
/// memory.
///
/// When [`MemoryImageSlot::instantiate`] is called then the method will perform
/// When `MemoryImageSlot::instantiate` is called then the method will perform
/// a "synchronization" to take the image from its prior state to the new state
/// for the image specified. The first instantiation for example will mmap the
/// heap image into place. Upon reuse of a slot nothing happens except possibly
@@ -343,7 +343,7 @@ impl ModuleMemoryImages {
/// A `MemoryImageSlot` is either `dirty` or it isn't. When a `MemoryImageSlot`
/// is dirty then it is assumed that any memory beneath `self.accessible` could
/// have any value. Instantiation cannot happen into a `dirty` slot, however, so
/// the [`MemoryImageSlot::clear_and_remain_ready`] returns this memory back to
/// the `MemoryImageSlot::clear_and_remain_ready` returns this memory back to
/// its original state to mark `dirty = false`. This is done by resetting all
/// anonymous memory back to zero and the image itself back to its initial
/// contents.

@@ -19,10 +19,7 @@ use wasmtime_environ::{
mod pooling;
#[cfg(feature = "pooling-allocator")]
pub use self::pooling::{
InstanceLimits, PoolingAllocationStrategy, PoolingInstanceAllocator,
PoolingInstanceAllocatorConfig,
};
pub use self::pooling::{InstanceLimits, PoolingInstanceAllocator, PoolingInstanceAllocatorConfig};
/// Represents a request for a new runtime instance.
pub struct InstanceAllocationRequest<'a> {

@@ -83,25 +83,6 @@ impl Default for InstanceLimits {
}
}
/// The allocation strategy to use for the pooling instance allocator.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PoolingAllocationStrategy {
/// Allocate from the next available instance.
NextAvailable,
/// Allocate from a random available instance.
Random,
/// Try to allocate an instance slot that was previously used for
/// the same module, potentially enabling faster instantiation by
/// reusing e.g. memory mappings.
ReuseAffinity,
}
impl Default for PoolingAllocationStrategy {
fn default() -> Self {
Self::ReuseAffinity
}
}
/// Represents a pool of maximal `Instance` structures.
///
/// Each index in the pool provides enough space for a maximal `Instance`
@@ -142,7 +123,7 @@ impl InstancePool {
mapping,
instance_size,
max_instances,
index_allocator: IndexAllocator::new(config.strategy, max_instances),
index_allocator: IndexAllocator::new(config.limits.count, config.max_unused_warm_slots),
memories: MemoryPool::new(&config.limits, tunables)?,
tables: TablePool::new(&config.limits)?,
linear_memory_keep_resident: config.linear_memory_keep_resident,
@@ -248,7 +229,7 @@ impl InstancePool {
// touched again until we write a fresh Instance in-place with
// std::ptr::write in allocate() above.
self.index_allocator.free(SlotId(index));
self.index_allocator.free(SlotId(index as u32));
}
fn allocate_instance_resources(
@@ -546,7 +527,7 @@ impl InstancePool {
// any sort of infinite loop since this should be the final operation
// working with `module`.
while let Some(index) = self.index_allocator.alloc_affine_and_clear_affinity(module) {
self.memories.clear_images(index.0);
self.memories.clear_images(index.index());
self.index_allocator.free(index);
}
}
@@ -892,15 +873,10 @@ impl StackPool {
page_size,
async_stack_zeroing: config.async_stack_zeroing,
async_stack_keep_resident: config.async_stack_keep_resident,
// We always use a `NextAvailable` strategy for stack
// allocation. We don't want or need an affinity policy
// here: stacks do not benefit from being allocated to the
// same compiled module with the same image (they always
// start zeroed just the same for everyone).
index_allocator: IndexAllocator::new(
PoolingAllocationStrategy::NextAvailable,
max_instances,
),
// Note that `max_unused_warm_slots` is set to zero since stacks
// have no affinity so there's no need to keep intentionally unused
// warm slots around.
index_allocator: IndexAllocator::new(config.limits.count, 0),
})
}
@@ -965,7 +941,7 @@ impl StackPool {
self.zero_stack(bottom_of_stack, stack_size);
}
self.index_allocator.free(SlotId(index));
self.index_allocator.free(SlotId(index as u32));
}
fn zero_stack(&self, bottom: usize, size: usize) {
@@ -994,9 +970,8 @@ impl StackPool {
/// construction.
#[derive(Copy, Clone, Debug)]
pub struct PoolingInstanceAllocatorConfig {
/// Allocation strategy to use for slot indexes in the pooling instance
/// allocator.
pub strategy: PoolingAllocationStrategy,
/// See `PoolingAllocationConfig::max_unused_warm_slots` in `wasmtime`
pub max_unused_warm_slots: u32,
/// The size, in bytes, of async stacks to allocate (not including the guard
/// page).
pub stack_size: usize,
@@ -1025,7 +1000,7 @@ pub struct PoolingInstanceAllocatorConfig {
impl Default for PoolingInstanceAllocatorConfig {
fn default() -> PoolingInstanceAllocatorConfig {
PoolingInstanceAllocatorConfig {
strategy: Default::default(),
max_unused_warm_slots: 100,
stack_size: 2 << 20,
limits: InstanceLimits::default(),
async_stack_zeroing: false,
@@ -1177,7 +1152,7 @@ mod test {
#[test]
fn test_instance_pool() -> Result<()> {
let mut config = PoolingInstanceAllocatorConfig::default();
config.strategy = PoolingAllocationStrategy::NextAvailable;
config.max_unused_warm_slots = 0;
config.limits = InstanceLimits {
count: 3,
tables: 1,
@@ -1199,10 +1174,7 @@ mod test {
assert_eq!(instances.instance_size, 1008); // round 1000 up to alignment
assert_eq!(instances.max_instances, 3);
assert_eq!(
instances.index_allocator.testing_freelist(),
[SlotId(0), SlotId(1), SlotId(2)]
);
assert_eq!(instances.index_allocator.testing_freelist(), []);
let mut handles = Vec::new();
let module = Arc::new(Module::default());
@@ -1248,7 +1220,7 @@ mod test {
assert_eq!(
instances.index_allocator.testing_freelist(),
[SlotId(2), SlotId(1), SlotId(0)]
[SlotId(0), SlotId(1), SlotId(2)]
);
Ok(())
@@ -1353,26 +1325,12 @@ mod test {
assert_eq!(pool.max_instances, 10);
assert_eq!(pool.page_size, native_page_size);
assert_eq!(
pool.index_allocator.testing_freelist(),
[
SlotId(0),
SlotId(1),
SlotId(2),
SlotId(3),
SlotId(4),
SlotId(5),
SlotId(6),
SlotId(7),
SlotId(8),
SlotId(9)
],
);
assert_eq!(pool.index_allocator.testing_freelist(), []);
let base = pool.mapping.as_ptr() as usize;
let mut stacks = Vec::new();
for i in (0..10).rev() {
for i in 0..10 {
let stack = pool.allocate().expect("allocation should succeed");
assert_eq!(
((stack.top().unwrap() as usize - base) / pool.stack_size) - 1,
@@ -1392,16 +1350,16 @@ mod test {
assert_eq!(
pool.index_allocator.testing_freelist(),
[
SlotId(9),
SlotId(8),
SlotId(7),
SlotId(6),
SlotId(5),
SlotId(4),
SlotId(3),
SlotId(2),
SlotId(0),
SlotId(1),
SlotId(0)
SlotId(2),
SlotId(3),
SlotId(4),
SlotId(5),
SlotId(6),
SlotId(7),
SlotId(8),
SlotId(9)
],
);
@@ -1475,7 +1433,7 @@ mod test {
#[test]
fn test_stack_zeroed() -> Result<()> {
let config = PoolingInstanceAllocatorConfig {
strategy: PoolingAllocationStrategy::NextAvailable,
max_unused_warm_slots: 0,
limits: InstanceLimits {
count: 1,
table_elements: 0,
@@ -1511,7 +1469,7 @@ mod test {
#[test]
fn test_stack_unzeroed() -> Result<()> {
let config = PoolingInstanceAllocatorConfig {
strategy: PoolingAllocationStrategy::NextAvailable,
max_unused_warm_slots: 0,
limits: InstanceLimits {
count: 1,
table_elements: 0,

@@ -1,40 +1,18 @@
//! Index/slot allocator policies for the pooling allocator.
use super::PoolingAllocationStrategy;
use crate::CompiledModuleId;
use rand::rngs::SmallRng;
use rand::{Rng, SeedableRng};
use std::collections::HashMap;
use std::collections::hash_map::{Entry, HashMap};
use std::mem;
use std::sync::Mutex;
/// A slot index. The job of this allocator is to hand out these
/// indices.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct SlotId(pub usize);
#[derive(Hash, Clone, Copy, Debug, PartialEq, Eq)]
pub struct SlotId(pub u32);
impl SlotId {
/// The index of this slot.
pub fn index(self) -> usize {
self.0
}
}
/// An index in the global freelist.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct GlobalFreeListIndex(usize);
impl GlobalFreeListIndex {
/// The index of this slot.
fn index(self) -> usize {
self.0
}
}
/// An index in a per-module freelist.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct PerModuleFreeListIndex(usize);
impl PerModuleFreeListIndex {
/// The index of this slot.
fn index(self) -> usize {
self.0
self.0 as usize
}
}
@@ -43,153 +21,93 @@ pub struct IndexAllocator(Mutex<Inner>);
#[derive(Debug)]
struct Inner {
strategy: PoolingAllocationStrategy,
rng: SmallRng,
/// Free-list of all slots.
/// Maximum number of "unused warm slots" which will be allowed during
/// allocation.
///
/// We use this to pick a victim when we don't have an appropriate slot with
/// the preferred affinity.
free_list: Vec<SlotId>,
/// This is a user-configurable knob which can be used to influence the
/// maximum number of unused slots at any one point in time. A "warm slot"
/// is one that's considered to have been previously allocated.
max_unused_warm_slots: u32,
/// Affine slot management which tracks which slots are free and were last
/// used with the specified `CompiledModuleId`.
/// Current count of "warm slots", or those that were previously allocated
/// which are now no longer in use.
///
/// Invariant: any module ID in this hashmap must have a non-empty list of
/// free slots (otherwise we remove it). We remove a module's freelist when
/// we have no more slots with affinity for that module.
per_module: HashMap<CompiledModuleId, Vec<SlotId>>,
/// This is the size of the `warm` list.
unused_warm_slots: u32,
/// A linked list (via indices) which enumerates all "warm and unused"
/// slots, or those which have previously been allocated and then free'd.
warm: List,
/// Index of the next slot which has never been allocated ("cold").
///
/// This is initially 0 and is incremented during `pick_cold`. If this
/// matches `slot_state.len()`, there are no more cold slots left.
last_cold: u32,
/// The state of any given slot.
///
/// Records links in the `warm` list above (for slots with no affinity)
/// or in both the `warm` list and a `module_affine` list (for affine
/// slots); these links are kept up-to-date to allow fast removal.
slot_state: Vec<SlotState>,
/// Affine slot management which tracks which slots are free and were last
/// used with the specified `CompiledModuleId`.
///
/// The `List` here is appended to during deallocation and removal happens
/// from the tail during allocation.
module_affine: HashMap<CompiledModuleId, List>,
}
/// A helper "linked list" data structure which is based on indices.
#[derive(Default, Debug)]
struct List {
head: Option<SlotId>,
tail: Option<SlotId>,
}
/// A helper data structure for an intrusive linked list, coupled with the
/// `List` type.
#[derive(Default, Debug, Copy, Clone)]
struct Link {
prev: Option<SlotId>,
next: Option<SlotId>,
}
#[derive(Clone, Debug)]
pub(crate) enum SlotState {
/// Currently allocated.
enum SlotState {
/// This slot is currently in use and is affine to the specified module.
Used(Option<CompiledModuleId>),
/// This slot is not currently used, and has never been used.
UnusedCold,
/// This slot is not currently used, but was previously allocated.
///
/// Invariant: no slot in this state has its index in either
/// `free_list` or any list in `per_module`.
Taken(Option<CompiledModuleId>),
/// Currently free. A free slot is able to be allocated for any
/// request, but may have affinity to a certain module that we
/// prefer to use it for.
///
/// Invariant: every slot in this state has its index in at least
/// `free_list`, and possibly a `per_module` free-list; see
/// FreeSlotState.
Free(FreeSlotState),
/// The payload here is metadata about the lists that this slot is contained
/// within.
UnusedWarm(Unused),
}
impl SlotState {
fn unwrap_free(&self) -> &FreeSlotState {
fn unwrap_unused(&mut self) -> &mut Unused {
match self {
&Self::Free(ref free) => free,
_ => panic!("Slot not free"),
}
}
fn unwrap_free_mut(&mut self) -> &mut FreeSlotState {
match self {
&mut Self::Free(ref mut free) => free,
_ => panic!("Slot not free"),
}
}
fn unwrap_module_id(&self) -> Option<CompiledModuleId> {
match self {
&Self::Taken(module_id) => module_id,
_ => panic!("Slot not in Taken state"),
SlotState::UnusedWarm(u) => u,
_ => unreachable!(),
}
}
}
#[derive(Clone, Debug)]
pub(crate) enum FreeSlotState {
/// The slot is free, and has no affinity.
///
/// Invariant: every slot in this state has its index in
/// `free_list`. No slot in this state has its index in any other
/// (per-module) free-list.
NoAffinity {
/// Index in the global free list.
///
/// Invariant: free_list[slot_state[i].free_list_index] == i.
free_list_index: GlobalFreeListIndex,
},
/// The slot is free, and has an affinity for some module. This
/// means we prefer to choose this slot (or some other one with
/// the same affinity) given a request to allocate a slot for this
/// module. It can, however, still be used for any other module if
/// needed.
///
/// Invariant: every slot in this state has its index in both
/// `free_list` *and* exactly one list in `per_module`.
Affinity {
module: CompiledModuleId,
/// Index in the global free list.
///
/// Invariant: free_list[slot_state[i].free_list_index] == i.
free_list_index: GlobalFreeListIndex,
/// Index in a per-module free list.
///
/// Invariant: per_module[slot_state[i].module][slot_state[i].per_module_index]
/// == i.
per_module_index: PerModuleFreeListIndex,
},
}
#[derive(Default, Copy, Clone, Debug)]
struct Unused {
/// Which module this slot was historically affine to, if any.
affinity: Option<CompiledModuleId>,
impl FreeSlotState {
/// Get the index of this slot in the global free list.
fn free_list_index(&self) -> GlobalFreeListIndex {
match self {
&Self::NoAffinity { free_list_index }
| &Self::Affinity {
free_list_index, ..
} => free_list_index,
}
}
/// Metadata about the linked list for all slots affine to `affinity`.
affine_list_link: Link,
/// Update the index of this slot in the global free list.
fn update_free_list_index(&mut self, index: GlobalFreeListIndex) {
match self {
&mut Self::NoAffinity {
ref mut free_list_index,
}
| &mut Self::Affinity {
ref mut free_list_index,
..
} => {
*free_list_index = index;
}
}
}
/// Get the index of this slot in its per-module free list.
fn per_module_index(&self) -> PerModuleFreeListIndex {
match self {
&Self::Affinity {
per_module_index, ..
} => per_module_index,
_ => panic!("per_module_index on slot with no affinity"),
}
}
/// Update the index of this slot in its per-module free list.
fn update_per_module_index(&mut self, index: PerModuleFreeListIndex) {
match self {
&mut Self::Affinity {
ref mut per_module_index,
..
} => {
*per_module_index = index;
}
_ => panic!("per_module_index on slot with no affinity"),
}
}
/// Metadata within the `warm` list of the main allocator.
unused_list_link: Link,
}
enum AllocMode {
@@ -199,29 +117,14 @@ enum AllocMode {
impl IndexAllocator {
/// Create the default state for this strategy.
pub fn new(strategy: PoolingAllocationStrategy, max_instances: usize) -> Self {
let ids = (0..max_instances).map(|i| SlotId(i)).collect::<Vec<_>>();
// Use a deterministic seed during fuzzing to improve reproducibility of
// test cases, but otherwise outside of fuzzing use a random seed to
// shake things up.
let seed = if cfg!(fuzzing) {
[0; 32]
} else {
rand::thread_rng().gen()
};
let rng = SmallRng::from_seed(seed);
pub fn new(max_instances: u32, max_unused_warm_slots: u32) -> Self {
IndexAllocator(Mutex::new(Inner {
rng,
strategy,
free_list: ids,
per_module: HashMap::new(),
slot_state: (0..max_instances)
.map(|i| {
SlotState::Free(FreeSlotState::NoAffinity {
free_list_index: GlobalFreeListIndex(i),
})
})
.collect(),
last_cold: 0,
max_unused_warm_slots,
unused_warm_slots: 0,
module_affine: HashMap::new(),
slot_state: (0..max_instances).map(|_| SlotState::UnusedCold).collect(),
warm: List::default(),
}))
}
@@ -248,59 +151,51 @@ impl IndexAllocator {
let mut inner = self.0.lock().unwrap();
let inner = &mut *inner;
// Determine which `SlotId` will be chosen first. Below the free list
// metadata will be updated with our choice.
let slot_id = match mode {
// If any slot is desired then the pooling allocation strategy
// determines which index is chosen.
AllocMode::AnySlot => {
match inner.strategy {
PoolingAllocationStrategy::NextAvailable => inner.pick_last_used()?,
PoolingAllocationStrategy::Random => inner.pick_random()?,
// First attempt an affine allocation where the slot
// returned was previously used by `id`, but if that fails
// pick a random free slot ID.
//
// Note that we do this to maintain an unbiased stealing
// distribution: we want the likelihood of our taking a slot
// from some other module's freelist to be proportional to
// that module's freelist length. Or in other words, every
// *slot* should be equally likely to be stolen. The
// alternative, where we pick the victim module freelist
// first, means that either a module with an affinity
// freelist of one slot has the same chances of losing that
// slot as one with a hundred slots; or else we need a
// weighted random choice among modules, which is just as
// complex as this process.
//
// We don't bother picking an empty slot (no established
// affinity) before a random slot, because this is more
// complex, and in the steady state, all slots will see at
// least one instantiation very quickly, so there will never
// (past an initial phase) be a slot with no affinity.
PoolingAllocationStrategy::ReuseAffinity => inner
.pick_affine(module_id)
.or_else(|| inner.pick_random())?,
// As a first-pass always attempt an affine allocation. This will
// succeed if any slots are considered affine to `module_id` (if it's
// specified). Failing that, some other slot is chosen below.
let slot_id = inner.pick_affine(module_id).or_else(|| {
match mode {
// If any slot is requested then this is a normal instantiation
// looking for an index. Without any affine candidates there are
// two options here:
//
// 1. Pick a slot amongst previously allocated slots
// 2. Pick a slot that's never been used before
//
// The choice here is guided by the initial configuration of
// `max_unused_warm_slots`. If our count of unused warm slots, which
// are likely all affine, is below this threshold then the affinity
// of the warm slots isn't tampered with and a cold slot is chosen
// first. If the cold slot allocation fails, however, a warm slot is
// evicted.
//
// The opposite happens when we're above our threshold for the
// maximum number of warm slots, meaning that a warm slot is
// attempted to be picked from first with a cold slot following
// that. Note that the warm slot allocation in this case should
// only fail if `max_unused_warm_slots` is 0; otherwise
// `pick_warm` will always succeed.
AllocMode::AnySlot => {
if inner.unused_warm_slots < inner.max_unused_warm_slots {
inner.pick_cold().or_else(|| inner.pick_warm())
} else {
inner.pick_warm().or_else(|| {
debug_assert!(inner.max_unused_warm_slots == 0);
inner.pick_cold()
})
}
}
// In this mode an affinity-based allocation is always performed
// as the purpose here is to clear out slots relevant to
// `module_id` during module teardown. This means that there's
// no consulting non-affine slots in this path.
AllocMode::ForceAffineAndClear => None,
}
})?;
// In this mode an affinity-based allocation is always performed as
// the purpose here is to clear out slots relevant to `module_id`
// during module teardown.
AllocMode::ForceAffineAndClear => inner.pick_affine(module_id)?,
};
// Update internal metadata about the allocation of `slot_id` to
// `module_id`, meaning that it's removed from the per-module freelist
// if it was previously affine and additionally it's removed from the
// global freelist.
inner.remove_global_free_list_item(slot_id);
if let &SlotState::Free(FreeSlotState::Affinity { module, .. }) =
&inner.slot_state[slot_id.index()]
{
inner.remove_module_free_list_item(module, slot_id);
}
inner.slot_state[slot_id.index()] = SlotState::Taken(match mode {
inner.slot_state[slot_id.index()] = SlotState::Used(match mode {
AllocMode::ForceAffineAndClear => None,
AllocMode::AnySlot => module_id,
});
@@ -310,24 +205,43 @@ impl IndexAllocator {
pub(crate) fn free(&self, index: SlotId) {
let mut inner = self.0.lock().unwrap();
let free_list_index = GlobalFreeListIndex(inner.free_list.len());
inner.free_list.push(index);
let module_id = inner.slot_state[index.index()].unwrap_module_id();
inner.slot_state[index.index()] = if let Some(id) = module_id {
let per_module_list = inner
.per_module
.entry(id)
.or_insert_with(|| Vec::with_capacity(1));
let per_module_index = PerModuleFreeListIndex(per_module_list.len());
per_module_list.push(index);
SlotState::Free(FreeSlotState::Affinity {
module: id,
free_list_index,
per_module_index,
})
} else {
SlotState::Free(FreeSlotState::NoAffinity { free_list_index })
let inner = &mut *inner;
let module = match inner.slot_state[index.index()] {
SlotState::Used(module) => module,
_ => unreachable!(),
};
// Bump the number of warm slots since this slot is now considered
// previously used. Afterwards append it to the linked list of all
// unused and warm slots.
inner.unused_warm_slots += 1;
let unused_list_link = inner
.warm
.append(index, &mut inner.slot_state, |s| &mut s.unused_list_link);
let affine_list_link = match module {
// If this slot is affine to a particular module then append this
// index to the linked list for the affine module. Otherwise insert
// a new one-element linked list.
Some(module) => match inner.module_affine.entry(module) {
Entry::Occupied(mut e) => e
.get_mut()
.append(index, &mut inner.slot_state, |s| &mut s.affine_list_link),
Entry::Vacant(v) => {
v.insert(List::new(index));
Link::default()
}
},
// If this slot has no affinity then the affine link is empty.
None => Link::default(),
};
inner.slot_state[index.index()] = SlotState::UnusedWarm(Unused {
affinity: module,
affine_list_link,
unused_list_link,
});
}
/// For testing only, we want to be able to assert what is on the
@@ -335,7 +249,10 @@ impl IndexAllocator {
#[cfg(test)]
pub(crate) fn testing_freelist(&self) -> Vec<SlotId> {
let inner = self.0.lock().unwrap();
inner.free_list.clone()
inner
.warm
.iter(&inner.slot_state, |s| &s.unused_list_link)
.collect()
}
/// For testing only, get the list of all modules with at least
@@ -343,102 +260,165 @@ impl IndexAllocator {
#[cfg(test)]
pub(crate) fn testing_module_affinity_list(&self) -> Vec<CompiledModuleId> {
let inner = self.0.lock().unwrap();
let mut ret = vec![];
for (module, list) in inner.per_module.iter() {
assert!(!list.is_empty());
ret.push(*module);
}
ret
inner.module_affine.keys().copied().collect()
}
}
impl Inner {
fn pick_last_used(&self) -> Option<SlotId> {
self.free_list.last().copied()
}
fn pick_random(&mut self) -> Option<SlotId> {
if self.free_list.len() == 0 {
return None;
}
let i = self.rng.gen_range(0..self.free_list.len());
Some(self.free_list[i])
}
/// Attempts to allocate a slot already affine to `id`, returning `None` if
/// `id` is `None` or if there are no affine slots.
fn pick_affine(&self, module_id: Option<CompiledModuleId>) -> Option<SlotId> {
let free = self.per_module.get(&module_id?)?;
free.last().copied()
fn pick_affine(&mut self, module_id: Option<CompiledModuleId>) -> Option<SlotId> {
// Note that the `tail` of the affine list is chosen here as it's the
// most recently used, which for affine allocations is what we want --
// maximizing temporal reuse.
let ret = self.module_affine.get(&module_id?)?.tail?;
self.remove(ret);
Some(ret)
}
/// Remove a slot-index from the global free list.
fn remove_global_free_list_item(&mut self, index: SlotId) {
let free_list_index = self.slot_state[index.index()]
.unwrap_free()
.free_list_index();
assert_eq!(index, self.free_list.swap_remove(free_list_index.index()));
if free_list_index.index() < self.free_list.len() {
let replaced = self.free_list[free_list_index.index()];
self.slot_state[replaced.index()]
.unwrap_free_mut()
.update_free_list_index(free_list_index);
fn pick_warm(&mut self) -> Option<SlotId> {
// Insertions into the `unused` list happen at the `tail`, so the
// least-recently-used item will be at the head. That's our goal here:
// pick the least-recently-used slot, since something "warm" is being
// evicted anyway.
let head = self.warm.head?;
self.remove(head);
Some(head)
}
fn remove(&mut self, slot: SlotId) {
// Decrement the size of the warm list, and additionally remove it from
// the `warm` linked list.
self.unused_warm_slots -= 1;
self.warm
.remove(slot, &mut self.slot_state, |u| &mut u.unused_list_link);
// If this slot is affine to a module then additionally remove it from
// that module's affinity linked list. Note that if the module's affine
// list is empty then the module's entry in the map is completely
// removed as well.
let module = self.slot_state[slot.index()].unwrap_unused().affinity;
if let Some(module) = module {
let mut list = match self.module_affine.entry(module) {
Entry::Occupied(e) => e,
Entry::Vacant(_) => unreachable!(),
};
list.get_mut()
.remove(slot, &mut self.slot_state, |u| &mut u.affine_list_link);
if list.get_mut().head.is_none() {
list.remove();
}
}
}
/// Remove a slot-index from a per-module free list.
fn remove_module_free_list_item(&mut self, module_id: CompiledModuleId, index: SlotId) {
debug_assert!(
self.per_module.contains_key(&module_id),
"per_module list for given module should not be empty"
);
let per_module_list = self.per_module.get_mut(&module_id).unwrap();
debug_assert!(!per_module_list.is_empty());
let per_module_index = self.slot_state[index.index()]
.unwrap_free()
.per_module_index();
assert_eq!(index, per_module_list.swap_remove(per_module_index.index()));
if per_module_index.index() < per_module_list.len() {
let replaced = per_module_list[per_module_index.index()];
self.slot_state[replaced.index()]
.unwrap_free_mut()
.update_per_module_index(per_module_index);
fn pick_cold(&mut self) -> Option<SlotId> {
if (self.last_cold as usize) == self.slot_state.len() {
None
} else {
let ret = Some(SlotId(self.last_cold));
self.last_cold += 1;
ret
}
if per_module_list.is_empty() {
self.per_module.remove(&module_id);
}
}
impl List {
/// Creates a new one-element list pointing at `id`.
fn new(id: SlotId) -> List {
List {
head: Some(id),
tail: Some(id),
}
}
/// Appends the `id` to this list whose links are determined by `link`.
fn append(
&mut self,
id: SlotId,
states: &mut [SlotState],
link: fn(&mut Unused) -> &mut Link,
) -> Link {
// This `id` is the new tail...
let tail = mem::replace(&mut self.tail, Some(id));
// If the tail was present, then update its `next` field to ourselves as
// we've been appended, otherwise update the `head` since the list was
// previously empty.
match tail {
Some(tail) => link(states[tail.index()].unwrap_unused()).next = Some(id),
None => self.head = Some(id),
}
Link {
prev: tail,
next: None,
}
}
/// Removes `id` from this list whose links are determined by `link`.
fn remove(
&mut self,
id: SlotId,
slot_state: &mut [SlotState],
link: fn(&mut Unused) -> &mut Link,
) -> Unused {
let mut state = *slot_state[id.index()].unwrap_unused();
let next = link(&mut state).next;
let prev = link(&mut state).prev;
// If a `next` node is present for this link, then its previous was our
// own previous now. Otherwise we are the tail so the new tail is our
// previous.
match next {
Some(next) => link(slot_state[next.index()].unwrap_unused()).prev = prev,
None => self.tail = prev,
}
// Same as the `next` node, except everything is in reverse.
match prev {
Some(prev) => link(slot_state[prev.index()].unwrap_unused()).next = next,
None => self.head = next,
}
state
}
#[cfg(test)]
fn iter<'a>(
&'a self,
states: &'a [SlotState],
link: fn(&Unused) -> &Link,
) -> impl Iterator<Item = SlotId> + 'a {
let mut cur = self.head;
let mut prev = None;
std::iter::from_fn(move || {
if cur.is_none() {
assert_eq!(prev, self.tail);
}
let ret = cur?;
match &states[ret.index()] {
SlotState::UnusedWarm(u) => {
assert_eq!(link(u).prev, prev);
prev = Some(ret);
cur = link(u).next
}
_ => unreachable!(),
}
Some(ret)
})
}
}
#[cfg(test)]
mod test {
use super::{IndexAllocator, SlotId};
use crate::CompiledModuleIdAllocator;
use crate::PoolingAllocationStrategy;
#[test]
fn test_next_available_allocation_strategy() {
let strat = PoolingAllocationStrategy::NextAvailable;
for size in 0..20 {
let state = IndexAllocator::new(strat, size);
let state = IndexAllocator::new(size, 0);
for i in 0..size {
assert_eq!(state.alloc(None).unwrap().index(), size - i - 1);
}
assert!(state.alloc(None).is_none());
}
}
#[test]
fn test_random_allocation_strategy() {
let strat = PoolingAllocationStrategy::Random;
for size in 0..20 {
let state = IndexAllocator::new(strat, size);
for _ in 0..size {
assert!(state.alloc(None).unwrap().index() < size);
assert_eq!(state.alloc(None).unwrap().index(), i as usize);
}
assert!(state.alloc(None).is_none());
}
@@ -446,16 +426,15 @@ mod test {
#[test]
fn test_affinity_allocation_strategy() {
let strat = PoolingAllocationStrategy::ReuseAffinity;
let id_alloc = CompiledModuleIdAllocator::new();
let id1 = id_alloc.alloc();
let id2 = id_alloc.alloc();
let state = IndexAllocator::new(strat, 100);
let state = IndexAllocator::new(100, 100);
let index1 = state.alloc(Some(id1)).unwrap();
assert!(index1.index() < 100);
assert_eq!(index1.index(), 0);
let index2 = state.alloc(Some(id2)).unwrap();
assert!(index2.index() < 100);
assert_eq!(index2.index(), 1);
assert_ne!(index1, index2);
state.free(index1);
@@ -503,12 +482,8 @@ mod test {
let id_alloc = CompiledModuleIdAllocator::new();
let id = id_alloc.alloc();
for strat in [
PoolingAllocationStrategy::ReuseAffinity,
PoolingAllocationStrategy::NextAvailable,
PoolingAllocationStrategy::Random,
] {
let state = IndexAllocator::new(strat, 100);
for max_unused_warm_slots in [0, 1, 2] {
let state = IndexAllocator::new(100, max_unused_warm_slots);
let index1 = state.alloc(Some(id)).unwrap();
let index2 = state.alloc(Some(id)).unwrap();
@@ -525,12 +500,11 @@ mod test {
use rand::Rng;
let mut rng = rand::thread_rng();
let strat = PoolingAllocationStrategy::ReuseAffinity;
let id_alloc = CompiledModuleIdAllocator::new();
let ids = std::iter::repeat_with(|| id_alloc.alloc())
.take(10)
.collect::<Vec<_>>();
let state = IndexAllocator::new(strat, 1000);
let state = IndexAllocator::new(1000, 1000);
let mut allocated: Vec<SlotId> = vec![];
let mut last_id = vec![None; 1000];
@@ -566,4 +540,59 @@ mod test {
hits
);
}
#[test]
fn test_affinity_threshold() {
let id_alloc = CompiledModuleIdAllocator::new();
let id1 = id_alloc.alloc();
let id2 = id_alloc.alloc();
let id3 = id_alloc.alloc();
let state = IndexAllocator::new(10, 2);
// Set some slot affinities
assert_eq!(state.alloc(Some(id1)), Some(SlotId(0)));
state.free(SlotId(0));
assert_eq!(state.alloc(Some(id2)), Some(SlotId(1)));
state.free(SlotId(1));
// Only 2 slots are allowed to be unused and warm, so we're at our
// threshold, meaning one must now be evicted.
assert_eq!(state.alloc(Some(id3)), Some(SlotId(0)));
state.free(SlotId(0));
// pickup `id2` again, it should be affine.
assert_eq!(state.alloc(Some(id2)), Some(SlotId(1)));
// with only one warm slot available allocation for `id1` should pick a
// fresh slot
assert_eq!(state.alloc(Some(id1)), Some(SlotId(2)));
state.free(SlotId(1));
state.free(SlotId(2));
// ensure everything stays affine
assert_eq!(state.alloc(Some(id1)), Some(SlotId(2)));
assert_eq!(state.alloc(Some(id2)), Some(SlotId(1)));
assert_eq!(state.alloc(Some(id3)), Some(SlotId(0)));
state.free(SlotId(1));
state.free(SlotId(2));
state.free(SlotId(0));
// LRU is 1, so that should be picked
assert_eq!(state.alloc(Some(id_alloc.alloc())), Some(SlotId(1)));
// Pick another LRU entry, this time 2
assert_eq!(state.alloc(Some(id_alloc.alloc())), Some(SlotId(2)));
// This should preserve slot `0` and pick up something new
assert_eq!(state.alloc(Some(id_alloc.alloc())), Some(SlotId(3)));
state.free(SlotId(1));
state.free(SlotId(2));
state.free(SlotId(3));
// for good measure make sure id3 is still affine
assert_eq!(state.alloc(Some(id3)), Some(SlotId(0)));
}
}

@@ -56,8 +56,7 @@ pub use crate::instance::{
};
#[cfg(feature = "pooling-allocator")]
pub use crate::instance::{
InstanceLimits, PoolingAllocationStrategy, PoolingInstanceAllocator,
PoolingInstanceAllocatorConfig,
InstanceLimits, PoolingInstanceAllocator, PoolingInstanceAllocatorConfig,
};
pub use crate::memory::{
DefaultMemoryCreator, Memory, RuntimeLinearMemory, RuntimeMemoryCreator, SharedMemory,
@@ -156,7 +155,7 @@ pub unsafe trait Store {
/// is chiefly needed for lazy initialization of various bits of
/// instance state.
///
/// When an instance is created, it holds an Arc<dyn ModuleRuntimeInfo>
/// When an instance is created, it holds an `Arc<dyn ModuleRuntimeInfo>`
/// so that it can get to signatures, metadata on functions, memory and
/// funcref-table images, etc. All of these things are ordinarily known
/// by the higher-level layers of Wasmtime. Specifically, the main

@@ -1712,17 +1712,61 @@ pub struct PoolingAllocationConfig {
config: wasmtime_runtime::PoolingInstanceAllocatorConfig,
}
#[cfg(feature = "pooling-allocator")]
pub use wasmtime_runtime::PoolingAllocationStrategy;
#[cfg(feature = "pooling-allocator")]
impl PoolingAllocationConfig {
/// Configures the method by which slots in the pooling allocator are
/// allocated to instances
/// Configures the maximum number of "unused warm slots" to retain in the
/// pooling allocator.
///
/// This defaults to [`PoolingAllocationStrategy::ReuseAffinity`] .
pub fn strategy(&mut self, strategy: PoolingAllocationStrategy) -> &mut Self {
self.config.strategy = strategy;
/// The pooling allocator operates over slots to allocate from, and each
/// slot is considered "cold" if it's never been used before or "warm" if
/// it's been used by some module in the past. Slots in the pooling
/// allocator additionally track an "affinity" flag to a particular core
/// wasm module. When a module is instantiated into a slot then the slot is
/// considered affine to that module, even after the instance has been
/// deallocated.
///
/// When a new instance is created then a slot must be chosen, and the
/// current algorithm for selecting a slot is:
///
/// * If there are slots that are affine to the module being instantiated,
/// then the most recently used slot is selected to be allocated from.
/// This is done to improve reuse of resources such as memory mappings and
/// additionally try to benefit from temporal locality for things like
/// caches.
///
/// * Otherwise if there are more than N affine slots to other modules, then
/// one of those affine slots is chosen to be allocated. The slot chosen
/// is picked on a least-recently-used basis.
///
/// * Finally, if there are fewer than N affine slots to other modules, then
/// the non-affine slots are allocated from.
///
/// This setting, `max_unused_warm_slots`, is the value for N in the above
/// algorithm. The purpose of this setting is to have a knob over the RSS
/// impact of "unused slots" for a long-running wasm server.
///
/// If this setting is set to 0, for example, then affine slots are
/// aggressively reused on a least-recently-used basis. A "cold" slot is
/// only used if there are no affine slots available to allocate from. This
/// means that the set of slots used over the lifetime of a program is the
/// same as the maximum concurrent number of wasm instances.
///
/// If this setting is set to infinity, however, then cold slots are
/// prioritized to be allocated from. This means that the set of slots used
/// over the lifetime of a program will approach
/// [`PoolingAllocationConfig::instance_count`], or the maximum number of
/// slots in the pooling allocator.
///
/// Wasmtime does not aggressively decommit all resources associated with a
/// slot when the slot is not in use. For example the
/// [`PoolingAllocationConfig::linear_memory_keep_resident`] option can be
/// used to keep memory associated with a slot, even when it's not in use.
/// This means that the total set of used slots in the pooling instance
/// allocator can impact the overall RSS usage of a program.
///
/// The default value for this option is 100.
pub fn max_unused_warm_slots(&mut self, max: u32) -> &mut Self {
self.config.max_unused_warm_slots = max;
self
}
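
As a usage sketch for embedders: `PoolingAllocationConfig`,
`max_unused_warm_slots`, and `instance_count` come from this diff,
while the `Config::allocation_strategy` / `InstanceAllocationStrategy::Pooling`
wiring is assumed from the Wasmtime API of this era and may differ in
other releases.

```rust
use wasmtime::{Config, Engine, InstanceAllocationStrategy, PoolingAllocationConfig};

fn main() -> anyhow::Result<()> {
    let mut pool = PoolingAllocationConfig::default();
    // Allow at most 10 idle "warm" slots; beyond that the allocator evicts
    // the least-recently-used warm slot rather than touching a cold one.
    pool.max_unused_warm_slots(10);
    // Up to 100 concurrent instances in the pool.
    pool.instance_count(100);

    let mut config = Config::new();
    // Assumed wiring for enabling the pooling allocator; check the docs of
    // the Wasmtime version in use.
    config.allocation_strategy(InstanceAllocationStrategy::Pooling(pool));
    let _engine = Engine::new(&config)?;
    Ok(())
}
```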