diff --git a/benches/instantiation.rs b/benches/instantiation.rs
index 8236fbd02c..ada6f22c8d 100644
--- a/benches/instantiation.rs
+++ b/benches/instantiation.rs
@@ -1,14 +1,21 @@
 use anyhow::Result;
 use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
-use rayon::{prelude::*, ThreadPoolBuilder};
-use std::{path::PathBuf, process::Command};
+use std::path::{Path, PathBuf};
+use std::process::Command;
+use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering::SeqCst};
+use std::sync::Arc;
+use std::thread;
 use wasmtime::*;
 use wasmtime_wasi::{sync::WasiCtxBuilder, WasiCtx};
 
-fn instantiate(linker: &Linker<WasiCtx>, module: &Module) -> Result<()> {
-    let wasi = WasiCtxBuilder::new().build();
-    let mut store = Store::new(module.engine(), wasi);
-    let _instance = linker.instantiate(&mut store, module)?;
+/// Creates a fresh store on `engine` holding a newly built WASI context.
+fn store(engine: &Engine) -> Store<WasiCtx> {
+    Store::new(engine, WasiCtxBuilder::new().build())
+}
+
+/// Instantiates `pre` inside a brand-new store created from `engine`.
+fn instantiate(pre: &InstancePre<WasiCtx>, engine: &Engine) -> Result<()> {
+    pre.instantiate(&mut store(engine))?;
 
     Ok(())
 }
@@ -23,84 +30,105 @@ fn benchmark_name<'a>(strategy: &InstanceAllocationStrategy) -> &'static str {
     }
 }
 
-fn bench_sequential(c: &mut Criterion, modules: &[&str]) {
+/// Measures single-threaded instantiation of the module at `path`, once for
+/// each allocation strategy yielded by `strategies()`.
+fn bench_sequential(c: &mut Criterion, path: &Path) {
     let mut group = c.benchmark_group("sequential");
 
-    for strategy in &[
-        // Skip the on-demand allocator when uffd is enabled
-        #[cfg(any(not(feature = "uffd"), not(target_os = "linux")))]
-        InstanceAllocationStrategy::OnDemand,
-        InstanceAllocationStrategy::pooling(),
-    ] {
-        for file_name in modules {
-            let mut path = PathBuf::new();
-            path.push("benches");
-            path.push("instantiation");
-            path.push(file_name);
+    for strategy in strategies() {
+        let mut config = Config::default();
+        config.allocation_strategy(strategy.clone());
 
-            let mut config = Config::default();
-            config.allocation_strategy(strategy.clone());
+        // Compilation and linking happen once, outside the timed closure.
+        let engine = Engine::new(&config).expect("failed to create engine");
+        let module = Module::from_file(&engine, path)
+            .unwrap_or_else(|e| panic!("failed to load benchmark `{}`: {:?}", path.display(), e));
+        let mut linker = Linker::new(&engine);
+        wasmtime_wasi::add_to_linker(&mut linker, |cx| cx).unwrap();
+        let pre = linker
+            .instantiate_pre(&mut store(&engine), &module)
+            .expect("failed to pre-instantiate");
 
-            let engine = Engine::new(&config).expect("failed to create engine");
-            let module = Module::from_file(&engine, &path)
-                .unwrap_or_else(|_| panic!("failed to load benchmark `{}`", path.display()));
-            let mut linker = Linker::new(&engine);
-            wasmtime_wasi::add_to_linker(&mut linker, |cx| cx).unwrap();
-
-            group.bench_function(BenchmarkId::new(benchmark_name(strategy), file_name), |b| {
-                b.iter(|| instantiate(&linker, &module).expect("failed to instantiate module"));
-            });
-        }
+        // Label each measurement with the strategy and the module's file name.
+        let file_name = path.file_name().unwrap().to_str().unwrap();
+        let id = BenchmarkId::new(benchmark_name(&strategy), file_name);
+        group.bench_function(id, |b| {
+            b.iter(|| instantiate(&pre, &engine).expect("failed to instantiate module"));
+        });
     }
 
     group.finish();
 }
 
-fn bench_parallel(c: &mut Criterion) {
-    const PARALLEL_INSTANCES: usize = 1000;
-
+/// Benchmarks instantiating the module at `path` on the calling thread while
+/// zero or more background threads concurrently instantiate the same module.
+fn bench_parallel(c: &mut Criterion, path: &Path) {
     let mut group = c.benchmark_group("parallel");
 
-    for strategy in &[
-        // Skip the on-demand allocator when uffd is enabled
-        #[cfg(any(not(feature = "uffd"), not(target_os = "linux")))]
-        InstanceAllocationStrategy::OnDemand,
-        InstanceAllocationStrategy::pooling(),
-    ] {
+    for strategy in strategies() {
         let mut config = Config::default();
         config.allocation_strategy(strategy.clone());
 
         let engine = Engine::new(&config).expect("failed to create engine");
-        let module = Module::from_file(&engine, "benches/instantiation/wasi.wasm")
-            .expect("failed to load WASI example module");
+        let module = Module::from_file(&engine, path)
+            .unwrap_or_else(|e| panic!("failed to load benchmark `{}`: {:?}", path.display(), e));
         let mut linker = Linker::new(&engine);
         wasmtime_wasi::add_to_linker(&mut linker, |cx| cx).unwrap();
+        let pre = Arc::new(
+            linker
+                .instantiate_pre(&mut store(&engine), &module)
+                .expect("failed to pre-instantiate"),
+        );
 
         for threads in 1..=num_cpus::get_physical() {
-            let pool = ThreadPoolBuilder::new()
-                .num_threads(threads)
-                .build()
-                .unwrap();
-
             group.bench_function(
                 BenchmarkId::new(
-                    benchmark_name(strategy),
+                    benchmark_name(&strategy),
                     format!(
-                        "{} instances with {} thread{}",
-                        PARALLEL_INSTANCES,
+                        "{}: with {} thread{}",
+                        path.file_name().unwrap().to_str().unwrap(),
                         threads,
                         if threads == 1 { "" } else { "s" }
                     ),
                 ),
                 |b| {
-                    b.iter(|| {
-                        pool.install(|| {
-                            (0..PARALLEL_INSTANCES).into_par_iter().for_each(|_| {
-                                instantiate(&linker, &module)
-                                    .expect("failed to instantiate module");
+                    // Spin up N-1 background instantiation threads; with this
+                    // measuring thread that makes the N threads in the label.
+                    let done = Arc::new(AtomicBool::new(false));
+                    let count = Arc::new(AtomicUsize::new(0));
+                    let workers = (0..threads - 1)
+                        .map(|_| {
+                            let pre = pre.clone();
+                            let done = done.clone();
+                            let engine = engine.clone();
+                            let count = count.clone();
+                            thread::spawn(move || {
+                                count.fetch_add(1, SeqCst);
+                                while !done.load(SeqCst) {
+                                    instantiate(&pre, &engine).unwrap();
+                                }
                             })
                         })
+                        .collect::<Vec<_>>();
+
+                    // Wait until every worker has started; each one bumps
+                    // `count` right before entering its instantiation loop.
+                    while count.load(SeqCst) != threads - 1 {
+                        thread::yield_now();
+                    }
+
+                    // With the background load running, measure how long a
+                    // single instantiation takes on this thread.
+                    b.iter(|| {
+                        instantiate(&pre, &engine).expect("failed to instantiate module");
                     });
+
+                    // Signal the workers to stop, then join each of them so
+                    // none is still running when this closure returns.
+                    done.store(true, SeqCst);
+                    for t in workers {
+                        t.join().unwrap();
+                    }
                 },
             );
         }
@@ -138,16 +166,37 @@ fn build_wasi_example() {
 
 fn bench_instantiation(c: &mut Criterion) {
     build_wasi_example();
-    bench_sequential(
-        c,
-        &[
-            "empty.wat",
-            "small_memory.wat",
-            "data_segments.wat",
-            "wasi.wasm",
-        ],
-    );
-    bench_parallel(c);
+    // Each module is looked up under `benches/instantiation` and run through
+    // the sequential benchmark followed by the parallel one.
+    let dir: PathBuf = ["benches", "instantiation"].iter().collect();
+    let modules = [
+        "empty.wat",
+        "small_memory.wat",
+        "data_segments.wat",
+        "wasi.wasm",
+    ];
+    for module in modules.iter() {
+        let path = dir.join(module);
+        bench_sequential(c, &path);
+        bench_parallel(c, &path);
+    }
+}
+
+/// Strategies to benchmark; the on-demand allocator is skipped when uffd is enabled on Linux.
+fn strategies() -> impl Iterator<Item = InstanceAllocationStrategy> {
+    IntoIterator::into_iter([
+        #[cfg(any(not(feature = "uffd"), not(target_os = "linux")))]
+        InstanceAllocationStrategy::OnDemand,
+        InstanceAllocationStrategy::Pooling {
+            strategy: Default::default(),
+            module_limits: ModuleLimits {
+                functions: 20_000,
+                memory_pages: 1_000,
+                ..ModuleLimits::default()
+            },
+            instance_limits: InstanceLimits::default(),
+        },
+    ])
 }
 
 criterion_group!(benches, bench_instantiation);