Tweak parallelism and the instantiation benchmark (#3775)
Currently the "sequential" and "parallel" benchmarks report somewhat different timings. For sequential it's time-to-instantiate, but for parallel it's time-to-instantiate-10k-instances. The parallelism in the parallel benchmark can also theoretically be affected by rayon's work-stealing. For example, if rayon doesn't actually do any work stealing at all, then this ends up being a sequential test again. Otherwise, though, it's possible for some threads to finish much earlier, as rayon isn't guaranteed to keep all threads busy. This commit applies a few updates to the benchmark: * First, an `InstancePre<T>` is now used instead of a `Linker<T>` to front-load type-checking and avoid that cost on each instantiation (this is generally the fastest path to instantiate right now). * Next, the instantiation benchmark is changed to measure one instantiation per iteration, so it reports per-instance instantiation time and compares better with the sequential numbers. * Finally, rayon is removed in favor of manually creating background threads that do work in a loop until we tell them to stop. These background threads are guaranteed to be working for the entire time the benchmark is executing, and should theoretically exhibit the situation where there are N units of work all happening at once. I also applied some minor updates here, such as defining the parallel instantiation benchmark for multiple modules, as well as upping the limits of the pooling allocator to handle a large module (rustpython.wasm) that I threw at it.
This commit is contained in:
@@ -1,14 +1,21 @@
|
||||
use anyhow::Result;
|
||||
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
|
||||
use rayon::{prelude::*, ThreadPoolBuilder};
|
||||
use std::{path::PathBuf, process::Command};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::Command;
|
||||
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering::SeqCst};
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
use wasmtime::*;
|
||||
use wasmtime_wasi::{sync::WasiCtxBuilder, WasiCtx};
|
||||
|
||||
fn instantiate(linker: &Linker<WasiCtx>, module: &Module) -> Result<()> {
|
||||
fn store(engine: &Engine) -> Store<WasiCtx> {
|
||||
let wasi = WasiCtxBuilder::new().build();
|
||||
let mut store = Store::new(module.engine(), wasi);
|
||||
let _instance = linker.instantiate(&mut store, module)?;
|
||||
Store::new(engine, wasi)
|
||||
}
|
||||
|
||||
fn instantiate(pre: &InstancePre<WasiCtx>, engine: &Engine) -> Result<()> {
|
||||
let mut store = store(engine);
|
||||
let _instance = pre.instantiate(&mut store)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -23,84 +30,105 @@ fn benchmark_name<'a>(strategy: &InstanceAllocationStrategy) -> &'static str {
|
||||
}
|
||||
}
|
||||
|
||||
fn bench_sequential(c: &mut Criterion, modules: &[&str]) {
|
||||
fn bench_sequential(c: &mut Criterion, path: &Path) {
|
||||
let mut group = c.benchmark_group("sequential");
|
||||
|
||||
for strategy in &[
|
||||
// Skip the on-demand allocator when uffd is enabled
|
||||
#[cfg(any(not(feature = "uffd"), not(target_os = "linux")))]
|
||||
InstanceAllocationStrategy::OnDemand,
|
||||
InstanceAllocationStrategy::pooling(),
|
||||
] {
|
||||
for file_name in modules {
|
||||
let mut path = PathBuf::new();
|
||||
path.push("benches");
|
||||
path.push("instantiation");
|
||||
path.push(file_name);
|
||||
for strategy in strategies() {
|
||||
let mut config = Config::default();
|
||||
config.allocation_strategy(strategy.clone());
|
||||
|
||||
let mut config = Config::default();
|
||||
config.allocation_strategy(strategy.clone());
|
||||
let engine = Engine::new(&config).expect("failed to create engine");
|
||||
let module = Module::from_file(&engine, path)
|
||||
.unwrap_or_else(|e| panic!("failed to load benchmark `{}`: {:?}", path.display(), e));
|
||||
let mut linker = Linker::new(&engine);
|
||||
wasmtime_wasi::add_to_linker(&mut linker, |cx| cx).unwrap();
|
||||
let pre = linker
|
||||
.instantiate_pre(&mut store(&engine), &module)
|
||||
.expect("failed to pre-instantiate");
|
||||
|
||||
let engine = Engine::new(&config).expect("failed to create engine");
|
||||
let module = Module::from_file(&engine, &path)
|
||||
.unwrap_or_else(|_| panic!("failed to load benchmark `{}`", path.display()));
|
||||
let mut linker = Linker::new(&engine);
|
||||
wasmtime_wasi::add_to_linker(&mut linker, |cx| cx).unwrap();
|
||||
|
||||
group.bench_function(BenchmarkId::new(benchmark_name(strategy), file_name), |b| {
|
||||
b.iter(|| instantiate(&linker, &module).expect("failed to instantiate module"));
|
||||
});
|
||||
}
|
||||
group.bench_function(
|
||||
BenchmarkId::new(
|
||||
benchmark_name(&strategy),
|
||||
path.file_name().unwrap().to_str().unwrap(),
|
||||
),
|
||||
|b| {
|
||||
b.iter(|| instantiate(&pre, &engine).expect("failed to instantiate module"));
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
fn bench_parallel(c: &mut Criterion) {
|
||||
const PARALLEL_INSTANCES: usize = 1000;
|
||||
|
||||
fn bench_parallel(c: &mut Criterion, path: &Path) {
|
||||
let mut group = c.benchmark_group("parallel");
|
||||
|
||||
for strategy in &[
|
||||
// Skip the on-demand allocator when uffd is enabled
|
||||
#[cfg(any(not(feature = "uffd"), not(target_os = "linux")))]
|
||||
InstanceAllocationStrategy::OnDemand,
|
||||
InstanceAllocationStrategy::pooling(),
|
||||
] {
|
||||
for strategy in strategies() {
|
||||
let mut config = Config::default();
|
||||
config.allocation_strategy(strategy.clone());
|
||||
|
||||
let engine = Engine::new(&config).expect("failed to create engine");
|
||||
let module = Module::from_file(&engine, "benches/instantiation/wasi.wasm")
|
||||
.expect("failed to load WASI example module");
|
||||
let module = Module::from_file(&engine, path).expect("failed to load WASI example module");
|
||||
let mut linker = Linker::new(&engine);
|
||||
wasmtime_wasi::add_to_linker(&mut linker, |cx| cx).unwrap();
|
||||
let pre = Arc::new(
|
||||
linker
|
||||
.instantiate_pre(&mut store(&engine), &module)
|
||||
.expect("failed to pre-instantiate"),
|
||||
);
|
||||
|
||||
for threads in 1..=num_cpus::get_physical() {
|
||||
let pool = ThreadPoolBuilder::new()
|
||||
.num_threads(threads)
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
group.bench_function(
|
||||
BenchmarkId::new(
|
||||
benchmark_name(strategy),
|
||||
benchmark_name(&strategy),
|
||||
format!(
|
||||
"{} instances with {} thread{}",
|
||||
PARALLEL_INSTANCES,
|
||||
"{}: with {} background thread{}",
|
||||
path.file_name().unwrap().to_str().unwrap(),
|
||||
threads,
|
||||
if threads == 1 { "" } else { "s" }
|
||||
),
|
||||
),
|
||||
|b| {
|
||||
b.iter(|| {
|
||||
pool.install(|| {
|
||||
(0..PARALLEL_INSTANCES).into_par_iter().for_each(|_| {
|
||||
instantiate(&linker, &module)
|
||||
.expect("failed to instantiate module");
|
||||
// Spin up N-1 threads doing background instantiations to
|
||||
// simulate concurrent instantiations.
|
||||
let done = Arc::new(AtomicBool::new(false));
|
||||
let count = Arc::new(AtomicUsize::new(0));
|
||||
let workers = (0..threads - 1)
|
||||
.map(|_| {
|
||||
let pre = pre.clone();
|
||||
let done = done.clone();
|
||||
let engine = engine.clone();
|
||||
let count = count.clone();
|
||||
thread::spawn(move || {
|
||||
count.fetch_add(1, SeqCst);
|
||||
while !done.load(SeqCst) {
|
||||
instantiate(&pre, &engine).unwrap();
|
||||
}
|
||||
})
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
// Wait for our workers to all get started and have
|
||||
// instantiated their first module, at which point they'll
|
||||
// all be spinning.
|
||||
while count.load(SeqCst) != threads - 1 {
|
||||
thread::yield_now();
|
||||
}
|
||||
|
||||
// Now that our background work is configured we can
|
||||
// benchmark the amount of time it takes to instantiate this
|
||||
// module.
|
||||
b.iter(|| {
|
||||
instantiate(&pre, &engine).expect("failed to instantiate module");
|
||||
});
|
||||
|
||||
// Shut down this benchmark iteration by signalling to
|
||||
// worker threads they should exit and then wait for them to
|
||||
// have reached the exit point.
|
||||
done.store(true, SeqCst);
|
||||
for t in workers {
|
||||
t.join().unwrap();
|
||||
}
|
||||
},
|
||||
);
|
||||
}
|
||||
@@ -138,16 +166,37 @@ fn build_wasi_example() {
|
||||
|
||||
fn bench_instantiation(c: &mut Criterion) {
|
||||
build_wasi_example();
|
||||
bench_sequential(
|
||||
c,
|
||||
&[
|
||||
"empty.wat",
|
||||
"small_memory.wat",
|
||||
"data_segments.wat",
|
||||
"wasi.wasm",
|
||||
],
|
||||
);
|
||||
bench_parallel(c);
|
||||
let modules = &[
|
||||
"empty.wat",
|
||||
"small_memory.wat",
|
||||
"data_segments.wat",
|
||||
"wasi.wasm",
|
||||
];
|
||||
for module in modules {
|
||||
let mut path = PathBuf::new();
|
||||
path.push("benches");
|
||||
path.push("instantiation");
|
||||
path.push(module);
|
||||
bench_sequential(c, &path);
|
||||
bench_parallel(c, &path);
|
||||
}
|
||||
}
|
||||
|
||||
fn strategies() -> impl Iterator<Item = InstanceAllocationStrategy> {
|
||||
std::array::IntoIter::new([
|
||||
// Skip the on-demand allocator when uffd is enabled
|
||||
#[cfg(any(not(feature = "uffd"), not(target_os = "linux")))]
|
||||
InstanceAllocationStrategy::OnDemand,
|
||||
InstanceAllocationStrategy::Pooling {
|
||||
strategy: Default::default(),
|
||||
module_limits: ModuleLimits {
|
||||
functions: 20_000,
|
||||
memory_pages: 1_000,
|
||||
..ModuleLimits::default()
|
||||
},
|
||||
instance_limits: InstanceLimits::default(),
|
||||
},
|
||||
])
|
||||
}
|
||||
|
||||
criterion_group!(benches, bench_instantiation);
|
||||
|
||||
Reference in New Issue
Block a user