Merge pull request #2946 from bytecodealliance/pch/eager_per_thread_init
expose eager thread-local resource initialization on Engine
This commit is contained in:
@@ -120,3 +120,7 @@ debug = false # FIXME(#1813)
|
||||
[[bench]]
|
||||
name = "instantiation"
|
||||
harness = false
|
||||
|
||||
[[bench]]
|
||||
name = "thread_eager_init"
|
||||
harness = false
|
||||
|
||||
114
benches/thread_eager_init.rs
Normal file
114
benches/thread_eager_init.rs
Normal file
@@ -0,0 +1,114 @@
|
||||
use criterion::{criterion_group, criterion_main, Criterion};
|
||||
use std::thread;
|
||||
use std::time::{Duration, Instant};
|
||||
use wasmtime::*;
|
||||
|
||||
fn measure_execution_time(c: &mut Criterion) {
|
||||
// Baseline performance: a single measurment covers both initializing
|
||||
// thread local resources and executing the first call.
|
||||
//
|
||||
// The other two bench functions should sum to this duration.
|
||||
c.bench_function("lazy initialization at call", move |b| {
|
||||
let (engine, module) = test_setup();
|
||||
b.iter_custom(move |iters| {
|
||||
(0..iters)
|
||||
.into_iter()
|
||||
.map(|_| lazy_thread_instantiate(engine.clone(), module.clone()))
|
||||
.sum()
|
||||
})
|
||||
});
|
||||
|
||||
// Using Engine::tls_eager_initialize: measure how long eager
|
||||
// initialization takes on a new thread.
|
||||
c.bench_function("eager initialization", move |b| {
|
||||
let (engine, module) = test_setup();
|
||||
b.iter_custom(move |iters| {
|
||||
(0..iters)
|
||||
.into_iter()
|
||||
.map(|_| {
|
||||
let (init, _call) = eager_thread_instantiate(engine.clone(), module.clone());
|
||||
init
|
||||
})
|
||||
.sum()
|
||||
})
|
||||
});
|
||||
|
||||
// Measure how long the first call takes on a thread after it has been
|
||||
// eagerly initialized.
|
||||
c.bench_function("call after eager initialization", move |b| {
|
||||
let (engine, module) = test_setup();
|
||||
b.iter_custom(move |iters| {
|
||||
(0..iters)
|
||||
.into_iter()
|
||||
.map(|_| {
|
||||
let (_init, call) = eager_thread_instantiate(engine.clone(), module.clone());
|
||||
call
|
||||
})
|
||||
.sum()
|
||||
})
|
||||
});
|
||||
}
|
||||
|
||||
/// Creating a store and measuring the time to perform a call is the same behavior
|
||||
/// in both setups.
|
||||
fn duration_of_call(engine: &Engine, module: &Module) -> Duration {
|
||||
let mut store = Store::new(engine, ());
|
||||
let inst = Instance::new(&mut store, module, &[]).expect("instantiate");
|
||||
let f = inst.get_func(&mut store, "f").expect("get f");
|
||||
let f = f.typed::<(), (), _>(&store).expect("type f");
|
||||
|
||||
let call = Instant::now();
|
||||
f.call(&mut store, ()).expect("call f");
|
||||
call.elapsed()
|
||||
}
|
||||
|
||||
/// When wasmtime first runs a function on a thread, it needs to initialize
|
||||
/// some thread-local resources and install signal handlers. This benchmark
|
||||
/// spawns a new thread, and returns the duration it took to execute the first
|
||||
/// function call made on that thread.
|
||||
fn lazy_thread_instantiate(engine: Engine, module: Module) -> Duration {
|
||||
thread::spawn(move || duration_of_call(&engine, &module))
|
||||
.join()
|
||||
.expect("thread joins")
|
||||
}
|
||||
/// This benchmark spawns a new thread, and records the duration to eagerly
|
||||
/// initializes the thread local resources. It then creates a store and
|
||||
/// instance, and records the duration it took to execute the first function
|
||||
/// call.
|
||||
fn eager_thread_instantiate(engine: Engine, module: Module) -> (Duration, Duration) {
|
||||
thread::spawn(move || {
|
||||
let init_start = Instant::now();
|
||||
Engine::tls_eager_initialize().expect("eager init");
|
||||
let init_duration = init_start.elapsed();
|
||||
|
||||
(init_duration, duration_of_call(&engine, &module))
|
||||
})
|
||||
.join()
|
||||
.expect("thread joins")
|
||||
}
|
||||
|
||||
fn test_setup() -> (Engine, Module) {
|
||||
// We only expect to create one Instance at a time, with a single memory.
|
||||
let pool_count = 10;
|
||||
|
||||
let mut config = Config::new();
|
||||
config.allocation_strategy(InstanceAllocationStrategy::Pooling {
|
||||
strategy: PoolingAllocationStrategy::NextAvailable,
|
||||
module_limits: ModuleLimits {
|
||||
memory_pages: 1,
|
||||
..Default::default()
|
||||
},
|
||||
instance_limits: InstanceLimits {
|
||||
count: pool_count,
|
||||
memory_reservation_size: 1,
|
||||
},
|
||||
});
|
||||
let engine = Engine::new(&config).unwrap();
|
||||
|
||||
// The module has a memory (shouldn't matter) and a single function which is a no-op.
|
||||
let module = Module::new(&engine, r#"(module (memory 1) (func (export "f")))"#).unwrap();
|
||||
(engine, module)
|
||||
}
|
||||
|
||||
criterion_group!(benches, measure_execution_time);
|
||||
criterion_main!(benches);
|
||||
@@ -49,8 +49,8 @@ pub use crate::memory::{Memory, RuntimeLinearMemory, RuntimeMemoryCreator};
|
||||
pub use crate::mmap::Mmap;
|
||||
pub use crate::table::{Table, TableElement};
|
||||
pub use crate::traphandlers::{
|
||||
catch_traps, init_traps, raise_lib_trap, raise_user_trap, resume_panic, SignalHandler,
|
||||
TlsRestore, Trap,
|
||||
catch_traps, init_traps, raise_lib_trap, raise_user_trap, resume_panic, tls_eager_initialize,
|
||||
SignalHandler, TlsRestore, Trap,
|
||||
};
|
||||
pub use crate::vmcontext::{
|
||||
VMCallerCheckedAnyfunc, VMContext, VMFunctionBody, VMFunctionImport, VMGlobalDefinition,
|
||||
|
||||
@@ -12,7 +12,7 @@ use std::sync::atomic::Ordering::SeqCst;
|
||||
use std::sync::Once;
|
||||
use wasmtime_environ::ir;
|
||||
|
||||
pub use self::tls::TlsRestore;
|
||||
pub use self::tls::{tls_eager_initialize, TlsRestore};
|
||||
|
||||
extern "C" {
|
||||
#[allow(improper_ctypes)]
|
||||
@@ -386,12 +386,29 @@ mod tls {
|
||||
})
|
||||
}
|
||||
|
||||
#[inline(never)]
|
||||
/// Eagerly initialize thread-local runtime functionality. This will be performed
|
||||
/// lazily by the runtime if users do not perform it eagerly.
|
||||
pub fn initialize() -> Result<(), Trap> {
|
||||
PTR.with(|p| {
|
||||
let (state, initialized) = p.get();
|
||||
if initialized {
|
||||
return Ok(());
|
||||
}
|
||||
super::super::sys::lazy_per_thread_init()?;
|
||||
p.set((state, true));
|
||||
Ok(())
|
||||
})
|
||||
}
|
||||
|
||||
#[inline(never)] // see module docs for why this is here
|
||||
pub fn get() -> Ptr {
|
||||
PTR.with(|p| p.get().0)
|
||||
}
|
||||
}
|
||||
|
||||
pub use raw::initialize as tls_eager_initialize;
|
||||
|
||||
/// Opaque state used to help control TLS state across stack switches for
|
||||
/// async support.
// Newtype over a single private `raw::Ptr` field.
|
||||
pub struct TlsRestore(raw::Ptr);
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use crate::signatures::SignatureRegistry;
|
||||
use crate::Config;
|
||||
use crate::{Config, Trap};
|
||||
use anyhow::Result;
|
||||
use std::sync::Arc;
|
||||
#[cfg(feature = "cache")]
|
||||
@@ -63,6 +63,27 @@ impl Engine {
|
||||
})
|
||||
}
|
||||
|
||||
/// Eagerly initialize thread-local functionality shared by all [`Engine`]s.
|
||||
///
|
||||
/// Wasmtime's implementation on some platforms may involve per-thread
|
||||
/// setup that needs to happen whenever WebAssembly is invoked. This setup
|
||||
/// can take on the order of a few hundred microseconds, whereas the
|
||||
/// overhead of calling WebAssembly is otherwise on the order of a few
|
||||
/// nanoseconds. This setup cost is paid once per-OS-thread. If your
|
||||
/// application is sensitive to the latencies of WebAssembly function
|
||||
/// calls, even those that happen first on a thread, then this function
|
||||
/// can be used to improve the consistency of each call into WebAssembly
|
||||
/// by explicitly frontloading the cost of the one-time setup per-thread.
|
||||
///
|
||||
/// Note that this function is not required to be called in any embedding.
|
||||
/// Wasmtime will automatically initialize thread-local-state as necessary
|
||||
/// on calls into WebAssembly. This is provided for use cases where the
|
||||
/// latency of WebAssembly calls are extra-important, which is not
|
||||
/// necessarily true of all embeddings.
|
||||
pub fn tls_eager_initialize() -> Result<(), Trap> {
|
||||
wasmtime_runtime::tls_eager_initialize().map_err(Trap::from_runtime)
|
||||
}
|
||||
|
||||
/// Returns the configuration settings that this engine is using.
|
||||
#[inline]
|
||||
pub fn config(&self) -> &Config {
|
||||
|
||||
Reference in New Issue
Block a user