diff --git a/Cargo.toml b/Cargo.toml index 9b1ce39b16..712b57bb09 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -120,3 +120,7 @@ debug = false # FIXME(#1813) [[bench]] name = "instantiation" harness = false + +[[bench]] +name = "thread_eager_init" +harness = false diff --git a/benches/thread_eager_init.rs b/benches/thread_eager_init.rs new file mode 100644 index 0000000000..02f3c65f5c --- /dev/null +++ b/benches/thread_eager_init.rs @@ -0,0 +1,114 @@ +use criterion::{criterion_group, criterion_main, Criterion}; +use std::thread; +use std::time::{Duration, Instant}; +use wasmtime::*; + +fn measure_execution_time(c: &mut Criterion) { + // Baseline performance: a single measurement covers both initializing + // thread local resources and executing the first call. + // + // The other two bench functions should sum to this duration. + c.bench_function("lazy initialization at call", move |b| { + let (engine, module) = test_setup(); + b.iter_custom(move |iters| { + (0..iters) + .into_iter() + .map(|_| lazy_thread_instantiate(engine.clone(), module.clone())) + .sum() + }) + }); + + // Using Engine::tls_eager_initialize: measure how long eager + // initialization takes on a new thread. + c.bench_function("eager initialization", move |b| { + let (engine, module) = test_setup(); + b.iter_custom(move |iters| { + (0..iters) + .into_iter() + .map(|_| { + let (init, _call) = eager_thread_instantiate(engine.clone(), module.clone()); + init + }) + .sum() + }) + }); + + // Measure how long the first call takes on a thread after it has been + // eagerly initialized. + c.bench_function("call after eager initialization", move |b| { + let (engine, module) = test_setup(); + b.iter_custom(move |iters| { + (0..iters) + .into_iter() + .map(|_| { + let (_init, call) = eager_thread_instantiate(engine.clone(), module.clone()); + call + }) + .sum() + }) + }); +} + +/// Creating a store and measuring the time to perform a call is the same behavior +/// in both setups. 
+fn duration_of_call(engine: &Engine, module: &Module) -> Duration { + let mut store = Store::new(engine, ()); + let inst = Instance::new(&mut store, module, &[]).expect("instantiate"); + let f = inst.get_func(&mut store, "f").expect("get f"); + let f = f.typed::<(), (), _>(&store).expect("type f"); + + let call = Instant::now(); + f.call(&mut store, ()).expect("call f"); + call.elapsed() +} + +/// When wasmtime first runs a function on a thread, it needs to initialize +/// some thread-local resources and install signal handlers. This benchmark +/// spawns a new thread, and returns the duration it took to execute the first +/// function call made on that thread. +fn lazy_thread_instantiate(engine: Engine, module: Module) -> Duration { + thread::spawn(move || duration_of_call(&engine, &module)) + .join() + .expect("thread joins") +} +/// This benchmark spawns a new thread, and records the duration to eagerly +/// initialize the thread local resources. It then creates a store and +/// instance, and records the duration it took to execute the first function +/// call. +fn eager_thread_instantiate(engine: Engine, module: Module) -> (Duration, Duration) { + thread::spawn(move || { + let init_start = Instant::now(); + Engine::tls_eager_initialize().expect("eager init"); + let init_duration = init_start.elapsed(); + + (init_duration, duration_of_call(&engine, &module)) + }) + .join() + .expect("thread joins") +} + +fn test_setup() -> (Engine, Module) { + // We only expect to create one Instance at a time, with a single memory. 
+ let pool_count = 10; + + let mut config = Config::new(); + config.allocation_strategy(InstanceAllocationStrategy::Pooling { + strategy: PoolingAllocationStrategy::NextAvailable, + module_limits: ModuleLimits { + memory_pages: 1, + ..Default::default() + }, + instance_limits: InstanceLimits { + count: pool_count, + memory_reservation_size: 1, + }, + }); + let engine = Engine::new(&config).unwrap(); + + // The module has a memory (shouldn't matter) and a single function which is a no-op. + let module = Module::new(&engine, r#"(module (memory 1) (func (export "f")))"#).unwrap(); + (engine, module) +} + +criterion_group!(benches, measure_execution_time); +criterion_main!(benches); diff --git a/crates/runtime/src/lib.rs b/crates/runtime/src/lib.rs index 6e5d4f6fb3..2c7aa1f584 100644 --- a/crates/runtime/src/lib.rs +++ b/crates/runtime/src/lib.rs @@ -49,8 +49,8 @@ pub use crate::memory::{Memory, RuntimeLinearMemory, RuntimeMemoryCreator}; pub use crate::mmap::Mmap; pub use crate::table::{Table, TableElement}; pub use crate::traphandlers::{ - catch_traps, init_traps, raise_lib_trap, raise_user_trap, resume_panic, SignalHandler, - TlsRestore, Trap, + catch_traps, init_traps, raise_lib_trap, raise_user_trap, resume_panic, tls_eager_initialize, + SignalHandler, TlsRestore, Trap, }; pub use crate::vmcontext::{ VMCallerCheckedAnyfunc, VMContext, VMFunctionBody, VMFunctionImport, VMGlobalDefinition, diff --git a/crates/runtime/src/traphandlers.rs b/crates/runtime/src/traphandlers.rs index 7332ae0a7e..ba9de7c3d4 100644 --- a/crates/runtime/src/traphandlers.rs +++ b/crates/runtime/src/traphandlers.rs @@ -12,7 +12,7 @@ use std::sync::atomic::Ordering::SeqCst; use std::sync::Once; use wasmtime_environ::ir; -pub use self::tls::TlsRestore; +pub use self::tls::{tls_eager_initialize, TlsRestore}; extern "C" { #[allow(improper_ctypes)] @@ -386,12 +386,29 @@ mod tls { }) } + #[inline(never)] + /// Eagerly initialize thread-local runtime functionality. 
This will be performed + /// lazily by the runtime if users do not perform it eagerly. + pub fn initialize() -> Result<(), Trap> { + PTR.with(|p| { + let (state, initialized) = p.get(); + if initialized { + return Ok(()); + } + super::super::sys::lazy_per_thread_init()?; + p.set((state, true)); + Ok(()) + }) + } + #[inline(never)] // see module docs for why this is here pub fn get() -> Ptr { PTR.with(|p| p.get().0) } } + pub use raw::initialize as tls_eager_initialize; + /// Opaque state used to help control TLS state across stack switches for /// async support. pub struct TlsRestore(raw::Ptr); diff --git a/crates/wasmtime/src/engine.rs b/crates/wasmtime/src/engine.rs index bce82a8a9f..f0c615fb2d 100644 --- a/crates/wasmtime/src/engine.rs +++ b/crates/wasmtime/src/engine.rs @@ -1,5 +1,5 @@ use crate::signatures::SignatureRegistry; -use crate::Config; +use crate::{Config, Trap}; use anyhow::Result; use std::sync::Arc; #[cfg(feature = "cache")] @@ -63,6 +63,27 @@ impl Engine { }) } + /// Eagerly initialize thread-local functionality shared by all [`Engine`]s. + /// + /// Wasmtime's implementation on some platforms may involve per-thread + /// setup that needs to happen whenever WebAssembly is invoked. This setup + /// can take on the order of a few hundred microseconds, whereas the + /// overhead of calling WebAssembly is otherwise on the order of a few + /// nanoseconds. This setup cost is paid once per-OS-thread. If your + /// application is sensitive to the latencies of WebAssembly function + /// calls, even those that happen first on a thread, then this function + /// can be used to improve the consistency of each call into WebAssembly + /// by explicitly frontloading the cost of the one-time setup per-thread. + /// + /// Note that this function is not required to be called in any embedding. + /// Wasmtime will automatically initialize thread-local state as necessary + /// on calls into WebAssembly. 
This is provided for use cases where the + /// latency of WebAssembly calls is extra-important, which is not + /// necessarily true of all embeddings. + pub fn tls_eager_initialize() -> Result<(), Trap> { + wasmtime_runtime::tls_eager_initialize().map_err(Trap::from_runtime) + } + /// Returns the configuration settings that this engine is using. #[inline] pub fn config(&self) -> &Config {