Merge pull request #2946 from bytecodealliance/pch/eager_per_thread_init
expose eager thread-local resource initialization on Engine
This commit is contained in:
@@ -120,3 +120,7 @@ debug = false # FIXME(#1813)
|
|||||||
[[bench]]
|
[[bench]]
|
||||||
name = "instantiation"
|
name = "instantiation"
|
||||||
harness = false
|
harness = false
|
||||||
|
|
||||||
|
[[bench]]
|
||||||
|
name = "thread_eager_init"
|
||||||
|
harness = false
|
||||||
|
|||||||
114
benches/thread_eager_init.rs
Normal file
114
benches/thread_eager_init.rs
Normal file
@@ -0,0 +1,114 @@
|
|||||||
|
use criterion::{criterion_group, criterion_main, Criterion};
|
||||||
|
use std::thread;
|
||||||
|
use std::time::{Duration, Instant};
|
||||||
|
use wasmtime::*;
|
||||||
|
|
||||||
|
fn measure_execution_time(c: &mut Criterion) {
|
||||||
|
// Baseline performance: a single measurement covers both initializing
|
||||||
|
// thread local resources and executing the first call.
|
||||||
|
//
|
||||||
|
// The other two bench functions should sum to this duration.
|
||||||
|
c.bench_function("lazy initialization at call", move |b| {
|
||||||
|
let (engine, module) = test_setup();
|
||||||
|
b.iter_custom(move |iters| {
|
||||||
|
(0..iters)
|
||||||
|
.into_iter()
|
||||||
|
.map(|_| lazy_thread_instantiate(engine.clone(), module.clone()))
|
||||||
|
.sum()
|
||||||
|
})
|
||||||
|
});
|
||||||
|
|
||||||
|
// Using Engine::tls_eager_initialize: measure how long eager
|
||||||
|
// initialization takes on a new thread.
|
||||||
|
c.bench_function("eager initialization", move |b| {
|
||||||
|
let (engine, module) = test_setup();
|
||||||
|
b.iter_custom(move |iters| {
|
||||||
|
(0..iters)
|
||||||
|
.into_iter()
|
||||||
|
.map(|_| {
|
||||||
|
let (init, _call) = eager_thread_instantiate(engine.clone(), module.clone());
|
||||||
|
init
|
||||||
|
})
|
||||||
|
.sum()
|
||||||
|
})
|
||||||
|
});
|
||||||
|
|
||||||
|
// Measure how long the first call takes on a thread after it has been
|
||||||
|
// eagerly initialized.
|
||||||
|
c.bench_function("call after eager initialization", move |b| {
|
||||||
|
let (engine, module) = test_setup();
|
||||||
|
b.iter_custom(move |iters| {
|
||||||
|
(0..iters)
|
||||||
|
.into_iter()
|
||||||
|
.map(|_| {
|
||||||
|
let (_init, call) = eager_thread_instantiate(engine.clone(), module.clone());
|
||||||
|
call
|
||||||
|
})
|
||||||
|
.sum()
|
||||||
|
})
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Creating a store and measuring the time to perform a call is the same behavior
|
||||||
|
/// in both setups.
|
||||||
|
fn duration_of_call(engine: &Engine, module: &Module) -> Duration {
|
||||||
|
let mut store = Store::new(engine, ());
|
||||||
|
let inst = Instance::new(&mut store, module, &[]).expect("instantiate");
|
||||||
|
let f = inst.get_func(&mut store, "f").expect("get f");
|
||||||
|
let f = f.typed::<(), (), _>(&store).expect("type f");
|
||||||
|
|
||||||
|
let call = Instant::now();
|
||||||
|
f.call(&mut store, ()).expect("call f");
|
||||||
|
call.elapsed()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// When wasmtime first runs a function on a thread, it needs to initialize
|
||||||
|
/// some thread-local resources and install signal handlers. This benchmark
|
||||||
|
/// spawns a new thread, and returns the duration it took to execute the first
|
||||||
|
/// function call made on that thread.
|
||||||
|
fn lazy_thread_instantiate(engine: Engine, module: Module) -> Duration {
|
||||||
|
thread::spawn(move || duration_of_call(&engine, &module))
|
||||||
|
.join()
|
||||||
|
.expect("thread joins")
|
||||||
|
}
|
||||||
|
/// This benchmark spawns a new thread, and records the duration to eagerly
|
||||||
|
/// initialize the thread local resources. It then creates a store and
|
||||||
|
/// instance, and records the duration it took to execute the first function
|
||||||
|
/// call.
|
||||||
|
fn eager_thread_instantiate(engine: Engine, module: Module) -> (Duration, Duration) {
|
||||||
|
thread::spawn(move || {
|
||||||
|
let init_start = Instant::now();
|
||||||
|
Engine::tls_eager_initialize().expect("eager init");
|
||||||
|
let init_duration = init_start.elapsed();
|
||||||
|
|
||||||
|
(init_duration, duration_of_call(&engine, &module))
|
||||||
|
})
|
||||||
|
.join()
|
||||||
|
.expect("thread joins")
|
||||||
|
}
|
||||||
|
|
||||||
|
fn test_setup() -> (Engine, Module) {
|
||||||
|
// We only expect to create one Instance at a time, with a single memory.
|
||||||
|
let pool_count = 10;
|
||||||
|
|
||||||
|
let mut config = Config::new();
|
||||||
|
config.allocation_strategy(InstanceAllocationStrategy::Pooling {
|
||||||
|
strategy: PoolingAllocationStrategy::NextAvailable,
|
||||||
|
module_limits: ModuleLimits {
|
||||||
|
memory_pages: 1,
|
||||||
|
..Default::default()
|
||||||
|
},
|
||||||
|
instance_limits: InstanceLimits {
|
||||||
|
count: pool_count,
|
||||||
|
memory_reservation_size: 1,
|
||||||
|
},
|
||||||
|
});
|
||||||
|
let engine = Engine::new(&config).unwrap();
|
||||||
|
|
||||||
|
// The module has a memory (shouldn't matter) and a single function which is a no-op.
|
||||||
|
let module = Module::new(&engine, r#"(module (memory 1) (func (export "f")))"#).unwrap();
|
||||||
|
(engine, module)
|
||||||
|
}
|
||||||
|
|
||||||
|
criterion_group!(benches, measure_execution_time);
|
||||||
|
criterion_main!(benches);
|
||||||
@@ -49,8 +49,8 @@ pub use crate::memory::{Memory, RuntimeLinearMemory, RuntimeMemoryCreator};
|
|||||||
pub use crate::mmap::Mmap;
|
pub use crate::mmap::Mmap;
|
||||||
pub use crate::table::{Table, TableElement};
|
pub use crate::table::{Table, TableElement};
|
||||||
pub use crate::traphandlers::{
|
pub use crate::traphandlers::{
|
||||||
catch_traps, init_traps, raise_lib_trap, raise_user_trap, resume_panic, SignalHandler,
|
catch_traps, init_traps, raise_lib_trap, raise_user_trap, resume_panic, tls_eager_initialize,
|
||||||
TlsRestore, Trap,
|
SignalHandler, TlsRestore, Trap,
|
||||||
};
|
};
|
||||||
pub use crate::vmcontext::{
|
pub use crate::vmcontext::{
|
||||||
VMCallerCheckedAnyfunc, VMContext, VMFunctionBody, VMFunctionImport, VMGlobalDefinition,
|
VMCallerCheckedAnyfunc, VMContext, VMFunctionBody, VMFunctionImport, VMGlobalDefinition,
|
||||||
|
|||||||
@@ -12,7 +12,7 @@ use std::sync::atomic::Ordering::SeqCst;
|
|||||||
use std::sync::Once;
|
use std::sync::Once;
|
||||||
use wasmtime_environ::ir;
|
use wasmtime_environ::ir;
|
||||||
|
|
||||||
pub use self::tls::TlsRestore;
|
pub use self::tls::{tls_eager_initialize, TlsRestore};
|
||||||
|
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#[allow(improper_ctypes)]
|
#[allow(improper_ctypes)]
|
||||||
@@ -386,12 +386,29 @@ mod tls {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[inline(never)]
|
||||||
|
/// Eagerly initialize thread-local runtime functionality. This will be performed
|
||||||
|
/// lazily by the runtime if users do not perform it eagerly.
|
||||||
|
pub fn initialize() -> Result<(), Trap> {
|
||||||
|
PTR.with(|p| {
|
||||||
|
let (state, initialized) = p.get();
|
||||||
|
if initialized {
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
super::super::sys::lazy_per_thread_init()?;
|
||||||
|
p.set((state, true));
|
||||||
|
Ok(())
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
#[inline(never)] // see module docs for why this is here
|
#[inline(never)] // see module docs for why this is here
|
||||||
pub fn get() -> Ptr {
|
pub fn get() -> Ptr {
|
||||||
PTR.with(|p| p.get().0)
|
PTR.with(|p| p.get().0)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub use raw::initialize as tls_eager_initialize;
|
||||||
|
|
||||||
/// Opaque state used to help control TLS state across stack switches for
|
/// Opaque state used to help control TLS state across stack switches for
|
||||||
/// async support.
|
/// async support.
|
||||||
pub struct TlsRestore(raw::Ptr);
|
pub struct TlsRestore(raw::Ptr);
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
use crate::signatures::SignatureRegistry;
|
use crate::signatures::SignatureRegistry;
|
||||||
use crate::Config;
|
use crate::{Config, Trap};
|
||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
#[cfg(feature = "cache")]
|
#[cfg(feature = "cache")]
|
||||||
@@ -63,6 +63,27 @@ impl Engine {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Eagerly initialize thread-local functionality shared by all [`Engine`]s.
|
||||||
|
///
|
||||||
|
/// Wasmtime's implementation on some platforms may involve per-thread
|
||||||
|
/// setup that needs to happen whenever WebAssembly is invoked. This setup
|
||||||
|
/// can take on the order of a few hundred microseconds, whereas the
|
||||||
|
/// overhead of calling WebAssembly is otherwise on the order of a few
|
||||||
|
/// nanoseconds. This setup cost is paid once per-OS-thread. If your
|
||||||
|
/// application is sensitive to the latencies of WebAssembly function
|
||||||
|
/// calls, even those that happen first on a thread, then this function
|
||||||
|
/// can be used to improve the consistency of each call into WebAssembly
|
||||||
|
/// by explicitly frontloading the cost of the one-time setup per-thread.
|
||||||
|
///
|
||||||
|
/// Note that this function is not required to be called in any embedding.
|
||||||
|
/// Wasmtime will automatically initialize thread-local-state as necessary
|
||||||
|
/// on calls into WebAssembly. This is provided for use cases where the
|
||||||
|
/// latency of WebAssembly calls is extra-important, which is not
|
||||||
|
/// necessarily true of all embeddings.
|
||||||
|
pub fn tls_eager_initialize() -> Result<(), Trap> {
|
||||||
|
wasmtime_runtime::tls_eager_initialize().map_err(Trap::from_runtime)
|
||||||
|
}
|
||||||
|
|
||||||
/// Returns the configuration settings that this engine is using.
|
/// Returns the configuration settings that this engine is using.
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn config(&self) -> &Config {
|
pub fn config(&self) -> &Config {
|
||||||
|
|||||||
Reference in New Issue
Block a user