Merge pull request #2946 from bytecodealliance/pch/eager_per_thread_init

expose eager thread-local resource initialization on Engine
Pat Hickey
2021-06-04 15:42:08 -07:00
committed by GitHub
5 changed files with 160 additions and 4 deletions


@@ -120,3 +120,7 @@ debug = false # FIXME(#1813)
[[bench]]
name = "instantiation"
harness = false
[[bench]]
name = "thread_eager_init"
harness = false


@@ -0,0 +1,114 @@
use criterion::{criterion_group, criterion_main, Criterion};
use std::thread;
use std::time::{Duration, Instant};
use wasmtime::*;
fn measure_execution_time(c: &mut Criterion) {
    // Baseline performance: a single measurement covers both initializing
    // thread-local resources and executing the first call.
    //
    // The other two bench functions should sum to this duration.
c.bench_function("lazy initialization at call", move |b| {
let (engine, module) = test_setup();
b.iter_custom(move |iters| {
(0..iters)
.into_iter()
.map(|_| lazy_thread_instantiate(engine.clone(), module.clone()))
.sum()
})
});
// Using Engine::tls_eager_initialize: measure how long eager
// initialization takes on a new thread.
c.bench_function("eager initialization", move |b| {
let (engine, module) = test_setup();
b.iter_custom(move |iters| {
(0..iters)
.into_iter()
.map(|_| {
let (init, _call) = eager_thread_instantiate(engine.clone(), module.clone());
init
})
.sum()
})
});
// Measure how long the first call takes on a thread after it has been
// eagerly initialized.
c.bench_function("call after eager initialization", move |b| {
let (engine, module) = test_setup();
b.iter_custom(move |iters| {
(0..iters)
.into_iter()
.map(|_| {
let (_init, call) = eager_thread_instantiate(engine.clone(), module.clone());
call
})
.sum()
})
});
}
/// Creating a store and measuring the time to perform a call works the same way
/// in both setups.
fn duration_of_call(engine: &Engine, module: &Module) -> Duration {
    let mut store = Store::new(engine, ());
    let inst = Instance::new(&mut store, module, &[]).expect("instantiate");
    let f = inst.get_func(&mut store, "f").expect("get f");
    let f = f.typed::<(), (), _>(&store).expect("type f");

    let call = Instant::now();
    f.call(&mut store, ()).expect("call f");
    call.elapsed()
}

/// When wasmtime first runs a function on a thread, it needs to initialize
/// some thread-local resources and install signal handlers. This benchmark
/// spawns a new thread, and returns the duration it took to execute the first
/// function call made on that thread.
fn lazy_thread_instantiate(engine: Engine, module: Module) -> Duration {
    thread::spawn(move || duration_of_call(&engine, &module))
        .join()
        .expect("thread joins")
}

/// This benchmark spawns a new thread and records how long it takes to eagerly
/// initialize the thread-local resources. It then creates a store and instance,
/// and records the duration of the first function call.
fn eager_thread_instantiate(engine: Engine, module: Module) -> (Duration, Duration) {
    thread::spawn(move || {
        let init_start = Instant::now();
        Engine::tls_eager_initialize().expect("eager init");
        let init_duration = init_start.elapsed();

        (init_duration, duration_of_call(&engine, &module))
    })
    .join()
    .expect("thread joins")
}

fn test_setup() -> (Engine, Module) {
    // We only expect to create one Instance at a time, with a single memory.
    let pool_count = 10;

    let mut config = Config::new();
    config.allocation_strategy(InstanceAllocationStrategy::Pooling {
        strategy: PoolingAllocationStrategy::NextAvailable,
        module_limits: ModuleLimits {
            memory_pages: 1,
            ..Default::default()
        },
        instance_limits: InstanceLimits {
            count: pool_count,
            memory_reservation_size: 1,
        },
    });
    let engine = Engine::new(&config).unwrap();

    // The module has a memory (shouldn't matter) and a single function which is a no-op.
    let module = Module::new(&engine, r#"(module (memory 1) (func (export "f")))"#).unwrap();
    (engine, module)
}

criterion_group!(benches, measure_execution_time);
criterion_main!(benches);
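
A usage note, not part of the diff: with the [[bench]] entry registered in Cargo.toml above (harness = false, since criterion supplies its own main), this benchmark can be run on its own from the repository root with cargo bench --bench thread_eager_init.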


@@ -49,8 +49,8 @@ pub use crate::memory::{Memory, RuntimeLinearMemory, RuntimeMemoryCreator};
pub use crate::mmap::Mmap;
pub use crate::table::{Table, TableElement};
pub use crate::traphandlers::{
catch_traps, init_traps, raise_lib_trap, raise_user_trap, resume_panic, SignalHandler,
TlsRestore, Trap,
catch_traps, init_traps, raise_lib_trap, raise_user_trap, resume_panic, tls_eager_initialize,
SignalHandler, TlsRestore, Trap,
};
pub use crate::vmcontext::{
VMCallerCheckedAnyfunc, VMContext, VMFunctionBody, VMFunctionImport, VMGlobalDefinition,


@@ -12,7 +12,7 @@ use std::sync::atomic::Ordering::SeqCst;
use std::sync::Once;
use wasmtime_environ::ir;
pub use self::tls::TlsRestore;
pub use self::tls::{tls_eager_initialize, TlsRestore};
extern "C" {
#[allow(improper_ctypes)]
@@ -386,12 +386,29 @@ mod tls {
            })
        }

        #[inline(never)]
        /// Eagerly initialize thread-local runtime functionality. This will be performed
        /// lazily by the runtime if users do not perform it eagerly.
        pub fn initialize() -> Result<(), Trap> {
            PTR.with(|p| {
                let (state, initialized) = p.get();
                if initialized {
                    return Ok(());
                }
                super::super::sys::lazy_per_thread_init()?;
                p.set((state, true));
                Ok(())
            })
        }

        #[inline(never)] // see module docs for why this is here
        pub fn get() -> Ptr {
            PTR.with(|p| p.get().0)
        }
    }

    pub use raw::initialize as tls_eager_initialize;

    /// Opaque state used to help control TLS state across stack switches for
    /// async support.
    pub struct TlsRestore(raw::Ptr);


@@ -1,5 +1,5 @@
use crate::signatures::SignatureRegistry;
use crate::Config;
use crate::{Config, Trap};
use anyhow::Result;
use std::sync::Arc;
#[cfg(feature = "cache")]
@@ -63,6 +63,27 @@ impl Engine {
        })
    }

    /// Eagerly initialize thread-local functionality shared by all [`Engine`]s.
    ///
    /// Wasmtime's implementation on some platforms may involve per-thread
    /// setup that needs to happen whenever WebAssembly is invoked. This setup
    /// can take on the order of a few hundred microseconds, whereas the
    /// overhead of calling WebAssembly is otherwise on the order of a few
    /// nanoseconds. This setup cost is paid once per OS thread. If your
    /// application is sensitive to the latencies of WebAssembly function
    /// calls, even those that happen first on a thread, then this function
    /// can be used to improve the consistency of each call into WebAssembly
    /// by explicitly front-loading the cost of the one-time setup per thread.
    ///
    /// Note that this function is not required to be called in any embedding.
    /// Wasmtime will automatically initialize thread-local state as necessary
    /// on calls into WebAssembly. This is provided for use cases where the
    /// latency of WebAssembly calls is extra-important, which is not
    /// necessarily true of all embeddings.
    pub fn tls_eager_initialize() -> Result<(), Trap> {
        wasmtime_runtime::tls_eager_initialize().map_err(Trap::from_runtime)
    }

    /// Returns the configuration settings that this engine is using.
    #[inline]
    pub fn config(&self) -> &Config {
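
As a usage sketch, not part of the diff: an embedding that keeps long-lived worker threads could call the new Engine::tls_eager_initialize on each thread right after spawning it, so that the first WebAssembly call on that thread no longer pays the one-time setup cost. The worker count and no-op module below are made up for illustration; the API calls mirror the benchmark above.

use std::thread;
use wasmtime::*;

fn main() {
    let engine = Engine::default();
    let module = Module::new(&engine, r#"(module (func (export "f")))"#).expect("compile");

    let workers: Vec<_> = (0..4)
        .map(|_| {
            let (engine, module) = (engine.clone(), module.clone());
            thread::spawn(move || {
                // Front-load the per-thread setup before any wasm runs on this thread.
                Engine::tls_eager_initialize().expect("eager init");

                let mut store = Store::new(&engine, ());
                let instance = Instance::new(&mut store, &module, &[]).expect("instantiate");
                let f = instance.get_func(&mut store, "f").expect("get f");
                let f = f.typed::<(), (), _>(&store).expect("type f");
                // This first call no longer includes thread-local initialization.
                f.call(&mut store, ()).expect("call f");
            })
        })
        .collect();

    for w in workers {
        w.join().expect("worker thread");
    }
}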