From ff87f456049bca81976cf2edb3d192c4c70645c1 Mon Sep 17 00:00:00 2001 From: Pat Hickey Date: Thu, 27 May 2021 15:50:06 -0700 Subject: [PATCH 1/7] expose eager thread-local initialization by the Engine --- crates/runtime/src/lib.rs | 4 ++-- crates/runtime/src/traphandlers.rs | 19 ++++++++++++++++++- crates/wasmtime/src/engine.rs | 8 +++++++- 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/crates/runtime/src/lib.rs b/crates/runtime/src/lib.rs index 6e5d4f6fb3..2c7aa1f584 100644 --- a/crates/runtime/src/lib.rs +++ b/crates/runtime/src/lib.rs @@ -49,8 +49,8 @@ pub use crate::memory::{Memory, RuntimeLinearMemory, RuntimeMemoryCreator}; pub use crate::mmap::Mmap; pub use crate::table::{Table, TableElement}; pub use crate::traphandlers::{ - catch_traps, init_traps, raise_lib_trap, raise_user_trap, resume_panic, SignalHandler, - TlsRestore, Trap, + catch_traps, init_traps, raise_lib_trap, raise_user_trap, resume_panic, tls_eager_initialize, + SignalHandler, TlsRestore, Trap, }; pub use crate::vmcontext::{ VMCallerCheckedAnyfunc, VMContext, VMFunctionBody, VMFunctionImport, VMGlobalDefinition, diff --git a/crates/runtime/src/traphandlers.rs b/crates/runtime/src/traphandlers.rs index 7332ae0a7e..142d36133b 100644 --- a/crates/runtime/src/traphandlers.rs +++ b/crates/runtime/src/traphandlers.rs @@ -12,7 +12,7 @@ use std::sync::atomic::Ordering::SeqCst; use std::sync::Once; use wasmtime_environ::ir; -pub use self::tls::TlsRestore; +pub use self::tls::{tls_eager_initialize, TlsRestore}; extern "C" { #[allow(improper_ctypes)] @@ -386,12 +386,29 @@ mod tls { }) } + #[inline(never)] + /// Eagerly initialize thread-local runtime functionality. This will be performed + /// lazily by the runtime if users do not perform it eagerly. + pub fn initialize() -> Result<(), Trap> { + PTR.with(|p| { + let (state, mut initialized) = p.get(); + if !initialized { + super::super::sys::lazy_per_thread_init()?; + initialized = true; + } + p.set((state, initialized)); + Ok(()) + }) + } + #[inline(never)] // see module docs for why this is here pub fn get() -> Ptr { PTR.with(|p| p.get().0) } } + pub use raw::initialize as tls_eager_initialize; + /// Opaque state used to help control TLS state across stack switches for /// async support. pub struct TlsRestore(raw::Ptr); diff --git a/crates/wasmtime/src/engine.rs b/crates/wasmtime/src/engine.rs index bce82a8a9f..c367e9425a 100644 --- a/crates/wasmtime/src/engine.rs +++ b/crates/wasmtime/src/engine.rs @@ -1,5 +1,5 @@ use crate::signatures::SignatureRegistry; -use crate::Config; +use crate::{Config, Trap}; use anyhow::Result; use std::sync::Arc; #[cfg(feature = "cache")] @@ -63,6 +63,12 @@ impl Engine { }) } + /// Eagerly initialize thread-local functionality shared by all [`Engine`]s. This + /// will be performed lazily by the runtime if users do not perform it eagerly. + pub fn tls_eager_initialize() -> Result<(), Trap> { + wasmtime_runtime::tls_eager_initialize().map_err(Trap::from_runtime) + } + /// Returns the configuration settings that this engine is using. #[inline] pub fn config(&self) -> &Config { From 1136917192790fd8dfcbf038e52f17bfa465cefd Mon Sep 17 00:00:00 2001 From: Pat Hickey Date: Fri, 4 Jun 2021 10:43:07 -0700 Subject: [PATCH 2/7] golf --- crates/runtime/src/traphandlers.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/crates/runtime/src/traphandlers.rs b/crates/runtime/src/traphandlers.rs index 142d36133b..ba9de7c3d4 100644 --- a/crates/runtime/src/traphandlers.rs +++ b/crates/runtime/src/traphandlers.rs @@ -391,12 +391,12 @@ mod tls { /// lazily by the runtime if users do not perform it eagerly. pub fn initialize() -> Result<(), Trap> { PTR.with(|p| { - let (state, mut initialized) = p.get(); - if !initialized { - super::super::sys::lazy_per_thread_init()?; - initialized = true; + let (state, initialized) = p.get(); + if initialized { + return Ok(()); } - p.set((state, initialized)); + super::super::sys::lazy_per_thread_init()?; + p.set((state, true)); Ok(()) }) } From bf1d1a4e7a1183269f92ad1a286beb94b9598245 Mon Sep 17 00:00:00 2001 From: Pat Hickey Date: Fri, 4 Jun 2021 11:22:23 -0700 Subject: [PATCH 3/7] add test for eager thread initialization --- tests/all/main.rs | 1 + tests/all/thread_eager_init.rs | 72 ++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+) create mode 100644 tests/all/thread_eager_init.rs diff --git a/tests/all/main.rs b/tests/all/main.rs index 37bb68d7a6..a1f3314d64 100644 --- a/tests/all/main.rs +++ b/tests/all/main.rs @@ -24,6 +24,7 @@ mod pooling_allocator; mod stack_overflow; mod store; mod table; +mod thread_eager_init; mod traps; mod wast; diff --git a/tests/all/thread_eager_init.rs b/tests/all/thread_eager_init.rs new file mode 100644 index 0000000000..16f23fb4fe --- /dev/null +++ b/tests/all/thread_eager_init.rs @@ -0,0 +1,72 @@ +use anyhow::Result; +use std::thread; +use std::time::{Duration, Instant}; +use wasmtime::*; + +#[test] +fn measure_execution_time() -> Result<()> { + let iterations = 1000; + + let mut config = Config::new(); + config.allocation_strategy(InstanceAllocationStrategy::Pooling { + strategy: PoolingAllocationStrategy::NextAvailable, + module_limits: ModuleLimits { + memory_pages: 1, + table_elements: 10, + ..Default::default() + }, + instance_limits: InstanceLimits { + count: iterations * 2, + memory_reservation_size: 1, + }, + }); + + let engine = Engine::new(&config)?; + let module = Module::new(&engine, r#"(module (memory 1) (table 10 funcref))"#)?; + + let lazy_total_time: Duration = (0..iterations) + .into_iter() + .map(|_| lazy_thread_instantiate(engine.clone(), module.clone())) + .sum(); + + let (eager_init_total, eager_inst_total): (Duration, Duration) = (0..iterations) + .into_iter() + .map(|_| eager_thread_instantiate(engine.clone(), module.clone())) + .fold( + (Duration::default(), Duration::default()), + |(s1, s2), (d1, d2)| (s1 + d1, s2 + d2), + ); + + println!( + "lazy total: {:?}, eager init: {:?}, eager inst: {:?}", + lazy_total_time, eager_init_total, eager_inst_total + ); + + Ok(()) +} + +fn lazy_thread_instantiate(engine: Engine, module: Module) -> Duration { + thread::spawn(move || { + let mut store = Store::new(&engine, ()); + let start = Instant::now(); + Instance::new(&mut store, &module, &[]).expect("instantiate"); + start.elapsed() + }) + .join() + .expect("thread joins") +} + +fn eager_thread_instantiate(engine: Engine, module: Module) -> (Duration, Duration) { + thread::spawn(move || { + let init_start = Instant::now(); + Engine::tls_eager_initialize().expect("eager init"); + let init_duration = init_start.elapsed(); + + let mut store = Store::new(&engine, ()); + let inst_start = Instant::now(); + Instance::new(&mut store, &module, &[]).expect("instantiate"); + (init_duration, inst_start.elapsed()) + }) + .join() + .expect("thread joins") +} From 0a96b6b60aba3251852757fa9a9ab405d5afae4a Mon Sep 17 00:00:00 2001 From: Pat Hickey Date: Fri, 4 Jun 2021 11:35:30 -0700 Subject: [PATCH 4/7] overhead is on calls, not instantiation --- tests/all/thread_eager_init.rs | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/tests/all/thread_eager_init.rs b/tests/all/thread_eager_init.rs index 16f23fb4fe..63cee33e6d 100644 --- a/tests/all/thread_eager_init.rs +++ b/tests/all/thread_eager_init.rs @@ -12,7 +12,6 @@ fn measure_execution_time() -> Result<()> { strategy: PoolingAllocationStrategy::NextAvailable, module_limits: ModuleLimits { memory_pages: 1, - table_elements: 10, ..Default::default() }, instance_limits: InstanceLimits { @@ -22,14 +21,14 @@ fn measure_execution_time() -> Result<()> { }); let engine = Engine::new(&config)?; - let module = Module::new(&engine, r#"(module (memory 1) (table 10 funcref))"#)?; + let module = Module::new(&engine, r#"(module (memory 1) (func (export "f")))"#)?; - let lazy_total_time: Duration = (0..iterations) + let lazy_call_time: Duration = (0..iterations) .into_iter() .map(|_| lazy_thread_instantiate(engine.clone(), module.clone())) .sum(); - let (eager_init_total, eager_inst_total): (Duration, Duration) = (0..iterations) + let (eager_init_total, eager_call_total): (Duration, Duration) = (0..iterations) .into_iter() .map(|_| eager_thread_instantiate(engine.clone(), module.clone())) .fold( @@ -38,8 +37,8 @@ fn measure_execution_time() -> Result<()> { ); println!( - "lazy total: {:?}, eager init: {:?}, eager inst: {:?}", - lazy_total_time, eager_init_total, eager_inst_total + "lazy call: {:?}, eager init: {:?}, eager call: {:?}", + lazy_call_time, eager_init_total, eager_call_total ); Ok(()) @@ -48,9 +47,13 @@ fn measure_execution_time() -> Result<()> { fn lazy_thread_instantiate(engine: Engine, module: Module) -> Duration { thread::spawn(move || { let mut store = Store::new(&engine, ()); - let start = Instant::now(); - Instance::new(&mut store, &module, &[]).expect("instantiate"); - start.elapsed() + let inst = Instance::new(&mut store, &module, &[]).expect("instantiate"); + let f = inst.get_func(&mut store, "f").expect("get f"); + let f = f.typed::<(), (), _>(&store).expect("type f"); + + let call = Instant::now(); + f.call(&mut store, ()).expect("call f"); + call.elapsed() }) .join() .expect("thread joins") @@ -63,9 +66,13 @@ fn eager_thread_instantiate(engine: Engine, module: Module) -> (Duration, Durati let init_duration = init_start.elapsed(); let mut store = Store::new(&engine, ()); - let inst_start = Instant::now(); - Instance::new(&mut store, &module, &[]).expect("instantiate"); - (init_duration, inst_start.elapsed()) + let inst = Instance::new(&mut store, &module, &[]).expect("instantiate"); + let f = inst.get_func(&mut store, "f").expect("get f"); + let f = f.typed::<(), (), _>(&store).expect("type f"); + + let call = Instant::now(); + f.call(&mut store, ()).expect("call f"); + (init_duration, call.elapsed()) }) .join() .expect("thread joins") From 2a4c51b77dcc2036468f40f7b8855229daaba1b0 Mon Sep 17 00:00:00 2001 From: Pat Hickey Date: Fri, 4 Jun 2021 12:24:44 -0700 Subject: [PATCH 5/7] switch eager vs lazy instantiation to a criterion bench --- Cargo.toml | 4 ++ {tests/all => benches}/thread_eager_init.rs | 74 +++++++++++++-------- tests/all/main.rs | 1 - 3 files changed, 51 insertions(+), 28 deletions(-) rename {tests/all => benches}/thread_eager_init.rs (53%) diff --git a/Cargo.toml b/Cargo.toml index 9b1ce39b16..712b57bb09 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -120,3 +120,7 @@ debug = false # FIXME(#1813) [[bench]] name = "instantiation" harness = false + +[[bench]] +name = "thread_eager_init" +harness = false diff --git a/tests/all/thread_eager_init.rs b/benches/thread_eager_init.rs similarity index 53% rename from tests/all/thread_eager_init.rs rename to benches/thread_eager_init.rs index 63cee33e6d..9d35869cea 100644 --- a/tests/all/thread_eager_init.rs +++ b/benches/thread_eager_init.rs @@ -1,11 +1,47 @@ -use anyhow::Result; +use criterion::{criterion_group, criterion_main, Criterion}; use std::thread; use std::time::{Duration, Instant}; use wasmtime::*; -#[test] -fn measure_execution_time() -> Result<()> { - let iterations = 1000; +fn measure_execution_time(c: &mut Criterion) { + c.bench_function("lazy initialization at call", move |b| { + let (engine, module) = test_engine(); + b.iter_custom(move |iters| { + (0..iters) + .into_iter() + .map(|_| lazy_thread_instantiate(engine.clone(), module.clone())) + .sum() + }) + }); + + c.bench_function("eager initialization", move |b| { + let (engine, module) = test_engine(); + b.iter_custom(move |iters| { + (0..iters) + .into_iter() + .map(|_| { + let (init, _call) = eager_thread_instantiate(engine.clone(), module.clone()); + init + }) + .sum() + }) + }); + c.bench_function("call after eager initialization", move |b| { + let (engine, module) = test_engine(); + b.iter_custom(move |iters| { + (0..iters) + .into_iter() + .map(|_| { + let (_init, call) = eager_thread_instantiate(engine.clone(), module.clone()); + call + }) + .sum() + }) + }); +} + +fn test_engine() -> (Engine, Module) { + let pool_count = 1000; let mut config = Config::new(); config.allocation_strategy(InstanceAllocationStrategy::Pooling { @@ -15,33 +51,14 @@ fn measure_execution_time() -> Result<()> { ..Default::default() }, instance_limits: InstanceLimits { - count: iterations * 2, + count: pool_count, memory_reservation_size: 1, }, }); - let engine = Engine::new(&config)?; - let module = Module::new(&engine, r#"(module (memory 1) (func (export "f")))"#)?; - - let lazy_call_time: Duration = (0..iterations) - .into_iter() - .map(|_| lazy_thread_instantiate(engine.clone(), module.clone())) - .sum(); - - let (eager_init_total, eager_call_total): (Duration, Duration) = (0..iterations) - .into_iter() - .map(|_| eager_thread_instantiate(engine.clone(), module.clone())) - .fold( - (Duration::default(), Duration::default()), - |(s1, s2), (d1, d2)| (s1 + d1, s2 + d2), - ); - - println!( - "lazy call: {:?}, eager init: {:?}, eager call: {:?}", - lazy_call_time, eager_init_total, eager_call_total - ); - - Ok(()) + let engine = Engine::new(&config).unwrap(); + let module = Module::new(&engine, r#"(module (memory 1) (func (export "f")))"#).unwrap(); + (engine, module) } fn lazy_thread_instantiate(engine: Engine, module: Module) -> Duration { @@ -77,3 +94,6 @@ fn eager_thread_instantiate(engine: Engine, module: Module) -> (Duration, Durati .join() .expect("thread joins") } + +criterion_group!(benches, measure_execution_time); +criterion_main!(benches); diff --git a/tests/all/main.rs b/tests/all/main.rs index a1f3314d64..37bb68d7a6 100644 --- a/tests/all/main.rs +++ b/tests/all/main.rs @@ -24,7 +24,6 @@ mod pooling_allocator; mod stack_overflow; mod store; mod table; -mod thread_eager_init; mod traps; mod wast; From 035d541fd96cb411413df750f575c774ce8320a4 Mon Sep 17 00:00:00 2001 From: Pat Hickey Date: Fri, 4 Jun 2021 14:04:14 -0700 Subject: [PATCH 6/7] add docs --- benches/thread_eager_init.rs | 95 +++++++++++++++++++++--------------- 1 file changed, 55 insertions(+), 40 deletions(-) diff --git a/benches/thread_eager_init.rs b/benches/thread_eager_init.rs index 9d35869cea..02f3c65f5c 100644 --- a/benches/thread_eager_init.rs +++ b/benches/thread_eager_init.rs @@ -4,8 +4,12 @@ use std::time::{Duration, Instant}; use wasmtime::*; fn measure_execution_time(c: &mut Criterion) { + // Baseline performance: a single measurment covers both initializing + // thread local resources and executing the first call. + // + // The other two bench functions should sum to this duration. c.bench_function("lazy initialization at call", move |b| { - let (engine, module) = test_engine(); + let (engine, module) = test_setup(); b.iter_custom(move |iters| { (0..iters) .into_iter() @@ -14,8 +18,10 @@ fn measure_execution_time(c: &mut Criterion) { }) }); + // Using Engine::tls_eager_initialize: measure how long eager + // initialization takes on a new thread. c.bench_function("eager initialization", move |b| { - let (engine, module) = test_engine(); + let (engine, module) = test_setup(); b.iter_custom(move |iters| { (0..iters) .into_iter() @@ -26,8 +32,11 @@ fn measure_execution_time(c: &mut Criterion) { .sum() }) }); + + // Measure how long the first call takes on a thread after it has been + // eagerly initialized. c.bench_function("call after eager initialization", move |b| { - let (engine, module) = test_engine(); + let (engine, module) = test_setup(); b.iter_custom(move |iters| { (0..iters) .into_iter() @@ -40,8 +49,47 @@ fn measure_execution_time(c: &mut Criterion) { }); } -fn test_engine() -> (Engine, Module) { - let pool_count = 1000; +/// Creating a store and measuring the time to perform a call is the same behavior +/// in both setups. +fn duration_of_call(engine: &Engine, module: &Module) -> Duration { + let mut store = Store::new(engine, ()); + let inst = Instance::new(&mut store, module, &[]).expect("instantiate"); + let f = inst.get_func(&mut store, "f").expect("get f"); + let f = f.typed::<(), (), _>(&store).expect("type f"); + + let call = Instant::now(); + f.call(&mut store, ()).expect("call f"); + call.elapsed() +} + +/// When wasmtime first runs a function on a thread, it needs to initialize +/// some thread-local resources and install signal handlers. This benchmark +/// spawns a new thread, and returns the duration it took to execute the first +/// function call made on that thread. +fn lazy_thread_instantiate(engine: Engine, module: Module) -> Duration { + thread::spawn(move || duration_of_call(&engine, &module)) + .join() + .expect("thread joins") +} +/// This benchmark spawns a new thread, and records the duration to eagerly +/// initializes the thread local resources. It then creates a store and +/// instance, and records the duration it took to execute the first function +/// call. +fn eager_thread_instantiate(engine: Engine, module: Module) -> (Duration, Duration) { + thread::spawn(move || { + let init_start = Instant::now(); + Engine::tls_eager_initialize().expect("eager init"); + let init_duration = init_start.elapsed(); + + (init_duration, duration_of_call(&engine, &module)) + }) + .join() + .expect("thread joins") +} + +fn test_setup() -> (Engine, Module) { + // We only expect to create one Instance at a time, with a single memory. + let pool_count = 10; let mut config = Config::new(); config.allocation_strategy(InstanceAllocationStrategy::Pooling { @@ -55,45 +103,12 @@ fn test_engine() -> (Engine, Module) { memory_reservation_size: 1, }, }); - let engine = Engine::new(&config).unwrap(); + + // The module has a memory (shouldn't matter) and a single function which is a no-op. let module = Module::new(&engine, r#"(module (memory 1) (func (export "f")))"#).unwrap(); (engine, module) } -fn lazy_thread_instantiate(engine: Engine, module: Module) -> Duration { - thread::spawn(move || { - let mut store = Store::new(&engine, ()); - let inst = Instance::new(&mut store, &module, &[]).expect("instantiate"); - let f = inst.get_func(&mut store, "f").expect("get f"); - let f = f.typed::<(), (), _>(&store).expect("type f"); - - let call = Instant::now(); - f.call(&mut store, ()).expect("call f"); - call.elapsed() - }) - .join() - .expect("thread joins") -} - -fn eager_thread_instantiate(engine: Engine, module: Module) -> (Duration, Duration) { - thread::spawn(move || { - let init_start = Instant::now(); - Engine::tls_eager_initialize().expect("eager init"); - let init_duration = init_start.elapsed(); - - let mut store = Store::new(&engine, ()); - let inst = Instance::new(&mut store, &module, &[]).expect("instantiate"); - let f = inst.get_func(&mut store, "f").expect("get f"); - let f = f.typed::<(), (), _>(&store).expect("type f"); - - let call = Instant::now(); - f.call(&mut store, ()).expect("call f"); - (init_duration, call.elapsed()) - }) - .join() - .expect("thread joins") -} - criterion_group!(benches, measure_execution_time); criterion_main!(benches); From 613309b76caaa0b7a74b6178b86f67c0dd51d02b Mon Sep 17 00:00:00 2001 From: Pat Hickey Date: Fri, 4 Jun 2021 14:22:08 -0700 Subject: [PATCH 7/7] missing docs --- crates/wasmtime/src/engine.rs | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/crates/wasmtime/src/engine.rs b/crates/wasmtime/src/engine.rs index c367e9425a..f0c615fb2d 100644 --- a/crates/wasmtime/src/engine.rs +++ b/crates/wasmtime/src/engine.rs @@ -63,8 +63,23 @@ impl Engine { }) } - /// Eagerly initialize thread-local functionality shared by all [`Engine`]s. This - /// will be performed lazily by the runtime if users do not perform it eagerly. + /// Eagerly initialize thread-local functionality shared by all [`Engine`]s. + /// + /// Wasmtime's implementation on some platforms may involve per-thread + /// setup that needs to happen whenever WebAssembly is invoked. This setup + /// can take on the order of a few hundred microseconds, whereas the + /// overhead of calling WebAssembly is otherwise on the order of a few + /// nanoseconds. This setup cost is paid once per-OS-thread. If your + /// application is sensitive to the latencies of WebAssembly function + /// calls, even those that happen first on a thread, then this function + /// can be used to improve the consistency of each call into WebAssembly + /// by explicitly frontloading the cost of the one-time setup per-thread. + /// + /// Note that this function is not required to be called in any embedding. + /// Wasmtime will automatically initialize thread-local-state as necessary + /// on calls into WebAssembly. This is provided for use cases where the + /// latency of WebAssembly calls are extra-important, which is not + /// necessarily true of all embeddings. pub fn tls_eager_initialize() -> Result<(), Trap> { wasmtime_runtime::tls_eager_initialize().map_err(Trap::from_runtime) }