Merge pull request #2946 from bytecodealliance/pch/eager_per_thread_init

expose eager thread-local resource initialization on Engine
2021-06-04 15:42:08 -07:00
parent e516f0339a 613309b76c
commit 38ab7a03dd
5 changed files with 160 additions and 4 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -120,3 +120,7 @@ debug = false # FIXME(#1813)
 [[bench]]
 name = "instantiation"
 harness = false
 [[bench]]
 name = "thread_eager_init"
 harness = false
--- a/benches/thread_eager_init.rs
+++ b/benches/thread_eager_init.rs
@@ -0,0 +1,114 @@
 use criterion::{criterion_group, criterion_main, Criterion};
 use std::thread;
 use std::time::{Duration, Instant};
 use wasmtime::*;
 /// Registers the three benchmarks comparing lazy and eager per-thread
 /// initialization.
 ///
 /// Each benchmark builds its own engine and module via `test_setup` and uses
 /// `iter_custom`, so the reported time is the sum of the durations measured
 /// inside the freshly spawned threads rather than the wall-clock time of the
 /// whole iteration (which would also include thread spawn/join overhead).
 fn measure_execution_time(c: &mut Criterion) {
    // Baseline performance: a single measurement covers both initializing
    // thread local resources and executing the first call.
    //
    // The other two bench functions should sum to this duration.
    c.bench_function("lazy initialization at call", move |b| {
        let (engine, module) = test_setup();
        b.iter_custom(move |iters| {
            // A range is already an iterator; no `.into_iter()` needed.
            (0..iters)
                .map(|_| lazy_thread_instantiate(engine.clone(), module.clone()))
                .sum()
        })
    });
    // Using Engine::tls_eager_initialize: measure how long eager
    // initialization takes on a new thread.
    c.bench_function("eager initialization", move |b| {
        let (engine, module) = test_setup();
        b.iter_custom(move |iters| {
            (0..iters)
                .map(|_| {
                    // Only the initialization half of the pair is reported.
                    let (init, _call) = eager_thread_instantiate(engine.clone(), module.clone());
                    init
                })
                .sum()
        })
    });
    // Measure how long the first call takes on a thread after it has been
    // eagerly initialized.
    c.bench_function("call after eager initialization", move |b| {
        let (engine, module) = test_setup();
        b.iter_custom(move |iters| {
            (0..iters)
                .map(|_| {
                    // Only the call half of the pair is reported.
                    let (_init, call) = eager_thread_instantiate(engine.clone(), module.clone());
                    call
                })
                .sum()
        })
    });
 }
 /// Instantiates `module` in a fresh store and times a single call to its
 /// exported function `f`.
 ///
 /// Only the call itself is timed: creating the store, instantiating the
 /// module and resolving the typed export all happen before the clock starts.
 /// Both the lazy and the eager benchmark paths share this routine.
 fn duration_of_call(engine: &Engine, module: &Module) -> Duration {
    let mut store = Store::new(engine, ());
    let instance = Instance::new(&mut store, module, &[]).expect("instantiate");
    let typed_f = instance
        .get_func(&mut store, "f")
        .expect("get f")
        .typed::<(), (), _>(&store)
        .expect("type f");
    let started = Instant::now();
    typed_f.call(&mut store, ()).expect("call f");
    started.elapsed()
 }
 /// Spawns a fresh thread and returns how long its first function call took.
 ///
 /// The first time wasmtime runs a function on a thread it has to initialize
 /// some thread-local resources and install signal handlers, so the duration
 /// reported here includes that one-time cost.
 fn lazy_thread_instantiate(engine: Engine, module: Module) -> Duration {
    let worker = thread::spawn(move || duration_of_call(&engine, &module));
    worker.join().expect("thread joins")
 }
 /// Spawns a fresh thread, eagerly initializes its thread-local resources, and
 /// then performs the first function call on it.
 ///
 /// Returns `(init, call)`: the time spent in `Engine::tls_eager_initialize`,
 /// followed by the time the subsequent first call took (as measured by
 /// `duration_of_call`, which creates its own store and instance).
 fn eager_thread_instantiate(engine: Engine, module: Module) -> (Duration, Duration) {
    let worker = thread::spawn(move || {
        let init_timer = Instant::now();
        Engine::tls_eager_initialize().expect("eager init");
        let init_elapsed = init_timer.elapsed();
        let call_elapsed = duration_of_call(&engine, &module);
        (init_elapsed, call_elapsed)
    });
    worker.join().expect("thread joins")
 }
 /// Builds the engine and module used by a benchmark.
 ///
 /// The engine is configured with the pooling instance allocator
 /// (`NextAvailable` strategy, 10 instance slots, one memory page per module).
 /// The module declares a single memory and exports a no-op function `f`.
 fn test_setup() -> (Engine, Module) {
    // The module has a memory (shouldn't matter) and a single function which is a no-op.
    const WAT: &str = r#"(module (memory 1) (func (export "f")))"#;

    let mut config = Config::new();
    // We only expect to create one Instance at a time, with a single memory.
    let module_limits = ModuleLimits {
        memory_pages: 1,
        ..Default::default()
    };
    let instance_limits = InstanceLimits {
        count: 10,
        memory_reservation_size: 1,
    };
    config.allocation_strategy(InstanceAllocationStrategy::Pooling {
        strategy: PoolingAllocationStrategy::NextAvailable,
        module_limits,
        instance_limits,
    });

    let engine = Engine::new(&config).unwrap();
    let module = Module::new(&engine, WAT).unwrap();
    (engine, module)
 }
 criterion_group!(benches, measure_execution_time);
 criterion_main!(benches);
--- a/crates/runtime/src/lib.rs
+++ b/crates/runtime/src/lib.rs
@@ -49,8 +49,8 @@ pub use crate::memory::{Memory, RuntimeLinearMemory, RuntimeMemoryCreator};
 pub use crate::mmap::Mmap;
 pub use crate::table::{Table, TableElement};
 pub use crate::traphandlers::{
-    catch_traps, init_traps, raise_lib_trap, raise_user_trap, resume_panic, SignalHandler,
+    catch_traps, init_traps, raise_lib_trap, raise_user_trap, resume_panic, tls_eager_initialize,
-    TlsRestore, Trap,
+    SignalHandler, TlsRestore, Trap,
 };
 pub use crate::vmcontext::{
    VMCallerCheckedAnyfunc, VMContext, VMFunctionBody, VMFunctionImport, VMGlobalDefinition,
--- a/crates/runtime/src/traphandlers.rs
+++ b/crates/runtime/src/traphandlers.rs
@@ -12,7 +12,7 @@ use std::sync::atomic::Ordering::SeqCst;
 use std::sync::Once;
 use wasmtime_environ::ir;
-pub use self::tls::TlsRestore;
+pub use self::tls::{tls_eager_initialize, TlsRestore};
 extern "C" {
    #[allow(improper_ctypes)]
@@ -386,12 +386,29 @@ mod tls {
            })
        }
        /// Eagerly initialize thread-local runtime functionality. This will be performed
        /// lazily by the runtime if users do not perform it eagerly.
        ///
        /// Does nothing when the flag stored in `PTR` says initialization has
        /// already happened. Otherwise runs `sys::lazy_per_thread_init` and only
        /// then sets the flag, so a failed attempt leaves the flag unset and
        /// the error is returned to the caller.
        #[inline(never)]
        pub fn initialize() -> Result<(), Trap> {
            PTR.with(|slot| {
                let (state, already_initialized) = slot.get();
                if !already_initialized {
                    super::super::sys::lazy_per_thread_init()?;
                    // Keep the stored pointer untouched; only flip the flag.
                    slot.set((state, true));
                }
                Ok(())
            })
        }
        /// Returns the pointer component of the pair stored in `PTR`,
        /// ignoring the accompanying "initialized" flag.
        #[inline(never)] // see module docs for why this is here
        pub fn get() -> Ptr {
            PTR.with(|p| p.get().0)
        }
    }
    pub use raw::initialize as tls_eager_initialize;
    /// Opaque state used to help control TLS state across stack switches for
    /// async support.
    ///
    /// Wraps a single `raw::Ptr`; the field is private, so callers cannot
    /// inspect or construct the value directly.
    pub struct TlsRestore(raw::Ptr);
--- a/crates/wasmtime/src/engine.rs
+++ b/crates/wasmtime/src/engine.rs
@@ -1,5 +1,5 @@
 use crate::signatures::SignatureRegistry;
-use crate::Config;
+use crate::{Config, Trap};
 use anyhow::Result;
 use std::sync::Arc;
 #[cfg(feature = "cache")]
@@ -63,6 +63,27 @@ impl Engine {
        })
    }
    /// Eagerly initialize thread-local functionality shared by all [`Engine`]s.
    ///
    /// Wasmtime's implementation on some platforms may involve per-thread
    /// setup that needs to happen whenever WebAssembly is invoked. This setup
    /// can take on the order of a few hundred microseconds, whereas the
    /// overhead of calling WebAssembly is otherwise on the order of a few
    /// nanoseconds. This setup cost is paid once per-OS-thread. If your
    /// application is sensitive to the latencies of WebAssembly function
    /// calls, even those that happen first on a thread, then this function
    /// can be used to improve the consistency of each call into WebAssembly
    /// by explicitly frontloading the cost of the one-time setup per-thread.
    ///
    /// Note that this function is not required to be called in any embedding.
    /// Wasmtime will automatically initialize thread-local state as necessary
    /// on calls into WebAssembly. This is provided for use cases where the
    /// latency of WebAssembly calls is extra-important, which is not
    /// necessarily true of all embeddings.
    ///
    /// # Errors
    ///
    /// Returns a [`Trap`] if the runtime's thread-local initialization
    /// reports an error; the runtime error is converted with
    /// `Trap::from_runtime`.
    pub fn tls_eager_initialize() -> Result<(), Trap> {
        // Pure delegation: the runtime crate does the work, this layer only
        // translates the error type.
        wasmtime_runtime::tls_eager_initialize().map_err(Trap::from_runtime)
    }
    /// Returns the configuration settings that this engine is using.
    #[inline]
    pub fn config(&self) -> &Config {