diff --git a/Cargo.toml b/Cargo.toml
index 9b1ce39b16..712b57bb09 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -120,3 +120,7 @@ debug = false # FIXME(#1813)
 [[bench]]
 name = "instantiation"
 harness = false
+
+[[bench]]
+name = "thread_eager_init"
+harness = false
diff --git a/benches/thread_eager_init.rs b/benches/thread_eager_init.rs
new file mode 100644
index 0000000000..02f3c65f5c
--- /dev/null
+++ b/benches/thread_eager_init.rs
@@ -0,0 +1,114 @@
+use criterion::{criterion_group, criterion_main, Criterion};
+use std::thread;
+use std::time::{Duration, Instant};
+use wasmtime::*;
+
+/// Registers the lazy, eager-init, and call-after-eager-init first-call benchmarks.
+fn measure_execution_time(c: &mut Criterion) {
+    // Baseline performance: a single measurement covers both initializing
+    // thread local resources and executing the first call.
+    //
+    // The other two bench functions should sum to this duration.
+    c.bench_function("lazy initialization at call", move |b| {
+        let (engine, module) = test_setup();
+        b.iter_custom(move |iters| {
+            (0..iters)
+                .map(|_| lazy_thread_instantiate(engine.clone(), module.clone()))
+                .sum()
+        })
+    });
+
+    // Using Engine::tls_eager_initialize: measure how long eager
+    // initialization takes on a new thread.
+    c.bench_function("eager initialization", move |b| {
+        let (engine, module) = test_setup();
+        b.iter_custom(move |iters| {
+            // Keep only the eager-initialization half of each measurement.
+            (0..iters)
+                .map(|_| {
+                    let (init, _call) = eager_thread_instantiate(engine.clone(), module.clone());
+                    init
+                })
+                .sum()
+        })
+    });
+
+    // Measure how long the first call takes on a thread after it has been
+    // eagerly initialized.
+    c.bench_function("call after eager initialization", move |b| {
+        let (engine, module) = test_setup();
+        b.iter_custom(move |iters| {
+            // Keep only the first-call half of each measurement.
+            (0..iters)
+                .map(|_| {
+                    let (_init, call) = eager_thread_instantiate(engine.clone(), module.clone());
+                    call
+                })
+                .sum()
+        })
+    });
+}
+
+/// Instantiates `module` in a fresh store and times one call to its exported
+/// function `f`; only the call itself falls inside the timed region.
+fn duration_of_call(engine: &Engine, module: &Module) -> Duration {
+    let mut store = Store::new(engine, ());
+    let instance = Instance::new(&mut store, module, &[]).expect("instantiate");
+    let func = instance.get_func(&mut store, "f").expect("get f");
+    let typed = func.typed::<(), (), _>(&store).expect("type f");
+
+    let started = Instant::now();
+    typed.call(&mut store, ()).expect("call f");
+    started.elapsed()
+}
+
+/// The first wasm call on a thread also pays for wasmtime's lazy per-thread
+/// setup (thread-local resources and signal handlers). This benchmark spawns
+/// a new thread and returns how long the first function call made on that
+/// thread took, with that setup cost included.
+fn lazy_thread_instantiate(engine: Engine, module: Module) -> Duration {
+    let worker = thread::spawn(move || duration_of_call(&engine, &module));
+
+    worker.join().expect("thread joins")
+}
+/// This benchmark spawns a new thread and records how long it takes to
+/// eagerly initialize the thread-local resources. It then creates a store
+/// and instance, and records how long the first function call takes.
+/// Returns `(initialization duration, first-call duration)`.
+fn eager_thread_instantiate(engine: Engine, module: Module) -> (Duration, Duration) {
+    let worker = thread::spawn(move || {
+        let started = Instant::now();
+        Engine::tls_eager_initialize().expect("eager init");
+        let init_elapsed = started.elapsed();
+        let call_elapsed = duration_of_call(&engine, &module);
+
+        (init_elapsed, call_elapsed)
+    });
+    worker.join().expect("thread joins")
+}
+
+fn test_setup() -> (Engine, Module) {
+    // Ten pooled slots; only one instance (with one memory) is expected to be live at a time.
+    let instance_limits = InstanceLimits {
+        count: 10,
+        memory_reservation_size: 1,
+    };
+    let module_limits = ModuleLimits {
+        memory_pages: 1,
+        ..Default::default()
+    };
+    let mut config = Config::new();
+    config.allocation_strategy(InstanceAllocationStrategy::Pooling {
+        strategy: PoolingAllocationStrategy::NextAvailable,
+        module_limits,
+        instance_limits,
+    });
+    let engine = Engine::new(&config).unwrap();
+
+    // The module has a memory (shouldn't matter) and a single function which is a no-op.
+    let module = Module::new(&engine, r#"(module (memory 1) (func (export "f")))"#).unwrap();
+    (engine, module)
+}
+
+criterion_group!(benches, measure_execution_time);
+criterion_main!(benches);
diff --git a/crates/runtime/src/lib.rs b/crates/runtime/src/lib.rs
index 6e5d4f6fb3..2c7aa1f584 100644
--- a/crates/runtime/src/lib.rs
+++ b/crates/runtime/src/lib.rs
@@ -49,8 +49,8 @@ pub use crate::memory::{Memory, RuntimeLinearMemory, RuntimeMemoryCreator};
 pub use crate::mmap::Mmap;
 pub use crate::table::{Table, TableElement};
 pub use crate::traphandlers::{
-    catch_traps, init_traps, raise_lib_trap, raise_user_trap, resume_panic, SignalHandler,
-    TlsRestore, Trap,
+    catch_traps, init_traps, raise_lib_trap, raise_user_trap, resume_panic, tls_eager_initialize,
+    SignalHandler, TlsRestore, Trap,
 };
 pub use crate::vmcontext::{
     VMCallerCheckedAnyfunc, VMContext, VMFunctionBody, VMFunctionImport, VMGlobalDefinition,
diff --git a/crates/runtime/src/traphandlers.rs b/crates/runtime/src/traphandlers.rs
index 7332ae0a7e..ba9de7c3d4 100644
--- a/crates/runtime/src/traphandlers.rs
+++ b/crates/runtime/src/traphandlers.rs
@@ -12,7 +12,7 @@ use std::sync::atomic::Ordering::SeqCst;
 use std::sync::Once;
 use wasmtime_environ::ir;
 
-pub use self::tls::TlsRestore;
+pub use self::tls::{tls_eager_initialize, TlsRestore};
 
 extern "C" {
     #[allow(improper_ctypes)]
@@ -386,12 +386,29 @@ mod tls {
             })
         }
 
+        /// Eagerly initialize thread-local runtime functionality. If this is never
+        /// called, the runtime performs the same initialization lazily.
+        #[inline(never)]
+        pub fn initialize() -> Result<(), Trap> {
+            PTR.with(|cell| {
+                let (state, initialized) = cell.get();
+                if !initialized {
+                    // The flag is only set on success; an error returns before `set`.
+                    super::super::sys::lazy_per_thread_init()?;
+                    cell.set((state, true));
+                }
+                Ok(())
+            })
+        }
+
         #[inline(never)] // see module docs for why this is here
         pub fn get() -> Ptr {
             PTR.with(|p| p.get().0)
         }
     }
 
+    pub use raw::initialize as tls_eager_initialize;
+
     /// Opaque state used to help control TLS state across stack switches for
     /// async support.
     pub struct TlsRestore(raw::Ptr);
diff --git a/crates/wasmtime/src/engine.rs b/crates/wasmtime/src/engine.rs
index bce82a8a9f..f0c615fb2d 100644
--- a/crates/wasmtime/src/engine.rs
+++ b/crates/wasmtime/src/engine.rs
@@ -1,5 +1,5 @@
 use crate::signatures::SignatureRegistry;
-use crate::Config;
+use crate::{Config, Trap};
 use anyhow::Result;
 use std::sync::Arc;
 #[cfg(feature = "cache")]
@@ -63,6 +63,27 @@ impl Engine {
         })
     }
 
+    /// Eagerly initialize thread-local functionality shared by all [`Engine`]s.
+    ///
+    /// Wasmtime's implementation on some platforms may involve per-thread
+    /// setup that needs to happen whenever WebAssembly is invoked. This setup
+    /// can take on the order of a few hundred microseconds, whereas the
+    /// overhead of calling WebAssembly is otherwise on the order of a few
+    /// nanoseconds. The setup cost is paid once per OS thread, so
+    /// latency-sensitive applications can call this function to front-load
+    /// it instead of paying it on a thread's first call into WebAssembly.
+    ///
+    /// Calling this function is optional: Wasmtime automatically initializes
+    /// thread-local state as necessary on calls into WebAssembly.
+    ///
+    /// # Errors
+    ///
+    /// Returns the runtime's initialization failure, converted into a
+    /// [`Trap`] with `Trap::from_runtime`.
+    pub fn tls_eager_initialize() -> Result<(), Trap> {
+        wasmtime_runtime::tls_eager_initialize().map_err(Trap::from_runtime)
+    }
+
     /// Returns the configuration settings that this engine is using.
     #[inline]
     pub fn config(&self) -> &Config {