More optimizations for calling into WebAssembly (#2759)

* Combine stack-based cleanups for faster wasm calls This commit is an extension of #2757 where the goal is to optimize entry into WebAssembly. Currently wasmtime has two stack-based cleanups when entering wasm, one for the externref activation table and another for stack limits getting reset. This commit fuses these two cleanups together into one and moves some code around which enables fewer captures for fewer closures and such to speed up calls into wasm a bit more. Overall this drops the execution time from 88ns to 80ns locally for me. This also updates the atomic orderings when updating the stack limit from `SeqCst` to `Relaxed`. While `SeqCst` is a reasonable starting point, it should be safe to use `Relaxed` here since we're not using the atomics to actually protect any memory, it's simply receiving signals from other threads. * Determine whether a pc is wasm via a global map The macOS implementation of traps recently changed to using mach ports for handlers instead of signal handlers. This means that a previously relied-upon invariant, each thread fixes its own trap, was broken. The macOS implementation worked around this, however, by maintaining a global map from thread id to thread-local information. This global map is quite slow though. It involves taking a lock and updating a hash map on all calls into WebAssembly. In my local testing this accounts for >70% of the overhead of calling into WebAssembly on macOS. Naturally it'd be great to remove this! This commit fixes this issue and removes the global lock/map that is updated on all calls into WebAssembly. The fix is to maintain a global map of wasm modules and their trap addresses in the `wasmtime` crate. Doing so is relatively simple since we're already tracking this information at the `Store` level. Once we've got a global map then the macOS implementation can use this from a foreign thread and everything works out. 
Locally this brings the overhead, on macOS specifically, of calling into wasm from 80ns to ~20ns. * Fix compiles * Review comments
2021-03-24 11:41:33 -05:00
parent 6b2da3d299
commit d4b54ee0a8
13 changed files with 324 additions and 309 deletions
--- a/crates/runtime/src/traphandlers/macos.rs
+++ b/crates/runtime/src/traphandlers/macos.rs
@@ -21,11 +21,10 @@
 //! port. This means that, unlike signals, threads can't fix their own traps.
 //! Instead a helper thread is spun up to service exception messages. This is
 //! also in conflict with Wasmtime's exception handling currently which is to
-//! use a thread-local to figure out whether a pc is a wasm pc or not on a
-//! trap. To work around this we have a global map from mach thread numbers to
-//! the state for that thread, updated on entry/exit from wasm. This is likely
-//! slower than signals which do less updating on wasm entry/exit, but hopefully
-//! by the time this is a problem we can figure out a better solution.
+//! use a thread-local to store information about how to unwind. Additionally
+//! this requires that the check of whether a pc is a wasm trap or not is a
+//! global check rather than a per-thread check. This necessitates the existence
+//! of `GlobalFrameInfo` in the `wasmtime` crate.
 //!
 //! Otherwise this file heavily uses the `mach` Rust crate for type and
 //! function declarations. Many bits and pieces are copied or translated from
@@ -33,7 +32,7 @@

 #![allow(non_snake_case)]

-use crate::traphandlers::{tls, CallThreadState, Trap, Unwind};
+use crate::traphandlers::{tls, Trap, Unwind};
 use mach::exception_types::*;
 use mach::kern_return::*;
 use mach::mach_init::*;
@@ -43,10 +42,7 @@ use mach::port::*;
 use mach::thread_act::*;
 use mach::traps::*;
 use std::cell::Cell;
-use std::collections::HashMap;
 use std::mem;
-use std::ptr;
-use std::sync::Mutex;
 use std::thread;

 /// Other `mach` declarations awaiting https://github.com/fitzgen/mach/pull/64 to be merged.
@@ -154,20 +150,10 @@ pub enum Void {}
 /// Wasmtime on macOS.
 pub type SignalHandler<'a> = dyn Fn(Void) -> bool + 'a;

-/// Process-global map for mapping thread names to their state to figure out
-/// whether a thread's trap is related to wasm or not. This is extremely
-/// unsafe and caution must be used when accessing. Be sure to read
-/// documentation below on this.
-static mut MAP: *mut Mutex<HashMap<mach_port_name_t, *const CallThreadState<'static>>> =
-    ptr::null_mut();
-
 /// Process-global port that we use to route thread-level exceptions to.
 static mut WASMTIME_PORT: mach_port_name_t = MACH_PORT_NULL;

 pub unsafe fn platform_init() {
-    // Initialize the process global map
-    MAP = Box::into_raw(Default::default());
-
    // Allocate our WASMTIME_PORT and make sure that it can be sent to so we
    // can receive exceptions.
    let me = mach_task_self();
@@ -289,7 +275,7 @@ unsafe fn handle_exception(request: &mut ExceptionRequest) -> bool {

            let get_pc = |state: &ThreadState| state.__rip as *const u8;

-            let resume = |state: &mut ThreadState, pc: usize, jmp_buf: usize| {
+            let resume = |state: &mut ThreadState, pc: usize| {
                // The x86_64 ABI requires a 16-byte stack alignment for
                // functions, so typically we'll be 16-byte aligned. In this
                // case we simulate a `call` instruction by decrementing the
@@ -315,7 +301,6 @@ unsafe fn handle_exception(request: &mut ExceptionRequest) -> bool {
                }
                state.__rip = unwind as u64;
                state.__rdi = pc as u64;
-                state.__rsi = jmp_buf as u64;
            };
            let mut thread_state = ThreadState::new();
        } else if #[cfg(target_arch = "aarch64")] {
@@ -325,18 +310,17 @@ unsafe fn handle_exception(request: &mut ExceptionRequest) -> bool {

            let get_pc = |state: &ThreadState| state.__pc as *const u8;

-            let resume = |state: &mut ThreadState, pc: usize, jmp_buf: usize| {
+            let resume = |state: &mut ThreadState, pc: usize| {
                // Clobber LR with the faulting PC, so unwinding resumes at the
                // faulting instruction. The previous value of LR has been saved
                // by the callee (in Cranelift generated code), so no need to
                // stash it.
                state.__lr = pc as u64;

-                // Fill in the 2 arguments to unwind here, and set PC to it, so
+                // Fill in the argument to unwind here, and set PC to it, so
                // it looks like a call to unwind.
-                state.__pc = unwind as u64;
                state.__x[0] = pc as u64;
-                state.__x[1] = jmp_buf as u64;
+                state.__pc = unwind as u64;
            };
            let mut thread_state = mem::zeroed::<ThreadState>();
        } else {
@@ -372,19 +356,7 @@ unsafe fn handle_exception(request: &mut ExceptionRequest) -> bool {
    // pointer value and if `MAP` changes happen after we read our entry that's
    // ok since they won't invalidate our entry.
    let pc = get_pc(&thread_state);
-    let state = (*MAP)
-        .lock()
-        .unwrap_or_else(|e| e.into_inner())
-        .get(&origin_thread)
-        .copied();
-    let jmp_buf = match state {
-        Some(state) => (*state).jmp_buf_if_trap(pc, |_| false),
-        None => ptr::null(),
-    };
-    if jmp_buf.is_null() {
-        return false;
-    }
-    if jmp_buf as usize == 1 {
+    if !super::IS_WASM_PC(pc as usize) {
        return false;
    }

@@ -392,7 +364,7 @@ unsafe fn handle_exception(request: &mut ExceptionRequest) -> bool {
    // force the thread itself to trap. The thread's register state is
    // configured to resume in the `unwind` function below, we update the
    // thread's register state, and then we're off to the races.
-    resume(&mut thread_state, pc as usize, jmp_buf as usize);
+    resume(&mut thread_state, pc as usize);
    let kret = thread_set_state(
        origin_thread,
        thread_state_flavor,
@@ -409,13 +381,13 @@ unsafe fn handle_exception(request: &mut ExceptionRequest) -> bool {
 /// a native backtrace once we've switched back to the thread itself. After
 /// the backtrace is captured we can do the usual `longjmp` back to the source
 /// of the wasm code.
-unsafe extern "C" fn unwind(wasm_pc: *const u8, jmp_buf: *const u8) -> ! {
-    tls::with(|state| {
-        if let Some(state) = state {
-            state.capture_backtrace(wasm_pc);
-        }
+unsafe extern "C" fn unwind(wasm_pc: *const u8) -> ! {
+    let jmp_buf = tls::with(|state| {
+        let state = state.unwrap();
+        state.capture_backtrace(wasm_pc);
+        state.jmp_buf.get()
    });
-
+    debug_assert!(!jmp_buf.is_null());
    Unwind(jmp_buf);
 }

@@ -474,23 +446,3 @@ pub fn lazy_per_thread_init() -> Result<(), Trap> {
    });
    Ok(())
 }
-
-/// This hook is invoked whenever TLS state for the current thread is updated
-/// to the `ptr` specified.
-///
-/// The purpose for hooking this on macOS is we register in a process-global map
-/// that our mach thread's state is `ptr` at this time. This allows the
-/// exception handling thread to lookup in this map later if our thread
-/// generates an exception.
-///
-/// Note that in general this is quite unsafe since we're moving non-Send state
-/// (`ptr`) which is also only valid for a short portion of the program (it
-/// lives on the stack) into a global portion of the program. This needs to be
-/// kept tightly in sync with `handle_exception` above where it's accessed in a
-/// very limited fashion.
-pub fn register_tls(ptr: *const CallThreadState<'static>) {
-    unsafe {
-        let me = MY_PORT.with(|p| p.0);
-        (*MAP).lock().unwrap().insert(me, ptr);
-    }
-}