More optimizations for calling into WebAssembly (#2759)
* Combine stack-based cleanups for faster wasm calls This commit is an extension of #2757 where the goal is to optimize entry into WebAssembly. Currently wasmtime has two stack-based cleanups when entering wasm, one for the externref activation table and another for resetting the stack limit. This commit fuses these two cleanups into one and moves some code around, which lets fewer closures capture fewer variables and speeds up calls into wasm a bit more. Overall this drops the execution time from 88ns to 80ns locally for me. This also changes the atomic ordering used when updating the stack limit from `SeqCst` to `Relaxed`. While `SeqCst` is a reasonable starting point, `Relaxed` should be safe here since we're not using the atomics to actually protect any memory; they simply receive signals from other threads. * Determine whether a pc is wasm via a global map The macOS implementation of traps recently changed to using mach ports for handlers instead of signal handlers. This means that a previously relied-upon invariant, that each thread fixes its own trap, was broken. The macOS implementation worked around this by maintaining a global map from thread id to thread-local information. This global map is quite slow though. It involves taking a lock and updating a hash map on all calls into WebAssembly. In my local testing this accounts for >70% of the overhead of calling into WebAssembly on macOS. Naturally it'd be great to remove this! This commit fixes this issue and removes the global lock/map that is updated on all calls into WebAssembly. The fix is to maintain a global map of wasm modules and their trap addresses in the `wasmtime` crate. Doing so is relatively simple since we're already tracking this information at the `Store` level. Once we've got a global map then the macOS implementation can use this from a foreign thread and everything works out. 
Locally this brings the overhead, on macOS specifically, of calling into wasm from 80ns to ~20ns. * Fix compiles * Review comments
This commit is contained in:
@@ -35,6 +35,8 @@ serde = { version = "1.0.94", features = ["derive"] }
|
||||
bincode = "1.2.1"
|
||||
indexmap = "1.6"
|
||||
paste = "1.0.3"
|
||||
psm = "0.1.11"
|
||||
lazy_static = "1.4"
|
||||
|
||||
[target.'cfg(target_os = "windows")'.dependencies]
|
||||
winapi = "0.3.7"
|
||||
|
||||
@@ -1,12 +1,22 @@
|
||||
use std::cmp;
|
||||
use std::collections::BTreeMap;
|
||||
use std::sync::Arc;
|
||||
use std::sync::Mutex;
|
||||
use wasmtime_environ::entity::EntityRef;
|
||||
use wasmtime_environ::ir;
|
||||
use wasmtime_environ::wasm::FuncIndex;
|
||||
use wasmtime_environ::{FunctionAddressMap, Module, TrapInformation};
|
||||
use wasmtime_jit::{CompiledModule, SymbolizeContext};
|
||||
|
||||
/// This is a structure that lives within a `Store` and retains information
|
||||
/// about all wasm code registered with the `Store` (e.g. modules that have
|
||||
/// been instantiated into a store).
|
||||
///
|
||||
/// "frame information" here refers to things like determining whether a
|
||||
/// program counter is a wasm program counter, and additionally mapping program
|
||||
/// counters to wasm filenames, modules, line numbers, etc. This store of
|
||||
/// information lives as long as a `Store` lives since modules are never
|
||||
/// unloaded today.
|
||||
#[derive(Default)]
|
||||
pub struct StoreFrameInfo {
|
||||
/// An internal map that keeps track of backtrace frame information for
|
||||
@@ -21,14 +31,18 @@ pub struct StoreFrameInfo {
|
||||
ranges: BTreeMap<usize, ModuleFrameInfo>,
|
||||
}
|
||||
|
||||
/// This is a listing of information for each module registered with a store
|
||||
/// which lives in `StoreFrameInfo`.
|
||||
struct ModuleFrameInfo {
|
||||
start: usize,
|
||||
functions: BTreeMap<usize, FunctionInfo>,
|
||||
functions: Arc<BTreeMap<usize, FunctionInfo>>,
|
||||
module: Arc<Module>,
|
||||
symbolize: Option<SymbolizeContext>,
|
||||
has_unparsed_debuginfo: bool,
|
||||
}
|
||||
|
||||
/// Information about a function, specifically information about individual
|
||||
/// traps and such.
|
||||
struct FunctionInfo {
|
||||
start: usize,
|
||||
index: FuncIndex,
|
||||
@@ -45,26 +59,7 @@ impl StoreFrameInfo {
|
||||
/// information due to the compiler's configuration.
|
||||
pub fn lookup_frame_info(&self, pc: usize) -> Option<(FrameInfo, bool)> {
|
||||
let (module, func) = self.func(pc)?;
|
||||
|
||||
// Use our relative position from the start of the function to find the
|
||||
// machine instruction that corresponds to `pc`, which then allows us to
|
||||
// map that to a wasm original source location.
|
||||
let rel_pos = (pc - func.start) as u32;
|
||||
let pos = match func
|
||||
.instr_map
|
||||
.instructions
|
||||
.binary_search_by_key(&rel_pos, |map| map.code_offset)
|
||||
{
|
||||
// Exact hit!
|
||||
Ok(pos) => Some(pos),
|
||||
|
||||
// This *would* be at the first slot in the array, so no
|
||||
// instructions cover `pc`.
|
||||
Err(0) => None,
|
||||
|
||||
// This would be at the `nth` slot, so we're at the `n-1`th slot.
|
||||
Err(n) => Some(n - 1),
|
||||
};
|
||||
let pos = func.instr_pos(pc);
|
||||
|
||||
// In debug mode for now assert that we found a mapping for `pc` within
|
||||
// the function, because otherwise something is buggy along the way and
|
||||
@@ -138,15 +133,7 @@ impl StoreFrameInfo {
|
||||
}
|
||||
|
||||
/// Finds the module and function registered with this store whose address
/// ranges contain `pc`, or returns `None` when `pc` is not covered.
fn func(&self, pc: usize) -> Option<(&ModuleFrameInfo, &FunctionInfo)> {
    // The range search itself lives in the free `func` helper so that the
    // per-store and the global maps share a single implementation. Inside
    // this method `func` names that free function, not this method.
    func(pc, &self.ranges, |t| (t.start, &t.functions))
}
|
||||
|
||||
/// Registers a new compiled module's frame information.
|
||||
@@ -183,6 +170,7 @@ impl StoreFrameInfo {
|
||||
if functions.len() == 0 {
|
||||
return;
|
||||
}
|
||||
let functions = Arc::new(functions);
|
||||
|
||||
// First up assert that our chunk of jit functions doesn't collide with
|
||||
// any other known chunks of jit functions...
|
||||
@@ -194,6 +182,7 @@ impl StoreFrameInfo {
|
||||
}
|
||||
|
||||
// ... then insert our range and assert nothing was there previously
|
||||
GLOBAL_INFO.lock().unwrap().register(min, max, &functions);
|
||||
let prev = self.ranges.insert(
|
||||
max,
|
||||
ModuleFrameInfo {
|
||||
@@ -208,6 +197,138 @@ impl StoreFrameInfo {
|
||||
}
|
||||
}
|
||||
|
||||
impl FunctionInfo {
|
||||
fn instr_pos(&self, pc: usize) -> Option<usize> {
|
||||
// Use our relative position from the start of the function to find the
|
||||
// machine instruction that corresponds to `pc`, which then allows us to
|
||||
// map that to a wasm original source location.
|
||||
let rel_pos = (pc - self.start) as u32;
|
||||
match self
|
||||
.instr_map
|
||||
.instructions
|
||||
.binary_search_by_key(&rel_pos, |map| map.code_offset)
|
||||
{
|
||||
// Exact hit!
|
||||
Ok(pos) => Some(pos),
|
||||
|
||||
// This *would* be at the first slot in the array, so no
|
||||
// instructions cover `pc`.
|
||||
Err(0) => None,
|
||||
|
||||
// This would be at the `nth` slot, so we're at the `n-1`th slot.
|
||||
Err(n) => Some(n - 1),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for StoreFrameInfo {
|
||||
fn drop(&mut self) {
|
||||
let mut info = GLOBAL_INFO.lock().unwrap();
|
||||
for end in self.ranges.keys() {
|
||||
info.unregister(*end);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// This is the dual of `StoreFrameInfo` and is stored globally (as the name
|
||||
/// implies) rather than simply in one `Store`.
|
||||
///
|
||||
/// The purpose of this map is to be called from signal handlers to determine
|
||||
/// whether a program counter is a wasm trap or not. Specifically macOS has
|
||||
/// no contextual information about the thread available, hence the necessity
|
||||
/// for global state rather than using thread local state.
|
||||
///
|
||||
/// This is similar to `StoreFrameInfo` except that it has less information and
|
||||
/// supports removal. Any time anything is registered with a `StoreFrameInfo`
|
||||
/// it is also automatically registered with the singleton global frame
|
||||
/// information. When a `StoreFrameInfo` is destroyed then all of its entries
|
||||
/// are removed from the global frame information.
|
||||
#[derive(Default)]
|
||||
pub(crate) struct GlobalFrameInfo {
|
||||
// The map here behaves the same way as `StoreFrameInfo`.
|
||||
ranges: BTreeMap<usize, GlobalModuleFrameInfo>,
|
||||
}
|
||||
|
||||
/// This is the equivalent of `ModuleFrameInfo` except has less code and is
|
||||
/// stored within `GlobalFrameInfo`.
|
||||
struct GlobalModuleFrameInfo {
|
||||
start: usize,
|
||||
functions: Arc<BTreeMap<usize, FunctionInfo>>,
|
||||
|
||||
/// Note that modules can be instantiated in many stores, so the purpose of
|
||||
/// this field is to keep track of how many stores have registered a
|
||||
/// module. Information is only removed from the global store when this
|
||||
/// reference count reaches 0.
|
||||
references: usize,
|
||||
}
|
||||
|
||||
lazy_static::lazy_static! {
    /// The singleton global frame information, guarded by a mutex and
    /// created empty on first use.
    static ref GLOBAL_INFO: Mutex<GlobalFrameInfo> = Mutex::new(GlobalFrameInfo::default());
}
|
||||
|
||||
impl GlobalFrameInfo {
|
||||
/// Returns whether the `pc`, according to globally registered information,
|
||||
/// is a wasm trap or not.
|
||||
pub(crate) fn is_wasm_pc(pc: usize) -> bool {
|
||||
let info = GLOBAL_INFO.lock().unwrap();
|
||||
match func(pc, &info.ranges, |i| (i.start, &i.functions)) {
|
||||
Some((_, info)) => info.instr_pos(pc).is_some(),
|
||||
None => false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Registers a new region of code, described by `(start, end)` and with
|
||||
/// the given function information, with the global information.
|
||||
fn register(
|
||||
&mut self,
|
||||
start: usize,
|
||||
end: usize,
|
||||
functions: &Arc<BTreeMap<usize, FunctionInfo>>,
|
||||
) {
|
||||
let info = self
|
||||
.ranges
|
||||
.entry(end)
|
||||
.or_insert_with(|| GlobalModuleFrameInfo {
|
||||
start,
|
||||
functions: functions.clone(),
|
||||
references: 0,
|
||||
});
|
||||
// Note that ideally we'd debug_assert that the information previously
|
||||
// stored, if any, matches the `functions` we were given, but for now we
|
||||
// just do some simple checks to hope it's the same.
|
||||
assert_eq!(info.start, start);
|
||||
assert_eq!(info.functions.len(), functions.len());
|
||||
info.references += 1;
|
||||
}
|
||||
|
||||
/// Unregisters a region of code (keyed by the `end` address) from this
|
||||
/// global information.
|
||||
fn unregister(&mut self, end: usize) {
|
||||
let val = self.ranges.get_mut(&end).unwrap();
|
||||
val.references -= 1;
|
||||
if val.references == 0 {
|
||||
self.ranges.remove(&end);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn func<T>(
|
||||
pc: usize,
|
||||
ranges: &BTreeMap<usize, T>,
|
||||
get_start_and_functions: impl FnOnce(&T) -> (usize, &BTreeMap<usize, FunctionInfo>),
|
||||
) -> Option<(&T, &FunctionInfo)> {
|
||||
let (end, info) = ranges.range(pc..).next()?;
|
||||
let (start, functions) = get_start_and_functions(info);
|
||||
if pc < start || *end < pc {
|
||||
return None;
|
||||
}
|
||||
let (end, func) = functions.range(pc..).next()?;
|
||||
if pc < func.start || *end < pc {
|
||||
return None;
|
||||
}
|
||||
Some((info, func))
|
||||
}
|
||||
|
||||
/// Description of a frame in a backtrace for a [`Trap`].
|
||||
///
|
||||
/// Whenever a WebAssembly trap occurs an instance of [`Trap`] is created. Each
|
||||
|
||||
@@ -10,6 +10,7 @@ use std::mem;
|
||||
use std::panic::{self, AssertUnwindSafe};
|
||||
use std::pin::Pin;
|
||||
use std::ptr::{self, NonNull};
|
||||
use std::sync::atomic::Ordering::Relaxed;
|
||||
use wasmtime_environ::wasm::{EntityIndex, FuncIndex};
|
||||
use wasmtime_runtime::{
|
||||
raise_user_trap, ExportFunction, InstanceAllocator, InstanceHandle, OnDemandInstanceAllocator,
|
||||
@@ -1149,20 +1150,109 @@ impl fmt::Debug for Func {
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn invoke_wasm_and_catch_traps(
|
||||
store: &Store,
|
||||
closure: impl FnMut(),
|
||||
) -> Result<(), Trap> {
|
||||
unsafe {
|
||||
let canary = 0;
|
||||
let _auto_reset_canary = store
|
||||
.externref_activations_table()
|
||||
.set_stack_canary(&canary);
|
||||
let _reset = if store.externref_activations_table().stack_canary().is_some() {
|
||||
None
|
||||
} else {
|
||||
Some(enter_wasm_init(store)?)
|
||||
};
|
||||
|
||||
wasmtime_runtime::catch_traps(store, closure).map_err(|e| Trap::from_runtime(store, e))
|
||||
}
|
||||
}
|
||||
|
||||
/// This function is called to register state within `Store` whenever
|
||||
/// WebAssembly is entered for the first time within the `Store`. This isn't
|
||||
/// called when wasm is called recursively within the `Store`.
|
||||
///
|
||||
/// This function sets up various limits such as:
|
||||
///
|
||||
/// * The stack limit. This is what ensures that we limit the stack space
|
||||
/// allocated by WebAssembly code and it's relative to the initial stack
|
||||
/// pointer that called into wasm.
|
||||
///
|
||||
/// * Stack canaries for externref gc tracing. Currently the implementation
|
||||
/// relies on walking frames but the stack walker isn't always 100% reliable,
|
||||
/// so a canary is used to ensure that if the canary is seen then it's
|
||||
/// guaranteed all wasm frames have been walked.
|
||||
///
|
||||
/// This function may fail if the the stack limit can't be set because an
|
||||
/// interrupt already happened. Otherwise it returns a value that resets the
|
||||
/// various limits on `Drop`.
|
||||
#[inline]
|
||||
fn enter_wasm_init<'a>(store: &'a Store) -> Result<impl Drop + 'a, Trap> {
|
||||
let stack_pointer = psm::stack_pointer() as usize;
|
||||
|
||||
// Determine the stack pointer where, after which, any wasm code will
|
||||
// immediately trap. This is checked on the entry to all wasm functions.
|
||||
//
|
||||
// Note that this isn't 100% precise. We are requested to give wasm
|
||||
// `max_wasm_stack` bytes, but what we're actually doing is giving wasm
|
||||
// probably a little less than `max_wasm_stack` because we're
|
||||
// calculating the limit relative to this function's approximate stack
|
||||
// pointer. Wasm will be executed on a frame beneath this one (or next
|
||||
// to it). In any case it's expected to be at most a few hundred bytes
|
||||
// of slop one way or another. When wasm is typically given a MB or so
|
||||
// (a million bytes) the slop shouldn't matter too much.
|
||||
//
|
||||
// After we've got the stack limit then we store it into the `stack_limit`
|
||||
// variable. Note that the store is an atomic swap to ensure that we can
|
||||
// consume any previously-sent interrupt requests. If we found that wasm was
|
||||
// previously interrupted then we immediately return a trap (after resetting
|
||||
// the stack limit). Otherwise we're good to keep on going.
|
||||
//
|
||||
// Note the usage of `Relaxed` memory orderings here. This is specifically
|
||||
// an optimization in the `Drop` below where a `Relaxed` store is speedier
|
||||
// than a `SeqCst` store. The rationale for `Relaxed` here is that the
|
||||
// atomic orderings here aren't actually protecting any memory, we're just
|
||||
// trying to be atomic with respect to this one location in memory (for when
|
||||
// `InterruptHandle` sends us a signal). Due to the lack of needing to
|
||||
// synchronize with any other memory it's hoped that the choice of `Relaxed`
|
||||
// here should be correct for our use case.
|
||||
let wasm_stack_limit = stack_pointer - store.engine().config().max_wasm_stack;
|
||||
let interrupts = store.interrupts();
|
||||
match interrupts.stack_limit.swap(wasm_stack_limit, Relaxed) {
|
||||
wasmtime_environ::INTERRUPTED => {
|
||||
// This means that an interrupt happened before we actually
|
||||
// called this function, which means that we're now
|
||||
// considered interrupted.
|
||||
interrupts.stack_limit.store(usize::max_value(), Relaxed);
|
||||
return Err(Trap::new_wasm(
|
||||
Some(store),
|
||||
None,
|
||||
wasmtime_environ::ir::TrapCode::Interrupt,
|
||||
backtrace::Backtrace::new_unresolved(),
|
||||
));
|
||||
}
|
||||
n => debug_assert_eq!(usize::max_value(), n),
|
||||
}
|
||||
store
|
||||
.externref_activations_table()
|
||||
.set_stack_canary(Some(stack_pointer));
|
||||
|
||||
return Ok(Reset(store));
|
||||
|
||||
struct Reset<'a>(&'a Store);
|
||||
|
||||
impl Drop for Reset<'_> {
|
||||
#[inline]
|
||||
fn drop(&mut self) {
|
||||
self.0.externref_activations_table().set_stack_canary(None);
|
||||
|
||||
// see docs above for why this uses `Relaxed`
|
||||
self.0
|
||||
.interrupts()
|
||||
.stack_limit
|
||||
.store(usize::max_value(), Relaxed);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A trait implemented for types which can be returned from closures passed to
|
||||
/// [`Func::wrap`] and friends.
|
||||
///
|
||||
|
||||
@@ -93,8 +93,6 @@ where
|
||||
));
|
||||
}
|
||||
|
||||
let anyfunc = self.func.export.anyfunc.as_ref();
|
||||
let trampoline = self.func.trampoline;
|
||||
let params = MaybeUninit::new(params);
|
||||
let mut ret = MaybeUninit::uninit();
|
||||
let mut called = false;
|
||||
@@ -102,9 +100,10 @@ where
|
||||
let result = invoke_wasm_and_catch_traps(&self.func.instance.store, || {
|
||||
called = true;
|
||||
let params = ptr::read(params.as_ptr());
|
||||
let anyfunc = self.func.export.anyfunc.as_ref();
|
||||
let result = params.invoke::<Results>(
|
||||
&self.func.instance.store,
|
||||
trampoline,
|
||||
self.func.trampoline,
|
||||
anyfunc.func_ptr.as_ptr(),
|
||||
anyfunc.vmctx,
|
||||
ptr::null_mut(),
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
use crate::frame_info;
|
||||
use crate::frame_info::StoreFrameInfo;
|
||||
use crate::sig_registry::SignatureRegistry;
|
||||
use crate::trampoline::StoreInstanceHandle;
|
||||
@@ -136,7 +137,8 @@ impl Store {
|
||||
// once-per-thread. Platforms like Unix, however, only require this
|
||||
// once-per-program. In any case this is safe to call many times and
|
||||
// each one that's not relevant just won't do anything.
|
||||
wasmtime_runtime::init_traps().expect("failed to initialize trap handling");
|
||||
wasmtime_runtime::init_traps(frame_info::GlobalFrameInfo::is_wasm_pc)
|
||||
.expect("failed to initialize trap handling");
|
||||
|
||||
Store {
|
||||
inner: Rc::new(StoreInner {
|
||||
@@ -401,6 +403,7 @@ impl Store {
|
||||
*self.inner.signal_handler.borrow_mut() = handler;
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn interrupts(&self) -> &VMInterrupts {
|
||||
&self.inner.interrupts
|
||||
}
|
||||
@@ -924,10 +927,6 @@ unsafe impl TrapInfo for Store {
|
||||
self
|
||||
}
|
||||
|
||||
fn is_wasm_trap(&self, addr: usize) -> bool {
|
||||
self.frame_info().borrow().lookup_trap_info(addr).is_some()
|
||||
}
|
||||
|
||||
fn custom_signal_handler(&self, call: &dyn Fn(&SignalHandler) -> bool) -> bool {
|
||||
if let Some(handler) = &*self.inner.signal_handler.borrow() {
|
||||
return call(handler);
|
||||
@@ -935,11 +934,6 @@ unsafe impl TrapInfo for Store {
|
||||
false
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn max_wasm_stack(&self) -> usize {
|
||||
self.engine().config().max_wasm_stack
|
||||
}
|
||||
|
||||
fn out_of_gas(&self) {
|
||||
match self.inner.out_of_gas_behavior.get() {
|
||||
OutOfGas::Trap => self.out_of_gas_trap(),
|
||||
|
||||
@@ -182,7 +182,7 @@ impl Trap {
|
||||
}
|
||||
}
|
||||
|
||||
fn new_wasm(
|
||||
pub(crate) fn new_wasm(
|
||||
store: Option<&Store>,
|
||||
trap_pc: Option<usize>,
|
||||
code: ir::TrapCode,
|
||||
|
||||
Reference in New Issue
Block a user