wasmtime/benches/call.rs
Alex Crichton f4b9020913 Change wasm-to-host trampolines to take the values_vec size (#4192)
* Change wasm-to-host trampolines to take the values_vec size

This commit changes the ABI of wasm-to-host trampolines, which are
only used right now for functions created with `Func::new`, to pass
along the size of the `values_vec` argument. Previously the trampoline
simply received `*mut ValRaw` and assumed that it was the appropriate
size. By receiving a size as well we can thread through `&mut [ValRaw]`
internally instead of `*mut ValRaw`.
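
For example, host callbacks defined through the unchecked API now operate
on a sized `&mut [ValRaw]` slice, so accesses to the value storage can be
bounds-checked. A minimal sketch in the style of the `unchecked` cases in
the benchmark below (the `add` import and the `define_add` helper are
illustrative only, not APIs added by this change):

    use wasmtime::*;

    fn define_add(linker: &mut Linker<()>) {
        let ty = FuncType::new([ValType::I32, ValType::I32], [ValType::I32]);
        unsafe {
            linker
                .func_new_unchecked("", "add", ty, |mut caller, space| {
                    // `space` is `&mut [ValRaw]`: its length comes from the
                    // size now passed through the trampoline, so indexing is
                    // bounds-checked.
                    let a = match Val::from_raw(&mut caller, space[0], ValType::I32) {
                        Val::I32(a) => a,
                        _ => unreachable!(),
                    };
                    let b = match Val::from_raw(&mut caller, space[1], ValType::I32) {
                        Val::I32(b) => b,
                        _ => unreachable!(),
                    };
                    // The single result is written back into slot 0.
                    space[0] = Val::I32(a + b).to_raw(&mut caller);
                    Ok(())
                })
                .unwrap();
        }
    }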

The original motivation for this is that I'm planning to leverage these
trampolines for host-defined functions in the component model. Out of an
abundance of caution, to make sure that everything lines up, I wanted to
be able to write down asserts comparing the size received at runtime
against the size expected. That led me to thread this size parameter
through, on the assumption that it would not impact performance all that
much.

I ran two benchmarks locally from the `call.rs` benchmark and got:

* `sync/no-hook/wasm-to-host - nop - unchecked` - no change
* `sync/no-hook/wasm-to-host - nop-params-and-results - unchecked` - 5%
  slower

This is roughly what I expected: if nothing actually reads the new
parameter (e.g. there are no arguments) then threading it through is
effectively free. When arguments are present, though, accesses to the
`ValRaw` storage are now bounds-checked internally in Wasmtime instead of
assumed valid, leading to the 5% slowdown (~9.6ns to ~10.3ns). If this
becomes a performance bottleneck for a particular use case then we should
be fine to remove the bounds checking here or only bounds-check in debug
mode; otherwise I plan on leaving this as-is.

Of particular note, this also changes the C API for `*_unchecked`
functions: the C callback now receives the size of the array as well.

* Add docs
2022-06-01 09:05:37 -05:00

use criterion::measurement::WallTime;
use criterion::{criterion_group, criterion_main, BenchmarkGroup, Criterion};
use std::fmt::Debug;
use std::future::Future;
use std::pin::Pin;
use std::task::{Context, Poll, RawWaker, RawWakerVTable, Waker};
use std::time::Instant;
use wasmtime::*;

criterion_main!(benches);
criterion_group!(benches, measure_execution_time);

fn measure_execution_time(c: &mut Criterion) {
    host_to_wasm(c);
    wasm_to_host(c);
}

#[derive(Copy, Clone)]
enum IsAsync {
    Yes,
    YesPooling,
    No,
}

impl IsAsync {
    fn desc(&self) -> &str {
        match self {
            IsAsync::Yes => "async",
            IsAsync::YesPooling => "async-pool",
            IsAsync::No => "sync",
        }
    }

    fn use_async(&self) -> bool {
        match self {
            IsAsync::Yes | IsAsync::YesPooling => true,
            IsAsync::No => false,
        }
    }
}

/// Returns the engine configurations to benchmark: a plain synchronous
/// engine, an async engine, and an async engine using the pooling instance
/// allocator.
fn engines() -> Vec<(Engine, IsAsync)> {
    let mut config = Config::new();
    vec![
        (Engine::new(&config).unwrap(), IsAsync::No),
        (
            Engine::new(config.async_support(true)).unwrap(),
            IsAsync::Yes,
        ),
        (
            Engine::new(config.allocation_strategy(InstanceAllocationStrategy::pooling())).unwrap(),
            IsAsync::YesPooling,
        ),
    ]
}

/// Benchmarks the overhead of calling WebAssembly from the host in various
/// configurations.
fn host_to_wasm(c: &mut Criterion) {
    for (engine, is_async) in engines() {
        let mut store = Store::new(&engine, ());
        let module = Module::new(
            &engine,
            r#"(module
                (func (export "nop"))
                (func (export "nop-params-and-results") (param i32 i64) (result f32)
                    f32.const 0)
            )"#,
        )
        .unwrap();
        let instance = if is_async.use_async() {
            run_await(Instance::new_async(&mut store, &module, &[])).unwrap()
        } else {
            Instance::new(&mut store, &module, &[]).unwrap()
        };

        let bench_calls = |group: &mut BenchmarkGroup<'_, WallTime>, store: &mut Store<()>| {
            // Bench the overhead of a function that has no parameters or results
            bench_host_to_wasm::<(), ()>(group, store, &instance, is_async, "nop", (), ());

            // Bench the overhead of a function that has some parameters and just
            // one result (will use the raw system-v convention on applicable
            // platforms).
            bench_host_to_wasm::<(i32, i64), (f32,)>(
                group,
                store,
                &instance,
                is_async,
                "nop-params-and-results",
                (0, 0),
                (0.0,),
            );
        };

        // Bench once without any call hooks configured
        let name = format!("{}/no-hook", is_async.desc());
        bench_calls(&mut c.benchmark_group(&name), &mut store);

        // Bench again with a "call hook" enabled
        store.call_hook(|_, _| Ok(()));
        let name = format!("{}/hook-sync", is_async.desc());
        bench_calls(&mut c.benchmark_group(&name), &mut store);
    }
}

fn bench_host_to_wasm<Params, Results>(
    c: &mut BenchmarkGroup<'_, WallTime>,
    store: &mut Store<()>,
    instance: &Instance,
    is_async: IsAsync,
    name: &str,
    typed_params: Params,
    typed_results: Results,
) where
    Params: WasmParams + ToVals + Copy,
    Results: WasmResults + ToVals + Copy + PartialEq + Debug,
{
    // Benchmark the "typed" version, which should be faster than the versions
    // below.
    c.bench_function(&format!("host-to-wasm - typed - {}", name), |b| {
        let typed = instance
            .get_typed_func::<Params, Results, _>(&mut *store, name)
            .unwrap();
        b.iter(|| {
            let results = if is_async.use_async() {
                run_await(typed.call_async(&mut *store, typed_params)).unwrap()
            } else {
                typed.call(&mut *store, typed_params).unwrap()
            };
            assert_eq!(results, typed_results);
        })
    });

    // Benchmark the "untyped" version which should be the slowest of the three
    // here, but not unduly slow.
    c.bench_function(&format!("host-to-wasm - untyped - {}", name), |b| {
        let untyped = instance.get_func(&mut *store, name).unwrap();
        let params = typed_params.to_vals();
        let expected_results = typed_results.to_vals();
        let mut results = vec![Val::I32(0); expected_results.len()];
        b.iter(|| {
            if is_async.use_async() {
                run_await(untyped.call_async(&mut *store, &params, &mut results)).unwrap();
            } else {
                untyped.call(&mut *store, &params, &mut results).unwrap();
            }
            for (expected, actual) in expected_results.iter().zip(&results) {
                assert_vals_eq(expected, actual);
            }
        })
    });

    // Currently `call_async_unchecked` isn't implemented, so can't benchmark
    // below
    if is_async.use_async() {
        return;
    }

    // Benchmark the "unchecked" version which should be between the above two,
    // but is unsafe.
    c.bench_function(&format!("host-to-wasm - unchecked - {}", name), |b| {
        let untyped = instance.get_func(&mut *store, name).unwrap();
        let params = typed_params.to_vals();
        let results = typed_results.to_vals();
        let mut space = vec![ValRaw::i32(0); params.len().max(results.len())];
        b.iter(|| unsafe {
            for (i, param) in params.iter().enumerate() {
                space[i] = param.to_raw(&mut *store);
            }
            untyped
                .call_unchecked(&mut *store, space.as_mut_ptr())
                .unwrap();
            for (i, expected) in results.iter().enumerate() {
                assert_vals_eq(
                    expected,
                    &Val::from_raw(&mut *store, space[i], expected.ty()),
                );
            }
        })
    });
}

/// Benchmarks the overhead of calling the host from WebAssembly itself
fn wasm_to_host(c: &mut Criterion) {
    let module = r#"(module
        ;; host imports with a variety of parameters/arguments
        (import "" "nop" (func $nop))
        (import "" "nop-params-and-results"
            (func $nop_params_and_results (param i32 i64) (result f32))
        )

        ;; "runner functions" for each of the above imports. Each runner
        ;; function takes, as a parameter, the number of times to call the
        ;; host function, since the duration of this entire loop is what gets
        ;; measured.
        (func (export "run-nop") (param i64)
            loop
                call $nop
                local.get 0 ;; decrement & break if necessary
                i64.const -1
                i64.add
                local.tee 0
                i64.const 0
                i64.ne
                br_if 0
            end
        )
        (func (export "run-nop-params-and-results") (param i64)
            loop
                i32.const 0 ;; always zero parameters
                i64.const 0
                call $nop_params_and_results
                f32.const 0 ;; assert the correct result
                f32.eq
                i32.eqz
                if
                    unreachable
                end
                local.get 0 ;; decrement & break if necessary
                i64.const -1
                i64.add
                local.tee 0
                i64.const 0
                i64.ne
                br_if 0
            end
        )
    )"#;

    for (engine, is_async) in engines() {
        let mut store = Store::new(&engine, ());
        let module = Module::new(&engine, module).unwrap();

        bench_calls(
            &mut c.benchmark_group(&format!("{}/no-hook", is_async.desc())),
            &mut store,
            &module,
            is_async,
        );

        store.call_hook(|_, _| Ok(()));
        bench_calls(
            &mut c.benchmark_group(&format!("{}/hook-sync", is_async.desc())),
            &mut store,
            &module,
            is_async,
        );
    }

    // Given a `Store`, creates various instances hooked up to different ways
    // of defining host imports to benchmark their overhead.
    fn bench_calls(
        group: &mut BenchmarkGroup<'_, WallTime>,
        store: &mut Store<()>,
        module: &Module,
        is_async: IsAsync,
    ) {
        let engine = store.engine().clone();

        // Host imports defined through `func_wrap` (the "typed" path).
        let mut typed = Linker::new(&engine);
        typed.func_wrap("", "nop", || {}).unwrap();
        typed
            .func_wrap("", "nop-params-and-results", |x: i32, y: i64| {
                assert_eq!(x, 0);
                assert_eq!(y, 0);
                0.0f32
            })
            .unwrap();
        let instance = if is_async.use_async() {
            run_await(typed.instantiate_async(&mut *store, &module)).unwrap()
        } else {
            typed.instantiate(&mut *store, &module).unwrap()
        };
        bench_instance(group, store, &instance, "typed", is_async);

        // Host imports defined through `func_new` (the "untyped" path).
        let mut untyped = Linker::new(&engine);
        untyped
            .func_new("", "nop", FuncType::new([], []), |_, _, _| Ok(()))
            .unwrap();
        let ty = FuncType::new([ValType::I32, ValType::I64], [ValType::F32]);
        untyped
            .func_new(
                "",
                "nop-params-and-results",
                ty,
                |_caller, params, results| {
                    assert_eq!(params.len(), 2);
                    match params[0] {
                        Val::I32(0) => {}
                        _ => unreachable!(),
                    }
                    match params[1] {
                        Val::I64(0) => {}
                        _ => unreachable!(),
                    }
                    assert_eq!(results.len(), 1);
                    results[0] = Val::F32(0);
                    Ok(())
                },
            )
            .unwrap();
        let instance = if is_async.use_async() {
            run_await(untyped.instantiate_async(&mut *store, &module)).unwrap()
        } else {
            untyped.instantiate(&mut *store, &module).unwrap()
        };
        bench_instance(group, store, &instance, "untyped", is_async);

        // Host imports defined through `func_new_unchecked`, working directly
        // with the `ValRaw` storage.
        unsafe {
            let mut unchecked = Linker::new(&engine);
            unchecked
                .func_new_unchecked("", "nop", FuncType::new([], []), |_, _| Ok(()))
                .unwrap();
            let ty = FuncType::new([ValType::I32, ValType::I64], [ValType::F32]);
            unchecked
                .func_new_unchecked("", "nop-params-and-results", ty, |mut caller, space| {
                    match Val::from_raw(&mut caller, space[0], ValType::I32) {
                        Val::I32(0) => {}
                        _ => unreachable!(),
                    }
                    match Val::from_raw(&mut caller, space[1], ValType::I64) {
                        Val::I64(0) => {}
                        _ => unreachable!(),
                    }
                    space[0] = Val::F32(0).to_raw(&mut caller);
                    Ok(())
                })
                .unwrap();
            let instance = if is_async.use_async() {
                run_await(unchecked.instantiate_async(&mut *store, &module)).unwrap()
            } else {
                unchecked.instantiate(&mut *store, &module).unwrap()
            };
            bench_instance(group, store, &instance, "unchecked", is_async);
        }

        // Only define async host imports if allowed
        if !is_async.use_async() {
            return;
        }

        let mut typed = Linker::new(&engine);
        typed
            .func_wrap0_async("", "nop", |caller| {
                Box::new(async {
                    drop(caller);
                })
            })
            .unwrap();
        typed
            .func_wrap2_async("", "nop-params-and-results", |_caller, x: i32, y: i64| {
                Box::new(async move {
                    assert_eq!(x, 0);
                    assert_eq!(y, 0);
                    0.0f32
                })
            })
            .unwrap();
        let instance = run_await(typed.instantiate_async(&mut *store, &module)).unwrap();
        bench_instance(group, store, &instance, "async-typed", is_async);
    }

    // Given a specific instance, executes all of the "runner functions".
    fn bench_instance(
        group: &mut BenchmarkGroup<'_, WallTime>,
        store: &mut Store<()>,
        instance: &Instance,
        desc: &str,
        is_async: IsAsync,
    ) {
        group.bench_function(&format!("wasm-to-host - nop - {}", desc), |b| {
            let run = instance
                .get_typed_func::<u64, (), _>(&mut *store, "run-nop")
                .unwrap();
            b.iter_custom(|iters| {
                let start = Instant::now();
                if is_async.use_async() {
                    run_await(run.call_async(&mut *store, iters)).unwrap();
                } else {
                    run.call(&mut *store, iters).unwrap();
                }
                start.elapsed()
            })
        });
        group.bench_function(
            &format!("wasm-to-host - nop-params-and-results - {}", desc),
            |b| {
                let run = instance
                    .get_typed_func::<u64, (), _>(&mut *store, "run-nop-params-and-results")
                    .unwrap();
                b.iter_custom(|iters| {
                    let start = Instant::now();
                    if is_async.use_async() {
                        run_await(run.call_async(&mut *store, iters)).unwrap();
                    } else {
                        run.call(&mut *store, iters).unwrap();
                    }
                    start.elapsed()
                })
            },
        );
    }
}

fn assert_vals_eq(a: &Val, b: &Val) {
    match (a, b) {
        (Val::I32(a), Val::I32(b)) => assert_eq!(a, b),
        (Val::I64(a), Val::I64(b)) => assert_eq!(a, b),
        (Val::F32(a), Val::F32(b)) => assert_eq!(a, b),
        (Val::F64(a), Val::F64(b)) => assert_eq!(a, b),
        _ => unimplemented!(),
    }
}

/// Helper trait to convert a tuple of typed parameters/results into a list of
/// `Val`s for use with the untyped and unchecked APIs.
trait ToVals {
    fn to_vals(&self) -> Vec<Val>;
}

macro_rules! tuples {
    ($($t:ident)*) => (
        #[allow(non_snake_case)]
        impl<$($t: Copy + Into<Val>,)*> ToVals for ($($t,)*) {
            fn to_vals(&self) -> Vec<Val> {
                let mut _dst = Vec::new();
                let ($($t,)*) = *self;
                $(_dst.push($t.into());)*
                _dst
            }
        }
    )
}

tuples!();
tuples!(A);
tuples!(A B);
tuples!(A B C);

/// Minimal executor used to resolve futures returned by the `*_async` APIs:
/// the future is polled in a loop with a no-op waker until it's ready.
fn run_await<F: Future>(future: F) -> F::Output {
    let mut f = Pin::from(Box::new(future));
    let waker = dummy_waker();
    let mut cx = Context::from_waker(&waker);
    loop {
        match f.as_mut().poll(&mut cx) {
            Poll::Ready(val) => break val,
            Poll::Pending => {}
        }
    }
}

/// Creates a `Waker` that does nothing, suitable for the busy-poll loop above.
fn dummy_waker() -> Waker {
    return unsafe { Waker::from_raw(clone(5 as *const _)) };

    unsafe fn clone(ptr: *const ()) -> RawWaker {
        assert_eq!(ptr as usize, 5);
        const VTABLE: RawWakerVTable = RawWakerVTable::new(clone, wake, wake_by_ref, drop);
        RawWaker::new(ptr, &VTABLE)
    }

    unsafe fn wake(ptr: *const ()) {
        assert_eq!(ptr as usize, 5);
    }

    unsafe fn wake_by_ref(ptr: *const ()) {
        assert_eq!(ptr as usize, 5);
    }

    unsafe fn drop(ptr: *const ()) {
        assert_eq!(ptr as usize, 5);
    }
}