Files
wasmtime/crates/runtime/src/component/transcode.rs
Nick Fitzgerald a2f846f124 Don't re-capture backtraces when propagating traps through host frames (#5049)
* Add a benchmark for traps with many Wasm<-->host calls on the stack

* Add a test for expected Wasm stack traces with Wasm<-->host calls on the stack when we trap

* Don't re-capture backtraces when propagating traps through host frames

This fixes some accidentally quadratic code where we would re-capture a Wasm
stack trace (takes `O(n)` time) every time we propagated a trap through a host
frame back to Wasm (can happen `O(n)` times). And `O(n) * O(n) = O(n^2)`, of
course. Whoops. After this commit, trapping with a call stack that is `n`
frames deep of Wasm-to-host-to-Wasm calls just captures a single backtrace and
is therefore just a proper `O(n)` time operation, as it is intended to be.

Now we explicitly track whether we need to capture a Wasm backtrace or not when
raising a trap. This unfortunately isn't as straightforward as one might hope,
however, because of the split between `wasmtime::Trap` and
`wasmtime_runtime::Trap`. We need to decide whether or not to capture a Wasm
backtrace inside `wasmtime_runtime` but in order to determine whether to do that
or not we need to reflect on the `anyhow::Error` and see if it is a
`wasmtime::Trap` that already has a backtrace or not. This can't be done the
straightforward way because it would introduce a cyclic dependency between the
`wasmtime` and `wasmtime-runtime` crates. We can't merge those two `Trap`
types-- at least not without effectively merging the whole `wasmtime` and
`wasmtime-runtime` crates together, which would be a good idea in a perfect
world but would be a *ton* of ocean boiling from where we currently are --
because `wasmtime::Trap` does symbolication of stack traces which relies on
module registration information data that resides inside the `wasmtime` crate
and therefore can't be moved into `wasmtime-runtime`. We resolve this problem by
adding a boolean to `wasmtime_runtime::raise_user_trap` that controls whether we
should capture a Wasm backtrace or not, and then determine whether we need a
backtrace or not at each of that function's call sites, which are in `wasmtime`
and therefore can do the reflection to determine whether the user trap already
has a backtrace or not. Phew!

Fixes #5037

* debug assert that we don't record unnecessary backtraces for traps

* Add assertions around `needs_backtrace`

Unfortunately we can't do

    debug_assert_eq!(needs_backtrace, trap.inner.backtrace.get().is_some());

because `needs_backtrace` doesn't consider whether Wasm backtraces have been
disabled via config.

* Consolidate `needs_backtrace` calculation followed by calling `raise_user_trap` into one place
2022-10-13 07:22:46 -07:00

452 lines
17 KiB
Rust

//! Implementation of string transcoding required by the component model.
use anyhow::{anyhow, Result};
use std::cell::Cell;
use std::slice;
/// Bit that is OR-ed into a returned code-unit length to tag the result as
/// utf16-encoded (as opposed to latin1) in the "latin1+utf16" compact encoding.
const UTF16_TAG: usize = 1 << 31;
/// Macro to define the `VMBuiltinTranscodeArray` type which contains all of the
/// function pointers to the actual transcoder functions. This structure is read
/// by Cranelift-generated code, hence the `repr(C)`.
///
/// Note that this references the `trampolines` module rather than the functions
/// below as the `trampolines` module has the raw ABI.
///
/// This is modeled after the similar macros and usages in `libcalls.rs` and
/// `vmcontext.rs`
///
/// Besides the main arm, the macro has internal `@ty` and `@retptr` arms that
/// map the parameter/result kind names used in the transcoder signatures
/// (`size`, `size_pair`, `ptr_u8`, `ptr_u16`) to concrete Rust types. Those
/// arms are also used by the `trampolines` module.
macro_rules! define_transcoders {
// Main arm: receives the full list of transcoder signatures from
// `foreach_transcoder!` and emits one function-pointer field per transcoder.
(
$(
$( #[$attr:meta] )*
$name:ident( $( $pname:ident: $param:ident ),* ) $( -> $result:ident )?;
)*
) => {
/// An array that stores addresses of builtin functions. We translate code
/// to use indirect calls. This way, we don't have to patch the code.
#[repr(C)]
pub struct VMBuiltinTranscodeArray {
$(
$name: unsafe extern "C" fn(
$(define_transcoders!(@ty $param),)*
$(define_transcoders!(@retptr $result),)?
) $( -> define_transcoders!(@ty $result))?,
)*
}
impl VMBuiltinTranscodeArray {
// Every field is initialized with the same-named function from the
// `trampolines` module.
pub const INIT: VMBuiltinTranscodeArray = VMBuiltinTranscodeArray {
$($name: trampolines::$name,)*
};
}
};
// `@ty`: the Rust type used for a parameter or a direct return value. Note
// that a `size_pair` is returned directly as a single `usize`; its second
// value travels through the return pointer declared by `@retptr` below.
(@ty size) => (usize);
(@ty size_pair) => (usize);
(@ty ptr_u8) => (*mut u8);
(@ty ptr_u16) => (*mut u16);
// `@retptr`: the type of the trailing return-pointer parameter that is added
// whenever a transcoder declares a result. A `size_pair` result gets a real
// out-pointer, a plain `size` result gets a unit-typed placeholder.
(@retptr size_pair) => (*mut usize);
(@retptr size) => (());
}
// Instantiate the array type for every transcoder known to `wasmtime_environ`.
wasmtime_environ::foreach_transcoder!(define_transcoders);
/// Submodule with macro-generated constants which are the actual libcall
/// transcoders that are invoked by Cranelift. These functions have a specific
/// ABI defined by the macro itself and will defer to the actual bodies of each
/// implementation following this submodule.
///
/// Each generated function validates its pointer parameters, runs the
/// same-named function from the parent module inside `catch_unwind`, and then
/// either returns the result, raises a trap for an `Err`, or resumes a caught
/// panic through the trap-handling machinery.
#[allow(improper_ctypes_definitions)]
mod trampolines {
macro_rules! transcoders {
// Main arm: one `extern "C"` trampoline per transcoder signature.
(
$(
$( #[$attr:meta] )*
$name:ident( $( $pname:ident: $param:ident ),* ) $( -> $result:ident )?;
)*
) => (
$(
pub unsafe extern "C" fn $name(
$($pname : define_transcoders!(@ty $param),)*
// If a result is given then a `size_pair` results gets its
// second result value passed via a return pointer here, so
// optionally indicate a return pointer.
$(_retptr: define_transcoders!(@retptr $result))?
) $( -> define_transcoders!(@ty $result))? {
// Per-parameter sanity checks; only `ptr_u16` parameters have one.
$(transcoders!(@validate_param $pname $param);)*
// Always catch panics to avoid trying to unwind from Rust
// into Cranelift-generated code which would lead to a Bad
// Time.
//
// Additionally assume that every function below returns a
// `Result` where errors turn into traps.
let result = std::panic::catch_unwind(|| {
super::$name($($pname),*)
});
match result {
// Success: hand the value(s) back using the ABI for this result kind.
Ok(Ok(ret)) => transcoders!(@convert_ret ret _retptr $($result)?),
// Transcoding error: raise it as a user trap. `needs_backtrace: true`
// asks for a Wasm backtrace to be captured for this error.
Ok(Err(err)) => crate::traphandlers::raise_trap(
crate::traphandlers::TrapReason::User {
error: err,
needs_backtrace: true,
},
),
// The transcoder panicked: forward the payload to `resume_panic`.
Err(panic) => crate::traphandlers::resume_panic(panic),
}
}
)*
);
// `@convert_ret`: turn the transcoder's `Ok` value into the trampoline's
// return value. With no declared result, or a `size` result, the value is
// returned as-is.
(@convert_ret $ret:ident $retptr:ident) => ($ret);
(@convert_ret $ret:ident $retptr:ident size) => ($ret);
// For a `size_pair` the first element is returned directly and the second is
// written through the return pointer.
(@convert_ret $ret:ident $retptr:ident size_pair) => ({
let (a, b) = $ret;
*$retptr = b;
a
});
// `@validate_param`: 16-bit pointers must be 2-byte aligned.
(@validate_param $arg:ident ptr_u16) => ({
// This should already be guaranteed by the canonical ABI and our
// adapter modules, but double-check here to be extra-sure. If this
// is a perf concern it can become a `debug_assert!`.
assert!(($arg as usize) % 2 == 0, "unaligned 16-bit pointer");
});
// All other parameter kinds need no validation.
(@validate_param $arg:ident $ty:ident) => ();
}
wasmtime_environ::foreach_transcoder!(transcoders);
}
/// Asserts that the memory regions backing `a` and `b` do not overlap.
///
/// This property should already be guaranteed by construction in the component
/// model but assert it here to be extra sure. Nothing below is sound if regions
/// can overlap.
///
/// Each slice is treated as the half-open byte range `[start, end)`. Regions
/// that merely touch (one ends exactly where the other begins) share no bytes
/// and are therefore accepted, as are two empty regions at the same address.
///
/// # Panics
///
/// Panics unless the region that starts first ends at or before the address
/// where the other one begins.
fn assert_no_overlap<T, U>(a: &[T], b: &[U]) {
    let a_start = a.as_ptr() as usize;
    let a_end = a_start + std::mem::size_of_val(a);
    let b_start = b.as_ptr() as usize;
    let b_end = b_start + std::mem::size_of_val(b);

    // `end` is one-past-the-last byte, so equality with the other region's
    // start means the regions are adjacent, not overlapping.
    if a_start < b_start {
        assert!(a_end <= b_start);
    } else {
        assert!(b_end <= a_start);
    }
}
/// Validates a utf8 string and copies it into a utf8 destination.
///
/// `len` is the byte length of both the source and the destination buffer.
/// There is no result value beyond success or failure: an error is returned
/// when the source is not valid utf8, in which case nothing is written.
///
/// # Safety
///
/// `src` must be valid for reads and `dst` valid for writes of `len` bytes.
unsafe fn utf8_to_utf8(src: *mut u8, len: usize, dst: *mut u8) -> Result<()> {
    let input = slice::from_raw_parts(src, len);
    let output = slice::from_raw_parts_mut(dst, len);
    assert_no_overlap(input, output);
    log::trace!("utf8-to-utf8 {len}");
    match std::str::from_utf8(input) {
        Ok(valid) => {
            output.copy_from_slice(valid.as_bytes());
            Ok(())
        }
        Err(_) => Err(anyhow!("invalid utf8 encoding")),
    }
}
/// Validates a utf16 string and copies it into a utf16 destination.
///
/// `len` is the number of 16-bit code units in both the source and the
/// destination buffer. There is no result value beyond success or failure; an
/// error is returned when the source is not valid utf16.
///
/// # Safety
///
/// `src` must be valid for reads and `dst` valid for writes of `len` `u16`
/// units.
unsafe fn utf16_to_utf16(src: *mut u16, len: usize, dst: *mut u16) -> Result<()> {
    let units_in = slice::from_raw_parts(src, len);
    let units_out = slice::from_raw_parts_mut(dst, len);
    assert_no_overlap(units_in, units_out);
    log::trace!("utf16-to-utf16 {len}");
    // The "all latin1" answer is not interesting to this caller.
    run_utf16_to_utf16(units_in, units_out).map(|_all_latin1| ())
}
/// Decodes little-endian utf16 from `src` and re-encodes it as little-endian
/// utf16 into `dst`.
///
/// Returns `true` when every decoded code point fits in the latin1 range
/// (0xff and below), and an error when `src` is not valid utf16.
fn run_utf16_to_utf16(src: &[u16], dst: &mut [u16]) -> Result<bool> {
    let mut all_latin1 = true;
    // Index of the next unwritten code unit in `dst`.
    let mut cursor = 0;
    let native_units = src.iter().copied().map(u16::from_le);
    for decoded in std::char::decode_utf16(native_units) {
        let ch = decoded.map_err(|_| anyhow!("invalid utf16 encoding"))?;
        all_latin1 &= u32::from(ch) <= 0xff;
        let encoded = ch.encode_utf16(&mut dst[cursor..]);
        cursor += encoded.len();
        // `encode_utf16` produces native-endian units; store them as
        // little-endian.
        for unit in encoded {
            *unit = unit.to_le();
        }
    }
    Ok(all_latin1)
}
/// Copies a latin1 string into a latin1 destination.
///
/// Every byte sequence is a valid latin1 string, so no validation is needed
/// and this boils down to a plain memory copy of `len` bytes.
///
/// # Safety
///
/// `src` must be valid for reads and `dst` valid for writes of `len` bytes.
unsafe fn latin1_to_latin1(src: *mut u8, len: usize, dst: *mut u8) -> Result<()> {
    let input = slice::from_raw_parts(src, len);
    let output = slice::from_raw_parts_mut(dst, len);
    assert_no_overlap(input, output);
    log::trace!("latin1-to-latin1 {len}");
    output.copy_from_slice(input);
    Ok(())
}
/// Widens a latin1 string into a utf16 string.
///
/// Each latin1 byte becomes the u16 code unit with the same value, stored as
/// little-endian. `len` is the number of code units in both the source and
/// the destination buffer.
///
/// # Safety
///
/// `src` must be valid for reads of `len` bytes and `dst` valid for writes of
/// `len` `u16` units.
unsafe fn latin1_to_utf16(src: *mut u8, len: usize, dst: *mut u16) -> Result<()> {
    let narrow = slice::from_raw_parts(src, len);
    let wide = slice::from_raw_parts_mut(dst, len);
    assert_no_overlap(narrow, wide);
    wide.iter_mut()
        .zip(narrow)
        .for_each(|(unit, &byte)| *unit = u16::from(byte).to_le());
    log::trace!("latin1-to-utf16 {len}");
    Ok(())
}
/// Transcodes a utf8 string into utf16.
///
/// `len` is the number of code units of both buffers (bytes for the source,
/// `u16` units for the destination). The returned value is the number of u16
/// units that were written; an error is returned for invalid utf8.
///
/// # Safety
///
/// `src` must be valid for reads of `len` bytes and `dst` valid for writes of
/// `len` `u16` units.
unsafe fn utf8_to_utf16(src: *mut u8, len: usize, dst: *mut u16) -> Result<usize> {
    let bytes = slice::from_raw_parts(src, len);
    let units = slice::from_raw_parts_mut(dst, len);
    assert_no_overlap(bytes, units);
    let result = run_utf8_to_utf16(bytes, units)?;
    log::trace!("utf8-to-utf16 {len} => {result}");
    Ok(result)
}
/// Validates `src` as utf8 and writes its utf16 encoding, as little-endian
/// code units, to the front of `dst`.
///
/// Returns the number of u16 units written. Writing stops as soon as either
/// the encoded string or the destination is exhausted. An error is returned
/// when `src` is not valid utf8.
fn run_utf8_to_utf16(src: &[u8], dst: &mut [u16]) -> Result<usize> {
    let text = match std::str::from_utf8(src) {
        Ok(text) => text,
        Err(_) => return Err(anyhow!("invalid utf8 encoding")),
    };
    let mut written = 0;
    let mut slots = dst.iter_mut();
    for unit in text.encode_utf16() {
        match slots.next() {
            Some(slot) => *slot = unit.to_le(),
            None => break,
        }
        written += 1;
    }
    Ok(written)
}
/// Transcodes a utf16 string into utf8.
///
/// The two buffers are sized independently. The result is the pair
/// `(code units read from the source, bytes written to the destination)`. If
/// the destination cannot hold the whole string only a prefix is transcoded.
/// An error is returned when the source is not valid utf16.
///
/// # Safety
///
/// `src` must be valid for reads of `src_len` `u16` units and `dst` valid for
/// writes of `dst_len` bytes.
unsafe fn utf16_to_utf8(
    src: *mut u16,
    src_len: usize,
    dst: *mut u8,
    dst_len: usize,
) -> Result<(usize, usize)> {
    let input = slice::from_raw_parts(src, src_len);
    let mut output = slice::from_raw_parts_mut(dst, dst_len);
    assert_no_overlap(input, output);

    // Adapter over the source that yields native-endian code units and, as a
    // side effect, counts how many units have been pulled so far. That count
    // is what lets us report how much of the source was read.
    let pulled = Cell::new(0);
    let native_units = input.iter().map(|unit| {
        pulled.set(pulled.get() + 1);
        u16::from_le(*unit)
    });

    let mut src_read = 0;
    let mut dst_written = 0;
    for decoded in std::char::decode_utf16(native_units) {
        let ch = decoded.map_err(|_| anyhow!("invalid utf16 encoding"))?;

        // Stop once the next character no longer fits; the caller is expected
        // to come back later with a larger destination buffer.
        let available = output.len();
        if available < 4 && available < ch.len_utf8() {
            break;
        }

        // Only now commit the units consumed for this character, then encode
        // it and advance the destination.
        src_read = pulled.get();
        let encoded_len = ch.encode_utf8(output).len();
        dst_written += encoded_len;
        output = &mut output[encoded_len..];
    }

    log::trace!("utf16-to-utf8 {src_len}/{dst_len} => {src_read}/{dst_written}");
    Ok((src_read, dst_written))
}
/// Transcodes a latin1 string into utf8.
///
/// The two buffers are sized independently. The result is the pair
/// `(bytes read from the source, bytes written to the destination)`.
///
/// Only a prefix of the source is encoded when the destination is too small.
///
/// # Safety
///
/// `src` must be valid for reads of `src_len` bytes and `dst` valid for writes
/// of `dst_len` bytes.
unsafe fn latin1_to_utf8(
    src: *mut u8,
    src_len: usize,
    dst: *mut u8,
    dst_len: usize,
) -> Result<(usize, usize)> {
    let input = slice::from_raw_parts(src, src_len);
    let output = slice::from_raw_parts_mut(dst, dst_len);
    assert_no_overlap(input, output);
    let (read, written) = encoding_rs::mem::convert_latin1_to_utf8_partial(input, output);
    log::trace!("latin1-to-utf8 {src_len}/{dst_len} => ({read}, {written})");
    Ok((read, written))
}
/// Transcodes utf16 into the "latin1+utf16" encoding, expecting that the
/// outcome will most likely be utf16.
///
/// `len` is the number of code units in both the source and the destination
/// buffer. The source is first copied as utf16. If any character falls outside
/// the latin1 code space (0xff and below) the result stays utf16 and `len`
/// tagged with `UTF16_TAG` is returned. Otherwise the destination is deflated
/// in place to one byte per character and the plain latin1 length is returned.
///
/// # Safety
///
/// `src` must be valid for reads and `dst` valid for writes of `len` `u16`
/// units.
unsafe fn utf16_to_compact_probably_utf16(
    src: *mut u16,
    len: usize,
    dst: *mut u16,
) -> Result<usize> {
    let input = slice::from_raw_parts(src, len);
    let output = slice::from_raw_parts_mut(dst, len);
    assert_no_overlap(input, output);

    if !run_utf16_to_utf16(input, output)? {
        log::trace!("utf16-to-compact-probably-utf16 {len} => utf16 {len}");
        return Ok(len | UTF16_TAG);
    }

    // Everything is latin1: view the destination as bytes and squeeze out the
    // second byte of every little-endian code unit.
    let (prefix, bytes, suffix) = output.align_to_mut::<u8>();
    assert!(prefix.is_empty());
    assert!(suffix.is_empty());
    for unit_index in 0..len {
        bytes[unit_index] = bytes[2 * unit_index];
    }
    log::trace!("utf16-to-compact-probably-utf16 {len} => latin1 {len}");
    Ok(len)
}
/// Transcodes a utf8 string into latin1.
///
/// `len` is the length of both the input and the output buffer.
///
/// The result is the pair `(code units read from the source, code units
/// written to the destination)`.
///
/// The conversion covers only a prefix of the source when the utf8 string
/// contains usvs that cannot be represented in latin1.
///
/// # Safety
///
/// `src` must be valid for reads and `dst` valid for writes of `len` bytes.
unsafe fn utf8_to_latin1(src: *mut u8, len: usize, dst: *mut u8) -> Result<(usize, usize)> {
    let input = slice::from_raw_parts(src, len);
    let output = slice::from_raw_parts_mut(dst, len);
    assert_no_overlap(input, output);
    // Length of the leading part of the source that is latin1-representable.
    let read = encoding_rs::mem::utf8_latin1_up_to(input);
    let convertible = &input[..read];
    let written = encoding_rs::mem::convert_utf8_to_latin1_lossy(convertible, output);
    log::trace!("utf8-to-latin1 {len} => ({read}, {written})");
    Ok((read, written))
}
/// Transcodes a utf16 string into latin1.
///
/// Parameters and results mirror `utf8_to_latin1`: `len` is the code-unit
/// length of both buffers and the result is `(units read, units written)`.
/// Conversion stops at the first code unit that does not fit into a byte.
///
/// # Safety
///
/// `src` must be valid for reads of `len` `u16` units and `dst` valid for
/// writes of `len` bytes.
unsafe fn utf16_to_latin1(src: *mut u16, len: usize, dst: *mut u8) -> Result<(usize, usize)> {
    let units = slice::from_raw_parts(src, len);
    let bytes = slice::from_raw_parts_mut(dst, len);
    assert_no_overlap(units, bytes);
    let mut size = 0;
    for (slot, unit) in bytes.iter_mut().zip(units) {
        let narrowed = match u8::try_from(u16::from_le(*unit)) {
            Ok(byte) => byte,
            Err(_) => break,
        };
        *slot = narrowed;
        size += 1;
    }
    log::trace!("utf16-to-latin1 {len} => {size}");
    Ok((size, size))
}
/// Finishes a utf8 to "latin1+utf16" transcode as utf16 after an earlier
/// latin1 attempt had to stop.
///
/// The string was first run through `utf8_to_latin1`, which could not
/// transcode all of it. The part of the source that could not be encoded as
/// latin1 is what arrives here as `src` / `src_len`.
///
/// `dst` / `dst_len` describe the destination. Its first
/// `latin1_bytes_so_far` bytes (not code units) already hold latin1
/// characters, which are inflated in place to their utf16 equivalents first.
///
/// Afterwards all of `src` is transcoded into the space of `dst` that follows
/// the inflated prefix. The returned value is the number of units written by
/// that transcode plus `latin1_bytes_so_far`.
///
/// # Safety
///
/// `src` must be valid for reads of `src_len` bytes and `dst` valid for reads
/// and writes of `dst_len` `u16` units.
unsafe fn utf8_to_compact_utf16(
    src: *mut u8,
    src_len: usize,
    dst: *mut u16,
    dst_len: usize,
    latin1_bytes_so_far: usize,
) -> Result<usize> {
    let input = slice::from_raw_parts(src, src_len);
    let output = slice::from_raw_parts_mut(dst, dst_len);
    assert_no_overlap(input, output);
    let remaining = inflate_latin1_bytes(output, latin1_bytes_so_far);
    let result = run_utf8_to_utf16(input, remaining)?;
    log::trace!("utf8-to-compact-utf16 {src_len}/{dst_len}/{latin1_bytes_so_far} => {result}");
    Ok(result + latin1_bytes_so_far)
}
/// Counterpart of `utf8_to_compact_utf16` for sources that are utf16.
///
/// The first `latin1_bytes_so_far` bytes of `dst` are inflated in place to
/// utf16, then `src` is transcoded into the rest of `dst`. The returned value
/// is the source length in code units plus `latin1_bytes_so_far`.
///
/// # Safety
///
/// `src` must be valid for reads of `src_len` `u16` units and `dst` valid for
/// reads and writes of `dst_len` `u16` units.
unsafe fn utf16_to_compact_utf16(
    src: *mut u16,
    src_len: usize,
    dst: *mut u16,
    dst_len: usize,
    latin1_bytes_so_far: usize,
) -> Result<usize> {
    let input = slice::from_raw_parts(src, src_len);
    let output = slice::from_raw_parts_mut(dst, dst_len);
    assert_no_overlap(input, output);
    let remaining = inflate_latin1_bytes(output, latin1_bytes_so_far);
    run_utf16_to_utf16(input, remaining)?;
    let result = input.len();
    log::trace!("utf16-to-compact-utf16 {src_len}/{dst_len}/{latin1_bytes_so_far} => {result}");
    Ok(result + latin1_bytes_so_far)
}
/// Inflates the `latin1_bytes_so_far` number of bytes written to the beginning
/// of `dst` into u16 codepoints.
///
/// Returns the remaining space in the destination that can be transcoded into,
/// slicing off the prefix of the string that was inflated from the latin1
/// bytes.
///
/// # Panics
///
/// Panics if `latin1_bytes_so_far` is greater than `dst.len()`.
fn inflate_latin1_bytes(dst: &mut [u16], latin1_bytes_so_far: usize) -> &mut [u16] {
    // Note that `latin1_bytes_so_far` is a byte measure while `dst` is a region
    // of u16 units. This `split_at_mut` uses the byte index as an index into
    // the u16 unit because each of the latin1 bytes will become a whole code
    // unit in the destination which is 2 bytes large.
    let (to_inflate, rest) = dst.split_at_mut(latin1_bytes_so_far);

    // Walk backwards so that a byte is always read before the code unit
    // holding it gets overwritten: latin1 byte `i` lives in unit `i / 2`,
    // which is never past unit `i`, the one being written.
    for i in (0..latin1_bytes_so_far).rev() {
        // `to_ne_bytes` exposes the unit's bytes in memory order, which is how
        // the latin1 bytes were packed into the buffer.
        let byte = to_inflate[i / 2].to_ne_bytes()[i % 2];
        // Store as little-endian: the latin1 byte first in memory, then zero.
        to_inflate[i] = u16::from(byte).to_le();
    }
    rest
}