From 87ff24a4aafd009d389c7a69c4b4688f74f26071 Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Mon, 20 Sep 2021 07:14:52 -0700 Subject: [PATCH] Use `__builtin_setjmp` instead of `sigsetjmp`. (#3360) * Use `__builtin_setjmp` instead of `sigsetjmp`. Use [`__builtin_setjmp`] instead of `sigsetjmp`, as it is implemented in the compiler, performed inline, and saves much less state. This speeds up calls into wasm by about 8% on my machine. [`__builtin_setjmp`]: https://gcc.gnu.org/onlinedocs/gcc/Nonlocal-Gotos.html * Add a comment confirming that 5 really is the documented size. * Add a comment about callee-saved state and __builtin_setjmp. * On clang on aarch64, use sigsetjmp. * Fix a stray `#endif`. --- crates/runtime/src/helpers.c | 38 ++++++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/crates/runtime/src/helpers.c b/crates/runtime/src/helpers.c index 4865928a18..daf47dda11 100644 --- a/crates/runtime/src/helpers.c +++ b/crates/runtime/src/helpers.c @@ -2,18 +2,44 @@ #include #include +#ifdef CFG_TARGET_OS_windows + +#define platform_setjmp(buf) setjmp(buf) +#define platform_longjmp(buf, arg) longjmp(buf, arg) +typedef jmp_buf platform_jmp_buf; + +#elif defined(__clang__) && defined(__aarch64__) + +// Clang on aarch64 doesn't support `__builtin_setjmp`, so use `sigsetjmp` +// from libc. +// // Note that `sigsetjmp` and `siglongjmp` are used here where possible to // explicitly pass a 0 argument to `sigsetjmp` that we don't need to preserve // the process signal mask. This should make this call a bit faster b/c it // doesn't need to touch the kernel signal handling routines. -#ifdef CFG_TARGET_OS_windows -#define platform_setjmp(buf) setjmp(buf) -#define platform_longjmp(buf, arg) longjmp(buf, arg) -#define platform_jmp_buf jmp_buf -#else #define platform_setjmp(buf) sigsetjmp(buf, 0) #define platform_longjmp(buf, arg) siglongjmp(buf, arg) -#define platform_jmp_buf sigjmp_buf +typedef sigjmp_buf platform_jmp_buf; + +#else + +// GCC and Clang both provide `__builtin_setjmp`/`__builtin_longjmp`, which +// differ from plain `setjmp` and `longjmp` in that they're implemented by +// the compiler inline rather than in libc, and the compiler can avoid saving +// and restoring most of the registers. See the [GCC docs] and [clang docs] +// for more information. +// +// Per the caveat in the GCC docs, this assumes that the host compiler (which +// may be compiling for a generic architecture family) knows about all the +// register state that Cranelift (which may be specializing for the hardware at +// runtime) is assuming is callee-saved. +// +// [GCC docs]: https://gcc.gnu.org/onlinedocs/gcc/Nonlocal-Gotos.html +// [clang docs]: https://llvm.org/docs/ExceptionHandling.html#llvm-eh-sjlj-setjmp +#define platform_setjmp(buf) __builtin_setjmp(buf) +#define platform_longjmp(buf, arg) __builtin_longjmp(buf, arg) +typedef void *platform_jmp_buf[5]; // this is the documented size; see the docs links for details. + #endif int wasmtime_setjmp(