From 4639e85c4e9a084ff442e2ff7924b5d753e63be5 Mon Sep 17 00:00:00 2001 From: Afonso Bordado Date: Wed, 12 Oct 2022 19:15:38 +0100 Subject: [PATCH] Flush Icache on AArch64 Windows (#4997) * cranelift: Add FlushInstructionCache for AArch64 on Windows This was previously done on #3426 for linux. * wasmtime: Add FlushInstructionCache for AArch64 on Windows This was previously done on #3426 for linux. * cranelift: Add MemoryUse flag to JIT Memory Manager This allows us to keep the icache flushing code self-contained and not leak implementation details. This also changes the windows icache flushing code to only flush pages that were previously unflushed. * Add jit-icache-coherence crate * cranelift: Use `jit-icache-coherence` * wasmtime: Use `jit-icache-coherence` * jit-icache-coherence: Make rustix feature additive Mutually exclusive features cause issues. * wasmtime: Remove rustix from wasmtime-jit We now use it via jit-icache-coherence * Rename wasmtime-jit-icache-coherency crate * Use cfg-if in wasmtime-jit-icache-coherency crate * Use inline instead of inline(always) * Add unsafe marker to clear_cache * Conditionally compile all rustix operations membarrier does not exist on MacOS * Publish `wasmtime-jit-icache-coherence` * Remove explicit windows check This is implied by the target_os = "windows" above * cranelift: Remove len != 0 check This is redundant as it is done in non_protected_allocations_iter * Comment cleanups Thanks @akirilov-arm! * Make clear_cache safe * Rename pipeline_flush to pipeline_flush_mt * Revert "Make clear_cache safe" This reverts commit 21165d81c9030ed9b291a1021a367214d2942c90. * More docs! * Fix pipeline_flush reference on clear_cache * Update more docs! * Move pipeline flush after `mprotect` calls Technically the `clear_cache` operation is a lie in AArch64, so move the pipeline flush after the `mprotect` calls so that it benefits from the implicit cache cleaning done by it. 
* wasmtime: Remove rustix backend from icache crate * wasmtime: Use libc for macos * wasmtime: Flush icache on all arch's for windows * wasmtime: Add flags to membarrier call --- Cargo.lock | 12 ++- Cargo.toml | 2 + cranelift/jit/Cargo.toml | 1 + cranelift/jit/src/backend.rs | 17 ---- cranelift/jit/src/memory.rs | 95 +++++++++++---------- crates/jit-icache-coherence/Cargo.toml | 23 ++++++ crates/jit-icache-coherence/src/lib.rs | 105 ++++++++++++++++++++++++ crates/jit-icache-coherence/src/libc.rs | 88 ++++++++++++++++++++ crates/jit-icache-coherence/src/win.rs | 45 ++++++++++ crates/jit/Cargo.toml | 4 +- crates/jit/src/code_memory.rs | 28 +++---- scripts/publish.rs | 1 + 12 files changed, 334 insertions(+), 87 deletions(-) create mode 100644 crates/jit-icache-coherence/Cargo.toml create mode 100644 crates/jit-icache-coherence/src/lib.rs create mode 100644 crates/jit-icache-coherence/src/libc.rs create mode 100644 crates/jit-icache-coherence/src/win.rs diff --git a/Cargo.lock b/Cargo.lock index 48b84a3b16..e63d0c7f2f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -667,6 +667,7 @@ dependencies = [ "memmap2", "region", "target-lexicon", + "wasmtime-jit-icache-coherence", "windows-sys", ] @@ -3665,12 +3666,12 @@ dependencies = [ "log", "object", "rustc-demangle", - "rustix", "serde", "target-lexicon", "thiserror", "wasmtime-environ", "wasmtime-jit-debug", + "wasmtime-jit-icache-coherence", "wasmtime-runtime", "windows-sys", ] @@ -3684,6 +3685,15 @@ dependencies = [ "rustix", ] +[[package]] +name = "wasmtime-jit-icache-coherence" +version = "2.0.0" +dependencies = [ + "cfg-if", + "libc", + "windows-sys", +] + [[package]] name = "wasmtime-runtime" version = "3.0.0" diff --git a/Cargo.toml b/Cargo.toml index 5762dc02a4..71d7267928 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -84,6 +84,7 @@ members = [ "crates/c-api", "crates/cli-flags", "crates/environ/fuzz", + "crates/jit-icache-coherence", "examples/fib-debug/wasm", "examples/wasi/wasm", "examples/tokio/wasm", @@ -126,6 
+127,7 @@ wasi-common = { path = "crates/wasi-common", version = "=3.0.0" } wasi-tokio = { path = "crates/wasi-common/tokio", version = "=3.0.0" } wasi-cap-std-sync = { path = "crates/wasi-common/cap-std-sync", version = "=3.0.0" } wasmtime-fuzzing = { path = "crates/fuzzing" } +wasmtime-jit-icache-coherence = { path = "crates/jit-icache-coherence", version = "=2.0.0" } cranelift-wasm = { path = "cranelift/wasm", version = "0.90.0" } cranelift-codegen = { path = "cranelift/codegen", version = "0.90.0" } diff --git a/cranelift/jit/Cargo.toml b/cranelift/jit/Cargo.toml index 4bf735fe45..e234b8d0d0 100644 --- a/cranelift/jit/Cargo.toml +++ b/cranelift/jit/Cargo.toml @@ -20,6 +20,7 @@ libc = { version = "0.2.42" } target-lexicon = { workspace = true } memmap2 = { version = "0.2.1", optional = true } log = { workspace = true } +wasmtime-jit-icache-coherence = { workspace = true } [target.'cfg(windows)'.dependencies.windows-sys] workspace = true diff --git a/cranelift/jit/src/backend.rs b/cranelift/jit/src/backend.rs index 31158e9038..441a878d4f 100644 --- a/cranelift/jit/src/backend.rs +++ b/cranelift/jit/src/backend.rs @@ -458,14 +458,6 @@ impl JITModule { self.memory.readonly.set_readonly(); self.memory.code.set_readable_and_executable(); - #[cfg(all(target_arch = "aarch64", target_os = "linux"))] - { - let cmd: libc::c_int = 32; // MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE - - // Ensure that no processor has fetched a stale instruction stream. - unsafe { libc::syscall(libc::SYS_membarrier, cmd) }; - } - for update in self.pending_got_updates.drain(..) 
{ unsafe { update.entry.as_ref() }.store(update.ptr as *mut _, Ordering::SeqCst); } @@ -530,15 +522,6 @@ impl JITModule { module.libcall_plt_entries.insert(libcall, plt_entry); } - #[cfg(all(target_arch = "aarch64", target_os = "linux"))] - { - let cmd: libc::c_int = 64; // MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE - - // This is a requirement of the membarrier() call executed by - // the finalize_definitions() method. - unsafe { libc::syscall(libc::SYS_membarrier, cmd) }; - } - module } diff --git a/cranelift/jit/src/memory.rs b/cranelift/jit/src/memory.rs index a18f6ad3c1..9187131f88 100644 --- a/cranelift/jit/src/memory.rs +++ b/cranelift/jit/src/memory.rs @@ -4,9 +4,11 @@ use memmap2::MmapMut; #[cfg(not(any(feature = "selinux-fix", windows)))] use std::alloc; use std::convert::TryFrom; +use std::ffi::c_void; use std::io; use std::mem; use std::ptr; +use wasmtime_jit_icache_coherence as icache_coherence; /// A simple struct consisting of a pointer and length. struct PtrLen { @@ -161,6 +163,7 @@ impl Memory { // TODO: Allocate more at a time. self.current = PtrLen::with_size(size)?; self.position = size; + Ok(self.current.ptr) } @@ -168,45 +171,45 @@ impl Memory { pub(crate) fn set_readable_and_executable(&mut self) { self.finish_current(); + // Clear all the newly allocated code from cache if the processor requires it + // + // Do this before marking the memory as R+X, technically we should be able to do it after + // but there are some CPU's that have had errata about doing this with read only memory. + for &PtrLen { ptr, len, .. 
} in self.non_protected_allocations_iter() { + unsafe { + icache_coherence::clear_cache(ptr as *const c_void, len) + .expect("Failed cache clear") + }; + } + let set_region_readable_and_executable = |ptr, len| { - if len != 0 { - if self.branch_protection == BranchProtection::BTI { - #[cfg(all(target_arch = "aarch64", target_os = "linux"))] - if std::arch::is_aarch64_feature_detected!("bti") { - let prot = libc::PROT_EXEC | libc::PROT_READ | /* PROT_BTI */ 0x10; + if self.branch_protection == BranchProtection::BTI { + #[cfg(all(target_arch = "aarch64", target_os = "linux"))] + if std::arch::is_aarch64_feature_detected!("bti") { + let prot = libc::PROT_EXEC | libc::PROT_READ | /* PROT_BTI */ 0x10; - unsafe { - if libc::mprotect(ptr as *mut libc::c_void, len, prot) < 0 { - panic!("unable to make memory readable+executable"); - } + unsafe { + if libc::mprotect(ptr as *mut libc::c_void, len, prot) < 0 { + panic!("unable to make memory readable+executable"); } - - return; } - } - unsafe { - region::protect(ptr, len, region::Protection::READ_EXECUTE) - .expect("unable to make memory readable+executable"); + return; } } + + unsafe { + region::protect(ptr, len, region::Protection::READ_EXECUTE) + .expect("unable to make memory readable+executable"); + } }; - #[cfg(feature = "selinux-fix")] - { - for &PtrLen { ref map, ptr, len } in &self.allocations[self.already_protected..] { - if map.is_some() { - set_region_readable_and_executable(ptr, len); - } - } + for &PtrLen { ptr, len, .. } in self.non_protected_allocations_iter() { + set_region_readable_and_executable(ptr, len); } - #[cfg(not(feature = "selinux-fix"))] - { - for &PtrLen { ptr, len } in &self.allocations[self.already_protected..] 
{ - set_region_readable_and_executable(ptr, len); - } - } + // Flush any in-flight instructions from the pipeline + icache_coherence::pipeline_flush_mt().expect("Failed pipeline flush"); self.already_protected = self.allocations.len(); } @@ -215,33 +218,27 @@ impl Memory { pub(crate) fn set_readonly(&mut self) { self.finish_current(); - #[cfg(feature = "selinux-fix")] - { - for &PtrLen { ref map, ptr, len } in &self.allocations[self.already_protected..] { - if len != 0 && map.is_some() { - unsafe { - region::protect(ptr, len, region::Protection::READ) - .expect("unable to make memory readonly"); - } - } - } - } - - #[cfg(not(feature = "selinux-fix"))] - { - for &PtrLen { ptr, len } in &self.allocations[self.already_protected..] { - if len != 0 { - unsafe { - region::protect(ptr, len, region::Protection::READ) - .expect("unable to make memory readonly"); - } - } + for &PtrLen { ptr, len, .. } in self.non_protected_allocations_iter() { + unsafe { + region::protect(ptr, len, region::Protection::READ) + .expect("unable to make memory readonly"); } } self.already_protected = self.allocations.len(); } + /// Iterates non protected memory allocations that are of not zero bytes in size. + fn non_protected_allocations_iter(&self) -> impl Iterator { + let iter = self.allocations[self.already_protected..].iter(); + + #[cfg(feature = "selinux-fix")] + return iter.filter(|&PtrLen { ref map, len, .. }| len != 0 && map.is_some()); + + #[cfg(not(feature = "selinux-fix"))] + return iter.filter(|&PtrLen { len, .. }| *len != 0); + } + /// Frees all allocated memory regions that would be leaked otherwise. /// Likely to invalidate existing function pointers, causing unsafety. 
pub(crate) unsafe fn free_memory(&mut self) { diff --git a/crates/jit-icache-coherence/Cargo.toml b/crates/jit-icache-coherence/Cargo.toml new file mode 100644 index 0000000000..2eb095ff1a --- /dev/null +++ b/crates/jit-icache-coherence/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "wasmtime-jit-icache-coherence" +version = "2.0.0" +authors.workspace = true +description = "Utilities for JIT icache maintenance" +documentation = "https://docs.rs/jit-icache-coherence" +license = "Apache-2.0 WITH LLVM-exception" +repository = "https://github.com/bytecodealliance/wasmtime" +edition.workspace = true + +[dependencies] +cfg-if = "1.0" + +[target.'cfg(target_os = "windows")'.dependencies.windows-sys] +workspace = true +features = [ + "Win32_Foundation", + "Win32_System_Threading", + "Win32_System_Diagnostics_Debug", +] + +[target.'cfg(any(target_os = "linux", target_os = "macos"))'.dependencies.libc] +version = "0.2.42" \ No newline at end of file diff --git a/crates/jit-icache-coherence/src/lib.rs b/crates/jit-icache-coherence/src/lib.rs new file mode 100644 index 0000000000..e47e539714 --- /dev/null +++ b/crates/jit-icache-coherence/src/lib.rs @@ -0,0 +1,105 @@ +//! This crate provides utilities for instruction cache maintenance for JIT authors. +//! +//! In self modifying codes such as when writing a JIT, special care must be taken when marking the +//! code as ready for execution. On fully coherent architectures (X86, S390X) the data cache (D-Cache) +//! and the instruction cache (I-Cache) are always in sync. However this is not guaranteed for all +//! architectures such as AArch64 where these caches are not coherent with each other. +//! +//! When writing new code there may be a I-cache entry for that same address which causes the +//! processor to execute whatever was in the cache instead of the new code. +//! +//! See the [ARM Community - Caches and Self-Modifying Code] blog post that contains a great +//! explanation of the above. 
(It references AArch32 but it has a high level overview of this problem). +//! +//! ## Usage +//! +//! You should call [clear_cache] on any pages that you write with the new code that you're intending +//! to execute. You can do this at any point in the code from the moment that you write the page up to +//! the moment where the code is executed. +//! +//! You also need to call [pipeline_flush_mt] to ensure that there isn't any invalid instruction currently +//! in the pipeline if you are running in a multi threaded environment. +//! +//! For single threaded programs you are free to omit [pipeline_flush_mt], otherwise you need to +//! call both [clear_cache] and [pipeline_flush_mt] in that order. +//! +//! ### Example: +//! ``` +//! # use std::ffi::c_void; +//! # use std::io; +//! # use wasmtime_jit_icache_coherence::*; +//! # +//! # struct Page { +//! # addr: *const c_void, +//! # len: usize, +//! # } +//! # +//! # fn main() -> io::Result<()> { +//! # +//! # let run_code = || {}; +//! # let code = vec![0u8; 64]; +//! # let newly_written_pages = vec![Page { +//! # addr: &code[0] as *const u8 as *const c_void, +//! # len: code.len(), +//! # }]; +//! # unsafe { +//! // Invalidate the cache for all the newly written pages where we wrote our new code. +//! for page in newly_written_pages { +//! clear_cache(page.addr, page.len)?; +//! } +//! +//! // Once those are invalidated we also need to flush the pipeline +//! pipeline_flush_mt()?; +//! +//! // We can now safely execute our new code. +//! run_code(); +//! # } +//! # Ok(()) +//! # } +//! ``` +//! +//!
+//!
+//!  **Warning**: To use this interface correctly you must always call [clear_cache].
+//!  A follow-up call to [pipeline_flush_mt] is also required if you are running in a multi-threaded environment.
+//!
+//! 
+//! +//! [ARM Community - Caches and Self-Modifying Code]: https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/caches-and-self-modifying-code + +use std::ffi::c_void; +use std::io::Result; + +cfg_if::cfg_if! { + if #[cfg(target_os = "windows")] { + mod win; + use win as imp; + } else { + mod libc; + use crate::libc as imp; + } +} + +/// Flushes instructions in the processor pipeline +/// +/// This pipeline flush is broadcast to all processors that are executing threads in the current process. +/// +/// Calling [pipeline_flush_mt] is only required for multi-threaded programs and it *must* be called +/// after all calls to [clear_cache]. +/// +/// If the architecture does not require a pipeline flush, this function does nothing. +pub fn pipeline_flush_mt() -> Result<()> { + imp::pipeline_flush_mt() +} + +/// Flushes the instruction cache for a region of memory. +/// +/// If the architecture does not require an instruction cache flush, this function does nothing. +/// +/// # Unsafe +/// +/// It is necessary to call [pipeline_flush_mt] after this function if you are running in a multi-threaded +/// environment. +pub unsafe fn clear_cache(ptr: *const c_void, len: usize) -> Result<()> { + imp::clear_cache(ptr, len) +} diff --git a/crates/jit-icache-coherence/src/libc.rs b/crates/jit-icache-coherence/src/libc.rs new file mode 100644 index 0000000000..6ea9cea08e --- /dev/null +++ b/crates/jit-icache-coherence/src/libc.rs @@ -0,0 +1,88 @@ +#![allow(unused)] + +use libc::{syscall, EINVAL, EPERM}; +use std::ffi::c_void; +use std::io::{Error, Result}; + +const MEMBARRIER_CMD_GLOBAL: libc::c_int = 1; +const MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE: libc::c_int = 32; +const MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE: libc::c_int = 64; + +/// See docs on [crate::pipeline_flush_mt] for a description of what this function is trying to do. 
+#[inline] +pub(crate) fn pipeline_flush_mt() -> Result<()> { + // Ensure that no processor has fetched a stale instruction stream. + // + // On AArch64 we try to do this by executing a "broadcast" `ISB` which is not something that the + // architecture provides us but we can emulate it using the membarrier kernel interface. + // + // This behaviour was documented in a patch, however it seems that it hasn't been upstreamed yet + // Nevertheless it clearly explains the guarantees that the Linux kernel provides us regarding the + // membarrier interface, and how to use it for JIT contexts. + // https://lkml.kernel.org/lkml/07a8b963002cb955b7516e61bad19514a3acaa82.1623813516.git.luto@kernel.org/ + // + // I couldn't find the follow up for that patch but there doesn't seem to be disagreement about + // that specific part in the replies. + // TODO: Check if the kernel has updated the membarrier documentation + // + // See the following issues for more info: + // * https://github.com/bytecodealliance/wasmtime/pull/3426 + // * https://github.com/bytecodealliance/wasmtime/pull/4997 + // + // TODO: x86 and s390x have coherent caches so they don't need this, but RISCV does not + // guarantee that, so we may need to do something similar for it. However as noted in the above + // kernel patch the SYNC_CORE membarrier has different guarantees on each architecture + // so we need follow up and check what it provides us. + // See: https://github.com/bytecodealliance/wasmtime/issues/5033 + #[cfg(all(target_arch = "aarch64", target_os = "linux"))] + match membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE) { + Ok(_) => {} + + // EPERM happens if the calling process hasn't yet called the register membarrier. 
+ // We can call the register membarrier now, and then retry the actual membarrier, + // + // This does have some overhead since on the first time we call this function we + // actually execute three membarriers, but this only happens once per process and only + // one slow membarrier is actually executed (The last one, which actually generates an IPI). + Err(e) if e.raw_os_error().unwrap() == EPERM => { + membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)?; + membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE)?; + } + + // On kernels older than 4.16 the above syscall does not exist, so we can + // fallback to MEMBARRIER_CMD_GLOBAL which is an alias for MEMBARRIER_CMD_SHARED + // that has existed since 4.3. GLOBAL is a lot slower, but allows us to have + // compatibility with older kernels. + Err(e) if e.raw_os_error().unwrap() == EINVAL => { + membarrier(MEMBARRIER_CMD_GLOBAL)?; + } + + // In any other case we got an actual error, so lets propagate that up + e => e?, + } + + Ok(()) +} + +#[cfg(target_os = "linux")] +fn membarrier(barrier: libc::c_int) -> Result<()> { + let flags: libc::c_int = 0; + let res = unsafe { syscall(libc::SYS_membarrier, barrier, flags) }; + if res == 0 { + Ok(()) + } else { + Err(Error::last_os_error()) + } +} + +/// See docs on [crate::clear_cache] for a description of what this function is trying to do. 
+#[inline] +pub(crate) fn clear_cache(_ptr: *const c_void, _len: usize) -> Result<()> { + // TODO: On AArch64 we currently rely on the `mprotect` call that switches the memory from W+R to R+X + // to do this for us, however that is an implementation detail and should not be relied upon + // We should call some implementation of `clear_cache` here + // + // See: https://github.com/bytecodealliance/wasmtime/issues/3310 + + Ok(()) +} diff --git a/crates/jit-icache-coherence/src/win.rs b/crates/jit-icache-coherence/src/win.rs new file mode 100644 index 0000000000..488e15f466 --- /dev/null +++ b/crates/jit-icache-coherence/src/win.rs @@ -0,0 +1,45 @@ +use std::ffi::c_void; +use std::io::{Error, Result}; +use windows_sys::Win32::System::Diagnostics::Debug::FlushInstructionCache; +use windows_sys::Win32::System::Threading::FlushProcessWriteBuffers; +use windows_sys::Win32::System::Threading::GetCurrentProcess; + +/// See docs on [crate::pipeline_flush_mt] for a description of what this function is trying to do. +#[inline] +pub(crate) fn pipeline_flush_mt() -> Result<()> { + // If we are here, it means that the user has already called [cache_clear] for all buffers that + // are going to be holding code. We don't really care about flushing the write buffers, but + // the other guarantee that microsoft provides on this API. As documented: + // + // "The function generates an interprocessor interrupt (IPI) to all processors that are part of + // the current process affinity. It guarantees the visibility of write operations performed on + // one processor to the other processors." + // + // This all-core IPI acts as a core serializing operation, equivalent to a "broadcast" `ISB` + // instruction that the architecture does not provide and which is what we really want. 
+ // + // See: https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-flushprocesswritebuffers + if cfg!(target_arch = "aarch64") { + unsafe { + FlushProcessWriteBuffers(); + } + } + + Ok(()) +} + +/// See docs on [crate::clear_cache] for a description of what this function is trying to do. +#[inline] +pub(crate) fn clear_cache(ptr: *const c_void, len: usize) -> Result<()> { + // See: + // * https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-flushinstructioncache + // * https://devblogs.microsoft.com/oldnewthing/20190902-00/?p=102828 + unsafe { + let res = FlushInstructionCache(GetCurrentProcess(), ptr, len); + if res == 0 { + return Err(Error::last_os_error()); + } + } + + Ok(()) +} diff --git a/crates/jit/Cargo.toml b/crates/jit/Cargo.toml index b555010552..fef5bdd622 100644 --- a/crates/jit/Cargo.toml +++ b/crates/jit/Cargo.toml @@ -26,6 +26,7 @@ bincode = "1.2.1" rustc-demangle = "0.1.16" cpp_demangle = "0.3.2" log = { workspace = true } +wasmtime-jit-icache-coherence = { workspace = true } [target.'cfg(target_os = "windows")'.dependencies.windows-sys] workspace = true @@ -33,9 +34,6 @@ features = [ "Win32_System_Diagnostics_Debug", ] -[target.'cfg(target_os = "linux")'.dependencies] -rustix = { workspace = true, features = ["process"] } - [target.'cfg(target_arch = "x86_64")'.dependencies] ittapi = { version = "0.3.0", optional = true } diff --git a/crates/jit/src/code_memory.rs b/crates/jit/src/code_memory.rs index 08ee895f7c..66eb8ee44e 100644 --- a/crates/jit/src/code_memory.rs +++ b/crates/jit/src/code_memory.rs @@ -3,7 +3,9 @@ use crate::unwind::UnwindRegistration; use anyhow::{bail, Context, Result}; use object::read::{File, Object, ObjectSection}; +use std::ffi::c_void; use std::mem::ManuallyDrop; +use wasmtime_jit_icache_coherence as icache_coherence; use wasmtime_runtime::MmapVec; /// Management of executable memory within a `MmapVec` @@ -54,15 +56,6 @@ impl CodeMemory { /// 
The returned `CodeMemory` manages the internal `MmapVec` and the /// `publish` method is used to actually make the memory executable. pub fn new(mmap: MmapVec) -> Self { - #[cfg(all(target_arch = "aarch64", target_os = "linux"))] - { - // This is a requirement of the `membarrier` call executed by the `publish` method. - rustix::process::membarrier( - rustix::process::MembarrierCommand::RegisterPrivateExpeditedSyncCore, - ) - .unwrap(); - } - Self { mmap: ManuallyDrop::new(mmap), unwind_registration: ManuallyDrop::new(None), @@ -155,6 +148,13 @@ impl CodeMemory { // must be added here, though, if relocations pop up. assert!(text.relocations().count() == 0); + // Clear the newly allocated code from cache if the processor requires it + // + // Do this before marking the memory as R+X, technically we should be able to do it after + // but there are some CPU's that have had errata about doing this with read only memory. + icache_coherence::clear_cache(ret.text.as_ptr() as *const c_void, ret.text.len()) + .expect("Failed cache clear"); + // Switch the executable portion from read/write to // read/execute, notably not using read/write/execute to prevent // modifications. @@ -162,14 +162,8 @@ impl CodeMemory { .make_executable(text_range.clone(), enable_branch_protection) .expect("unable to make memory executable"); - #[cfg(all(target_arch = "aarch64", target_os = "linux"))] - { - // Ensure that no processor has fetched a stale instruction stream. 
- rustix::process::membarrier( - rustix::process::MembarrierCommand::PrivateExpeditedSyncCore, - ) - .unwrap(); - } + // Flush any in-flight instructions from the pipeline + icache_coherence::pipeline_flush_mt().expect("Failed pipeline flush"); // With all our memory set up use the platform-specific // `UnwindRegistration` implementation to inform the general diff --git a/scripts/publish.rs b/scripts/publish.rs index e417653735..932ee3493b 100644 --- a/scripts/publish.rs +++ b/scripts/publish.rs @@ -36,6 +36,7 @@ const CRATES_TO_PUBLISH: &[&str] = &[ "cranelift-object", "cranelift-interpreter", "cranelift", + "wasmtime-jit-icache-coherence", "cranelift-jit", // wiggle "wiggle-generate",