Flush Icache on AArch64 Windows (#4997)

* cranelift: Add FlushInstructionCache for AArch64 on Windows This was previously done on #3426 for linux. * wasmtime: Add FlushInstructionCache for AArch64 on Windows This was previously done on #3426 for linux. * cranelift: Add MemoryUse flag to JIT Memory Manager This allows us to keep the icache flushing code self-contained and not leak implementation details. This also changes the windows icache flushing code to only flush pages that were previously unflushed. * Add jit-icache-coherence crate * cranelift: Use `jit-icache-coherence` * wasmtime: Use `jit-icache-coherence` * jit-icache-coherence: Make rustix feature additive Mutually exclusive features cause issues. * wasmtime: Remove rustix from wasmtime-jit We now use it via jit-icache-coherence * Rename wasmtime-jit-icache-coherency crate * Use cfg-if in wasmtime-jit-icache-coherency crate * Use inline instead of inline(always) * Add unsafe marker to clear_cache * Conditionally compile all rustix operations membarrier does not exist on MacOS * Publish `wasmtime-jit-icache-coherence` * Remove explicit windows check This is implied by the target_os = "windows" above * cranelift: Remove len != 0 check This is redundant as it is done in non_protected_allocations_iter * Comment cleanups Thanks @akirilov-arm! * Make clear_cache safe * Rename pipeline_flush to pipeline_flush_mt * Revert "Make clear_cache safe" This reverts commit 21165d81c9030ed9b291a1021a367214d2942c90. * More docs! * Fix pipeline_flush reference on clear_cache * Update more docs! * Move pipeline flush after `mprotect` calls Technically the `clear_cache` operation is a lie in AArch64, so move the pipeline flush after the `mprotect` calls so that it benefits from the implicit cache cleaning done by it. * wasmtime: Remove rustix backend from icache crate * wasmtime: Use libc for macos * wasmtime: Flush icache on all arch's for windows * wasmtime: Add flags to membarrier call
2022-10-12 19:15:38 +01:00
parent 75cd888e23
commit 4639e85c4e
12 changed files with 334 additions and 87 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -667,6 +667,7 @@ dependencies = [
 "memmap2",
 "region",
 "target-lexicon",
+ "wasmtime-jit-icache-coherence",
 "windows-sys",
 ]

@@ -3665,12 +3666,12 @@ dependencies = [
 "log",
 "object",
 "rustc-demangle",
- "rustix",
 "serde",
 "target-lexicon",
 "thiserror",
 "wasmtime-environ",
 "wasmtime-jit-debug",
+ "wasmtime-jit-icache-coherence",
 "wasmtime-runtime",
 "windows-sys",
 ]
@@ -3684,6 +3685,15 @@ dependencies = [
 "rustix",
 ]

+[[package]]
+name = "wasmtime-jit-icache-coherence"
+version = "2.0.0"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "windows-sys",
+]
+
 [[package]]
 name = "wasmtime-runtime"
 version = "3.0.0"
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -84,6 +84,7 @@ members = [
  "crates/c-api",
  "crates/cli-flags",
  "crates/environ/fuzz",
+  "crates/jit-icache-coherence",
  "examples/fib-debug/wasm",
  "examples/wasi/wasm",
  "examples/tokio/wasm",
@@ -126,6 +127,7 @@ wasi-common = { path = "crates/wasi-common", version = "=3.0.0" }
 wasi-tokio = { path = "crates/wasi-common/tokio", version = "=3.0.0" }
 wasi-cap-std-sync = { path = "crates/wasi-common/cap-std-sync", version = "=3.0.0" }
 wasmtime-fuzzing = { path = "crates/fuzzing" }
+wasmtime-jit-icache-coherence = { path = "crates/jit-icache-coherence", version = "=2.0.0" }

 cranelift-wasm = { path = "cranelift/wasm", version = "0.90.0" }
 cranelift-codegen = { path = "cranelift/codegen", version = "0.90.0" }
--- a/cranelift/jit/Cargo.toml
+++ b/cranelift/jit/Cargo.toml
@@ -20,6 +20,7 @@ libc = { version = "0.2.42" }
 target-lexicon = { workspace = true }
 memmap2 = { version = "0.2.1", optional = true }
 log = { workspace = true }
+wasmtime-jit-icache-coherence = { workspace = true }

 [target.'cfg(windows)'.dependencies.windows-sys]
 workspace = true
--- a/cranelift/jit/src/backend.rs
+++ b/cranelift/jit/src/backend.rs
@@ -458,14 +458,6 @@ impl JITModule {
        self.memory.readonly.set_readonly();
        self.memory.code.set_readable_and_executable();

-        #[cfg(all(target_arch = "aarch64", target_os = "linux"))]
-        {
-            let cmd: libc::c_int = 32; // MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE
-
-            // Ensure that no processor has fetched a stale instruction stream.
-            unsafe { libc::syscall(libc::SYS_membarrier, cmd) };
-        }
-
        for update in self.pending_got_updates.drain(..) {
            unsafe { update.entry.as_ref() }.store(update.ptr as *mut _, Ordering::SeqCst);
        }
@@ -530,15 +522,6 @@ impl JITModule {
            module.libcall_plt_entries.insert(libcall, plt_entry);
        }

-        #[cfg(all(target_arch = "aarch64", target_os = "linux"))]
-        {
-            let cmd: libc::c_int = 64; // MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE
-
-            // This is a requirement of the membarrier() call executed by
-            // the finalize_definitions() method.
-            unsafe { libc::syscall(libc::SYS_membarrier, cmd) };
-        }
-
        module
    }

--- a/cranelift/jit/src/memory.rs
+++ b/cranelift/jit/src/memory.rs
@@ -4,9 +4,11 @@ use memmap2::MmapMut;
 #[cfg(not(any(feature = "selinux-fix", windows)))]
 use std::alloc;
 use std::convert::TryFrom;
+use std::ffi::c_void;
 use std::io;
 use std::mem;
 use std::ptr;
+use wasmtime_jit_icache_coherence as icache_coherence;

 /// A simple struct consisting of a pointer and length.
 struct PtrLen {
@@ -161,6 +163,7 @@ impl Memory {
        // TODO: Allocate more at a time.
        self.current = PtrLen::with_size(size)?;
        self.position = size;
+
        Ok(self.current.ptr)
    }

@@ -168,45 +171,45 @@ impl Memory {
    pub(crate) fn set_readable_and_executable(&mut self) {
        self.finish_current();

+        // Clear all the newly allocated code from cache if the processor requires it
+        //
+        // Do this before marking the memory as R+X. Technically we should be able to do it after,
+        // but there are some CPUs that have had errata about doing this with read-only memory.
+        for &PtrLen { ptr, len, .. } in self.non_protected_allocations_iter() {
+            unsafe {
+                icache_coherence::clear_cache(ptr as *const c_void, len)
+                    .expect("Failed cache clear")
+            };
+        }
+
        let set_region_readable_and_executable = |ptr, len| {
-            if len != 0 {
-                if self.branch_protection == BranchProtection::BTI {
-                    #[cfg(all(target_arch = "aarch64", target_os = "linux"))]
-                    if std::arch::is_aarch64_feature_detected!("bti") {
-                        let prot = libc::PROT_EXEC | libc::PROT_READ | /* PROT_BTI */ 0x10;
+            if self.branch_protection == BranchProtection::BTI {
+                #[cfg(all(target_arch = "aarch64", target_os = "linux"))]
+                if std::arch::is_aarch64_feature_detected!("bti") {
+                    let prot = libc::PROT_EXEC | libc::PROT_READ | /* PROT_BTI */ 0x10;

-                        unsafe {
-                            if libc::mprotect(ptr as *mut libc::c_void, len, prot) < 0 {
-                                panic!("unable to make memory readable+executable");
-                            }
+                    unsafe {
+                        if libc::mprotect(ptr as *mut libc::c_void, len, prot) < 0 {
+                            panic!("unable to make memory readable+executable");
                        }
-
-                        return;
                    }
-                }

-                unsafe {
-                    region::protect(ptr, len, region::Protection::READ_EXECUTE)
-                        .expect("unable to make memory readable+executable");
+                    return;
                }
            }
+
+            unsafe {
+                region::protect(ptr, len, region::Protection::READ_EXECUTE)
+                    .expect("unable to make memory readable+executable");
+            }
        };

-        #[cfg(feature = "selinux-fix")]
-        {
-            for &PtrLen { ref map, ptr, len } in &self.allocations[self.already_protected..] {
-                if map.is_some() {
-                    set_region_readable_and_executable(ptr, len);
-                }
-            }
+        for &PtrLen { ptr, len, .. } in self.non_protected_allocations_iter() {
+            set_region_readable_and_executable(ptr, len);
        }

-        #[cfg(not(feature = "selinux-fix"))]
-        {
-            for &PtrLen { ptr, len } in &self.allocations[self.already_protected..] {
-                set_region_readable_and_executable(ptr, len);
-            }
-        }
+        // Flush any in-flight instructions from the pipeline
+        icache_coherence::pipeline_flush_mt().expect("Failed pipeline flush");

        self.already_protected = self.allocations.len();
    }
@@ -215,33 +218,27 @@ impl Memory {
    pub(crate) fn set_readonly(&mut self) {
        self.finish_current();

-        #[cfg(feature = "selinux-fix")]
-        {
-            for &PtrLen { ref map, ptr, len } in &self.allocations[self.already_protected..] {
-                if len != 0 && map.is_some() {
-                    unsafe {
-                        region::protect(ptr, len, region::Protection::READ)
-                            .expect("unable to make memory readonly");
-                    }
-                }
-            }
-        }
-
-        #[cfg(not(feature = "selinux-fix"))]
-        {
-            for &PtrLen { ptr, len } in &self.allocations[self.already_protected..] {
-                if len != 0 {
-                    unsafe {
-                        region::protect(ptr, len, region::Protection::READ)
-                            .expect("unable to make memory readonly");
-                    }
-                }
+        for &PtrLen { ptr, len, .. } in self.non_protected_allocations_iter() {
+            unsafe {
+                region::protect(ptr, len, region::Protection::READ)
+                    .expect("unable to make memory readonly");
            }
        }

        self.already_protected = self.allocations.len();
    }

+    /// Iterates over the not-yet-protected allocations that have a non-zero length.
+    fn non_protected_allocations_iter(&self) -> impl Iterator<Item = &PtrLen> {
+        let iter = self.allocations[self.already_protected..].iter();
+
+        #[cfg(feature = "selinux-fix")]
+        return iter.filter(|&PtrLen { map, len, .. }| *len != 0 && map.is_some());
+
+        #[cfg(not(feature = "selinux-fix"))]
+        return iter.filter(|&PtrLen { len, .. }| *len != 0);
+    }
+
    /// Frees all allocated memory regions that would be leaked otherwise.
    /// Likely to invalidate existing function pointers, causing unsafety.
    pub(crate) unsafe fn free_memory(&mut self) {
--- a/crates/jit-icache-coherence/Cargo.toml
+++ b/crates/jit-icache-coherence/Cargo.toml
@@ -0,0 +1,23 @@
+[package]
+name = "wasmtime-jit-icache-coherence"
+version = "2.0.0"
+authors.workspace = true
+description = "Utilities for JIT icache maintenance"
+documentation = "https://docs.rs/jit-icache-coherence"
+license = "Apache-2.0 WITH LLVM-exception"
+repository = "https://github.com/bytecodealliance/wasmtime"
+edition.workspace = true
+
+[dependencies]
+cfg-if = "1.0"
+
+[target.'cfg(target_os = "windows")'.dependencies.windows-sys]
+workspace = true
+features = [
+    "Win32_Foundation",
+    "Win32_System_Threading",
+    "Win32_System_Diagnostics_Debug",
+]
+
+[target.'cfg(any(target_os = "linux", target_os = "macos"))'.dependencies.libc]
+version = "0.2.42"
--- a/crates/jit-icache-coherence/src/lib.rs
+++ b/crates/jit-icache-coherence/src/lib.rs
@@ -0,0 +1,105 @@
+//! This crate provides utilities for instruction cache maintenance for JIT authors.
+//!
+//! In self-modifying code, such as when writing a JIT, special care must be taken when marking the
+//! code as ready for execution. On fully coherent architectures (X86, S390X) the data cache (D-Cache)
+//! and the instruction cache (I-Cache) are always in sync. However this is not guaranteed for all
+//! architectures such as AArch64 where these caches are not coherent with each other.
+//!
+//! When writing new code there may be an I-cache entry for that same address which causes the
+//! processor to execute whatever was in the cache instead of the new code.
+//!
+//! See the [ARM Community - Caches and Self-Modifying Code] blog post that contains a great
+//! explanation of the above. (It references AArch32 but it has a high level overview of this problem).
+//!
+//! ## Usage
+//!
+//! You should call [clear_cache] on any pages that you write with the new code that you're intending
+//! to execute. You can do this at any point in the code from the moment that you write the page up to
+//! the moment where the code is executed.
+//!
+//! You also need to call [pipeline_flush_mt] to ensure that there isn't any invalid instruction currently
+//! in the pipeline if you are running in a multi threaded environment.
+//!
+//! For single threaded programs you are free to omit [pipeline_flush_mt], otherwise you need to
+//! call both [clear_cache] and [pipeline_flush_mt] in that order.
+//!
+//! ### Example:
+//! ```
+//! # use std::ffi::c_void;
+//! # use std::io;
+//! # use wasmtime_jit_icache_coherence::*;
+//! #
+//! # struct Page {
+//! #   addr: *const c_void,
+//! #   len: usize,
+//! # }
+//! #
+//! # fn main() -> io::Result<()> {
+//! #
+//! # let run_code = || {};
+//! # let code = vec![0u8; 64];
+//! # let newly_written_pages = vec![Page {
+//! #    addr: &code[0] as *const u8 as *const c_void,
+//! #    len: code.len(),
+//! # }];
+//! # unsafe {
+//! // Invalidate the cache for all the newly written pages where we wrote our new code.
+//! for page in newly_written_pages {
+//!     clear_cache(page.addr, page.len)?;
+//! }
+//!
+//! // Once those are invalidated we also need to flush the pipeline
+//! pipeline_flush_mt()?;
+//!
+//! // We can now safely execute our new code.
+//! run_code();
+//! # }
+//! # Ok(())
+//! # }
+//! ```
+//!
+//! <div class="example-wrap" style="display:inline-block"><pre class="compile_fail" style="white-space:normal;font:inherit;">
+//!
+//!  **Warning**: In order to correctly use this interface you should always call [clear_cache].
+//!  A followup call to [pipeline_flush_mt] is required if you are running in a multi-threaded environment.
+//!
+//! </pre></div>
+//!
+//! [ARM Community - Caches and Self-Modifying Code]: https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/caches-and-self-modifying-code
+
+use std::ffi::c_void;
+use std::io::Result;
+
+cfg_if::cfg_if! {
+    if #[cfg(target_os = "windows")] {
+        mod win;
+        use win as imp;
+    } else {
+        mod libc;
+        use crate::libc as imp;
+    }
+}
+
+/// Flushes instructions in the processor pipeline
+///
+/// This pipeline flush is broadcast to all processors that are executing threads in the current process.
+///
+/// Calling [pipeline_flush_mt] is only required for multi-threaded programs and it *must* be called
+/// after all calls to [clear_cache].
+///
+/// If the architecture does not require a pipeline flush, this function does nothing.
+pub fn pipeline_flush_mt() -> Result<()> {
+    imp::pipeline_flush_mt()
+}
+
+/// Flushes the instruction cache for a region of memory.
+///
+/// If the architecture does not require an instruction cache flush, this function does nothing.
+///
+/// # Safety
+///
+/// It is necessary to call [pipeline_flush_mt] after this function if you are running in a multi-threaded
+/// environment.
+pub unsafe fn clear_cache(ptr: *const c_void, len: usize) -> Result<()> {
+    imp::clear_cache(ptr, len)
+}
--- a/crates/jit-icache-coherence/src/libc.rs
+++ b/crates/jit-icache-coherence/src/libc.rs
@@ -0,0 +1,88 @@
+#![allow(unused)]
+
+use libc::{syscall, EINVAL, EPERM};
+use std::ffi::c_void;
+use std::io::{Error, Result};
+
+const MEMBARRIER_CMD_GLOBAL: libc::c_int = 1;
+const MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE: libc::c_int = 32;
+const MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE: libc::c_int = 64;
+
+/// See docs on [crate::pipeline_flush_mt] for a description of what this function is trying to do.
+#[inline]
+pub(crate) fn pipeline_flush_mt() -> Result<()> {
+    // Ensure that no processor has fetched a stale instruction stream.
+    //
+    // On AArch64 we try to do this by executing a "broadcast" `ISB` which is not something that the
+    // architecture provides us but we can emulate it using the membarrier kernel interface.
+    //
+    // This behaviour was documented in a patch; however, it seems that it hasn't been upstreamed yet.
+    // Nevertheless it clearly explains the guarantees that the Linux kernel provides us regarding the
+    // membarrier interface, and how to use it for JIT contexts.
+    // https://lkml.kernel.org/lkml/07a8b963002cb955b7516e61bad19514a3acaa82.1623813516.git.luto@kernel.org/
+    //
+    // I couldn't find the follow up for that patch but there doesn't seem to be disagreement about
+    // that specific part in the replies.
+    // TODO: Check if the kernel has updated the membarrier documentation
+    //
+    // See the following issues for more info:
+    //  * https://github.com/bytecodealliance/wasmtime/pull/3426
+    //  * https://github.com/bytecodealliance/wasmtime/pull/4997
+    //
+    // TODO: x86 and s390x have coherent caches so they don't need this, but RISCV does not
+    // guarantee that, so we may need to do something similar for it. However as noted in the above
+    // kernel patch the SYNC_CORE membarrier has different guarantees on each architecture
+    // so we need to follow up and check what it provides us.
+    // See: https://github.com/bytecodealliance/wasmtime/issues/5033
+    #[cfg(all(target_arch = "aarch64", target_os = "linux"))]
+    match membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE) {
+        Ok(_) => {}
+
+        // EPERM happens if the calling process hasn't yet called the register membarrier.
+        // We can call the register membarrier now, and then retry the actual membarrier.
+        //
+        // This does have some overhead since on the first time we call this function we
+        // actually execute three membarriers, but this only happens once per process and only
+        // one slow membarrier is actually executed (The last one, which actually generates an IPI).
+        Err(e) if e.raw_os_error().unwrap() == EPERM => {
+            membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)?;
+            membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE)?;
+        }
+
+        // On kernels older than 4.16 the above syscall does not exist, so we can
+        // fallback to MEMBARRIER_CMD_GLOBAL which is an alias for MEMBARRIER_CMD_SHARED
+        // that has existed since 4.3. GLOBAL is a lot slower, but allows us to have
+        // compatibility with older kernels.
+        Err(e) if e.raw_os_error().unwrap() == EINVAL => {
+            membarrier(MEMBARRIER_CMD_GLOBAL)?;
+        }
+
+        // In any other case we got an actual error, so let's propagate that up
+        e => e?,
+    }
+
+    Ok(())
+}
+
+#[cfg(target_os = "linux")]
+fn membarrier(barrier: libc::c_int) -> Result<()> {
+    let flags: libc::c_int = 0;
+    let res = unsafe { syscall(libc::SYS_membarrier, barrier, flags) };
+    if res == 0 {
+        Ok(())
+    } else {
+        Err(Error::last_os_error())
+    }
+}
+
+/// See docs on [crate::clear_cache] for a description of what this function is trying to do.
+#[inline]
+pub(crate) fn clear_cache(_ptr: *const c_void, _len: usize) -> Result<()> {
+    // TODO: On AArch64 we currently rely on the `mprotect` call that switches the memory from W+R to R+X
+    // to do this for us; however, that is an implementation detail and should not be relied upon.
+    // We should call some implementation of `clear_cache` here
+    //
+    // See: https://github.com/bytecodealliance/wasmtime/issues/3310
+
+    Ok(())
+}
--- a/crates/jit-icache-coherence/src/win.rs
+++ b/crates/jit-icache-coherence/src/win.rs
@@ -0,0 +1,45 @@
+use std::ffi::c_void;
+use std::io::{Error, Result};
+use windows_sys::Win32::System::Diagnostics::Debug::FlushInstructionCache;
+use windows_sys::Win32::System::Threading::FlushProcessWriteBuffers;
+use windows_sys::Win32::System::Threading::GetCurrentProcess;
+
+/// See docs on [crate::pipeline_flush_mt] for a description of what this function is trying to do.
+#[inline]
+pub(crate) fn pipeline_flush_mt() -> Result<()> {
+    // If we are here, it means that the user has already called [clear_cache] for all buffers that
+    // are going to be holding code. We don't really care about flushing the write buffers, but
+    // rather about the other guarantee that Microsoft provides on this API. As documented:
+    //
+    // "The function generates an interprocessor interrupt (IPI) to all processors that are part of
+    // the current process affinity. It guarantees the visibility of write operations performed on
+    // one processor to the other processors."
+    //
+    // This all-core IPI acts as a core serializing operation, equivalent to a "broadcast" `ISB`
+    // instruction that the architecture does not provide and which is what we really want.
+    //
+    // See: https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-flushprocesswritebuffers
+    if cfg!(target_arch = "aarch64") {
+        unsafe {
+            FlushProcessWriteBuffers();
+        }
+    }
+
+    Ok(())
+}
+
+/// See docs on [crate::clear_cache] for a description of what this function is trying to do.
+#[inline]
+pub(crate) fn clear_cache(ptr: *const c_void, len: usize) -> Result<()> {
+    // See:
+    //   * https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-flushinstructioncache
+    //   * https://devblogs.microsoft.com/oldnewthing/20190902-00/?p=102828
+    unsafe {
+        let res = FlushInstructionCache(GetCurrentProcess(), ptr, len);
+        if res == 0 {
+            return Err(Error::last_os_error());
+        }
+    }
+
+    Ok(())
+}
--- a/crates/jit/Cargo.toml
+++ b/crates/jit/Cargo.toml
@@ -26,6 +26,7 @@ bincode = "1.2.1"
 rustc-demangle = "0.1.16"
 cpp_demangle = "0.3.2"
 log = { workspace = true }
+wasmtime-jit-icache-coherence = { workspace = true }

 [target.'cfg(target_os = "windows")'.dependencies.windows-sys]
 workspace = true
@@ -33,9 +34,6 @@ features = [
  "Win32_System_Diagnostics_Debug",
 ]

-[target.'cfg(target_os = "linux")'.dependencies]
-rustix = { workspace = true, features = ["process"] }
-
 [target.'cfg(target_arch = "x86_64")'.dependencies]
 ittapi = { version = "0.3.0", optional = true  }

--- a/crates/jit/src/code_memory.rs
+++ b/crates/jit/src/code_memory.rs
@@ -3,7 +3,9 @@
 use crate::unwind::UnwindRegistration;
 use anyhow::{bail, Context, Result};
 use object::read::{File, Object, ObjectSection};
+use std::ffi::c_void;
 use std::mem::ManuallyDrop;
+use wasmtime_jit_icache_coherence as icache_coherence;
 use wasmtime_runtime::MmapVec;

 /// Management of executable memory within a `MmapVec`
@@ -54,15 +56,6 @@ impl CodeMemory {
    /// The returned `CodeMemory` manages the internal `MmapVec` and the
    /// `publish` method is used to actually make the memory executable.
    pub fn new(mmap: MmapVec) -> Self {
-        #[cfg(all(target_arch = "aarch64", target_os = "linux"))]
-        {
-            // This is a requirement of the `membarrier` call executed by the `publish` method.
-            rustix::process::membarrier(
-                rustix::process::MembarrierCommand::RegisterPrivateExpeditedSyncCore,
-            )
-            .unwrap();
-        }
-
        Self {
            mmap: ManuallyDrop::new(mmap),
            unwind_registration: ManuallyDrop::new(None),
@@ -155,6 +148,13 @@ impl CodeMemory {
            // must be added here, though, if relocations pop up.
            assert!(text.relocations().count() == 0);

+            // Clear the newly allocated code from cache if the processor requires it
+            //
+            // Do this before marking the memory as R+X. Technically we should be able to do it after,
+            // but there are some CPUs that have had errata about doing this with read-only memory.
+            icache_coherence::clear_cache(ret.text.as_ptr() as *const c_void, ret.text.len())
+                .expect("Failed cache clear");
+
            // Switch the executable portion from read/write to
            // read/execute, notably not using read/write/execute to prevent
            // modifications.
@@ -162,14 +162,8 @@ impl CodeMemory {
                .make_executable(text_range.clone(), enable_branch_protection)
                .expect("unable to make memory executable");

-            #[cfg(all(target_arch = "aarch64", target_os = "linux"))]
-            {
-                // Ensure that no processor has fetched a stale instruction stream.
-                rustix::process::membarrier(
-                    rustix::process::MembarrierCommand::PrivateExpeditedSyncCore,
-                )
-                .unwrap();
-            }
+            // Flush any in-flight instructions from the pipeline
+            icache_coherence::pipeline_flush_mt().expect("Failed pipeline flush");

            // With all our memory set up use the platform-specific
            // `UnwindRegistration` implementation to inform the general
--- a/scripts/publish.rs
+++ b/scripts/publish.rs
@@ -36,6 +36,7 @@ const CRATES_TO_PUBLISH: &[&str] = &[
    "cranelift-object",
    "cranelift-interpreter",
    "cranelift",
+    "wasmtime-jit-icache-coherence",
    "cranelift-jit",
    // wiggle
    "wiggle-generate",