From 4639e85c4e9a084ff442e2ff7924b5d753e63be5 Mon Sep 17 00:00:00 2001
From: Afonso Bordado <afonso360@users.noreply.github.com>
Date: Wed, 12 Oct 2022 19:15:38 +0100
Subject: [PATCH] Flush Icache on AArch64 Windows (#4997)

* cranelift: Add FlushInstructionCache for AArch64 on Windows

This was previously done on #3426 for linux.

* wasmtime: Add FlushInstructionCache for AArch64 on Windows

This was previously done on #3426 for linux.

* cranelift: Add MemoryUse flag to JIT Memory Manager

This allows us to keep the icache flushing code self-contained and not leak implementation details.

This also changes the windows icache flushing code to only flush pages that were previously unflushed.

* Add jit-icache-coherence crate

* cranelift: Use `jit-icache-coherence`

* wasmtime: Use `jit-icache-coherence`

* jit-icache-coherence: Make rustix feature additive

Mutually exclusive features cause issues.

* wasmtime: Remove rustix from wasmtime-jit

We now use it via jit-icache-coherence

* Rename wasmtime-jit-icache-coherency crate

* Use cfg-if in wasmtime-jit-icache-coherency crate

* Use inline instead of inline(always)

* Add unsafe marker to clear_cache

* Conditionally compile all rustix operations

membarrier does not exist on MacOS

* Publish `wasmtime-jit-icache-coherence`

* Remove explicit windows check

This is implied by the target_os = "windows" above

* cranelift: Remove len != 0 check

This is redundant as it is done in non_protected_allocations_iter

* Comment cleanups

Thanks @akirilov-arm!

* Make clear_cache safe

* Rename pipeline_flush to pipeline_flush_mt

* Revert "Make clear_cache safe"

This reverts commit 21165d81c9030ed9b291a1021a367214d2942c90.

* More docs!

* Fix pipeline_flush reference on clear_cache

* Update more docs!

* Move pipeline flush after `mprotect` calls

Technically the `clear_cache` operation is a lie in AArch64, so move the pipeline flush after the `mprotect` calls so that it benefits from the implicit cache cleaning done by it.

* wasmtime: Remove rustix backend from icache crate

* wasmtime: Use libc for macos

* wasmtime: Flush icache on all arch's for windows

* wasmtime: Add flags to membarrier call
---
 Cargo.lock                              |  12 ++-
 Cargo.toml                              |   2 +
 cranelift/jit/Cargo.toml                |   1 +
 cranelift/jit/src/backend.rs            |  17 ----
 cranelift/jit/src/memory.rs             |  95 +++++++++++----------
 crates/jit-icache-coherence/Cargo.toml  |  23 ++++++
 crates/jit-icache-coherence/src/lib.rs  | 105 ++++++++++++++++++++++++
 crates/jit-icache-coherence/src/libc.rs |  88 ++++++++++++++++++++
 crates/jit-icache-coherence/src/win.rs  |  45 ++++++++++
 crates/jit/Cargo.toml                   |   4 +-
 crates/jit/src/code_memory.rs           |  28 +++----
 scripts/publish.rs                      |   1 +
 12 files changed, 334 insertions(+), 87 deletions(-)
 create mode 100644 crates/jit-icache-coherence/Cargo.toml
 create mode 100644 crates/jit-icache-coherence/src/lib.rs
 create mode 100644 crates/jit-icache-coherence/src/libc.rs
 create mode 100644 crates/jit-icache-coherence/src/win.rs

diff --git a/Cargo.lock b/Cargo.lock
index 48b84a3b16..e63d0c7f2f 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -667,6 +667,7 @@ dependencies = [
  "memmap2",
  "region",
  "target-lexicon",
+ "wasmtime-jit-icache-coherence",
  "windows-sys",
 ]
 
@@ -3665,12 +3666,12 @@ dependencies = [
  "log",
  "object",
  "rustc-demangle",
- "rustix",
  "serde",
  "target-lexicon",
  "thiserror",
  "wasmtime-environ",
  "wasmtime-jit-debug",
+ "wasmtime-jit-icache-coherence",
  "wasmtime-runtime",
  "windows-sys",
 ]
@@ -3684,6 +3685,15 @@ dependencies = [
  "rustix",
 ]
 
+[[package]]
+name = "wasmtime-jit-icache-coherence"
+version = "2.0.0"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "windows-sys",
+]
+
 [[package]]
 name = "wasmtime-runtime"
 version = "3.0.0"
diff --git a/Cargo.toml b/Cargo.toml
index 5762dc02a4..71d7267928 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -84,6 +84,7 @@ members = [
   "crates/c-api",
   "crates/cli-flags",
   "crates/environ/fuzz",
+  "crates/jit-icache-coherence",
   "examples/fib-debug/wasm",
   "examples/wasi/wasm",
   "examples/tokio/wasm",
@@ -126,6 +127,7 @@ wasi-common = { path = "crates/wasi-common", version = "=3.0.0" }
 wasi-tokio = { path = "crates/wasi-common/tokio", version = "=3.0.0" }
 wasi-cap-std-sync = { path = "crates/wasi-common/cap-std-sync", version = "=3.0.0" }
 wasmtime-fuzzing = { path = "crates/fuzzing" }
+wasmtime-jit-icache-coherence = { path = "crates/jit-icache-coherence", version = "=2.0.0" }
 
 cranelift-wasm = { path = "cranelift/wasm", version = "0.90.0" }
 cranelift-codegen = { path = "cranelift/codegen", version = "0.90.0" }
diff --git a/cranelift/jit/Cargo.toml b/cranelift/jit/Cargo.toml
index 4bf735fe45..e234b8d0d0 100644
--- a/cranelift/jit/Cargo.toml
+++ b/cranelift/jit/Cargo.toml
@@ -20,6 +20,7 @@ libc = { version = "0.2.42" }
 target-lexicon = { workspace = true }
 memmap2 = { version = "0.2.1", optional = true }
 log = { workspace = true }
+wasmtime-jit-icache-coherence = { workspace = true }
 
 [target.'cfg(windows)'.dependencies.windows-sys]
 workspace = true
diff --git a/cranelift/jit/src/backend.rs b/cranelift/jit/src/backend.rs
index 31158e9038..441a878d4f 100644
--- a/cranelift/jit/src/backend.rs
+++ b/cranelift/jit/src/backend.rs
@@ -458,14 +458,6 @@ impl JITModule {
         self.memory.readonly.set_readonly();
         self.memory.code.set_readable_and_executable();
 
-        #[cfg(all(target_arch = "aarch64", target_os = "linux"))]
-        {
-            let cmd: libc::c_int = 32; // MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE
-
-            // Ensure that no processor has fetched a stale instruction stream.
-            unsafe { libc::syscall(libc::SYS_membarrier, cmd) };
-        }
-
         for update in self.pending_got_updates.drain(..) {
             unsafe { update.entry.as_ref() }.store(update.ptr as *mut _, Ordering::SeqCst);
         }
@@ -530,15 +522,6 @@ impl JITModule {
             module.libcall_plt_entries.insert(libcall, plt_entry);
         }
 
-        #[cfg(all(target_arch = "aarch64", target_os = "linux"))]
-        {
-            let cmd: libc::c_int = 64; // MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE
-
-            // This is a requirement of the membarrier() call executed by
-            // the finalize_definitions() method.
-            unsafe { libc::syscall(libc::SYS_membarrier, cmd) };
-        }
-
         module
     }
 
diff --git a/cranelift/jit/src/memory.rs b/cranelift/jit/src/memory.rs
index a18f6ad3c1..9187131f88 100644
--- a/cranelift/jit/src/memory.rs
+++ b/cranelift/jit/src/memory.rs
@@ -4,9 +4,11 @@ use memmap2::MmapMut;
 #[cfg(not(any(feature = "selinux-fix", windows)))]
 use std::alloc;
 use std::convert::TryFrom;
+use std::ffi::c_void;
 use std::io;
 use std::mem;
 use std::ptr;
+use wasmtime_jit_icache_coherence as icache_coherence;
 
 /// A simple struct consisting of a pointer and length.
 struct PtrLen {
@@ -161,6 +163,7 @@ impl Memory {
         // TODO: Allocate more at a time.
         self.current = PtrLen::with_size(size)?;
         self.position = size;
+
         Ok(self.current.ptr)
     }
 
@@ -168,45 +171,45 @@ impl Memory {
     pub(crate) fn set_readable_and_executable(&mut self) {
         self.finish_current();
 
+        // Clear all the newly allocated code from cache if the processor requires it
+        //
+        // Do this before marking the memory as R+X. Technically we should be able to do it after,
+        // but there are some CPUs that have had errata about doing this with read-only memory.
+        for &PtrLen { ptr, len, .. } in self.non_protected_allocations_iter() {
+            unsafe {
+                icache_coherence::clear_cache(ptr as *const c_void, len)
+                    .expect("Failed cache clear")
+            };
+        }
+
         let set_region_readable_and_executable = |ptr, len| {
-            if len != 0 {
-                if self.branch_protection == BranchProtection::BTI {
-                    #[cfg(all(target_arch = "aarch64", target_os = "linux"))]
-                    if std::arch::is_aarch64_feature_detected!("bti") {
-                        let prot = libc::PROT_EXEC | libc::PROT_READ | /* PROT_BTI */ 0x10;
+            if self.branch_protection == BranchProtection::BTI {
+                #[cfg(all(target_arch = "aarch64", target_os = "linux"))]
+                if std::arch::is_aarch64_feature_detected!("bti") {
+                    let prot = libc::PROT_EXEC | libc::PROT_READ | /* PROT_BTI */ 0x10;
 
-                        unsafe {
-                            if libc::mprotect(ptr as *mut libc::c_void, len, prot) < 0 {
-                                panic!("unable to make memory readable+executable");
-                            }
+                    unsafe {
+                        if libc::mprotect(ptr as *mut libc::c_void, len, prot) < 0 {
+                            panic!("unable to make memory readable+executable");
                         }
-
-                        return;
                     }
-                }
 
-                unsafe {
-                    region::protect(ptr, len, region::Protection::READ_EXECUTE)
-                        .expect("unable to make memory readable+executable");
+                    return;
                 }
             }
+
+            unsafe {
+                region::protect(ptr, len, region::Protection::READ_EXECUTE)
+                    .expect("unable to make memory readable+executable");
+            }
         };
 
-        #[cfg(feature = "selinux-fix")]
-        {
-            for &PtrLen { ref map, ptr, len } in &self.allocations[self.already_protected..] {
-                if map.is_some() {
-                    set_region_readable_and_executable(ptr, len);
-                }
-            }
+        for &PtrLen { ptr, len, .. } in self.non_protected_allocations_iter() {
+            set_region_readable_and_executable(ptr, len);
         }
 
-        #[cfg(not(feature = "selinux-fix"))]
-        {
-            for &PtrLen { ptr, len } in &self.allocations[self.already_protected..] {
-                set_region_readable_and_executable(ptr, len);
-            }
-        }
+        // Flush any in-flight instructions from the pipeline
+        icache_coherence::pipeline_flush_mt().expect("Failed pipeline flush");
 
         self.already_protected = self.allocations.len();
     }
@@ -215,33 +218,27 @@ impl Memory {
     pub(crate) fn set_readonly(&mut self) {
         self.finish_current();
 
-        #[cfg(feature = "selinux-fix")]
-        {
-            for &PtrLen { ref map, ptr, len } in &self.allocations[self.already_protected..] {
-                if len != 0 && map.is_some() {
-                    unsafe {
-                        region::protect(ptr, len, region::Protection::READ)
-                            .expect("unable to make memory readonly");
-                    }
-                }
-            }
-        }
-
-        #[cfg(not(feature = "selinux-fix"))]
-        {
-            for &PtrLen { ptr, len } in &self.allocations[self.already_protected..] {
-                if len != 0 {
-                    unsafe {
-                        region::protect(ptr, len, region::Protection::READ)
-                            .expect("unable to make memory readonly");
-                    }
-                }
+        for &PtrLen { ptr, len, .. } in self.non_protected_allocations_iter() {
+            unsafe {
+                region::protect(ptr, len, region::Protection::READ)
+                    .expect("unable to make memory readonly");
             }
         }
 
         self.already_protected = self.allocations.len();
     }
 
+    /// Iterates the not-yet-protected allocations that are non-empty (and, under `selinux-fix`, have a `map`).
+    fn non_protected_allocations_iter(&self) -> impl Iterator<Item = &PtrLen> {
+        let iter = self.allocations[self.already_protected..].iter();
+        // `filter` passes `&&PtrLen`, so `len` binds by reference and must be dereferenced.
+        #[cfg(feature = "selinux-fix")]
+        return iter.filter(|&PtrLen { ref map, len, .. }| *len != 0 && map.is_some());
+
+        #[cfg(not(feature = "selinux-fix"))]
+        return iter.filter(|&PtrLen { len, .. }| *len != 0);
+    }
+
     /// Frees all allocated memory regions that would be leaked otherwise.
     /// Likely to invalidate existing function pointers, causing unsafety.
     pub(crate) unsafe fn free_memory(&mut self) {
diff --git a/crates/jit-icache-coherence/Cargo.toml b/crates/jit-icache-coherence/Cargo.toml
new file mode 100644
index 0000000000..2eb095ff1a
--- /dev/null
+++ b/crates/jit-icache-coherence/Cargo.toml
@@ -0,0 +1,23 @@
+[package]
+name = "wasmtime-jit-icache-coherence"
+version = "2.0.0"
+authors.workspace = true
+description = "Utilities for JIT icache maintenance"
+documentation = "https://docs.rs/wasmtime-jit-icache-coherence"
+license = "Apache-2.0 WITH LLVM-exception"
+repository = "https://github.com/bytecodealliance/wasmtime"
+edition.workspace = true
+
+[dependencies]
+cfg-if = "1.0"
+
+[target.'cfg(target_os = "windows")'.dependencies.windows-sys]
+workspace = true
+features = [
+    "Win32_Foundation",
+    "Win32_System_Threading",
+    "Win32_System_Diagnostics_Debug",
+]
+
+[target.'cfg(any(target_os = "linux", target_os = "macos"))'.dependencies.libc]
+version = "0.2.42"
\ No newline at end of file
diff --git a/crates/jit-icache-coherence/src/lib.rs b/crates/jit-icache-coherence/src/lib.rs
new file mode 100644
index 0000000000..e47e539714
--- /dev/null
+++ b/crates/jit-icache-coherence/src/lib.rs
@@ -0,0 +1,105 @@
+//! This crate provides utilities for instruction cache maintenance for JIT authors.
+//!
+//! In self-modifying code, such as when writing a JIT, special care must be taken when marking the
+//! code as ready for execution. On fully coherent architectures (X86, S390X) the data cache (D-Cache)
+//! and the instruction cache (I-Cache) are always in sync. However this is not guaranteed for all
+//! architectures such as AArch64 where these caches are not coherent with each other.
+//!
+//! When writing new code there may be an I-cache entry for that same address which causes the
+//! processor to execute whatever was in the cache instead of the new code.
+//!
+//! See the [ARM Community - Caches and Self-Modifying Code] blog post that contains a great
+//! explanation of the above. (It references AArch32 but it has a high level overview of this problem).
+//!
+//! ## Usage
+//!
+//! You should call [clear_cache] on any pages that you write with the new code that you're intending
+//! to execute. You can do this at any point in the code from the moment that you write the page up to
+//! the moment where the code is executed.
+//!
+//! You also need to call [pipeline_flush_mt] to ensure that there isn't any invalid instruction currently
+//! in the pipeline if you are running in a multi threaded environment.
+//!
+//! For single threaded programs you are free to omit [pipeline_flush_mt], otherwise you need to
+//! call both [clear_cache] and [pipeline_flush_mt] in that order.
+//!
+//! ### Example:
+//! ```
+//! # use std::ffi::c_void;
+//! # use std::io;
+//! # use wasmtime_jit_icache_coherence::*;
+//! #
+//! # struct Page {
+//! #   addr: *const c_void,
+//! #   len: usize,
+//! # }
+//! #
+//! # fn main() -> io::Result<()> {
+//! #
+//! # let run_code = || {};
+//! # let code = vec![0u8; 64];
+//! # let newly_written_pages = vec![Page {
+//! #    addr: &code[0] as *const u8 as *const c_void,
+//! #    len: code.len(),
+//! # }];
+//! # unsafe {
+//! // Invalidate the cache for all the newly written pages where we wrote our new code.
+//! for page in newly_written_pages {
+//!     clear_cache(page.addr, page.len)?;
+//! }
+//!
+//! // Once those are invalidated we also need to flush the pipeline
+//! pipeline_flush_mt()?;
+//!
+//! // We can now safely execute our new code.
+//! run_code();
+//! # }
+//! # Ok(())
+//! # }
+//! ```
+//!
+//! <div class="example-wrap" style="display:inline-block"><pre class="compile_fail" style="white-space:normal;font:inherit;">
+//!
+//!  **Warning**: In order to correctly use this interface you should always call [clear_cache].
+//!  A followup call to [pipeline_flush_mt] is required if you are running in a multi-threaded environment.
+//!
+//! </pre></div>
+//!
+//! [ARM Community - Caches and Self-Modifying Code]: https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/caches-and-self-modifying-code
+
+use std::ffi::c_void;
+use std::io::Result;
+
+cfg_if::cfg_if! {
+    if #[cfg(target_os = "windows")] {
+        mod win;
+        use win as imp;
+    } else {
+        mod libc;
+        use crate::libc as imp;
+    }
+}
+
+/// Flushes instructions in the processor pipeline
+///
+/// This pipeline flush is broadcast to all processors that are executing threads in the current process.
+///
+/// Calling [pipeline_flush_mt] is only required for multi-threaded programs and it *must* be called
+/// after all calls to [clear_cache].
+///
+/// If the architecture does not require a pipeline flush, this function does nothing.
+pub fn pipeline_flush_mt() -> Result<()> {
+    imp::pipeline_flush_mt()
+}
+
+/// Flushes the instruction cache for a region of memory.
+///
+/// If the architecture does not require an instruction cache flush, this function does nothing.
+///
+/// # Safety
+///
+/// It is necessary to call [pipeline_flush_mt] after this function if you are running in a multi-threaded
+/// environment.
+pub unsafe fn clear_cache(ptr: *const c_void, len: usize) -> Result<()> {
+    imp::clear_cache(ptr, len)
+}
diff --git a/crates/jit-icache-coherence/src/libc.rs b/crates/jit-icache-coherence/src/libc.rs
new file mode 100644
index 0000000000..6ea9cea08e
--- /dev/null
+++ b/crates/jit-icache-coherence/src/libc.rs
@@ -0,0 +1,88 @@
+#![allow(unused)]
+
+use libc::{syscall, EINVAL, EPERM};
+use std::ffi::c_void;
+use std::io::{Error, Result};
+
+const MEMBARRIER_CMD_GLOBAL: libc::c_int = 1;
+const MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE: libc::c_int = 32;
+const MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE: libc::c_int = 64;
+
+/// See docs on [crate::pipeline_flush_mt] for a description of what this function is trying to do.
+#[inline]
+pub(crate) fn pipeline_flush_mt() -> Result<()> {
+    // Ensure that no processor has fetched a stale instruction stream.
+    //
+    // On AArch64 we try to do this by executing a "broadcast" `ISB` which is not something that the
+    // architecture provides us but we can emulate it using the membarrier kernel interface.
+    //
+    // This behaviour was documented in a patch, however it seems that it hasn't been upstreamed yet.
+    // Nevertheless it clearly explains the guarantees that the Linux kernel provides us regarding the
+    // membarrier interface, and how to use it for JIT contexts.
+    // https://lkml.kernel.org/lkml/07a8b963002cb955b7516e61bad19514a3acaa82.1623813516.git.luto@kernel.org/
+    //
+    // I couldn't find the follow up for that patch but there doesn't seem to be disagreement about
+    // that specific part in the replies.
+    // TODO: Check if the kernel has updated the membarrier documentation
+    //
+    // See the following issues for more info:
+    //  * https://github.com/bytecodealliance/wasmtime/pull/3426
+    //  * https://github.com/bytecodealliance/wasmtime/pull/4997
+    //
+    // TODO: x86 and s390x have coherent caches so they don't need this, but RISCV does not
+    // guarantee that, so we may need to do something similar for it. However as noted in the above
+    // kernel patch the SYNC_CORE membarrier has different guarantees on each architecture
+    // so we need to follow up and check what it provides us.
+    // See: https://github.com/bytecodealliance/wasmtime/issues/5033
+    #[cfg(all(target_arch = "aarch64", target_os = "linux"))]
+    match membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE) {
+        Ok(_) => {}
+
+        // EPERM happens if the calling process hasn't yet called the register membarrier.
+        // We can call the register membarrier now, and then retry the actual membarrier.
+        //
+        // This does have some overhead since on the first time we call this function we
+        // actually execute three membarriers, but this only happens once per process and only
+        // one slow membarrier is actually executed (The last one, which actually generates an IPI).
+        Err(e) if e.raw_os_error() == Some(EPERM) => {
+            membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)?;
+            membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE)?;
+        }
+
+        // On kernels older than 4.16 the above syscall does not exist, so we can
+        // fallback to MEMBARRIER_CMD_GLOBAL which is an alias for MEMBARRIER_CMD_SHARED
+        // that has existed since 4.3. GLOBAL is a lot slower, but allows us to have
+        // compatibility with older kernels.
+        Err(e) if e.raw_os_error() == Some(EINVAL) => {
+            membarrier(MEMBARRIER_CMD_GLOBAL)?;
+        }
+
+        // In any other case we got an actual error, so lets propagate that up
+        e => e?,
+    }
+
+    Ok(())
+}
+
+#[cfg(target_os = "linux")]
+fn membarrier(barrier: libc::c_int) -> Result<()> {
+    let flags: libc::c_int = 0;
+    let res = unsafe { syscall(libc::SYS_membarrier, barrier, flags) };
+    if res == 0 {
+        Ok(())
+    } else {
+        Err(Error::last_os_error())
+    }
+}
+
+/// See docs on [crate::clear_cache] for a description of what this function is trying to do.
+#[inline]
+pub(crate) fn clear_cache(_ptr: *const c_void, _len: usize) -> Result<()> {
+    // TODO: On AArch64 we currently rely on the `mprotect` call that switches the memory from W+R to R+X
+    // to do this for us, however that is an implementation detail and should not be relied upon
+    // We should call some implementation of `clear_cache` here
+    //
+    // See: https://github.com/bytecodealliance/wasmtime/issues/3310
+
+    Ok(())
+}
diff --git a/crates/jit-icache-coherence/src/win.rs b/crates/jit-icache-coherence/src/win.rs
new file mode 100644
index 0000000000..488e15f466
--- /dev/null
+++ b/crates/jit-icache-coherence/src/win.rs
@@ -0,0 +1,45 @@
+use std::ffi::c_void;
+use std::io::{Error, Result};
+use windows_sys::Win32::System::Diagnostics::Debug::FlushInstructionCache;
+use windows_sys::Win32::System::Threading::FlushProcessWriteBuffers;
+use windows_sys::Win32::System::Threading::GetCurrentProcess;
+
+/// See docs on [crate::pipeline_flush_mt] for a description of what this function is trying to do.
+#[inline]
+pub(crate) fn pipeline_flush_mt() -> Result<()> {
+    // If we are here, it means that the user has already called [clear_cache] for all buffers that
+    // are going to be holding code. We don't really care about flushing the write buffers, but about
+    // the other guarantee that Microsoft provides on this API. As documented:
+    //
+    // "The function generates an interprocessor interrupt (IPI) to all processors that are part of
+    // the current process affinity. It guarantees the visibility of write operations performed on
+    // one processor to the other processors."
+    //
+    // This all-core IPI acts as a core serializing operation, equivalent to a "broadcast" `ISB`
+    // instruction that the architecture does not provide and which is what we really want.
+    //
+    // See: https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-flushprocesswritebuffers
+    if cfg!(target_arch = "aarch64") {
+        unsafe {
+            FlushProcessWriteBuffers();
+        }
+    }
+
+    Ok(())
+}
+
+/// See docs on [crate::clear_cache] for a description of what this function is trying to do.
+#[inline]
+pub(crate) fn clear_cache(ptr: *const c_void, len: usize) -> Result<()> {
+    // See:
+    //   * https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-flushinstructioncache
+    //   * https://devblogs.microsoft.com/oldnewthing/20190902-00/?p=102828
+    unsafe {
+        let res = FlushInstructionCache(GetCurrentProcess(), ptr, len);
+        if res == 0 {
+            return Err(Error::last_os_error());
+        }
+    }
+
+    Ok(())
+}
diff --git a/crates/jit/Cargo.toml b/crates/jit/Cargo.toml
index b555010552..fef5bdd622 100644
--- a/crates/jit/Cargo.toml
+++ b/crates/jit/Cargo.toml
@@ -26,6 +26,7 @@ bincode = "1.2.1"
 rustc-demangle = "0.1.16"
 cpp_demangle = "0.3.2"
 log = { workspace = true }
+wasmtime-jit-icache-coherence = { workspace = true }
 
 [target.'cfg(target_os = "windows")'.dependencies.windows-sys]
 workspace = true
@@ -33,9 +34,6 @@ features = [
   "Win32_System_Diagnostics_Debug",
 ]
 
-[target.'cfg(target_os = "linux")'.dependencies]
-rustix = { workspace = true, features = ["process"] }
-
 [target.'cfg(target_arch = "x86_64")'.dependencies]
 ittapi = { version = "0.3.0", optional = true  }
 
diff --git a/crates/jit/src/code_memory.rs b/crates/jit/src/code_memory.rs
index 08ee895f7c..66eb8ee44e 100644
--- a/crates/jit/src/code_memory.rs
+++ b/crates/jit/src/code_memory.rs
@@ -3,7 +3,9 @@
 use crate::unwind::UnwindRegistration;
 use anyhow::{bail, Context, Result};
 use object::read::{File, Object, ObjectSection};
+use std::ffi::c_void;
 use std::mem::ManuallyDrop;
+use wasmtime_jit_icache_coherence as icache_coherence;
 use wasmtime_runtime::MmapVec;
 
 /// Management of executable memory within a `MmapVec`
@@ -54,15 +56,6 @@ impl CodeMemory {
     /// The returned `CodeMemory` manages the internal `MmapVec` and the
     /// `publish` method is used to actually make the memory executable.
     pub fn new(mmap: MmapVec) -> Self {
-        #[cfg(all(target_arch = "aarch64", target_os = "linux"))]
-        {
-            // This is a requirement of the `membarrier` call executed by the `publish` method.
-            rustix::process::membarrier(
-                rustix::process::MembarrierCommand::RegisterPrivateExpeditedSyncCore,
-            )
-            .unwrap();
-        }
-
         Self {
             mmap: ManuallyDrop::new(mmap),
             unwind_registration: ManuallyDrop::new(None),
@@ -155,6 +148,13 @@ impl CodeMemory {
             // must be added here, though, if relocations pop up.
             assert!(text.relocations().count() == 0);
 
+            // Clear the newly allocated code from cache if the processor requires it
+            //
+            // Do this before marking the memory as R+X. Technically we should be able to do it after,
+            // but there are some CPUs that have had errata about doing this with read-only memory.
+            icache_coherence::clear_cache(ret.text.as_ptr() as *const c_void, ret.text.len())
+                .expect("Failed cache clear");
+
             // Switch the executable portion from read/write to
             // read/execute, notably not using read/write/execute to prevent
             // modifications.
@@ -162,14 +162,8 @@ impl CodeMemory {
                 .make_executable(text_range.clone(), enable_branch_protection)
                 .expect("unable to make memory executable");
 
-            #[cfg(all(target_arch = "aarch64", target_os = "linux"))]
-            {
-                // Ensure that no processor has fetched a stale instruction stream.
-                rustix::process::membarrier(
-                    rustix::process::MembarrierCommand::PrivateExpeditedSyncCore,
-                )
-                .unwrap();
-            }
+            // Flush any in-flight instructions from the pipeline
+            icache_coherence::pipeline_flush_mt().expect("Failed pipeline flush");
 
             // With all our memory set up use the platform-specific
             // `UnwindRegistration` implementation to inform the general
diff --git a/scripts/publish.rs b/scripts/publish.rs
index e417653735..932ee3493b 100644
--- a/scripts/publish.rs
+++ b/scripts/publish.rs
@@ -36,6 +36,7 @@ const CRATES_TO_PUBLISH: &[&str] = &[
     "cranelift-object",
     "cranelift-interpreter",
     "cranelift",
+    "wasmtime-jit-icache-coherence",
     "cranelift-jit",
     // wiggle
     "wiggle-generate",