diff --git a/cranelift/filetests/filetests/runtests/atomic-load-store.clif b/cranelift/filetests/filetests/runtests/atomic-load-store.clif
new file mode 100644
index 0000000000..356ba1dc0b
--- /dev/null
+++ b/cranelift/filetests/filetests/runtests/atomic-load-store.clif
@@ -0,0 +1,94 @@
+test interpret
+test run
+target x86_64
+target aarch64
+target riscv64
+target s390x
+
+function %i64_atomic_store_load(i64) -> i64 {
+    ss0 = explicit_slot 8
+
+block0(v0: i64):
+    v1 = stack_addr.i64 ss0
+    atomic_store.i64 v0, v1
+    v2 = atomic_load.i64 v1
+    return v2
+}
+; run: %i64_atomic_store_load(0) == 0
+; run: %i64_atomic_store_load(-1) == -1
+; run: %i64_atomic_store_load(0x00000000_FFFFFFFF) == 0x00000000_FFFFFFFF
+; run: %i64_atomic_store_load(0xFFFFFFFF_00000000) == 0xFFFFFFFF_00000000
+; run: %i64_atomic_store_load(0xFEDCBA98_76543210) == 0xFEDCBA98_76543210
+; run: %i64_atomic_store_load(0xA00A00A0_0A00A00A) == 0xA00A00A0_0A00A00A
+; run: %i64_atomic_store_load(0xC0FFEEEE_DECAFFFF) == 0xC0FFEEEE_DECAFFFF
+
+
+function %i32_atomic_store_load(i32) -> i32 {
+    ss0 = explicit_slot 4
+
+block0(v0: i32):
+    v1 = stack_addr.i64 ss0
+    atomic_store.i32 v0, v1
+    v2 = atomic_load.i32 v1
+    return v2
+}
+; run: %i32_atomic_store_load(0) == 0
+; run: %i32_atomic_store_load(-1) == -1
+; run: %i32_atomic_store_load(0x0000FFFF) == 0x0000FFFF
+; run: %i32_atomic_store_load(0xFFFF0000) == 0xFFFF0000
+; run: %i32_atomic_store_load(0xFEDC3210) == 0xFEDC3210
+; run: %i32_atomic_store_load(0xA00AA00A) == 0xA00AA00A
+; run: %i32_atomic_store_load(0xC0FFEEEE) == 0xC0FFEEEE
+
+
+function %i16_atomic_store_load(i16) -> i16 {
+    ss0 = explicit_slot 2
+
+block0(v0: i16):
+    v1 = stack_addr.i64 ss0
+    atomic_store.i16 v0, v1
+    v2 = atomic_load.i16 v1
+    return v2
+}
+; run: %i16_atomic_store_load(0) == 0
+; run: %i16_atomic_store_load(-1) == -1
+; run: %i16_atomic_store_load(0x00FF) == 0x00FF
+; run: %i16_atomic_store_load(0xFF00) == 0xFF00
+; run: %i16_atomic_store_load(0xFE10) == 0xFE10
+; run: %i16_atomic_store_load(0xA00A) == 0xA00A
+; run: %i16_atomic_store_load(0xC0FF) == 0xC0FF
+
+
+function %i8_atomic_store_load(i8) -> i8 {
+    ss0 = explicit_slot 1
+
+block0(v0: i8):
+    v1 = stack_addr.i64 ss0
+    atomic_store.i8 v0, v1
+    v2 = atomic_load.i8 v1
+    return v2
+}
+; run: %i8_atomic_store_load(0) == 0
+; run: %i8_atomic_store_load(-1) == -1
+; run: %i8_atomic_store_load(0x0F) == 0x0F
+; run: %i8_atomic_store_load(0xF0) == 0xF0
+; run: %i8_atomic_store_load(0xAA) == 0xAA
+; run: %i8_atomic_store_load(0xC0) == 0xC0
+
+
+function %atomic_store_load_aligned(i64) -> i64 {
+    ss0 = explicit_slot 16
+
+block0(v0: i64):
+    v1 = stack_addr.i64 ss0
+    atomic_store.i64 aligned v0, v1
+    v2 = atomic_load.i64 aligned v1
+    return v2
+}
+; run: %atomic_store_load_aligned(0) == 0
+; run: %atomic_store_load_aligned(-1) == -1
+; run: %atomic_store_load_aligned(0x00000000_FFFFFFFF) == 0x00000000_FFFFFFFF
+; run: %atomic_store_load_aligned(0xFFFFFFFF_00000000) == 0xFFFFFFFF_00000000
+; run: %atomic_store_load_aligned(0xFEDCBA98_76543210) == 0xFEDCBA98_76543210
+; run: %atomic_store_load_aligned(0xA00A00A0_0A00A00A) == 0xA00A00A0_0A00A00A
+; run: %atomic_store_load_aligned(0xC0FFEEEE_DECAFFFF) == 0xC0FFEEEE_DECAFFFF
diff --git a/cranelift/filetests/filetests/runtests/fence.clif b/cranelift/filetests/filetests/runtests/fence.clif
new file mode 100644
index 0000000000..e27c90e60b
--- /dev/null
+++ b/cranelift/filetests/filetests/runtests/fence.clif
@@ -0,0 +1,18 @@
+test interpret
+test run
+target aarch64
+target s390x
+target x86_64
+target riscv64
+
+; Check that the fence instruction doesn't crash. Testing anything else would
+; require multiple threads, which requires a runtime like Wasmtime.
+
+function %fence() -> i8 {
+block0:
+    fence
+
+    v0 = iconst.i8 0
+    return v0
+}
+; run: %fence() == 0
diff --git a/cranelift/fuzzgen/src/function_generator.rs b/cranelift/fuzzgen/src/function_generator.rs
index d5b51c5c59..3e2af2e911 100644
--- a/cranelift/fuzzgen/src/function_generator.rs
+++ b/cranelift/fuzzgen/src/function_generator.rs
@@ -2,7 +2,6 @@ use crate::codegen::ir::{ArgumentExtension, ArgumentPurpose};
 use crate::config::Config;
 use anyhow::Result;
 use arbitrary::{Arbitrary, Unstructured};
-use cranelift::codegen::ir::immediates::Offset32;
 use cranelift::codegen::ir::instructions::InstructionFormat;
 use cranelift::codegen::ir::stackslot::StackSize;
 use cranelift::codegen::ir::{types::*, FuncRef, LibCall, UserExternalName, UserFuncName};
@@ -223,27 +222,73 @@ fn insert_load_store(
 ) -> Result<()> {
     let ctrl_type = *rets.first().or(args.first()).unwrap();
     let type_size = ctrl_type.bytes();
-    let (address, offset) = fgen.generate_load_store_address(builder, type_size)?;
 
-    // TODO: More advanced MemFlags
-    let flags = MemFlags::new();
+    // Should we generate an aligned address
+    let is_atomic = [Opcode::AtomicLoad, Opcode::AtomicStore].contains(&opcode);
+    let is_aarch64 = matches!(fgen.target_triple.architecture, Architecture::Aarch64(_));
+    let aligned = if is_atomic && is_aarch64 {
+        // AArch64 has issues with unaligned atomics.
+        // https://github.com/bytecodealliance/wasmtime/issues/5483
+        true
+    } else {
+        bool::arbitrary(fgen.u)?
+    };
+
+    let mut flags = MemFlags::new();
+    // Even if we picked an aligned address, we can always generate unaligned memflags
+    if aligned && bool::arbitrary(fgen.u)? {
+        flags.set_aligned();
+    }
+    // If the address is aligned, then we know it won't trap
+    if aligned && bool::arbitrary(fgen.u)? {
+        flags.set_notrap();
+    }
+
+    let (address, max_offset) = fgen.generate_load_store_address(builder, type_size, aligned)?;
+
+    // Pick an offset to pass into the load/store.
+    let offset = if aligned {
+        0
+    } else {
+        fgen.u.int_in_range(0..=max_offset)? as i32
+    }
+    .into();
 
     // The variable being loaded or stored into
     let var = fgen.get_variable_of_type(ctrl_type)?;
 
-    if opcode.can_store() {
-        let val = builder.use_var(var);
+    match opcode.format() {
+        InstructionFormat::LoadNoOffset => {
+            let (inst, dfg) = builder
+                .ins()
+                .LoadNoOffset(opcode, ctrl_type, flags, address);
 
-        builder
-            .ins()
-            .Store(opcode, ctrl_type, flags, offset, val, address);
-    } else {
-        let (inst, dfg) = builder
-            .ins()
-            .Load(opcode, ctrl_type, flags, offset, address);
+            let new_val = dfg.first_result(inst);
+            builder.def_var(var, new_val);
+        }
+        InstructionFormat::StoreNoOffset => {
+            let val = builder.use_var(var);
 
-        let new_val = dfg.first_result(inst);
-        builder.def_var(var, new_val);
+            builder
+                .ins()
+                .StoreNoOffset(opcode, ctrl_type, flags, val, address);
+        }
+        InstructionFormat::Store => {
+            let val = builder.use_var(var);
+
+            builder
+                .ins()
+                .Store(opcode, ctrl_type, flags, offset, val, address);
+        }
+        InstructionFormat::Load => {
+            let (inst, dfg) = builder
+                .ins()
+                .Load(opcode, ctrl_type, flags, offset, address);
+
+            let new_val = dfg.first_result(inst);
+            builder.def_var(var, new_val);
+        }
+        _ => unimplemented!(),
     }
 
     Ok(())
@@ -1038,6 +1083,8 @@ const OPCODE_SIGNATURES: &[OpcodeSignature] = &[
     (Opcode::Icmp, &[I32, I32], &[I8], insert_cmp),
     (Opcode::Icmp, &[I64, I64], &[I8], insert_cmp),
     (Opcode::Icmp, &[I128, I128], &[I8], insert_cmp),
+    // Fence
+    (Opcode::Fence, &[], &[], insert_opcode),
     // Stack Access
     (Opcode::StackStore, &[I8], &[], insert_stack_store),
     (Opcode::StackStore, &[I16], &[], insert_stack_store),
@@ -1077,6 +1124,11 @@ const OPCODE_SIGNATURES: &[OpcodeSignature] = &[
     // Opcode::Sload16x4
     // Opcode::Uload32x2
     // Opcode::Sload32x2
+    // AtomicLoad
+    (Opcode::AtomicLoad, &[], &[I8], insert_load_store),
+    (Opcode::AtomicLoad, &[], &[I16], insert_load_store),
+    (Opcode::AtomicLoad, &[], &[I32], insert_load_store),
+    (Opcode::AtomicLoad, &[], &[I64], insert_load_store),
     // Stores
     (Opcode::Store, &[I8], &[], insert_load_store),
     (Opcode::Store, &[I16], &[], insert_load_store),
@@ -1092,6 +1144,11 @@ const OPCODE_SIGNATURES: &[OpcodeSignature] = &[
     (Opcode::Istore16, &[I32], &[], insert_load_store),
     (Opcode::Istore16, &[I64], &[], insert_load_store),
     (Opcode::Istore32, &[I64], &[], insert_load_store),
+    // AtomicStore
+    (Opcode::AtomicStore, &[I8], &[], insert_load_store),
+    (Opcode::AtomicStore, &[I16], &[], insert_load_store),
+    (Opcode::AtomicStore, &[I32], &[], insert_load_store),
+    (Opcode::AtomicStore, &[I64], &[], insert_load_store),
     // Bitcast
     (Opcode::Bitcast, &[F32], &[I32], insert_bitcast),
     (Opcode::Bitcast, &[I32], &[F32], insert_bitcast),
@@ -1277,34 +1334,42 @@ where
     /// we don't run the risk of returning them from a function, which would make the fuzzer
     /// complain since they are different from the interpreter to the backend.
     ///
-    /// The address is not guaranteed to be valid, but there's a chance that it is.
+    /// `min_size`: Controls the amount of space that the address should have.
     ///
-    /// `min_size`: Controls the amount of space that the address should have.This is not
-    /// guaranteed to be respected
+    /// `aligned`: When passed as true, the resulting address is guaranteed to be aligned
+    /// on an 8 byte boundary.
+    ///
+    /// Returns a valid address and the maximum possible offset that still respects `min_size`.
     fn generate_load_store_address(
         &mut self,
         builder: &mut FunctionBuilder,
         min_size: u32,
-    ) -> Result<(Value, Offset32)> {
+        aligned: bool,
+    ) -> Result<(Value, u32)> {
         // TODO: Currently our only source of addresses is stack_addr, but we
         // should add global_value, symbol_value eventually
         let (addr, available_size) = {
             let (ss, slot_size) = self.stack_slot_with_size(min_size)?;
-            let max_offset = slot_size.saturating_sub(min_size);
-            let offset = self.u.int_in_range(0..=max_offset)? as i32;
-            let base_addr = builder.ins().stack_addr(I64, ss, offset);
-            let available_size = (slot_size as i32).saturating_sub(offset);
+
+            // stack_slot_with_size guarantees that slot_size >= min_size
+            let max_offset = slot_size - min_size;
+            let offset = if aligned {
+                self.u.int_in_range(0..=max_offset / min_size)? * min_size
+            } else {
+                self.u.int_in_range(0..=max_offset)?
+            };
+
+            let base_addr = builder.ins().stack_addr(I64, ss, offset as i32);
+            let available_size = slot_size.saturating_sub(offset);
             (base_addr, available_size)
         };
 
         // TODO: Insert a bunch of amode opcodes here to modify the address!
 
         // Now that we have an address and a size, we just choose a random offset to return to the
-        // caller. Try to preserve min_size bytes.
-        let max_offset = available_size.saturating_sub(min_size as i32);
-        let offset = self.u.int_in_range(0..=max_offset)? as i32;
-
-        Ok((addr, offset.into()))
+        // caller. Preserving min_size bytes.
+        let max_offset = available_size.saturating_sub(min_size);
+        Ok((addr, max_offset))
     }
 
     /// Get a variable of type `ty` from the current function
diff --git a/cranelift/interpreter/src/step.rs b/cranelift/interpreter/src/step.rs
index 8696c57ab5..365efc9634 100644
--- a/cranelift/interpreter/src/step.rs
+++ b/cranelift/interpreter/src/step.rs
@@ -1162,9 +1162,27 @@ where
         Opcode::Iconcat => assign(Value::concat(arg(0)?, arg(1)?)?),
         Opcode::AtomicRmw => unimplemented!("AtomicRmw"),
         Opcode::AtomicCas => unimplemented!("AtomicCas"),
-        Opcode::AtomicLoad => unimplemented!("AtomicLoad"),
-        Opcode::AtomicStore => unimplemented!("AtomicStore"),
-        Opcode::Fence => unimplemented!("Fence"),
+        Opcode::AtomicLoad => {
+            let load_ty = inst_context.controlling_type().unwrap();
+            let addr = arg(0)?.into_int()? as u64;
+            // We are doing a regular load here, this isn't actually thread safe.
+            assign_or_memtrap(
+                Address::try_from(addr).and_then(|addr| state.checked_load(addr, load_ty)),
+            )
+        }
+        Opcode::AtomicStore => {
+            let val = arg(0)?;
+            let addr = arg(1)?.into_int()? as u64;
+            // We are doing a regular store here, this isn't actually thread safe.
+            continue_or_memtrap(
+                Address::try_from(addr).and_then(|addr| state.checked_store(addr, val)),
+            )
+        }
+        Opcode::Fence => {
+            // The interpreter always runs in a single threaded context, so we don't
+            // actually need to emit a fence here.
+            ControlFlow::Continue
+        }
         Opcode::WideningPairwiseDotProductS => {
             let ctrl_ty = types::I16X8;
             let new_type = ctrl_ty.merge_lanes().unwrap();
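
For reference, a minimal standalone sketch (not part of the patch) of the aligned-offset arithmetic that `generate_load_store_address` uses above; `pick(n)` and `aligned_offset` are hypothetical names, with `pick(n)` standing in for arbitrary's `Unstructured::int_in_range(0..=n)`:

    // Illustrative only: pick an offset that is a multiple of the access size, so that
    // base + offset keeps the access aligned (assuming the slot base is itself aligned,
    // as the patch's doc comment states) while leaving at least `min_size` bytes in bounds.
    fn aligned_offset(slot_size: u32, min_size: u32, pick: impl Fn(u32) -> u32) -> u32 {
        // stack_slot_with_size guarantees slot_size >= min_size.
        let max_offset = slot_size - min_size;
        pick(max_offset / min_size) * min_size
    }

    // e.g. slot_size = 16, min_size = 8: pick(1) yields 0 or 1, so the offset is 0 or 8.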