diff --git a/cranelift/codegen/meta/src/shared/instructions.rs b/cranelift/codegen/meta/src/shared/instructions.rs index ffded34f95..04cbc87065 100644 --- a/cranelift/codegen/meta/src/shared/instructions.rs +++ b/cranelift/codegen/meta/src/shared/instructions.rs @@ -4600,8 +4600,7 @@ pub(crate) fn define( r#" Atomically load from memory at `p`. - This is a polymorphic instruction that can load any value type which has a memory - representation. It should only be used for integer types with 8, 16, 32 or 64 bits. + It should only be used for integer types with 32 or 64 bits. This operation is sequentially consistent and creates happens-before edges that order normal (non-atomic) loads and stores. "#, @@ -4613,14 +4612,124 @@ pub(crate) fn define( .other_side_effects(true), ); + ig.push( + Inst::new( + "atomic_uload8", + r#" + Atomically load 8 bits from memory at `p` and zero-extend to either 32 or 64 bits. + + This is equivalent to ``load.i8`` followed by ``uextend``. + + This operation is sequentially consistent and creates happens-before edges that order + normal (non-atomic) loads and stores. + "#, + &formats.load_no_offset, + ) + .operands_in(vec![MemFlags, p]) + .operands_out(vec![a]) + .can_load(true) + .other_side_effects(true), + ); + + ig.push( + Inst::new( + "atomic_uload16", + r#" + Atomically load 16 bits from memory at `p` and zero-extend to either 32 or 64 bits. + + This is equivalent to ``load.i16`` followed by ``uextend``. + + This operation is sequentially consistent and creates + happens-before edges that order normal (non-atomic) loads and stores. + "#, + &formats.load_no_offset, + ) + .operands_in(vec![MemFlags, p]) + .operands_out(vec![a]) + .can_load(true) + .other_side_effects(true), + ); + + ig.push( + Inst::new( + "atomic_uload32", + r#" + Atomically load 32 bits from memory at `p` and zero-extend to 64 bits. + + This is equivalent to ``load.i32`` followed by ``uextend``. + + This operation is sequentially consistent and creates + happens-before edges that order normal (non-atomic) loads and stores. + "#, + &formats.load_no_offset, + ) + .operands_in(vec![MemFlags, p]) + .operands_out(vec![a]) + .can_load(true) + .other_side_effects(true), + ); + ig.push( Inst::new( "atomic_store", r#" Atomically store `x` to memory at `p`. - This is a polymorphic instruction that can store any value type with a memory - representation. It should only be used for integer types with 8, 16, 32 or 64 bits. + This is a polymorphic instruction that can store a 32 or 64-bit value. + This operation is sequentially consistent and creates happens-before edges that order + normal (non-atomic) loads and stores. + "#, + &formats.store_no_offset, + ) + .operands_in(vec![MemFlags, x, p]) + .can_store(true) + .other_side_effects(true), + ); + + ig.push( + Inst::new( + "atomic_store8", + r#" + Atomically store the low 8 bits of `x` to memory at `p`. + + This is equivalent to ``ireduce.i8`` followed by ``store.i8``. + + This operation is sequentially consistent and creates happens-before edges that order + normal (non-atomic) loads and stores. + "#, + &formats.store_no_offset, + ) + .operands_in(vec![MemFlags, x, p]) + .can_store(true) + .other_side_effects(true), + ); + + ig.push( + Inst::new( + "atomic_store16", + r#" + Atomically store the low 16 bits of `x` to memory at `p`. + + This is equivalent to ``ireduce.i16`` followed by ``store.i16``. + + This operation is sequentially consistent and creates happens-before edges that order + normal (non-atomic) loads and stores. 
+ "#, + &formats.store_no_offset, + ) + .operands_in(vec![MemFlags, x, p]) + .can_store(true) + .other_side_effects(true), + ); + + ig.push( + Inst::new( + "atomic_store32", + r#" + Atomically store the low 32 bits of `x` to memory at `p`. + + This is equivalent to ``ireduce.i32`` followed by ``store.i32``. + This operation is sequentially consistent and creates happens-before edges that order normal (non-atomic) loads and stores. "#, diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs index 5374de6bf8..ce669459e1 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs @@ -498,7 +498,7 @@ fn enc_dmb_ish() -> u32 { 0xD5033BBF } -fn enc_ldxr(ty: Type, rt: Writable, rn: Reg) -> u32 { +fn enc_ldar(ty: Type, rt: Writable, rn: Reg) -> u32 { let sz = match ty { I64 => 0b11, I32 => 0b10, @@ -506,13 +506,13 @@ fn enc_ldxr(ty: Type, rt: Writable, rn: Reg) -> u32 { I8 => 0b00, _ => unreachable!(), }; - 0b00001000_01011111_01111100_00000000 + 0b00_001000_1_1_0_11111_1_11111_00000_00000 | (sz << 30) | (machreg_to_gpr(rn) << 5) | machreg_to_gpr(rt.to_reg()) } -fn enc_stxr(ty: Type, rs: Writable, rt: Reg, rn: Reg) -> u32 { +fn enc_stlr(ty: Type, rt: Reg, rn: Reg) -> u32 { let sz = match ty { I64 => 0b11, I32 => 0b10, @@ -520,7 +520,35 @@ fn enc_stxr(ty: Type, rs: Writable, rt: Reg, rn: Reg) -> u32 { I8 => 0b00, _ => unreachable!(), }; - 0b00001000_00000000_01111100_00000000 + 0b00_001000_100_11111_1_11111_00000_00000 + | (sz << 30) + | (machreg_to_gpr(rn) << 5) + | machreg_to_gpr(rt) +} + +fn enc_ldaxr(ty: Type, rt: Writable, rn: Reg) -> u32 { + let sz = match ty { + I64 => 0b11, + I32 => 0b10, + I16 => 0b01, + I8 => 0b00, + _ => unreachable!(), + }; + 0b00_001000_0_1_0_11111_1_11111_00000_00000 + | (sz << 30) + | (machreg_to_gpr(rn) << 5) + | machreg_to_gpr(rt.to_reg()) +} + +fn enc_stlxr(ty: Type, rs: Writable, rt: Reg, rn: Reg) -> u32 { + let sz = match ty { + I64 => 0b11, + I32 => 0b10, + I16 => 0b01, + I8 => 0b00, + _ => unreachable!(), + }; + 0b00_001000_000_00000_1_11111_00000_00000 | (sz << 30) | (machreg_to_gpr(rs.to_reg()) << 16) | (machreg_to_gpr(rn) << 5) @@ -1286,20 +1314,18 @@ impl MachInstEmit for Inst { } &Inst::AtomicRMW { ty, op } => { /* Emit this: - dmb ish again: - ldxr{,b,h} x/w27, [x25] + ldaxr{,b,h} x/w27, [x25] op x28, x27, x26 // op is add,sub,and,orr,eor - stxr{,b,h} w24, x/w28, [x25] + stlxr{,b,h} w24, x/w28, [x25] cbnz x24, again - dmb ish Operand conventions: IN: x25 (addr), x26 (2nd arg for op) OUT: x27 (old value), x24 (trashed), x28 (trashed) It is unfortunate that, per the ARM documentation, x28 cannot be used for - both the store-data and success-flag operands of stxr. This causes the + both the store-data and success-flag operands of stlxr. This causes the instruction's behaviour to be "CONSTRAINED UNPREDICTABLE", so we use x24 instead for the success-flag. 
@@ -1320,15 +1346,13 @@ impl MachInstEmit for Inst { let x28wr = writable_xreg(28); let again_label = sink.get_label(); - sink.put4(enc_dmb_ish()); // dmb ish - // again: sink.bind_label(again_label); let srcloc = state.cur_srcloc(); if srcloc != SourceLoc::default() { sink.add_trap(srcloc, TrapCode::HeapOutOfBounds); } - sink.put4(enc_ldxr(ty, x27wr, x25)); // ldxr x27, [x25] + sink.put4(enc_ldaxr(ty, x27wr, x25)); // ldaxr x27, [x25] match op { AtomicRmwOp::Xchg => { @@ -1420,19 +1444,17 @@ impl MachInstEmit for Inst { if srcloc != SourceLoc::default() { sink.add_trap(srcloc, TrapCode::HeapOutOfBounds); } - sink.put4(enc_stxr(ty, x24wr, x28, x25)); // stxr w24, x28, [x25] + sink.put4(enc_stlxr(ty, x24wr, x28, x25)); // stlxr w24, x28, [x25] // cbnz w24, again // Note, we're actually testing x24, and relying on the default zero-high-half - // rule in the assignment that `stxr` does. + // rule in the assignment that `stlxr` does. let br_offset = sink.cur_offset(); sink.put4(enc_conditional_br( BranchTarget::Label(again_label), CondBrKind::NotZero(x24), )); sink.use_label_at_offset(br_offset, again_label, LabelUse::Branch19); - - sink.put4(enc_dmb_ish()); // dmb ish } &Inst::AtomicCAS { rs, rt, rn, ty } => { let size = match ty { @@ -1447,22 +1469,18 @@ impl MachInstEmit for Inst { } &Inst::AtomicCASLoop { ty } => { /* Emit this: - dmb ish again: - ldxr{,b,h} x/w27, [x25] - and x24, x26, MASK (= 2^size_bits - 1) - cmp x27, x24 + ldaxr{,b,h} x/w27, [x25] + cmp x27, x/w26 uxt{b,h} b.ne out - stxr{,b,h} w24, x/w28, [x25] + stlxr{,b,h} w24, x/w28, [x25] cbnz x24, again out: - dmb ish Operand conventions: IN: x25 (addr), x26 (expected value), x28 (replacement value) OUT: x27 (old value), x24 (trashed) */ - let xzr = zero_reg(); let x24 = xreg(24); let x25 = xreg(25); let x26 = xreg(26); @@ -1474,37 +1492,25 @@ impl MachInstEmit for Inst { let again_label = sink.get_label(); let out_label = sink.get_label(); - sink.put4(enc_dmb_ish()); // dmb ish - // again: sink.bind_label(again_label); let srcloc = state.cur_srcloc(); if srcloc != SourceLoc::default() { sink.add_trap(srcloc, TrapCode::HeapOutOfBounds); } - sink.put4(enc_ldxr(ty, x27wr, x25)); // ldxr x27, [x25] + // ldaxr x27, [x25] + sink.put4(enc_ldaxr(ty, x27wr, x25)); - if ty == I64 { - // mov x24, x26 - sink.put4(enc_arith_rrr(0b101_01010_00_0, 0b000000, x24wr, xzr, x26)) - } else { - // and x24, x26, 0xFF/0xFFFF/0xFFFFFFFF - let (mask, s) = match ty { - I8 => (0xFF, 7), - I16 => (0xFFFF, 15), - I32 => (0xFFFFFFFF, 31), - _ => unreachable!(), - }; - sink.put4(enc_arith_rr_imml( - 0b100_100100, - ImmLogic::from_n_r_s(mask, true, 0, s, OperandSize::Size64).enc_bits(), - x26, - x24wr, - )) - } - - // cmp x27, x24 (== subs xzr, x27, x24) - sink.put4(enc_arith_rrr(0b111_01011_00_0, 0b000000, xzrwr, x27, x24)); + // The top 32-bits are zero-extended by the ldaxr so we don't + // have to use UXTW, just the x-form of the register. + let (bit21, extend_op) = match ty { + I8 => (0b1, 0b000000), + I16 => (0b1, 0b001000), + _ => (0b0, 0b000000), + }; + let bits_31_21 = 0b111_01011_000 | bit21; + // cmp x27, x26 (== subs xzr, x27, x26) + sink.put4(enc_arith_rrr(bits_31_21, extend_op, xzrwr, x27, x26)); // b.ne out let br_out_offset = sink.cur_offset(); @@ -1518,11 +1524,11 @@ impl MachInstEmit for Inst { if srcloc != SourceLoc::default() { sink.add_trap(srcloc, TrapCode::HeapOutOfBounds); } - sink.put4(enc_stxr(ty, x24wr, x28, x25)); // stxr w24, x28, [x25] + sink.put4(enc_stlxr(ty, x24wr, x28, x25)); // stlxr w24, x28, [x25] // cbnz w24, again. 
// Note, we're actually testing x24, and relying on the default zero-high-half - // rule in the assignment that `stxr` does. + // rule in the assignment that `stlxr` does. let br_again_offset = sink.cur_offset(); sink.put4(enc_conditional_br( BranchTarget::Label(again_label), @@ -1532,46 +1538,12 @@ impl MachInstEmit for Inst { // out: sink.bind_label(out_label); - sink.put4(enc_dmb_ish()); // dmb ish } - &Inst::AtomicLoad { ty, r_data, r_addr } => { - let op = match ty { - I8 => 0b0011100001, - I16 => 0b0111100001, - I32 => 0b1011100001, - I64 => 0b1111100001, - _ => unreachable!(), - }; - sink.put4(enc_dmb_ish()); // dmb ish - - let srcloc = state.cur_srcloc(); - if srcloc != SourceLoc::default() { - sink.add_trap(srcloc, TrapCode::HeapOutOfBounds); - } - let uimm12scaled_zero = UImm12Scaled::zero(I8 /*irrelevant*/); - sink.put4(enc_ldst_uimm12( - op, - uimm12scaled_zero, - r_addr, - r_data.to_reg(), - )); + &Inst::LoadAcquire { access_ty, rt, rn } => { + sink.put4(enc_ldar(access_ty, rt, rn)); } - &Inst::AtomicStore { ty, r_data, r_addr } => { - let op = match ty { - I8 => 0b0011100000, - I16 => 0b0111100000, - I32 => 0b1011100000, - I64 => 0b1111100000, - _ => unreachable!(), - }; - - let srcloc = state.cur_srcloc(); - if srcloc != SourceLoc::default() { - sink.add_trap(srcloc, TrapCode::HeapOutOfBounds); - } - let uimm12scaled_zero = UImm12Scaled::zero(I8 /*irrelevant*/); - sink.put4(enc_ldst_uimm12(op, uimm12scaled_zero, r_addr, r_data)); - sink.put4(enc_dmb_ish()); // dmb ish + &Inst::StoreRelease { access_ty, rt, rn } => { + sink.put4(enc_stlr(access_ty, rt, rn)); } &Inst::Fence {} => { sink.put4(enc_dmb_ish()); // dmb ish diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs index b27d183a94..9e45c6795c 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs @@ -5891,7 +5891,7 @@ fn test_aarch64_binemit() { ty: I16, op: inst_common::AtomicRmwOp::Xor, }, - "BF3B03D53B7F5F487C031ACA3C7F1848B8FFFFB5BF3B03D5", + "3BFF5F487C031ACA3CFF1848B8FFFFB5", "atomically { 16_bits_at_[x25]) Xor= x26 ; x27 = old_value_at_[x25]; x24,x28 = trash }", )); @@ -5900,7 +5900,7 @@ fn test_aarch64_binemit() { ty: I32, op: inst_common::AtomicRmwOp::Xchg, }, - "BF3B03D53B7F5F88FC031AAA3C7F1888B8FFFFB5BF3B03D5", + "3BFF5F88FC031AAA3CFF1888B8FFFFB5", "atomically { 32_bits_at_[x25]) Xchg= x26 ; x27 = old_value_at_[x25]; x24,x28 = trash }", )); insns.push(( @@ -5947,56 +5947,112 @@ fn test_aarch64_binemit() { Inst::AtomicCASLoop { ty: I8, }, - "BF3B03D53B7F5F08581F40927F0318EB610000543C7F180878FFFFB5BF3B03D5", + "3BFF5F087F033AEB610000543CFF180898FFFFB5", "atomically { compare-and-swap(8_bits_at_[x25], x26 -> x28), x27 = old_value_at_[x25]; x24 = trash }" )); + insns.push(( + Inst::AtomicCASLoop { + ty: I16, + }, + "3BFF5F487F233AEB610000543CFF184898FFFFB5", + "atomically { compare-and-swap(16_bits_at_[x25], x26 -> x28), x27 = old_value_at_[x25]; x24 = trash }" + )); + + insns.push(( + Inst::AtomicCASLoop { + ty: I32, + }, + "3BFF5F887F031AEB610000543CFF188898FFFFB5", + "atomically { compare-and-swap(32_bits_at_[x25], x26 -> x28), x27 = old_value_at_[x25]; x24 = trash }" + )); + insns.push(( Inst::AtomicCASLoop { ty: I64, }, - "BF3B03D53B7F5FC8F8031AAA7F0318EB610000543C7F18C878FFFFB5BF3B03D5", + "3BFF5FC87F031AEB610000543CFF18C898FFFFB5", "atomically { compare-and-swap(64_bits_at_[x25], x26 -> x28), x27 = old_value_at_[x25]; x24 = trash }" )); insns.push(( - Inst::AtomicLoad { - 
-            ty: I8,
-            r_data: writable_xreg(7),
-            r_addr: xreg(28),
+        Inst::LoadAcquire {
+            access_ty: I8,
+            rt: writable_xreg(7),
+            rn: xreg(28),
         },
-        "BF3B03D587034039",
-        "atomically { x7 = zero_extend_8_bits_at[x28] }",
+        "87FFDF08",
+        "ldarb w7, [x28]",
     ));
 
     insns.push((
-        Inst::AtomicLoad {
-            ty: I64,
-            r_data: writable_xreg(28),
-            r_addr: xreg(7),
+        Inst::LoadAcquire {
+            access_ty: I16,
+            rt: writable_xreg(2),
+            rn: xreg(3),
         },
-        "BF3B03D5FC0040F9",
-        "atomically { x28 = zero_extend_64_bits_at[x7] }",
+        "62FCDF48",
+        "ldarh w2, [x3]",
     ));
 
     insns.push((
-        Inst::AtomicStore {
-            ty: I16,
-            r_data: xreg(17),
-            r_addr: xreg(8),
+        Inst::LoadAcquire {
+            access_ty: I32,
+            rt: writable_xreg(15),
+            rn: xreg(0),
         },
-        "11010079BF3B03D5",
-        "atomically { 16_bits_at[x8] = x17 }",
+        "0FFCDF88",
+        "ldar w15, [x0]",
     ));
 
     insns.push((
-        Inst::AtomicStore {
-            ty: I32,
-            r_data: xreg(18),
-            r_addr: xreg(7),
+        Inst::LoadAcquire {
+            access_ty: I64,
+            rt: writable_xreg(28),
+            rn: xreg(7),
         },
-        "F20000B9BF3B03D5",
-        "atomically { 32_bits_at[x7] = x18 }",
+        "FCFCDFC8",
+        "ldar x28, [x7]",
+    ));
+
+    insns.push((
+        Inst::StoreRelease {
+            access_ty: I8,
+            rt: xreg(7),
+            rn: xreg(28),
+        },
+        "87FF9F08",
+        "stlrb w7, [x28]",
+    ));
+
+    insns.push((
+        Inst::StoreRelease {
+            access_ty: I16,
+            rt: xreg(2),
+            rn: xreg(3),
+        },
+        "62FC9F48",
+        "stlrh w2, [x3]",
+    ));
+
+    insns.push((
+        Inst::StoreRelease {
+            access_ty: I32,
+            rt: xreg(15),
+            rn: xreg(0),
+        },
+        "0FFC9F88",
+        "stlr w15, [x0]",
+    ));
+
+    insns.push((
+        Inst::StoreRelease {
+            access_ty: I64,
+            rt: xreg(28),
+            rn: xreg(7),
+        },
+        "FCFC9FC8",
+        "stlr x28, [x7]",
     ));
 
     insns.push((Inst::Fence {}, "BF3B03D5", "dmb ish"));
diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
index d498bc9b85..ce1b520429 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -789,10 +789,9 @@ pub enum Inst {
     },
 
     /// Similar to AtomicRMW, a compare-and-swap operation implemented using a load-linked
-    /// store-conditional loop. The sequence is both preceded and followed by a fence which is
-    /// at least as comprehensive as that of the `Fence` instruction below. This instruction
-    /// is sequentially consistent. Note that the operand conventions, although very similar
-    /// to AtomicRMW, are different:
+    /// store-conditional loop.
+    /// This instruction is sequentially consistent.
+    /// Note that the operand conventions, although very similar to AtomicRMW, are different:
     ///
     /// x25 (rd) address
     /// x26 (rd) expected value
@@ -803,22 +802,21 @@ pub enum Inst {
         ty: Type, // I8, I16, I32 or I64
     },
 
-    /// Read `ty` bits from address `r_addr`, zero extend the loaded value to 64 bits and put it
-    /// in `r_data`. The load instruction is preceded by a fence at least as comprehensive as
-    /// that of the `Fence` instruction below. This instruction is sequentially consistent.
-    AtomicLoad {
-        ty: Type, // I8, I16, I32 or I64
-        r_data: Writable<Reg>,
-        r_addr: Reg,
+    /// Read `access_ty` bits from address `rn`, either 8, 16, 32 or 64-bits, and put
+    /// it in `rt`, optionally zero-extending to fill a word or double word result.
+    /// This instruction is sequentially consistent.
+    LoadAcquire {
+        access_ty: Type, // I8, I16, I32 or I64
+        rt: Writable<Reg>,
+        rn: Reg,
     },
 
-    /// Write the lowest `ty` bits of `r_data` to address `r_addr`, with a memory fence
-    /// instruction following the store. The fence is at least as comprehensive as that of the
-    /// `Fence` instruction below.
This instruction is sequentially consistent. - AtomicStore { - ty: Type, // I8, I16, I32 or I64 - r_data: Reg, - r_addr: Reg, + /// Write the lowest `ty` bits of `rt` to address `rn`. + /// This instruction is sequentially consistent. + StoreRelease { + access_ty: Type, // I8, I16, I32 or I64 + rt: Reg, + rn: Reg, }, /// A memory fence. This must provide ordering to ensure that, at a minimum, neither loads @@ -1940,13 +1938,13 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) { collector.add_def(writable_xreg(24)); collector.add_def(writable_xreg(27)); } - &Inst::AtomicLoad { r_data, r_addr, .. } => { - collector.add_use(r_addr); - collector.add_def(r_data); + &Inst::LoadAcquire { rt, rn, .. } => { + collector.add_use(rn); + collector.add_def(rt); } - &Inst::AtomicStore { r_data, r_addr, .. } => { - collector.add_use(r_addr); - collector.add_use(r_data); + &Inst::StoreRelease { rt, rn, .. } => { + collector.add_use(rn); + collector.add_use(rt); } &Inst::Fence {} => {} &Inst::FpuMove64 { rd, rn } => { @@ -2579,21 +2577,21 @@ fn aarch64_map_regs(inst: &mut Inst, mapper: &RUM) { &mut Inst::AtomicCASLoop { .. } => { // There are no vregs to map in this insn. } - &mut Inst::AtomicLoad { - ref mut r_data, - ref mut r_addr, + &mut Inst::LoadAcquire { + ref mut rt, + ref mut rn, .. } => { - map_def(mapper, r_data); - map_use(mapper, r_addr); + map_def(mapper, rt); + map_use(mapper, rn); } - &mut Inst::AtomicStore { - ref mut r_data, - ref mut r_addr, + &mut Inst::StoreRelease { + ref mut rt, + ref mut rn, .. } => { - map_use(mapper, r_data); - map_use(mapper, r_addr); + map_use(mapper, rt); + map_use(mapper, rn); } &mut Inst::Fence {} => {} &mut Inst::FpuMove64 { @@ -3643,25 +3641,35 @@ impl Inst { "atomically {{ compare-and-swap({}_bits_at_[x25], x26 -> x28), x27 = old_value_at_[x25]; x24 = trash }}", ty.bits()) } - &Inst::AtomicLoad { - ty, r_data, r_addr, .. + &Inst::LoadAcquire { + access_ty, rt, rn, .. } => { - format!( - "atomically {{ {} = zero_extend_{}_bits_at[{}] }}", - r_data.show_rru(mb_rru), - ty.bits(), - r_addr.show_rru(mb_rru) - ) + let (op, ty) = match access_ty { + I8 => ("ldarb", I32), + I16 => ("ldarh", I32), + I32 => ("ldar", I32), + I64 => ("ldar", I64), + _ => panic!("Unsupported type: {}", access_ty), + }; + let size = OperandSize::from_ty(ty); + let rt = show_ireg_sized(rt.to_reg(), mb_rru, size); + let rn = rn.show_rru(mb_rru); + format!("{} {}, [{}]", op, rt, rn) } - &Inst::AtomicStore { - ty, r_data, r_addr, .. + &Inst::StoreRelease { + access_ty, rt, rn, .. 
} => { - format!( - "atomically {{ {}_bits_at[{}] = {} }}", - ty.bits(), - r_addr.show_rru(mb_rru), - r_data.show_rru(mb_rru) - ) + let (op, ty) = match access_ty { + I8 => ("stlrb", I32), + I16 => ("stlrh", I32), + I32 => ("stlr", I32), + I64 => ("stlr", I64), + _ => panic!("Unsupported type: {}", access_ty), + }; + let size = OperandSize::from_ty(ty); + let rt = show_ireg_sized(rt, mb_rru, size); + let rn = rn.show_rru(mb_rru); + format!("{} {}, [{}]", op, rt, rn) } &Inst::Fence {} => { format!("dmb ish") diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs index f9440dbbb1..8a4df2026b 100644 --- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs +++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs @@ -1522,28 +1522,40 @@ pub(crate) fn lower_insn_to_regs>( } } - Opcode::AtomicLoad => { - let r_data = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - let r_addr = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); - let ty_access = ty.unwrap(); - assert!(is_valid_atomic_transaction_ty(ty_access)); - ctx.emit(Inst::AtomicLoad { - ty: ty_access, - r_data, - r_addr, - }); + Opcode::AtomicLoad + | Opcode::AtomicUload8 + | Opcode::AtomicUload16 + | Opcode::AtomicUload32 => { + let rt = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let ty = ty.unwrap(); + let access_ty = match op { + Opcode::AtomicLoad => ty, + Opcode::AtomicUload8 => I8, + Opcode::AtomicUload16 => I16, + Opcode::AtomicUload32 => I32, + _ => panic!(), + }; + assert!(is_valid_atomic_transaction_ty(access_ty)); + ctx.emit(Inst::LoadAcquire { access_ty, rt, rn }); } - Opcode::AtomicStore => { - let r_data = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); - let r_addr = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); - let ty_access = ctx.input_ty(insn, 0); - assert!(is_valid_atomic_transaction_ty(ty_access)); - ctx.emit(Inst::AtomicStore { - ty: ty_access, - r_data, - r_addr, - }); + Opcode::AtomicStore + | Opcode::AtomicStore32 + | Opcode::AtomicStore16 + | Opcode::AtomicStore8 => { + let rt = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); + let ty = ctx.input_ty(insn, 0); + let access_ty = match op { + Opcode::AtomicStore => ty, + Opcode::AtomicStore32 => I32, + Opcode::AtomicStore16 => I16, + Opcode::AtomicStore8 => I8, + _ => unreachable!(), + }; + assert!(is_valid_atomic_transaction_ty(access_ty)); + ctx.emit(Inst::StoreRelease { access_ty, rt, rn }); } Opcode::Fence => { diff --git a/cranelift/codegen/src/isa/s390x/lower.rs b/cranelift/codegen/src/isa/s390x/lower.rs index b13edc4bb2..8ff375788a 100644 --- a/cranelift/codegen/src/isa/s390x/lower.rs +++ b/cranelift/codegen/src/isa/s390x/lower.rs @@ -2734,37 +2734,61 @@ fn lower_insn_to_regs>( ctx.emit(Inst::AtomicCas64 { rd, rn, mem }); } } - Opcode::AtomicLoad => { + Opcode::AtomicLoad + | Opcode::AtomicUload8 + | Opcode::AtomicUload16 + | Opcode::AtomicUload32 => { let flags = ctx.memflags(insn).unwrap(); let endianness = flags.endianness(Endianness::Big); let ty = ty.unwrap(); - assert!(is_valid_atomic_transaction_ty(ty)); + let access_ty = match op { + Opcode::AtomicLoad => ty, + Opcode::AtomicUload8 => types::I8, + Opcode::AtomicUload16 => types::I16, + Opcode::AtomicUload32 => types::I32, + _ => unreachable!(), + }; + assert!(is_valid_atomic_transaction_ty(access_ty)); let mem = lower_address(ctx, &inputs[..], 0, flags); let rd = 
get_output_reg(ctx, outputs[0]).only_reg().unwrap(); if endianness == Endianness::Big { - ctx.emit(match ty_bits(ty) { - 8 => Inst::Load32ZExt8 { rd, mem }, - 16 => Inst::Load32ZExt16 { rd, mem }, - 32 => Inst::Load32 { rd, mem }, - 64 => Inst::Load64 { rd, mem }, + ctx.emit(match (ty_bits(access_ty), ty_bits(ty)) { + (8, 32) => Inst::Load32ZExt8 { rd, mem }, + (8, 64) => Inst::Load64ZExt8 { rd, mem }, + (16, 32) => Inst::Load32ZExt16 { rd, mem }, + (16, 64) => Inst::Load64ZExt16 { rd, mem }, + (32, 32) => Inst::Load32 { rd, mem }, + (32, 64) => Inst::Load64ZExt32 { rd, mem }, + (64, 64) => Inst::Load64 { rd, mem }, _ => panic!("Unsupported size in load"), }); } else { - ctx.emit(match ty_bits(ty) { - 8 => Inst::Load32ZExt8 { rd, mem }, - 16 => Inst::LoadRev16 { rd, mem }, - 32 => Inst::LoadRev32 { rd, mem }, - 64 => Inst::LoadRev64 { rd, mem }, + ctx.emit(match (ty_bits(access_ty), ty_bits(ty)) { + (8, 32) => Inst::Load32ZExt8 { rd, mem }, + (8, 64) => Inst::Load64ZExt8 { rd, mem }, + (16, 32) => Inst::LoadRev16 { rd, mem }, + (32, 32) => Inst::LoadRev32 { rd, mem }, + (64, 64) => Inst::LoadRev64 { rd, mem }, _ => panic!("Unsupported size in load"), }); } } - Opcode::AtomicStore => { + Opcode::AtomicStore + | Opcode::AtomicStore32 + | Opcode::AtomicStore16 + | Opcode::AtomicStore8 => { let flags = ctx.memflags(insn).unwrap(); let endianness = flags.endianness(Endianness::Big); - let ty = ctx.input_ty(insn, 0); + let data_ty = ctx.input_ty(insn, 0); + let ty = match op { + Opcode::AtomicStore => data_ty, + Opcode::AtomicStore32 => types::I32, + Opcode::AtomicStore16 => types::I16, + Opcode::AtomicStore8 => types::I8, + _ => unreachable!(), + }; assert!(is_valid_atomic_transaction_ty(ty)); let mem = lower_address(ctx, &inputs[1..], 0, flags); diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index b4c05cee8f..19433dc71e 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -5825,7 +5825,10 @@ fn lower_insn_to_regs>( ctx.emit(Inst::gen_move(dst, regs::rax(), types::I64)); } - Opcode::AtomicLoad => { + Opcode::AtomicLoad + | Opcode::AtomicUload8 + | Opcode::AtomicUload16 + | Opcode::AtomicUload32 => { // This is a normal load. The x86-TSO memory model provides sufficient sequencing // to satisfy the CLIF synchronisation requirements for `AtomicLoad` without the // need for any fence instructions. @@ -5847,11 +5850,21 @@ fn lower_insn_to_regs>( } } - Opcode::AtomicStore => { + Opcode::AtomicStore + | Opcode::AtomicStore32 + | Opcode::AtomicStore16 + | Opcode::AtomicStore8 => { // This is a normal store, followed by an `mfence` instruction. 
let data = put_input_in_reg(ctx, inputs[0]); let addr = lower_to_amode(ctx, inputs[1], 0); - let ty_access = ctx.input_ty(insn, 0); + let data_ty = ctx.input_ty(insn, 0); + let ty_access = match op { + Opcode::AtomicStore => data_ty, + Opcode::AtomicStore32 => types::I32, + Opcode::AtomicStore16 => types::I16, + Opcode::AtomicStore8 => types::I8, + _ => unreachable!(), + }; assert!(is_valid_atomic_transaction_ty(ty_access)); ctx.emit(Inst::store(ty_access, data, addr)); diff --git a/cranelift/filetests/filetests/isa/aarch64/atomic_load.clif b/cranelift/filetests/filetests/isa/aarch64/atomic_load.clif new file mode 100644 index 0000000000..31af721015 --- /dev/null +++ b/cranelift/filetests/filetests/isa/aarch64/atomic_load.clif @@ -0,0 +1,72 @@ +test compile +target aarch64 + +function %atomic_load_i64(i64) -> i64 { +block0(v0: i64): + v1 = atomic_load.i64 v0 + return v1 +} + +; check: ldar x0, [x0] +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %atomic_load_i32(i64) -> i32 { +block0(v0: i64): + v1 = atomic_load.i32 v0 + return v1 +} + +; check: ldar w0, [x0] +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %atomic_uload_i32_i64(i64) -> i64 { +block0(v0: i64): + v1 = atomic_uload32.i64 v0 + return v1 +} + +; check: ldar w0, [x0] +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %atomic_uload_i16_i32(i64) -> i32 { +block0(v0: i64): + v1 = atomic_uload16.i32 v0 + return v1 +} + +; check: ldarh w0, [x0] +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %atomic_uload_i16_i64(i64) -> i64 { +block0(v0: i64): + v1 = atomic_uload16.i64 v0 + return v1 +} + +; check: ldarh w0, [x0] +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %atomic_uload_i8_i32(i64) -> i32 { +block0(v0: i64): + v1 = atomic_uload8.i32 v0 + return v1 +} + +; check: ldarb w0, [x0] +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %atomic_uload_i8_i64(i64) -> i64 { +block0(v0: i64): + v1 = atomic_uload8.i64 v0 + return v1 +} + +; check: ldarb w0, [x0] +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret diff --git a/cranelift/filetests/filetests/isa/aarch64/atomic_store.clif b/cranelift/filetests/filetests/isa/aarch64/atomic_store.clif new file mode 100644 index 0000000000..9c0cd529d7 --- /dev/null +++ b/cranelift/filetests/filetests/isa/aarch64/atomic_store.clif @@ -0,0 +1,72 @@ +test compile +target aarch64 + +function %atomic_store_i64(i64, i64) { +block0(v0: i64, v1: i64): + atomic_store.i64 v0, v1 + return +} + +; check: stlr x0, [x1] +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %atomic_store_i32(i32, i64) { +block0(v0: i32, v1: i64): + atomic_store.i32 v0, v1 + return +} + +; check: stlr w0, [x1] +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %atomic_ustore_i32_i64(i64, i64) { +block0(v0: i64, v1: i64): + atomic_store32.i64 v0, v1 + return +} + +; check: stlr w0, [x1] +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %atomic_ustore_i16_i32(i32, i64) { +block0(v0: i32, v1: i64): + atomic_store16.i32 v0, v1 + return +} + +; check: stlrh w0, [x1] +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %atomic_ustore_i16_i64(i64, i64) { +block0(v0: i64, v1: i64): + atomic_store16.i64 v0, v1 + return +} + +; check: stlrh w0, [x1] +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %atomic_ustore_i8_i32(i32, i64) { +block0(v0: i32, v1: i64): + atomic_store8.i32 v0, v1 + return +} + +; check: stlrb w0, [x1] +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %atomic_ustore_i8_i64(i64, i64) { +block0(v0: i64, v1: 
i64): + atomic_store8.i64 v0, v1 + return +} + +; check: stlrb w0, [x1] +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret diff --git a/cranelift/filetests/filetests/isa/s390x/atomic_load-little.clif b/cranelift/filetests/filetests/isa/s390x/atomic_load-little.clif index 5556176bbb..629c432370 100644 --- a/cranelift/filetests/filetests/isa/s390x/atomic_load-little.clif +++ b/cranelift/filetests/filetests/isa/s390x/atomic_load-little.clif @@ -41,29 +41,29 @@ block0: ; check: larl %r1, %sym + 0 ; lrv %r2, 0(%r1) ; nextln: br %r14 -function %atomic_load_i16(i64) -> i16 { +function %atomic_load_i16(i64) -> i32 { block0(v0: i64): - v1 = atomic_load.i16 little v0 + v1 = atomic_uload16.i32 little v0 return v1 } ; check: lrvh %r2, 0(%r2) ; nextln: br %r14 -function %atomic_load_i16_sym() -> i16 { +function %atomic_load_i16_sym() -> i32 { gv0 = symbol colocated %sym block0: v0 = symbol_value.i64 gv0 - v1 = atomic_load.i16 little v0 + v1 = atomic_uload16.i32 little v0 return v1 } ; check: larl %r1, %sym + 0 ; lrvh %r2, 0(%r1) ; nextln: br %r14 -function %atomic_load_i8(i64) -> i8 { +function %atomic_load_i8(i64) -> i32 { block0(v0: i64): - v1 = atomic_load.i8 little v0 + v1 = atomic_uload8.i32 little v0 return v1 } diff --git a/cranelift/filetests/filetests/isa/s390x/atomic_load.clif b/cranelift/filetests/filetests/isa/s390x/atomic_load.clif index b361aaa4c7..9a58de52d1 100644 --- a/cranelift/filetests/filetests/isa/s390x/atomic_load.clif +++ b/cranelift/filetests/filetests/isa/s390x/atomic_load.clif @@ -41,29 +41,29 @@ block0: ; check: lrl %r2, %sym + 0 ; nextln: br %r14 -function %atomic_load_i16(i64) -> i16 { +function %atomic_load_i16(i64) -> i32 { block0(v0: i64): - v1 = atomic_load.i16 v0 + v1 = atomic_uload16.i32 v0 return v1 } ; check: llh %r2, 0(%r2) ; nextln: br %r14 -function %atomic_load_i16_sym() -> i16 { +function %atomic_load_i16_sym() -> i32 { gv0 = symbol colocated %sym block0: v0 = symbol_value.i64 gv0 - v1 = atomic_load.i16 v0 + v1 = atomic_uload16.i32 v0 return v1 } ; check: llhrl %r2, %sym + 0 ; nextln: br %r14 -function %atomic_load_i8(i64) -> i8 { +function %atomic_load_i8(i64) -> i32 { block0(v0: i64): - v1 = atomic_load.i8 v0 + v1 = atomic_uload8.i32 v0 return v1 } diff --git a/cranelift/interpreter/src/step.rs b/cranelift/interpreter/src/step.rs index c9c0372980..2b892de57c 100644 --- a/cranelift/interpreter/src/step.rs +++ b/cranelift/interpreter/src/step.rs @@ -625,8 +625,14 @@ where Opcode::Iconcat => assign(Value::concat(arg(0)?, arg(1)?)?), Opcode::AtomicRmw => unimplemented!("AtomicRmw"), Opcode::AtomicCas => unimplemented!("AtomicCas"), - Opcode::AtomicLoad => unimplemented!("AtomicLoad"), - Opcode::AtomicStore => unimplemented!("AtomicStore"), + Opcode::AtomicLoad + | Opcode::AtomicUload8 + | Opcode::AtomicUload16 + | Opcode::AtomicUload32 => unimplemented!("AtomicLoad"), + Opcode::AtomicStore + | Opcode::AtomicStore8 + | Opcode::AtomicStore16 + | Opcode::AtomicStore32 => unimplemented!("AtomicStore"), Opcode::Fence => unimplemented!("Fence"), Opcode::WideningPairwiseDotProductS => unimplemented!("WideningPairwiseDotProductS"), Opcode::SqmulRoundSat => unimplemented!("SqmulRoundSat"),
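Usage sketch (illustrative only, not part of the patch; the function names are made up): with the new zero-extending loads and narrowing stores, a frontend can fold the old load-then-extend and reduce-then-store patterns into a single sequentially consistent access, which aarch64 now lowers to a plain ldar*/stlr* rather than the old fence-bracketed sequences, as the filetests above check.

; Old form: the narrow atomic load produced a narrow value that then had to be widened.
function %old_uload16(i64) -> i64 {
block0(v0: i64):
    v1 = atomic_load.i16 v0
    v2 = uextend.i64 v1
    return v2
}

; New form: the access width is carried by the opcode and the result stays i64,
; matching the "equivalent to ``load.i16`` followed by ``uextend``" wording above.
function %new_uload16(i64) -> i64 {
block0(v0: i64):
    v1 = atomic_uload16.i64 v0
    return v1
}

; Store direction: the low 16 bits of v0 are written to [v1], replacing an
; explicit ireduce followed by atomic_store.
function %new_store16(i64, i64) {
block0(v0: i64, v1: i64):
    atomic_store16.i64 v0, v1
    return
}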