[AArch64] Improve AtomicRMWLoop (#3839)

Add more tests, use accurate disassembly, respect data sizes and
simplify the Xchg implementation.

Copyright (c) 2022, Arm Limited
Author: Sam Parker
Date: 2022-02-23 18:47:59 +00:00
Committed by: GitHub
Parent: 141af7523a
Commit: d307a4ab9a
5 changed files with 1442 additions and 44 deletions
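
For reference, the pseudo-instruction models a classic load-exclusive/store-exclusive read-modify-write. A minimal semantic sketch using Rust atomics (not the backend's actual codegen, which emits the ldaxr/stlxr pairs shown in the tests below):

    use std::sync::atomic::{AtomicU32, Ordering};

    /// What AtomicRMWLoop computes for Umin: fetch_update retries on
    /// contention just as `cbnz w24, 1b` re-runs the emitted loop, and
    /// the returned value is the old memory contents (x27 in the loop).
    fn atomic_umin(cell: &AtomicU32, arg: u32) -> u32 {
        cell.fetch_update(Ordering::SeqCst, Ordering::SeqCst, |old| {
            Some(old.min(arg)) // cmp + csel ..., lo in the emitted code
        })
        .unwrap() // the closure always returns Some, so this cannot fail
    }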


@@ -1338,10 +1338,6 @@ impl MachInstEmit for Inst {
both the store-data and success-flag operands of stlxr. This causes the
instruction's behaviour to be "CONSTRAINED UNPREDICTABLE", so we use x24
instead for the success-flag.
In the case where the operation is 'xchg', the second insn is instead
mov x28, x26
so that we simply write in the destination, the "2nd arg for op".
*/
// TODO: We should not hardcode registers here, a better idea would be to
// pass some scratch registers in the AtomicRMWLoop pseudo-instruction, and use those
@@ -1363,19 +1359,17 @@ impl MachInstEmit for Inst {
sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
}
sink.put4(enc_ldaxr(ty, x27wr, x25)); // ldaxr x27, [x25]
let size = OperandSize::from_ty(ty);
match op {
AtomicRmwOp::Xchg => {
// mov x28, x26
Inst::Mov64 { rd: x28wr, rm: x26 }.emit(sink, emit_info, state);
}
AtomicRmwOp::Xchg => {} // do nothing
AtomicRmwOp::Nand => {
// and x28, x27, x26
// mvn x28, x28
Inst::AluRRR {
alu_op: ALUOp::And,
size: OperandSize::Size64,
size,
rd: x28wr,
rn: x27,
rm: x26,
@@ -1384,7 +1378,7 @@ impl MachInstEmit for Inst {
Inst::AluRRR {
alu_op: ALUOp::OrrNot,
size: OperandSize::Size64,
size,
rd: x28wr,
rn: xzr,
rm: x28,
@@ -1408,7 +1402,7 @@ impl MachInstEmit for Inst {
Inst::AluRRR {
alu_op: ALUOp::SubS,
size: OperandSize::from_ty(ty),
size,
rd: writable_zero_reg(),
rn: x27,
rm: x26,
@@ -1441,7 +1435,7 @@ impl MachInstEmit for Inst {
Inst::AluRRR {
alu_op,
size: OperandSize::Size64,
size,
rd: x28wr,
rn: x27,
rm: x26,
@@ -1454,7 +1448,11 @@ impl MachInstEmit for Inst {
if srcloc != SourceLoc::default() {
sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
}
if op == AtomicRmwOp::Xchg {
sink.put4(enc_stlxr(ty, x24wr, x26, x25)); // stlxr w24, x26, [x25]
} else {
sink.put4(enc_stlxr(ty, x24wr, x28, x25)); // stlxr w24, x28, [x25]
}
// cbnz w24, again
// Note, we're actually testing x24, and relying on the default zero-high-half
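
Two effects of these hunks are worth spelling out. First, `size` is now derived once from the CLIF type, so sub-64-bit operations use W registers and match the access width of ldaxr{b,h}/stlxr{b,h} instead of always running 64-bit ALU ops. Second, Xchg skips the ALU step entirely: the store-exclusive writes x26 (the second argument) directly, so x28 is never touched. A sketch of the size split, assuming OperandSize::from_ty follows the 32-/64-bit division the test disassembly below exhibits (this is a stand-in, not the real definition):

    enum OperandSize { Size32, Size64 }

    // I8/I16/I32 operate on W registers, I64 on X registers.
    fn operand_size_from_bits(bits: u32) -> OperandSize {
        if bits <= 32 { OperandSize::Size32 } else { OperandSize::Size64 }
    }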


@@ -6105,8 +6105,80 @@ fn test_aarch64_binemit() {
ty: I16,
op: inst_common::AtomicRmwOp::Xor,
},
"3BFF5F487C031ACA3CFF1848B8FFFFB5",
"atomically { 16_bits_at_[x25]) Xor= x26 ; x27 = old_value_at_[x25]; x24,x28 = trash }",
"3BFF5F487C031A4A3CFF1848B8FFFFB5",
"1: ldaxrh w27, [x25]; eor w28, w27, w26; stlxrh w24, w28, [x25]; cbnz w24, 1b",
));
insns.push((
Inst::AtomicRMWLoop {
ty: I8,
op: inst_common::AtomicRmwOp::Add,
},
"3BFF5F087C031A0B3CFF1808B8FFFFB5",
"1: ldaxrb w27, [x25]; add w28, w27, w26; stlxrb w24, w28, [x25]; cbnz w24, 1b",
));
insns.push((
Inst::AtomicRMWLoop {
ty: I32,
op: inst_common::AtomicRmwOp::Or,
},
"3BFF5F887C031A2A3CFF1888B8FFFFB5",
"1: ldaxr w27, [x25]; orr w28, w27, w26; stlxr w24, w28, [x25]; cbnz w24, 1b",
));
insns.push((
Inst::AtomicRMWLoop {
ty: I64,
op: inst_common::AtomicRmwOp::And,
},
"3BFF5FC87C031A8A3CFF18C8B8FFFFB5",
"1: ldaxr x27, [x25]; and x28, x27, x26; stlxr w24, x28, [x25]; cbnz w24, 1b",
));
insns.push((
Inst::AtomicRMWLoop {
ty: I8,
op: inst_common::AtomicRmwOp::Xchg,
},
"3BFF5F083AFF1808D8FFFFB5",
"1: ldaxrb w27, [x25]; stlxrb w24, w26, [x25]; cbnz w24, 1b",
));
insns.push((
Inst::AtomicRMWLoop {
ty: I16,
op: inst_common::AtomicRmwOp::Nand,
},
"3BFF5F487C031A0AFC033C2A3CFF184898FFFFB5",
"1: ldaxrh w27, [x25]; and w28, w27, w26; mvn w28, w28; stlxrh w24, w28, [x25]; cbnz w24, 1b",
));
insns.push((
Inst::AtomicRMWLoop {
ty: I32,
op: inst_common::AtomicRmwOp::Smin,
},
"3BFF5F887F031A6B7CB39A9A3CFF188898FFFFB5",
"1: ldaxr w27, [x25]; cmp w27, w26; csel w28, w27, w26, lt; stlxr w24, w28, [x25]; cbnz w24, 1b",
));
insns.push((
Inst::AtomicRMWLoop {
ty: I64,
op: inst_common::AtomicRmwOp::Smax,
},
"3BFF5FC87F031AEB7CC39A9A3CFF18C898FFFFB5",
"1: ldaxr x27, [x25]; cmp x27, x26; csel x28, x27, x26, gt; stlxr w24, x28, [x25]; cbnz w24, 1b",
));
insns.push((
Inst::AtomicRMWLoop {
ty: I8,
op: inst_common::AtomicRmwOp::Umin,
},
"3BFF5F087F031A6B7C339A9A3CFF180898FFFFB5",
"1: ldaxrb w27, [x25]; cmp w27, w26; csel w28, w27, w26, lo; stlxrb w24, w28, [x25]; cbnz w24, 1b",
));
insns.push((
Inst::AtomicRMWLoop {
ty: I16,
op: inst_common::AtomicRmwOp::Umax,
},
"3BFF5F487F031A6B7C839A9A3CFF184898FFFFB5",
"1: ldaxrh w27, [x25]; cmp w27, w26; csel w28, w27, w26, hi; stlxrh w24, w28, [x25]; cbnz w24, 1b",
));
insns.push((
@@ -6462,14 +6534,6 @@ fn test_aarch64_binemit() {
"lduminal x25, x26, [x27]",
));
insns.push((
Inst::AtomicRMWLoop {
ty: I32,
op: inst_common::AtomicRmwOp::Xchg,
},
"3BFF5F88FC031AAA3CFF1888B8FFFFB5",
"atomically { 32_bits_at_[x25]) Xchg= x26 ; x27 = old_value_at_[x25]; x24,x28 = trash }",
));
insns.push((
Inst::AtomicCAS {
rs: writable_xreg(28),
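
Each test entry pairs the expected machine code (the hex string concatenates the emitted bytes, little-endian within each 32-bit word) with the new, accurate disassembly string. A quick sanity check of the first word of the I16 Xor entry:

    fn main() {
        // "3BFF5F48" read as a little-endian 32-bit word:
        let word = u32::from_le_bytes([0x3B, 0xFF, 0x5F, 0x48]);
        assert_eq!(word, 0x485F_FF3B); // LDAXRH W27, [X25], as the string says
    }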


@@ -688,13 +688,15 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
&Inst::CCmpImm { rn, .. } => {
collector.add_use(rn);
}
&Inst::AtomicRMWLoop { .. } => {
&Inst::AtomicRMWLoop { op, .. } => {
collector.add_use(xreg(25));
collector.add_use(xreg(26));
collector.add_def(writable_xreg(24));
collector.add_def(writable_xreg(27));
if op != AtomicRmwOp::Xchg {
collector.add_def(writable_xreg(28));
}
}
&Inst::AtomicRMW { rs, rt, rn, .. } => {
collector.add_use(rs);
collector.add_def(rt);
@@ -2399,9 +2401,60 @@ impl Inst {
format!("{}{} {}, {}, [{}]", op, ty_suffix, rs, rt, rn)
}
&Inst::AtomicRMWLoop { ty, op, .. } => {
format!(
"atomically {{ {}_bits_at_[x25]) {:?}= x26 ; x27 = old_value_at_[x25]; x24,x28 = trash }}",
ty.bits(), op)
let ty_suffix = match ty {
I8 => "b",
I16 => "h",
_ => "",
};
let size = OperandSize::from_ty(ty);
let r_status = show_ireg_sized(xreg(24), mb_rru, OperandSize::Size32);
let r_arg2 = show_ireg_sized(xreg(26), mb_rru, size);
let r_tmp = show_ireg_sized(xreg(27), mb_rru, size);
let mut r_dst = show_ireg_sized(xreg(28), mb_rru, size);
let mut loop_str: String = "1: ".to_string();
loop_str.push_str(&format!("ldaxr{} {}, [x25]; ", ty_suffix, r_tmp));
let op_str = match op {
inst_common::AtomicRmwOp::Add => "add",
inst_common::AtomicRmwOp::Sub => "sub",
inst_common::AtomicRmwOp::Xor => "eor",
inst_common::AtomicRmwOp::Or => "orr",
inst_common::AtomicRmwOp::And => "and",
_ => "",
};
if op_str.is_empty() {
match op {
inst_common::AtomicRmwOp::Xchg => r_dst = r_arg2,
inst_common::AtomicRmwOp::Nand => {
loop_str.push_str(&format!("and {}, {}, {}; ", r_dst, r_tmp, r_arg2));
loop_str.push_str(&format!("mvn {}, {}; ", r_dst, r_dst));
}
_ => {
loop_str.push_str(&format!("cmp {}, {}; ", r_tmp, r_arg2));
let cond = match op {
inst_common::AtomicRmwOp::Smin => "lt",
inst_common::AtomicRmwOp::Smax => "gt",
inst_common::AtomicRmwOp::Umin => "lo",
inst_common::AtomicRmwOp::Umax => "hi",
_ => unreachable!(),
};
loop_str.push_str(&format!(
"csel {}, {}, {}, {}; ",
r_dst, r_tmp, r_arg2, cond
));
}
};
} else {
loop_str.push_str(&format!("{} {}, {}, {}; ", op_str, r_dst, r_tmp, r_arg2));
}
loop_str.push_str(&format!(
"stlxr{} {}, {}, [x25]; ",
ty_suffix, r_status, r_dst
));
loop_str.push_str(&format!("cbnz {}, 1b", r_status));
loop_str
}
&Inst::AtomicCAS { rs, rt, rn, ty } => {
let op = match ty {
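
The strings this printer builds line up with the binemit expectations above, with each register name sized to the operand width. A minimal stand-in for show_ireg_sized, assuming it prints the W-form for 32-bit operands and the X-form for 64-bit ones (the real helper also consults the mb_rru renaming map):

    fn show_ireg_sized_stub(index: u8, is_64: bool) -> String {
        if is_64 { format!("x{index}") } else { format!("w{index}") }
    }

For ty = I16 and op = Nand, the new arm yields exactly the string checked in the binemit test: "1: ldaxrh w27, [x25]; and w28, w27, w26; mvn w28, w28; stlxrh w24, w28, [x25]; cbnz w24, 1b".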


@@ -16,8 +16,8 @@ block0(v0: i64, v1: i64):
; Inst 1: ret
; }}
function %atomic_rmw_add_i32(i32, i32) {
block0(v0: i32, v1: i32):
function %atomic_rmw_add_i32(i64, i32) {
block0(v0: i64, v1: i32):
v2 = atomic_rmw.i32 add v0, v1
return
}
@@ -31,6 +31,36 @@ block0(v0: i32, v1: i32):
; Inst 1: ret
; }}
function %atomic_rmw_add_i16(i64, i16) {
block0(v0: i64, v1: i16):
v2 = atomic_rmw.i16 add v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 2)
; Inst 0: ldaddalh w1, w0, [x0]
; Inst 1: ret
; }}
function %atomic_rmw_add_i8(i64, i8) {
block0(v0: i64, v1: i8):
v2 = atomic_rmw.i8 add v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 2)
; Inst 0: ldaddalb w1, w0, [x0]
; Inst 1: ret
; }}
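
This test file evidently targets a CPU with the LSE atomics, so these ops lower to a single atomic memory instruction (ldaddalh/ldaddalb above) rather than a loop. A semantic reference in Rust, shown only for meaning and not as a claim about this file's codegen:

    use std::sync::atomic::{AtomicU16, Ordering};

    /// Returns the old value, just as w0 holds it after
    /// ldaddalh w1, w0, [x0].
    fn add16(cell: &AtomicU16, v: u16) -> u16 {
        cell.fetch_add(v, Ordering::SeqCst)
    }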
function %atomic_rmw_and_i64(i64, i64) {
block0(v0: i64, v1: i64):
v2 = atomic_rmw.i64 and v0, v1
@@ -46,8 +76,8 @@ block0(v0: i64, v1: i64):
; Inst 1: ret
; }}
function %atomic_rmw_and_i32(i32, i32) {
block0(v0: i32, v1: i32):
function %atomic_rmw_and_i32(i64, i32) {
block0(v0: i64, v1: i32):
v2 = atomic_rmw.i32 and v0, v1
return
}
@@ -61,6 +91,140 @@ block0(v0: i32, v1: i32):
; Inst 1: ret
; }}
function %atomic_rmw_and_i16(i64, i16) {
block0(v0: i64, v1: i16):
v2 = atomic_rmw.i16 and v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 2)
; Inst 0: ldclralh w1, w0, [x0]
; Inst 1: ret
; }}
function %atomic_rmw_and_i8(i64, i8) {
block0(v0: i64, v1: i8):
v2 = atomic_rmw.i8 and v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 2)
; Inst 0: ldclralb w1, w0, [x0]
; Inst 1: ret
; }}
function %atomic_rmw_nand_i64(i64, i64) {
block0(v0: i64, v1: i64):
v2 = atomic_rmw.i64 nand v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 13)
; Inst 0: stp fp, lr, [sp, #-16]!
; Inst 1: mov fp, sp
; Inst 2: str x28, [sp, #-16]!
; Inst 3: stp x26, x27, [sp, #-16]!
; Inst 4: stp x24, x25, [sp, #-16]!
; Inst 5: mov x25, x0
; Inst 6: mov x26, x1
; Inst 7: 1: ldaxr x27, [x25]; and x28, x27, x26; mvn x28, x28; stlxr w24, x28, [x25]; cbnz w24, 1b
; Inst 8: ldp x24, x25, [sp], #16
; Inst 9: ldp x26, x27, [sp], #16
; Inst 10: ldr x28, [sp], #16
; Inst 11: ldp fp, lr, [sp], #16
; Inst 12: ret
; }}
function %atomic_rmw_nand_i32(i64, i32) {
block0(v0: i64, v1: i32):
v2 = atomic_rmw.i32 nand v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 13)
; Inst 0: stp fp, lr, [sp, #-16]!
; Inst 1: mov fp, sp
; Inst 2: str x28, [sp, #-16]!
; Inst 3: stp x26, x27, [sp, #-16]!
; Inst 4: stp x24, x25, [sp, #-16]!
; Inst 5: mov x25, x0
; Inst 6: mov x26, x1
; Inst 7: 1: ldaxr w27, [x25]; and w28, w27, w26; mvn w28, w28; stlxr w24, w28, [x25]; cbnz w24, 1b
; Inst 8: ldp x24, x25, [sp], #16
; Inst 9: ldp x26, x27, [sp], #16
; Inst 10: ldr x28, [sp], #16
; Inst 11: ldp fp, lr, [sp], #16
; Inst 12: ret
; }}
function %atomic_rmw_nand_i16(i64, i16) {
block0(v0: i64, v1: i16):
v2 = atomic_rmw.i16 nand v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 13)
; Inst 0: stp fp, lr, [sp, #-16]!
; Inst 1: mov fp, sp
; Inst 2: str x28, [sp, #-16]!
; Inst 3: stp x26, x27, [sp, #-16]!
; Inst 4: stp x24, x25, [sp, #-16]!
; Inst 5: mov x25, x0
; Inst 6: mov x26, x1
; Inst 7: 1: ldaxrh w27, [x25]; and w28, w27, w26; mvn w28, w28; stlxrh w24, w28, [x25]; cbnz w24, 1b
; Inst 8: ldp x24, x25, [sp], #16
; Inst 9: ldp x26, x27, [sp], #16
; Inst 10: ldr x28, [sp], #16
; Inst 11: ldp fp, lr, [sp], #16
; Inst 12: ret
; }}
function %atomic_rmw_nand_i8(i64, i8) {
block0(v0: i64, v1: i8):
v2 = atomic_rmw.i8 nand v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 13)
; Inst 0: stp fp, lr, [sp, #-16]!
; Inst 1: mov fp, sp
; Inst 2: str x28, [sp, #-16]!
; Inst 3: stp x26, x27, [sp, #-16]!
; Inst 4: stp x24, x25, [sp, #-16]!
; Inst 5: mov x25, x0
; Inst 6: mov x26, x1
; Inst 7: 1: ldaxrb w27, [x25]; and w28, w27, w26; mvn w28, w28; stlxrb w24, w28, [x25]; cbnz w24, 1b
; Inst 8: ldp x24, x25, [sp], #16
; Inst 9: ldp x26, x27, [sp], #16
; Inst 10: ldr x28, [sp], #16
; Inst 11: ldp fp, lr, [sp], #16
; Inst 12: ret
; }}
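
nand is the odd one out in this file: FEAT_LSE provides atomic add/clr/set/eor/min/max memory instructions but nothing for nand, so it falls back to the full ldaxr/stlxr loop, which is also why only these functions spill and reload x24-x28. Rust's atomics expose the same asymmetry:

    use std::sync::atomic::{AtomicU64, Ordering};

    fn demo_nand() {
        let x = AtomicU64::new(0b1100);
        // No single-instruction form exists, so even on LSE hardware
        // this typically compiles to a retry loop like Inst 7 above.
        let old = x.fetch_nand(0b1010, Ordering::SeqCst);
        assert_eq!(old, 0b1100);
        assert_eq!(x.load(Ordering::SeqCst), !(0b1100u64 & 0b1010));
    }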
function %atomic_rmw_or_i64(i64, i64) {
block0(v0: i64, v1: i64):
v2 = atomic_rmw.i64 or v0, v1
@@ -76,8 +240,8 @@ block0(v0: i64, v1: i64):
; Inst 1: ret
; }}
function %atomic_rmw_or_i32(i32, i32) {
block0(v0: i32, v1: i32):
function %atomic_rmw_or_i32(i64, i32) {
block0(v0: i64, v1: i32):
v2 = atomic_rmw.i32 or v0, v1
return
}
@@ -91,6 +255,36 @@ block0(v0: i32, v1: i32):
; Inst 1: ret
; }}
function %atomic_rmw_or_i16(i64, i16) {
block0(v0: i64, v1: i16):
v2 = atomic_rmw.i16 or v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 2)
; Inst 0: ldsetalh w1, w0, [x0]
; Inst 1: ret
; }}
function %atomic_rmw_or_i8(i64, i8) {
block0(v0: i64, v1: i8):
v2 = atomic_rmw.i8 or v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 2)
; Inst 0: ldsetalb w1, w0, [x0]
; Inst 1: ret
; }}
function %atomic_rmw_xor_i64(i64, i64) {
block0(v0: i64, v1: i64):
v2 = atomic_rmw.i64 xor v0, v1
@@ -106,8 +300,8 @@ block0(v0: i64, v1: i64):
; Inst 1: ret
; }}
function %atomic_rmw_xor_i32(i32, i32) {
block0(v0: i32, v1: i32):
function %atomic_rmw_xor_i32(i64, i32) {
block0(v0: i64, v1: i32):
v2 = atomic_rmw.i32 xor v0, v1
return
}
@@ -121,6 +315,36 @@ block0(v0: i32, v1: i32):
; Inst 1: ret
; }}
function %atomic_rmw_xor_i16(i64, i16) {
block0(v0: i64, v1: i16):
v2 = atomic_rmw.i16 xor v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 2)
; Inst 0: ldeoralh w1, w0, [x0]
; Inst 1: ret
; }}
function %atomic_rmw_xor_i8(i64, i8) {
block0(v0: i64, v1: i8):
v2 = atomic_rmw.i8 xor v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 2)
; Inst 0: ldeoralb w1, w0, [x0]
; Inst 1: ret
; }}
function %atomic_rmw_smax_i64(i64, i64) {
block0(v0: i64, v1: i64):
v2 = atomic_rmw.i64 smax v0, v1
@@ -136,8 +360,8 @@ block0(v0: i64, v1: i64):
; Inst 1: ret
; }}
function %atomic_rmw_smax_i32(i32, i32) {
block0(v0: i32, v1: i32):
function %atomic_rmw_smax_i32(i64, i32) {
block0(v0: i64, v1: i32):
v2 = atomic_rmw.i32 smax v0, v1
return
}
@@ -151,6 +375,36 @@ block0(v0: i32, v1: i32):
; Inst 1: ret
; }}
function %atomic_rmw_smax_i16(i64, i16) {
block0(v0: i64, v1: i16):
v2 = atomic_rmw.i16 smax v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 2)
; Inst 0: ldsmaxalh w1, w0, [x0]
; Inst 1: ret
; }}
function %atomic_rmw_smax_i8(i64, i8) {
block0(v0: i64, v1: i8):
v2 = atomic_rmw.i8 smax v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 2)
; Inst 0: ldsmaxalb w1, w0, [x0]
; Inst 1: ret
; }}
function %atomic_rmw_umax_i64(i64, i64) {
block0(v0: i64, v1: i64):
v2 = atomic_rmw.i64 umax v0, v1
@@ -166,8 +420,8 @@ block0(v0: i64, v1: i64):
; Inst 1: ret
; }}
function %atomic_rmw_umax_i32(i32, i32) {
block0(v0: i32, v1: i32):
function %atomic_rmw_umax_i32(i64, i32) {
block0(v0: i64, v1: i32):
v2 = atomic_rmw.i32 umax v0, v1
return
}
@@ -181,6 +435,36 @@ block0(v0: i32, v1: i32):
; Inst 1: ret
; }}
function %atomic_rmw_umax_i16(i64, i16) {
block0(v0: i64, v1: i16):
v2 = atomic_rmw.i16 umax v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 2)
; Inst 0: ldumaxalh w1, w0, [x0]
; Inst 1: ret
; }}
function %atomic_rmw_umax_i8(i64, i8) {
block0(v0: i64, v1: i8):
v2 = atomic_rmw.i8 umax v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 2)
; Inst 0: ldumaxalb w1, w0, [x0]
; Inst 1: ret
; }}
function %atomic_rmw_smin_i64(i64, i64) {
block0(v0: i64, v1: i64):
v2 = atomic_rmw.i64 smin v0, v1
@@ -196,8 +480,8 @@ block0(v0: i64, v1: i64):
; Inst 1: ret
; }}
function %atomic_rmw_smin_i32(i32, i32) {
block0(v0: i32, v1: i32):
function %atomic_rmw_smin_i32(i64, i32) {
block0(v0: i64, v1: i32):
v2 = atomic_rmw.i32 smin v0, v1
return
}
@@ -211,6 +495,36 @@ block0(v0: i32, v1: i32):
; Inst 1: ret
; }}
function %atomic_rmw_smin_i16(i64, i16) {
block0(v0: i64, v1: i16):
v2 = atomic_rmw.i16 smin v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 2)
; Inst 0: ldsminalh w1, w0, [x0]
; Inst 1: ret
; }}
function %atomic_rmw_smin_i8(i64, i8) {
block0(v0: i64, v1: i8):
v2 = atomic_rmw.i8 smin v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 2)
; Inst 0: ldsminalb w1, w0, [x0]
; Inst 1: ret
; }}
function %atomic_rmw_umin_i64(i64, i64) {
block0(v0: i64, v1: i64):
v2 = atomic_rmw.i64 umin v0, v1
@@ -226,8 +540,8 @@ block0(v0: i64, v1: i64):
; Inst 1: ret
; }}
function %atomic_rmw_umin_i32(i32, i32) {
block0(v0: i32, v1: i32):
function %atomic_rmw_umin_i32(i64, i32) {
block0(v0: i64, v1: i32):
v2 = atomic_rmw.i32 umin v0, v1
return
}
@@ -241,3 +555,33 @@ block0(v0: i32, v1: i32):
; Inst 1: ret
; }}
function %atomic_rmw_umin_i16(i64, i16) {
block0(v0: i64, v1: i16):
v2 = atomic_rmw.i16 umin v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 2)
; Inst 0: lduminalh w1, w0, [x0]
; Inst 1: ret
; }}
function %atomic_rmw_umin_i8(i64, i8) {
block0(v0: i64, v1: i8):
v2 = atomic_rmw.i8 umin v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 2)
; Inst 0: lduminalb w1, w0, [x0]
; Inst 1: ret
; }}


@@ -0,0 +1,939 @@
test compile precise-output
target aarch64
function %atomic_rmw_add_i64(i64, i64) {
block0(v0: i64, v1: i64):
v2 = atomic_rmw.i64 add v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 13)
; Inst 0: stp fp, lr, [sp, #-16]!
; Inst 1: mov fp, sp
; Inst 2: str x28, [sp, #-16]!
; Inst 3: stp x26, x27, [sp, #-16]!
; Inst 4: stp x24, x25, [sp, #-16]!
; Inst 5: mov x25, x0
; Inst 6: mov x26, x1
; Inst 7: 1: ldaxr x27, [x25]; add x28, x27, x26; stlxr w24, x28, [x25]; cbnz w24, 1b
; Inst 8: ldp x24, x25, [sp], #16
; Inst 9: ldp x26, x27, [sp], #16
; Inst 10: ldr x28, [sp], #16
; Inst 11: ldp fp, lr, [sp], #16
; Inst 12: ret
; }}
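
In this new file, which exercises the loop form for every operation, each function wraps a four-instruction loop in a five-register spill/reload, because the pseudo-instruction hardcodes x24-x28 and all of those are callee-saved under AAPCS64; that is the overhead the emitter's TODO about passing in scratch registers would remove. The Xchg case does not appear in this excerpt, but per the binemit test above its loop now stores x26 directly with no ALU step, making it the semantic equivalent of a plain swap:

    use std::sync::atomic::{AtomicU8, Ordering};

    /// 1: ldaxrb w27, [x25]; stlxrb w24, w26, [x25]; cbnz w24, 1b
    fn swap8(cell: &AtomicU8, v: u8) -> u8 {
        cell.swap(v, Ordering::SeqCst)
    }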
function %atomic_rmw_add_i32(i64, i32) {
block0(v0: i64, v1: i32):
v2 = atomic_rmw.i32 add v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 13)
; Inst 0: stp fp, lr, [sp, #-16]!
; Inst 1: mov fp, sp
; Inst 2: str x28, [sp, #-16]!
; Inst 3: stp x26, x27, [sp, #-16]!
; Inst 4: stp x24, x25, [sp, #-16]!
; Inst 5: mov x25, x0
; Inst 6: mov x26, x1
; Inst 7: 1: ldaxr w27, [x25]; add w28, w27, w26; stlxr w24, w28, [x25]; cbnz w24, 1b
; Inst 8: ldp x24, x25, [sp], #16
; Inst 9: ldp x26, x27, [sp], #16
; Inst 10: ldr x28, [sp], #16
; Inst 11: ldp fp, lr, [sp], #16
; Inst 12: ret
; }}
function %atomic_rmw_add_i16(i64, i16) {
block0(v0: i64, v1: i16):
v2 = atomic_rmw.i16 add v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 13)
; Inst 0: stp fp, lr, [sp, #-16]!
; Inst 1: mov fp, sp
; Inst 2: str x28, [sp, #-16]!
; Inst 3: stp x26, x27, [sp, #-16]!
; Inst 4: stp x24, x25, [sp, #-16]!
; Inst 5: mov x25, x0
; Inst 6: mov x26, x1
; Inst 7: 1: ldaxrh w27, [x25]; add w28, w27, w26; stlxrh w24, w28, [x25]; cbnz w24, 1b
; Inst 8: ldp x24, x25, [sp], #16
; Inst 9: ldp x26, x27, [sp], #16
; Inst 10: ldr x28, [sp], #16
; Inst 11: ldp fp, lr, [sp], #16
; Inst 12: ret
; }}
function %atomic_rmw_add_i8(i64, i8) {
block0(v0: i64, v1: i8):
v2 = atomic_rmw.i8 add v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 13)
; Inst 0: stp fp, lr, [sp, #-16]!
; Inst 1: mov fp, sp
; Inst 2: str x28, [sp, #-16]!
; Inst 3: stp x26, x27, [sp, #-16]!
; Inst 4: stp x24, x25, [sp, #-16]!
; Inst 5: mov x25, x0
; Inst 6: mov x26, x1
; Inst 7: 1: ldaxrb w27, [x25]; add w28, w27, w26; stlxrb w24, w28, [x25]; cbnz w24, 1b
; Inst 8: ldp x24, x25, [sp], #16
; Inst 9: ldp x26, x27, [sp], #16
; Inst 10: ldr x28, [sp], #16
; Inst 11: ldp fp, lr, [sp], #16
; Inst 12: ret
; }}
function %atomic_rmw_and_i64(i64, i64) {
block0(v0: i64, v1: i64):
v2 = atomic_rmw.i64 and v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 13)
; Inst 0: stp fp, lr, [sp, #-16]!
; Inst 1: mov fp, sp
; Inst 2: str x28, [sp, #-16]!
; Inst 3: stp x26, x27, [sp, #-16]!
; Inst 4: stp x24, x25, [sp, #-16]!
; Inst 5: mov x25, x0
; Inst 6: mov x26, x1
; Inst 7: 1: ldaxr x27, [x25]; and x28, x27, x26; stlxr w24, x28, [x25]; cbnz w24, 1b
; Inst 8: ldp x24, x25, [sp], #16
; Inst 9: ldp x26, x27, [sp], #16
; Inst 10: ldr x28, [sp], #16
; Inst 11: ldp fp, lr, [sp], #16
; Inst 12: ret
; }}
function %atomic_rmw_and_i32(i64, i32) {
block0(v0: i64, v1: i32):
v2 = atomic_rmw.i32 and v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 13)
; Inst 0: stp fp, lr, [sp, #-16]!
; Inst 1: mov fp, sp
; Inst 2: str x28, [sp, #-16]!
; Inst 3: stp x26, x27, [sp, #-16]!
; Inst 4: stp x24, x25, [sp, #-16]!
; Inst 5: mov x25, x0
; Inst 6: mov x26, x1
; Inst 7: 1: ldaxr w27, [x25]; and w28, w27, w26; stlxr w24, w28, [x25]; cbnz w24, 1b
; Inst 8: ldp x24, x25, [sp], #16
; Inst 9: ldp x26, x27, [sp], #16
; Inst 10: ldr x28, [sp], #16
; Inst 11: ldp fp, lr, [sp], #16
; Inst 12: ret
; }}
function %atomic_rmw_and_i16(i64, i16) {
block0(v0: i64, v1: i16):
v2 = atomic_rmw.i16 and v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 13)
; Inst 0: stp fp, lr, [sp, #-16]!
; Inst 1: mov fp, sp
; Inst 2: str x28, [sp, #-16]!
; Inst 3: stp x26, x27, [sp, #-16]!
; Inst 4: stp x24, x25, [sp, #-16]!
; Inst 5: mov x25, x0
; Inst 6: mov x26, x1
; Inst 7: 1: ldaxrh w27, [x25]; and w28, w27, w26; stlxrh w24, w28, [x25]; cbnz w24, 1b
; Inst 8: ldp x24, x25, [sp], #16
; Inst 9: ldp x26, x27, [sp], #16
; Inst 10: ldr x28, [sp], #16
; Inst 11: ldp fp, lr, [sp], #16
; Inst 12: ret
; }}
function %atomic_rmw_and_i8(i64, i8) {
block0(v0: i64, v1: i8):
v2 = atomic_rmw.i8 and v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 13)
; Inst 0: stp fp, lr, [sp, #-16]!
; Inst 1: mov fp, sp
; Inst 2: str x28, [sp, #-16]!
; Inst 3: stp x26, x27, [sp, #-16]!
; Inst 4: stp x24, x25, [sp, #-16]!
; Inst 5: mov x25, x0
; Inst 6: mov x26, x1
; Inst 7: 1: ldaxrb w27, [x25]; and w28, w27, w26; stlxrb w24, w28, [x25]; cbnz w24, 1b
; Inst 8: ldp x24, x25, [sp], #16
; Inst 9: ldp x26, x27, [sp], #16
; Inst 10: ldr x28, [sp], #16
; Inst 11: ldp fp, lr, [sp], #16
; Inst 12: ret
; }}
function %atomic_rmw_nand_i64(i64, i64) {
block0(v0: i64, v1: i64):
v2 = atomic_rmw.i64 nand v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 13)
; Inst 0: stp fp, lr, [sp, #-16]!
; Inst 1: mov fp, sp
; Inst 2: str x28, [sp, #-16]!
; Inst 3: stp x26, x27, [sp, #-16]!
; Inst 4: stp x24, x25, [sp, #-16]!
; Inst 5: mov x25, x0
; Inst 6: mov x26, x1
; Inst 7: 1: ldaxr x27, [x25]; and x28, x27, x26; mvn x28, x28; stlxr w24, x28, [x25]; cbnz w24, 1b
; Inst 8: ldp x24, x25, [sp], #16
; Inst 9: ldp x26, x27, [sp], #16
; Inst 10: ldr x28, [sp], #16
; Inst 11: ldp fp, lr, [sp], #16
; Inst 12: ret
; }}
function %atomic_rmw_nand_i32(i64, i32) {
block0(v0: i64, v1: i32):
v2 = atomic_rmw.i32 nand v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 13)
; Inst 0: stp fp, lr, [sp, #-16]!
; Inst 1: mov fp, sp
; Inst 2: str x28, [sp, #-16]!
; Inst 3: stp x26, x27, [sp, #-16]!
; Inst 4: stp x24, x25, [sp, #-16]!
; Inst 5: mov x25, x0
; Inst 6: mov x26, x1
; Inst 7: 1: ldaxr w27, [x25]; and w28, w27, w26; mvn w28, w28; stlxr w24, w28, [x25]; cbnz w24, 1b
; Inst 8: ldp x24, x25, [sp], #16
; Inst 9: ldp x26, x27, [sp], #16
; Inst 10: ldr x28, [sp], #16
; Inst 11: ldp fp, lr, [sp], #16
; Inst 12: ret
; }}
function %atomic_rmw_nand_i16(i64, i16) {
block0(v0: i64, v1: i16):
v2 = atomic_rmw.i16 nand v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 13)
; Inst 0: stp fp, lr, [sp, #-16]!
; Inst 1: mov fp, sp
; Inst 2: str x28, [sp, #-16]!
; Inst 3: stp x26, x27, [sp, #-16]!
; Inst 4: stp x24, x25, [sp, #-16]!
; Inst 5: mov x25, x0
; Inst 6: mov x26, x1
; Inst 7: 1: ldaxrh w27, [x25]; and w28, w27, w26; mvn w28, w28; stlxrh w24, w28, [x25]; cbnz w24, 1b
; Inst 8: ldp x24, x25, [sp], #16
; Inst 9: ldp x26, x27, [sp], #16
; Inst 10: ldr x28, [sp], #16
; Inst 11: ldp fp, lr, [sp], #16
; Inst 12: ret
; }}
function %atomic_rmw_nand_i8(i64, i8) {
block0(v0: i64, v1: i8):
v2 = atomic_rmw.i8 nand v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 13)
; Inst 0: stp fp, lr, [sp, #-16]!
; Inst 1: mov fp, sp
; Inst 2: str x28, [sp, #-16]!
; Inst 3: stp x26, x27, [sp, #-16]!
; Inst 4: stp x24, x25, [sp, #-16]!
; Inst 5: mov x25, x0
; Inst 6: mov x26, x1
; Inst 7: 1: ldaxrb w27, [x25]; and w28, w27, w26; mvn w28, w28; stlxrb w24, w28, [x25]; cbnz w24, 1b
; Inst 8: ldp x24, x25, [sp], #16
; Inst 9: ldp x26, x27, [sp], #16
; Inst 10: ldr x28, [sp], #16
; Inst 11: ldp fp, lr, [sp], #16
; Inst 12: ret
; }}
function %atomic_rmw_or_i64(i64, i64) {
block0(v0: i64, v1: i64):
v2 = atomic_rmw.i64 or v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 13)
; Inst 0: stp fp, lr, [sp, #-16]!
; Inst 1: mov fp, sp
; Inst 2: str x28, [sp, #-16]!
; Inst 3: stp x26, x27, [sp, #-16]!
; Inst 4: stp x24, x25, [sp, #-16]!
; Inst 5: mov x25, x0
; Inst 6: mov x26, x1
; Inst 7: 1: ldaxr x27, [x25]; orr x28, x27, x26; stlxr w24, x28, [x25]; cbnz w24, 1b
; Inst 8: ldp x24, x25, [sp], #16
; Inst 9: ldp x26, x27, [sp], #16
; Inst 10: ldr x28, [sp], #16
; Inst 11: ldp fp, lr, [sp], #16
; Inst 12: ret
; }}
function %atomic_rmw_or_i32(i64, i32) {
block0(v0: i64, v1: i32):
v2 = atomic_rmw.i32 or v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 13)
; Inst 0: stp fp, lr, [sp, #-16]!
; Inst 1: mov fp, sp
; Inst 2: str x28, [sp, #-16]!
; Inst 3: stp x26, x27, [sp, #-16]!
; Inst 4: stp x24, x25, [sp, #-16]!
; Inst 5: mov x25, x0
; Inst 6: mov x26, x1
; Inst 7: 1: ldaxr w27, [x25]; orr w28, w27, w26; stlxr w24, w28, [x25]; cbnz w24, 1b
; Inst 8: ldp x24, x25, [sp], #16
; Inst 9: ldp x26, x27, [sp], #16
; Inst 10: ldr x28, [sp], #16
; Inst 11: ldp fp, lr, [sp], #16
; Inst 12: ret
; }}
function %atomic_rmw_or_i16(i64, i16) {
block0(v0: i64, v1: i16):
v2 = atomic_rmw.i16 or v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 13)
; Inst 0: stp fp, lr, [sp, #-16]!
; Inst 1: mov fp, sp
; Inst 2: str x28, [sp, #-16]!
; Inst 3: stp x26, x27, [sp, #-16]!
; Inst 4: stp x24, x25, [sp, #-16]!
; Inst 5: mov x25, x0
; Inst 6: mov x26, x1
; Inst 7: 1: ldaxrh w27, [x25]; orr w28, w27, w26; stlxrh w24, w28, [x25]; cbnz w24, 1b
; Inst 8: ldp x24, x25, [sp], #16
; Inst 9: ldp x26, x27, [sp], #16
; Inst 10: ldr x28, [sp], #16
; Inst 11: ldp fp, lr, [sp], #16
; Inst 12: ret
; }}
function %atomic_rmw_or_i8(i64, i8) {
block0(v0: i64, v1: i8):
v2 = atomic_rmw.i8 or v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 13)
; Inst 0: stp fp, lr, [sp, #-16]!
; Inst 1: mov fp, sp
; Inst 2: str x28, [sp, #-16]!
; Inst 3: stp x26, x27, [sp, #-16]!
; Inst 4: stp x24, x25, [sp, #-16]!
; Inst 5: mov x25, x0
; Inst 6: mov x26, x1
; Inst 7: 1: ldaxrb w27, [x25]; orr w28, w27, w26; stlxrb w24, w28, [x25]; cbnz w24, 1b
; Inst 8: ldp x24, x25, [sp], #16
; Inst 9: ldp x26, x27, [sp], #16
; Inst 10: ldr x28, [sp], #16
; Inst 11: ldp fp, lr, [sp], #16
; Inst 12: ret
; }}
function %atomic_rmw_xor_i64(i64, i64) {
block0(v0: i64, v1: i64):
v2 = atomic_rmw.i64 xor v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 13)
; Inst 0: stp fp, lr, [sp, #-16]!
; Inst 1: mov fp, sp
; Inst 2: str x28, [sp, #-16]!
; Inst 3: stp x26, x27, [sp, #-16]!
; Inst 4: stp x24, x25, [sp, #-16]!
; Inst 5: mov x25, x0
; Inst 6: mov x26, x1
; Inst 7: 1: ldaxr x27, [x25]; eor x28, x27, x26; stlxr w24, x28, [x25]; cbnz w24, 1b
; Inst 8: ldp x24, x25, [sp], #16
; Inst 9: ldp x26, x27, [sp], #16
; Inst 10: ldr x28, [sp], #16
; Inst 11: ldp fp, lr, [sp], #16
; Inst 12: ret
; }}
function %atomic_rmw_xor_i32(i64, i32) {
block0(v0: i64, v1: i32):
v2 = atomic_rmw.i32 xor v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 13)
; Inst 0: stp fp, lr, [sp, #-16]!
; Inst 1: mov fp, sp
; Inst 2: str x28, [sp, #-16]!
; Inst 3: stp x26, x27, [sp, #-16]!
; Inst 4: stp x24, x25, [sp, #-16]!
; Inst 5: mov x25, x0
; Inst 6: mov x26, x1
; Inst 7: 1: ldaxr w27, [x25]; eor w28, w27, w26; stlxr w24, w28, [x25]; cbnz w24, 1b
; Inst 8: ldp x24, x25, [sp], #16
; Inst 9: ldp x26, x27, [sp], #16
; Inst 10: ldr x28, [sp], #16
; Inst 11: ldp fp, lr, [sp], #16
; Inst 12: ret
; }}
function %atomic_rmw_xor_i16(i64, i16) {
block0(v0: i64, v1: i16):
v2 = atomic_rmw.i16 xor v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 13)
; Inst 0: stp fp, lr, [sp, #-16]!
; Inst 1: mov fp, sp
; Inst 2: str x28, [sp, #-16]!
; Inst 3: stp x26, x27, [sp, #-16]!
; Inst 4: stp x24, x25, [sp, #-16]!
; Inst 5: mov x25, x0
; Inst 6: mov x26, x1
; Inst 7: 1: ldaxrh w27, [x25]; eor w28, w27, w26; stlxrh w24, w28, [x25]; cbnz w24, 1b
; Inst 8: ldp x24, x25, [sp], #16
; Inst 9: ldp x26, x27, [sp], #16
; Inst 10: ldr x28, [sp], #16
; Inst 11: ldp fp, lr, [sp], #16
; Inst 12: ret
; }}
function %atomic_rmw_xor_i8(i64, i8) {
block0(v0: i64, v1: i8):
v2 = atomic_rmw.i8 xor v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 13)
; Inst 0: stp fp, lr, [sp, #-16]!
; Inst 1: mov fp, sp
; Inst 2: str x28, [sp, #-16]!
; Inst 3: stp x26, x27, [sp, #-16]!
; Inst 4: stp x24, x25, [sp, #-16]!
; Inst 5: mov x25, x0
; Inst 6: mov x26, x1
; Inst 7: 1: ldaxrb w27, [x25]; eor w28, w27, w26; stlxrb w24, w28, [x25]; cbnz w24, 1b
; Inst 8: ldp x24, x25, [sp], #16
; Inst 9: ldp x26, x27, [sp], #16
; Inst 10: ldr x28, [sp], #16
; Inst 11: ldp fp, lr, [sp], #16
; Inst 12: ret
; }}
function %atomic_rmw_smax_i64(i64, i64) {
block0(v0: i64, v1: i64):
v2 = atomic_rmw.i64 smax v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 13)
; Inst 0: stp fp, lr, [sp, #-16]!
; Inst 1: mov fp, sp
; Inst 2: str x28, [sp, #-16]!
; Inst 3: stp x26, x27, [sp, #-16]!
; Inst 4: stp x24, x25, [sp, #-16]!
; Inst 5: mov x25, x0
; Inst 6: mov x26, x1
; Inst 7: 1: ldaxr x27, [x25]; cmp x27, x26; csel x28, x27, x26, gt; stlxr w24, x28, [x25]; cbnz w24, 1b
; Inst 8: ldp x24, x25, [sp], #16
; Inst 9: ldp x26, x27, [sp], #16
; Inst 10: ldr x28, [sp], #16
; Inst 11: ldp fp, lr, [sp], #16
; Inst 12: ret
; }}
function %atomic_rmw_smax_i32(i64, i32) {
block0(v0: i64, v1: i32):
v2 = atomic_rmw.i32 smax v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 13)
; Inst 0: stp fp, lr, [sp, #-16]!
; Inst 1: mov fp, sp
; Inst 2: str x28, [sp, #-16]!
; Inst 3: stp x26, x27, [sp, #-16]!
; Inst 4: stp x24, x25, [sp, #-16]!
; Inst 5: mov x25, x0
; Inst 6: mov x26, x1
; Inst 7: 1: ldaxr w27, [x25]; cmp w27, w26; csel w28, w27, w26, gt; stlxr w24, w28, [x25]; cbnz w24, 1b
; Inst 8: ldp x24, x25, [sp], #16
; Inst 9: ldp x26, x27, [sp], #16
; Inst 10: ldr x28, [sp], #16
; Inst 11: ldp fp, lr, [sp], #16
; Inst 12: ret
; }}
function %atomic_rmw_smax_i16(i64, i16) {
block0(v0: i64, v1: i16):
v2 = atomic_rmw.i16 smax v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 13)
; Inst 0: stp fp, lr, [sp, #-16]!
; Inst 1: mov fp, sp
; Inst 2: str x28, [sp, #-16]!
; Inst 3: stp x26, x27, [sp, #-16]!
; Inst 4: stp x24, x25, [sp, #-16]!
; Inst 5: mov x25, x0
; Inst 6: mov x26, x1
; Inst 7: 1: ldaxrh w27, [x25]; cmp w27, w26; csel w28, w27, w26, gt; stlxrh w24, w28, [x25]; cbnz w24, 1b
; Inst 8: ldp x24, x25, [sp], #16
; Inst 9: ldp x26, x27, [sp], #16
; Inst 10: ldr x28, [sp], #16
; Inst 11: ldp fp, lr, [sp], #16
; Inst 12: ret
; }}
function %atomic_rmw_smax_i8(i64, i8) {
block0(v0: i64, v1: i8):
v2 = atomic_rmw.i8 smax v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 13)
; Inst 0: stp fp, lr, [sp, #-16]!
; Inst 1: mov fp, sp
; Inst 2: str x28, [sp, #-16]!
; Inst 3: stp x26, x27, [sp, #-16]!
; Inst 4: stp x24, x25, [sp, #-16]!
; Inst 5: mov x25, x0
; Inst 6: mov x26, x1
; Inst 7: 1: ldaxrb w27, [x25]; cmp w27, w26; csel w28, w27, w26, gt; stlxrb w24, w28, [x25]; cbnz w24, 1b
; Inst 8: ldp x24, x25, [sp], #16
; Inst 9: ldp x26, x27, [sp], #16
; Inst 10: ldr x28, [sp], #16
; Inst 11: ldp fp, lr, [sp], #16
; Inst 12: ret
; }}
function %atomic_rmw_umax_i64(i64, i64) {
block0(v0: i64, v1: i64):
v2 = atomic_rmw.i64 umax v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 13)
; Inst 0: stp fp, lr, [sp, #-16]!
; Inst 1: mov fp, sp
; Inst 2: str x28, [sp, #-16]!
; Inst 3: stp x26, x27, [sp, #-16]!
; Inst 4: stp x24, x25, [sp, #-16]!
; Inst 5: mov x25, x0
; Inst 6: mov x26, x1
; Inst 7: 1: ldaxr x27, [x25]; cmp x27, x26; csel x28, x27, x26, hi; stlxr w24, x28, [x25]; cbnz w24, 1b
; Inst 8: ldp x24, x25, [sp], #16
; Inst 9: ldp x26, x27, [sp], #16
; Inst 10: ldr x28, [sp], #16
; Inst 11: ldp fp, lr, [sp], #16
; Inst 12: ret
; }}
function %atomic_rmw_umax_i32(i64, i32) {
block0(v0: i64, v1: i32):
v2 = atomic_rmw.i32 umax v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 13)
; Inst 0: stp fp, lr, [sp, #-16]!
; Inst 1: mov fp, sp
; Inst 2: str x28, [sp, #-16]!
; Inst 3: stp x26, x27, [sp, #-16]!
; Inst 4: stp x24, x25, [sp, #-16]!
; Inst 5: mov x25, x0
; Inst 6: mov x26, x1
; Inst 7: 1: ldaxr w27, [x25]; cmp w27, w26; csel w28, w27, w26, hi; stlxr w24, w28, [x25]; cbnz w24, 1b
; Inst 8: ldp x24, x25, [sp], #16
; Inst 9: ldp x26, x27, [sp], #16
; Inst 10: ldr x28, [sp], #16
; Inst 11: ldp fp, lr, [sp], #16
; Inst 12: ret
; }}
function %atomic_rmw_umax_i16(i64, i16) {
block0(v0: i64, v1: i16):
v2 = atomic_rmw.i16 umax v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 13)
; Inst 0: stp fp, lr, [sp, #-16]!
; Inst 1: mov fp, sp
; Inst 2: str x28, [sp, #-16]!
; Inst 3: stp x26, x27, [sp, #-16]!
; Inst 4: stp x24, x25, [sp, #-16]!
; Inst 5: mov x25, x0
; Inst 6: mov x26, x1
; Inst 7: 1: ldaxrh w27, [x25]; cmp w27, w26; csel w28, w27, w26, hi; stlxrh w24, w28, [x25]; cbnz w24, 1b
; Inst 8: ldp x24, x25, [sp], #16
; Inst 9: ldp x26, x27, [sp], #16
; Inst 10: ldr x28, [sp], #16
; Inst 11: ldp fp, lr, [sp], #16
; Inst 12: ret
; }}
function %atomic_rmw_umax_i8(i64, i8) {
block0(v0: i64, v1: i8):
v2 = atomic_rmw.i8 umax v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 13)
; Inst 0: stp fp, lr, [sp, #-16]!
; Inst 1: mov fp, sp
; Inst 2: str x28, [sp, #-16]!
; Inst 3: stp x26, x27, [sp, #-16]!
; Inst 4: stp x24, x25, [sp, #-16]!
; Inst 5: mov x25, x0
; Inst 6: mov x26, x1
; Inst 7: 1: ldaxrb w27, [x25]; cmp w27, w26; csel w28, w27, w26, hi; stlxrb w24, w28, [x25]; cbnz w24, 1b
; Inst 8: ldp x24, x25, [sp], #16
; Inst 9: ldp x26, x27, [sp], #16
; Inst 10: ldr x28, [sp], #16
; Inst 11: ldp fp, lr, [sp], #16
; Inst 12: ret
; }}
function %atomic_rmw_smin_i64(i64, i64) {
block0(v0: i64, v1: i64):
v2 = atomic_rmw.i64 smin v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 13)
; Inst 0: stp fp, lr, [sp, #-16]!
; Inst 1: mov fp, sp
; Inst 2: str x28, [sp, #-16]!
; Inst 3: stp x26, x27, [sp, #-16]!
; Inst 4: stp x24, x25, [sp, #-16]!
; Inst 5: mov x25, x0
; Inst 6: mov x26, x1
; Inst 7: 1: ldaxr x27, [x25]; cmp x27, x26; csel x28, x27, x26, lt; stlxr w24, x28, [x25]; cbnz w24, 1b
; Inst 8: ldp x24, x25, [sp], #16
; Inst 9: ldp x26, x27, [sp], #16
; Inst 10: ldr x28, [sp], #16
; Inst 11: ldp fp, lr, [sp], #16
; Inst 12: ret
; }}
function %atomic_rmw_smin_i32(i64, i32) {
block0(v0: i64, v1: i32):
v2 = atomic_rmw.i32 smin v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 13)
; Inst 0: stp fp, lr, [sp, #-16]!
; Inst 1: mov fp, sp
; Inst 2: str x28, [sp, #-16]!
; Inst 3: stp x26, x27, [sp, #-16]!
; Inst 4: stp x24, x25, [sp, #-16]!
; Inst 5: mov x25, x0
; Inst 6: mov x26, x1
; Inst 7: 1: ldaxr w27, [x25]; cmp w27, w26; csel w28, w27, w26, lt; stlxr w24, w28, [x25]; cbnz w24, 1b
; Inst 8: ldp x24, x25, [sp], #16
; Inst 9: ldp x26, x27, [sp], #16
; Inst 10: ldr x28, [sp], #16
; Inst 11: ldp fp, lr, [sp], #16
; Inst 12: ret
; }}
function %atomic_rmw_smin_i16(i64, i16) {
block0(v0: i64, v1: i16):
v2 = atomic_rmw.i16 smin v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 13)
; Inst 0: stp fp, lr, [sp, #-16]!
; Inst 1: mov fp, sp
; Inst 2: str x28, [sp, #-16]!
; Inst 3: stp x26, x27, [sp, #-16]!
; Inst 4: stp x24, x25, [sp, #-16]!
; Inst 5: mov x25, x0
; Inst 6: mov x26, x1
; Inst 7: 1: ldaxrh w27, [x25]; cmp w27, w26; csel w28, w27, w26, lt; stlxrh w24, w28, [x25]; cbnz w24, 1b
; Inst 8: ldp x24, x25, [sp], #16
; Inst 9: ldp x26, x27, [sp], #16
; Inst 10: ldr x28, [sp], #16
; Inst 11: ldp fp, lr, [sp], #16
; Inst 12: ret
; }}
function %atomic_rmw_smin_i8(i64, i8) {
block0(v0: i64, v1: i8):
v2 = atomic_rmw.i8 smin v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 13)
; Inst 0: stp fp, lr, [sp, #-16]!
; Inst 1: mov fp, sp
; Inst 2: str x28, [sp, #-16]!
; Inst 3: stp x26, x27, [sp, #-16]!
; Inst 4: stp x24, x25, [sp, #-16]!
; Inst 5: mov x25, x0
; Inst 6: mov x26, x1
; Inst 7: 1: ldaxrb w27, [x25]; cmp w27, w26; csel w28, w27, w26, lt; stlxrb w24, w28, [x25]; cbnz w24, 1b
; Inst 8: ldp x24, x25, [sp], #16
; Inst 9: ldp x26, x27, [sp], #16
; Inst 10: ldr x28, [sp], #16
; Inst 11: ldp fp, lr, [sp], #16
; Inst 12: ret
; }}
function %atomic_rmw_umin_i64(i64, i64) {
block0(v0: i64, v1: i64):
v2 = atomic_rmw.i64 umin v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 13)
; Inst 0: stp fp, lr, [sp, #-16]!
; Inst 1: mov fp, sp
; Inst 2: str x28, [sp, #-16]!
; Inst 3: stp x26, x27, [sp, #-16]!
; Inst 4: stp x24, x25, [sp, #-16]!
; Inst 5: mov x25, x0
; Inst 6: mov x26, x1
; Inst 7: 1: ldaxr x27, [x25]; cmp x27, x26; csel x28, x27, x26, lo; stlxr w24, x28, [x25]; cbnz w24, 1b
; Inst 8: ldp x24, x25, [sp], #16
; Inst 9: ldp x26, x27, [sp], #16
; Inst 10: ldr x28, [sp], #16
; Inst 11: ldp fp, lr, [sp], #16
; Inst 12: ret
; }}
function %atomic_rmw_umin_i32(i64, i32) {
block0(v0: i64, v1: i32):
v2 = atomic_rmw.i32 umin v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 13)
; Inst 0: stp fp, lr, [sp, #-16]!
; Inst 1: mov fp, sp
; Inst 2: str x28, [sp, #-16]!
; Inst 3: stp x26, x27, [sp, #-16]!
; Inst 4: stp x24, x25, [sp, #-16]!
; Inst 5: mov x25, x0
; Inst 6: mov x26, x1
; Inst 7: 1: ldaxr w27, [x25]; cmp w27, w26; csel w28, w27, w26, lo; stlxr w24, w28, [x25]; cbnz w24, 1b
; Inst 8: ldp x24, x25, [sp], #16
; Inst 9: ldp x26, x27, [sp], #16
; Inst 10: ldr x28, [sp], #16
; Inst 11: ldp fp, lr, [sp], #16
; Inst 12: ret
; }}
function %atomic_rmw_umin_i16(i64, i16) {
block0(v0: i64, v1: i16):
v2 = atomic_rmw.i16 umin v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 13)
; Inst 0: stp fp, lr, [sp, #-16]!
; Inst 1: mov fp, sp
; Inst 2: str x28, [sp, #-16]!
; Inst 3: stp x26, x27, [sp, #-16]!
; Inst 4: stp x24, x25, [sp, #-16]!
; Inst 5: mov x25, x0
; Inst 6: mov x26, x1
; Inst 7: 1: ldaxrh w27, [x25]; cmp w27, w26; csel w28, w27, w26, lo; stlxrh w24, w28, [x25]; cbnz w24, 1b
; Inst 8: ldp x24, x25, [sp], #16
; Inst 9: ldp x26, x27, [sp], #16
; Inst 10: ldr x28, [sp], #16
; Inst 11: ldp fp, lr, [sp], #16
; Inst 12: ret
; }}
function %atomic_rmw_umin_i8(i64, i8) {
block0(v0: i64, v1: i8):
v2 = atomic_rmw.i8 umin v0, v1
return
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 13)
; Inst 0: stp fp, lr, [sp, #-16]!
; Inst 1: mov fp, sp
; Inst 2: str x28, [sp, #-16]!
; Inst 3: stp x26, x27, [sp, #-16]!
; Inst 4: stp x24, x25, [sp, #-16]!
; Inst 5: mov x25, x0
; Inst 6: mov x26, x1
; Inst 7: 1: ldaxrb w27, [x25]; cmp w27, w26; csel w28, w27, w26, lo; stlxrb w24, w28, [x25]; cbnz w24, 1b
; Inst 8: ldp x24, x25, [sp], #16
; Inst 9: ldp x26, x27, [sp], #16
; Inst 10: ldr x28, [sp], #16
; Inst 11: ldp fp, lr, [sp], #16
; Inst 12: ret
; }}