diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle
index aaede3ab56..0df0f53c5b 100644
--- a/cranelift/codegen/src/isa/x64/inst.isle
+++ b/cranelift/codegen/src/isa/x64/inst.isle
@@ -415,10 +415,10 @@
     ;; The sequence consists of an initial "normal" load from `dst`, followed
     ;; by a loop which computes the new value and tries to compare-and-swap
     ;; ("CAS") it into `dst`, using the native instruction `lock
-    ;; cmpxchg{b,w,l,q}` . The loop iterates until the CAS is successful.
-    ;; If there is no contention, there will be only one pass through the
-    ;; loop body. The sequence does *not* perform any explicit memory fence
-    ;; instructions (mfence/sfence/lfence).
+    ;; cmpxchg{b,w,l,q}`. The loop iterates until the CAS is successful. If
+    ;; there is no contention, there will be only one pass through the loop
+    ;; body. The sequence does *not* perform any explicit memory fence
+    ;; instructions (`mfence`/`sfence`/`lfence`).
     ;;
     ;; Note that the transaction is atomic in the sense that, as observed by
     ;; some other thread, `dst` either has the initial or final value, but no
@@ -430,15 +430,12 @@
     ;; problem.
     ;;
     ;; This instruction sequence has fixed register uses as follows:
-    ;;
-    ;; %r9 (read) address
-    ;; %r10 (read) second operand for `op`
-    ;; %r11 (written) scratch reg; value afterwards has no meaning
-    ;; %rax (written) the old value at %r9
-    ;; %rflags is written. Do not assume anything about it after the instruction.
+    ;; - %rax (written) the old value at `mem`
+    ;; - %rflags is written. Do not assume anything about it after the
+    ;;   instruction.
     (AtomicRmwSeq (ty Type) ;; I8, I16, I32, or I64
-                  (op AtomicRmwOp)
-                  (address Reg)
+                  (op MachAtomicRmwOp)
+                  (mem SyntheticAmode)
                   (operand Reg)
                   (temp WritableReg)
                   (dst_old WritableReg))
@@ -2921,6 +2918,19 @@
       (_ Unit (emit (MInst.LockCmpxchg ty replacement expected addr dst))))
     dst))
 
+(decl x64_atomic_rmw_seq (Type MachAtomicRmwOp SyntheticAmode Gpr) Gpr)
+(rule (x64_atomic_rmw_seq ty op mem input)
+  (let ((dst WritableGpr (temp_writable_gpr))
+        (tmp WritableGpr (temp_writable_gpr))
+        (_ Unit (emit (MInst.AtomicRmwSeq ty op mem input tmp dst))))
+    dst))
+
+;; CLIF IR has one enumeration for atomic operations (`AtomicRmwOp`) while the
+;; mach backend has another (`MachAtomicRmwOp`)--this converts one to the other.
+(type MachAtomicRmwOp extern (enum))
+(decl atomic_rmw_op_to_mach_atomic_rmw_op (AtomicRmwOp) MachAtomicRmwOp)
+(extern constructor atomic_rmw_op_to_mach_atomic_rmw_op atomic_rmw_op_to_mach_atomic_rmw_op)
+
 ;;;; Automatic conversions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (convert Gpr InstOutput output_gpr)
@@ -2973,6 +2983,7 @@
 
 (convert SyntheticAmode XmmMem synthetic_amode_to_xmm_mem)
 (convert IntCC CC intcc_to_cc)
+(convert AtomicRmwOp MachAtomicRmwOp atomic_rmw_op_to_mach_atomic_rmw_op)
 
 (decl reg_to_xmm_mem (Reg) XmmMem)
 (rule (reg_to_xmm_mem r)
diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs
index 08f2331afd..6d5e29b999 100644
--- a/cranelift/codegen/src/isa/x64/inst/args.rs
+++ b/cranelift/codegen/src/isa/x64/inst/args.rs
@@ -306,7 +306,7 @@ impl Amode {
         }
     }
 
-    /// Add the regs mentioned by `self` to `collector`.
+    /// Add the registers mentioned by `self` to `collector`.
     pub(crate) fn get_operands<F: Fn(VReg) -> VReg>(
         &self,
         collector: &mut OperandCollector<'_, F>,
@@ -325,6 +325,25 @@ impl Amode {
         }
     }
 
+    /// Same as `get_operands`, but add the registers in the "late" phase.
+    pub(crate) fn get_operands_late<F: Fn(VReg) -> VReg>(
+        &self,
+        collector: &mut OperandCollector<'_, F>,
+    ) {
+        match self {
+            Amode::ImmReg { base, .. } => {
+                collector.reg_late_use(*base);
+            }
+            Amode::ImmRegRegShift { base, index, .. } => {
+                collector.reg_late_use(base.to_reg());
+                collector.reg_late_use(index.to_reg());
+            }
+            Amode::RipRelative { .. } => {
+                // RIP isn't involved in regalloc.
+            }
+        }
+    }
+
     pub(crate) fn get_flags(&self) -> MemFlags {
         match self {
             Amode::ImmReg { flags, .. } => *flags,
@@ -426,7 +445,7 @@ impl SyntheticAmode {
         }
     }
 
-    /// Add the regs mentioned by `self` to `collector`.
+    /// Add the registers mentioned by `self` to `collector`.
     pub(crate) fn get_operands<F: Fn(VReg) -> VReg>(
         &self,
         collector: &mut OperandCollector<'_, F>,
@@ -440,6 +459,20 @@ impl SyntheticAmode {
         }
     }
 
+    /// Same as `get_operands`, but add the register in the "late" phase.
+    pub(crate) fn get_operands_late<F: Fn(VReg) -> VReg>(
+        &self,
+        collector: &mut OperandCollector<'_, F>,
+    ) {
+        match self {
+            SyntheticAmode::Real(addr) => addr.get_operands_late(collector),
+            SyntheticAmode::NominalSPOffset { .. } => {
+                // Nothing to do; the base is SP and isn't involved in regalloc.
+            }
+            SyntheticAmode::ConstantOffset(_) => {}
+        }
+    }
+
     pub(crate) fn finalize(&self, state: &mut EmitState, buffer: &MachBuffer<Inst>) -> Amode {
         match self {
             SyntheticAmode::Real(addr) => addr.clone(),
diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs
index 3002f8dd67..9e9a66f881 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -2613,118 +2613,116 @@ pub(crate) fn emit(
         Inst::AtomicRmwSeq {
             ty,
             op,
-            address,
+            mem,
             operand,
             temp,
             dst_old,
         } => {
-            // FIXME: use real vregs for this seq.
-            debug_assert_eq!(*address, regs::r9());
-            debug_assert_eq!(*operand, regs::r10());
-            debug_assert_eq!(temp.to_reg(), regs::r11());
+            let operand = allocs.next(*operand);
+            let temp = allocs.next_writable(*temp);
+            let dst_old = allocs.next_writable(*dst_old);
             debug_assert_eq!(dst_old.to_reg(), regs::rax());
+            let mem = mem.finalize(state, sink).with_allocs(allocs);
 
             // Emit this:
-            //
-            //   mov{zbq,zwq,zlq,q} (%r9), %rax // rax = old value
-            //  again:
-            //   movq %rax, %r11 // rax = old value, r11 = old value
-            //   `op`q %r10, %r11 // rax = old value, r11 = new value
-            //   lock cmpxchg{b,w,l,q} %r11, (%r9) // try to store new value
+            //   mov{zbq,zwq,zlq,q} (%r_address), %rax // rax = old value
+            //  again:
+            //   movq %rax, %r_temp // rax = old value, r_temp = old value
+            //   `op`q %r_operand, %r_temp // rax = old value, r_temp = new value
+            //   lock cmpxchg{b,w,l,q} %r_temp, (%r_address) // try to store new value
             //   jnz again // If this is taken, rax will have a "revised" old value
             //
-            // Operand conventions:
-            //   IN: %r9 (addr), %r10 (2nd arg for `op`)
-            //   OUT: %rax (old value), %r11 (trashed), %rflags (trashed)
+            // Operand conventions: IN: %r_address, %r_operand OUT: %rax (old
+            // value), %r_temp (trashed), %rflags (trashed)
             //
-            // In the case where the operation is 'xchg', the "`op`q" instruction is instead
-            //   movq %r10, %r11
-            // so that we simply write in the destination, the "2nd arg for `op`".
-            let rax = regs::rax();
-            let r9 = regs::r9();
-            let r10 = regs::r10();
-            let r11 = regs::r11();
-            let rax_w = Writable::from_reg(rax);
-            let r11_w = Writable::from_reg(r11);
-            let amode = Amode::imm_reg(0, r9);
+            // In the case where the operation is 'xchg', the "`op`q"
+            // instruction is instead: movq %r_operand,
+            // %r_temp so that we simply write in the destination, the "2nd
+            // arg for `op`".
+            //
+            // TODO: this sequence can be significantly improved (e.g., to `lock
+            // <op>`) when it is known that `dst_old` is not used later, see
+            // https://github.com/bytecodealliance/wasmtime/issues/2153.
             let again_label = sink.get_label();
 
-            // mov{zbq,zwq,zlq,q} (%r9), %rax
+            // mov{zbq,zwq,zlq,q} (%r_address), %rax
             // No need to call `add_trap` here, since the `i1` emit will do that.
-            let i1 = Inst::load(*ty, amode.clone(), rax_w, ExtKind::ZeroExtend);
+            let i1 = Inst::load(*ty, mem.clone(), dst_old, ExtKind::ZeroExtend);
             i1.emit(&[], sink, info, state);
 
             // again:
             sink.bind_label(again_label);
 
-            // movq %rax, %r11
-            let i2 = Inst::mov_r_r(OperandSize::Size64, rax, r11_w);
+            // movq %rax, %r_temp
+            let i2 = Inst::mov_r_r(OperandSize::Size64, dst_old.to_reg(), temp);
             i2.emit(&[], sink, info, state);
 
-            let r10_rmi = RegMemImm::reg(r10);
+            let operand_rmi = RegMemImm::reg(operand);
+            use inst_common::MachAtomicRmwOp as RmwOp;
             match op {
-                inst_common::AtomicRmwOp::Xchg => {
-                    // movq %r10, %r11
-                    let i3 = Inst::mov_r_r(OperandSize::Size64, r10, r11_w);
+                RmwOp::Xchg => {
+                    // movq %r_operand, %r_temp
+                    let i3 = Inst::mov_r_r(OperandSize::Size64, operand, temp);
                     i3.emit(&[], sink, info, state);
                 }
-                inst_common::AtomicRmwOp::Nand => {
-                    // andq %r10, %r11
+                RmwOp::Nand => {
+                    // andq %r_operand, %r_temp
                     let i3 =
-                        Inst::alu_rmi_r(OperandSize::Size64, AluRmiROpcode::And, r10_rmi, r11_w);
+                        Inst::alu_rmi_r(OperandSize::Size64, AluRmiROpcode::And, operand_rmi, temp);
                     i3.emit(&[], sink, info, state);
 
-                    // notq %r11
-                    let i4 = Inst::not(OperandSize::Size64, r11_w);
+                    // notq %r_temp
+                    let i4 = Inst::not(OperandSize::Size64, temp);
                     i4.emit(&[], sink, info, state);
                 }
-                inst_common::AtomicRmwOp::Umin
-                | inst_common::AtomicRmwOp::Umax
-                | inst_common::AtomicRmwOp::Smin
-                | inst_common::AtomicRmwOp::Smax => {
-                    // cmp %r11, %r10
-                    let i3 = Inst::cmp_rmi_r(OperandSize::from_ty(*ty), RegMemImm::reg(r11), r10);
+                RmwOp::Umin | RmwOp::Umax | RmwOp::Smin | RmwOp::Smax => {
+                    // cmp %r_temp, %r_operand
+                    let i3 = Inst::cmp_rmi_r(
+                        OperandSize::from_ty(*ty),
+                        RegMemImm::reg(temp.to_reg()),
+                        operand,
+                    );
                     i3.emit(&[], sink, info, state);
 
-                    // cmovcc %r10, %r11
+                    // cmovcc %r_operand, %r_temp
                     let cc = match op {
-                        inst_common::AtomicRmwOp::Umin => CC::BE,
-                        inst_common::AtomicRmwOp::Umax => CC::NB,
-                        inst_common::AtomicRmwOp::Smin => CC::LE,
-                        inst_common::AtomicRmwOp::Smax => CC::NL,
+                        RmwOp::Umin => CC::BE,
+                        RmwOp::Umax => CC::NB,
+                        RmwOp::Smin => CC::LE,
+                        RmwOp::Smax => CC::NL,
                         _ => unreachable!(),
                     };
-                    let i4 = Inst::cmove(OperandSize::Size64, cc, RegMem::reg(r10), r11_w);
+                    let i4 = Inst::cmove(OperandSize::Size64, cc, RegMem::reg(operand), temp);
                     i4.emit(&[], sink, info, state);
                 }
                 _ => {
-                    // opq %r10, %r11
+                    // opq %r_operand, %r_temp
                     let alu_op = match op {
-                        inst_common::AtomicRmwOp::Add => AluRmiROpcode::Add,
-                        inst_common::AtomicRmwOp::Sub => AluRmiROpcode::Sub,
-                        inst_common::AtomicRmwOp::And => AluRmiROpcode::And,
-                        inst_common::AtomicRmwOp::Or => AluRmiROpcode::Or,
-                        inst_common::AtomicRmwOp::Xor => AluRmiROpcode::Xor,
-                        inst_common::AtomicRmwOp::Xchg
-                        | inst_common::AtomicRmwOp::Nand
-                        | inst_common::AtomicRmwOp::Umin
-                        | inst_common::AtomicRmwOp::Umax
-                        | inst_common::AtomicRmwOp::Smin
-                        | inst_common::AtomicRmwOp::Smax => unreachable!(),
+                        RmwOp::Add => AluRmiROpcode::Add,
+                        RmwOp::Sub => AluRmiROpcode::Sub,
+                        RmwOp::And => AluRmiROpcode::And,
+                        RmwOp::Or => AluRmiROpcode::Or,
+                        RmwOp::Xor => AluRmiROpcode::Xor,
+                        RmwOp::Xchg
+                        | RmwOp::Nand
+                        | RmwOp::Umin
+                        | RmwOp::Umax
+                        | RmwOp::Smin
+                        | RmwOp::Smax => unreachable!(),
                     };
-                    let i3 = Inst::alu_rmi_r(OperandSize::Size64, alu_op, r10_rmi, r11_w);
+                    let i3 = Inst::alu_rmi_r(OperandSize::Size64, alu_op, operand_rmi, temp);
                     i3.emit(&[], sink, info, state);
                 }
             }
 
-            // lock cmpxchg{b,w,l,q} %r11, (%r9)
+            // lock cmpxchg{b,w,l,q} %r_temp, (%r_address)
             // No need to call `add_trap` here, since the `i4` emit will do that.
             let i4 = Inst::LockCmpxchg {
                 ty: *ty,
-                replacement: r11,
-                expected: regs::rax(),
-                mem: amode.into(),
-                dst_old: Writable::from_reg(regs::rax()),
+                replacement: temp.to_reg(),
+                expected: dst_old.to_reg(),
+                mem: mem.into(),
+                dst_old,
             };
             i4.emit(&[], sink, info, state);
diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
index 06166a55bd..4bcf936f14 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
@@ -4611,6 +4611,8 @@ fn test_x64_emit() {
         3,
     )
     .into();
+    // Use `r9` with a 0 offset.
+    let am3: SyntheticAmode = Amode::imm_reg(0, r9).into();
 
     // A general 8-bit case.
     insns.push((
@@ -4743,8 +4745,8 @@
     insns.push((
         Inst::AtomicRmwSeq {
             ty: types::I8,
-            op: inst_common::AtomicRmwOp::Or,
-            address: r9,
+            op: inst_common::MachAtomicRmwOp::Or,
+            mem: am3.clone(),
             operand: r10,
             temp: w_r11,
             dst_old: w_rax
@@ -4755,8 +4757,8 @@
     insns.push((
         Inst::AtomicRmwSeq {
             ty: types::I16,
-            op: inst_common::AtomicRmwOp::And,
-            address: r9,
+            op: inst_common::MachAtomicRmwOp::And,
+            mem: am3.clone(),
             operand: r10,
             temp: w_r11,
             dst_old: w_rax
@@ -4767,8 +4769,8 @@
     insns.push((
         Inst::AtomicRmwSeq {
             ty: types::I32,
-            op: inst_common::AtomicRmwOp::Xchg,
-            address: r9,
+            op: inst_common::MachAtomicRmwOp::Xchg,
+            mem: am3.clone(),
             operand: r10,
             temp: w_r11,
             dst_old: w_rax
@@ -4779,8 +4781,8 @@
     insns.push((
         Inst::AtomicRmwSeq {
             ty: types::I32,
-            op: inst_common::AtomicRmwOp::Umin,
-            address: r9,
+            op: inst_common::MachAtomicRmwOp::Umin,
+            mem: am3.clone(),
             operand: r10,
             temp: w_r11,
             dst_old: w_rax
@@ -4791,8 +4793,8 @@
     insns.push((
         Inst::AtomicRmwSeq {
             ty: types::I64,
-            op: inst_common::AtomicRmwOp::Add,
-            address: r9,
+            op: inst_common::MachAtomicRmwOp::Add,
+            mem: am3.clone(),
             operand: r10,
             temp: w_r11,
             dst_old: w_rax
diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs
index f6250859a2..a9d0a79146 100644
--- a/cranelift/codegen/src/isa/x64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/x64/inst/mod.rs
@@ -2052,13 +2052,19 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
             mem.get_operands(collector);
         }
 
-        Inst::AtomicRmwSeq { .. } => {
-            // FIXME: take vreg args, not fixed regs, and just use
-            // reg_fixed_use here.
-            collector.reg_use(regs::r9());
-            collector.reg_use(regs::r10());
-            collector.reg_def(Writable::from_reg(regs::r11()));
-            collector.reg_def(Writable::from_reg(regs::rax()));
+        Inst::AtomicRmwSeq {
+            operand,
+            temp,
+            dst_old,
+            mem,
+            ..
+        } => {
+            collector.reg_late_use(*operand);
+            collector.reg_early_def(*temp);
+            // This `fixed_def` is needed because `CMPXCHG` always uses this
+            // register implicitly.
+            collector.reg_fixed_def(*dst_old, regs::rax());
+            mem.get_operands_late(collector)
         }
 
         Inst::Ret { rets } => {
diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle
index ed1cfe5579..848794f85e 100644
--- a/cranelift/codegen/src/isa/x64/lower.isle
+++ b/cranelift/codegen/src/isa/x64/lower.isle
@@ -2851,3 +2851,19 @@
 (rule (lower (has_type (and (fits_in_64 ty) (ty_int _))
                        (atomic_cas flags address expected replacement)))
       (x64_cmpxchg ty expected replacement (to_amode flags address (zero_offset))))
+
+;; Rules for `atomic_rmw` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; This is a simple, general-case atomic update, based on a loop involving
+;; `cmpxchg`. Note that we could do much better than this in the case where the
+;; old value at the location (that is to say, the SSA `Value` computed by this
+;; CLIF instruction) is not required. In that case, we could instead implement
+;; this using a single `lock`-prefixed x64 read-modify-write instruction. Also,
+;; even in the case where the old value is required, for the `add` and `sub`
+;; cases, we can use the single instruction `lock xadd`. However, those
+;; improvements have been left for another day. TODO: filed as
+;; https://github.com/bytecodealliance/wasmtime/issues/2153.
+
+(rule (lower (has_type (and (fits_in_64 ty) (ty_int _))
+                       (atomic_rmw flags op address input)))
+      (x64_atomic_rmw_seq ty op (to_amode flags address (zero_offset)) input))
diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs
index 98b56bb6ca..2c4641296d 100644
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -44,14 +44,6 @@ fn is_bool_ty(ty: Type) -> bool {
     }
 }
 
-/// This is target-word-size dependent. And it excludes booleans and reftypes.
-fn is_valid_atomic_transaction_ty(ty: Type) -> bool {
-    match ty {
-        types::I8 | types::I16 | types::I32 | types::I64 => true,
-        _ => false,
-    }
-}
-
 /// Returns whether the given specified `input` is a result produced by an instruction with Opcode
 /// `op`.
 // TODO investigate failures with checking against the result index.
@@ -2136,54 +2128,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         }
 
         Opcode::AtomicRmw => {
-            // This is a simple, general-case atomic update, based on a loop involving
-            // `cmpxchg`. Note that we could do much better than this in the case where the old
-            // value at the location (that is to say, the SSA `Value` computed by this CLIF
-            // instruction) is not required. In that case, we could instead implement this
-            // using a single `lock`-prefixed x64 read-modify-write instruction. Also, even in
-            // the case where the old value is required, for the `add` and `sub` cases, we can
-            // use the single instruction `lock xadd`. However, those improvements have been
-            // left for another day.
-            // TODO: filed as https://github.com/bytecodealliance/wasmtime/issues/2153
-            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            let mut addr = put_input_in_reg(ctx, inputs[0]);
-            let mut arg2 = put_input_in_reg(ctx, inputs[1]);
-            let ty_access = ty.unwrap();
-            assert!(is_valid_atomic_transaction_ty(ty_access));
-
-            // Make sure that both args are in virtual regs, since in effect we have to do a
-            // parallel copy to get them safely to the AtomicRmwSeq input regs, and that's not
-            // guaranteed safe if either is in a real reg.
-            addr = ctx.ensure_in_vreg(addr, types::I64);
-            arg2 = ctx.ensure_in_vreg(arg2, types::I64);
-
-            // Move the args to the preordained AtomicRMW input regs. Note that `AtomicRmwSeq`
-            // operates at whatever width is specified by `ty`, so there's no need to
-            // zero-extend `arg2` in the case of `ty` being I8/I16/I32.
-            ctx.emit(Inst::gen_move(
-                Writable::from_reg(regs::r9()),
-                addr,
-                types::I64,
-            ));
-            ctx.emit(Inst::gen_move(
-                Writable::from_reg(regs::r10()),
-                arg2,
-                types::I64,
-            ));
-
-            // Now the AtomicRmwSeq (pseudo-) instruction itself
-            let op = inst_common::AtomicRmwOp::from(ctx.data(insn).atomic_rmw_op().unwrap());
-            ctx.emit(Inst::AtomicRmwSeq {
-                ty: ty_access,
-                op,
-                address: regs::r9(),
-                operand: regs::r10(),
-                temp: Writable::from_reg(regs::r11()),
-                dst_old: Writable::from_reg(regs::rax()),
-            });
-
-            // And finally, copy the preordained AtomicRmwSeq output reg to its destination.
-            ctx.emit(Inst::gen_move(dst, regs::rax(), types::I64));
+            implemented_in_isle(ctx);
         }
 
         Opcode::AtomicCas => {
diff --git a/cranelift/codegen/src/isa/x64/lower/isle.rs b/cranelift/codegen/src/isa/x64/lower/isle.rs
index b8a71206a6..9b068b1eba 100644
--- a/cranelift/codegen/src/isa/x64/lower/isle.rs
+++ b/cranelift/codegen/src/isa/x64/lower/isle.rs
@@ -2,7 +2,10 @@
 // Pull in the ISLE generated code.
 pub(crate) mod generated_code;
 
-use crate::machinst::{InputSourceInst, Reg, Writable};
+use crate::{
+    ir::AtomicRmwOp,
+    machinst::{InputSourceInst, Reg, Writable},
+};
 use generated_code::MInst;
 
 // Types that the generated ISLE code uses via `use super::*`.
@@ -23,7 +26,7 @@ use crate::{
         },
     },
     machinst::{
-        isle::*, AtomicRmwOp, InsnInput, InsnOutput, LowerCtx, VCodeConstant, VCodeConstantData,
+        isle::*, InsnInput, InsnOutput, LowerCtx, MachAtomicRmwOp, VCodeConstant, VCodeConstantData,
     },
 };
 use std::boxed::Box;
@@ -565,6 +568,11 @@
     fn zero_offset(&mut self) -> Offset32 {
         Offset32::new(0)
     }
+
+    #[inline]
+    fn atomic_rmw_op_to_mach_atomic_rmw_op(&mut self, op: &AtomicRmwOp) -> MachAtomicRmwOp {
+        MachAtomicRmwOp::from(*op)
+    }
 }
 
 // Since x64 doesn't have 8x16 shifts and we must use a 16x8 shift instead, we
diff --git a/cranelift/codegen/src/machinst/inst_common.rs b/cranelift/codegen/src/machinst/inst_common.rs
index a4fb41ec76..740a0346cc 100644
--- a/cranelift/codegen/src/machinst/inst_common.rs
+++ b/cranelift/codegen/src/machinst/inst_common.rs
@@ -45,11 +45,10 @@ pub(crate) fn insn_outputs<I: VCodeInst, C: LowerCtx<I = I>>(
 //============================================================================
 // Atomic instructions.
 
-/// Atomic memory update operations. As of 21 Aug 2020 these are used for the aarch64 and x64
-/// targets.
+/// Atomic memory update operations.
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
 #[repr(u8)]
-pub enum AtomicRmwOp {
+pub enum MachAtomicRmwOp {
     /// Add
     Add,
     /// Sub
     Sub,
     /// And
     And,
     /// Nand
     Nand,
     /// Or
     Or,
     /// Xor
     Xor,
     /// Exchange
     Xchg,
     /// Unsigned min
     Umin,
     /// Unsigned max
     Umax,
     /// Signed min
     Smin,
     /// Signed max
     Smax,
 }
 
-impl AtomicRmwOp {
-    /// Converts an `ir::AtomicRmwOp` to the corresponding `inst_common::AtomicRmwOp`.
+impl MachAtomicRmwOp {
+    /// Converts an `ir::AtomicRmwOp` to the corresponding
+    /// `inst_common::MachAtomicRmwOp`.
     pub fn from(ir_op: ir::AtomicRmwOp) -> Self {
         match ir_op {
-            ir::AtomicRmwOp::Add => AtomicRmwOp::Add,
-            ir::AtomicRmwOp::Sub => AtomicRmwOp::Sub,
-            ir::AtomicRmwOp::And => AtomicRmwOp::And,
-            ir::AtomicRmwOp::Nand => AtomicRmwOp::Nand,
-            ir::AtomicRmwOp::Or => AtomicRmwOp::Or,
-            ir::AtomicRmwOp::Xor => AtomicRmwOp::Xor,
-            ir::AtomicRmwOp::Xchg => AtomicRmwOp::Xchg,
-            ir::AtomicRmwOp::Umin => AtomicRmwOp::Umin,
-            ir::AtomicRmwOp::Umax => AtomicRmwOp::Umax,
-            ir::AtomicRmwOp::Smin => AtomicRmwOp::Smin,
-            ir::AtomicRmwOp::Smax => AtomicRmwOp::Smax,
+            ir::AtomicRmwOp::Add => MachAtomicRmwOp::Add,
+            ir::AtomicRmwOp::Sub => MachAtomicRmwOp::Sub,
+            ir::AtomicRmwOp::And => MachAtomicRmwOp::And,
+            ir::AtomicRmwOp::Nand => MachAtomicRmwOp::Nand,
+            ir::AtomicRmwOp::Or => MachAtomicRmwOp::Or,
+            ir::AtomicRmwOp::Xor => MachAtomicRmwOp::Xor,
+            ir::AtomicRmwOp::Xchg => MachAtomicRmwOp::Xchg,
+            ir::AtomicRmwOp::Umin => MachAtomicRmwOp::Umin,
+            ir::AtomicRmwOp::Umax => MachAtomicRmwOp::Umax,
+            ir::AtomicRmwOp::Smin => MachAtomicRmwOp::Smin,
+            ir::AtomicRmwOp::Smax => MachAtomicRmwOp::Smax,
         }
     }
 }
diff --git a/cranelift/codegen/src/machinst/reg.rs b/cranelift/codegen/src/machinst/reg.rs
index 671edd59df..5c4bd494a3 100644
--- a/cranelift/codegen/src/machinst/reg.rs
+++ b/cranelift/codegen/src/machinst/reg.rs
@@ -328,6 +328,11 @@ impl<'a, F: Fn(VReg) -> VReg> OperandCollector<'a, F> {
         self.add_operand(Operand::reg_use(reg.into()));
     }
 
+    /// Add a register use, at the end of the instruction (`After` position).
+    pub fn reg_late_use(&mut self, reg: Reg) {
+        self.add_operand(Operand::reg_use_at_end(reg.into()));
+    }
+
     /// Add multiple register uses.
     pub fn reg_uses(&mut self, regs: &[Reg]) {
         for &reg in regs {
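
For context on what the pseudo-instruction guarantees, the behavior this patch re-plumbs (without changing it) can be modeled in plain Rust. The sketch below is illustrative only: `RmwOp` and `atomic_rmw` are hypothetical stand-ins for `MachAtomicRmwOp` and the emitted load / compute / `lock cmpxchg` / retry loop, modeled at 64 bits on the host; none of it is code from this patch.

```rust
use std::sync::atomic::{AtomicU64, Ordering};

/// Hypothetical stand-in for `MachAtomicRmwOp`; same variants.
#[allow(dead_code)]
#[derive(Clone, Copy)]
enum RmwOp {
    Add, Sub, And, Nand, Or, Xor, Xchg, Umin, Umax, Smin, Smax,
}

/// Applies `op` to `mem` and returns the *old* value -- the same contract
/// `AtomicRmwSeq` implements. Modeled at 64 bits only; the real
/// pseudo-instruction also handles the I8/I16/I32 widths.
fn atomic_rmw(mem: &AtomicU64, op: RmwOp, operand: u64) -> u64 {
    // Initial plain load: mov{zbq,zwq,zlq,q} (%r_address), %rax.
    let mut old = mem.load(Ordering::Relaxed);
    loop {
        // The new value computed into %r_temp before the CAS attempt.
        let new = match op {
            RmwOp::Add => old.wrapping_add(operand),
            RmwOp::Sub => old.wrapping_sub(operand),
            RmwOp::And => old & operand,
            RmwOp::Nand => !(old & operand),
            RmwOp::Or => old | operand,
            RmwOp::Xor => old ^ operand,
            RmwOp::Xchg => operand,
            RmwOp::Umin => old.min(operand),
            RmwOp::Umax => old.max(operand),
            RmwOp::Smin => (old as i64).min(operand as i64) as u64,
            RmwOp::Smax => (old as i64).max(operand as i64) as u64,
        };
        // lock cmpxchg: on failure the CPU leaves the revised old value in
        // %rax, so the loop retries with it (the `jnz again` branch).
        match mem.compare_exchange(old, new, Ordering::SeqCst, Ordering::SeqCst) {
            Ok(prev) => return prev,
            Err(prev) => old = prev,
        }
    }
}

fn main() {
    let mem = AtomicU64::new(5);
    // Returns the old value (5) and leaves 8 in memory, like `dst_old`.
    assert_eq!(atomic_rmw(&mem, RmwOp::Add, 3), 5);
    assert_eq!(mem.load(Ordering::Relaxed), 8);
}
```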