x64: port atomic_rmw to ISLE (#4389)
* x64: port `atomic_rmw` to ISLE

  This change ports `atomic_rmw` to ISLE for the x64 backend. It does not
  change the lowering in any way, though it seems possible that the fixed
  regs need not be as fixed and that there are opportunities for
  single-instruction lowerings. It does rename `inst_common::AtomicRmwOp`
  to `MachAtomicRmwOp` to disambiguate it from the IR enum of the same name.

* x64: remove remaining hardcoded register constraints for `atomic_rmw`

* x64: use `SyntheticAmode` in `AtomicRmwSeq`

* review: add missing reg collector for amode

* review: collect memory registers in the 'late' phase

@@ -415,10 +415,10 @@
 ;; The sequence consists of an initial "normal" load from `dst`, followed
 ;; by a loop which computes the new value and tries to compare-and-swap
 ;; ("CAS") it into `dst`, using the native instruction `lock
-;; cmpxchg{b,w,l,q}` . The loop iterates until the CAS is successful.
-;; If there is no contention, there will be only one pass through the
-;; loop body. The sequence does *not* perform any explicit memory fence
-;; instructions (mfence/sfence/lfence).
+;; cmpxchg{b,w,l,q}`. The loop iterates until the CAS is successful. If
+;; there is no contention, there will be only one pass through the loop
+;; body. The sequence does *not* perform any explicit memory fence
+;; instructions (`mfence`/`sfence`/`lfence`).
 ;;
 ;; Note that the transaction is atomic in the sense that, as observed by
 ;; some other thread, `dst` either has the initial or final value, but no
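
The comment above describes an ordinary software CAS loop. Purely as an illustration of the semantics (plain Rust atomics, not backend code), the sequence behaves like:

    use std::sync::atomic::{AtomicU64, Ordering};

    /// Illustrative sketch of the `AtomicRmwSeq` semantics described above:
    /// an initial load, then a compute + `compare_exchange` loop.
    fn atomic_rmw_seq(dst: &AtomicU64, operand: u64, op: impl Fn(u64, u64) -> u64) -> u64 {
        let mut old = dst.load(Ordering::Relaxed); // initial "normal" load
        loop {
            let new = op(old, operand); // compute the new value
            // The `lock cmpxchg`: store `new` only if `dst` still holds `old`.
            match dst.compare_exchange(old, new, Ordering::SeqCst, Ordering::SeqCst) {
                Ok(_) => return old,     // success: the old value is the result
                Err(seen) => old = seen, // contention: retry with the revised old value
            }
        }
    }

With no contention, `compare_exchange` succeeds immediately, matching the "only one pass through the loop body" remark.
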
@@ -430,15 +430,12 @@
 ;; problem.
 ;;
-;; This instruction sequence has fixed register uses as follows:
-;;
-;; %r9 (read) address
-;; %r10 (read) second operand for `op`
-;; %r11 (written) scratch reg; value afterwards has no meaning
-;; %rax (written) the old value at %r9
-;; %rflags is written. Do not assume anything about it after the instruction.
+;; - %rax (written) the old value at `mem`
+;; - %rflags is written. Do not assume anything about it after the
+;;   instruction.
 (AtomicRmwSeq (ty Type) ;; I8, I16, I32, or I64
-              (op AtomicRmwOp)
-              (address Reg)
+              (op MachAtomicRmwOp)
+              (mem SyntheticAmode)
               (operand Reg)
               (temp WritableReg)
               (dst_old WritableReg))

@@ -2921,6 +2918,19 @@
             (_ Unit (emit (MInst.LockCmpxchg ty replacement expected addr dst))))
         dst))

+(decl x64_atomic_rmw_seq (Type MachAtomicRmwOp SyntheticAmode Gpr) Gpr)
+(rule (x64_atomic_rmw_seq ty op mem input)
+      (let ((dst WritableGpr (temp_writable_gpr))
+            (tmp WritableGpr (temp_writable_gpr))
+            (_ Unit (emit (MInst.AtomicRmwSeq ty op mem input tmp dst))))
+        dst))
+
+;; CLIF IR has one enumeration for atomic operations (`AtomicRmwOp`) while the
+;; mach backend has another (`MachAtomicRmwOp`)--this converts one to the other.
+(type MachAtomicRmwOp extern (enum))
+(decl atomic_rmw_op_to_mach_atomic_rmw_op (AtomicRmwOp) MachAtomicRmwOp)
+(extern constructor atomic_rmw_op_to_mach_atomic_rmw_op atomic_rmw_op_to_mach_atomic_rmw_op)
+
 ;;;; Automatic conversions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

 (convert Gpr InstOutput output_gpr)

@@ -2973,6 +2983,7 @@
 (convert SyntheticAmode XmmMem synthetic_amode_to_xmm_mem)

 (convert IntCC CC intcc_to_cc)
+(convert AtomicRmwOp MachAtomicRmwOp atomic_rmw_op_to_mach_atomic_rmw_op)

 (decl reg_to_xmm_mem (Reg) XmmMem)
 (rule (reg_to_xmm_mem r)

@@ -306,7 +306,7 @@ impl Amode {
         }
     }

-    /// Add the regs mentioned by `self` to `collector`.
+    /// Add the registers mentioned by `self` to `collector`.
     pub(crate) fn get_operands<F: Fn(VReg) -> VReg>(
         &self,
         collector: &mut OperandCollector<'_, F>,

@@ -325,6 +325,25 @@ impl Amode {
         }
     }

+    /// Same as `get_operands`, but add the registers in the "late" phase.
+    pub(crate) fn get_operands_late<F: Fn(VReg) -> VReg>(
+        &self,
+        collector: &mut OperandCollector<'_, F>,
+    ) {
+        match self {
+            Amode::ImmReg { base, .. } => {
+                collector.reg_late_use(*base);
+            }
+            Amode::ImmRegRegShift { base, index, .. } => {
+                collector.reg_late_use(base.to_reg());
+                collector.reg_late_use(index.to_reg());
+            }
+            Amode::RipRelative { .. } => {
+                // RIP isn't involved in regalloc.
+            }
+        }
+    }
+
     pub(crate) fn get_flags(&self) -> MemFlags {
         match self {
             Amode::ImmReg { flags, .. } => *flags,

@@ -426,7 +445,7 @@ impl SyntheticAmode {
         SyntheticAmode::NominalSPOffset { simm32 }
     }

-    /// Add the regs mentioned by `self` to `collector`.
+    /// Add the registers mentioned by `self` to `collector`.
     pub(crate) fn get_operands<F: Fn(VReg) -> VReg>(
         &self,
         collector: &mut OperandCollector<'_, F>,

@@ -440,6 +459,20 @@ impl SyntheticAmode {
         }
     }

+    /// Same as `get_operands`, but add the register in the "late" phase.
+    pub(crate) fn get_operands_late<F: Fn(VReg) -> VReg>(
+        &self,
+        collector: &mut OperandCollector<'_, F>,
+    ) {
+        match self {
+            SyntheticAmode::Real(addr) => addr.get_operands_late(collector),
+            SyntheticAmode::NominalSPOffset { .. } => {
+                // Nothing to do; the base is SP and isn't involved in regalloc.
+            }
+            SyntheticAmode::ConstantOffset(_) => {}
+        }
+    }
+
     pub(crate) fn finalize(&self, state: &mut EmitState, buffer: &MachBuffer<Inst>) -> Amode {
         match self {
             SyntheticAmode::Real(addr) => addr.clone(),

@@ -2613,118 +2613,116 @@ pub(crate) fn emit(
         Inst::AtomicRmwSeq {
             ty,
             op,
-            address,
+            mem,
             operand,
             temp,
             dst_old,
         } => {
-            // FIXME: use real vregs for this seq.
-            debug_assert_eq!(*address, regs::r9());
-            debug_assert_eq!(*operand, regs::r10());
-            debug_assert_eq!(temp.to_reg(), regs::r11());
+            let operand = allocs.next(*operand);
+            let temp = allocs.next_writable(*temp);
+            let dst_old = allocs.next_writable(*dst_old);
             debug_assert_eq!(dst_old.to_reg(), regs::rax());
+            let mem = mem.finalize(state, sink).with_allocs(allocs);

             // Emit this:
             //
-            //   mov{zbq,zwq,zlq,q} (%r9), %rax // rax = old value
-            //  again:
-            //   movq %rax, %r11 // rax = old value, r11 = old value
-            //   `op`q %r10, %r11 // rax = old value, r11 = new value
-            //   lock cmpxchg{b,w,l,q} %r11, (%r9) // try to store new value
+            //   mov{zbq,zwq,zlq,q} (%r_address), %rax // rax = old value
+            //  again:
+            //   movq %rax, %r_temp // rax = old value, r_temp = old value
+            //   `op`q %r_operand, %r_temp // rax = old value, r_temp = new value
+            //   lock cmpxchg{b,w,l,q} %r_temp, (%r_address) // try to store new value
             //   jnz again // If this is taken, rax will have a "revised" old value
             //
-            // Operand conventions:
-            //   IN: %r9 (addr), %r10 (2nd arg for `op`)
-            //   OUT: %rax (old value), %r11 (trashed), %rflags (trashed)
+            // Operand conventions: IN: %r_address, %r_operand OUT: %rax (old
+            // value), %r_temp (trashed), %rflags (trashed)
             //
-            // In the case where the operation is 'xchg', the "`op`q" instruction is instead
-            //   movq %r10, %r11
-            // so that we simply write in the destination, the "2nd arg for `op`".
-            let rax = regs::rax();
-            let r9 = regs::r9();
-            let r10 = regs::r10();
-            let r11 = regs::r11();
-            let rax_w = Writable::from_reg(rax);
-            let r11_w = Writable::from_reg(r11);
-            let amode = Amode::imm_reg(0, r9);
+            // In the case where the operation is 'xchg', the "`op`q"
+            // instruction is instead: movq %r_operand,
+            // %r_temp so that we simply write in the destination, the "2nd
+            // arg for `op`".
+            //
+            // TODO: this sequence can be significantly improved (e.g., to `lock
+            // <op>`) when it is known that `dst_old` is not used later, see
+            // https://github.com/bytecodealliance/wasmtime/issues/2153.
             let again_label = sink.get_label();

-            // mov{zbq,zwq,zlq,q} (%r9), %rax
+            // mov{zbq,zwq,zlq,q} (%r_address), %rax
             // No need to call `add_trap` here, since the `i1` emit will do that.
-            let i1 = Inst::load(*ty, amode.clone(), rax_w, ExtKind::ZeroExtend);
+            let i1 = Inst::load(*ty, mem.clone(), dst_old, ExtKind::ZeroExtend);
             i1.emit(&[], sink, info, state);

             // again:
             sink.bind_label(again_label);

-            // movq %rax, %r11
-            let i2 = Inst::mov_r_r(OperandSize::Size64, rax, r11_w);
+            // movq %rax, %r_temp
+            let i2 = Inst::mov_r_r(OperandSize::Size64, dst_old.to_reg(), temp);
             i2.emit(&[], sink, info, state);

-            let r10_rmi = RegMemImm::reg(r10);
+            let operand_rmi = RegMemImm::reg(operand);
+            use inst_common::MachAtomicRmwOp as RmwOp;
             match op {
-                inst_common::AtomicRmwOp::Xchg => {
-                    // movq %r10, %r11
-                    let i3 = Inst::mov_r_r(OperandSize::Size64, r10, r11_w);
+                RmwOp::Xchg => {
+                    // movq %r_operand, %r_temp
+                    let i3 = Inst::mov_r_r(OperandSize::Size64, operand, temp);
                     i3.emit(&[], sink, info, state);
                 }
-                inst_common::AtomicRmwOp::Nand => {
-                    // andq %r10, %r11
+                RmwOp::Nand => {
+                    // andq %r_operand, %r_temp
                     let i3 =
-                        Inst::alu_rmi_r(OperandSize::Size64, AluRmiROpcode::And, r10_rmi, r11_w);
+                        Inst::alu_rmi_r(OperandSize::Size64, AluRmiROpcode::And, operand_rmi, temp);
                     i3.emit(&[], sink, info, state);

-                    // notq %r11
-                    let i4 = Inst::not(OperandSize::Size64, r11_w);
+                    // notq %r_temp
+                    let i4 = Inst::not(OperandSize::Size64, temp);
                     i4.emit(&[], sink, info, state);
                 }
-                inst_common::AtomicRmwOp::Umin
-                | inst_common::AtomicRmwOp::Umax
-                | inst_common::AtomicRmwOp::Smin
-                | inst_common::AtomicRmwOp::Smax => {
-                    // cmp %r11, %r10
-                    let i3 = Inst::cmp_rmi_r(OperandSize::from_ty(*ty), RegMemImm::reg(r11), r10);
+                RmwOp::Umin | RmwOp::Umax | RmwOp::Smin | RmwOp::Smax => {
+                    // cmp %r_temp, %r_operand
+                    let i3 = Inst::cmp_rmi_r(
+                        OperandSize::from_ty(*ty),
+                        RegMemImm::reg(temp.to_reg()),
+                        operand,
+                    );
                     i3.emit(&[], sink, info, state);

-                    // cmovcc %r10, %r11
+                    // cmovcc %r_operand, %r_temp
                     let cc = match op {
-                        inst_common::AtomicRmwOp::Umin => CC::BE,
-                        inst_common::AtomicRmwOp::Umax => CC::NB,
-                        inst_common::AtomicRmwOp::Smin => CC::LE,
-                        inst_common::AtomicRmwOp::Smax => CC::NL,
+                        RmwOp::Umin => CC::BE,
+                        RmwOp::Umax => CC::NB,
+                        RmwOp::Smin => CC::LE,
+                        RmwOp::Smax => CC::NL,
                         _ => unreachable!(),
                     };
-                    let i4 = Inst::cmove(OperandSize::Size64, cc, RegMem::reg(r10), r11_w);
+                    let i4 = Inst::cmove(OperandSize::Size64, cc, RegMem::reg(operand), temp);
                     i4.emit(&[], sink, info, state);
                 }
                 _ => {
-                    // opq %r10, %r11
+                    // opq %r_operand, %r_temp
                     let alu_op = match op {
-                        inst_common::AtomicRmwOp::Add => AluRmiROpcode::Add,
-                        inst_common::AtomicRmwOp::Sub => AluRmiROpcode::Sub,
-                        inst_common::AtomicRmwOp::And => AluRmiROpcode::And,
-                        inst_common::AtomicRmwOp::Or => AluRmiROpcode::Or,
-                        inst_common::AtomicRmwOp::Xor => AluRmiROpcode::Xor,
-                        inst_common::AtomicRmwOp::Xchg
-                        | inst_common::AtomicRmwOp::Nand
-                        | inst_common::AtomicRmwOp::Umin
-                        | inst_common::AtomicRmwOp::Umax
-                        | inst_common::AtomicRmwOp::Smin
-                        | inst_common::AtomicRmwOp::Smax => unreachable!(),
+                        RmwOp::Add => AluRmiROpcode::Add,
+                        RmwOp::Sub => AluRmiROpcode::Sub,
+                        RmwOp::And => AluRmiROpcode::And,
+                        RmwOp::Or => AluRmiROpcode::Or,
+                        RmwOp::Xor => AluRmiROpcode::Xor,
+                        RmwOp::Xchg
+                        | RmwOp::Nand
+                        | RmwOp::Umin
+                        | RmwOp::Umax
+                        | RmwOp::Smin
+                        | RmwOp::Smax => unreachable!(),
                     };
-                    let i3 = Inst::alu_rmi_r(OperandSize::Size64, alu_op, r10_rmi, r11_w);
+                    let i3 = Inst::alu_rmi_r(OperandSize::Size64, alu_op, operand_rmi, temp);
                     i3.emit(&[], sink, info, state);
                 }
             }

-            // lock cmpxchg{b,w,l,q} %r11, (%r9)
+            // lock cmpxchg{b,w,l,q} %r_temp, (%r_address)
             // No need to call `add_trap` here, since the `i4` emit will do that.
             let i4 = Inst::LockCmpxchg {
                 ty: *ty,
-                replacement: r11,
-                expected: regs::rax(),
-                mem: amode.into(),
-                dst_old: Writable::from_reg(regs::rax()),
+                replacement: temp.to_reg(),
+                expected: dst_old.to_reg(),
+                mem: mem.into(),
+                dst_old,
             };
             i4.emit(&[], sink, info, state);
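
For reference, the per-operation "new value" computation that the loop body above implements can be summarized in plain Rust (illustrative only; the local `RmwOp` enum mirrors `MachAtomicRmwOp`, and `u64` stands in for whatever width `ty` selects):

    // Standalone sketch of the loop-body computation, one arm per operation.
    enum RmwOp { Add, Sub, And, Nand, Or, Xor, Xchg, Umin, Umax, Smin, Smax }

    fn new_value(op: RmwOp, old: u64, operand: u64) -> u64 {
        match op {
            RmwOp::Add => old.wrapping_add(operand),
            RmwOp::Sub => old.wrapping_sub(operand),
            RmwOp::And => old & operand,
            RmwOp::Nand => !(old & operand), // `and` then `not`, as in the Nand arm
            RmwOp::Or => old | operand,
            RmwOp::Xor => old ^ operand,
            RmwOp::Xchg => operand,          // a plain move of the operand
            RmwOp::Umin => old.min(operand), // cmp + cmovcc in the emitted code
            RmwOp::Umax => old.max(operand),
            RmwOp::Smin => (old as i64).min(operand as i64) as u64,
            RmwOp::Smax => (old as i64).max(operand as i64) as u64,
        }
    }
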
@@ -4611,6 +4611,8 @@ fn test_x64_emit() {
         3,
     )
     .into();
+    // Use `r9` with a 0 offset.
+    let am3: SyntheticAmode = Amode::imm_reg(0, r9).into();

     // A general 8-bit case.
     insns.push((

@@ -4743,8 +4745,8 @@ fn test_x64_emit() {
     insns.push((
         Inst::AtomicRmwSeq {
             ty: types::I8,
-            op: inst_common::AtomicRmwOp::Or,
-            address: r9,
+            op: inst_common::MachAtomicRmwOp::Or,
+            mem: am3.clone(),
             operand: r10,
             temp: w_r11,
             dst_old: w_rax

@@ -4755,8 +4757,8 @@ fn test_x64_emit() {
    insns.push((
        Inst::AtomicRmwSeq {
            ty: types::I16,
-            op: inst_common::AtomicRmwOp::And,
-            address: r9,
+            op: inst_common::MachAtomicRmwOp::And,
+            mem: am3.clone(),
            operand: r10,
            temp: w_r11,
            dst_old: w_rax

@@ -4767,8 +4769,8 @@ fn test_x64_emit() {
    insns.push((
        Inst::AtomicRmwSeq {
            ty: types::I32,
-            op: inst_common::AtomicRmwOp::Xchg,
-            address: r9,
+            op: inst_common::MachAtomicRmwOp::Xchg,
+            mem: am3.clone(),
            operand: r10,
            temp: w_r11,
            dst_old: w_rax

@@ -4779,8 +4781,8 @@ fn test_x64_emit() {
    insns.push((
        Inst::AtomicRmwSeq {
            ty: types::I32,
-            op: inst_common::AtomicRmwOp::Umin,
-            address: r9,
+            op: inst_common::MachAtomicRmwOp::Umin,
+            mem: am3.clone(),
            operand: r10,
            temp: w_r11,
            dst_old: w_rax

@@ -4791,8 +4793,8 @@ fn test_x64_emit() {
    insns.push((
        Inst::AtomicRmwSeq {
            ty: types::I64,
-            op: inst_common::AtomicRmwOp::Add,
-            address: r9,
+            op: inst_common::MachAtomicRmwOp::Add,
+            mem: am3.clone(),
            operand: r10,
            temp: w_r11,
            dst_old: w_rax

@@ -2052,13 +2052,19 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
             mem.get_operands(collector);
         }

-        Inst::AtomicRmwSeq { .. } => {
-            // FIXME: take vreg args, not fixed regs, and just use
-            // reg_fixed_use here.
-            collector.reg_use(regs::r9());
-            collector.reg_use(regs::r10());
-            collector.reg_def(Writable::from_reg(regs::r11()));
-            collector.reg_def(Writable::from_reg(regs::rax()));
+        Inst::AtomicRmwSeq {
+            operand,
+            temp,
+            dst_old,
+            mem,
+            ..
+        } => {
+            collector.reg_late_use(*operand);
+            collector.reg_early_def(*temp);
+            // This `fixed_def` is needed because `CMPXCHG` always uses this
+            // register implicitly.
+            collector.reg_fixed_def(*dst_old, regs::rax());
+            mem.get_operands_late(collector)
         }

         Inst::Ret { rets } => {
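
A note on the operand positions chosen above: `AtomicRmwSeq` expands into a multi-instruction loop, so its inputs are re-read after its temporary has been written. Marking the inputs as late uses and `temp` as an early def makes the live ranges overlap, which keeps the register allocator from assigning them the same physical register. A schematic of the hazard this prevents (the register choices here are hypothetical):

    // Hypothetical mis-allocation if `operand` were an ordinary (early) use
    // and `temp` an ordinary (late) def: their live ranges would not overlap,
    // so regalloc could put both in, say, %rcx. The expanded loop then breaks:
    //
    //   again:
    //     movq %rax, %rcx             // write temp -- clobbers operand!
    //     addq %rcx, %rcx             // read operand -- reads garbage
    //     lock cmpxchgq %rcx, (%rdi)
    //     jnz again                   // operand must still be intact here
    //
    // The late uses / early def force `operand`, the amode registers, and
    // `temp` into distinct registers for the whole expansion.
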
@@ -2851,3 +2851,19 @@
 (rule (lower (has_type (and (fits_in_64 ty) (ty_int _))
                        (atomic_cas flags address expected replacement)))
       (x64_cmpxchg ty expected replacement (to_amode flags address (zero_offset))))
+
+;; Rules for `atomic_rmw` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; This is a simple, general-case atomic update, based on a loop involving
+;; `cmpxchg`. Note that we could do much better than this in the case where the
+;; old value at the location (that is to say, the SSA `Value` computed by this
+;; CLIF instruction) is not required. In that case, we could instead implement
+;; this using a single `lock`-prefixed x64 read-modify-write instruction. Also,
+;; even in the case where the old value is required, for the `add` and `sub`
+;; cases, we can use the single instruction `lock xadd`. However, those
+;; improvements have been left for another day. TODO: filed as
+;; https://github.com/bytecodealliance/wasmtime/issues/2153.
+
+(rule (lower (has_type (and (fits_in_64 ty) (ty_int _))
+                       (atomic_rmw flags op address input)))
+      (x64_atomic_rmw_seq ty op (to_amode flags address (zero_offset)) input))
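
To make the `lock xadd` remark above concrete: for `add` (and `sub`, by negating the operand first), a single locked instruction already returns the old value, so no CAS loop is needed. In plain Rust (illustrative only):

    use std::sync::atomic::{AtomicU64, Ordering};

    // `fetch_add` returns the old value in one atomic step; on x86-64 it
    // compiles to a single `lock xadd`, the improvement the comment refers to.
    fn rmw_add_fast(dst: &AtomicU64, operand: u64) -> u64 {
        dst.fetch_add(operand, Ordering::SeqCst)
    }
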
@@ -44,14 +44,6 @@ fn is_bool_ty(ty: Type) -> bool {
     }
 }

-/// This is target-word-size dependent. And it excludes booleans and reftypes.
-fn is_valid_atomic_transaction_ty(ty: Type) -> bool {
-    match ty {
-        types::I8 | types::I16 | types::I32 | types::I64 => true,
-        _ => false,
-    }
-}
-
 /// Returns whether the given specified `input` is a result produced by an instruction with Opcode
 /// `op`.
 // TODO investigate failures with checking against the result index.

@@ -2136,54 +2128,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         }

         Opcode::AtomicRmw => {
-            // This is a simple, general-case atomic update, based on a loop involving
-            // `cmpxchg`. Note that we could do much better than this in the case where the old
-            // value at the location (that is to say, the SSA `Value` computed by this CLIF
-            // instruction) is not required. In that case, we could instead implement this
-            // using a single `lock`-prefixed x64 read-modify-write instruction. Also, even in
-            // the case where the old value is required, for the `add` and `sub` cases, we can
-            // use the single instruction `lock xadd`. However, those improvements have been
-            // left for another day.
-            // TODO: filed as https://github.com/bytecodealliance/wasmtime/issues/2153
-            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            let mut addr = put_input_in_reg(ctx, inputs[0]);
-            let mut arg2 = put_input_in_reg(ctx, inputs[1]);
-            let ty_access = ty.unwrap();
-            assert!(is_valid_atomic_transaction_ty(ty_access));
-
-            // Make sure that both args are in virtual regs, since in effect we have to do a
-            // parallel copy to get them safely to the AtomicRmwSeq input regs, and that's not
-            // guaranteed safe if either is in a real reg.
-            addr = ctx.ensure_in_vreg(addr, types::I64);
-            arg2 = ctx.ensure_in_vreg(arg2, types::I64);
-
-            // Move the args to the preordained AtomicRMW input regs. Note that `AtomicRmwSeq`
-            // operates at whatever width is specified by `ty`, so there's no need to
-            // zero-extend `arg2` in the case of `ty` being I8/I16/I32.
-            ctx.emit(Inst::gen_move(
-                Writable::from_reg(regs::r9()),
-                addr,
-                types::I64,
-            ));
-            ctx.emit(Inst::gen_move(
-                Writable::from_reg(regs::r10()),
-                arg2,
-                types::I64,
-            ));
-
-            // Now the AtomicRmwSeq (pseudo-) instruction itself
-            let op = inst_common::AtomicRmwOp::from(ctx.data(insn).atomic_rmw_op().unwrap());
-            ctx.emit(Inst::AtomicRmwSeq {
-                ty: ty_access,
-                op,
-                address: regs::r9(),
-                operand: regs::r10(),
-                temp: Writable::from_reg(regs::r11()),
-                dst_old: Writable::from_reg(regs::rax()),
-            });
-
-            // And finally, copy the preordained AtomicRmwSeq output reg to its destination.
-            ctx.emit(Inst::gen_move(dst, regs::rax(), types::I64));
+            implemented_in_isle(ctx);
         }

         Opcode::AtomicCas => {
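
The replacement arm follows the usual migration pattern: the hand-written lowering is deleted, and the arm only asserts that the ISLE rules already handled the instruction. A sketch of what such a helper typically looks like (the actual helper in the x64 backend may differ in details):

    // Sketch only. Reaching this arm means the ISLE rules failed to match an
    // opcode that is supposed to be fully ported -- a compiler bug, so abort.
    fn implemented_in_isle<C: LowerCtx>(_ctx: &mut C) -> ! {
        unreachable!("opcode should have been handled by the ISLE lowering rules")
    }
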
@@ -2,7 +2,10 @@

 // Pull in the ISLE generated code.
 pub(crate) mod generated_code;
-use crate::machinst::{InputSourceInst, Reg, Writable};
+use crate::{
+    ir::AtomicRmwOp,
+    machinst::{InputSourceInst, Reg, Writable},
+};
 use generated_code::MInst;

 // Types that the generated ISLE code uses via `use super::*`.

@@ -23,7 +26,7 @@ use crate::{
         },
     },
     machinst::{
-        isle::*, AtomicRmwOp, InsnInput, InsnOutput, LowerCtx, VCodeConstant, VCodeConstantData,
+        isle::*, InsnInput, InsnOutput, LowerCtx, MachAtomicRmwOp, VCodeConstant, VCodeConstantData,
     },
 };
 use std::boxed::Box;

@@ -565,6 +568,11 @@ where
     fn zero_offset(&mut self) -> Offset32 {
         Offset32::new(0)
     }
+
+    #[inline]
+    fn atomic_rmw_op_to_mach_atomic_rmw_op(&mut self, op: &AtomicRmwOp) -> MachAtomicRmwOp {
+        MachAtomicRmwOp::from(*op)
+    }
 }

 // Since x64 doesn't have 8x16 shifts and we must use a 16x8 shift instead, we

@@ -45,11 +45,10 @@ pub(crate) fn insn_outputs<I: VCodeInst, C: LowerCtx<I = I>>(
 //============================================================================
 // Atomic instructions.

-/// Atomic memory update operations. As of 21 Aug 2020 these are used for the aarch64 and x64
-/// targets.
+/// Atomic memory update operations.
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
 #[repr(u8)]
-pub enum AtomicRmwOp {
+pub enum MachAtomicRmwOp {
     /// Add
     Add,
     /// Sub

@@ -74,21 +73,22 @@ pub enum AtomicRmwOp {
     Smax,
 }

-impl AtomicRmwOp {
-    /// Converts an `ir::AtomicRmwOp` to the corresponding `inst_common::AtomicRmwOp`.
+impl MachAtomicRmwOp {
+    /// Converts an `ir::AtomicRmwOp` to the corresponding
+    /// `inst_common::AtomicRmwOp`.
     pub fn from(ir_op: ir::AtomicRmwOp) -> Self {
         match ir_op {
-            ir::AtomicRmwOp::Add => AtomicRmwOp::Add,
-            ir::AtomicRmwOp::Sub => AtomicRmwOp::Sub,
-            ir::AtomicRmwOp::And => AtomicRmwOp::And,
-            ir::AtomicRmwOp::Nand => AtomicRmwOp::Nand,
-            ir::AtomicRmwOp::Or => AtomicRmwOp::Or,
-            ir::AtomicRmwOp::Xor => AtomicRmwOp::Xor,
-            ir::AtomicRmwOp::Xchg => AtomicRmwOp::Xchg,
-            ir::AtomicRmwOp::Umin => AtomicRmwOp::Umin,
-            ir::AtomicRmwOp::Umax => AtomicRmwOp::Umax,
-            ir::AtomicRmwOp::Smin => AtomicRmwOp::Smin,
-            ir::AtomicRmwOp::Smax => AtomicRmwOp::Smax,
+            ir::AtomicRmwOp::Add => MachAtomicRmwOp::Add,
+            ir::AtomicRmwOp::Sub => MachAtomicRmwOp::Sub,
+            ir::AtomicRmwOp::And => MachAtomicRmwOp::And,
+            ir::AtomicRmwOp::Nand => MachAtomicRmwOp::Nand,
+            ir::AtomicRmwOp::Or => MachAtomicRmwOp::Or,
+            ir::AtomicRmwOp::Xor => MachAtomicRmwOp::Xor,
+            ir::AtomicRmwOp::Xchg => MachAtomicRmwOp::Xchg,
+            ir::AtomicRmwOp::Umin => MachAtomicRmwOp::Umin,
+            ir::AtomicRmwOp::Umax => MachAtomicRmwOp::Umax,
+            ir::AtomicRmwOp::Smin => MachAtomicRmwOp::Smin,
+            ir::AtomicRmwOp::Smax => MachAtomicRmwOp::Smax,
         }
     }
 }

@@ -328,6 +328,11 @@ impl<'a, F: Fn(VReg) -> VReg> OperandCollector<'a, F> {
         self.add_operand(Operand::reg_use(reg.into()));
     }

+    /// Add a register use, at the end of the instruction (`After` position).
+    pub fn reg_late_use(&mut self, reg: Reg) {
+        self.add_operand(Operand::reg_use_at_end(reg.into()));
+    }
+
     /// Add multiple register uses.
     pub fn reg_uses(&mut self, regs: &[Reg]) {
         for &reg in regs {