From 620e4b4e823f1942e5ab9dba7a6e1053d26b6835 Mon Sep 17 00:00:00 2001 From: Julian Seward Date: Thu, 20 Aug 2020 07:36:19 +0200 Subject: [PATCH] This patch fills in the missing pieces needed to support wasm atomics on newBE/x64. It does this by providing an implementation of the CLIF instructions `AtomicRmw`, `AtomicCas`, `AtomicLoad`, `AtomicStore` and `Fence`. The translation is straightforward. `AtomicCas` is translated into x64 `cmpxchg`, `AtomicLoad` becomes a normal load because x64-TSO provides adequate sequencing, `AtomicStore` becomes a normal store followed by `mfence`, and `Fence` becomes `mfence`. `AtomicRmw` is the only complex case: it becomes a normal load, followed by a loop which computes an updated value, tries to `cmpxchg` it back to memory, and repeats if necessary. This is a minimum-effort initial implementation. `AtomicRmw` could be implemented more efficiently using LOCK-prefixed integer read-modify-write instructions in the case where the old value in memory is not required. Subsequent work could add that, if required. The x64 emitter has been updated to emit the new instructions, obviously. The `LegacyPrefix` mechanism has been revised to handle multiple prefix bytes, not just one, since it is now sometimes necessary to emit both 0x66 (Operand Size Override) and 0xF0 (Lock). In the aarch64 implementation of atomics, there has been some minor renaming for the sake of clarity, and for consistency with this x64 implementation. 
--- .../codegen/src/isa/aarch64/inst/args.rs | 28 -- .../codegen/src/isa/aarch64/inst/emit.rs | 14 +- .../src/isa/aarch64/inst/emit_tests.rs | 4 +- cranelift/codegen/src/isa/aarch64/inst/mod.rs | 2 +- cranelift/codegen/src/isa/aarch64/lower.rs | 5 +- .../codegen/src/isa/aarch64/lower_inst.rs | 13 +- cranelift/codegen/src/isa/x64/inst/args.rs | 11 + cranelift/codegen/src/isa/x64/inst/emit.rs | 401 ++++++++++++------ .../codegen/src/isa/x64/inst/emit_tests.rs | 177 +++++++- cranelift/codegen/src/isa/x64/inst/mod.rs | 102 ++++- cranelift/codegen/src/isa/x64/lower.rs | 158 +++++++ cranelift/codegen/src/machinst/inst_common.rs | 36 ++ cranelift/codegen/src/machinst/mod.rs | 2 + 13 files changed, 761 insertions(+), 192 deletions(-) create mode 100644 cranelift/codegen/src/machinst/inst_common.rs diff --git a/cranelift/codegen/src/isa/aarch64/inst/args.rs b/cranelift/codegen/src/isa/aarch64/inst/args.rs index 729d21d121..fe8660bbaf 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/args.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/args.rs @@ -3,7 +3,6 @@ // Some variants are never constructed, but we still want them as options in the future. 
#![allow(dead_code)] -use crate::ir; use crate::ir::types::{F32X2, F32X4, F64X2, I16X4, I16X8, I32X2, I32X4, I64X2, I8X16, I8X8}; use crate::ir::Type; use crate::isa::aarch64::inst::*; @@ -681,30 +680,3 @@ impl VectorSize { } } } - -//============================================================================= -// Instruction sub-components: atomic memory update operations - -#[derive(Clone, Copy, Debug, PartialEq, Eq)] -#[repr(u8)] -pub enum AtomicRMWOp { - Add, - Sub, - And, - Or, - Xor, - Xchg, -} - -impl AtomicRMWOp { - pub fn from(ir_op: ir::AtomicRmwOp) -> Self { - match ir_op { - ir::AtomicRmwOp::Add => AtomicRMWOp::Add, - ir::AtomicRmwOp::Sub => AtomicRMWOp::Sub, - ir::AtomicRmwOp::And => AtomicRMWOp::And, - ir::AtomicRmwOp::Or => AtomicRMWOp::Or, - ir::AtomicRmwOp::Xor => AtomicRMWOp::Xor, - ir::AtomicRmwOp::Xchg => AtomicRMWOp::Xchg, - } - } -} diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs index 32fe3aa6cf..60a81eb005 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs @@ -1090,18 +1090,18 @@ impl MachInstEmit for Inst { } sink.put4(enc_ldxr(ty, x27wr, x25)); // ldxr x27, [x25] - if op == AtomicRMWOp::Xchg { + if op == inst_common::AtomicRmwOp::Xchg { // mov x28, x26 sink.put4(enc_arith_rrr(0b101_01010_00_0, 0b000000, x28wr, xzr, x26)) } else { // add/sub/and/orr/eor x28, x27, x26 let bits_31_21 = match op { - AtomicRMWOp::Add => 0b100_01011_00_0, - AtomicRMWOp::Sub => 0b110_01011_00_0, - AtomicRMWOp::And => 0b100_01010_00_0, - AtomicRMWOp::Or => 0b101_01010_00_0, - AtomicRMWOp::Xor => 0b110_01010_00_0, - AtomicRMWOp::Xchg => unreachable!(), + inst_common::AtomicRmwOp::Add => 0b100_01011_00_0, + inst_common::AtomicRmwOp::Sub => 0b110_01011_00_0, + inst_common::AtomicRmwOp::And => 0b100_01010_00_0, + inst_common::AtomicRmwOp::Or => 0b101_01010_00_0, + inst_common::AtomicRmwOp::Xor => 0b110_01010_00_0, + inst_common::AtomicRmwOp::Xchg 
=> unreachable!(), }; sink.put4(enc_arith_rrr(bits_31_21, 0b000000, x28wr, x27, x26)); } diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs index e2f08abb21..f8b446de31 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs @@ -4551,7 +4551,7 @@ fn test_aarch64_binemit() { insns.push(( Inst::AtomicRMW { ty: I16, - op: AtomicRMWOp::Xor, + op: inst_common::AtomicRmwOp::Xor, srcloc: None, }, "BF3B03D53B7F5F487C031ACA3C7F1848B8FFFFB5BF3B03D5", @@ -4561,7 +4561,7 @@ fn test_aarch64_binemit() { insns.push(( Inst::AtomicRMW { ty: I32, - op: AtomicRMWOp::Xchg, + op: inst_common::AtomicRmwOp::Xchg, srcloc: None, }, "BF3B03D53B7F5F88FC031AAA3C7F1888B8FFFFB5BF3B03D5", diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs index b90dccd41a..b527b7dc19 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs @@ -649,7 +649,7 @@ pub enum Inst { /// x28 (wr) scratch reg; value afterwards has no meaning AtomicRMW { ty: Type, // I8, I16, I32 or I64 - op: AtomicRMWOp, + op: inst_common::AtomicRmwOp, srcloc: Option, }, diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs index d399b90ed0..55b675a714 100644 --- a/cranelift/codegen/src/isa/aarch64/lower.rs +++ b/cranelift/codegen/src/isa/aarch64/lower.rs @@ -7,10 +7,11 @@ //! //! - Floating-point immediates (FIMM instruction). 
+use crate::ir; use crate::ir::condcodes::{FloatCC, IntCC}; use crate::ir::types::*; use crate::ir::Inst as IRInst; -use crate::ir::{AtomicRmwOp, InstructionData, Opcode, TrapCode, Type}; +use crate::ir::{InstructionData, Opcode, TrapCode, Type}; use crate::machinst::lower::*; use crate::machinst::*; use crate::CodegenResult; @@ -1067,7 +1068,7 @@ pub(crate) fn inst_trapcode(data: &InstructionData) -> Option { } } -pub(crate) fn inst_atomic_rmw_op(data: &InstructionData) -> Option { +pub(crate) fn inst_atomic_rmw_op(data: &InstructionData) -> Option { match data { &InstructionData::AtomicRmw { op, .. } => Some(op), _ => None, diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs index b2915d024e..b52f01364d 100644 --- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs +++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs @@ -21,7 +21,8 @@ use smallvec::SmallVec; use super::lower::*; -fn is_single_word_int_ty(ty: Type) -> bool { +/// This is target-word-size dependent. And it excludes booleans and reftypes. 
+fn is_valid_atomic_transaction_ty(ty: Type) -> bool { match ty { I8 | I16 | I32 | I64 => true, _ => false, @@ -1228,7 +1229,7 @@ pub(crate) fn lower_insn_to_regs>( let mut r_addr = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); let mut r_arg2 = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); let ty_access = ty.unwrap(); - assert!(is_single_word_int_ty(ty_access)); + assert!(is_valid_atomic_transaction_ty(ty_access)); let memflags = ctx.memflags(insn).expect("memory flags"); let srcloc = if !memflags.notrap() { Some(ctx.srcloc(insn)) @@ -1244,7 +1245,7 @@ pub(crate) fn lower_insn_to_regs>( ctx.emit(Inst::gen_move(Writable::from_reg(xreg(25)), r_addr, I64)); ctx.emit(Inst::gen_move(Writable::from_reg(xreg(26)), r_arg2, I64)); // Now the AtomicRMW insn itself - let op = AtomicRMWOp::from(inst_atomic_rmw_op(ctx.data(insn)).unwrap()); + let op = inst_common::AtomicRmwOp::from(inst_atomic_rmw_op(ctx.data(insn)).unwrap()); ctx.emit(Inst::AtomicRMW { ty: ty_access, op, @@ -1264,7 +1265,7 @@ pub(crate) fn lower_insn_to_regs>( let mut r_expected = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); let mut r_replacement = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None); let ty_access = ty.unwrap(); - assert!(is_single_word_int_ty(ty_access)); + assert!(is_valid_atomic_transaction_ty(ty_access)); let memflags = ctx.memflags(insn).expect("memory flags"); let srcloc = if !memflags.notrap() { Some(ctx.srcloc(insn)) @@ -1302,7 +1303,7 @@ pub(crate) fn lower_insn_to_regs>( let r_data = get_output_reg(ctx, outputs[0]); let r_addr = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); let ty_access = ty.unwrap(); - assert!(is_single_word_int_ty(ty_access)); + assert!(is_valid_atomic_transaction_ty(ty_access)); let memflags = ctx.memflags(insn).expect("memory flags"); let srcloc = if !memflags.notrap() { Some(ctx.srcloc(insn)) @@ -1321,7 +1322,7 @@ pub(crate) fn lower_insn_to_regs>( let r_data = put_input_in_reg(ctx, inputs[0], 
NarrowValueMode::None); let r_addr = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); let ty_access = ctx.input_ty(insn, 0); - assert!(is_single_word_int_ty(ty_access)); + assert!(is_valid_atomic_transaction_ty(ty_access)); let memflags = ctx.memflags(insn).expect("memory flags"); let srcloc = if !memflags.notrap() { Some(ctx.srcloc(insn)) diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs index 343f3322d0..8690c57a4c 100644 --- a/cranelift/codegen/src/isa/x64/inst/args.rs +++ b/cranelift/codegen/src/isa/x64/inst/args.rs @@ -1010,3 +1010,14 @@ impl OperandSize { } } } + +/// An x64 memory fence kind. +#[derive(Clone)] +pub enum FenceKind { + /// `mfence` instruction ("Memory Fence") + MFence, + /// `lfence` instruction ("Load Fence") + LFence, + /// `sfence` instruction ("Store Fence") + SFence, +} diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index 9bae562c5c..b54de499c9 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -3,7 +3,7 @@ use crate::ir::immediates::{Ieee32, Ieee64}; use crate::ir::TrapCode; use crate::isa::x64::inst::args::*; use crate::isa::x64::inst::*; -use crate::machinst::{MachBuffer, MachInstEmit, MachLabel}; +use crate::machinst::{inst_common, MachBuffer, MachInstEmit, MachLabel}; use core::convert::TryInto; use log::debug; use regalloc::{Reg, RegClass, Writable}; @@ -118,25 +118,38 @@ impl RexFlags { } } -/// For specifying the legacy prefixes (or `None` if no prefix required) to -/// be used at the start an instruction. A given prefix may be required for -/// various operations, including instructions that operate on GPR, SSE, and Vex -/// registers. -enum LegacyPrefix { +/// We may need to include one or more legacy prefix bytes before the REX prefix. This enum +/// covers only the small set of possibilities that we actually need. 
+enum LegacyPrefixes { + /// No prefix bytes None, + /// Operand Size Override -- here, denoting "16-bit operation" _66, + /// The Lock prefix + _F0, + /// Operand size override and Lock + _66F0, + /// REPNE, but no specific meaning here -- is just an opcode extension _F2, + /// REP/REPE, but no specific meaning here -- is just an opcode extension _F3, } -impl LegacyPrefix { +impl LegacyPrefixes { #[inline(always)] fn emit(&self, sink: &mut MachBuffer) { match self { - LegacyPrefix::_66 => sink.put1(0x66), - LegacyPrefix::_F2 => sink.put1(0xF2), - LegacyPrefix::_F3 => sink.put1(0xF3), - LegacyPrefix::None => (), + LegacyPrefixes::_66 => sink.put1(0x66), + LegacyPrefixes::_F0 => sink.put1(0xF0), + LegacyPrefixes::_66F0 => { + // I don't think the order matters, but in any case, this is the same order that + // the GNU assembler uses. + sink.put1(0x66); + sink.put1(0xF0); + } + LegacyPrefixes::_F2 => sink.put1(0xF2), + LegacyPrefixes::_F3 => sink.put1(0xF3), + LegacyPrefixes::None => (), } } } @@ -145,15 +158,16 @@ impl LegacyPrefix { /// /// For an instruction that has as operands a reg encoding `enc_g` and a memory address `mem_e`, /// create and emit: -/// - first the REX prefix, +/// - first the legacy prefixes, if any +/// - then the REX prefix, if needed /// - then caller-supplied opcode byte(s) (`opcodes` and `num_opcodes`), /// - then the MOD/RM byte, /// - then optionally, a SIB byte, /// - and finally optionally an immediate that will be derived from the `mem_e` operand. /// /// For most instructions up to and including SSE4.2, that will be the whole instruction: this is -/// what we call "standard" instructions, and abbreviate "std" in the name here. VEX instructions -/// will require their own emitter functions. +/// what we call "standard" instructions, and abbreviate "std" in the name here. VEX-prefixed +/// instructions will require their own emitter functions. 
/// /// This will also work for 32-bits x86 instructions, assuming no REX prefix is provided. /// @@ -168,7 +182,7 @@ impl LegacyPrefix { /// indicate a 64-bit operation. fn emit_std_enc_mem( sink: &mut MachBuffer, - prefix: LegacyPrefix, + prefixes: LegacyPrefixes, opcodes: u32, mut num_opcodes: usize, enc_g: u8, @@ -179,7 +193,7 @@ fn emit_std_enc_mem( // 64-bit integer registers, because they are part of an address // expression. But `enc_g` can be derived from a register of any class. - prefix.emit(sink); + prefixes.emit(sink); match mem_e { Amode::ImmReg { simm32, base } => { @@ -304,7 +318,7 @@ fn emit_std_enc_mem( /// operand is a register rather than memory. Hence it is much simpler. fn emit_std_enc_enc( sink: &mut MachBuffer, - prefix: LegacyPrefix, + prefixes: LegacyPrefixes, opcodes: u32, mut num_opcodes: usize, enc_g: u8, @@ -316,8 +330,8 @@ fn emit_std_enc_enc( // integer-to-FP conversion insn, one might be RegClass::I64 and the other // RegClass::V128. - // The operand-size override. - prefix.emit(sink); + // The legacy prefixes. + prefixes.emit(sink); // The rex byte. 
rex.emit_two_op(sink, enc_g, enc_e); @@ -338,7 +352,7 @@ fn emit_std_enc_enc( fn emit_std_reg_mem( sink: &mut MachBuffer, - prefix: LegacyPrefix, + prefixes: LegacyPrefixes, opcodes: u32, num_opcodes: usize, reg_g: Reg, @@ -346,12 +360,12 @@ fn emit_std_reg_mem( rex: RexFlags, ) { let enc_g = reg_enc(reg_g); - emit_std_enc_mem(sink, prefix, opcodes, num_opcodes, enc_g, mem_e, rex); + emit_std_enc_mem(sink, prefixes, opcodes, num_opcodes, enc_g, mem_e, rex); } fn emit_std_reg_reg( sink: &mut MachBuffer, - prefix: LegacyPrefix, + prefixes: LegacyPrefixes, opcodes: u32, num_opcodes: usize, reg_g: Reg, @@ -360,7 +374,7 @@ fn emit_std_reg_reg( ) { let enc_g = reg_enc(reg_g); let enc_e = reg_enc(reg_e); - emit_std_enc_enc(sink, prefix, opcodes, num_opcodes, enc_g, enc_e, rex); + emit_std_enc_enc(sink, prefixes, opcodes, num_opcodes, enc_g, enc_e, rex); } /// Write a suitable number of bits from an imm64 to the sink. @@ -481,7 +495,7 @@ pub(crate) fn emit( RegMemImm::Reg { reg: reg_e } => { emit_std_reg_reg( sink, - LegacyPrefix::None, + LegacyPrefixes::None, 0x0FAF, 2, reg_g.to_reg(), @@ -493,7 +507,7 @@ pub(crate) fn emit( RegMemImm::Mem { addr } => { emit_std_reg_mem( sink, - LegacyPrefix::None, + LegacyPrefixes::None, 0x0FAF, 2, reg_g.to_reg(), @@ -508,7 +522,7 @@ pub(crate) fn emit( // Yes, really, reg_g twice. emit_std_reg_reg( sink, - LegacyPrefix::None, + LegacyPrefixes::None, opcode, 1, reg_g.to_reg(), @@ -535,7 +549,7 @@ pub(crate) fn emit( // code easily. emit_std_reg_reg( sink, - LegacyPrefix::None, + LegacyPrefixes::None, opcode_r, 1, *reg_e, @@ -550,7 +564,7 @@ pub(crate) fn emit( // Here we revert to the "normal" G-E ordering. 
emit_std_reg_mem( sink, - LegacyPrefix::None, + LegacyPrefixes::None, opcode_m, 1, reg_g.to_reg(), @@ -566,7 +580,7 @@ pub(crate) fn emit( let enc_g = int_reg_enc(reg_g.to_reg()); emit_std_enc_enc( sink, - LegacyPrefix::None, + LegacyPrefixes::None, opcode, 1, subopcode_i, @@ -581,9 +595,9 @@ pub(crate) fn emit( Inst::UnaryRmR { size, op, src, dst } => { let (prefix, rex_flags) = match size { - 2 => (LegacyPrefix::_66, RexFlags::clear_w()), - 4 => (LegacyPrefix::None, RexFlags::clear_w()), - 8 => (LegacyPrefix::None, RexFlags::set_w()), + 2 => (LegacyPrefixes::_66, RexFlags::clear_w()), + 4 => (LegacyPrefixes::None, RexFlags::clear_w()), + 8 => (LegacyPrefixes::None, RexFlags::set_w()), _ => unreachable!(), }; @@ -621,9 +635,9 @@ pub(crate) fn emit( loc, } => { let (prefix, rex_flags) = match size { - 2 => (LegacyPrefix::_66, RexFlags::clear_w()), - 4 => (LegacyPrefix::None, RexFlags::clear_w()), - 8 => (LegacyPrefix::None, RexFlags::set_w()), + 2 => (LegacyPrefixes::_66, RexFlags::clear_w()), + 4 => (LegacyPrefixes::None, RexFlags::clear_w()), + 8 => (LegacyPrefixes::None, RexFlags::set_w()), _ => unreachable!(), }; @@ -649,9 +663,9 @@ pub(crate) fn emit( Inst::MulHi { size, signed, rhs } => { let (prefix, rex_flags) = match size { - 2 => (LegacyPrefix::_66, RexFlags::clear_w()), - 4 => (LegacyPrefix::None, RexFlags::clear_w()), - 8 => (LegacyPrefix::None, RexFlags::set_w()), + 2 => (LegacyPrefixes::_66, RexFlags::clear_w()), + 4 => (LegacyPrefixes::None, RexFlags::clear_w()), + 8 => (LegacyPrefixes::None, RexFlags::set_w()), _ => unreachable!(), }; @@ -826,7 +840,7 @@ pub(crate) fn emit( } else { RexFlags::clear_w() }; - emit_std_reg_reg(sink, LegacyPrefix::None, 0x89, 1, *src, dst.to_reg(), rex); + emit_std_reg_reg(sink, LegacyPrefixes::None, 0x89, 1, *src, dst.to_reg(), rex); } Inst::MovZX_RM_R { @@ -880,7 +894,7 @@ pub(crate) fn emit( } emit_std_reg_reg( sink, - LegacyPrefix::None, + LegacyPrefixes::None, opcodes, num_opcodes, dst.to_reg(), @@ -899,7 +913,7 @@ 
pub(crate) fn emit( emit_std_reg_mem( sink, - LegacyPrefix::None, + LegacyPrefixes::None, opcodes, num_opcodes, dst.to_reg(), @@ -920,7 +934,7 @@ pub(crate) fn emit( emit_std_reg_mem( sink, - LegacyPrefix::None, + LegacyPrefixes::None, 0x8B, 1, dst.to_reg(), @@ -931,7 +945,7 @@ pub(crate) fn emit( Inst::LoadEffectiveAddress { addr, dst } => emit_std_reg_mem( sink, - LegacyPrefix::None, + LegacyPrefixes::None, 0x8D, 1, dst.to_reg(), @@ -982,7 +996,7 @@ pub(crate) fn emit( } emit_std_reg_reg( sink, - LegacyPrefix::None, + LegacyPrefixes::None, opcodes, num_opcodes, dst.to_reg(), @@ -1001,7 +1015,7 @@ pub(crate) fn emit( emit_std_reg_mem( sink, - LegacyPrefix::None, + LegacyPrefixes::None, opcodes, num_opcodes, dst.to_reg(), @@ -1038,14 +1052,14 @@ pub(crate) fn emit( }; // MOV r8, r/m8 is (REX.W==0) 88 /r - emit_std_reg_mem(sink, LegacyPrefix::None, 0x88, 1, *src, dst, rex) + emit_std_reg_mem(sink, LegacyPrefixes::None, 0x88, 1, *src, dst, rex) } 2 => { // MOV r16, r/m16 is 66 (REX.W==0) 89 /r emit_std_reg_mem( sink, - LegacyPrefix::_66, + LegacyPrefixes::_66, 0x89, 1, *src, @@ -1058,7 +1072,7 @@ pub(crate) fn emit( // MOV r32, r/m32 is (REX.W==0) 89 /r emit_std_reg_mem( sink, - LegacyPrefix::None, + LegacyPrefixes::None, 0x89, 1, *src, @@ -1071,7 +1085,7 @@ pub(crate) fn emit( // MOV r64, r/m64 is (REX.W==1) 89 /r emit_std_reg_mem( sink, - LegacyPrefix::None, + LegacyPrefixes::None, 0x89, 1, *src, @@ -1109,7 +1123,7 @@ pub(crate) fn emit( None => { // SHL/SHR/SAR %cl, reg32 is (REX.W==0) D3 /subopcode // SHL/SHR/SAR %cl, reg64 is (REX.W==1) D3 /subopcode - emit_std_enc_enc(sink, LegacyPrefix::None, 0xD3, 1, subopcode, enc_dst, rex); + emit_std_enc_enc(sink, LegacyPrefixes::None, 0xD3, 1, subopcode, enc_dst, rex); } Some(num_bits) => { @@ -1117,7 +1131,7 @@ pub(crate) fn emit( // SHL/SHR/SAR $ib, reg64 is (REX.W==1) C1 /subopcode ib // When the shift amount is 1, there's an even shorter encoding, but we don't // bother with that nicety here. 
- emit_std_enc_enc(sink, LegacyPrefix::None, 0xC1, 1, subopcode, enc_dst, rex); + emit_std_enc_enc(sink, LegacyPrefixes::None, 0xC1, 1, subopcode, enc_dst, rex); sink.put1(*num_bits); } } @@ -1125,7 +1139,7 @@ pub(crate) fn emit( Inst::XmmRmiReg { opcode, src, dst } => { let rex = RexFlags::clear_w(); - let prefix = LegacyPrefix::_66; + let prefix = LegacyPrefixes::_66; if let RegMemImm::Imm { simm32 } = src { let (opcode_bytes, reg_digit) = match opcode { SseOpcode::Psllw => (0x0F71, 6), @@ -1175,9 +1189,9 @@ pub(crate) fn emit( src: src_e, dst: reg_g, } => { - let mut prefix = LegacyPrefix::None; + let mut prefix = LegacyPrefixes::None; if *size == 2 { - prefix = LegacyPrefix::_66; + prefix = LegacyPrefixes::_66; } let mut rex = match size { @@ -1245,7 +1259,7 @@ pub(crate) fn emit( rex_flags.always_emit(); emit_std_enc_enc( sink, - LegacyPrefix::None, + LegacyPrefixes::None, opcode, 2, 0, @@ -1261,9 +1275,9 @@ pub(crate) fn emit( dst: reg_g, } => { let (prefix, rex_flags) = match size { - 2 => (LegacyPrefix::_66, RexFlags::clear_w()), - 4 => (LegacyPrefix::None, RexFlags::clear_w()), - 8 => (LegacyPrefix::None, RexFlags::set_w()), + 2 => (LegacyPrefixes::_66, RexFlags::clear_w()), + 4 => (LegacyPrefixes::None, RexFlags::clear_w()), + 8 => (LegacyPrefixes::None, RexFlags::set_w()), _ => unreachable!("invalid size spec for cmove"), }; let opcode = 0x0F40 + cc.get_enc() as u32; @@ -1315,7 +1329,7 @@ pub(crate) fn emit( let addr = &addr.finalize(state); emit_std_enc_mem( sink, - LegacyPrefix::None, + LegacyPrefixes::None, 0xFF, 1, 6, /*subopcode*/ @@ -1371,7 +1385,7 @@ pub(crate) fn emit( let reg_enc = int_reg_enc(*reg); emit_std_enc_enc( sink, - LegacyPrefix::None, + LegacyPrefixes::None, 0xFF, 1, 2, /*subopcode*/ @@ -1384,7 +1398,7 @@ pub(crate) fn emit( let addr = &addr.finalize(state); emit_std_enc_mem( sink, - LegacyPrefix::None, + LegacyPrefixes::None, 0xFF, 1, 2, /*subopcode*/ @@ -1461,7 +1475,7 @@ pub(crate) fn emit( let reg_enc = int_reg_enc(*reg); 
emit_std_enc_enc( sink, - LegacyPrefix::None, + LegacyPrefixes::None, 0xFF, 1, 4, /*subopcode*/ @@ -1474,7 +1488,7 @@ pub(crate) fn emit( let addr = &addr.finalize(state); emit_std_enc_mem( sink, - LegacyPrefix::None, + LegacyPrefixes::None, 0xFF, 1, 4, /*subopcode*/ @@ -1596,20 +1610,20 @@ pub(crate) fn emit( let rex = RexFlags::clear_w(); let (prefix, opcode) = match op { - SseOpcode::Cvtss2sd => (LegacyPrefix::_F3, 0x0F5A), - SseOpcode::Cvtsd2ss => (LegacyPrefix::_F2, 0x0F5A), - SseOpcode::Movaps => (LegacyPrefix::None, 0x0F28), - SseOpcode::Movapd => (LegacyPrefix::_66, 0x0F28), - SseOpcode::Movdqa => (LegacyPrefix::_66, 0x0F6F), - SseOpcode::Movdqu => (LegacyPrefix::_F3, 0x0F6F), - SseOpcode::Movsd => (LegacyPrefix::_F2, 0x0F10), - SseOpcode::Movss => (LegacyPrefix::_F3, 0x0F10), - SseOpcode::Movups => (LegacyPrefix::None, 0x0F10), - SseOpcode::Movupd => (LegacyPrefix::_66, 0x0F10), - SseOpcode::Sqrtps => (LegacyPrefix::None, 0x0F51), - SseOpcode::Sqrtpd => (LegacyPrefix::_66, 0x0F51), - SseOpcode::Sqrtss => (LegacyPrefix::_F3, 0x0F51), - SseOpcode::Sqrtsd => (LegacyPrefix::_F2, 0x0F51), + SseOpcode::Cvtss2sd => (LegacyPrefixes::_F3, 0x0F5A), + SseOpcode::Cvtsd2ss => (LegacyPrefixes::_F2, 0x0F5A), + SseOpcode::Movaps => (LegacyPrefixes::None, 0x0F28), + SseOpcode::Movapd => (LegacyPrefixes::_66, 0x0F28), + SseOpcode::Movdqa => (LegacyPrefixes::_66, 0x0F6F), + SseOpcode::Movdqu => (LegacyPrefixes::_F3, 0x0F6F), + SseOpcode::Movsd => (LegacyPrefixes::_F2, 0x0F10), + SseOpcode::Movss => (LegacyPrefixes::_F3, 0x0F10), + SseOpcode::Movups => (LegacyPrefixes::None, 0x0F10), + SseOpcode::Movupd => (LegacyPrefixes::_66, 0x0F10), + SseOpcode::Sqrtps => (LegacyPrefixes::None, 0x0F51), + SseOpcode::Sqrtpd => (LegacyPrefixes::_66, 0x0F51), + SseOpcode::Sqrtss => (LegacyPrefixes::_F3, 0x0F51), + SseOpcode::Sqrtsd => (LegacyPrefixes::_F2, 0x0F51), _ => unimplemented!("Opcode {:?} not implemented", op), }; @@ -1635,49 +1649,49 @@ pub(crate) fn emit( } => { let rex = 
RexFlags::clear_w(); let (prefix, opcode, length) = match op { - SseOpcode::Addps => (LegacyPrefix::None, 0x0F58, 2), - SseOpcode::Addpd => (LegacyPrefix::_66, 0x0F58, 2), - SseOpcode::Addss => (LegacyPrefix::_F3, 0x0F58, 2), - SseOpcode::Addsd => (LegacyPrefix::_F2, 0x0F58, 2), - SseOpcode::Andpd => (LegacyPrefix::_66, 0x0F54, 2), - SseOpcode::Andps => (LegacyPrefix::None, 0x0F54, 2), - SseOpcode::Andnps => (LegacyPrefix::None, 0x0F55, 2), - SseOpcode::Andnpd => (LegacyPrefix::_66, 0x0F55, 2), - SseOpcode::Divps => (LegacyPrefix::None, 0x0F5E, 2), - SseOpcode::Divpd => (LegacyPrefix::_66, 0x0F5E, 2), - SseOpcode::Divss => (LegacyPrefix::_F3, 0x0F5E, 2), - SseOpcode::Divsd => (LegacyPrefix::_F2, 0x0F5E, 2), - SseOpcode::Minps => (LegacyPrefix::None, 0x0F5D, 2), - SseOpcode::Minpd => (LegacyPrefix::_66, 0x0F5D, 2), - SseOpcode::Minss => (LegacyPrefix::_F3, 0x0F5D, 2), - SseOpcode::Minsd => (LegacyPrefix::_F2, 0x0F5D, 2), - SseOpcode::Maxps => (LegacyPrefix::None, 0x0F5F, 2), - SseOpcode::Maxpd => (LegacyPrefix::_66, 0x0F5F, 2), - SseOpcode::Maxss => (LegacyPrefix::_F3, 0x0F5F, 2), - SseOpcode::Maxsd => (LegacyPrefix::_F2, 0x0F5F, 2), - SseOpcode::Mulps => (LegacyPrefix::None, 0x0F59, 2), - SseOpcode::Mulpd => (LegacyPrefix::_66, 0x0F59, 2), - SseOpcode::Mulss => (LegacyPrefix::_F3, 0x0F59, 2), - SseOpcode::Mulsd => (LegacyPrefix::_F2, 0x0F59, 2), - SseOpcode::Orpd => (LegacyPrefix::_66, 0x0F56, 2), - SseOpcode::Orps => (LegacyPrefix::None, 0x0F56, 2), - SseOpcode::Paddb => (LegacyPrefix::_66, 0x0FFC, 2), - SseOpcode::Paddd => (LegacyPrefix::_66, 0x0FFE, 2), - SseOpcode::Paddq => (LegacyPrefix::_66, 0x0FD4, 2), - SseOpcode::Paddw => (LegacyPrefix::_66, 0x0FFD, 2), - SseOpcode::Pmulld => (LegacyPrefix::_66, 0x0F3840, 3), - SseOpcode::Pmullw => (LegacyPrefix::_66, 0x0FD5, 2), - SseOpcode::Pmuludq => (LegacyPrefix::_66, 0x0FF4, 2), - SseOpcode::Psubb => (LegacyPrefix::_66, 0x0FF8, 2), - SseOpcode::Psubd => (LegacyPrefix::_66, 0x0FFA, 2), - SseOpcode::Psubq => 
(LegacyPrefix::_66, 0x0FFB, 2), - SseOpcode::Psubw => (LegacyPrefix::_66, 0x0FF9, 2), - SseOpcode::Subps => (LegacyPrefix::None, 0x0F5C, 2), - SseOpcode::Subpd => (LegacyPrefix::_66, 0x0F5C, 2), - SseOpcode::Subss => (LegacyPrefix::_F3, 0x0F5C, 2), - SseOpcode::Subsd => (LegacyPrefix::_F2, 0x0F5C, 2), - SseOpcode::Xorps => (LegacyPrefix::None, 0x0F57, 2), - SseOpcode::Xorpd => (LegacyPrefix::_66, 0x0F57, 2), + SseOpcode::Addps => (LegacyPrefixes::None, 0x0F58, 2), + SseOpcode::Addpd => (LegacyPrefixes::_66, 0x0F58, 2), + SseOpcode::Addss => (LegacyPrefixes::_F3, 0x0F58, 2), + SseOpcode::Addsd => (LegacyPrefixes::_F2, 0x0F58, 2), + SseOpcode::Andpd => (LegacyPrefixes::_66, 0x0F54, 2), + SseOpcode::Andps => (LegacyPrefixes::None, 0x0F54, 2), + SseOpcode::Andnps => (LegacyPrefixes::None, 0x0F55, 2), + SseOpcode::Andnpd => (LegacyPrefixes::_66, 0x0F55, 2), + SseOpcode::Divps => (LegacyPrefixes::None, 0x0F5E, 2), + SseOpcode::Divpd => (LegacyPrefixes::_66, 0x0F5E, 2), + SseOpcode::Divss => (LegacyPrefixes::_F3, 0x0F5E, 2), + SseOpcode::Divsd => (LegacyPrefixes::_F2, 0x0F5E, 2), + SseOpcode::Minps => (LegacyPrefixes::None, 0x0F5D, 2), + SseOpcode::Minpd => (LegacyPrefixes::_66, 0x0F5D, 2), + SseOpcode::Minss => (LegacyPrefixes::_F3, 0x0F5D, 2), + SseOpcode::Minsd => (LegacyPrefixes::_F2, 0x0F5D, 2), + SseOpcode::Maxps => (LegacyPrefixes::None, 0x0F5F, 2), + SseOpcode::Maxpd => (LegacyPrefixes::_66, 0x0F5F, 2), + SseOpcode::Maxss => (LegacyPrefixes::_F3, 0x0F5F, 2), + SseOpcode::Maxsd => (LegacyPrefixes::_F2, 0x0F5F, 2), + SseOpcode::Mulps => (LegacyPrefixes::None, 0x0F59, 2), + SseOpcode::Mulpd => (LegacyPrefixes::_66, 0x0F59, 2), + SseOpcode::Mulss => (LegacyPrefixes::_F3, 0x0F59, 2), + SseOpcode::Mulsd => (LegacyPrefixes::_F2, 0x0F59, 2), + SseOpcode::Orpd => (LegacyPrefixes::_66, 0x0F56, 2), + SseOpcode::Orps => (LegacyPrefixes::None, 0x0F56, 2), + SseOpcode::Paddb => (LegacyPrefixes::_66, 0x0FFC, 2), + SseOpcode::Paddd => (LegacyPrefixes::_66, 0x0FFE, 2), + 
SseOpcode::Paddq => (LegacyPrefixes::_66, 0x0FD4, 2), + SseOpcode::Paddw => (LegacyPrefixes::_66, 0x0FFD, 2), + SseOpcode::Pmulld => (LegacyPrefixes::_66, 0x0F3840, 3), + SseOpcode::Pmullw => (LegacyPrefixes::_66, 0x0FD5, 2), + SseOpcode::Pmuludq => (LegacyPrefixes::_66, 0x0FF4, 2), + SseOpcode::Psubb => (LegacyPrefixes::_66, 0x0FF8, 2), + SseOpcode::Psubd => (LegacyPrefixes::_66, 0x0FFA, 2), + SseOpcode::Psubq => (LegacyPrefixes::_66, 0x0FFB, 2), + SseOpcode::Psubw => (LegacyPrefixes::_66, 0x0FF9, 2), + SseOpcode::Subps => (LegacyPrefixes::None, 0x0F5C, 2), + SseOpcode::Subpd => (LegacyPrefixes::_66, 0x0F5C, 2), + SseOpcode::Subss => (LegacyPrefixes::_F3, 0x0F5C, 2), + SseOpcode::Subsd => (LegacyPrefixes::_F2, 0x0F5C, 2), + SseOpcode::Xorps => (LegacyPrefixes::None, 0x0F57, 2), + SseOpcode::Xorpd => (LegacyPrefixes::_66, 0x0F57, 2), _ => unimplemented!("Opcode {:?} not implemented", op), }; @@ -1780,10 +1794,10 @@ pub(crate) fn emit( Inst::XmmRmRImm { op, src, dst, imm } => { let prefix = match op { - SseOpcode::Cmpps => LegacyPrefix::None, - SseOpcode::Cmppd => LegacyPrefix::_66, - SseOpcode::Cmpss => LegacyPrefix::_F3, - SseOpcode::Cmpsd => LegacyPrefix::_F2, + SseOpcode::Cmpps => LegacyPrefixes::None, + SseOpcode::Cmppd => LegacyPrefixes::_66, + SseOpcode::Cmpss => LegacyPrefixes::_F3, + SseOpcode::Cmpsd => LegacyPrefixes::_F2, _ => unimplemented!("Opcode {:?} not implemented", op), }; let opcode = 0x0FC2; @@ -1833,14 +1847,14 @@ pub(crate) fn emit( srcloc, } => { let (prefix, opcode) = match op { - SseOpcode::Movaps => (LegacyPrefix::None, 0x0F29), - SseOpcode::Movapd => (LegacyPrefix::_66, 0x0F29), - SseOpcode::Movdqa => (LegacyPrefix::_66, 0x0F7F), - SseOpcode::Movdqu => (LegacyPrefix::_F3, 0x0F7F), - SseOpcode::Movss => (LegacyPrefix::_F3, 0x0F11), - SseOpcode::Movsd => (LegacyPrefix::_F2, 0x0F11), - SseOpcode::Movups => (LegacyPrefix::None, 0x0F11), - SseOpcode::Movupd => (LegacyPrefix::_66, 0x0F11), + SseOpcode::Movaps => (LegacyPrefixes::None, 0x0F29), + 
SseOpcode::Movapd => (LegacyPrefixes::_66, 0x0F29), + SseOpcode::Movdqa => (LegacyPrefixes::_66, 0x0F7F), + SseOpcode::Movdqu => (LegacyPrefixes::_F3, 0x0F7F), + SseOpcode::Movss => (LegacyPrefixes::_F3, 0x0F11), + SseOpcode::Movsd => (LegacyPrefixes::_F2, 0x0F11), + SseOpcode::Movups => (LegacyPrefixes::None, 0x0F11), + SseOpcode::Movupd => (LegacyPrefixes::_66, 0x0F11), _ => unimplemented!("Opcode {:?} not implemented", op), }; let dst = &dst.finalize(state); @@ -1860,9 +1874,9 @@ pub(crate) fn emit( let (prefix, opcode, dst_first) = match op { // Movd and movq use the same opcode; the presence of the REX prefix (set below) // actually determines which is used. - SseOpcode::Movd | SseOpcode::Movq => (LegacyPrefix::_66, 0x0F7E, false), - SseOpcode::Cvttss2si => (LegacyPrefix::_F3, 0x0F2C, true), - SseOpcode::Cvttsd2si => (LegacyPrefix::_F2, 0x0F2C, true), + SseOpcode::Movd | SseOpcode::Movq => (LegacyPrefixes::_66, 0x0F7E, false), + SseOpcode::Cvttss2si => (LegacyPrefixes::_F3, 0x0F2C, true), + SseOpcode::Cvttsd2si => (LegacyPrefixes::_F2, 0x0F2C, true), _ => panic!("unexpected opcode {:?}", op), }; let rex = match dst_size { @@ -1888,9 +1902,9 @@ pub(crate) fn emit( let (prefix, opcode) = match op { // Movd and movq use the same opcode; the presence of the REX prefix (set below) // actually determines which is used. 
- SseOpcode::Movd | SseOpcode::Movq => (LegacyPrefix::_66, 0x0F6E), - SseOpcode::Cvtsi2ss => (LegacyPrefix::_F3, 0x0F2A), - SseOpcode::Cvtsi2sd => (LegacyPrefix::_F2, 0x0F2A), + SseOpcode::Movd | SseOpcode::Movq => (LegacyPrefixes::_66, 0x0F6E), + SseOpcode::Cvtsi2ss => (LegacyPrefixes::_F3, 0x0F2A), + SseOpcode::Cvtsi2sd => (LegacyPrefixes::_F2, 0x0F2A), _ => panic!("unexpected opcode {:?}", op), }; let rex = match *src_size { @@ -1911,8 +1925,8 @@ pub(crate) fn emit( Inst::XMM_Cmp_RM_R { op, src, dst } => { let rex = RexFlags::clear_w(); let (prefix, opcode) = match op { - SseOpcode::Ucomisd => (LegacyPrefix::_66, 0x0F2E), - SseOpcode::Ucomiss => (LegacyPrefix::None, 0x0F2E), + SseOpcode::Ucomisd => (LegacyPrefixes::_66, 0x0F2E), + SseOpcode::Ucomiss => (LegacyPrefixes::None, 0x0F2E), _ => unimplemented!("Emit xmm cmp rm r"), }; @@ -2431,6 +2445,113 @@ pub(crate) fn emit( } } + Inst::LockCmpxchg { + ty, + src, + dst, + srcloc, + } => { + if let Some(srcloc) = srcloc { + sink.add_trap(*srcloc, TrapCode::HeapOutOfBounds); + } + // lock cmpxchg{b,w,l,q} %src, (dst) + // Note that 0xF0 is the Lock prefix. 
+ let (prefix, rex, opcodes) = match *ty { + types::I8 => { + let mut rex_flags = RexFlags::clear_w(); + let enc_src = int_reg_enc(*src); + if enc_src >= 4 && enc_src <= 7 { + rex_flags.always_emit(); + }; + (LegacyPrefixes::_F0, rex_flags, 0x0FB0) + } + types::I16 => (LegacyPrefixes::_66F0, RexFlags::clear_w(), 0x0FB1), + types::I32 => (LegacyPrefixes::_F0, RexFlags::clear_w(), 0x0FB1), + types::I64 => (LegacyPrefixes::_F0, RexFlags::set_w(), 0x0FB1), + _ => unreachable!(), + }; + emit_std_reg_mem(sink, prefix, opcodes, 2, *src, &dst.finalize(state), rex); + } + + Inst::AtomicRmwSeq { ty, op, srcloc } => { + // Emit this: + // + // mov{zbq,zwq,zlq,q} (%r9), %rax // rax = old value + // again: + // movq %rax, %r11 // rax = old value, r11 = old value + // `op`q %r10, %r11 // rax = old value, r11 = new value + // lock cmpxchg{b,w,l,q} %r11, (%r9) // try to store new value + // jnz again // If this is taken, rax will have a "revised" old value + // + // Operand conventions: + // IN: %r9 (addr), %r10 (2nd arg for `op`) + // OUT: %rax (old value), %r11 (trashed), %rflags (trashed) + // + // In the case where the operation is 'xchg', the "`op`q" instruction is instead + // movq %r10, %r11 + // so that we simply write in the destination, the "2nd arg for `op`". + let rax = regs::rax(); + let r9 = regs::r9(); + let r10 = regs::r10(); + let r11 = regs::r11(); + let rax_w = Writable::from_reg(rax); + let r11_w = Writable::from_reg(r11); + let amode = Amode::imm_reg(0, r9); + let again_label = sink.get_label(); + + // mov{zbq,zwq,zlq,q} (%r9), %rax + // No need to call `add_trap` here, since the `i1` emit will do that. 
+ let i1 = Inst::load(*ty, amode.clone(), rax_w, ExtKind::ZeroExtend, *srcloc); + i1.emit(sink, flags, state); + + // again: + sink.bind_label(again_label); + + // movq %rax, %r11 + let i2 = Inst::mov_r_r(true, rax, r11_w); + i2.emit(sink, flags, state); + + // opq %r10, %r11 + let r10_rmi = RegMemImm::reg(r10); + let i3 = if *op == inst_common::AtomicRmwOp::Xchg { + Inst::mov_r_r(true, r10, r11_w) + } else { + let alu_op = match op { + inst_common::AtomicRmwOp::Add => AluRmiROpcode::Add, + inst_common::AtomicRmwOp::Sub => AluRmiROpcode::Sub, + inst_common::AtomicRmwOp::And => AluRmiROpcode::And, + inst_common::AtomicRmwOp::Or => AluRmiROpcode::Or, + inst_common::AtomicRmwOp::Xor => AluRmiROpcode::Xor, + inst_common::AtomicRmwOp::Xchg => unreachable!(), + }; + Inst::alu_rmi_r(true, alu_op, r10_rmi, r11_w) + }; + i3.emit(sink, flags, state); + + // lock cmpxchg{b,w,l,q} %r11, (%r9) + // No need to call `add_trap` here, since the `i4` emit will do that. + let i4 = Inst::LockCmpxchg { + ty: *ty, + src: r11, + dst: amode.into(), + srcloc: *srcloc, + }; + i4.emit(sink, flags, state); + + // jnz again + one_way_jmp(sink, CC::NZ, again_label); + } + + Inst::Fence { kind } => { + sink.put1(0x0F); + sink.put1(0xAE); + match kind { + FenceKind::MFence => sink.put1(0xF0), // mfence = 0F AE F0 + FenceKind::LFence => sink.put1(0xE8), // lfence = 0F AE E8 + FenceKind::SFence => sink.put1(0xF8), // sfence = 0F AE F8 + } + } + Inst::Hlt => { sink.put1(0xcc); } diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs index e0f2ea1acd..cb1a6b855a 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs @@ -4,10 +4,13 @@ //! //! to see stdout: cargo test -- --nocapture //! -//! for this specific case: +//! for this specific case, as of 24 Aug 2020: //! -//! (cd cranelift/codegen && \ -//! 
RUST_BACKTRACE=1 cargo test isa::x64::inst::test_x64_insn_encoding_and_printing -- --nocapture) +//! cd to the top of your wasmtime tree, then: +//! RUST_BACKTRACE=1 cargo test --features test-programs/test_programs \ +//! --features experimental_x64 --all --exclude peepmatic --exclude lightbeam \ +//! --exclude wasmtime-lightbeam --exclude peepmatic-automata --exclude peepmatic-fuzzing \ +//! --exclude peepmatic-macro -- isa::x64::inst::emit_tests::test_x64_emit use super::*; use crate::isa::test_utils; @@ -3272,6 +3275,174 @@ fn test_x64_emit() { "cmpps $0, %xmm15, %xmm7", )); + // ======================================================== + // Pertaining to atomics. + let am1: SyntheticAmode = Amode::imm_reg_reg_shift(321, r10, rdx, 2).into(); + // `am2` doesn't contribute any 1 bits to the rex prefix, so we must use it when testing + // for retention of the apparently-redundant rex prefix in the 8-bit case. + let am2: SyntheticAmode = Amode::imm_reg_reg_shift(-12345i32 as u32, rcx, rsi, 3).into(); + + // A general 8-bit case. + insns.push(( + Inst::LockCmpxchg { + ty: types::I8, + src: rbx, + dst: am1, + srcloc: None, + }, + "F0410FB09C9241010000", + "lock cmpxchgb %bl, 321(%r10,%rdx,4)", + )); + // Check redundant rex retention in 8-bit cases. 
+ insns.push(( + Inst::LockCmpxchg { + ty: types::I8, + src: rdx, + dst: am2.clone(), + srcloc: None, + }, + "F00FB094F1C7CFFFFF", + "lock cmpxchgb %dl, -12345(%rcx,%rsi,8)", + )); + insns.push(( + Inst::LockCmpxchg { + ty: types::I8, + src: rsi, + dst: am2.clone(), + srcloc: None, + }, + "F0400FB0B4F1C7CFFFFF", + "lock cmpxchgb %sil, -12345(%rcx,%rsi,8)", + )); + insns.push(( + Inst::LockCmpxchg { + ty: types::I8, + src: r10, + dst: am2.clone(), + srcloc: None, + }, + "F0440FB094F1C7CFFFFF", + "lock cmpxchgb %r10b, -12345(%rcx,%rsi,8)", + )); + insns.push(( + Inst::LockCmpxchg { + ty: types::I8, + src: r15, + dst: am2.clone(), + srcloc: None, + }, + "F0440FB0BCF1C7CFFFFF", + "lock cmpxchgb %r15b, -12345(%rcx,%rsi,8)", + )); + // 16 bit cases + insns.push(( + Inst::LockCmpxchg { + ty: types::I16, + src: rsi, + dst: am2.clone(), + srcloc: None, + }, + "66F00FB1B4F1C7CFFFFF", + "lock cmpxchgw %si, -12345(%rcx,%rsi,8)", + )); + insns.push(( + Inst::LockCmpxchg { + ty: types::I16, + src: r10, + dst: am2.clone(), + srcloc: None, + }, + "66F0440FB194F1C7CFFFFF", + "lock cmpxchgw %r10w, -12345(%rcx,%rsi,8)", + )); + // 32 bit cases + insns.push(( + Inst::LockCmpxchg { + ty: types::I32, + src: rsi, + dst: am2.clone(), + srcloc: None, + }, + "F00FB1B4F1C7CFFFFF", + "lock cmpxchgl %esi, -12345(%rcx,%rsi,8)", + )); + insns.push(( + Inst::LockCmpxchg { + ty: types::I32, + src: r10, + dst: am2.clone(), + srcloc: None, + }, + "F0440FB194F1C7CFFFFF", + "lock cmpxchgl %r10d, -12345(%rcx,%rsi,8)", + )); + // 64 bit cases + insns.push(( + Inst::LockCmpxchg { + ty: types::I64, + src: rsi, + dst: am2.clone(), + srcloc: None, + }, + "F0480FB1B4F1C7CFFFFF", + "lock cmpxchgq %rsi, -12345(%rcx,%rsi,8)", + )); + insns.push(( + Inst::LockCmpxchg { + ty: types::I64, + src: r10, + dst: am2.clone(), + srcloc: None, + }, + "F04C0FB194F1C7CFFFFF", + "lock cmpxchgq %r10, -12345(%rcx,%rsi,8)", + )); + + // AtomicRmwSeq + insns.push(( + Inst::AtomicRmwSeq { ty: types::I8, op: 
inst_common::AtomicRmwOp::Or, srcloc: None }, + "490FB6014989C34D09D3F0450FB0190F85EFFFFFFF", + "atomically { 8_bits_at_[%r9]) Or= %r10; %rax = old_value_at_[%r9]; %r11, %rflags = trash }" + )); + insns.push(( + Inst::AtomicRmwSeq { ty: types::I16, op: inst_common::AtomicRmwOp::And, srcloc: None }, + "490FB7014989C34D21D366F0450FB1190F85EEFFFFFF", + "atomically { 16_bits_at_[%r9]) And= %r10; %rax = old_value_at_[%r9]; %r11, %rflags = trash }" + )); + insns.push(( + Inst::AtomicRmwSeq { ty: types::I32, op: inst_common::AtomicRmwOp::Xchg, srcloc: None }, + "418B014989C34D89D3F0450FB1190F85EFFFFFFF", + "atomically { 32_bits_at_[%r9]) Xchg= %r10; %rax = old_value_at_[%r9]; %r11, %rflags = trash }" + )); + insns.push(( + Inst::AtomicRmwSeq { ty: types::I64, op: inst_common::AtomicRmwOp::Add, srcloc: None }, + "498B014989C34D01D3F04D0FB1190F85EFFFFFFF", + "atomically { 64_bits_at_[%r9]) Add= %r10; %rax = old_value_at_[%r9]; %r11, %rflags = trash }" + )); + + // Fence + insns.push(( + Inst::Fence { + kind: FenceKind::MFence, + }, + "0FAEF0", + "mfence", + )); + insns.push(( + Inst::Fence { + kind: FenceKind::LFence, + }, + "0FAEE8", + "lfence", + )); + insns.push(( + Inst::Fence { + kind: FenceKind::SFence, + }, + "0FAEF8", + "sfence", + )); + // ======================================================== // Misc instructions. diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs index 712a9b508e..da2dca2060 100644 --- a/cranelift/codegen/src/isa/x64/inst/mod.rs +++ b/cranelift/codegen/src/isa/x64/inst/mod.rs @@ -404,6 +404,56 @@ pub enum Inst { offset: i64, }, + // ===================================== + // Instructions pertaining to atomic memory accesses. + /// A standard (native) `lock cmpxchg src, (amode)`, with register conventions: + /// + /// `dst` (read) address + /// `src` (read) replacement value + /// %rax (modified) in: expected value, out: value that was actually at `dst` + /// %rflags is written. 
Do not assume anything about it after the instruction. + /// + /// The instruction "succeeded" iff the lowest `ty` bits of %rax afterwards are the same as + /// they were before. + LockCmpxchg { + ty: Type, // I8, I16, I32 or I64 + src: Reg, + dst: SyntheticAmode, + srcloc: Option, + }, + + /// A synthetic instruction, based on a loop around a native `lock cmpxchg` instruction. + /// This atomically modifies a value in memory and returns the old value. The sequence + /// consists of an initial "normal" load from `dst`, followed by a loop which computes the + /// new value and tries to compare-and-swap ("CAS") it into `dst`, using the native + /// instruction `lock cmpxchg{b,w,l,q}` . The loop iterates until the CAS is successful. + /// If there is no contention, there will be only one pass through the loop body. The + /// sequence does *not* perform any explicit memory fence instructions + /// (mfence/sfence/lfence). + /// + /// Note that the transaction is atomic in the sense that, as observed by some other thread, + /// `dst` either has the initial or final value, but no other. It isn't atomic in the sense + /// of guaranteeing that no other thread writes to `dst` in between the initial load and the + /// CAS -- but that would cause the CAS to fail unless the other thread's last write before + /// the CAS wrote the same value that was already there. In other words, this + /// implementation suffers (unavoidably) from the A-B-A problem. + /// + /// This instruction sequence has fixed register uses as follows: + /// + /// %r9 (read) address + /// %r10 (read) second operand for `op` + /// %r11 (written) scratch reg; value afterwards has no meaning + /// %rax (written) the old value at %r9 + /// %rflags is written. Do not assume anything about it after the instruction. + AtomicRmwSeq { + ty: Type, // I8, I16, I32 or I64 + op: inst_common::AtomicRmwOp, + srcloc: Option, + }, + + /// A memory fence (mfence, lfence or sfence). 
+ Fence { kind: FenceKind }, + // ===================================== // Meta-instructions generating no code. /// Marker, no-op in generated code: SP "virtual offset" is adjusted. This @@ -1521,6 +1571,26 @@ impl ShowWithRRU for Inst { show_ireg_sized(dst.to_reg(), mb_rru, 8), ), + Inst::LockCmpxchg { ty, src, dst, .. } => { + let size = ty.bytes() as u8; + format!("lock cmpxchg{} {}, {}", + suffixBWLQ(size), show_ireg_sized(*src, mb_rru, size), dst.show_rru(mb_rru)) + } + + Inst::AtomicRmwSeq { ty, op, .. } => { + format!( + "atomically {{ {}_bits_at_[%r9]) {:?}= %r10; %rax = old_value_at_[%r9]; %r11, %rflags = trash }}", + ty.bits(), op) + }, + + Inst::Fence { kind } => { + match kind { + FenceKind::MFence => "mfence".to_string(), + FenceKind::LFence => "lfence".to_string(), + FenceKind::SFence => "sfence".to_string(), + } + } + Inst::VirtualSPOffsetAdj { offset } => format!("virtual_sp_offset_adjust {}", offset), Inst::Hlt => "hlt".into(), @@ -1737,6 +1807,19 @@ fn x64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) { collector.add_def(*dst); } + Inst::LockCmpxchg { src, dst, .. } => { + dst.get_regs_as_uses(collector); + collector.add_use(*src); + collector.add_mod(Writable::from_reg(regs::rax())); + } + + Inst::AtomicRmwSeq { .. } => { + collector.add_use(regs::r9()); + collector.add_use(regs::r10()); + collector.add_def(Writable::from_reg(regs::r11())); + collector.add_def(Writable::from_reg(regs::rax())); + } + Inst::Ret | Inst::EpiloguePlaceholder | Inst::JmpKnown { .. } @@ -1745,7 +1828,8 @@ fn x64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) { | Inst::TrapIf { .. } | Inst::VirtualSPOffsetAdj { .. } | Inst::Hlt - | Inst::Ud2 { .. } => { + | Inst::Ud2 { .. } + | Inst::Fence { .. } => { // No registers are used. } } @@ -2091,6 +2175,15 @@ fn x64_map_regs(inst: &mut Inst, mapper: &RUM) { Inst::LoadExtName { ref mut dst, .. } => map_def(mapper, dst), + Inst::LockCmpxchg { + ref mut src, + ref mut dst, + .. 
+ } => { + map_use(mapper, src); + dst.map_uses(mapper); + } + Inst::Ret | Inst::EpiloguePlaceholder | Inst::JmpKnown { .. } @@ -2099,8 +2192,11 @@ fn x64_map_regs(inst: &mut Inst, mapper: &RUM) { | Inst::TrapIf { .. } | Inst::VirtualSPOffsetAdj { .. } | Inst::Ud2 { .. } - | Inst::Hlt => { - // No registers are used. + | Inst::Hlt + | Inst::AtomicRmwSeq { .. } + | Inst::Fence { .. } => { + // Instruction doesn't explicitly mention any regs, so it can't have any virtual + // regs that we'd need to remap. Hence no action required. } } } diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index f4eb306882..1b494db706 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -2,6 +2,7 @@ #![allow(non_snake_case)] +use crate::ir; use crate::ir::{ condcodes::FloatCC, condcodes::IntCC, types, AbiParam, ArgumentPurpose, ExternalName, Inst as IRInst, InstructionData, LibCall, Opcode, Signature, TrapCode, Type, @@ -45,6 +46,14 @@ fn is_bool_ty(ty: Type) -> bool { } } +/// This is target-word-size dependent. And it excludes booleans and reftypes. +fn is_valid_atomic_transaction_ty(ty: Type) -> bool { + match ty { + types::I8 | types::I16 | types::I32 | types::I64 => true, + _ => false, + } +} + fn iri_to_u64_imm(ctx: Ctx, inst: IRInst) -> Option { ctx.get_constant(inst) } @@ -82,6 +91,13 @@ fn inst_fp_condcode(data: &InstructionData) -> Option { } } +fn inst_atomic_rmw_op(data: &InstructionData) -> Option { + match data { + &InstructionData::AtomicRmw { op, .. } => Some(op), + _ => None, + } +} + fn ldst_offset(data: &InstructionData) -> Option { match data { &InstructionData::Load { offset, .. } @@ -1732,6 +1748,148 @@ fn lower_insn_to_regs>( }); } + Opcode::AtomicRmw => { + // This is a simple, general-case atomic update, based on a loop involving + // `cmpxchg`. 
Note that we could do much better than this in the case where the old + // value at the location (that is to say, the SSA `Value` computed by this CLIF + // instruction) is not required. In that case, we could instead implement this + // using a single `lock`-prefixed x64 read-modify-write instruction. Also, even in + // the case where the old value is required, for the `add` and `sub` cases, we can + // use the single instruction `lock xadd`. However, those improvements have been + // left for another day. + // TODO: filed as https://github.com/bytecodealliance/wasmtime/issues/2153 + let dst = output_to_reg(ctx, outputs[0]); + let mut addr = input_to_reg(ctx, inputs[0]); + let mut arg2 = input_to_reg(ctx, inputs[1]); + let ty_access = ty.unwrap(); + assert!(is_valid_atomic_transaction_ty(ty_access)); + let memflags = ctx.memflags(insn).expect("memory flags"); + let srcloc = if !memflags.notrap() { + Some(ctx.srcloc(insn)) + } else { + None + }; + // Make sure that both args are in virtual regs, since in effect we have to do a + // parallel copy to get them safely to the AtomicRmwSeq input regs, and that's not + // guaranteed safe if either is in a real reg. + addr = ctx.ensure_in_vreg(addr, types::I64); + arg2 = ctx.ensure_in_vreg(arg2, types::I64); + // Move the args to the preordained AtomicRMW input regs. Note that `AtomicRmwSeq` + // operates at whatever width is specified by `ty`, so there's no need to + // zero-extend `arg2` in the case of `ty` being I8/I16/I32. + ctx.emit(Inst::gen_move( + Writable::from_reg(regs::r9()), + addr, + types::I64, + )); + ctx.emit(Inst::gen_move( + Writable::from_reg(regs::r10()), + arg2, + types::I64, + )); + // Now the AtomicRmwSeq (pseudo-) instruction itself + let op = inst_common::AtomicRmwOp::from(inst_atomic_rmw_op(ctx.data(insn)).unwrap()); + ctx.emit(Inst::AtomicRmwSeq { + ty: ty_access, + op, + srcloc, + }); + // And finally, copy the preordained AtomicRmwSeq output reg to its destination. 
+ ctx.emit(Inst::gen_move(dst, regs::rax(), types::I64)); + } + + Opcode::AtomicCas => { + // This is very similar to, but not identical to, the `AtomicRmw` case. As with + // `AtomicRmw`, there's no need to zero-extend narrow values here. + let dst = output_to_reg(ctx, outputs[0]); + let addr = input_to_reg(ctx, inputs[0]); + let expected = input_to_reg(ctx, inputs[1]); + let replacement = input_to_reg(ctx, inputs[2]); + let ty_access = ty.unwrap(); + assert!(is_valid_atomic_transaction_ty(ty_access)); + let memflags = ctx.memflags(insn).expect("memory flags"); + let srcloc = if !memflags.notrap() { + Some(ctx.srcloc(insn)) + } else { + None + }; + // Move the expected value into %rax. Because there's only one fixed register on + // the input side, we don't have to use `ensure_in_vreg`, as is necessary in the + // `AtomicRmw` case. + ctx.emit(Inst::gen_move( + Writable::from_reg(regs::rax()), + expected, + types::I64, + )); + ctx.emit(Inst::LockCmpxchg { + ty: ty_access, + src: replacement, + dst: Amode::imm_reg(0, addr).into(), + srcloc, + }); + // And finally, copy the old value at the location to its destination reg. + ctx.emit(Inst::gen_move(dst, regs::rax(), types::I64)); + } + + Opcode::AtomicLoad => { + // This is a normal load. The x86-TSO memory model provides sufficient sequencing + // to satisfy the CLIF synchronisation requirements for `AtomicLoad` without the + // need for any fence instructions. + let data = output_to_reg(ctx, outputs[0]); + let addr = input_to_reg(ctx, inputs[0]); + let ty_access = ty.unwrap(); + assert!(is_valid_atomic_transaction_ty(ty_access)); + let memflags = ctx.memflags(insn).expect("memory flags"); + let srcloc = if !memflags.notrap() { + Some(ctx.srcloc(insn)) + } else { + None + }; + // For the amode, we could do better, but for now just use `0(addr)`. 
+ let rm = RegMem::mem(Amode::imm_reg(0, addr)); + if ty_access == types::I64 { + ctx.emit(Inst::mov64_rm_r(rm, data, srcloc)); + } else { + let ext_mode = match ty_access { + types::I8 => ExtMode::BQ, + types::I16 => ExtMode::WQ, + types::I32 => ExtMode::LQ, + _ => panic!("lowering AtomicLoad: invalid type"), + }; + ctx.emit(Inst::movzx_rm_r(ext_mode, rm, data, srcloc)); + } + } + + Opcode::AtomicStore => { + // This is a normal store, followed by an `mfence` instruction. + let data = input_to_reg(ctx, inputs[0]); + let addr = input_to_reg(ctx, inputs[1]); + let ty_access = ctx.input_ty(insn, 0); + assert!(is_valid_atomic_transaction_ty(ty_access)); + let memflags = ctx.memflags(insn).expect("memory flags"); + let srcloc = if !memflags.notrap() { + Some(ctx.srcloc(insn)) + } else { + None + }; + // For the amode, we could do better, but for now just use `0(addr)`. + ctx.emit(Inst::mov_r_m( + ty_access.bytes() as u8, + data, + Amode::imm_reg(0, addr), + srcloc, + )); + ctx.emit(Inst::Fence { + kind: FenceKind::MFence, + }); + } + + Opcode::Fence => { + ctx.emit(Inst::Fence { + kind: FenceKind::MFence, + }); + } + Opcode::FuncAddr => { let dst = output_to_reg(ctx, outputs[0]); let (extname, _) = ctx.call_target(insn).unwrap(); diff --git a/cranelift/codegen/src/machinst/inst_common.rs b/cranelift/codegen/src/machinst/inst_common.rs new file mode 100644 index 0000000000..9566c56e53 --- /dev/null +++ b/cranelift/codegen/src/machinst/inst_common.rs @@ -0,0 +1,36 @@ +//! A place to park MachInst::Inst fragments which are common across multiple architectures. + +use crate::ir; + +/// Atomic memory update operations. As of 21 Aug 2020 these are used for the aarch64 and x64 +/// targets. 
+#[derive(Clone, Copy, Debug, PartialEq, Eq)] +#[repr(u8)] +pub enum AtomicRmwOp { + /// Add + Add, + /// Sub + Sub, + /// And + And, + /// Or + Or, + /// Exclusive Or + Xor, + /// Exchange (swap operands) + Xchg, +} + +impl AtomicRmwOp { + /// Converts an `ir::AtomicRmwOp` to the corresponding `inst_common::AtomicRmwOp`. + pub fn from(ir_op: ir::AtomicRmwOp) -> Self { + match ir_op { + ir::AtomicRmwOp::Add => AtomicRmwOp::Add, + ir::AtomicRmwOp::Sub => AtomicRmwOp::Sub, + ir::AtomicRmwOp::And => AtomicRmwOp::And, + ir::AtomicRmwOp::Or => AtomicRmwOp::Or, + ir::AtomicRmwOp::Xor => AtomicRmwOp::Xor, + ir::AtomicRmwOp::Xchg => AtomicRmwOp::Xchg, + } + } +} diff --git a/cranelift/codegen/src/machinst/mod.rs b/cranelift/codegen/src/machinst/mod.rs index b8ec275133..915764436e 100644 --- a/cranelift/codegen/src/machinst/mod.rs +++ b/cranelift/codegen/src/machinst/mod.rs @@ -133,6 +133,8 @@ pub mod adapter; pub use adapter::*; pub mod helpers; pub use helpers::*; +pub mod inst_common; +pub use inst_common::*; /// A machine instruction. pub trait MachInst: Clone + Debug {