diff --git a/cranelift/codegen/src/isa/aarch64/abi.rs b/cranelift/codegen/src/isa/aarch64/abi.rs index 8052874603..3f5aeb784f 100644 --- a/cranelift/codegen/src/isa/aarch64/abi.rs +++ b/cranelift/codegen/src/isa/aarch64/abi.rs @@ -560,10 +560,7 @@ impl ABIMachineSpec for AArch64MachineDeps { insts.push(Inst::StoreP64 { rt: fp_reg(), rt2: link_reg(), - mem: PairAMode::PreIndexed( - writable_stack_reg(), - SImm7Scaled::maybe_from_i64(-16, types::I64).unwrap(), - ), + mem: PairAMode::SPPreIndexed(SImm7Scaled::maybe_from_i64(-16, types::I64).unwrap()), flags: MemFlags::trusted(), }); @@ -601,10 +598,7 @@ impl ABIMachineSpec for AArch64MachineDeps { insts.push(Inst::LoadP64 { rt: writable_fp_reg(), rt2: writable_link_reg(), - mem: PairAMode::PostIndexed( - writable_stack_reg(), - SImm7Scaled::maybe_from_i64(16, types::I64).unwrap(), - ), + mem: PairAMode::SPPostIndexed(SImm7Scaled::maybe_from_i64(16, types::I64).unwrap()), flags: MemFlags::trusted(), }); insts @@ -676,10 +670,7 @@ impl ABIMachineSpec for AArch64MachineDeps { // str rd, [sp, #-16]! insts.push(Inst::Store64 { rd, - mem: AMode::PreIndexed( - writable_stack_reg(), - SImm9::maybe_from_i64(-clobber_offset_change).unwrap(), - ), + mem: AMode::SPPreIndexed(SImm9::maybe_from_i64(-clobber_offset_change).unwrap()), flags: MemFlags::trusted(), }); @@ -708,8 +699,7 @@ impl ABIMachineSpec for AArch64MachineDeps { insts.push(Inst::StoreP64 { rt, rt2, - mem: PairAMode::PreIndexed( - writable_stack_reg(), + mem: PairAMode::SPPreIndexed( SImm7Scaled::maybe_from_i64(-clobber_offset_change, types::I64).unwrap(), ), flags: MemFlags::trusted(), @@ -734,10 +724,7 @@ impl ABIMachineSpec for AArch64MachineDeps { let store_vec_reg = |rd| Inst::FpuStore64 { rd, - mem: AMode::PreIndexed( - writable_stack_reg(), - SImm9::maybe_from_i64(-clobber_offset_change).unwrap(), - ), + mem: AMode::SPPreIndexed(SImm9::maybe_from_i64(-clobber_offset_change).unwrap()), flags: MemFlags::trusted(), }; let iter = clobbered_vec.chunks_exact(2); @@ -766,8 +753,7 @@ impl ABIMachineSpec for AArch64MachineDeps { Inst::FpuStoreP64 { rt, rt2, - mem: PairAMode::PreIndexed( - writable_stack_reg(), + mem: PairAMode::SPPreIndexed( SImm7Scaled::maybe_from_i64(-clobber_offset_change, F64).unwrap(), ), flags: MemFlags::trusted(), @@ -831,16 +817,13 @@ impl ABIMachineSpec for AArch64MachineDeps { let load_vec_reg = |rd| Inst::FpuLoad64 { rd, - mem: AMode::PostIndexed(writable_stack_reg(), SImm9::maybe_from_i64(16).unwrap()), + mem: AMode::SPPostIndexed(SImm9::maybe_from_i64(16).unwrap()), flags: MemFlags::trusted(), }; let load_vec_reg_pair = |rt, rt2| Inst::FpuLoadP64 { rt, rt2, - mem: PairAMode::PostIndexed( - writable_stack_reg(), - SImm7Scaled::maybe_from_i64(16, F64).unwrap(), - ), + mem: PairAMode::SPPostIndexed(SImm7Scaled::maybe_from_i64(16, F64).unwrap()), flags: MemFlags::trusted(), }; @@ -876,10 +859,7 @@ impl ABIMachineSpec for AArch64MachineDeps { insts.push(Inst::LoadP64 { rt, rt2, - mem: PairAMode::PostIndexed( - writable_stack_reg(), - SImm7Scaled::maybe_from_i64(16, I64).unwrap(), - ), + mem: PairAMode::SPPostIndexed(SImm7Scaled::maybe_from_i64(16, I64).unwrap()), flags: MemFlags::trusted(), }); } @@ -893,7 +873,7 @@ impl ABIMachineSpec for AArch64MachineDeps { // ldr rd, [sp], #16 insts.push(Inst::ULoad64 { rd, - mem: AMode::PostIndexed(writable_stack_reg(), SImm9::maybe_from_i64(16).unwrap()), + mem: AMode::SPPostIndexed(SImm9::maybe_from_i64(16).unwrap()), flags: MemFlags::trusted(), }); } diff --git a/cranelift/codegen/src/isa/aarch64/inst.isle 
b/cranelift/codegen/src/isa/aarch64/inst.isle index a49f8872d1..fb2a81ba13 100644 --- a/cranelift/codegen/src/isa/aarch64/inst.isle +++ b/cranelift/codegen/src/isa/aarch64/inst.isle @@ -171,13 +171,23 @@ (rd WritableReg) (rm PReg)) - ;; A MOV[Z,N,K] with a 16-bit immediate. + ;; A MOV[Z,N] with a 16-bit immediate. (MovWide (op MoveWideOp) (rd WritableReg) (imm MoveWideConst) (size OperandSize)) + ;; A MOVK with a 16-bit immediate. Modifies its register; we + ;; model this with a separate input `rn` and output `rd` virtual + ;; register, with a regalloc constraint to tie them together. + (MovK + (rd WritableReg) + (rn Reg) + (imm MoveWideConst) + (size OperandSize)) + + ;; A sign- or zero-extend operation. (Extend (rd WritableReg) @@ -240,7 +250,12 @@ ;; x28 (wr) scratch reg; value afterwards has no meaning (AtomicRMWLoop (ty Type) ;; I8, I16, I32 or I64 - (op AtomicRMWLoopOp)) + (op AtomicRMWLoopOp) + (addr Reg) + (operand Reg) + (oldval WritableReg) + (scratch1 WritableReg) + (scratch2 WritableReg)) ;; Similar to AtomicRMWLoop, a compare-and-swap operation implemented using a load-linked ;; store-conditional loop, with acquire-release semantics. @@ -253,7 +268,11 @@ ;; x24 (wr) scratch reg; value afterwards has no meaning (AtomicCASLoop (ty Type) ;; I8, I16, I32 or I64 - ) + (addr Reg) + (expected Reg) + (replacement Reg) + (oldval WritableReg) + (scratch WritableReg)) ;; An atomic read-modify-write operation. These instructions require the ;; Large System Extension (LSE) ISA support (FEAT_LSE). The instructions have @@ -269,7 +288,10 @@ ;; Large System Extension (LSE) ISA support (FEAT_LSE). The instructions have ;; acquire-release semantics. (AtomicCAS - (rs WritableReg) + ;; `rd` is really `rs` in the encoded instruction (so `rd` == `rs`); we separate + ;; them here to have separate use and def vregs for regalloc. + (rd WritableReg) + (rs Reg) (rt Reg) (rn Reg) (ty Type)) @@ -342,6 +364,16 @@ (rd WritableReg) (rn Reg)) + ;; Variant of FpuRRI that modifies its `rd`, and so we name the + ;; input state `ri` (for "input") and constrain the two + ;; together. + (FpuRRIMod + (fpu_op FPUOpRIMod) + (rd WritableReg) + (ri Reg) + (rn Reg)) + + ;; 3-op FPU instruction. ;; 16-bit scalars require half-precision floating-point support (FEAT_FP16). (FpuRRRR @@ -479,6 +511,7 @@ ;; Move to a vector element from a GPR. (MovToVec (rd WritableReg) + (ri Reg) (rn Reg) (idx u8) (size VectorSize)) @@ -534,6 +567,7 @@ ;; Move vector element to another vector element. (VecMovElement (rd WritableReg) + (ri Reg) (rn Reg) (dest_idx u8) (src_idx u8) @@ -546,12 +580,19 @@ (rn Reg) (high_half bool)) - ;; Vector narrowing operation. - (VecRRNarrow + ;; Vector narrowing operation -- low half. + (VecRRNarrowLow (op VecRRNarrowOp) (rd WritableReg) (rn Reg) - (high_half bool) + (lane_size ScalarSize)) + + ;; Vector narrowing operation -- high half. + (VecRRNarrowHigh + (op VecRRNarrowOp) + (rd WritableReg) + (ri Reg) + (rn Reg) + (lane_size ScalarSize)) ;; 1-operand vector instruction that operates on a pair of elements. @@ -569,6 +610,17 @@ (rm Reg) (high_half bool)) + ;; 2-operand vector instruction that produces a result with + ;; twice the lane width and half the number of lanes. Variant + ;; that modifies `rd` (so takes its initial state as `ri`). + (VecRRRLongMod + (alu_op VecRRRLongModOp) + (rd WritableReg) + (ri Reg) + (rn Reg) + (rm Reg) + (high_half bool)) + ;; 1-operand vector instruction that extends elements of the input ;; register and operates on a pair of elements. 
The output lane width ;; is double that of the input. @@ -589,6 +641,7 @@ (VecRRRMod (alu_op VecALUModOp) (rd WritableReg) + (ri Reg) (rn Reg) (rm Reg) (size VectorSize)) @@ -623,6 +676,7 @@ (VecShiftImmMod (op VecShiftImmModOp) (rd WritableReg) + (ri Reg) (rn Reg) (size VectorSize) (imm u8)) @@ -635,29 +689,55 @@ (rm Reg) (imm4 u8)) - ;; Table vector lookup - single register table. The table consists of 8-bit elements and is - ;; stored in `rn`, while `rm` contains 8-bit element indices. `is_extension` specifies whether - ;; to emit a TBX or a TBL instruction, i.e. whether to leave the elements in the destination - ;; vector that correspond to out-of-range indices (greater than 15) unmodified or to set them - ;; to 0. + ;; Table vector lookup - single register table. The table + ;; consists of 8-bit elements and is stored in `rn`, while `rm` + ;; contains 8-bit element indices. This variant emits `TBL`, + ;; which sets elements that correspond to out-of-range indices + ;; (greater than 15) to 0. (VecTbl (rd WritableReg) (rn Reg) - (rm Reg) - (is_extension bool)) + (rm Reg)) - ;; Table vector lookup - two register table. The table consists of 8-bit elements and is - ;; stored in `rn` and `rn2`, while `rm` contains 8-bit element indices. `is_extension` - ;; specifies whether to emit a TBX or a TBL instruction, i.e. whether to leave the elements in - ;; the destination vector that correspond to out-of-range indices (greater than 31) unmodified - ;; or to set them to 0. The table registers `rn` and `rn2` must have consecutive numbers - ;; modulo 32, that is v31 and v0 (in that order) are consecutive registers. + ;; Table vector lookup - single register table. The table + ;; consists of 8-bit elements and is stored in `rn`, while `rm` + ;; contains 8-bit element indices. This variant emits `TBX`, + ;; which leaves elements that correspond to out-of-range indices + ;; (greater than 15) unmodified. Hence, it takes an input vreg in + ;; `ri` that is constrained to the same allocation as `rd`. + (VecTblExt + (rd WritableReg) + (ri Reg) + (rn Reg) + (rm Reg)) + + ;; Table vector lookup - two register table. The table consists + ;; of 8-bit elements and is stored in `rn` and `rn2`, while + ;; `rm` contains 8-bit element indices. The table registers + ;; `rn` and `rn2` must have consecutive numbers modulo 32, that + ;; is v31 and v0 (in that order) are consecutive registers. + ;; This variant emits `TBL`, which sets out-of-range results to + ;; 0. (VecTbl2 (rd WritableReg) (rn Reg) (rn2 Reg) - (rm Reg) - (is_extension bool)) + (rm Reg)) + + ;; Table vector lookup - two register table. The table consists + ;; of 8-bit elements and is stored in `rn` and `rn2`, while + ;; `rm` contains 8-bit element indices. The table registers + ;; `rn` and `rn2` must have consecutive numbers modulo 32, that + ;; is v31 and v0 (in that order) are consecutive registers. + ;; This variant emits `TBX`, which leaves out-of-range results + ;; unmodified, hence takes the initial state of the result + ;; register in vreg `ri`. + (VecTbl2Ext + (rd WritableReg) + (ri Reg) + (rn Reg) + (rn2 Reg) + (rm Reg)) ;; Load an element and replicate to all lanes of a vector. 
(VecLoadReplicate @@ -888,7 +968,6 @@ (enum (MovZ) (MovN) - (MovK) )) (type UImm5 (primitive UImm5)) @@ -934,6 +1013,7 @@ (type AMode extern (enum)) (type PairAMode extern (enum)) (type FPUOpRI extern (enum)) +(type FPUOpRIMod extern (enum)) (type OperandSize extern (enum Size32 @@ -1287,6 +1367,10 @@ (Umull8) (Umull16) (Umull32) +)) + +(type VecRRRLongModOp + (enum ;; Unsigned multiply add long (Umlal8) (Umlal16) @@ -1447,9 +1531,9 @@ (decl fpu_op_ri_ushr (u8 u8) FPUOpRI) (extern constructor fpu_op_ri_ushr fpu_op_ri_ushr) -;; Constructs an FPUOpRI.Sli* given the size in bits of the value (or lane) +;; Constructs an FPUOpRIMod.Sli* given the size in bits of the value (or lane) ;; and the amount to shift by. -(decl fpu_op_ri_sli (u8 u8) FPUOpRI) +(decl fpu_op_ri_sli (u8 u8) FPUOpRIMod) (extern constructor fpu_op_ri_sli fpu_op_ri_sli) (decl imm12_from_negated_u64 (Imm12) u64) @@ -1524,29 +1608,6 @@ (decl writable_zero_reg () WritableReg) (extern constructor writable_zero_reg writable_zero_reg) -;; Helpers for getting a particular real register -(decl xreg (u8) Reg) -(extern constructor xreg xreg) - -(decl writable_vreg (u8) WritableReg) -(extern constructor writable_vreg writable_vreg) - -(decl writable_xreg (u8) WritableReg) -(extern constructor writable_xreg writable_xreg) - -;; Helper for emitting `MInst.Mov64` instructions. -(decl mov64_to_real (u8 Reg) Reg) -(rule (mov64_to_real num src) - (let ((dst WritableReg (writable_xreg num)) - (_ Unit (emit (MInst.Mov (operand_size $I64) dst src)))) - dst)) - -(decl mov64_from_real (u8) Reg) -(rule (mov64_from_real num) - (let ((dst WritableReg (temp_writable_reg $I64)) - (_ Unit (emit (MInst.Mov (operand_size $I64) dst (xreg num))))) - dst)) - ;; Helper for emitting `MInst.MovZ` instructions. (decl movz (MoveWideConst OperandSize) Reg) (rule (movz imm size) @@ -1601,8 +1662,7 @@ (decl vec_rrr_mod (VecALUModOp Reg Reg Reg VectorSize) Reg) (rule (vec_rrr_mod op src1 src2 src3 size) (let ((dst WritableReg (temp_writable_reg $I8X16)) - (_1 Unit (emit (MInst.FpuMove128 dst src1))) - (_2 Unit (emit (MInst.VecRRRMod op dst src2 src3 size)))) + (_1 Unit (emit (MInst.VecRRRMod op dst src1 src2 src3 size)))) dst)) (decl fpu_rri (FPUOpRI Reg) Reg) @@ -1611,6 +1671,12 @@ (_ Unit (emit (MInst.FpuRRI op dst src)))) dst)) +(decl fpu_rri_mod (FPUOpRIMod Reg Reg) Reg) +(rule (fpu_rri_mod op dst_src src) + (let ((dst WritableReg (temp_writable_reg $F64)) + (_ Unit (emit (MInst.FpuRRIMod op dst dst_src src)))) + dst)) + ;; Helper for emitting `MInst.FpuRRR` instructions. (decl fpu_rrr (FPUOp2 Reg Reg ScalarSize) Reg) (rule (fpu_rrr op src1 src2 size) @@ -1790,29 +1856,33 @@ dst)) ;; Helper for emitting `MInst.VecTbl` instructions. -(decl vec_tbl (Reg Reg bool) Reg) -(rule (vec_tbl rn rm is_extension) +(decl vec_tbl (Reg Reg) Reg) +(rule (vec_tbl rn rm) (let ((dst WritableReg (temp_writable_reg $I8X16)) - (_ Unit (emit (MInst.VecTbl dst rn rm is_extension)))) + (_ Unit (emit (MInst.VecTbl dst rn rm)))) + dst)) + +(decl vec_tbl_ext (Reg Reg Reg) Reg) +(rule (vec_tbl_ext ri rn rm) + (let ((dst WritableReg (temp_writable_reg $I8X16)) + (_ Unit (emit (MInst.VecTblExt dst ri rn rm)))) dst)) ;; Helper for emitting `MInst.VecTbl2` instructions. -;; - 2 register table vector lookups require consecutive table registers; -;; we satisfy this constraint by hardcoding the usage of v30 and v31. -;; - Make sure that both args are in virtual regs, since it is not guaranteed -;; that we can get them safely to the temporaries if either is in a real -;; register. 
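Stepping back to the `MovWide`/`MovK` split above: MOVZ/MOVN fully define their destination, but MOVK only replaces one 16-bit field and keeps the remaining bits, so it must read the destination's previous value — that is the `rn` input the new `MInst.MovK` carries and ties to `rd`. A minimal host-side Rust model of the semantics (a sketch for illustration, not code from this patch):

```rust
// Host-side model of AArch64 MOVZ/MOVK semantics. `shift` is the
// hw field scaled to bits: 0, 16, 32, or 48.
fn movz(imm16: u16, shift: u32) -> u64 {
    (imm16 as u64) << shift
}

// MOVK reads the old value: it replaces one 16-bit field, keeps the rest.
fn movk(old: u64, imm16: u16, shift: u32) -> u64 {
    (old & !(0xffffu64 << shift)) | ((imm16 as u64) << shift)
}

fn main() {
    // Build 0x1234_5678_9abc_def0 the way `Inst::load_constant` chains
    // one MOVZ with MOVKs over the remaining half-words.
    let mut v = movz(0xdef0, 0);
    v = movk(v, 0x9abc, 16);
    v = movk(v, 0x5678, 32);
    v = movk(v, 0x1234, 48);
    assert_eq!(v, 0x1234_5678_9abc_def0);
}
```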
-(decl vec_tbl2 (Reg Reg Reg bool Type) Reg) -(rule (vec_tbl2 rn rn2 rm is_extension ty) +(decl vec_tbl2 (Reg Reg Reg Type) Reg) +(rule (vec_tbl2 rn rn2 rm ty) (let ( - (temp WritableReg (writable_vreg 30)) - (temp2 WritableReg (writable_vreg 31)) (dst WritableReg (temp_writable_reg $I8X16)) - (rn Reg (ensure_in_vreg rn ty)) - (rn2 Reg (ensure_in_vreg rn2 ty)) - (_ Unit (emit (MInst.FpuMove128 temp rn))) - (_ Unit (emit (MInst.FpuMove128 temp2 rn2))) - (_ Unit (emit (MInst.VecTbl2 dst temp temp2 rm is_extension))) + (_ Unit (emit (MInst.VecTbl2 dst rn rn2 rm))) + ) + dst)) + +;; Helper for emitting `MInst.VecTbl2Ext` instructions. +(decl vec_tbl2_ext (Reg Reg Reg Reg Type) Reg) +(rule (vec_tbl2_ext ri rn rn2 rm ty) + (let ( + (dst WritableReg (temp_writable_reg $I8X16)) + (_ Unit (emit (MInst.VecTbl2Ext dst ri rn rn2 rm))) ) dst)) @@ -1830,22 +1900,18 @@ (_ Unit (emit (MInst.VecRRPairLong op dst src)))) dst)) -;; Helper for emitting `MInst.VecRRRLong` instructions, but for variants -;; where the operation both reads and modifies the destination register. -;; -;; Currently this is only used for `VecRRRLongOp.Umlal*` -(decl vec_rrrr_long (VecRRRLongOp Reg Reg Reg bool) Reg) +;; Helper for emitting `MInst.VecRRRLongMod` instructions. +(decl vec_rrrr_long (VecRRRLongModOp Reg Reg Reg bool) Reg) (rule (vec_rrrr_long op src1 src2 src3 high_half) (let ((dst WritableReg (temp_writable_reg $I8X16)) - (_ Unit (emit (MInst.FpuMove128 dst src1))) - (_ Unit (emit (MInst.VecRRRLong op dst src2 src3 high_half)))) + (_ Unit (emit (MInst.VecRRRLongMod op dst src1 src2 src3 high_half)))) dst)) ;; Helper for emitting `MInst.VecRRNarrow` instructions. -(decl vec_rr_narrow (VecRRNarrowOp Reg ScalarSize) Reg) -(rule (vec_rr_narrow op src size) +(decl vec_rr_narrow_low (VecRRNarrowOp Reg ScalarSize) Reg) +(rule (vec_rr_narrow_low op src size) (let ((dst WritableReg (temp_writable_reg $I8X16)) - (_ Unit (emit (MInst.VecRRNarrow op dst src $false size)))) + (_ Unit (emit (MInst.VecRRNarrowLow op dst src size)))) dst)) ;; Helper for emitting `MInst.VecRRNarrow` instructions which update the @@ -1853,8 +1919,7 @@ (decl vec_rr_narrow_high (VecRRNarrowOp Reg Reg ScalarSize) Reg) (rule (vec_rr_narrow_high op mod src size) (let ((dst WritableReg (temp_writable_reg $I8X16)) - (_ Unit (emit (MInst.FpuMove128 dst mod))) - (_ Unit (emit (MInst.VecRRNarrow op dst src $true size)))) + (_ Unit (emit (MInst.VecRRNarrowHigh op dst mod src size)))) dst)) ;; Helper for emitting `MInst.VecRRLong` instructions. @@ -1897,16 +1962,14 @@ (decl mov_to_vec (Reg Reg u8 VectorSize) Reg) (rule (mov_to_vec src1 src2 lane size) (let ((dst WritableReg (temp_writable_reg $I8X16)) - (_ Unit (emit (MInst.FpuMove128 dst src1))) - (_ Unit (emit (MInst.MovToVec dst src2 lane size)))) + (_ Unit (emit (MInst.MovToVec dst src1 src2 lane size)))) dst)) ;; Helper for emitting `MInst.VecMovElement` instructions. (decl mov_vec_elem (Reg Reg u8 u8 VectorSize) Reg) (rule (mov_vec_elem src1 src2 dst_idx src_idx size) (let ((dst WritableReg (temp_writable_reg $I8X16)) - (_ Unit (emit (MInst.FpuMove128 dst src1))) - (_ Unit (emit (MInst.VecMovElement dst src2 dst_idx src_idx size)))) + (_ Unit (emit (MInst.VecMovElement dst src1 src2 dst_idx src_idx size)))) dst)) ;; Helper for emitting `MInst.MovFromVec` instructions. @@ -2104,15 +2167,15 @@ ;; Helper for generating `xtn` instructions. 
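Before the `xtn` helpers below: the low/high split mirrors the hardware. `xtn` writes the low 64 bits of the destination and zeroes the high half, while `xtn2` writes only the high half and must preserve the low one — hence the extra `ri` input on `VecRRNarrowHigh`. A host-side sketch of the two behaviors (illustrative only, not code from this patch):

```rust
// Narrow four 32-bit lanes to 16 bits each (like XTN/XTN2 on a .4s input).
// The "low" form defines the whole destination (high half zeroed)...
fn xtn(src: [u32; 4]) -> [u16; 8] {
    let mut out = [0u16; 8];
    for (i, &lane) in src.iter().enumerate() {
        out[i] = lane as u16; // truncate each lane
    }
    out
}

// ...while the "2" (high-half) form merges into an existing destination,
// so it consumes the destination's previous value `prev` -- the `ri` vreg.
fn xtn2(prev: [u16; 8], src: [u32; 4]) -> [u16; 8] {
    let mut out = prev;
    for (i, &lane) in src.iter().enumerate() {
        out[4 + i] = lane as u16;
    }
    out
}

fn main() {
    let low = xtn([1, 2, 3, 4]);
    let full = xtn2(low, [5, 6, 7, 8]);
    assert_eq!(full, [1, 2, 3, 4, 5, 6, 7, 8]);
}
```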
(decl xtn (Reg ScalarSize) Reg) -(rule (xtn x size) (vec_rr_narrow (VecRRNarrowOp.Xtn) x size)) +(rule (xtn x size) (vec_rr_narrow_low (VecRRNarrowOp.Xtn) x size)) ;; Helper for generating `fcvtn` instructions. (decl fcvtn (Reg ScalarSize) Reg) -(rule (fcvtn x size) (vec_rr_narrow (VecRRNarrowOp.Fcvtn) x size)) +(rule (fcvtn x size) (vec_rr_narrow_low (VecRRNarrowOp.Fcvtn) x size)) ;; Helper for generating `sqxtn` instructions. (decl sqxtn (Reg ScalarSize) Reg) -(rule (sqxtn x size) (vec_rr_narrow (VecRRNarrowOp.Sqxtn) x size)) +(rule (sqxtn x size) (vec_rr_narrow_low (VecRRNarrowOp.Sqxtn) x size)) ;; Helper for generating `sqxtn2` instructions. (decl sqxtn2 (Reg Reg ScalarSize) Reg) @@ -2120,7 +2183,7 @@ ;; Helper for generating `sqxtun` instructions. (decl sqxtun (Reg ScalarSize) Reg) -(rule (sqxtun x size) (vec_rr_narrow (VecRRNarrowOp.Sqxtun) x size)) +(rule (sqxtun x size) (vec_rr_narrow_low (VecRRNarrowOp.Sqxtun) x size)) ;; Helper for generating `sqxtun2` instructions. (decl sqxtun2 (Reg Reg ScalarSize) Reg) @@ -2128,7 +2191,7 @@ ;; Helper for generating `uqxtn` instructions. (decl uqxtn (Reg ScalarSize) Reg) -(rule (uqxtn x size) (vec_rr_narrow (VecRRNarrowOp.Uqxtn) x size)) +(rule (uqxtn x size) (vec_rr_narrow_low (VecRRNarrowOp.Uqxtn) x size)) ;; Helper for generating `uqxtn2` instructions. (decl uqxtn2 (Reg Reg ScalarSize) Reg) @@ -2187,7 +2250,7 @@ ;; Helper for generating `umlal32` instructions. (decl umlal32 (Reg Reg Reg bool) Reg) -(rule (umlal32 x y z high_half) (vec_rrrr_long (VecRRRLongOp.Umlal32) x y z high_half)) +(rule (umlal32 x y z high_half) (vec_rrrr_long (VecRRRLongModOp.Umlal32) x y z high_half)) ;; Helper for generating `smull8` instructions. (decl smull8 (Reg Reg bool) Reg) @@ -2719,8 +2782,7 @@ (rule (lse_atomic_cas addr expect replace ty) (let ( (dst WritableReg (temp_writable_reg ty)) - (_ Unit (emit (MInst.Mov (operand_size ty) dst expect))) - (_ Unit (emit (MInst.AtomicCAS dst replace addr ty))) + (_ Unit (emit (MInst.AtomicCAS dst expect replace addr ty))) ) dst)) @@ -2730,16 +2792,13 @@ ;; regs, and that's not guaranteed safe if either is in a real reg. ;; - Move the args to the preordained AtomicRMW input regs ;; - And finally, copy the preordained AtomicRMW output reg to its destination. -(decl atomic_rmw_loop (AtomicRMWLoopOp Value Value Type) Reg) -(rule (atomic_rmw_loop op p arg2 ty) - (let ( - (v_addr Reg (ensure_in_vreg p $I64)) - (v_arg2 Reg (ensure_in_vreg arg2 $I64)) - (r_addr Reg (mov64_to_real 25 v_addr)) - (r_arg2 Reg (mov64_to_real 26 v_arg2)) - (_ Unit (emit (MInst.AtomicRMWLoop ty op))) - ) - (mov64_from_real 27))) +(decl atomic_rmw_loop (AtomicRMWLoopOp Reg Reg Type) Reg) +(rule (atomic_rmw_loop op addr operand ty) + (let ((dst WritableReg (temp_writable_reg $I64)) + (scratch1 WritableReg (temp_writable_reg $I64)) + (scratch2 WritableReg (temp_writable_reg $I64)) + (_ Unit (emit (MInst.AtomicRMWLoop ty op addr operand dst scratch1 scratch2)))) + dst)) ;; Helper for emitting `MInst.AtomicCASLoop` instructions. ;; This is very similar to, but not identical to, the AtomicRmw case. Note @@ -2749,21 +2808,10 @@ ;; for `atomic_rmw_loop` above. 
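Both this helper and `atomic_cas_loop` below now pass ordinary virtual registers instead of pinning x25/x26/x27, so the `mov64_to_real`/`mov64_from_real` shuffling disappears. For reference, the semantics the ldaxr/stlxr expansion implements, modeled on the host with a compare-exchange retry loop (a sketch of the semantics, not the emitted code):

```rust
use std::sync::atomic::{AtomicU64, Ordering};

// Host-side model of the load-linked/store-conditional retry loop that
// AtomicRMWLoop expands to: returns the *old* value (the `oldval` def),
// retrying until the store-conditional succeeds.
fn atomic_rmw_loop(addr: &AtomicU64, operand: u64, op: impl Fn(u64, u64) -> u64) -> u64 {
    let mut old = addr.load(Ordering::Acquire); // models ldaxr
    loop {
        let new = op(old, operand); // models the ALU op on the scratch regs
        // compare_exchange_weak models the stlxr + cbnz-retry pair.
        match addr.compare_exchange_weak(old, new, Ordering::AcqRel, Ordering::Acquire) {
            Ok(_) => return old,
            Err(cur) => old = cur,
        }
    }
}

fn main() {
    let v = AtomicU64::new(40);
    let old = atomic_rmw_loop(&v, 2, |a, b| a.wrapping_add(b));
    assert_eq!((old, v.load(Ordering::Relaxed)), (40, 42));
}
```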
(decl atomic_cas_loop (Reg Reg Reg Type) Reg) (rule (atomic_cas_loop addr expect replace ty) - (let ( - (v_addr Reg (ensure_in_vreg addr $I64)) - (v_exp Reg (ensure_in_vreg expect $I64)) - (v_rep Reg (ensure_in_vreg replace $I64)) - ;; Move the args to the preordained AtomicCASLoop input regs - (r_addr Reg (mov64_to_real 25 v_addr)) - (r_exp Reg (mov64_to_real 26 v_exp)) - (r_rep Reg (mov64_to_real 28 v_rep)) - ;; Now the AtomicCASLoop itself, implemented in the normal way, with a - ;; load-exclusive, store-exclusive loop - (_ Unit (emit (MInst.AtomicCASLoop ty))) - ) - ;; And finally, copy the preordained AtomicCASLoop output reg to its destination. - ;; Also, x24 and x28 are trashed. - (mov64_from_real 27))) + (let ((dst WritableReg (temp_writable_reg $I64)) + (scratch WritableReg (temp_writable_reg $I64)) + (_ Unit (emit (MInst.AtomicCASLoop ty addr expect replace dst scratch)))) + dst)) ;; Helper for emitting `MInst.MovPReg` instructions. (decl mov_preg (PReg) Reg) @@ -2811,15 +2859,13 @@ (decl fcopy_sign (Reg Reg Type) Reg) (rule (fcopy_sign x y (ty_scalar_float ty)) (let ((dst WritableReg (temp_writable_reg $F64)) - (_ Unit (emit (MInst.FpuMove64 dst x))) (tmp Reg (fpu_rri (fpu_op_ri_ushr (ty_bits ty) (max_shift ty)) y)) - (_ Unit (emit (MInst.FpuRRI (fpu_op_ri_sli (ty_bits ty) (max_shift ty)) dst tmp)))) + (_ Unit (emit (MInst.FpuRRIMod (fpu_op_ri_sli (ty_bits ty) (max_shift ty)) dst x tmp)))) dst)) (rule (fcopy_sign x y ty @ (multi_lane _ _)) (let ((dst WritableReg (temp_writable_reg $I8X16)) - (_ Unit (emit (MInst.FpuMove128 dst x))) (tmp Reg (vec_shift_imm (VecShiftImmOp.Ushr) (max_shift (lane_type ty)) y (vector_size ty))) - (_ Unit (emit (MInst.VecShiftImmMod (VecShiftImmModOp.Sli) dst tmp (vector_size ty) (max_shift (lane_type ty)))))) + (_ Unit (emit (MInst.VecShiftImmMod (VecShiftImmModOp.Sli) dst x tmp (vector_size ty) (max_shift (lane_type ty)))))) dst)) ;; Helpers for generating `MInst.FpuToInt` instructions. diff --git a/cranelift/codegen/src/isa/aarch64/inst/args.rs b/cranelift/codegen/src/isa/aarch64/inst/args.rs index 4428be2a83..57869e1c32 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/args.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/args.rs @@ -3,7 +3,7 @@ use crate::ir::types::*; use crate::ir::Type; use crate::isa::aarch64::inst::*; -use crate::machinst::{ty_bits, MachLabel, PrettyPrint, Reg, Writable}; +use crate::machinst::{ty_bits, MachLabel, PrettyPrint, Reg}; use core::convert::Into; use std::string::String; @@ -122,9 +122,11 @@ pub enum AMode { // Real ARM64 addressing modes: // /// "post-indexed" mode as per AArch64 docs: postincrement reg after address computation. - PostIndexed(Writable<Reg>, SImm9), + /// Specialized here to SP so we don't have to emit regalloc metadata. + SPPostIndexed(SImm9), /// "pre-indexed" mode as per AArch64 docs: preincrement reg before address computation. - PreIndexed(Writable<Reg>, SImm9), + /// Specialized here to SP so we don't have to emit regalloc metadata. + SPPreIndexed(SImm9), // N.B.: RegReg, RegScaled, and RegScaledExtended all correspond to // what the ISA calls the "register offset" addressing mode. 
We split out @@ -220,10 +222,12 @@ impl AMode { &AMode::RegExtended(r1, r2, ext) => { AMode::RegExtended(allocs.next(r1), allocs.next(r2), ext) } - &AMode::PreIndexed(reg, simm9) => AMode::PreIndexed(allocs.next_writable(reg), simm9), - &AMode::PostIndexed(reg, simm9) => AMode::PostIndexed(allocs.next_writable(reg), simm9), + // Note that SP is not managed by regalloc, so there is no register to report in the + // pre/post-indexed amodes. &AMode::RegOffset(r, off, ty) => AMode::RegOffset(allocs.next(r), off, ty), - &AMode::FPOffset(..) + &AMode::SPPreIndexed(..) + | &AMode::SPPostIndexed(..) + | &AMode::FPOffset(..) | &AMode::SPOffset(..) | &AMode::NominalSPOffset(..) | AMode::Label(..) => self.clone(), @@ -235,8 +239,8 @@ #[derive(Clone, Debug)] pub enum PairAMode { SignedOffset(Reg, SImm7Scaled), - PreIndexed(Writable<Reg>, SImm7Scaled), - PostIndexed(Writable<Reg>, SImm7Scaled), + SPPreIndexed(SImm7Scaled), + SPPostIndexed(SImm7Scaled), } impl PairAMode { @@ -246,12 +250,7 @@ &PairAMode::SignedOffset(reg, simm7scaled) => { PairAMode::SignedOffset(allocs.next(reg), simm7scaled) } - &PairAMode::PreIndexed(reg, simm7scaled) => { - PairAMode::PreIndexed(allocs.next_writable(reg), simm7scaled) - } - &PairAMode::PostIndexed(reg, simm7scaled) => { - PairAMode::PostIndexed(allocs.next_writable(reg), simm7scaled) - } + &PairAMode::SPPreIndexed(..) | &PairAMode::SPPostIndexed(..) => self.clone(), } } } @@ -470,15 +469,13 @@ impl PrettyPrint for AMode { format!("[{}, {}, {}]", r1, r2, op) } &AMode::Label(ref label) => label.pretty_print(0, allocs), - &AMode::PreIndexed(r, simm9) => { - let r = pretty_print_reg(r.to_reg(), allocs); + &AMode::SPPreIndexed(simm9) => { let simm9 = simm9.pretty_print(8, allocs); - format!("[{}, {}]!", r, simm9) + format!("[sp, {}]!", simm9) } - &AMode::PostIndexed(r, simm9) => { - let r = pretty_print_reg(r.to_reg(), allocs); + &AMode::SPPostIndexed(simm9) => { let simm9 = simm9.pretty_print(8, allocs); - format!("[{}], {}", r, simm9) + format!("[sp], {}", simm9) } // Eliminated by `mem_finalize()`. &AMode::SPOffset(..) 
@@ -503,15 +500,13 @@ impl PrettyPrint for PairAMode { format!("[{}]", reg) } } - &PairAMode::PreIndexed(reg, simm7) => { - let reg = pretty_print_reg(reg.to_reg(), allocs); + &PairAMode::SPPreIndexed(simm7) => { let simm7 = simm7.pretty_print(8, allocs); - format!("[{}, {}]!", reg, simm7) + format!("[sp, {}]!", simm7) } - &PairAMode::PostIndexed(reg, simm7) => { - let reg = pretty_print_reg(reg.to_reg(), allocs); + &PairAMode::SPPostIndexed(simm7) => { let simm7 = simm7.pretty_print(8, allocs); - format!("[{}], {}", reg, simm7) + format!("[sp], {}", simm7) } } } diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs index 57e1bfb488..3fb53f81f9 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs @@ -184,7 +184,6 @@ fn enc_move_wide(op: MoveWideOp, rd: Writable<Reg>, imm: MoveWideConst, size: Op let op = match op { MoveWideOp::MovN => 0b00, MoveWideOp::MovZ => 0b10, - MoveWideOp::MovK => 0b11, }; 0x12800000 | size.sf_bit() << 31 @@ -194,6 +193,15 @@ | machreg_to_gpr(rd.to_reg()) } +fn enc_movk(rd: Writable<Reg>, imm: MoveWideConst, size: OperandSize) -> u32 { + assert!(imm.shift <= 0b11); + 0x72800000 + | size.sf_bit() << 31 + | u32::from(imm.shift) << 21 + | u32::from(imm.bits) << 5 + | machreg_to_gpr(rd.to_reg()) +} + fn enc_ldst_pair(op_31_22: u32, simm7: SImm7Scaled, rn: Reg, rt: Reg, rt2: Reg) -> u32 { (op_31_22 << 22) | (simm7.bits() << 15) @@ -1040,12 +1048,12 @@ impl MachInstEmit for Inst { _ => panic!("Unsupported size for LDR from constant pool!"), } } - &AMode::PreIndexed(reg, simm9) => { - let reg = allocs.next(reg.to_reg()); + &AMode::SPPreIndexed(simm9) => { + let reg = stack_reg(); sink.put4(enc_ldst_simm9(op, simm9, 0b11, reg, rd)); } - &AMode::PostIndexed(reg, simm9) => { - let reg = allocs.next(reg.to_reg()); + &AMode::SPPostIndexed(simm9) => { + let reg = stack_reg(); sink.put4(enc_ldst_simm9(op, simm9, 0b01, reg, rd)); } // Eliminated by `mem_finalize()` above. @@ -1134,12 +1142,12 @@ &AMode::Label(..) => { panic!("Store to a MemLabel not implemented!"); } - &AMode::PreIndexed(reg, simm9) => { - let reg = allocs.next(reg.to_reg()); + &AMode::SPPreIndexed(simm9) => { + let reg = stack_reg(); sink.put4(enc_ldst_simm9(op, simm9, 0b11, reg, rd)); } - &AMode::PostIndexed(reg, simm9) => { - let reg = allocs.next(reg.to_reg()); + &AMode::SPPostIndexed(simm9) => { + let reg = stack_reg(); sink.put4(enc_ldst_simm9(op, simm9, 0b01, reg, rd)); } // Eliminated by `mem_finalize()` above. 
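A note on the rename: pre-indexed mode updates the base register before using it as the address, post-indexed mode after, and since SP is not an allocatable register there is nothing for regalloc to track — which is what lets these arms drop their `allocs.next(..)` calls. The update semantics, sketched on the host (illustrative, not this patch's code):

```rust
// `str x1, [sp, #-16]!`  (pre-indexed): base is updated first, then used.
fn pre_indexed(sp: &mut u64, simm9: i64) -> u64 {
    *sp = sp.wrapping_add(simm9 as u64);
    *sp
}

// `ldr x1, [sp], #16`  (post-indexed): base is used first, then updated.
fn post_indexed(sp: &mut u64, simm9: i64) -> u64 {
    let addr = *sp;
    *sp = sp.wrapping_add(simm9 as u64);
    addr
}

fn main() {
    let mut sp: u64 = 0x1000;
    assert_eq!(pre_indexed(&mut sp, -16), 0x0ff0); // push-style store
    assert_eq!(post_indexed(&mut sp, 16), 0x0ff0); // pop-style load
    assert_eq!(sp, 0x1000);
}
```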
@@ -1170,14 +1178,14 @@ impl MachInstEmit for Inst { let reg = allocs.next(reg); sink.put4(enc_ldst_pair(0b1010100100, simm7, reg, rt, rt2)); } - &PairAMode::PreIndexed(reg, simm7) => { + &PairAMode::SPPreIndexed(simm7) => { assert_eq!(simm7.scale_ty, I64); - let reg = allocs.next(reg.to_reg()); + let reg = stack_reg(); sink.put4(enc_ldst_pair(0b1010100110, simm7, reg, rt, rt2)); } - &PairAMode::PostIndexed(reg, simm7) => { + &PairAMode::SPPostIndexed(simm7) => { assert_eq!(simm7.scale_ty, I64); - let reg = allocs.next(reg.to_reg()); + let reg = stack_reg(); sink.put4(enc_ldst_pair(0b1010100010, simm7, reg, rt, rt2)); } } @@ -1203,14 +1211,14 @@ impl MachInstEmit for Inst { let reg = allocs.next(reg); sink.put4(enc_ldst_pair(0b1010100101, simm7, reg, rt, rt2)); } - &PairAMode::PreIndexed(reg, simm7) => { + &PairAMode::SPPreIndexed(simm7) => { assert_eq!(simm7.scale_ty, I64); - let reg = allocs.next(reg.to_reg()); + let reg = stack_reg(); sink.put4(enc_ldst_pair(0b1010100111, simm7, reg, rt, rt2)); } - &PairAMode::PostIndexed(reg, simm7) => { + &PairAMode::SPPostIndexed(simm7) => { assert_eq!(simm7.scale_ty, I64); - let reg = allocs.next(reg.to_reg()); + let reg = stack_reg(); sink.put4(enc_ldst_pair(0b1010100011, simm7, reg, rt, rt2)); } } @@ -1249,14 +1257,14 @@ impl MachInstEmit for Inst { let reg = allocs.next(reg); sink.put4(enc_ldst_vec_pair(opc, 0b10, true, simm7, reg, rt, rt2)); } - &PairAMode::PreIndexed(reg, simm7) => { + &PairAMode::SPPreIndexed(simm7) => { assert!(simm7.scale_ty == F64 || simm7.scale_ty == I8X16); - let reg = allocs.next(reg.to_reg()); + let reg = stack_reg(); sink.put4(enc_ldst_vec_pair(opc, 0b11, true, simm7, reg, rt, rt2)); } - &PairAMode::PostIndexed(reg, simm7) => { + &PairAMode::SPPostIndexed(simm7) => { assert!(simm7.scale_ty == F64 || simm7.scale_ty == I8X16); - let reg = allocs.next(reg.to_reg()); + let reg = stack_reg(); sink.put4(enc_ldst_vec_pair(opc, 0b01, true, simm7, reg, rt, rt2)); } } @@ -1295,14 +1303,14 @@ impl MachInstEmit for Inst { let reg = allocs.next(reg); sink.put4(enc_ldst_vec_pair(opc, 0b10, false, simm7, reg, rt, rt2)); } - &PairAMode::PreIndexed(reg, simm7) => { + &PairAMode::SPPreIndexed(simm7) => { assert!(simm7.scale_ty == F64 || simm7.scale_ty == I8X16); - let reg = allocs.next(reg.to_reg()); + let reg = stack_reg(); sink.put4(enc_ldst_vec_pair(opc, 0b11, false, simm7, reg, rt, rt2)); } - &PairAMode::PostIndexed(reg, simm7) => { + &PairAMode::SPPostIndexed(simm7) => { assert!(simm7.scale_ty == F64 || simm7.scale_ty == I8X16); - let reg = allocs.next(reg.to_reg()); + let reg = stack_reg(); sink.put4(enc_ldst_vec_pair(opc, 0b01, false, simm7, reg, rt, rt2)); } } @@ -1356,6 +1364,12 @@ impl MachInstEmit for Inst { let rd = allocs.next_writable(rd); sink.put4(enc_move_wide(op, rd, imm, size)); } + &Inst::MovK { rd, rn, imm, size } => { + let rn = allocs.next(rn); + let rd = allocs.next_writable(rd); + debug_assert_eq!(rn, rd.to_reg()); + sink.put4(enc_movk(rd, imm, size)); + } &Inst::CSel { rd, rn, rm, cond } => { let rd = allocs.next_writable(rd); let rn = allocs.next(rn); @@ -1403,7 +1417,7 @@ impl MachInstEmit for Inst { let rn = allocs.next(rn); sink.put4(enc_acq_rel(ty, op, rs, rt, rn)); } - &Inst::AtomicRMWLoop { ty, op } => { + &Inst::AtomicRMWLoop { ty, op, .. 
} => { /* Emit this: again: ldaxr{,b,h} x/w27, [x25] @@ -1581,8 +1595,10 @@ impl MachInstEmit for Inst { )); sink.use_label_at_offset(br_offset, again_label, LabelUse::Branch19); } - &Inst::AtomicCAS { rs, rt, rn, ty } => { - let rs = allocs.next_writable(rs); + &Inst::AtomicCAS { rd, rs, rt, rn, ty } => { + let rd = allocs.next_writable(rd); + let rs = allocs.next(rs); + debug_assert_eq!(rd.to_reg(), rs); let rt = allocs.next(rt); let rn = allocs.next(rn); let size = match ty { @@ -1593,9 +1609,9 @@ impl MachInstEmit for Inst { _ => panic!("Unsupported type: {}", ty), }; - sink.put4(enc_cas(size, rs, rt, rn)); + sink.put4(enc_cas(size, rd, rt, rn)); } - &Inst::AtomicCASLoop { ty } => { + &Inst::AtomicCASLoop { ty, .. } => { /* Emit this: again: ldaxr{,b,h} x/w27, [x25] @@ -1788,7 +1804,15 @@ impl MachInstEmit for Inst { | machreg_to_vec(rd.to_reg()), ) } - FPUOpRI::Sli64(imm) => { + } + } + &Inst::FpuRRIMod { fpu_op, rd, ri, rn } => { + let rd = allocs.next_writable(rd); + let ri = allocs.next(ri); + let rn = allocs.next(rn); + debug_assert_eq!(rd.to_reg(), ri); + match fpu_op { + FPUOpRIMod::Sli64(imm) => { debug_assert_eq!(64, imm.lane_size_in_bits); sink.put4( 0b01_1_111110_0000000_010101_00000_00000 @@ -1797,7 +1821,7 @@ impl MachInstEmit for Inst { | machreg_to_vec(rd.to_reg()), ) } - FPUOpRI::Sli32(imm) => { + FPUOpRIMod::Sli32(imm) => { debug_assert_eq!(32, imm.lane_size_in_bits); sink.put4( 0b0_0_1_011110_0000000_010101_00000_00000 @@ -2036,11 +2060,14 @@ impl MachInstEmit for Inst { &Inst::VecShiftImmMod { op, rd, + ri, rn, size, imm, } => { let rd = allocs.next_writable(rd); + let ri = allocs.next(ri); + debug_assert_eq!(rd.to_reg(), ri); let rn = allocs.next(rn); let (is_shr, mut template) = match op { VecShiftImmModOp::Sli => (false, 0b_001_011110_0000_000_010101_00000_00000_u32), @@ -2096,30 +2123,43 @@ impl MachInstEmit for Inst { ); } } - &Inst::VecTbl { - rd, - rn, - rm, - is_extension, - } => { + &Inst::VecTbl { rd, rn, rm } => { let rn = allocs.next(rn); let rm = allocs.next(rm); let rd = allocs.next_writable(rd); - sink.put4(enc_tbl(is_extension, 0b00, rd, rn, rm)); + sink.put4(enc_tbl(/* is_extension = */ false, 0b00, rd, rn, rm)); } - &Inst::VecTbl2 { - rd, - rn, - rn2, - rm, - is_extension, - } => { + &Inst::VecTblExt { rd, ri, rn, rm } => { + let rn = allocs.next(rn); + let rm = allocs.next(rm); + let rd = allocs.next_writable(rd); + let ri = allocs.next(ri); + debug_assert_eq!(rd.to_reg(), ri); + sink.put4(enc_tbl(/* is_extension = */ true, 0b00, rd, rn, rm)); + } + &Inst::VecTbl2 { rd, rn, rn2, rm } => { let rn = allocs.next(rn); let rn2 = allocs.next(rn2); let rm = allocs.next(rm); let rd = allocs.next_writable(rd); assert_eq!(machreg_to_vec(rn2), (machreg_to_vec(rn) + 1) % 32); - sink.put4(enc_tbl(is_extension, 0b01, rd, rn, rm)); + sink.put4(enc_tbl(/* is_extension = */ false, 0b01, rd, rn, rm)); + } + &Inst::VecTbl2Ext { + rd, + ri, + rn, + rn2, + rm, + } => { + let rn = allocs.next(rn); + let rn2 = allocs.next(rn2); + let rm = allocs.next(rm); + let rd = allocs.next_writable(rd); + let ri = allocs.next(ri); + debug_assert_eq!(rd.to_reg(), ri); + assert_eq!(machreg_to_vec(rn2), (machreg_to_vec(rn) + 1) % 32); + sink.put4(enc_tbl(/* is_extension = */ true, 0b01, rd, rn, rm)); } &Inst::FpuCmp { size, rn, rm } => { let rn = allocs.next(rn); @@ -2254,8 +2294,16 @@ impl MachInstEmit for Inst { | machreg_to_vec(rd.to_reg()), ); } - &Inst::MovToVec { rd, rn, idx, size } => { + &Inst::MovToVec { + rd, + ri, + rn, + idx, + size, + } => { let rd = 
allocs.next_writable(rd); + let ri = allocs.next(ri); + debug_assert_eq!(rd.to_reg(), ri); let rn = allocs.next(rn); let (imm5, shift) = match size.lane_size() { ScalarSize::Size8 => (0b00001, 1), @@ -2475,15 +2523,26 @@ impl MachInstEmit for Inst { rn, )); } - &Inst::VecRRNarrow { + &Inst::VecRRNarrowLow { op, rd, rn, - high_half, lane_size, + } + | &Inst::VecRRNarrowHigh { + op, + rd, + rn, + lane_size, + .. } => { let rn = allocs.next(rn); let rd = allocs.next_writable(rd); + let high_half = match self { + &Inst::VecRRNarrowLow { .. } => false, + &Inst::VecRRNarrowHigh { .. } => true, + _ => unreachable!(), + }; let size = match lane_size { ScalarSize::Size8 => 0b00, @@ -2516,12 +2575,15 @@ impl MachInstEmit for Inst { } &Inst::VecMovElement { rd, + ri, rn, dest_idx, src_idx, size, } => { let rd = allocs.next_writable(rd); + let ri = allocs.next(ri); + debug_assert_eq!(rd.to_reg(), ri); let rn = allocs.next(rn); let (imm5, shift) = match size.lane_size() { ScalarSize::Size8 => (0b00001, 1), @@ -2569,9 +2631,34 @@ impl MachInstEmit for Inst { VecRRRLongOp::Umull8 => (0b1, 0b00, 0b1), VecRRRLongOp::Umull16 => (0b1, 0b01, 0b1), VecRRRLongOp::Umull32 => (0b1, 0b10, 0b1), - VecRRRLongOp::Umlal8 => (0b1, 0b00, 0b0), - VecRRRLongOp::Umlal16 => (0b1, 0b01, 0b0), - VecRRRLongOp::Umlal32 => (0b1, 0b10, 0b0), + }; + sink.put4(enc_vec_rrr_long( + high_half as u32, + u, + size, + bit14, + rm, + rn, + rd, + )); + } + &Inst::VecRRRLongMod { + rd, + ri, + rn, + rm, + alu_op, + high_half, + } => { + let rd = allocs.next_writable(rd); + let ri = allocs.next(ri); + debug_assert_eq!(rd.to_reg(), ri); + let rn = allocs.next(rn); + let rm = allocs.next(rm); + let (u, size, bit14) = match alu_op { + VecRRRLongModOp::Umlal8 => (0b1, 0b00, 0b0), + VecRRRLongModOp::Umlal16 => (0b1, 0b01, 0b0), + VecRRRLongModOp::Umlal32 => (0b1, 0b10, 0b0), }; sink.put4(enc_vec_rrr_long( high_half as u32, @@ -2702,12 +2789,15 @@ impl MachInstEmit for Inst { } &Inst::VecRRRMod { rd, + ri, rn, rm, alu_op, size, } => { let rd = allocs.next_writable(rd); + let ri = allocs.next(ri); + debug_assert_eq!(rd.to_reg(), ri); let rn = allocs.next(rn); let rm = allocs.next(rm); let (q, _enc_size) = size.enc_size(); diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs index b3dc56d568..a3eaabd68e 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs @@ -1614,20 +1614,20 @@ fn test_aarch64_binemit() { insns.push(( Inst::ULoad64 { rd: writable_xreg(1), - mem: AMode::PreIndexed(writable_xreg(2), SImm9::maybe_from_i64(16).unwrap()), + mem: AMode::SPPreIndexed(SImm9::maybe_from_i64(16).unwrap()), flags: MemFlags::trusted(), }, - "410C41F8", - "ldr x1, [x2, #16]!", + "E10F41F8", + "ldr x1, [sp, #16]!", )); insns.push(( Inst::ULoad64 { rd: writable_xreg(1), - mem: AMode::PostIndexed(writable_xreg(2), SImm9::maybe_from_i64(16).unwrap()), + mem: AMode::SPPostIndexed(SImm9::maybe_from_i64(16).unwrap()), flags: MemFlags::trusted(), }, - "410441F8", - "ldr x1, [x2], #16", + "E10741F8", + "ldr x1, [sp], #16", )); insns.push(( Inst::ULoad64 { @@ -1663,7 +1663,7 @@ fn test_aarch64_binemit() { flags: MemFlags::trusted(), }, "300080521002A072B063308B010240F9", - "movz w16, #1 ; movk w16, #16, LSL #16 ; add x16, fp, x16, UXTX ; ldr x1, [x16]", + "movz w16, #1 ; movk w16, w16, #16, LSL #16 ; add x16, fp, x16, UXTX ; ldr x1, [x16]", )); insns.push(( @@ -1807,20 +1807,20 @@ fn test_aarch64_binemit() { insns.push(( 
Inst::Store64 { rd: xreg(1), - mem: AMode::PreIndexed(writable_xreg(2), SImm9::maybe_from_i64(16).unwrap()), + mem: AMode::SPPreIndexed(SImm9::maybe_from_i64(16).unwrap()), flags: MemFlags::trusted(), }, - "410C01F8", - "str x1, [x2, #16]!", + "E10F01F8", + "str x1, [sp, #16]!", )); insns.push(( Inst::Store64 { rd: xreg(1), - mem: AMode::PostIndexed(writable_xreg(2), SImm9::maybe_from_i64(16).unwrap()), + mem: AMode::SPPostIndexed(SImm9::maybe_from_i64(16).unwrap()), flags: MemFlags::trusted(), }, - "410401F8", - "str x1, [x2], #16", + "E10701F8", + "str x1, [sp], #16", )); insns.push(( @@ -1867,27 +1867,21 @@ fn test_aarch64_binemit() { Inst::StoreP64 { rt: xreg(8), rt2: xreg(9), - mem: PairAMode::PreIndexed( - writable_xreg(10), - SImm7Scaled::maybe_from_i64(-64, I64).unwrap(), - ), + mem: PairAMode::SPPreIndexed(SImm7Scaled::maybe_from_i64(-64, I64).unwrap()), flags: MemFlags::trusted(), }, - "4825BCA9", - "stp x8, x9, [x10, #-64]!", + "E827BCA9", + "stp x8, x9, [sp, #-64]!", )); insns.push(( Inst::StoreP64 { rt: xreg(15), rt2: xreg(16), - mem: PairAMode::PostIndexed( - writable_xreg(20), - SImm7Scaled::maybe_from_i64(504, I64).unwrap(), - ), + mem: PairAMode::SPPostIndexed(SImm7Scaled::maybe_from_i64(504, I64).unwrap()), flags: MemFlags::trusted(), }, - "8FC29FA8", - "stp x15, x16, [x20], #504", + "EFC39FA8", + "stp x15, x16, [sp], #504", )); insns.push(( @@ -1934,27 +1928,21 @@ fn test_aarch64_binemit() { Inst::LoadP64 { rt: writable_xreg(8), rt2: writable_xreg(9), - mem: PairAMode::PreIndexed( - writable_xreg(10), - SImm7Scaled::maybe_from_i64(-64, I64).unwrap(), - ), + mem: PairAMode::SPPreIndexed(SImm7Scaled::maybe_from_i64(-64, I64).unwrap()), flags: MemFlags::trusted(), }, - "4825FCA9", - "ldp x8, x9, [x10, #-64]!", + "E827FCA9", + "ldp x8, x9, [sp, #-64]!", )); insns.push(( Inst::LoadP64 { rt: writable_xreg(8), rt2: writable_xreg(25), - mem: PairAMode::PostIndexed( - writable_xreg(12), - SImm7Scaled::maybe_from_i64(504, I64).unwrap(), - ), + mem: PairAMode::SPPostIndexed(SImm7Scaled::maybe_from_i64(504, I64).unwrap()), flags: MemFlags::trusted(), }, - "88E5DFA8", - "ldp x8, x25, [x12], #504", + "E8E7DFA8", + "ldp x8, x25, [sp], #504", )); insns.push(( @@ -2079,64 +2067,64 @@ fn test_aarch64_binemit() { )); insns.push(( - Inst::MovWide { - op: MoveWideOp::MovK, + Inst::MovK { rd: writable_xreg(12), + rn: xreg(12), imm: MoveWideConst::maybe_from_u64(0x0000_0000_0000_0000).unwrap(), size: OperandSize::Size64, }, "0C0080F2", - "movk x12, #0", + "movk x12, x12, #0", )); insns.push(( - Inst::MovWide { - op: MoveWideOp::MovK, + Inst::MovK { rd: writable_xreg(19), + rn: xreg(19), imm: MoveWideConst::maybe_with_shift(0x0000, 16).unwrap(), size: OperandSize::Size64, }, "1300A0F2", - "movk x19, #0, LSL #16", + "movk x19, x19, #0, LSL #16", )); insns.push(( - Inst::MovWide { - op: MoveWideOp::MovK, + Inst::MovK { rd: writable_xreg(3), + rn: xreg(3), imm: MoveWideConst::maybe_from_u64(0x0000_0000_0000_ffff).unwrap(), size: OperandSize::Size64, }, "E3FF9FF2", - "movk x3, #65535", + "movk x3, x3, #65535", )); insns.push(( - Inst::MovWide { - op: MoveWideOp::MovK, + Inst::MovK { rd: writable_xreg(8), + rn: xreg(8), imm: MoveWideConst::maybe_from_u64(0x0000_0000_ffff_0000).unwrap(), size: OperandSize::Size64, }, "E8FFBFF2", - "movk x8, #65535, LSL #16", + "movk x8, x8, #65535, LSL #16", )); insns.push(( - Inst::MovWide { - op: MoveWideOp::MovK, + Inst::MovK { rd: writable_xreg(8), + rn: xreg(8), imm: MoveWideConst::maybe_from_u64(0x0000_ffff_0000_0000).unwrap(), size: OperandSize::Size64, }, 
"E8FFDFF2", - "movk x8, #65535, LSL #32", + "movk x8, x8, #65535, LSL #32", )); insns.push(( - Inst::MovWide { - op: MoveWideOp::MovK, + Inst::MovK { rd: writable_xreg(8), + rn: xreg(8), imm: MoveWideConst::maybe_from_u64(0xffff_0000_0000_0000).unwrap(), size: OperandSize::Size64, }, "E8FFFFF2", - "movk x8, #65535, LSL #48", + "movk x8, x8, #65535, LSL #48", )); insns.push(( @@ -2267,22 +2255,24 @@ fn test_aarch64_binemit() { insns.push(( Inst::MovToVec { rd: writable_vreg(0), + ri: vreg(0), rn: xreg(0), idx: 7, size: VectorSize::Size8x8, }, "001C0F4E", - "mov v0.b[7], w0", + "mov v0.b[7], v0.b[7], w0", )); insns.push(( Inst::MovToVec { rd: writable_vreg(20), + ri: vreg(20), rn: xreg(21), idx: 0, size: VectorSize::Size64x2, }, "B41E084E", - "mov v20.d[0], x21", + "mov v20.d[0], v20.d[0], x21", )); insns.push(( Inst::MovFromVec { @@ -2649,25 +2639,27 @@ fn test_aarch64_binemit() { insns.push(( Inst::VecMovElement { rd: writable_vreg(0), + ri: vreg(0), rn: vreg(31), dest_idx: 7, src_idx: 7, size: VectorSize::Size16x8, }, "E0771E6E", - "mov v0.h[7], v31.h[7]", + "mov v0.h[7], v0.h[7], v31.h[7]", )); insns.push(( Inst::VecMovElement { rd: writable_vreg(31), + ri: vreg(31), rn: vreg(16), dest_idx: 1, src_idx: 0, size: VectorSize::Size32x2, }, "1F060C6E", - "mov v31.s[1], v16.s[0]", + "mov v31.s[1], v31.s[1], v16.s[0]", )); insns.push(( @@ -2726,11 +2718,10 @@ fn test_aarch64_binemit() { )); insns.push(( - Inst::VecRRNarrow { + Inst::VecRRNarrowLow { op: VecRRNarrowOp::Xtn, rd: writable_vreg(25), rn: vreg(17), - high_half: false, lane_size: ScalarSize::Size8, }, "392A210E", @@ -2738,23 +2729,22 @@ fn test_aarch64_binemit() { )); insns.push(( - Inst::VecRRNarrow { + Inst::VecRRNarrowHigh { op: VecRRNarrowOp::Xtn, rd: writable_vreg(3), + ri: vreg(3), rn: vreg(10), - high_half: true, lane_size: ScalarSize::Size16, }, "4329614E", - "xtn2 v3.8h, v10.4s", + "xtn2 v3.8h, v3.8h, v10.4s", )); insns.push(( - Inst::VecRRNarrow { + Inst::VecRRNarrowLow { op: VecRRNarrowOp::Xtn, rd: writable_vreg(22), rn: vreg(8), - high_half: false, lane_size: ScalarSize::Size32, }, "1629A10E", @@ -2762,35 +2752,34 @@ fn test_aarch64_binemit() { )); insns.push(( - Inst::VecRRNarrow { + Inst::VecRRNarrowHigh { op: VecRRNarrowOp::Sqxtn, rd: writable_vreg(7), + ri: vreg(7), rn: vreg(22), - high_half: true, lane_size: ScalarSize::Size8, }, "C74A214E", - "sqxtn2 v7.16b, v22.8h", + "sqxtn2 v7.16b, v7.16b, v22.8h", )); insns.push(( - Inst::VecRRNarrow { + Inst::VecRRNarrowHigh { op: VecRRNarrowOp::Sqxtn, rd: writable_vreg(31), + ri: vreg(31), rn: vreg(0), - high_half: true, lane_size: ScalarSize::Size16, }, "1F48614E", - "sqxtn2 v31.8h, v0.4s", + "sqxtn2 v31.8h, v31.8h, v0.4s", )); insns.push(( - Inst::VecRRNarrow { + Inst::VecRRNarrowLow { op: VecRRNarrowOp::Sqxtn, rd: writable_vreg(14), rn: vreg(20), - high_half: false, lane_size: ScalarSize::Size32, }, "8E4AA10E", @@ -2798,11 +2787,10 @@ fn test_aarch64_binemit() { )); insns.push(( - Inst::VecRRNarrow { + Inst::VecRRNarrowLow { op: VecRRNarrowOp::Sqxtun, rd: writable_vreg(16), rn: vreg(23), - high_half: false, lane_size: ScalarSize::Size8, }, "F02A212E", @@ -2810,23 +2798,22 @@ fn test_aarch64_binemit() { )); insns.push(( - Inst::VecRRNarrow { + Inst::VecRRNarrowHigh { op: VecRRNarrowOp::Sqxtun, rd: writable_vreg(28), + ri: vreg(28), rn: vreg(9), - high_half: true, lane_size: ScalarSize::Size16, }, "3C29616E", - "sqxtun2 v28.8h, v9.4s", + "sqxtun2 v28.8h, v28.8h, v9.4s", )); insns.push(( - Inst::VecRRNarrow { + Inst::VecRRNarrowLow { op: VecRRNarrowOp::Sqxtun, rd: 
writable_vreg(15), rn: vreg(15), - high_half: false, lane_size: ScalarSize::Size32, }, "EF29A12E", @@ -2834,23 +2821,22 @@ fn test_aarch64_binemit() { )); insns.push(( - Inst::VecRRNarrow { + Inst::VecRRNarrowHigh { op: VecRRNarrowOp::Uqxtn, rd: writable_vreg(21), + ri: vreg(21), rn: vreg(4), - high_half: true, lane_size: ScalarSize::Size8, }, "9548216E", - "uqxtn2 v21.16b, v4.8h", + "uqxtn2 v21.16b, v21.16b, v4.8h", )); insns.push(( - Inst::VecRRNarrow { + Inst::VecRRNarrowLow { op: VecRRNarrowOp::Uqxtn, rd: writable_vreg(31), rn: vreg(31), - high_half: false, lane_size: ScalarSize::Size16, }, "FF4B612E", @@ -2858,23 +2844,22 @@ fn test_aarch64_binemit() { )); insns.push(( - Inst::VecRRNarrow { + Inst::VecRRNarrowHigh { op: VecRRNarrowOp::Uqxtn, rd: writable_vreg(11), + ri: vreg(11), rn: vreg(12), - high_half: true, lane_size: ScalarSize::Size32, }, "8B49A16E", - "uqxtn2 v11.4s, v12.2d", + "uqxtn2 v11.4s, v11.4s, v12.2d", )); insns.push(( - Inst::VecRRNarrow { + Inst::VecRRNarrowLow { op: VecRRNarrowOp::Fcvtn, rd: writable_vreg(0), rn: vreg(0), - high_half: false, lane_size: ScalarSize::Size16, }, "0068210E", @@ -2882,11 +2867,10 @@ fn test_aarch64_binemit() { )); insns.push(( - Inst::VecRRNarrow { + Inst::VecRRNarrowLow { op: VecRRNarrowOp::Fcvtn, rd: writable_vreg(2), rn: vreg(7), - high_half: false, lane_size: ScalarSize::Size32, }, "E268610E", @@ -2894,15 +2878,15 @@ fn test_aarch64_binemit() { )); insns.push(( - Inst::VecRRNarrow { + Inst::VecRRNarrowHigh { op: VecRRNarrowOp::Fcvtn, rd: writable_vreg(31), + ri: vreg(31), rn: vreg(30), - high_half: true, lane_size: ScalarSize::Size32, }, "DF6B614E", - "fcvtn2 v31.4s, v30.2d", + "fcvtn2 v31.4s, v31.4s, v30.2d", )); insns.push(( @@ -3415,12 +3399,13 @@ fn test_aarch64_binemit() { Inst::VecRRRMod { alu_op: VecALUModOp::Bsl, rd: writable_vreg(8), + ri: vreg(8), rn: vreg(9), rm: vreg(1), size: VectorSize::Size8x16, }, "281D616E", - "bsl v8.16b, v9.16b, v1.16b", + "bsl v8.16b, v8.16b, v9.16b, v1.16b", )); insns.push(( @@ -4123,36 +4108,39 @@ fn test_aarch64_binemit() { Inst::VecRRRMod { alu_op: VecALUModOp::Fmla, rd: writable_vreg(2), + ri: vreg(2), rn: vreg(0), rm: vreg(5), size: VectorSize::Size32x2, }, "02CC250E", - "fmla v2.2s, v0.2s, v5.2s", + "fmla v2.2s, v2.2s, v0.2s, v5.2s", )); insns.push(( Inst::VecRRRMod { alu_op: VecALUModOp::Fmla, rd: writable_vreg(2), + ri: vreg(2), rn: vreg(0), rm: vreg(5), size: VectorSize::Size32x4, }, "02CC254E", - "fmla v2.4s, v0.4s, v5.4s", + "fmla v2.4s, v2.4s, v0.4s, v5.4s", )); insns.push(( Inst::VecRRRMod { alu_op: VecALUModOp::Fmla, rd: writable_vreg(2), + ri: vreg(2), rn: vreg(0), rm: vreg(5), size: VectorSize::Size64x2, }, "02CC654E", - "fmla v2.2d, v0.2d, v5.2d", + "fmla v2.2d, v2.2d, v0.2d, v5.2d", )); insns.push(( @@ -4276,15 +4264,16 @@ fn test_aarch64_binemit() { )); insns.push(( - Inst::VecRRRLong { - alu_op: VecRRRLongOp::Umlal8, + Inst::VecRRRLongMod { + alu_op: VecRRRLongModOp::Umlal8, rd: writable_vreg(4), + ri: vreg(4), rn: vreg(8), rm: vreg(16), high_half: false, }, "0481302E", - "umlal v4.8h, v8.8b, v16.8b", + "umlal v4.8h, v4.8h, v8.8b, v16.8b", )); insns.push(( @@ -4312,15 +4301,16 @@ fn test_aarch64_binemit() { )); insns.push(( - Inst::VecRRRLong { - alu_op: VecRRRLongOp::Umlal16, + Inst::VecRRRLongMod { + alu_op: VecRRRLongModOp::Umlal16, rd: writable_vreg(7), + ri: vreg(7), rn: vreg(14), rm: vreg(21), high_half: false, }, "C781752E", - "umlal v7.4s, v14.4h, v21.4h", + "umlal v7.4s, v7.4s, v14.4h, v21.4h", )); insns.push(( @@ -4348,15 +4338,16 @@ fn test_aarch64_binemit() { )); 
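The `umlal`/`umlal2` expectations around here now print the accumulator explicitly (e.g. `umlal v4.8h, v4.8h, ...`), matching the fact that multiply-add-long reads its destination. A host-side model of what one `umlal32`-style lane computation does (a sketch for illustration, not code from this patch):

```rust
// Host-side model of UMLAL .2s -> .2d lanes: each destination lane
// accumulates a widening unsigned product into its previous contents,
// which is why VecRRRLongMod carries the accumulator input `ri`.
fn umlal32(acc: [u64; 2], rn: [u32; 2], rm: [u32; 2]) -> [u64; 2] {
    [
        acc[0].wrapping_add(rn[0] as u64 * rm[0] as u64),
        acc[1].wrapping_add(rn[1] as u64 * rm[1] as u64),
    ]
}

fn main() {
    assert_eq!(umlal32([10, 20], [3, 4], [5, 6]), [25, 44]);
}
```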
insns.push(( - Inst::VecRRRLong { - alu_op: VecRRRLongOp::Umlal32, + Inst::VecRRRLongMod { + alu_op: VecRRRLongModOp::Umlal32, rd: writable_vreg(9), + ri: vreg(9), rn: vreg(20), rm: vreg(17), high_half: false, }, "8982B12E", - "umlal v9.2d, v20.2s, v17.2s", + "umlal v9.2d, v9.2d, v20.2s, v17.2s", )); insns.push(( @@ -4384,15 +4375,16 @@ fn test_aarch64_binemit() { )); insns.push(( - Inst::VecRRRLong { - alu_op: VecRRRLongOp::Umlal8, + Inst::VecRRRLongMod { + alu_op: VecRRRLongModOp::Umlal8, rd: writable_vreg(1), + ri: vreg(1), rn: vreg(5), rm: vreg(15), high_half: true, }, "A1802F6E", - "umlal2 v1.8h, v5.16b, v15.16b", + "umlal2 v1.8h, v1.8h, v5.16b, v15.16b", )); insns.push(( @@ -4420,15 +4412,16 @@ fn test_aarch64_binemit() { )); insns.push(( - Inst::VecRRRLong { - alu_op: VecRRRLongOp::Umlal16, + Inst::VecRRRLongMod { + alu_op: VecRRRLongModOp::Umlal16, rd: writable_vreg(11), + ri: vreg(11), rn: vreg(10), rm: vreg(12), high_half: true, }, "4B816C6E", - "umlal2 v11.4s, v10.8h, v12.8h", + "umlal2 v11.4s, v11.4s, v10.8h, v12.8h", )); insns.push(( @@ -4456,15 +4449,16 @@ fn test_aarch64_binemit() { )); insns.push(( - Inst::VecRRRLong { - alu_op: VecRRRLongOp::Umlal32, + Inst::VecRRRLongMod { + alu_op: VecRRRLongModOp::Umlal32, rd: writable_vreg(10), + ri: vreg(10), rn: vreg(29), rm: vreg(2), high_half: true, }, "AA83A26E", - "umlal2 v10.2d, v29.4s, v2.4s", + "umlal2 v10.2d, v10.2d, v29.4s, v2.4s", )); insns.push(( @@ -5418,21 +5412,20 @@ fn test_aarch64_binemit() { rd: writable_vreg(0), rn: vreg(31), rm: vreg(16), - is_extension: false, }, "E003104E", "tbl v0.16b, { v31.16b }, v16.16b", )); insns.push(( - Inst::VecTbl { + Inst::VecTblExt { rd: writable_vreg(4), + ri: vreg(4), rn: vreg(12), rm: vreg(23), - is_extension: true, }, "8411174E", - "tbx v4.16b, { v12.16b }, v23.16b", + "tbx v4.16b, v4.16b, { v12.16b }, v23.16b", )); insns.push(( @@ -5441,22 +5434,21 @@ fn test_aarch64_binemit() { rn: vreg(31), rn2: vreg(0), rm: vreg(26), - is_extension: false, }, "F0231A4E", "tbl v16.16b, { v31.16b, v0.16b }, v26.16b", )); insns.push(( - Inst::VecTbl2 { + Inst::VecTbl2Ext { rd: writable_vreg(3), + ri: vreg(3), rn: vreg(11), rn2: vreg(12), rm: vreg(19), - is_extension: true, }, "6331134E", - "tbx v3.16b, { v11.16b, v12.16b }, v19.16b", + "tbx v3.16b, v3.16b, { v11.16b, v12.16b }, v19.16b", )); insns.push(( @@ -6201,23 +6193,25 @@ fn test_aarch64_binemit() { )); insns.push(( - Inst::FpuRRI { - fpu_op: FPUOpRI::Sli32(FPULeftShiftImm::maybe_from_u8(31, 32).unwrap()), + Inst::FpuRRIMod { + fpu_op: FPUOpRIMod::Sli32(FPULeftShiftImm::maybe_from_u8(31, 32).unwrap()), rd: writable_vreg(4), + ri: vreg(4), rn: vreg(10), }, "44553F2F", - "sli v4.2s, v10.2s, #31", + "sli v4.2s, v4.2s, v10.2s, #31", )); insns.push(( - Inst::FpuRRI { - fpu_op: FPUOpRI::Sli64(FPULeftShiftImm::maybe_from_u8(63, 64).unwrap()), + Inst::FpuRRIMod { + fpu_op: FPUOpRIMod::Sli64(FPULeftShiftImm::maybe_from_u8(63, 64).unwrap()), rd: writable_vreg(4), + ri: vreg(4), rn: vreg(10), }, "44557F7F", - "sli d4, d10, #63", + "sli d4, d4, d10, #63", )); insns.push(( @@ -6505,24 +6499,18 @@ fn test_aarch64_binemit() { Inst::FpuLoadP64 { rt: writable_vreg(19), rt2: writable_vreg(11), - mem: PairAMode::PreIndexed( - writable_xreg(25), - SImm7Scaled::maybe_from_i64(-512, F64).unwrap(), - ), + mem: PairAMode::SPPreIndexed(SImm7Scaled::maybe_from_i64(-512, F64).unwrap()), flags: MemFlags::trusted(), }, - "332FE06D", - "ldp d19, d11, [x25, #-512]!", + "F32FE06D", + "ldp d19, d11, [sp, #-512]!", )); insns.push(( Inst::FpuLoadP64 { rt: writable_vreg(7), 
rt2: writable_vreg(20), - mem: PairAMode::PostIndexed( - writable_stack_reg(), - SImm7Scaled::maybe_from_i64(64, F64).unwrap(), - ), + mem: PairAMode::SPPostIndexed(SImm7Scaled::maybe_from_i64(64, F64).unwrap()), flags: MemFlags::trusted(), }, "E753C46C", @@ -6547,28 +6535,22 @@ fn test_aarch64_binemit() { Inst::FpuStoreP64 { rt: vreg(16), rt2: vreg(8), - mem: PairAMode::PreIndexed( - writable_xreg(15), - SImm7Scaled::maybe_from_i64(48, F64).unwrap(), - ), + mem: PairAMode::SPPreIndexed(SImm7Scaled::maybe_from_i64(48, F64).unwrap()), flags: MemFlags::trusted(), }, - "F021836D", - "stp d16, d8, [x15, #48]!", + "F023836D", + "stp d16, d8, [sp, #48]!", )); insns.push(( Inst::FpuStoreP64 { rt: vreg(5), rt2: vreg(6), - mem: PairAMode::PostIndexed( - writable_xreg(28), - SImm7Scaled::maybe_from_i64(-32, F64).unwrap(), - ), + mem: PairAMode::SPPostIndexed(SImm7Scaled::maybe_from_i64(-32, F64).unwrap()), flags: MemFlags::trusted(), }, - "851BBE6C", - "stp d5, d6, [x28], #-32", + "E51BBE6C", + "stp d5, d6, [sp], #-32", )); insns.push(( @@ -6586,28 +6568,22 @@ fn test_aarch64_binemit() { Inst::FpuLoadP128 { rt: writable_vreg(29), rt2: writable_vreg(9), - mem: PairAMode::PreIndexed( - writable_xreg(16), - SImm7Scaled::maybe_from_i64(-1024, I8X16).unwrap(), - ), + mem: PairAMode::SPPreIndexed(SImm7Scaled::maybe_from_i64(-1024, I8X16).unwrap()), flags: MemFlags::trusted(), }, - "1D26E0AD", - "ldp q29, q9, [x16, #-1024]!", + "FD27E0AD", + "ldp q29, q9, [sp, #-1024]!", )); insns.push(( Inst::FpuLoadP128 { rt: writable_vreg(10), rt2: writable_vreg(20), - mem: PairAMode::PostIndexed( - writable_xreg(26), - SImm7Scaled::maybe_from_i64(256, I8X16).unwrap(), - ), + mem: PairAMode::SPPostIndexed(SImm7Scaled::maybe_from_i64(256, I8X16).unwrap()), flags: MemFlags::trusted(), }, - "4A53C8AC", - "ldp q10, q20, [x26], #256", + "EA53C8AC", + "ldp q10, q20, [sp], #256", )); insns.push(( @@ -6628,10 +6604,7 @@ fn test_aarch64_binemit() { Inst::FpuStoreP128 { rt: vreg(27), rt2: vreg(13), - mem: PairAMode::PreIndexed( - writable_stack_reg(), - SImm7Scaled::maybe_from_i64(-192, I8X16).unwrap(), - ), + mem: PairAMode::SPPreIndexed(SImm7Scaled::maybe_from_i64(-192, I8X16).unwrap()), flags: MemFlags::trusted(), }, "FB37BAAD", @@ -6642,14 +6615,11 @@ fn test_aarch64_binemit() { Inst::FpuStoreP128 { rt: vreg(18), rt2: vreg(22), - mem: PairAMode::PostIndexed( - writable_xreg(13), - SImm7Scaled::maybe_from_i64(304, I8X16).unwrap(), - ), + mem: PairAMode::SPPostIndexed(SImm7Scaled::maybe_from_i64(304, I8X16).unwrap()), flags: MemFlags::trusted(), }, - "B2D989AC", - "stp q18, q22, [x13], #304", + "F2DB89AC", + "stp q18, q22, [sp], #304", )); insns.push(( @@ -6769,105 +6739,170 @@ fn test_aarch64_binemit() { Inst::AtomicRMWLoop { ty: I8, op: AtomicRMWLoopOp::Sub, + addr: xreg(25), + operand: xreg(26), + oldval: writable_xreg(27), + scratch1: writable_xreg(24), + scratch2: writable_xreg(28), }, "3BFF5F087C031A4B3CFF1808B8FFFFB5", - "1: ldaxrb w27, [x25]; sub w28, w27, w26; stlxrb w24, w28, [x25]; cbnz w24, 1b", + "atomic_rmw_loop_sub_8 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28", )); insns.push(( Inst::AtomicRMWLoop { ty: I16, op: AtomicRMWLoopOp::Eor, + addr: xreg(25), + operand: xreg(26), + oldval: writable_xreg(27), + scratch1: writable_xreg(24), + scratch2: writable_xreg(28), }, "3BFF5F487C031A4A3CFF1848B8FFFFB5", - "1: ldaxrh w27, [x25]; eor w28, w27, w26; stlxrh w24, w28, [x25]; cbnz w24, 1b", + "atomic_rmw_loop_eor_16 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28", )); insns.push(( 
Inst::AtomicRMWLoop { ty: I8, op: AtomicRMWLoopOp::Add, + addr: xreg(25), + operand: xreg(26), + oldval: writable_xreg(27), + scratch1: writable_xreg(24), + scratch2: writable_xreg(28), }, "3BFF5F087C031A0B3CFF1808B8FFFFB5", - "1: ldaxrb w27, [x25]; add w28, w27, w26; stlxrb w24, w28, [x25]; cbnz w24, 1b", + "atomic_rmw_loop_add_8 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28", )); insns.push(( Inst::AtomicRMWLoop { ty: I32, op: AtomicRMWLoopOp::Orr, + addr: xreg(25), + operand: xreg(26), + oldval: writable_xreg(27), + scratch1: writable_xreg(24), + scratch2: writable_xreg(28), }, "3BFF5F887C031A2A3CFF1888B8FFFFB5", - "1: ldaxr w27, [x25]; orr w28, w27, w26; stlxr w24, w28, [x25]; cbnz w24, 1b", + "atomic_rmw_loop_orr_32 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28", )); insns.push(( Inst::AtomicRMWLoop { ty: I64, op: AtomicRMWLoopOp::And, + addr: xreg(25), + operand: xreg(26), + oldval: writable_xreg(27), + scratch1: writable_xreg(24), + scratch2: writable_xreg(28), }, "3BFF5FC87C031A8A3CFF18C8B8FFFFB5", - "1: ldaxr x27, [x25]; and x28, x27, x26; stlxr w24, x28, [x25]; cbnz w24, 1b", + "atomic_rmw_loop_and_64 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28", )); insns.push(( Inst::AtomicRMWLoop { ty: I8, op: AtomicRMWLoopOp::Xchg, + addr: xreg(25), + operand: xreg(26), + oldval: writable_xreg(27), + scratch1: writable_xreg(24), + scratch2: writable_xreg(28), }, "3BFF5F083AFF1808D8FFFFB5", - "1: ldaxrb w27, [x25]; stlxrb w24, w26, [x25]; cbnz w24, 1b", + "atomic_rmw_loop_xchg_8 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28", )); insns.push(( Inst::AtomicRMWLoop { ty: I16, op: AtomicRMWLoopOp::Nand, + addr: xreg(25), + operand: xreg(26), + oldval: writable_xreg(27), + scratch1: writable_xreg(24), + scratch2: writable_xreg(28), }, "3BFF5F487C031A0AFC033C2A3CFF184898FFFFB5", - "1: ldaxrh w27, [x25]; and w28, w27, w26; mvn w28, w28; stlxrh w24, w28, [x25]; cbnz w24, 1b", + "atomic_rmw_loop_nand_16 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28", )); insns.push(( Inst::AtomicRMWLoop { ty: I16, op: AtomicRMWLoopOp::Smin, + addr: xreg(25), + operand: xreg(26), + oldval: writable_xreg(27), + scratch1: writable_xreg(24), + scratch2: writable_xreg(28), }, "3BFF5F487B3F00137FA33A6B7CB39A9A3CFF184878FFFFB5", - "1: ldaxrh w27, [x25]; sxth w27, w27; cmp w27, w26, sxth; csel w28, w27, w26, lt; stlxrh w24, w28, [x25]; cbnz w24, 1b", + "atomic_rmw_loop_smin_16 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28", )); insns.push(( Inst::AtomicRMWLoop { ty: I32, op: AtomicRMWLoopOp::Smin, + addr: xreg(25), + operand: xreg(26), + oldval: writable_xreg(27), + scratch1: writable_xreg(24), + scratch2: writable_xreg(28), }, "3BFF5F887F031A6B7CB39A9A3CFF188898FFFFB5", - "1: ldaxr w27, [x25]; cmp w27, w26; csel w28, w27, w26, lt; stlxr w24, w28, [x25]; cbnz w24, 1b", + "atomic_rmw_loop_smin_32 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28", )); insns.push(( Inst::AtomicRMWLoop { ty: I64, op: AtomicRMWLoopOp::Smax, + addr: xreg(25), + operand: xreg(26), + oldval: writable_xreg(27), + scratch1: writable_xreg(24), + scratch2: writable_xreg(28), }, "3BFF5FC87F031AEB7CC39A9A3CFF18C898FFFFB5", - "1: ldaxr x27, [x25]; cmp x27, x26; csel x28, x27, x26, gt; stlxr w24, x28, [x25]; cbnz w24, 1b", + "atomic_rmw_loop_smax_64 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28", )); insns.push(( Inst::AtomicRMWLoop { ty: I8, op: AtomicRMWLoopOp::Smax, + addr: xreg(25), + operand: xreg(26), + oldval: writable_xreg(27), + scratch1: writable_xreg(24), 
+ scratch2: writable_xreg(28), }, "3BFF5F087B1F00137F833A6B7CC39A9A3CFF180878FFFFB5", - "1: ldaxrb w27, [x25]; sxtb w27, w27; cmp w27, w26, sxtb; csel w28, w27, w26, gt; stlxrb w24, w28, [x25]; cbnz w24, 1b", + "atomic_rmw_loop_smax_8 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28", )); insns.push(( Inst::AtomicRMWLoop { ty: I8, op: AtomicRMWLoopOp::Umin, + addr: xreg(25), + operand: xreg(26), + oldval: writable_xreg(27), + scratch1: writable_xreg(24), + scratch2: writable_xreg(28), }, "3BFF5F087F031A6B7C339A9A3CFF180898FFFFB5", - "1: ldaxrb w27, [x25]; cmp w27, w26; csel w28, w27, w26, lo; stlxrb w24, w28, [x25]; cbnz w24, 1b", + "atomic_rmw_loop_umin_8 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28", )); insns.push(( Inst::AtomicRMWLoop { ty: I16, op: AtomicRMWLoopOp::Umax, + addr: xreg(25), + operand: xreg(26), + oldval: writable_xreg(27), + scratch1: writable_xreg(24), + scratch2: writable_xreg(28), }, "3BFF5F487F031A6B7C839A9A3CFF184898FFFFB5", - "1: ldaxrh w27, [x25]; cmp w27, w26; csel w28, w27, w26, hi; stlxrh w24, w28, [x25]; cbnz w24, 1b", + "atomic_rmw_loop_umax_16 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28", )); insns.push(( @@ -7269,74 +7304,98 @@ fn test_aarch64_binemit() { insns.push(( Inst::AtomicCAS { - rs: writable_xreg(28), + rd: writable_xreg(28), + rs: xreg(28), rt: xreg(20), rn: xreg(10), ty: I8, }, "54FDFC08", - "casalb w28, w20, [x10]", + "casalb w28, w28, w20, [x10]", )); insns.push(( Inst::AtomicCAS { - rs: writable_xreg(2), + rd: writable_xreg(2), + rs: xreg(2), rt: xreg(19), rn: xreg(23), ty: I16, }, "F3FEE248", - "casalh w2, w19, [x23]", + "casalh w2, w2, w19, [x23]", )); insns.push(( Inst::AtomicCAS { - rs: writable_xreg(0), + rd: writable_xreg(0), + rs: xreg(0), rt: zero_reg(), rn: stack_reg(), ty: I32, }, "FFFFE088", - "casal w0, wzr, [sp]", + "casal w0, w0, wzr, [sp]", )); insns.push(( Inst::AtomicCAS { - rs: writable_xreg(7), + rd: writable_xreg(7), + rs: xreg(7), rt: xreg(15), rn: xreg(27), ty: I64, }, "6FFFE7C8", - "casal x7, x15, [x27]", + "casal x7, x7, x15, [x27]", )); insns.push(( Inst::AtomicCASLoop { ty: I8, + addr: xreg(25), + expected: xreg(26), + replacement: xreg(28), + oldval: writable_xreg(27), + scratch: writable_xreg(24), }, "3BFF5F087F033AEB610000543CFF180898FFFFB5", - "atomically { compare-and-swap(8_bits_at_[x25], x26 -> x28), x27 = old_value_at_[x25]; x24 = trash }" + "atomic_cas_loop_8 addr=x25, expect=x26, replacement=x28, oldval=x27, scratch=x24", )); insns.push(( Inst::AtomicCASLoop { ty: I16, + addr: xreg(25), + expected: xreg(26), + replacement: xreg(28), + oldval: writable_xreg(27), + scratch: writable_xreg(24), }, "3BFF5F487F233AEB610000543CFF184898FFFFB5", - "atomically { compare-and-swap(16_bits_at_[x25], x26 -> x28), x27 = old_value_at_[x25]; x24 = trash }" + "atomic_cas_loop_16 addr=x25, expect=x26, replacement=x28, oldval=x27, scratch=x24", )); insns.push(( Inst::AtomicCASLoop { ty: I32, + addr: xreg(25), + expected: xreg(26), + replacement: xreg(28), + oldval: writable_xreg(27), + scratch: writable_xreg(24), }, "3BFF5F887F031AEB610000543CFF188898FFFFB5", - "atomically { compare-and-swap(32_bits_at_[x25], x26 -> x28), x27 = old_value_at_[x25]; x24 = trash }" + "atomic_cas_loop_32 addr=x25, expect=x26, replacement=x28, oldval=x27, scratch=x24", )); insns.push(( Inst::AtomicCASLoop { ty: I64, + addr: xreg(25), + expected: xreg(26), + replacement: xreg(28), + oldval: writable_xreg(27), + scratch: writable_xreg(24), }, "3BFF5FC87F031AEB610000543CFF18C898FFFFB5", - "atomically { 
compare-and-swap(64_bits_at_[x25], x26 -> x28), x27 = old_value_at_[x25]; x24 = trash }" + "atomic_cas_loop_64 addr=x25, expect=x26, replacement=x28, oldval=x27, scratch=x24", )); insns.push(( diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs index 3d55806cf5..8add4a18ec 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs @@ -39,7 +39,7 @@ pub use crate::isa::aarch64::lower::isle::generated_code::{ ALUOp, ALUOp3, APIKey, AtomicRMWLoopOp, AtomicRMWOp, BitOp, FPUOp1, FPUOp2, FPUOp3, FpuRoundMode, FpuToIntOp, IntToFpuOp, MInst as Inst, MoveWideOp, VecALUModOp, VecALUOp, VecExtendOp, VecLanesOp, VecMisc2, VecPairOp, VecRRLongOp, VecRRNarrowOp, VecRRPairLongOp, - VecRRRLongOp, VecShiftImmModOp, VecShiftImmOp, + VecRRRLongModOp, VecRRRLongOp, VecShiftImmModOp, VecShiftImmOp, }; /// A floating-point unit (FPU) operation with two args, a register and an immediate. @@ -49,6 +49,13 @@ pub enum FPUOpRI { UShr32(FPURightShiftImm), /// Unsigned right shift. Rd = Rn << #imm UShr64(FPURightShiftImm), +} + +/// A floating-point unit (FPU) operation with two args, a register and +/// an immediate that modifies its dest (so takes that input value as a +/// separate virtual register). +#[derive(Copy, Clone, Debug)] +pub enum FPUOpRIMod { /// Shift left and insert. Rd |= Rn << #imm Sli32(FPULeftShiftImm), /// Shift left and insert. Rd |= Rn << #imm @@ -197,9 +204,9 @@ impl Inst { } } else { let imm = MoveWideConst::maybe_with_shift(imm16 as u16, i * 16).unwrap(); - insts.push(Inst::MovWide { - op: MoveWideOp::MovK, + insts.push(Inst::MovK { rd, + rn: rd.to_reg(), // Redef the same virtual register. imm, size, }); @@ -550,9 +557,7 @@ fn memarg_operands<F: Fn(VReg) -> VReg>(memarg: &AMode, collector: &mut OperandC collector.reg_use(r2); } &AMode::Label(..) => {} - &AMode::PreIndexed(reg, ..) | &AMode::PostIndexed(reg, ..) => { - collector.reg_mod(reg); - } + &AMode::SPPreIndexed(..) | &AMode::SPPostIndexed(..) => {} &AMode::FPOffset(..) => {} &AMode::SPOffset(..) | &AMode::NominalSPOffset(..) => {} &AMode::RegOffset(r, ..) => { @@ -570,9 +575,7 @@ fn pairmemarg_operands<F: Fn(VReg) -> VReg>( &PairAMode::SignedOffset(reg, ..) => { collector.reg_use(reg); } - &PairAMode::PreIndexed(reg, ..) | &PairAMode::PostIndexed(reg, ..) => { - collector.reg_mod(reg); - } + &PairAMode::SPPreIndexed(..) | &PairAMode::SPPostIndexed(..) => {} } } @@ -657,10 +660,13 @@ fn aarch64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut Operan debug_assert!(rd.to_reg().is_virtual()); collector.reg_def(rd); } - &Inst::MovWide { op, rd, .. } => match op { - MoveWideOp::MovK => collector.reg_mod(rd), - _ => collector.reg_def(rd), - }, + &Inst::MovK { rd, rn, .. } => { + collector.reg_use(rn); + collector.reg_reuse_def(rd, 0); // `rn` == `rd`. + } + &Inst::MovWide { rd, .. } => { + collector.reg_def(rd); + } &Inst::CSel { rd, rn, rm, .. } => { collector.reg_def(rd); collector.reg_use(rn); collector.reg_use(rm); } @@ -681,13 +687,21 @@ fn aarch64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut Operan &Inst::CCmpImm { rn, .. } => { collector.reg_use(rn); } - &Inst::AtomicRMWLoop { op, .. } => { - collector.reg_use(xreg(25)); - collector.reg_use(xreg(26)); - collector.reg_def(writable_xreg(24)); - collector.reg_def(writable_xreg(27)); + &Inst::AtomicRMWLoop { + op, + addr, + operand, + oldval, + scratch1, + scratch2, + ..
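// Note: the `MovK` arm above is the pattern this patch repeats for every
// instruction that both reads and writes a register: the old single
// `reg_mod` operand becomes a separate input vreg plus an output vreg,
// tied together with a reuse-def constraint. A minimal sketch, assuming
// the `OperandCollector` API used in this file (`SomeRmwInst` is a
// hypothetical placeholder, not a real instruction):
//
//     &Inst::SomeRmwInst { rd, ri, rn, .. } => {
//         collector.reg_use(ri);          // operand 0: incoming value of the dest
//         collector.reg_use(rn);          // operand 1: ordinary source
//         collector.reg_reuse_def(rd, 0); // `rd` shares operand 0's register
//     }
//
// The index passed to `reg_reuse_def` is the position of the tied operand
// in the collection order, which is why the arms in this patch variously
// use 0, 1, 2, 3, or 4.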
+ } => { + collector.reg_fixed_use(addr, xreg(25)); + collector.reg_fixed_use(operand, xreg(26)); + collector.reg_fixed_def(oldval, xreg(27)); + collector.reg_fixed_def(scratch1, xreg(24)); if op != AtomicRMWLoopOp::Xchg { - collector.reg_def(writable_xreg(28)); + collector.reg_fixed_def(scratch2, xreg(28)); } } &Inst::AtomicRMW { rs, rt, rn, .. } => { @@ -695,17 +709,25 @@ fn aarch64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut Operan collector.reg_def(rt); collector.reg_use(rn); } - &Inst::AtomicCAS { rs, rt, rn, .. } => { - collector.reg_mod(rs); + &Inst::AtomicCAS { rd, rs, rt, rn, .. } => { + collector.reg_reuse_def(rd, 1); // reuse `rs`. + collector.reg_use(rs); collector.reg_use(rt); collector.reg_use(rn); } - &Inst::AtomicCASLoop { .. } => { - collector.reg_use(xreg(25)); - collector.reg_use(xreg(26)); - collector.reg_use(xreg(28)); - collector.reg_def(writable_xreg(24)); - collector.reg_def(writable_xreg(27)); + &Inst::AtomicCASLoop { + addr, + expected, + replacement, + oldval, + scratch, + .. + } => { + collector.reg_fixed_use(addr, xreg(25)); + collector.reg_fixed_use(expected, xreg(26)); + collector.reg_fixed_use(replacement, xreg(28)); + collector.reg_fixed_def(oldval, xreg(27)); + collector.reg_fixed_def(scratch, xreg(24)); } &Inst::LoadAcquire { rt, rn, .. } => { collector.reg_use(rn); @@ -741,11 +763,13 @@ fn aarch64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut Operan collector.reg_use(rn); collector.reg_use(rm); } - &Inst::FpuRRI { fpu_op, rd, rn, .. } => { - match fpu_op { - FPUOpRI::UShr32(..) | FPUOpRI::UShr64(..) => collector.reg_def(rd), - FPUOpRI::Sli32(..) | FPUOpRI::Sli64(..) => collector.reg_mod(rd), - } + &Inst::FpuRRI { rd, rn, .. } => { + collector.reg_def(rd); + collector.reg_use(rn); + } + &Inst::FpuRRIMod { rd, ri, rn, .. } => { + collector.reg_reuse_def(rd, 1); // reuse `ri`. + collector.reg_use(ri); collector.reg_use(rn); } &Inst::FpuRRRR { rd, rn, rm, ra, .. } => { @@ -767,8 +791,9 @@ fn aarch64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut Operan collector.reg_def(rd); collector.reg_use(rn); } - &Inst::VecShiftImmMod { rd, rn, .. } => { - collector.reg_mod(rd); + &Inst::VecShiftImmMod { rd, ri, rn, .. } => { + collector.reg_reuse_def(rd, 1); // `rd` == `ri`. + collector.reg_use(ri); collector.reg_use(rn); } &Inst::VecExtract { rd, rn, rm, .. } => { @@ -776,37 +801,42 @@ fn aarch64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut Operan collector.reg_use(rn); collector.reg_use(rm); } - &Inst::VecTbl { - rd, - rn, - rm, - is_extension, - } => { + &Inst::VecTbl { rd, rn, rm } => { collector.reg_use(rn); collector.reg_use(rm); - - if is_extension { - collector.reg_mod(rd); - } else { - collector.reg_def(rd); - } + collector.reg_def(rd); } - &Inst::VecTbl2 { + &Inst::VecTblExt { rd, ri, rn, rm } => { + collector.reg_use(rn); + collector.reg_use(rm); + collector.reg_reuse_def(rd, 3); // `rd` == `ri`. + collector.reg_use(ri); + } + + &Inst::VecTbl2 { rd, rn, rn2, rm } => { + // Constrain to v30 / v31 so that we satisfy the "adjacent + // registers" constraint without use of pinned vregs in + // lowering. + collector.reg_fixed_use(rn, vreg(30)); + collector.reg_fixed_use(rn2, vreg(31)); + collector.reg_use(rm); + collector.reg_def(rd); + } + &Inst::VecTbl2Ext { rd, + ri, rn, rn2, rm, - is_extension, } => { - collector.reg_use(rn); - collector.reg_use(rn2); + // Constrain to v30 / v31 so that we satisfy the "adjacent + // registers" constraint without use of pinned vregs in + // lowering.
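// Note: `reg_fixed_use(vreg, preg)` / `reg_fixed_def(vreg, preg)` pin an
// ordinary virtual register to one specific physical register at this one
// instruction; the allocator inserts whatever moves are needed to satisfy
// the constraint. Lowering therefore no longer copies values into pinned
// registers itself, and the move coalescer can elide the copies entirely --
// compare the atomic-rmw filetest diffs further down, where
// `mov x4, x1; mov x26, x4` collapses to a single `mov x26, x1`.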
+ collector.reg_fixed_use(rn, vreg(30)); + collector.reg_fixed_use(rn2, vreg(31)); collector.reg_use(rm); - - if is_extension { - collector.reg_mod(rd); - } else { - collector.reg_def(rd); - } + collector.reg_reuse_def(rd, 4); // `rd` == `ri`. + collector.reg_use(ri); } &Inst::VecLoadReplicate { rd, rn, .. } => { collector.reg_def(rd); @@ -900,8 +930,9 @@ fn aarch64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut Operan &Inst::FpuMoveFPImm { rd, .. } => { collector.reg_def(rd); } - &Inst::MovToVec { rd, rn, .. } => { - collector.reg_mod(rd); + &Inst::MovToVec { rd, ri, rn, .. } => { + collector.reg_reuse_def(rd, 1); // `rd` == `ri`. + collector.reg_use(ri); collector.reg_use(rn); } &Inst::MovFromVec { rd, rn, .. } | &Inst::MovFromVecSigned { rd, rn, .. } => { @@ -926,38 +957,36 @@ fn aarch64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut Operan collector.reg_def(rd); collector.reg_use(rn); } - &Inst::VecMovElement { rd, rn, .. } => { - collector.reg_mod(rd); + &Inst::VecMovElement { rd, ri, rn, .. } => { + collector.reg_reuse_def(rd, 1); // `rd` == `ri`. + collector.reg_use(ri); collector.reg_use(rn); } &Inst::VecRRLong { rd, rn, .. } => { collector.reg_def(rd); collector.reg_use(rn); } - &Inst::VecRRNarrow { - rd, rn, high_half, .. - } => { collector.reg_use(rn); - - if high_half { - collector.reg_mod(rd); - } else { - collector.reg_def(rd); - } + &Inst::VecRRNarrowLow { rd, rn, .. } => { + collector.reg_use(rn); + collector.reg_def(rd); + } + &Inst::VecRRNarrowHigh { rd, ri, rn, .. } => { + collector.reg_use(rn); + collector.reg_reuse_def(rd, 2); // `rd` == `ri`. + collector.reg_use(ri); } &Inst::VecRRPair { rd, rn, .. } => { collector.reg_def(rd); collector.reg_use(rn); } - &Inst::VecRRRLong { - alu_op, rd, rn, rm, .. - } => { - match alu_op { - VecRRRLongOp::Umlal8 | VecRRRLongOp::Umlal16 | VecRRRLongOp::Umlal32 => { - collector.reg_mod(rd) - } - _ => collector.reg_def(rd), - }; + &Inst::VecRRRLong { rd, rn, rm, .. } => { + collector.reg_def(rd); + collector.reg_use(rn); + collector.reg_use(rm); + } + &Inst::VecRRRLongMod { rd, ri, rn, rm, .. } => { + collector.reg_reuse_def(rd, 1); // `rd` == `ri`. + collector.reg_use(ri); collector.reg_use(rn); collector.reg_use(rm); } @@ -970,8 +999,9 @@ fn aarch64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut Operan collector.reg_use(rn); collector.reg_use(rm); } - &Inst::VecRRRMod { rd, rn, rm, .. } => { - collector.reg_mod(rd); + &Inst::VecRRRMod { rd, ri, rn, rm, .. } => { + collector.reg_reuse_def(rd, 1); // `rd` == `ri`. + collector.reg_use(ri); collector.reg_use(rn); collector.reg_use(rm); } @@ -1508,12 +1538,22 @@ impl Inst { let op_str = match op { MoveWideOp::MovZ => "movz", MoveWideOp::MovN => "movn", - MoveWideOp::MovK => "movk", }; let rd = pretty_print_ireg(rd.to_reg(), size, allocs); let imm = imm.pretty_print(0, allocs); format!("{} {}, {}", op_str, rd, imm) } + &Inst::MovK { + rd, + rn, + ref imm, + size, + } => { + let rn = pretty_print_ireg(rn, size, allocs); + let rd = pretty_print_ireg(rd.to_reg(), size, allocs); + let imm = imm.pretty_print(0, allocs); + format!("movk {}, {}, {}", rd, rn, imm) + } &Inst::CSel { rd, rn, rm, cond } => { let rd = pretty_print_ireg(rd.to_reg(), OperandSize::Size64, allocs); let rn = pretty_print_ireg(rn, OperandSize::Size64, allocs); @@ -1589,75 +1629,45 @@ impl Inst { }; format!("{}{} {}, {}, [{}]", op, ty_suffix, rs, rt, rn) } - &Inst::AtomicRMWLoop { ty, op, ..
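// Note: because the tied input is now a real operand, the pretty-printer
// shows it explicitly -- `movk rd, rn, imm` above, and likewise
// `sli d0, d0, d6, #63` or `sqxtn2 v0.16b, v0.16b, v6.8h` in the filetests
// below. Only the VCode text gains the extra register: after regalloc the
// tied input and the destination occupy the same physical register, and
// the encoded instruction is unchanged.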
} => { - let ty_suffix = match ty { - I8 => "b", - I16 => "h", - _ => "", - }; - let size = OperandSize::from_ty(ty); - let r_addr = pretty_print_ireg(xreg(25), OperandSize::Size64, allocs); - let r_arg2 = pretty_print_ireg(xreg(26), size, allocs); - let r_status = pretty_print_ireg(xreg(24), OperandSize::Size32, allocs); - let r_tmp = pretty_print_ireg(xreg(27), size, allocs); - let mut r_dst = pretty_print_ireg(xreg(28), size, allocs); - - let mut loop_str: String = "1: ".to_string(); - loop_str.push_str(&format!("ldaxr{} {}, [{}]; ", ty_suffix, r_tmp, r_addr)); - - let op_str = match op { + &Inst::AtomicRMWLoop { + ty, + op, + addr, + operand, + oldval, + scratch1, + scratch2, + } => { + let op = match op { AtomicRMWLoopOp::Add => "add", AtomicRMWLoopOp::Sub => "sub", AtomicRMWLoopOp::Eor => "eor", AtomicRMWLoopOp::Orr => "orr", AtomicRMWLoopOp::And => "and", - _ => "", + AtomicRMWLoopOp::Nand => "nand", + AtomicRMWLoopOp::Smin => "smin", + AtomicRMWLoopOp::Smax => "smax", + AtomicRMWLoopOp::Umin => "umin", + AtomicRMWLoopOp::Umax => "umax", + AtomicRMWLoopOp::Xchg => "xchg", }; - - if op_str.is_empty() { - match op { - AtomicRMWLoopOp::Xchg => r_dst = r_arg2, - AtomicRMWLoopOp::Nand => { - loop_str.push_str(&format!("and {}, {}, {}; ", r_dst, r_tmp, r_arg2)); - loop_str.push_str(&format!("mvn {}, {}; ", r_dst, r_dst)); - } - _ => { - if (op == AtomicRMWLoopOp::Smin || op == AtomicRMWLoopOp::Smax) - && (ty == I8 || ty == I16) - { - loop_str - .push_str(&format!("sxt{} {}, {}; ", ty_suffix, r_tmp, r_tmp)); - loop_str.push_str(&format!( - "cmp {}, {}, sxt{}; ", - r_tmp, r_arg2, ty_suffix - )); - } else { - loop_str.push_str(&format!("cmp {}, {}; ", r_tmp, r_arg2)); - } - let cond = match op { - AtomicRMWLoopOp::Smin => "lt", - AtomicRMWLoopOp::Smax => "gt", - AtomicRMWLoopOp::Umin => "lo", - AtomicRMWLoopOp::Umax => "hi", - _ => unreachable!(), - }; - loop_str.push_str(&format!( - "csel {}, {}, {}, {}; ", - r_dst, r_tmp, r_arg2, cond - )); - } - }; - } else { - loop_str.push_str(&format!("{} {}, {}, {}; ", op_str, r_dst, r_tmp, r_arg2)); - } - loop_str.push_str(&format!( - "stlxr{} {}, {}, [{}]; ", - ty_suffix, r_status, r_dst, r_addr - )); - loop_str.push_str(&format!("cbnz {}, 1b", r_status)); - loop_str + let addr = pretty_print_ireg(addr, OperandSize::Size64, allocs); + let operand = pretty_print_ireg(operand, OperandSize::Size64, allocs); + let oldval = pretty_print_ireg(oldval.to_reg(), OperandSize::Size64, allocs); + let scratch1 = pretty_print_ireg(scratch1.to_reg(), OperandSize::Size64, allocs); + let scratch2 = pretty_print_ireg(scratch2.to_reg(), OperandSize::Size64, allocs); + format!( + "atomic_rmw_loop_{}_{} addr={} operand={} oldval={} scratch1={} scratch2={}", + op, + ty.bits(), + addr, + operand, + oldval, + scratch1, + scratch2, + ) } - &Inst::AtomicCAS { rs, rt, rn, ty } => { + &Inst::AtomicCAS { rd, rs, rt, rn, ty } => { let op = match ty { I8 => "casalb", I16 => "casalh", @@ -1665,16 +1675,35 @@ impl Inst { _ => panic!("Unsupported type: {}", ty), }; let size = OperandSize::from_ty(ty); - let rs = pretty_print_ireg(rs.to_reg(), size, allocs); + let rd = pretty_print_ireg(rd.to_reg(), size, allocs); + let rs = pretty_print_ireg(rs, size, allocs); let rt = pretty_print_ireg(rt, size, allocs); let rn = pretty_print_ireg(rn, OperandSize::Size64, allocs); - format!("{} {}, {}, [{}]", op, rs, rt, rn) + format!("{} {}, {}, {}, [{}]", op, rd, rs, rt, rn) } - &Inst::AtomicCASLoop { ty } => { + &Inst::AtomicCASLoop { + ty, + addr, + expected, + replacement, + oldval, + scratch, 
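// Note: only the textual form of the pseudo-instruction changes here. The
// removed branch spelled out the ldaxr/op/stlxr/cbnz loop as if it were a
// disassembly; the new code prints a single `atomic_rmw_loop_<op>_<bits>`
// line with named operands. The unchanged hex encodings in the binemit
// tests above indicate that the emitted machine code itself is untouched.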
+ } => { + let addr = pretty_print_ireg(addr, OperandSize::Size64, allocs); + let expected = pretty_print_ireg(expected, OperandSize::Size64, allocs); + let replacement = pretty_print_ireg(replacement, OperandSize::Size64, allocs); + let oldval = pretty_print_ireg(oldval.to_reg(), OperandSize::Size64, allocs); + let scratch = pretty_print_ireg(scratch.to_reg(), OperandSize::Size64, allocs); format!( - "atomically {{ compare-and-swap({}_bits_at_[x25], x26 -> x28), x27 = old_value_at_[x25]; x24 = trash }}", - ty.bits()) + "atomic_cas_loop_{} addr={}, expect={}, replacement={}, oldval={}, scratch={}", + ty.bits(), + addr, + expected, + replacement, + oldval, + scratch, + ) } &Inst::LoadAcquire { access_ty, rt, rn, .. @@ -1777,8 +1806,6 @@ impl Inst { let (op, imm, vector) = match fpu_op { FPUOpRI::UShr32(imm) => ("ushr", imm.pretty_print(0, allocs), true), FPUOpRI::UShr64(imm) => ("ushr", imm.pretty_print(0, allocs), false), - FPUOpRI::Sli32(imm) => ("sli", imm.pretty_print(0, allocs), true), - FPUOpRI::Sli64(imm) => ("sli", imm.pretty_print(0, allocs), false), }; let (rd, rn) = if vector { @@ -1794,6 +1821,27 @@ impl Inst { }; format!("{} {}, {}, {}", op, rd, rn, imm) } + &Inst::FpuRRIMod { fpu_op, rd, ri, rn } => { + let (op, imm, vector) = match fpu_op { + FPUOpRIMod::Sli32(imm) => ("sli", imm.pretty_print(0, allocs), true), + FPUOpRIMod::Sli64(imm) => ("sli", imm.pretty_print(0, allocs), false), + }; + + let (rd, ri, rn) = if vector { + ( + pretty_print_vreg_vector(rd.to_reg(), VectorSize::Size32x2, allocs), + pretty_print_vreg_vector(ri, VectorSize::Size32x2, allocs), + pretty_print_vreg_vector(rn, VectorSize::Size32x2, allocs), + ) + } else { + ( + pretty_print_vreg_scalar(rd.to_reg(), ScalarSize::Size64, allocs), + pretty_print_vreg_scalar(ri, ScalarSize::Size64, allocs), + pretty_print_vreg_scalar(rn, ScalarSize::Size64, allocs), + ) + }; + format!("{} {}, {}, {}, {}", op, rd, ri, rn, imm) + } &Inst::FpuRRRR { fpu_op, size, @@ -1983,11 +2031,18 @@ impl Inst { format!("fmov {}, {}", rd, imm) } - &Inst::MovToVec { rd, rn, idx, size } => { + &Inst::MovToVec { + rd, + ri, + rn, + idx, + size, + } => { let rd = pretty_print_vreg_element(rd.to_reg(), idx as usize, size.lane_size(), allocs); + let ri = pretty_print_vreg_element(ri, idx as usize, size.lane_size(), allocs); let rn = pretty_print_ireg(rn, size.operand_size(), allocs); - format!("mov {}, {}", rd, rn) + format!("mov {}, {}, {}", rd, ri, rn) } &Inst::MovFromVec { rd, rn, idx, size } => { let op = match size { @@ -2062,6 +2117,7 @@ impl Inst { } &Inst::VecMovElement { rd, + ri, rn, dest_idx, src_idx, @@ -2073,8 +2129,9 @@ impl Inst { size.lane_size(), allocs, ); + let ri = pretty_print_vreg_element(ri, dest_idx as usize, size.lane_size(), allocs); let rn = pretty_print_vreg_element(rn, src_idx as usize, size.lane_size(), allocs); - format!("mov {}, {}", rd, rn) + format!("mov {}, {}, {}", rd, ri, rn) } &Inst::VecRRLong { op, @@ -2119,16 +2176,28 @@ impl Inst { format!("{} {}, {}{}", op, rd, rn, suffix) } - &Inst::VecRRNarrow { + &Inst::VecRRNarrowLow { op, rd, rn, - high_half, lane_size, + .. + } + | &Inst::VecRRNarrowHigh { + op, + rd, + rn, + lane_size, + .. } => { let vec64 = VectorSize::from_lane_size(lane_size, false); let vec128 = VectorSize::from_lane_size(lane_size, true); let rn_size = VectorSize::from_lane_size(lane_size.widen(), true); + let high_half = match self { + &Inst::VecRRNarrowLow { .. } => false, + &Inst::VecRRNarrowHigh { .. 
} => true, + _ => unreachable!(), + }; let (op, rd_size) = match (op, high_half) { (VecRRNarrowOp::Xtn, false) => ("xtn", vec64), (VecRRNarrowOp::Xtn, true) => ("xtn2", vec128), @@ -2143,8 +2212,15 @@ impl Inst { }; let rn = pretty_print_vreg_vector(rn, rn_size, allocs); let rd = pretty_print_vreg_vector(rd.to_reg(), rd_size, allocs); + let ri = match self { + &Inst::VecRRNarrowLow { .. } => "".to_string(), + &Inst::VecRRNarrowHigh { ri, .. } => { + format!("{}, ", pretty_print_vreg_vector(ri, rd_size, allocs)) + } + _ => unreachable!(), + }; - format!("{} {}, {}", op, rd, rn) + format!("{} {}, {}{}", op, rd, ri, rn) } &Inst::VecRRPair { op, rd, rn } => { let op = match op { @@ -2227,6 +2303,7 @@ impl Inst { } &Inst::VecRRRMod { rd, + ri, rn, rm, alu_op, @@ -2237,9 +2314,10 @@ impl Inst { VecALUModOp::Fmla => ("fmla", size), }; let rd = pretty_print_vreg_vector(rd.to_reg(), size, allocs); + let ri = pretty_print_vreg_vector(ri, size, allocs); let rn = pretty_print_vreg_vector(rn, size, allocs); let rm = pretty_print_vreg_vector(rm, size, allocs); - format!("{} {}, {}, {}", op, rd, rn, rm) + format!("{} {}, {}, {}, {}", op, rd, ri, rn, rm) } &Inst::VecRRRLong { rd, @@ -2285,30 +2363,46 @@ impl Inst { (VecRRRLongOp::Umull32, true) => { ("umull2", VectorSize::Size64x2, VectorSize::Size32x4) } - (VecRRRLongOp::Umlal8, false) => { - ("umlal", VectorSize::Size16x8, VectorSize::Size8x8) - } - (VecRRRLongOp::Umlal8, true) => { - ("umlal2", VectorSize::Size16x8, VectorSize::Size8x16) - } - (VecRRRLongOp::Umlal16, false) => { - ("umlal", VectorSize::Size32x4, VectorSize::Size16x4) - } - (VecRRRLongOp::Umlal16, true) => { - ("umlal2", VectorSize::Size32x4, VectorSize::Size16x8) - } - (VecRRRLongOp::Umlal32, false) => { - ("umlal", VectorSize::Size64x2, VectorSize::Size32x2) - } - (VecRRRLongOp::Umlal32, true) => { - ("umlal2", VectorSize::Size64x2, VectorSize::Size32x4) - } }; let rd = pretty_print_vreg_vector(rd.to_reg(), dest_size, allocs); let rn = pretty_print_vreg_vector(rn, src_size, allocs); let rm = pretty_print_vreg_vector(rm, src_size, allocs); format!("{} {}, {}, {}", op, rd, rn, rm) } + &Inst::VecRRRLongMod { + rd, + ri, + rn, + rm, + alu_op, + high_half, + } => { + let (op, dest_size, src_size) = match (alu_op, high_half) { + (VecRRRLongModOp::Umlal8, false) => { + ("umlal", VectorSize::Size16x8, VectorSize::Size8x8) + } + (VecRRRLongModOp::Umlal8, true) => { + ("umlal2", VectorSize::Size16x8, VectorSize::Size8x16) + } + (VecRRRLongModOp::Umlal16, false) => { + ("umlal", VectorSize::Size32x4, VectorSize::Size16x4) + } + (VecRRRLongModOp::Umlal16, true) => { + ("umlal2", VectorSize::Size32x4, VectorSize::Size16x8) + } + (VecRRRLongModOp::Umlal32, false) => { + ("umlal", VectorSize::Size64x2, VectorSize::Size32x2) + } + (VecRRRLongModOp::Umlal32, true) => { + ("umlal2", VectorSize::Size64x2, VectorSize::Size32x4) + } + }; + let rd = pretty_print_vreg_vector(rd.to_reg(), dest_size, allocs); + let ri = pretty_print_vreg_vector(ri, dest_size, allocs); + let rn = pretty_print_vreg_vector(rn, src_size, allocs); + let rm = pretty_print_vreg_vector(rm, src_size, allocs); + format!("{} {}, {}, {}, {}", op, rd, ri, rn, rm) + } &Inst::VecMisc { op, rd, rn, size } => { let (op, size, suffix) = match op { VecMisc2::Not => ( @@ -2378,6 +2472,7 @@ impl Inst { &Inst::VecShiftImmMod { op, rd, + ri, rn, size, imm, @@ -2386,8 +2481,9 @@ impl Inst { VecShiftImmModOp::Sli => "sli", }; let rd = pretty_print_vreg_vector(rd.to_reg(), size, allocs); + let ri = pretty_print_vreg_vector(ri, size, allocs); let rn = 
pretty_print_vreg_vector(rn, size, allocs); - format!("{} {}, {}, #{}", op, rd, rn, imm) + format!("{} {}, {}, {}, #{}", op, rd, ri, rn, imm) } &Inst::VecExtract { rd, rn, rm, imm4 } => { let rd = pretty_print_vreg_vector(rd.to_reg(), VectorSize::Size8x16, allocs); @@ -2395,31 +2491,39 @@ impl Inst { let rm = pretty_print_vreg_vector(rm, VectorSize::Size8x16, allocs); format!("ext {}, {}, {}, #{}", rd, rn, rm, imm4) } - &Inst::VecTbl { - rd, - rn, - rm, - is_extension, - } => { - let op = if is_extension { "tbx" } else { "tbl" }; + &Inst::VecTbl { rd, rn, rm } => { let rn = pretty_print_vreg_vector(rn, VectorSize::Size8x16, allocs); let rm = pretty_print_vreg_vector(rm, VectorSize::Size8x16, allocs); let rd = pretty_print_vreg_vector(rd.to_reg(), VectorSize::Size8x16, allocs); - format!("{} {}, {{ {} }}, {}", op, rd, rn, rm) + format!("tbl {}, {{ {} }}, {}", rd, rn, rm) } - &Inst::VecTbl2 { - rd, - rn, - rn2, - rm, - is_extension, - } => { - let op = if is_extension { "tbx" } else { "tbl" }; + &Inst::VecTblExt { rd, ri, rn, rm } => { + let rn = pretty_print_vreg_vector(rn, VectorSize::Size8x16, allocs); + let rm = pretty_print_vreg_vector(rm, VectorSize::Size8x16, allocs); + let rd = pretty_print_vreg_vector(rd.to_reg(), VectorSize::Size8x16, allocs); + let ri = pretty_print_vreg_vector(ri, VectorSize::Size8x16, allocs); + format!("tbx {}, {}, {{ {} }}, {}", rd, ri, rn, rm) + } + &Inst::VecTbl2 { rd, rn, rn2, rm } => { let rn = pretty_print_vreg_vector(rn, VectorSize::Size8x16, allocs); let rn2 = pretty_print_vreg_vector(rn2, VectorSize::Size8x16, allocs); let rm = pretty_print_vreg_vector(rm, VectorSize::Size8x16, allocs); let rd = pretty_print_vreg_vector(rd.to_reg(), VectorSize::Size8x16, allocs); - format!("{} {}, {{ {}, {} }}, {}", op, rd, rn, rn2, rm) + format!("tbl {}, {{ {}, {} }}, {}", rd, rn, rn2, rm) + } + &Inst::VecTbl2Ext { + rd, + ri, + rn, + rn2, + rm, + } => { + let rn = pretty_print_vreg_vector(rn, VectorSize::Size8x16, allocs); + let rn2 = pretty_print_vreg_vector(rn2, VectorSize::Size8x16, allocs); + let rm = pretty_print_vreg_vector(rm, VectorSize::Size8x16, allocs); + let rd = pretty_print_vreg_vector(rd.to_reg(), VectorSize::Size8x16, allocs); + let ri = pretty_print_vreg_vector(ri, VectorSize::Size8x16, allocs); + format!("tbx {}, {}, {{ {}, {} }}, {}", rd, ri, rn, rn2, rm) } &Inst::VecLoadReplicate { rd, rn, size, .. } => { let rd = pretty_print_vreg_vector(rd.to_reg(), size, allocs); diff --git a/cranelift/codegen/src/isa/aarch64/inst/regs.rs b/cranelift/codegen/src/isa/aarch64/inst/regs.rs index 3c1114a515..eacd0b4330 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/regs.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/regs.rs @@ -50,6 +50,7 @@ pub(crate) const fn vreg_preg(num: u8) -> PReg { } /// Get a writable reference to a V-register. +#[cfg(test)] // Used only in test code. 
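// Note: with operand constraints applied during operand collection,
// non-test code no longer needs to construct writable physical V-registers
// by number; the ISLE context helpers `xreg`, `writable_xreg`, and
// `writable_vreg` are deleted in lower/isle.rs below, leaving the encoding
// tests as the only remaining caller of this function.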
pub fn writable_vreg(num: u8) -> Writable<Reg> { Writable::from_reg(vreg(num)) } diff --git a/cranelift/codegen/src/isa/aarch64/lower.isle b/cranelift/codegen/src/isa/aarch64/lower.isle index b0bb4e6f1c..8b97f64fda 100644 --- a/cranelift/codegen/src/isa/aarch64/lower.isle +++ b/cranelift/codegen/src/isa/aarch64/lower.isle @@ -103,12 +103,12 @@ (rule (lower (has_type ty (shuffle rn rn2 (u128_from_immediate mask)))) (let ((mask_reg Reg (constant_f128 mask))) - (vec_tbl2 rn rn2 mask_reg $false ty))) + (vec_tbl2 rn rn2 mask_reg ty))) ;;;; Rules for `swizzle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type vec_i128_ty (swizzle rn rm))) - (vec_tbl rn rm #f)) + (vec_tbl rn rm)) ;;;; Rules for `isplit` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/cranelift/codegen/src/isa/aarch64/lower/isle.rs b/cranelift/codegen/src/isa/aarch64/lower/isle.rs index e4fb6ea6f5..20606c4588 100644 --- a/cranelift/codegen/src/isa/aarch64/lower/isle.rs +++ b/cranelift/codegen/src/isa/aarch64/lower/isle.rs @@ -8,7 +8,7 @@ use generated_code::Context; use super::{ lower_constant_f128, lower_constant_f32, lower_constant_f64, lower_fp_condcode, writable_zero_reg, zero_reg, AMode, ASIMDFPModImm, ASIMDMovModImm, BranchTarget, CallIndInfo, - CallInfo, Cond, CondBrKind, ExtendOp, FPUOpRI, FloatCC, Imm12, ImmLogic, ImmShift, + CallInfo, Cond, CondBrKind, ExtendOp, FPUOpRI, FPUOpRIMod, FloatCC, Imm12, ImmLogic, ImmShift, Inst as MInst, IntCC, JTSequenceInfo, MachLabel, MoveWideConst, MoveWideOp, NarrowValueMode, Opcode, OperandSize, PairAMode, Reg, ScalarSize, ShiftOpAndAmt, UImm5, VecMisc2, VectorSize, NZCV, @@ -28,7 +28,6 @@ use crate::{ }, isa::aarch64::abi::AArch64Caller, isa::aarch64::inst::args::{ShiftOp, ShiftOpShiftImm}, - isa::aarch64::lower::{writable_vreg, writable_xreg, xreg}, isa::unwind::UnwindInst, machinst::{ty_bits, InsnOutput, Lower, MachInst, VCodeConstant, VCodeConstantData}, }; @@ -209,9 +208,9 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> { }); if upper_halfword != 0 { - self.emit(&MInst::MovWide { - op: MoveWideOp::MovK, + self.emit(&MInst::MovK { rd, + rn: rd.to_reg(), imm: MoveWideConst::maybe_with_shift(upper_halfword, 16).unwrap(), size, }); @@ -263,9 +262,9 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> { } } else { let imm = MoveWideConst::maybe_with_shift(imm16 as u16, i * 16).unwrap(); - self.emit(&MInst::MovWide { - op: MoveWideOp::MovK, + self.emit(&MInst::MovK { rd, + rn: rd.to_reg(), imm, size, }); @@ -294,18 +293,6 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> { zero_reg() } - fn xreg(&mut self, index: u8) -> Reg { - xreg(index) - } - - fn writable_xreg(&mut self, index: u8) -> WritableReg { - writable_xreg(index) - } - - fn writable_vreg(&mut self, index: u8) -> WritableReg { - writable_vreg(index) - } - fn extended_value_from_value(&mut self, val: Value) -> Option<ExtendedValue> { let (val, extend) = super::get_as_extended_value(self.lower_ctx, val, NarrowValueMode::None)?; @@ -718,11 +705,11 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> { } } - fn fpu_op_ri_sli(&mut self, ty_bits: u8, shift: u8) -> FPUOpRI { + fn fpu_op_ri_sli(&mut self, ty_bits: u8, shift: u8) -> FPUOpRIMod { if ty_bits == 32 { - FPUOpRI::Sli32(FPULeftShiftImm::maybe_from_u8(shift, ty_bits).unwrap()) + FPUOpRIMod::Sli32(FPULeftShiftImm::maybe_from_u8(shift, ty_bits).unwrap()) } else if ty_bits == 64 { - FPUOpRI::Sli64(FPULeftShiftImm::maybe_from_u8(shift, ty_bits).unwrap()) +
FPUOpRIMod::Sli64(FPULeftShiftImm::maybe_from_u8(shift, ty_bits).unwrap()) } else { unimplemented!( "unexpected input size for fpu_op_ri_sli: {} (shift: {})", diff --git a/cranelift/filetests/filetests/isa/aarch64/amodes.clif b/cranelift/filetests/filetests/isa/aarch64/amodes.clif index 83b7b96bc2..92dfe41526 100644 --- a/cranelift/filetests/filetests/isa/aarch64/amodes.clif +++ b/cranelift/filetests/filetests/isa/aarch64/amodes.clif @@ -139,7 +139,7 @@ block0(v0: i64): ; block0: ; movz w3, #51712 -; movk w3, #15258, LSL #16 +; movk w3, w3, #15258, LSL #16 ; add x3, x3, x0 ; ldr w0, [x3] ; ret diff --git a/cranelift/filetests/filetests/isa/aarch64/atomic-rmw-lse.clif b/cranelift/filetests/filetests/isa/aarch64/atomic-rmw-lse.clif index 0e33718593..9ca6b575cf 100644 --- a/cranelift/filetests/filetests/isa/aarch64/atomic-rmw-lse.clif +++ b/cranelift/filetests/filetests/isa/aarch64/atomic-rmw-lse.clif @@ -142,9 +142,8 @@ block0(v0: i64, v1: i64): ; stp x24, x25, [sp, #-16]! ; block0: ; mov x25, x0 -; mov x4, x1 -; mov x26, x4 -; 1: ldaxr x27, [x25]; and x28, x27, x26; mvn x28, x28; stlxr w24, x28, [x25]; cbnz w24, 1b +; mov x26, x1 +; atomic_rmw_loop_nand_64 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28 ; ldp x24, x25, [sp], #16 ; ldp x26, x27, [sp], #16 ; ldr x28, [sp], #16 @@ -164,9 +163,8 @@ block0(v0: i64, v1: i32): ; stp x24, x25, [sp, #-16]! ; block0: ; mov x25, x0 -; mov x4, x1 -; mov x26, x4 -; 1: ldaxr w27, [x25]; and w28, w27, w26; mvn w28, w28; stlxr w24, w28, [x25]; cbnz w24, 1b +; mov x26, x1 +; atomic_rmw_loop_nand_32 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28 ; ldp x24, x25, [sp], #16 ; ldp x26, x27, [sp], #16 ; ldr x28, [sp], #16 @@ -186,9 +184,8 @@ block0(v0: i64, v1: i16): ; stp x24, x25, [sp, #-16]! ; block0: ; mov x25, x0 -; mov x4, x1 -; mov x26, x4 -; 1: ldaxrh w27, [x25]; and w28, w27, w26; mvn w28, w28; stlxrh w24, w28, [x25]; cbnz w24, 1b +; mov x26, x1 +; atomic_rmw_loop_nand_16 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28 ; ldp x24, x25, [sp], #16 ; ldp x26, x27, [sp], #16 ; ldr x28, [sp], #16 @@ -208,9 +205,8 @@ block0(v0: i64, v1: i8): ; stp x24, x25, [sp, #-16]! ; block0: ; mov x25, x0 -; mov x4, x1 -; mov x26, x4 -; 1: ldaxrb w27, [x25]; and w28, w27, w26; mvn w28, w28; stlxrb w24, w28, [x25]; cbnz w24, 1b +; mov x26, x1 +; atomic_rmw_loop_nand_8 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28 ; ldp x24, x25, [sp], #16 ; ldp x26, x27, [sp], #16 ; ldr x28, [sp], #16 diff --git a/cranelift/filetests/filetests/isa/aarch64/atomic-rmw.clif b/cranelift/filetests/filetests/isa/aarch64/atomic-rmw.clif index 0b017ad3df..d2ba234244 100644 --- a/cranelift/filetests/filetests/isa/aarch64/atomic-rmw.clif +++ b/cranelift/filetests/filetests/isa/aarch64/atomic-rmw.clif @@ -14,9 +14,8 @@ block0(v0: i64, v1: i64): ; stp x24, x25, [sp, #-16]! ; block0: ; mov x25, x0 -; mov x4, x1 -; mov x26, x4 -; 1: ldaxr x27, [x25]; add x28, x27, x26; stlxr w24, x28, [x25]; cbnz w24, 1b +; mov x26, x1 +; atomic_rmw_loop_add_64 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28 ; ldp x24, x25, [sp], #16 ; ldp x26, x27, [sp], #16 ; ldr x28, [sp], #16 @@ -36,9 +35,8 @@ block0(v0: i64, v1: i32): ; stp x24, x25, [sp, #-16]! 
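;; Note: the recurring change in these expected outputs. Previously the
;; operand was copied through a scratch register into the pinned x26
;; (`mov x4, x1; mov x26, x4`); with `reg_fixed_use` the allocator sees the
;; constraint directly and a single `mov x26, x1` suffices. The loop itself
;; is now printed as one `atomic_rmw_loop_*` pseudo-instruction line rather
;; than the expanded load-linked/store-conditional sequence.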
; block0: ; mov x25, x0 -; mov x4, x1 -; mov x26, x4 -; 1: ldaxr w27, [x25]; add w28, w27, w26; stlxr w24, w28, [x25]; cbnz w24, 1b +; mov x26, x1 +; atomic_rmw_loop_add_32 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28 ; ldp x24, x25, [sp], #16 ; ldp x26, x27, [sp], #16 ; ldr x28, [sp], #16 @@ -58,9 +56,8 @@ block0(v0: i64, v1: i16): ; stp x24, x25, [sp, #-16]! ; block0: ; mov x25, x0 -; mov x4, x1 -; mov x26, x4 -; 1: ldaxrh w27, [x25]; add w28, w27, w26; stlxrh w24, w28, [x25]; cbnz w24, 1b +; mov x26, x1 +; atomic_rmw_loop_add_16 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28 ; ldp x24, x25, [sp], #16 ; ldp x26, x27, [sp], #16 ; ldr x28, [sp], #16 @@ -80,9 +77,8 @@ block0(v0: i64, v1: i8): ; stp x24, x25, [sp, #-16]! ; block0: ; mov x25, x0 -; mov x4, x1 -; mov x26, x4 -; 1: ldaxrb w27, [x25]; add w28, w27, w26; stlxrb w24, w28, [x25]; cbnz w24, 1b +; mov x26, x1 +; atomic_rmw_loop_add_8 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28 ; ldp x24, x25, [sp], #16 ; ldp x26, x27, [sp], #16 ; ldr x28, [sp], #16 @@ -102,9 +98,8 @@ block0(v0: i64, v1: i64): ; stp x24, x25, [sp, #-16]! ; block0: ; mov x25, x0 -; mov x4, x1 -; mov x26, x4 -; 1: ldaxr x27, [x25]; sub x28, x27, x26; stlxr w24, x28, [x25]; cbnz w24, 1b +; mov x26, x1 +; atomic_rmw_loop_sub_64 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28 ; ldp x24, x25, [sp], #16 ; ldp x26, x27, [sp], #16 ; ldr x28, [sp], #16 @@ -124,9 +119,8 @@ block0(v0: i64, v1: i32): ; stp x24, x25, [sp, #-16]! ; block0: ; mov x25, x0 -; mov x4, x1 -; mov x26, x4 -; 1: ldaxr w27, [x25]; sub w28, w27, w26; stlxr w24, w28, [x25]; cbnz w24, 1b +; mov x26, x1 +; atomic_rmw_loop_sub_32 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28 ; ldp x24, x25, [sp], #16 ; ldp x26, x27, [sp], #16 ; ldr x28, [sp], #16 @@ -146,9 +140,8 @@ block0(v0: i64, v1: i16): ; stp x24, x25, [sp, #-16]! ; block0: ; mov x25, x0 -; mov x4, x1 -; mov x26, x4 -; 1: ldaxrh w27, [x25]; sub w28, w27, w26; stlxrh w24, w28, [x25]; cbnz w24, 1b +; mov x26, x1 +; atomic_rmw_loop_sub_16 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28 ; ldp x24, x25, [sp], #16 ; ldp x26, x27, [sp], #16 ; ldr x28, [sp], #16 @@ -168,9 +161,8 @@ block0(v0: i64, v1: i8): ; stp x24, x25, [sp, #-16]! ; block0: ; mov x25, x0 -; mov x4, x1 -; mov x26, x4 -; 1: ldaxrb w27, [x25]; sub w28, w27, w26; stlxrb w24, w28, [x25]; cbnz w24, 1b +; mov x26, x1 +; atomic_rmw_loop_sub_8 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28 ; ldp x24, x25, [sp], #16 ; ldp x26, x27, [sp], #16 ; ldr x28, [sp], #16 @@ -190,9 +182,8 @@ block0(v0: i64, v1: i64): ; stp x24, x25, [sp, #-16]! ; block0: ; mov x25, x0 -; mov x4, x1 -; mov x26, x4 -; 1: ldaxr x27, [x25]; and x28, x27, x26; stlxr w24, x28, [x25]; cbnz w24, 1b +; mov x26, x1 +; atomic_rmw_loop_and_64 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28 ; ldp x24, x25, [sp], #16 ; ldp x26, x27, [sp], #16 ; ldr x28, [sp], #16 @@ -212,9 +203,8 @@ block0(v0: i64, v1: i32): ; stp x24, x25, [sp, #-16]! ; block0: ; mov x25, x0 -; mov x4, x1 -; mov x26, x4 -; 1: ldaxr w27, [x25]; and w28, w27, w26; stlxr w24, w28, [x25]; cbnz w24, 1b +; mov x26, x1 +; atomic_rmw_loop_and_32 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28 ; ldp x24, x25, [sp], #16 ; ldp x26, x27, [sp], #16 ; ldr x28, [sp], #16 @@ -234,9 +224,8 @@ block0(v0: i64, v1: i16): ; stp x24, x25, [sp, #-16]! 
; block0: ; mov x25, x0 -; mov x4, x1 -; mov x26, x4 -; 1: ldaxrh w27, [x25]; and w28, w27, w26; stlxrh w24, w28, [x25]; cbnz w24, 1b +; mov x26, x1 +; atomic_rmw_loop_and_16 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28 ; ldp x24, x25, [sp], #16 ; ldp x26, x27, [sp], #16 ; ldr x28, [sp], #16 @@ -256,9 +245,8 @@ block0(v0: i64, v1: i8): ; stp x24, x25, [sp, #-16]! ; block0: ; mov x25, x0 -; mov x4, x1 -; mov x26, x4 -; 1: ldaxrb w27, [x25]; and w28, w27, w26; stlxrb w24, w28, [x25]; cbnz w24, 1b +; mov x26, x1 +; atomic_rmw_loop_and_8 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28 ; ldp x24, x25, [sp], #16 ; ldp x26, x27, [sp], #16 ; ldr x28, [sp], #16 @@ -278,9 +266,8 @@ block0(v0: i64, v1: i64): ; stp x24, x25, [sp, #-16]! ; block0: ; mov x25, x0 -; mov x4, x1 -; mov x26, x4 -; 1: ldaxr x27, [x25]; and x28, x27, x26; mvn x28, x28; stlxr w24, x28, [x25]; cbnz w24, 1b +; mov x26, x1 +; atomic_rmw_loop_nand_64 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28 ; ldp x24, x25, [sp], #16 ; ldp x26, x27, [sp], #16 ; ldr x28, [sp], #16 @@ -300,9 +287,8 @@ block0(v0: i64, v1: i32): ; stp x24, x25, [sp, #-16]! ; block0: ; mov x25, x0 -; mov x4, x1 -; mov x26, x4 -; 1: ldaxr w27, [x25]; and w28, w27, w26; mvn w28, w28; stlxr w24, w28, [x25]; cbnz w24, 1b +; mov x26, x1 +; atomic_rmw_loop_nand_32 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28 ; ldp x24, x25, [sp], #16 ; ldp x26, x27, [sp], #16 ; ldr x28, [sp], #16 @@ -322,9 +308,8 @@ block0(v0: i64, v1: i16): ; stp x24, x25, [sp, #-16]! ; block0: ; mov x25, x0 -; mov x4, x1 -; mov x26, x4 -; 1: ldaxrh w27, [x25]; and w28, w27, w26; mvn w28, w28; stlxrh w24, w28, [x25]; cbnz w24, 1b +; mov x26, x1 +; atomic_rmw_loop_nand_16 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28 ; ldp x24, x25, [sp], #16 ; ldp x26, x27, [sp], #16 ; ldr x28, [sp], #16 @@ -344,9 +329,8 @@ block0(v0: i64, v1: i8): ; stp x24, x25, [sp, #-16]! ; block0: ; mov x25, x0 -; mov x4, x1 -; mov x26, x4 -; 1: ldaxrb w27, [x25]; and w28, w27, w26; mvn w28, w28; stlxrb w24, w28, [x25]; cbnz w24, 1b +; mov x26, x1 +; atomic_rmw_loop_nand_8 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28 ; ldp x24, x25, [sp], #16 ; ldp x26, x27, [sp], #16 ; ldr x28, [sp], #16 @@ -366,9 +350,8 @@ block0(v0: i64, v1: i64): ; stp x24, x25, [sp, #-16]! ; block0: ; mov x25, x0 -; mov x4, x1 -; mov x26, x4 -; 1: ldaxr x27, [x25]; orr x28, x27, x26; stlxr w24, x28, [x25]; cbnz w24, 1b +; mov x26, x1 +; atomic_rmw_loop_orr_64 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28 ; ldp x24, x25, [sp], #16 ; ldp x26, x27, [sp], #16 ; ldr x28, [sp], #16 @@ -388,9 +371,8 @@ block0(v0: i64, v1: i32): ; stp x24, x25, [sp, #-16]! ; block0: ; mov x25, x0 -; mov x4, x1 -; mov x26, x4 -; 1: ldaxr w27, [x25]; orr w28, w27, w26; stlxr w24, w28, [x25]; cbnz w24, 1b +; mov x26, x1 +; atomic_rmw_loop_orr_32 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28 ; ldp x24, x25, [sp], #16 ; ldp x26, x27, [sp], #16 ; ldr x28, [sp], #16 @@ -410,9 +392,8 @@ block0(v0: i64, v1: i16): ; stp x24, x25, [sp, #-16]! ; block0: ; mov x25, x0 -; mov x4, x1 -; mov x26, x4 -; 1: ldaxrh w27, [x25]; orr w28, w27, w26; stlxrh w24, w28, [x25]; cbnz w24, 1b +; mov x26, x1 +; atomic_rmw_loop_orr_16 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28 ; ldp x24, x25, [sp], #16 ; ldp x26, x27, [sp], #16 ; ldr x28, [sp], #16 @@ -432,9 +413,8 @@ block0(v0: i64, v1: i8): ; stp x24, x25, [sp, #-16]! 
; block0: ; mov x25, x0 -; mov x4, x1 -; mov x26, x4 -; 1: ldaxrb w27, [x25]; orr w28, w27, w26; stlxrb w24, w28, [x25]; cbnz w24, 1b +; mov x26, x1 +; atomic_rmw_loop_orr_8 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28 ; ldp x24, x25, [sp], #16 ; ldp x26, x27, [sp], #16 ; ldr x28, [sp], #16 @@ -454,9 +434,8 @@ block0(v0: i64, v1: i64): ; stp x24, x25, [sp, #-16]! ; block0: ; mov x25, x0 -; mov x4, x1 -; mov x26, x4 -; 1: ldaxr x27, [x25]; eor x28, x27, x26; stlxr w24, x28, [x25]; cbnz w24, 1b +; mov x26, x1 +; atomic_rmw_loop_eor_64 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28 ; ldp x24, x25, [sp], #16 ; ldp x26, x27, [sp], #16 ; ldr x28, [sp], #16 @@ -476,9 +455,8 @@ block0(v0: i64, v1: i32): ; stp x24, x25, [sp, #-16]! ; block0: ; mov x25, x0 -; mov x4, x1 -; mov x26, x4 -; 1: ldaxr w27, [x25]; eor w28, w27, w26; stlxr w24, w28, [x25]; cbnz w24, 1b +; mov x26, x1 +; atomic_rmw_loop_eor_32 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28 ; ldp x24, x25, [sp], #16 ; ldp x26, x27, [sp], #16 ; ldr x28, [sp], #16 @@ -498,9 +476,8 @@ block0(v0: i64, v1: i16): ; stp x24, x25, [sp, #-16]! ; block0: ; mov x25, x0 -; mov x4, x1 -; mov x26, x4 -; 1: ldaxrh w27, [x25]; eor w28, w27, w26; stlxrh w24, w28, [x25]; cbnz w24, 1b +; mov x26, x1 +; atomic_rmw_loop_eor_16 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28 ; ldp x24, x25, [sp], #16 ; ldp x26, x27, [sp], #16 ; ldr x28, [sp], #16 @@ -520,9 +497,8 @@ block0(v0: i64, v1: i8): ; stp x24, x25, [sp, #-16]! ; block0: ; mov x25, x0 -; mov x4, x1 -; mov x26, x4 -; 1: ldaxrb w27, [x25]; eor w28, w27, w26; stlxrb w24, w28, [x25]; cbnz w24, 1b +; mov x26, x1 +; atomic_rmw_loop_eor_8 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28 ; ldp x24, x25, [sp], #16 ; ldp x26, x27, [sp], #16 ; ldr x28, [sp], #16 @@ -542,9 +518,8 @@ block0(v0: i64, v1: i64): ; stp x24, x25, [sp, #-16]! ; block0: ; mov x25, x0 -; mov x4, x1 -; mov x26, x4 -; 1: ldaxr x27, [x25]; cmp x27, x26; csel x28, x27, x26, gt; stlxr w24, x28, [x25]; cbnz w24, 1b +; mov x26, x1 +; atomic_rmw_loop_smax_64 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28 ; ldp x24, x25, [sp], #16 ; ldp x26, x27, [sp], #16 ; ldr x28, [sp], #16 @@ -564,9 +539,8 @@ block0(v0: i64, v1: i32): ; stp x24, x25, [sp, #-16]! ; block0: ; mov x25, x0 -; mov x4, x1 -; mov x26, x4 -; 1: ldaxr w27, [x25]; cmp w27, w26; csel w28, w27, w26, gt; stlxr w24, w28, [x25]; cbnz w24, 1b +; mov x26, x1 +; atomic_rmw_loop_smax_32 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28 ; ldp x24, x25, [sp], #16 ; ldp x26, x27, [sp], #16 ; ldr x28, [sp], #16 @@ -586,9 +560,8 @@ block0(v0: i64, v1: i16): ; stp x24, x25, [sp, #-16]! ; block0: ; mov x25, x0 -; mov x4, x1 -; mov x26, x4 -; 1: ldaxrh w27, [x25]; sxth w27, w27; cmp w27, w26, sxth; csel w28, w27, w26, gt; stlxrh w24, w28, [x25]; cbnz w24, 1b +; mov x26, x1 +; atomic_rmw_loop_smax_16 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28 ; ldp x24, x25, [sp], #16 ; ldp x26, x27, [sp], #16 ; ldr x28, [sp], #16 @@ -608,9 +581,8 @@ block0(v0: i64, v1: i8): ; stp x24, x25, [sp, #-16]! ; block0: ; mov x25, x0 -; mov x4, x1 -; mov x26, x4 -; 1: ldaxrb w27, [x25]; sxtb w27, w27; cmp w27, w26, sxtb; csel w28, w27, w26, gt; stlxrb w24, w28, [x25]; cbnz w24, 1b +; mov x26, x1 +; atomic_rmw_loop_smax_8 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28 ; ldp x24, x25, [sp], #16 ; ldp x26, x27, [sp], #16 ; ldr x28, [sp], #16 @@ -630,9 +602,8 @@ block0(v0: i64, v1: i64): ; stp x24, x25, [sp, #-16]! 
; block0: ; mov x25, x0 -; mov x4, x1 -; mov x26, x4 -; 1: ldaxr x27, [x25]; cmp x27, x26; csel x28, x27, x26, hi; stlxr w24, x28, [x25]; cbnz w24, 1b +; mov x26, x1 +; atomic_rmw_loop_umax_64 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28 ; ldp x24, x25, [sp], #16 ; ldp x26, x27, [sp], #16 ; ldr x28, [sp], #16 @@ -652,9 +623,8 @@ block0(v0: i64, v1: i32): ; stp x24, x25, [sp, #-16]! ; block0: ; mov x25, x0 -; mov x4, x1 -; mov x26, x4 -; 1: ldaxr w27, [x25]; cmp w27, w26; csel w28, w27, w26, hi; stlxr w24, w28, [x25]; cbnz w24, 1b +; mov x26, x1 +; atomic_rmw_loop_umax_32 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28 ; ldp x24, x25, [sp], #16 ; ldp x26, x27, [sp], #16 ; ldr x28, [sp], #16 @@ -674,9 +644,8 @@ block0(v0: i64, v1: i16): ; stp x24, x25, [sp, #-16]! ; block0: ; mov x25, x0 -; mov x4, x1 -; mov x26, x4 -; 1: ldaxrh w27, [x25]; cmp w27, w26; csel w28, w27, w26, hi; stlxrh w24, w28, [x25]; cbnz w24, 1b +; mov x26, x1 +; atomic_rmw_loop_umax_16 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28 ; ldp x24, x25, [sp], #16 ; ldp x26, x27, [sp], #16 ; ldr x28, [sp], #16 @@ -696,9 +665,8 @@ block0(v0: i64, v1: i8): ; stp x24, x25, [sp, #-16]! ; block0: ; mov x25, x0 -; mov x4, x1 -; mov x26, x4 -; 1: ldaxrb w27, [x25]; cmp w27, w26; csel w28, w27, w26, hi; stlxrb w24, w28, [x25]; cbnz w24, 1b +; mov x26, x1 +; atomic_rmw_loop_umax_8 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28 ; ldp x24, x25, [sp], #16 ; ldp x26, x27, [sp], #16 ; ldr x28, [sp], #16 @@ -718,9 +686,8 @@ block0(v0: i64, v1: i64): ; stp x24, x25, [sp, #-16]! ; block0: ; mov x25, x0 -; mov x4, x1 -; mov x26, x4 -; 1: ldaxr x27, [x25]; cmp x27, x26; csel x28, x27, x26, lt; stlxr w24, x28, [x25]; cbnz w24, 1b +; mov x26, x1 +; atomic_rmw_loop_smin_64 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28 ; ldp x24, x25, [sp], #16 ; ldp x26, x27, [sp], #16 ; ldr x28, [sp], #16 @@ -740,9 +707,8 @@ block0(v0: i64, v1: i32): ; stp x24, x25, [sp, #-16]! ; block0: ; mov x25, x0 -; mov x4, x1 -; mov x26, x4 -; 1: ldaxr w27, [x25]; cmp w27, w26; csel w28, w27, w26, lt; stlxr w24, w28, [x25]; cbnz w24, 1b +; mov x26, x1 +; atomic_rmw_loop_smin_32 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28 ; ldp x24, x25, [sp], #16 ; ldp x26, x27, [sp], #16 ; ldr x28, [sp], #16 @@ -762,9 +728,8 @@ block0(v0: i64, v1: i16): ; stp x24, x25, [sp, #-16]! ; block0: ; mov x25, x0 -; mov x4, x1 -; mov x26, x4 -; 1: ldaxrh w27, [x25]; sxth w27, w27; cmp w27, w26, sxth; csel w28, w27, w26, lt; stlxrh w24, w28, [x25]; cbnz w24, 1b +; mov x26, x1 +; atomic_rmw_loop_smin_16 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28 ; ldp x24, x25, [sp], #16 ; ldp x26, x27, [sp], #16 ; ldr x28, [sp], #16 @@ -784,9 +749,8 @@ block0(v0: i64, v1: i8): ; stp x24, x25, [sp, #-16]! ; block0: ; mov x25, x0 -; mov x4, x1 -; mov x26, x4 -; 1: ldaxrb w27, [x25]; sxtb w27, w27; cmp w27, w26, sxtb; csel w28, w27, w26, lt; stlxrb w24, w28, [x25]; cbnz w24, 1b +; mov x26, x1 +; atomic_rmw_loop_smin_8 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28 ; ldp x24, x25, [sp], #16 ; ldp x26, x27, [sp], #16 ; ldr x28, [sp], #16 @@ -806,9 +770,8 @@ block0(v0: i64, v1: i64): ; stp x24, x25, [sp, #-16]! 
; block0: ; mov x25, x0 -; mov x4, x1 -; mov x26, x4 -; 1: ldaxr x27, [x25]; cmp x27, x26; csel x28, x27, x26, lo; stlxr w24, x28, [x25]; cbnz w24, 1b +; mov x26, x1 +; atomic_rmw_loop_umin_64 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28 ; ldp x24, x25, [sp], #16 ; ldp x26, x27, [sp], #16 ; ldr x28, [sp], #16 @@ -828,9 +791,8 @@ block0(v0: i64, v1: i32): ; stp x24, x25, [sp, #-16]! ; block0: ; mov x25, x0 -; mov x4, x1 -; mov x26, x4 -; 1: ldaxr w27, [x25]; cmp w27, w26; csel w28, w27, w26, lo; stlxr w24, w28, [x25]; cbnz w24, 1b +; mov x26, x1 +; atomic_rmw_loop_umin_32 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28 ; ldp x24, x25, [sp], #16 ; ldp x26, x27, [sp], #16 ; ldr x28, [sp], #16 @@ -850,9 +812,8 @@ block0(v0: i64, v1: i16): ; stp x24, x25, [sp, #-16]! ; block0: ; mov x25, x0 -; mov x4, x1 -; mov x26, x4 -; 1: ldaxrh w27, [x25]; cmp w27, w26; csel w28, w27, w26, lo; stlxrh w24, w28, [x25]; cbnz w24, 1b +; mov x26, x1 +; atomic_rmw_loop_umin_16 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28 ; ldp x24, x25, [sp], #16 ; ldp x26, x27, [sp], #16 ; ldr x28, [sp], #16 @@ -872,9 +833,8 @@ block0(v0: i64, v1: i8): ; stp x24, x25, [sp, #-16]! ; block0: ; mov x25, x0 -; mov x4, x1 -; mov x26, x4 -; 1: ldaxrb w27, [x25]; cmp w27, w26; csel w28, w27, w26, lo; stlxrb w24, w28, [x25]; cbnz w24, 1b +; mov x26, x1 +; atomic_rmw_loop_umin_8 addr=x25 operand=x26 oldval=x27 scratch1=x24 scratch2=x28 ; ldp x24, x25, [sp], #16 ; ldp x26, x27, [sp], #16 ; ldr x28, [sp], #16 diff --git a/cranelift/filetests/filetests/isa/aarch64/bitops.clif b/cranelift/filetests/filetests/isa/aarch64/bitops.clif index 5419d077b8..38cb3ed290 100644 --- a/cranelift/filetests/filetests/isa/aarch64/bitops.clif +++ b/cranelift/filetests/filetests/isa/aarch64/bitops.clif @@ -245,11 +245,11 @@ block0(v0: i128): } ; block0: -; fmov d6, x0 -; mov v6.d[1], x1 -; cnt v19.16b, v6.16b -; addv b21, v19.16b -; umov w0, v21.b[0] +; fmov d7, x0 +; mov v7.d[1], v7.d[1], x1 +; cnt v18.16b, v7.16b +; addv b20, v18.16b +; umov w0, v20.b[0] ; movz w1, #0 ; ret diff --git a/cranelift/filetests/filetests/isa/aarch64/constants.clif b/cranelift/filetests/filetests/isa/aarch64/constants.clif index a6a7a95549..a7fa74698f 100644 --- a/cranelift/filetests/filetests/isa/aarch64/constants.clif +++ b/cranelift/filetests/filetests/isa/aarch64/constants.clif @@ -130,9 +130,9 @@ block0: ; block0: ; movz x0, #58 -; movk x0, #4626, LSL #16 -; movk x0, #61603, LSL #32 -; movk x0, #62283, LSL #48 +; movk x0, x0, #4626, LSL #16 +; movk x0, x0, #61603, LSL #32 +; movk x0, x0, #62283, LSL #48 ; ret function %f() -> i64 { @@ -143,7 +143,7 @@ block0: ; block0: ; movz x0, #7924, LSL #16 -; movk x0, #4841, LSL #48 +; movk x0, x0, #4841, LSL #48 ; ret function %f() -> i64 { @@ -154,7 +154,7 @@ block0: ; block0: ; movn x0, #57611, LSL #16 -; movk x0, #4841, LSL #48 +; movk x0, x0, #4841, LSL #48 ; ret function %f() -> i32 { diff --git a/cranelift/filetests/filetests/isa/aarch64/dynamic-simd-narrow.clif b/cranelift/filetests/filetests/isa/aarch64/dynamic-simd-narrow.clif index 7b041b5a14..303e030ab1 100644 --- a/cranelift/filetests/filetests/isa/aarch64/dynamic-simd-narrow.clif +++ b/cranelift/filetests/filetests/isa/aarch64/dynamic-simd-narrow.clif @@ -15,10 +15,9 @@ block0(v0: i16): } ; block0: -; dup v6.4h, w0 -; mov v7.16b, v6.16b -; mov v7.d[1], v6.d[0] -; sqxtn v0.8b, v7.8h +; dup v4.4h, w0 +; mov v4.d[1], v4.d[1], v4.d[0] +; sqxtn v0.8b, v4.8h ; ret function %snarrow_i16x8(i16) -> i8x16 { @@ -37,7 +36,7 @@ block0(v0: i16): ; 
 ; block0:
 ; dup v6.8h, w0
 ; sqxtn v0.8b, v6.8h
-; sqxtn2 v0.16b, v6.8h
+; sqxtn2 v0.16b, v0.16b, v6.8h
 ; ret
 
 function %snarrow_i32x2(i32) -> i16x4 {
@@ -54,10 +53,9 @@ block0(v0: i32):
 }
 
 ; block0:
-; dup v6.2s, w0
-; mov v7.16b, v6.16b
-; mov v7.d[1], v6.d[0]
-; sqxtn v0.4h, v7.4s
+; dup v4.2s, w0
+; mov v4.d[1], v4.d[1], v4.d[0]
+; sqxtn v0.4h, v4.4s
 ; ret
 
 function %snarrow_i32x4(i32) -> i16x8 {
@@ -76,7 +74,7 @@ block0(v0: i32):
 ; block0:
 ; dup v6.4s, w0
 ; sqxtn v0.4h, v6.4s
-; sqxtn2 v0.8h, v6.4s
+; sqxtn2 v0.8h, v0.8h, v6.4s
 ; ret
 
 function %snarrow_i64x2(i64) -> i32x4 {
@@ -95,7 +93,7 @@ block0(v0: i64):
 ; block0:
 ; dup v6.2d, x0
 ; sqxtn v0.2s, v6.2d
-; sqxtn2 v0.4s, v6.2d
+; sqxtn2 v0.4s, v0.4s, v6.2d
 ; ret
 
 function %unarrow_i16x4(i16) -> i8x8 {
@@ -112,10 +110,9 @@ block0(v0: i16):
 }
 
 ; block0:
-; dup v6.4h, w0
-; mov v7.16b, v6.16b
-; mov v7.d[1], v6.d[0]
-; sqxtun v0.8b, v7.8h
+; dup v4.4h, w0
+; mov v4.d[1], v4.d[1], v4.d[0]
+; sqxtun v0.8b, v4.8h
 ; ret
 
 function %unarrow_i16x8(i16) -> i8x16 {
@@ -134,7 +131,7 @@ block0(v0: i16):
 ; block0:
 ; dup v6.8h, w0
 ; sqxtun v0.8b, v6.8h
-; sqxtun2 v0.16b, v6.8h
+; sqxtun2 v0.16b, v0.16b, v6.8h
 ; ret
 
 function %unarrow_i32x2(i32) -> i16x4 {
@@ -151,10 +148,9 @@ block0(v0: i32):
 }
 
 ; block0:
-; dup v6.2s, w0
-; mov v7.16b, v6.16b
-; mov v7.d[1], v6.d[0]
-; sqxtun v0.4h, v7.4s
+; dup v4.2s, w0
+; mov v4.d[1], v4.d[1], v4.d[0]
+; sqxtun v0.4h, v4.4s
 ; ret
 
 function %unarrow_i32x4(i32) -> i16x8 {
@@ -173,7 +169,7 @@ block0(v0: i32):
 ; block0:
 ; dup v6.4s, w0
 ; sqxtun v0.4h, v6.4s
-; sqxtun2 v0.8h, v6.4s
+; sqxtun2 v0.8h, v0.8h, v6.4s
 ; ret
 
 function %unarrow_i64x2(i64) -> i32x4 {
@@ -192,7 +188,7 @@ block0(v0: i64):
 ; block0:
 ; dup v6.2d, x0
 ; sqxtun v0.2s, v6.2d
-; sqxtun2 v0.4s, v6.2d
+; sqxtun2 v0.4s, v0.4s, v6.2d
 ; ret
 
 function %uunarrow_i16x4(i16) -> i8x8 {
@@ -209,10 +205,9 @@ block0(v0: i16):
 }
 
 ; block0:
-; dup v6.4h, w0
-; mov v7.16b, v6.16b
-; mov v7.d[1], v6.d[0]
-; uqxtn v0.8b, v7.8h
+; dup v4.4h, w0
+; mov v4.d[1], v4.d[1], v4.d[0]
+; uqxtn v0.8b, v4.8h
 ; ret
 
 function %uunarrow_i16x8(i16) -> i8x16 {
@@ -231,7 +226,7 @@ block0(v0: i16):
 ; block0:
 ; dup v6.8h, w0
 ; uqxtn v0.8b, v6.8h
-; uqxtn2 v0.16b, v6.8h
+; uqxtn2 v0.16b, v0.16b, v6.8h
 ; ret
 
 function %uunarrow_i32x2(i32) -> i16x4 {
@@ -248,10 +243,9 @@ block0(v0: i32):
 }
 
 ; block0:
-; dup v6.2s, w0
-; mov v7.16b, v6.16b
-; mov v7.d[1], v6.d[0]
-; uqxtn v0.4h, v7.4s
+; dup v4.2s, w0
+; mov v4.d[1], v4.d[1], v4.d[0]
+; uqxtn v0.4h, v4.4s
 ; ret
 
 function %uunarrow_i32x4(i32) -> i16x8 {
@@ -270,7 +264,7 @@ block0(v0: i32):
 ; block0:
 ; dup v6.4s, w0
 ; uqxtn v0.4h, v6.4s
-; uqxtn2 v0.8h, v6.4s
+; uqxtn2 v0.8h, v0.8h, v6.4s
 ; ret
 
 function %uunarrow_i64x2(i64) -> i32x4 {
@@ -289,5 +283,6 @@ block0(v0: i64):
 ; block0:
 ; dup v6.2d, x0
 ; uqxtn v0.2s, v6.2d
-; uqxtn2 v0.4s, v6.2d
+; uqxtn2 v0.4s, v0.4s, v6.2d
 ; ret
+
diff --git a/cranelift/filetests/filetests/isa/aarch64/dynamic-simd-neon.clif b/cranelift/filetests/filetests/isa/aarch64/dynamic-simd-neon.clif
index 0fbcf700bd..722bf860d5 100644
--- a/cranelift/filetests/filetests/isa/aarch64/dynamic-simd-neon.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/dynamic-simd-neon.clif
@@ -197,7 +197,7 @@ block0(v0: f64, v1: f64):
 ; dup v17.2d, v0.d[0]
 ; dup v18.2d, v1.d[0]
 ; fcmgt v0.2d, v17.2d, v18.2d
-; bsl v0.16b, v18.16b, v17.16b
+; bsl v0.16b, v0.16b, v18.16b, v17.16b
 ; ret
 
 function %f64x2_splat_max_pseudo(f64, f64) -> f64x2 {
@@ -216,5 +216,6 @@ block0(v0: f64, v1: f64):
 ; dup v17.2d, v0.d[0]
 ; dup v18.2d, v1.d[0]
 ; fcmgt v0.2d, v18.2d, v17.2d
-; bsl v0.16b, v18.16b, v17.16b
+; bsl v0.16b, v0.16b, v18.16b, v17.16b
 ; ret
+
diff --git a/cranelift/filetests/filetests/isa/aarch64/floating-point.clif b/cranelift/filetests/filetests/isa/aarch64/floating-point.clif
index 16f38886a2..eac5fddaac 100644
--- a/cranelift/filetests/filetests/isa/aarch64/floating-point.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/floating-point.clif
@@ -309,8 +309,8 @@ block0(v0: f32, v1: f32):
 }
 
 ; block0:
-; ushr v7.2s, v1.2s, #31
-; sli v0.2s, v7.2s, #31
+; ushr v6.2s, v1.2s, #31
+; sli v0.2s, v0.2s, v6.2s, #31
 ; ret
 
 function %f32(f64, f64) -> f64 {
@@ -320,8 +320,8 @@ block0(v0: f64, v1: f64):
 }
 
 ; block0:
-; ushr d7, d1, #63
-; sli d0, d7, #63
+; ushr d6, d1, #63
+; sli d0, d0, d6, #63
 ; ret
 
 function %f33(f32) -> i32 {
@@ -918,9 +918,8 @@ block0(v0: f32x4, v1: f32x4, v2: f32x4):
 }
 
 ; block0:
-; mov v17.16b, v0.16b
+; fmla v2.4s, v2.4s, v0.4s, v1.4s
 ; mov v0.16b, v2.16b
-; fmla v0.4s, v17.4s, v1.4s
 ; ret
 
 function %f79(f32x2, f32x2, f32x2) -> f32x2 {
@@ -930,9 +929,8 @@ block0(v0: f32x2, v1: f32x2, v2: f32x2):
 }
 
 ; block0:
-; mov v17.16b, v0.16b
+; fmla v2.2s, v2.2s, v0.2s, v1.2s
 ; mov v0.16b, v2.16b
-; fmla v0.2s, v17.2s, v1.2s
 ; ret
 
 function %f80(f64x2, f64x2, f64x2) -> f64x2 {
@@ -942,9 +940,8 @@ block0(v0: f64x2, v1: f64x2, v2: f64x2):
 }
 
 ; block0:
-; mov v17.16b, v0.16b
+; fmla v2.2d, v2.2d, v0.2d, v1.2d
 ; mov v0.16b, v2.16b
-; fmla v0.2d, v17.2d, v1.2d
 ; ret
 
 function %f81(f32x2, f32x2) -> f32x2 {
@@ -954,8 +951,8 @@ block0(v0: f32x2, v1: f32x2):
 }
 
 ; block0:
-; ushr v7.2s, v1.2s, #31
-; sli v0.2s, v7.2s, #31
+; ushr v6.2s, v1.2s, #31
+; sli v0.2s, v0.2s, v6.2s, #31
 ; ret
 
 function %f82(f32x4, f32x4) -> f32x4 {
@@ -965,8 +962,8 @@ block0(v0: f32x4, v1: f32x4):
 }
 
 ; block0:
-; ushr v7.4s, v1.4s, #31
-; sli v0.4s, v7.4s, #31
+; ushr v6.4s, v1.4s, #31
+; sli v0.4s, v0.4s, v6.4s, #31
 ; ret
 
 function %f83(f64x2, f64x2) -> f64x2 {
@@ -976,6 +973,7 @@ block0(v0: f64x2, v1: f64x2):
 }
 
 ; block0:
-; ushr v7.2d, v1.2d, #63
-; sli v0.2d, v7.2d, #63
+; ushr v6.2d, v1.2d, #63
+; sli v0.2d, v0.2d, v6.2d, #63
 ; ret
+
diff --git a/cranelift/filetests/filetests/isa/aarch64/simd-bitwise-compile.clif b/cranelift/filetests/filetests/isa/aarch64/simd-bitwise-compile.clif
index f9e9967ffe..cdd5d92b46 100644
--- a/cranelift/filetests/filetests/isa/aarch64/simd-bitwise-compile.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/simd-bitwise-compile.clif
@@ -105,7 +105,7 @@ block0:
 ; movi v0.16b, #0
 ; movi v4.16b, #0
 ; movi v5.16b, #0
-; bsl v0.16b, v4.16b, v5.16b
+; bsl v0.16b, v0.16b, v4.16b, v5.16b
 ; ret
 
 function %vselect_i16x8(b16x8, i16x8, i16x8) -> i16x8 {
@@ -115,7 +115,7 @@ block0(v0: b16x8, v1: i16x8, v2: i16x8):
 }
 
 ; block0:
-; bsl v0.16b, v1.16b, v2.16b
+; bsl v0.16b, v0.16b, v1.16b, v2.16b
 ; ret
 
 function %vselect_f32x4(b32x4, f32x4, f32x4) -> f32x4 {
@@ -125,7 +125,7 @@ block0(v0: b32x4, v1: f32x4, v2: f32x4):
 }
 
 ; block0:
-; bsl v0.16b, v1.16b, v2.16b
+; bsl v0.16b, v0.16b, v1.16b, v2.16b
 ; ret
 
 function %vselect_f64x2(b64x2, f64x2, f64x2) -> f64x2 {
@@ -135,7 +135,7 @@ block0(v0: b64x2, v1: f64x2, v2: f64x2):
 }
 
 ; block0:
-; bsl v0.16b, v1.16b, v2.16b
+; bsl v0.16b, v0.16b, v1.16b, v2.16b
 ; ret
 
 function %ishl_i8x16(i32) -> i8x16 {
diff --git a/cranelift/filetests/filetests/isa/aarch64/simd-lane-access-compile.clif b/cranelift/filetests/filetests/isa/aarch64/simd-lane-access-compile.clif
index 2f4f35f574..a6968ab206 100644
--- a/cranelift/filetests/filetests/isa/aarch64/simd-lane-access-compile.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/simd-lane-access-compile.clif
@@ -29,9 +29,9 @@ block0:
 
 ; block0:
 ; movz x4, #1
-; fmov s30, w4
+; fmov s31, w4
 ; ldr q3, pc+8 ; b 20 ; data.f128 0x13000000000000000000000000000000
-; mov v31.16b, v30.16b
+; mov v30.16b, v31.16b
 ; tbl v0.16b, { v30.16b, v31.16b }, v3.16b
 ; ret
 
diff --git a/cranelift/filetests/filetests/isa/aarch64/simd-narrow.clif b/cranelift/filetests/filetests/isa/aarch64/simd-narrow.clif
index dcf23e1cfe..50b147adff 100644
--- a/cranelift/filetests/filetests/isa/aarch64/simd-narrow.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/simd-narrow.clif
@@ -9,7 +9,7 @@ block0(v0: i16x4, v1: i16x4):
 }
 
 ; block0:
-; mov v0.d[1], v1.d[0]
+; mov v0.d[1], v0.d[1], v1.d[0]
 ; sqxtn v0.8b, v0.8h
 ; ret
 
@@ -21,7 +21,7 @@ block0(v0: i16x8, v1: i16x8):
 
 ; block0:
 ; sqxtn v0.8b, v0.8h
-; sqxtn2 v0.16b, v1.8h
+; sqxtn2 v0.16b, v0.16b, v1.8h
 ; ret
 
 function %snarrow_i32x2(i32x2, i32x2) -> i16x4 {
@@ -31,7 +31,7 @@ block0(v0: i32x2, v1: i32x2):
 }
 
 ; block0:
-; mov v0.d[1], v1.d[0]
+; mov v0.d[1], v0.d[1], v1.d[0]
 ; sqxtn v0.4h, v0.4s
 ; ret
 
@@ -43,7 +43,7 @@ block0(v0: i32x4, v1: i32x4):
 
 ; block0:
 ; sqxtn v0.4h, v0.4s
-; sqxtn2 v0.8h, v1.4s
+; sqxtn2 v0.8h, v0.8h, v1.4s
 ; ret
 
 function %snarrow_i64x2(i64x2, i64x2) -> i32x4 {
@@ -54,7 +54,7 @@ block0(v0: i64x2, v1: i64x2):
 
 ; block0:
 ; sqxtn v0.2s, v0.2d
-; sqxtn2 v0.4s, v1.2d
+; sqxtn2 v0.4s, v0.4s, v1.2d
 ; ret
 
 function %unarrow_i16x4(i16x4, i16x4) -> i8x8 {
@@ -64,7 +64,7 @@ block0(v0: i16x4, v1: i16x4):
 }
 
 ; block0:
-; mov v0.d[1], v1.d[0]
+; mov v0.d[1], v0.d[1], v1.d[0]
 ; sqxtun v0.8b, v0.8h
 ; ret
 
@@ -76,7 +76,7 @@ block0(v0: i16x8, v1: i16x8):
 
 ; block0:
 ; sqxtun v0.8b, v0.8h
-; sqxtun2 v0.16b, v1.8h
+; sqxtun2 v0.16b, v0.16b, v1.8h
 ; ret
 
 function %unarrow_i32x2(i32x2, i32x2) -> i16x4 {
@@ -86,7 +86,7 @@ block0(v0: i32x2, v1: i32x2):
 }
 
 ; block0:
-; mov v0.d[1], v1.d[0]
+; mov v0.d[1], v0.d[1], v1.d[0]
 ; sqxtun v0.4h, v0.4s
 ; ret
 
@@ -98,7 +98,7 @@ block0(v0: i32x4, v1: i32x4):
 
 ; block0:
 ; sqxtun v0.4h, v0.4s
-; sqxtun2 v0.8h, v1.4s
+; sqxtun2 v0.8h, v0.8h, v1.4s
 ; ret
 
 function %unarrow_i64x2(i64x2, i64x2) -> i32x4 {
@@ -109,7 +109,7 @@ block0(v0: i64x2, v1: i64x2):
 
 ; block0:
 ; sqxtun v0.2s, v0.2d
-; sqxtun2 v0.4s, v1.2d
+; sqxtun2 v0.4s, v0.4s, v1.2d
 ; ret
 
 function %uunarrow_i16x4(i16x4, i16x4) -> i8x8 {
@@ -119,7 +119,7 @@ block0(v0: i16x4, v1: i16x4):
 }
 
 ; block0:
-; mov v0.d[1], v1.d[0]
+; mov v0.d[1], v0.d[1], v1.d[0]
 ; uqxtn v0.8b, v0.8h
 ; ret
 
@@ -131,7 +131,7 @@ block0(v0: i16x8, v1: i16x8):
 
 ; block0:
 ; uqxtn v0.8b, v0.8h
-; uqxtn2 v0.16b, v1.8h
+; uqxtn2 v0.16b, v0.16b, v1.8h
 ; ret
 
 function %uunarrow_i32x2(i32x2, i32x2) -> i16x4 {
@@ -141,7 +141,7 @@ block0(v0: i32x2, v1: i32x2):
 }
 
 ; block0:
-; mov v0.d[1], v1.d[0]
+; mov v0.d[1], v0.d[1], v1.d[0]
 ; uqxtn v0.4h, v0.4s
 ; ret
 
@@ -153,7 +153,7 @@ block0(v0: i32x4, v1: i32x4):
 
 ; block0:
 ; uqxtn v0.4h, v0.4s
-; uqxtn2 v0.8h, v1.4s
+; uqxtn2 v0.8h, v0.8h, v1.4s
 ; ret
 
 function %uunarrow_i64x2(i64x2, i64x2) -> i32x4 {
@@ -164,7 +164,7 @@ block0(v0: i64x2, v1: i64x2):
 
 ; block0:
 ; uqxtn v0.2s, v0.2d
-; uqxtn2 v0.4s, v1.2d
+; uqxtn2 v0.4s, v0.4s, v1.2d
 ; ret
 
 function %snarrow_i16x8_zero(i16x8) -> i8x16 {
diff --git a/cranelift/filetests/filetests/isa/aarch64/simd.clif b/cranelift/filetests/filetests/isa/aarch64/simd.clif
index b26811e6fa..e66fcd2101 100644
--- a/cranelift/filetests/filetests/isa/aarch64/simd.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/simd.clif
@@ -11,7 +11,7 @@ block0:
 
 ; block0:
 ; movz x2, #1
-; movk x2, #1, LSL #48
+; movk x2, x2, #1, LSL #48
 ; dup v0.2d, x2
 ; ret
 
diff --git a/cranelift/filetests/filetests/isa/aarch64/simd_load_zero.clif b/cranelift/filetests/filetests/isa/aarch64/simd_load_zero.clif
index 70ceecd6db..388168c5e2 100644
--- a/cranelift/filetests/filetests/isa/aarch64/simd_load_zero.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/simd_load_zero.clif
@@ -11,7 +11,7 @@ block0:
 
 ; block0:
 ; movz x1, #1
-; movk x1, #1, LSL #48
+; movk x1, x1, #1, LSL #48
 ; fmov d0, x1
 ; ret
 
diff --git a/cranelift/filetests/filetests/isa/aarch64/stack-limit.clif b/cranelift/filetests/filetests/isa/aarch64/stack-limit.clif
index 2fe23da047..703d8c4b35 100644
--- a/cranelift/filetests/filetests/isa/aarch64/stack-limit.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/stack-limit.clif
@@ -98,16 +98,16 @@ block0(v0: i64):
 ; subs xzr, sp, x0, UXTX
 ; b.hs 8 ; udf
 ; movz w17, #6784
-; movk w17, #6, LSL #16
+; movk w17, w17, #6, LSL #16
 ; add x16, x0, x17, UXTX
 ; subs xzr, sp, x16, UXTX
 ; b.hs 8 ; udf
 ; movz w16, #6784
-; movk w16, #6, LSL #16
+; movk w16, w16, #6, LSL #16
 ; sub sp, sp, x16, UXTX
 ; block0:
 ; movz w16, #6784
-; movk w16, #6, LSL #16
+; movk w16, w16, #6, LSL #16
 ; add sp, sp, x16, UXTX
 ; ldp fp, lr, [sp], #16
 ; ret
@@ -152,16 +152,16 @@ block0(v0: i64):
 ; subs xzr, sp, x16, UXTX
 ; b.hs 8 ; udf
 ; movz w17, #6784
-; movk w17, #6, LSL #16
+; movk w17, w17, #6, LSL #16
 ; add x16, x16, x17, UXTX
 ; subs xzr, sp, x16, UXTX
 ; b.hs 8 ; udf
 ; movz w16, #6784
-; movk w16, #6, LSL #16
+; movk w16, w16, #6, LSL #16
 ; sub sp, sp, x16, UXTX
 ; block0:
 ; movz w16, #6784
-; movk w16, #6, LSL #16
+; movk w16, w16, #6, LSL #16
 ; add sp, sp, x16, UXTX
 ; ldp fp, lr, [sp], #16
 ; ret
@@ -177,7 +177,7 @@ block0(v0: i64):
 
 ; stp fp, lr, [sp, #-16]!
 ; mov fp, sp
-; movz w16, #6784 ; movk w16, #6, LSL #16 ; add x16, x0, x16, UXTX ; ldr x16, [x16]
+; movz w16, #6784 ; movk w16, w16, #6, LSL #16 ; add x16, x0, x16, UXTX ; ldr x16, [x16]
 ; add x16, x16, #32
 ; subs xzr, sp, x16, UXTX
 ; b.hs 8 ; udf
diff --git a/cranelift/filetests/filetests/isa/aarch64/stack.clif b/cranelift/filetests/filetests/isa/aarch64/stack.clif
index a1478dccda..ac723f088b 100644
--- a/cranelift/filetests/filetests/isa/aarch64/stack.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/stack.clif
@@ -31,12 +31,12 @@ block0:
 ; stp fp, lr, [sp, #-16]!
 ; mov fp, sp
 ; movz w16, #34480
-; movk w16, #1, LSL #16
+; movk w16, w16, #1, LSL #16
 ; sub sp, sp, x16, UXTX
 ; block0:
 ; mov x0, sp
 ; movz w16, #34480
-; movk w16, #1, LSL #16
+; movk w16, w16, #1, LSL #16
 ; add sp, sp, x16, UXTX
 ; ldp fp, lr, [sp], #16
 ; ret
@@ -71,13 +71,13 @@ block0:
 ; stp fp, lr, [sp, #-16]!
 ; mov fp, sp
 ; movz w16, #34480
-; movk w16, #1, LSL #16
+; movk w16, w16, #1, LSL #16
 ; sub sp, sp, x16, UXTX
 ; block0:
 ; mov x2, sp
 ; ldr x0, [x2]
 ; movz w16, #34480
-; movk w16, #1, LSL #16
+; movk w16, w16, #1, LSL #16
 ; add sp, sp, x16, UXTX
 ; ldp fp, lr, [sp], #16
 ; ret
@@ -112,13 +112,13 @@ block0(v0: i64):
 ; stp fp, lr, [sp, #-16]!
 ; mov fp, sp
 ; movz w16, #34480
-; movk w16, #1, LSL #16
+; movk w16, w16, #1, LSL #16
 ; sub sp, sp, x16, UXTX
 ; block0:
 ; mov x2, sp
 ; str x0, [x2]
 ; movz w16, #34480
-; movk w16, #1, LSL #16
+; movk w16, w16, #1, LSL #16
 ; add sp, sp, x16, UXTX
 ; ldp fp, lr, [sp], #16
 ; ret
@@ -479,13 +479,13 @@ block0(v0: i128):
 ; stp fp, lr, [sp, #-16]!
 ; mov fp, sp
 ; movz w16, #34480
-; movk w16, #1, LSL #16
+; movk w16, w16, #1, LSL #16
 ; sub sp, sp, x16, UXTX
 ; block0:
 ; mov x5, sp
 ; stp x0, x1, [x5]
 ; movz w16, #34480
-; movk w16, #1, LSL #16
+; movk w16, w16, #1, LSL #16
 ; add sp, sp, x16, UXTX
 ; ldp fp, lr, [sp], #16
 ; ret
@@ -539,13 +539,13 @@ block0:
 ; stp fp, lr, [sp, #-16]!
 ; mov fp, sp
 ; movz w16, #34480
-; movk w16, #1, LSL #16
+; movk w16, w16, #1, LSL #16
 ; sub sp, sp, x16, UXTX
 ; block0:
 ; mov x5, sp
 ; ldp x0, x1, [x5]
 ; movz w16, #34480
-; movk w16, #1, LSL #16
+; movk w16, w16, #1, LSL #16
 ; add sp, sp, x16, UXTX
 ; ldp fp, lr, [sp], #16
 ; ret
diff --git a/cranelift/filetests/filetests/isa/aarch64/vhigh_bits.clif b/cranelift/filetests/filetests/isa/aarch64/vhigh_bits.clif
index 53a99fe2c8..75fc84903b 100644
--- a/cranelift/filetests/filetests/isa/aarch64/vhigh_bits.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/vhigh_bits.clif
@@ -10,9 +10,9 @@ block0(v0: i8x16):
 ; block0:
 ; sshr v3.16b, v0.16b, #7
 ; movz x6, #513
-; movk x6, #2052, LSL #16
-; movk x6, #8208, LSL #32
-; movk x6, #32832, LSL #48
+; movk x6, x6, #2052, LSL #16
+; movk x6, x6, #8208, LSL #32
+; movk x6, x6, #32832, LSL #48
 ; dup v17.2d, x6
 ; and v20.16b, v3.16b, v17.16b
 ; ext v22.16b, v20.16b, v20.16b, #8
@@ -30,9 +30,9 @@ block0(v0: i8x16):
 ; block0:
 ; sshr v3.16b, v0.16b, #7
 ; movz x6, #513
-; movk x6, #2052, LSL #16
-; movk x6, #8208, LSL #32
-; movk x6, #32832, LSL #48
+; movk x6, x6, #2052, LSL #16
+; movk x6, x6, #8208, LSL #32
+; movk x6, x6, #32832, LSL #48
 ; dup v17.2d, x6
 ; and v20.16b, v3.16b, v17.16b
 ; ext v22.16b, v20.16b, v20.16b, #8