diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle
index bd25f388ea..ff6bbd1775 100644
--- a/cranelift/codegen/src/isa/x64/inst.isle
+++ b/cranelift/codegen/src/isa/x64/inst.isle
@@ -64,24 +64,13 @@
        ;; A synthetic sequence to implement the right inline checks for
        ;; remainder and division, assuming the dividend is in %rax.
        ;;
-       ;; Puts the result back into %rax if is_div, %rdx if !is_div, to mimic
-       ;; what the div instruction does.
-       ;;
        ;; The generated code sequence is described in the emit's function match
        ;; arm for this instruction.
-       ;;
-       ;; Note: %rdx is marked as modified by this instruction, to avoid an
-       ;; early clobber problem with the temporary and divisor registers. Make
-       ;; sure to zero %rdx right before this instruction, or you might run into
-       ;; regalloc failures where %rdx is live before its first def!
        (CheckedDivOrRemSeq (kind DivOrRemKind)
                            (size OperandSize)
                            (dividend_lo Gpr)
                            (dividend_hi Gpr)
-                           ;; The divisor operand. Note it's marked as modified
-                           ;; so that it gets assigned a register different from
-                           ;; the temporary.
-                           (divisor WritableGpr)
+                           (divisor Gpr)
                            (dst_quotient WritableGpr)
                            (dst_remainder WritableGpr)
                            (tmp OptionWritableGpr))
@@ -205,12 +194,21 @@
                    (src3 XmmMem)
                    (dst WritableXmm))
 
-       ;; XMM (scalar or vector) binary op that relies on the EVEX prefix.
+       ;; XMM (scalar or vector) binary op that relies on the EVEX
+       ;; prefix. Takes two inputs.
        (XmmRmREvex (op Avx512Opcode)
                    (src1 XmmMem)
                    (src2 Xmm)
                    (dst WritableXmm))
 
+       ;; XMM (scalar or vector) op that relies on the EVEX prefix. Takes
+       ;; three inputs; `dst` is expected to reuse the register of `src3`.
+       (XmmRmREvex3 (op Avx512Opcode)
+                   (src1 XmmMem)
+                   (src2 Xmm)
+                   (src3 Xmm)
+                   (dst WritableXmm))
+
        ;; XMM (scalar or vector) unary op: mov between XMM registers (32 64)
        ;; (reg addr) reg, sqrt, etc.
        ;;
@@ -255,13 +253,7 @@
 
        ;; Converts an unsigned int64 to a float32/float64.
        (CvtUint64ToFloatSeq (dst_size OperandSize) ;; 4 or 8
-                            ;; A copy of the source register, fed by
-                            ;; lowering. It is marked as modified during
-                            ;; register allocation to make sure that the
-                            ;; temporary registers differ from the src register,
-                            ;; since both registers are live at the same time in
-                            ;; the generated code sequence.
-                            (src WritableGpr)
+                            (src Gpr)
                             (dst WritableXmm)
                             (tmp_gpr1 WritableGpr)
                             (tmp_gpr2 WritableGpr))
@@ -270,13 +262,7 @@
        (CvtFloatToSintSeq (dst_size OperandSize)
                           (src_size OperandSize)
                           (is_saturating bool)
-                          ;; A copy of the source register, fed by
-                          ;; lowering. It is marked as modified during
-                          ;; register allocation to make sure that the
-                          ;; temporary registers differ from the src register,
-                          ;; since both registers are live at the same time in
-                          ;; the generated code sequence.
-                          (src WritableXmm)
+                          (src Xmm)
                           (dst WritableGpr)
                           (tmp_gpr WritableGpr)
                           (tmp_xmm WritableXmm))
@@ -285,13 +271,7 @@
        (CvtFloatToUintSeq (dst_size OperandSize)
                           (src_size OperandSize)
                           (is_saturating bool)
-                          ;; A copy of the source register, fed by
-                          ;; lowering. It is marked as modified during
-                          ;; register allocation to make sure that the
-                          ;; temporary registers differ from the src register,
-                          ;; since both registers are live at the same time in
-                          ;; the generated code sequence.
-                          (src WritableXmm)
+                          (src Xmm)
                           (dst WritableGpr)
                           (tmp_gpr WritableGpr)
                           (tmp_xmm WritableXmm))
@@ -2769,11 +2749,11 @@
 (decl x64_vpermi2b (Xmm Xmm Xmm) Xmm)
 (rule (x64_vpermi2b src1 src2 src3)
       (let ((dst WritableXmm (temp_writable_xmm))
-            (_ Unit (emit (gen_move $I8X16 dst src3)))
-            (_ Unit (emit (MInst.XmmRmREvex (Avx512Opcode.Vpermi2b)
-                                            src1
-                                            src2
-                                            dst))))
+            (_ Unit (emit (MInst.XmmRmREvex3 (Avx512Opcode.Vpermi2b)
+                                             src1
+                                             src2
+                                             src3
+                                             dst))))
         dst))
 
 ;; Helper for creating `MInst.MulHi` instructions.
@@ -3214,12 +3194,10 @@
 (decl cvt_u64_to_float_seq (Type Gpr) Xmm)
 (rule (cvt_u64_to_float_seq ty src)
       (let ((size OperandSize (raw_operand_size_of_type ty))
-            (src_copy WritableGpr (temp_writable_gpr))
             (dst WritableXmm (temp_writable_xmm))
             (tmp_gpr1 WritableGpr (temp_writable_gpr))
             (tmp_gpr2 WritableGpr (temp_writable_gpr))
-            (_ Unit (emit (gen_move $I64 src_copy src)))
-            (_ Unit (emit (MInst.CvtUint64ToFloatSeq size src_copy dst tmp_gpr1 tmp_gpr2))))
+            (_ Unit (emit (MInst.CvtUint64ToFloatSeq size src dst tmp_gpr1 tmp_gpr2))))
         dst))
 
 (decl cvt_float_to_uint_seq (Type Value bool) Gpr)
@@ -3227,13 +3205,10 @@
       (let ((out_size OperandSize (raw_operand_size_of_type out_ty))
             (src_size OperandSize (raw_operand_size_of_type src_ty))
 
-            (tmp WritableXmm (temp_writable_xmm))
-            (_ Unit (emit (gen_move src_ty tmp src)))
-
             (dst WritableGpr (temp_writable_gpr))
             (tmp_xmm WritableXmm (temp_writable_xmm))
             (tmp_gpr WritableGpr (temp_writable_gpr))
-            (_ Unit (emit (MInst.CvtFloatToUintSeq out_size src_size is_saturating tmp dst tmp_gpr tmp_xmm))))
+            (_ Unit (emit (MInst.CvtFloatToUintSeq out_size src_size is_saturating src dst tmp_gpr tmp_xmm))))
         dst))
 
 (decl cvt_float_to_sint_seq (Type Value bool) Gpr)
@@ -3241,13 +3216,10 @@
       (let ((out_size OperandSize (raw_operand_size_of_type out_ty))
             (src_size OperandSize (raw_operand_size_of_type src_ty))
 
-            (tmp WritableXmm (temp_writable_xmm))
-            (_ Unit (emit (gen_move src_ty tmp src)))
-
             (dst WritableGpr (temp_writable_gpr))
             (tmp_xmm WritableXmm (temp_writable_xmm))
             (tmp_gpr WritableGpr (temp_writable_gpr))
-            (_ Unit (emit (MInst.CvtFloatToSintSeq out_size src_size is_saturating tmp dst tmp_gpr tmp_xmm))))
+            (_ Unit (emit (MInst.CvtFloatToSintSeq out_size src_size is_saturating src dst tmp_gpr tmp_xmm))))
         dst))
 
 (decl fcvt_uint_mask_const () VCodeConstant)
@@ -3396,10 +3368,6 @@
             ;; addresses).
             (tmp1 WritableGpr (temp_writable_gpr))
 
-            ;; Put a zero in tmp1. This is needed for Spectre mitigations (a
-            ;; CMOV that zeroes the index on misspeculation).
-            (_ Unit (emit (MInst.Imm (OperandSize.Size32) 0 tmp1)))
-
             ;; This temporary is used as a signed integer of 32-bits (for the
             ;; wasm-table index) and then 64-bits (address addend). The small
             ;; lie about the I64 type is benign, since the temporary is dead
diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs
index 6e6ef44bd5..8489338054 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -377,11 +377,11 @@ pub(crate) fn emit(
         } => {
             let dividend_lo = allocs.next(dividend_lo.to_reg());
             let dst_quotient = allocs.next(dst_quotient.to_reg().to_reg());
-            let dst_remainder = allocs.next(dst_remainder.to_reg().to_reg());
             debug_assert_eq!(dividend_lo, regs::rax());
             debug_assert_eq!(dst_quotient, regs::rax());
-            debug_assert_eq!(dst_remainder, regs::rdx());
             if size.to_bits() > 8 {
+                let dst_remainder = allocs.next(dst_remainder.to_reg().to_reg());
+                debug_assert_eq!(dst_remainder, regs::rdx());
                 let dividend_hi = allocs.next(dividend_hi.to_reg());
                 debug_assert_eq!(dividend_hi, regs::rdx());
             }
@@ -468,7 +468,11 @@ pub(crate) fn emit(
             let src = allocs.next(src.to_reg());
             let dst = allocs.next(dst.to_reg().to_reg());
             debug_assert_eq!(src, regs::rax());
-            debug_assert_eq!(dst, regs::rdx());
+            if *size == OperandSize::Size8 {
+                debug_assert_eq!(dst, regs::rax());
+            } else {
+                debug_assert_eq!(dst, regs::rdx());
+            }
             match size {
                 OperandSize::Size8 => {
                     sink.put1(0x66);
@@ -498,7 +502,7 @@ pub(crate) fn emit(
         } => {
             let dividend_lo = allocs.next(dividend_lo.to_reg());
             let dividend_hi = allocs.next(dividend_hi.to_reg());
-            let divisor = allocs.next(divisor.to_reg().to_reg());
+            let divisor = allocs.next(divisor.to_reg());
             let dst_quotient = allocs.next(dst_quotient.to_reg().to_reg());
             let dst_remainder = allocs.next(dst_remainder.to_reg().to_reg());
             let tmp = tmp.map(|tmp| allocs.next(tmp.to_reg().to_reg()));
@@ -597,18 +601,45 @@ pub(crate) fn emit(
                 sink.bind_label(do_op);
             }
 
+            let dividend_lo = Gpr::new(regs::rax()).unwrap();
+            let dst_quotient = WritableGpr::from_reg(Gpr::new(regs::rax()).unwrap());
+            let (dividend_hi, dst_remainder) = if *size == OperandSize::Size8 {
+                (
+                    Gpr::new(regs::rax()).unwrap(),
+                    Writable::from_reg(Gpr::new(regs::rax()).unwrap()),
+                )
+            } else {
+                (
+                    Gpr::new(regs::rdx()).unwrap(),
+                    Writable::from_reg(Gpr::new(regs::rdx()).unwrap()),
+                )
+            };
+
             // Fill in the high parts:
             if kind.is_signed() {
                 // sign-extend the sign-bit of rax into rdx, for signed opcodes.
-                let inst = Inst::sign_extend_data(*size);
+                let inst =
+                    Inst::sign_extend_data(*size, dividend_lo, WritableGpr::from_reg(dividend_hi));
                 inst.emit(&[], sink, info, state);
-            } else {
+            } else if *size != OperandSize::Size8 {
                 // zero for unsigned opcodes.
-                let inst = Inst::imm(OperandSize::Size64, 0, Writable::from_reg(regs::rdx()));
+                let inst = Inst::imm(
+                    OperandSize::Size64,
+                    0,
+                    Writable::from_reg(dividend_hi.to_reg()),
+                );
                 inst.emit(&[], sink, info, state);
             }
 
-            let inst = Inst::div(*size, kind.is_signed(), RegMem::reg(divisor));
+            let inst = Inst::div(
+                *size,
+                kind.is_signed(),
+                RegMem::reg(divisor),
+                dividend_lo,
+                dividend_hi,
+                dst_quotient,
+                dst_remainder,
+            );
             inst.emit(&[], sink, info, state);
 
             // Lowering takes care of moving the result back into the right register, see comment
@@ -1393,7 +1424,8 @@ pub(crate) fn emit(
             // ;; generated by lowering: cmp #jmp_table_size, %idx
             // jnb $default_target
             // movl %idx, %tmp2
-            // cmovnb %tmp1, %tmp2 ;; Spectre mitigation; we require tmp1 to be zero on entry.
+            // mov $0, %tmp1
+            // cmovnb %tmp1, %tmp2 ;; Spectre mitigation.
             // lea start_of_jump_table_offset(%rip), %tmp1
             // movslq [%tmp1, %tmp2, 4], %tmp2 ;; shift of 2, viz. multiply index by 4
             // addq %tmp2, %tmp1
@@ -1406,6 +1438,13 @@ pub(crate) fn emit(
             let inst = Inst::movzx_rm_r(ExtMode::LQ, RegMem::reg(idx), tmp2);
             inst.emit(&[], sink, info, state);
 
+            // Zero `tmp1` so that the CMOV below overwrites `tmp2` with
+            // zero in the out-of-bounds case (Spectre mitigation). This
+            // must be a move-immediate rather than a flag-clobbering
+            // zeroing idiom: the CMOV still reads the bounds-check flags.
+            let inst = Inst::imm(OperandSize::Size32, 0, tmp1);
+            inst.emit(&[], sink, info, state);
+
             // Spectre mitigation: CMOV to zero the index if the out-of-bounds branch above misspeculated.
             let inst = Inst::cmove(
                 OperandSize::Size64,
@@ -1768,9 +1807,21 @@ pub(crate) fn emit(
             src1,
             src2,
             dst,
+        }
+        | Inst::XmmRmREvex3 {
+            op,
+            src1,
+            src2,
+            dst,
+            // `dst` reuses `src3`.
+            ..
         } => {
             let dst = allocs.next(dst.to_reg().to_reg());
             let src2 = allocs.next(src2.to_reg());
+            if let Inst::XmmRmREvex3 { src3, .. } = inst {
+                let src3 = allocs.next(src3.to_reg());
+                debug_assert_eq!(src3, dst);
+            }
             let src1 = src1.clone().to_reg_mem().with_allocs(allocs);
 
             let (w, opcode) = match op {
@@ -2086,7 +2137,7 @@ pub(crate) fn emit(
             tmp_gpr1,
             tmp_gpr2,
         } => {
-            let src = allocs.next(src.to_reg().to_reg());
+            let src = allocs.next(src.to_reg());
             let dst = allocs.next(dst.to_reg().to_reg());
             let tmp_gpr1 = allocs.next(tmp_gpr1.to_reg().to_reg());
             let tmp_gpr2 = allocs.next(tmp_gpr2.to_reg().to_reg());
@@ -2155,7 +2206,7 @@ pub(crate) fn emit(
             let inst = Inst::shift_r(
                 OperandSize::Size64,
                 ShiftKind::ShiftRightLogical,
-                Some(1),
+                Imm8Gpr::new(Imm8Reg::Imm8 { imm: 1 }).unwrap(),
                 Writable::from_reg(tmp_gpr1),
             );
             inst.emit(&[], sink, info, state);
@@ -2208,7 +2259,7 @@ pub(crate) fn emit(
             tmp_gpr,
             tmp_xmm,
         } => {
-            let src = allocs.next(src.to_reg().to_reg());
+            let src = allocs.next(src.to_reg());
             let dst = allocs.next(dst.to_reg().to_reg());
             let tmp_gpr = allocs.next(tmp_gpr.to_reg().to_reg());
             let tmp_xmm = allocs.next(tmp_xmm.to_reg().to_reg());
@@ -2417,7 +2468,7 @@ pub(crate) fn emit(
             tmp_gpr,
             tmp_xmm,
         } => {
-            let src = allocs.next(src.to_reg().to_reg());
+            let src = allocs.next(src.to_reg());
             let dst = allocs.next(dst.to_reg().to_reg());
             let tmp_gpr = allocs.next(tmp_gpr.to_reg().to_reg());
             let tmp_xmm = allocs.next(tmp_xmm.to_reg().to_reg());
diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
index d0dde74727..9cbde12668 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
@@ -1723,6 +1723,10 @@ fn test_x64_emit() {
             OperandSize::Size32,
             true, /*signed*/
             RegMem::reg(regs::rsi()),
+            Gpr::new(regs::rax()).unwrap(),
+            Gpr::new(regs::rdx()).unwrap(),
+            WritableGpr::from_reg(Gpr::new(regs::rax()).unwrap()),
+            WritableGpr::from_reg(Gpr::new(regs::rdx()).unwrap()),
         ),
         "F7FE",
         "idiv    %eax, %edx, %esi, %eax, %edx",
@@ -1732,6 +1736,10 @@ fn test_x64_emit() {
             OperandSize::Size64,
             true, /*signed*/
             RegMem::reg(regs::r15()),
+            Gpr::new(regs::rax()).unwrap(),
+            Gpr::new(regs::rdx()).unwrap(),
+            WritableGpr::from_reg(Gpr::new(regs::rax()).unwrap()),
+            WritableGpr::from_reg(Gpr::new(regs::rdx()).unwrap()),
         ),
         "49F7FF",
         "idiv    %rax, %rdx, %r15, %rax, %rdx",
@@ -1741,6 +1749,10 @@ fn test_x64_emit() {
             OperandSize::Size32,
             false, /*signed*/
             RegMem::reg(regs::r14()),
+            Gpr::new(regs::rax()).unwrap(),
+            Gpr::new(regs::rdx()).unwrap(),
+            WritableGpr::from_reg(Gpr::new(regs::rax()).unwrap()),
+            WritableGpr::from_reg(Gpr::new(regs::rdx()).unwrap()),
         ),
         "41F7F6",
         "div     %eax, %edx, %r14d, %eax, %edx",
@@ -1750,19 +1762,39 @@ fn test_x64_emit() {
             OperandSize::Size64,
             false, /*signed*/
             RegMem::reg(regs::rdi()),
+            Gpr::new(regs::rax()).unwrap(),
+            Gpr::new(regs::rdx()).unwrap(),
+            WritableGpr::from_reg(Gpr::new(regs::rax()).unwrap()),
+            WritableGpr::from_reg(Gpr::new(regs::rdx()).unwrap()),
         ),
         "48F7F7",
         "div     %rax, %rdx, %rdi, %rax, %rdx",
     ));
     insns.push((
-        Inst::div(OperandSize::Size8, false, RegMem::reg(regs::rax())),
+        Inst::div(
+            OperandSize::Size8,
+            false,
+            RegMem::reg(regs::rax()),
+            Gpr::new(regs::rax()).unwrap(),
+            Gpr::new(regs::rdx()).unwrap(),
+            WritableGpr::from_reg(Gpr::new(regs::rax()).unwrap()),
+            WritableGpr::from_reg(Gpr::new(regs::rdx()).unwrap()),
+        ),
         "F6F0",
-        "div     %al, (none), %al, %al, %dl",
+        "div     %al, (none), %al, %al, (none)",
     ));
     insns.push((
-        Inst::div(OperandSize::Size8, false, RegMem::reg(regs::rsi())),
+        Inst::div(
+            OperandSize::Size8,
+            false,
+            RegMem::reg(regs::rsi()),
+            Gpr::new(regs::rax()).unwrap(),
+            Gpr::new(regs::rdx()).unwrap(),
+            WritableGpr::from_reg(Gpr::new(regs::rax()).unwrap()),
+            WritableGpr::from_reg(Gpr::new(regs::rdx()).unwrap()),
+        ),
         "40F6F6",
-        "div     %al, (none), %sil, %al, %dl",
+        "div     %al, (none), %sil, %al, (none)",
     ));
 
     // ========================================================
@@ -1807,25 +1839,41 @@ fn test_x64_emit() {
     // ========================================================
     // cbw
     insns.push((
-        Inst::sign_extend_data(OperandSize::Size8),
+        Inst::sign_extend_data(
+            OperandSize::Size8,
+            Gpr::new(regs::rax()).unwrap(),
+            WritableGpr::from_reg(Gpr::new(regs::rax()).unwrap()),
+        ),
         "6698",
-        "cbw %al, %dl",
+        "cbw %al, %al",
     ));
 
     // ========================================================
     // cdq family: SignExtendRaxRdx
     insns.push((
-        Inst::sign_extend_data(OperandSize::Size16),
+        Inst::sign_extend_data(
+            OperandSize::Size16,
+            Gpr::new(regs::rax()).unwrap(),
+            WritableGpr::from_reg(Gpr::new(regs::rdx()).unwrap()),
+        ),
         "6699",
         "cwd %ax, %dx",
     ));
     insns.push((
-        Inst::sign_extend_data(OperandSize::Size32),
+        Inst::sign_extend_data(
+            OperandSize::Size32,
+            Gpr::new(regs::rax()).unwrap(),
+            WritableGpr::from_reg(Gpr::new(regs::rdx()).unwrap()),
+        ),
         "99",
         "cdq %eax, %edx",
     ));
     insns.push((
-        Inst::sign_extend_data(OperandSize::Size64),
+        Inst::sign_extend_data(
+            OperandSize::Size64,
+            Gpr::new(regs::rax()).unwrap(),
+            WritableGpr::from_reg(Gpr::new(regs::rdx()).unwrap()),
+        ),
         "4899",
         "cqo %rax, %rdx",
     ));
@@ -2813,47 +2861,92 @@ fn test_x64_emit() {
     // ========================================================
     // Shift_R
     insns.push((
-        Inst::shift_r(OperandSize::Size32, ShiftKind::ShiftLeft, None, w_rdi),
+        Inst::shift_r(
+            OperandSize::Size32,
+            ShiftKind::ShiftLeft,
+            Imm8Gpr::new(Imm8Reg::Reg { reg: regs::rcx() }).unwrap(),
+            w_rdi,
+        ),
         "D3E7",
         "shll    %cl, %edi, %edi",
     ));
     insns.push((
-        Inst::shift_r(OperandSize::Size32, ShiftKind::ShiftLeft, None, w_r12),
+        Inst::shift_r(
+            OperandSize::Size32,
+            ShiftKind::ShiftLeft,
+            Imm8Gpr::new(Imm8Reg::Reg { reg: regs::rcx() }).unwrap(),
+            w_r12,
+        ),
         "41D3E4",
         "shll    %cl, %r12d, %r12d",
     ));
     insns.push((
-        Inst::shift_r(OperandSize::Size32, ShiftKind::ShiftLeft, Some(2), w_r8),
+        Inst::shift_r(
+            OperandSize::Size32,
+            ShiftKind::ShiftLeft,
+            Imm8Gpr::new(Imm8Reg::Imm8 { imm: 2 }).unwrap(),
+            w_r8,
+        ),
         "41C1E002",
         "shll    $2, %r8d, %r8d",
     ));
     insns.push((
-        Inst::shift_r(OperandSize::Size32, ShiftKind::ShiftLeft, Some(31), w_r13),
+        Inst::shift_r(
+            OperandSize::Size32,
+            ShiftKind::ShiftLeft,
+            Imm8Gpr::new(Imm8Reg::Imm8 { imm: 31 }).unwrap(),
+            w_r13,
+        ),
         "41C1E51F",
         "shll    $31, %r13d, %r13d",
     ));
     insns.push((
-        Inst::shift_r(OperandSize::Size64, ShiftKind::ShiftLeft, None, w_r13),
+        Inst::shift_r(
+            OperandSize::Size64,
+            ShiftKind::ShiftLeft,
+            Imm8Gpr::new(Imm8Reg::Reg { reg: regs::rcx() }).unwrap(),
+            w_r13,
+        ),
         "49D3E5",
         "shlq    %cl, %r13, %r13",
     ));
     insns.push((
-        Inst::shift_r(OperandSize::Size64, ShiftKind::ShiftLeft, None, w_rdi),
+        Inst::shift_r(
+            OperandSize::Size64,
+            ShiftKind::ShiftLeft,
+            Imm8Gpr::new(Imm8Reg::Reg { reg: regs::rcx() }).unwrap(),
+            w_rdi,
+        ),
         "48D3E7",
         "shlq    %cl, %rdi, %rdi",
     ));
     insns.push((
-        Inst::shift_r(OperandSize::Size64, ShiftKind::ShiftLeft, Some(2), w_r8),
+        Inst::shift_r(
+            OperandSize::Size64,
+            ShiftKind::ShiftLeft,
+            Imm8Gpr::new(Imm8Reg::Imm8 { imm: 2 }).unwrap(),
+            w_r8,
+        ),
         "49C1E002",
         "shlq    $2, %r8, %r8",
     ));
     insns.push((
-        Inst::shift_r(OperandSize::Size64, ShiftKind::ShiftLeft, Some(3), w_rbx),
+        Inst::shift_r(
+            OperandSize::Size64,
+            ShiftKind::ShiftLeft,
+            Imm8Gpr::new(Imm8Reg::Imm8 { imm: 3 }).unwrap(),
+            w_rbx,
+        ),
         "48C1E303",
         "shlq    $3, %rbx, %rbx",
     ));
     insns.push((
-        Inst::shift_r(OperandSize::Size64, ShiftKind::ShiftLeft, Some(63), w_r13),
+        Inst::shift_r(
+            OperandSize::Size64,
+            ShiftKind::ShiftLeft,
+            Imm8Gpr::new(Imm8Reg::Imm8 { imm: 63 }).unwrap(),
+            w_r13,
+        ),
         "49C1E53F",
         "shlq    $63, %r13, %r13",
     ));
@@ -2861,7 +2954,7 @@ fn test_x64_emit() {
         Inst::shift_r(
             OperandSize::Size32,
             ShiftKind::ShiftRightLogical,
-            None,
+            Imm8Gpr::new(Imm8Reg::Reg { reg: regs::rcx() }).unwrap(),
             w_rdi,
         ),
         "D3EF",
@@ -2871,7 +2964,7 @@ fn test_x64_emit() {
         Inst::shift_r(
             OperandSize::Size32,
             ShiftKind::ShiftRightLogical,
-            Some(2),
+            Imm8Gpr::new(Imm8Reg::Imm8 { imm: 2 }).unwrap(),
             w_r8,
         ),
         "41C1E802",
@@ -2881,7 +2974,7 @@ fn test_x64_emit() {
         Inst::shift_r(
             OperandSize::Size32,
             ShiftKind::ShiftRightLogical,
-            Some(31),
+            Imm8Gpr::new(Imm8Reg::Imm8 { imm: 31 }).unwrap(),
             w_r13,
         ),
         "41C1ED1F",
@@ -2891,7 +2984,7 @@ fn test_x64_emit() {
         Inst::shift_r(
             OperandSize::Size64,
             ShiftKind::ShiftRightLogical,
-            None,
+            Imm8Gpr::new(Imm8Reg::Reg { reg: regs::rcx() }).unwrap(),
             w_rdi,
         ),
         "48D3EF",
@@ -2901,7 +2994,7 @@ fn test_x64_emit() {
         Inst::shift_r(
             OperandSize::Size64,
             ShiftKind::ShiftRightLogical,
-            Some(2),
+            Imm8Gpr::new(Imm8Reg::Imm8 { imm: 2 }).unwrap(),
             w_r8,
         ),
         "49C1E802",
@@ -2911,7 +3004,7 @@ fn test_x64_emit() {
         Inst::shift_r(
             OperandSize::Size64,
             ShiftKind::ShiftRightLogical,
-            Some(63),
+            Imm8Gpr::new(Imm8Reg::Imm8 { imm: 63 }).unwrap(),
             w_r13,
         ),
         "49C1ED3F",
@@ -2921,7 +3014,7 @@ fn test_x64_emit() {
         Inst::shift_r(
             OperandSize::Size32,
             ShiftKind::ShiftRightArithmetic,
-            None,
+            Imm8Gpr::new(Imm8Reg::Reg { reg: regs::rcx() }).unwrap(),
             w_rdi,
         ),
         "D3FF",
@@ -2931,7 +3024,7 @@ fn test_x64_emit() {
         Inst::shift_r(
             OperandSize::Size32,
             ShiftKind::ShiftRightArithmetic,
-            Some(2),
+            Imm8Gpr::new(Imm8Reg::Imm8 { imm: 2 }).unwrap(),
             w_r8,
         ),
         "41C1F802",
@@ -2941,7 +3034,7 @@ fn test_x64_emit() {
         Inst::shift_r(
             OperandSize::Size32,
             ShiftKind::ShiftRightArithmetic,
-            Some(31),
+            Imm8Gpr::new(Imm8Reg::Imm8 { imm: 31 }).unwrap(),
             w_r13,
         ),
         "41C1FD1F",
@@ -2951,7 +3044,7 @@ fn test_x64_emit() {
         Inst::shift_r(
             OperandSize::Size64,
             ShiftKind::ShiftRightArithmetic,
-            None,
+            Imm8Gpr::new(Imm8Reg::Reg { reg: regs::rcx() }).unwrap(),
             w_rdi,
         ),
         "48D3FF",
@@ -2961,7 +3054,7 @@ fn test_x64_emit() {
         Inst::shift_r(
             OperandSize::Size64,
             ShiftKind::ShiftRightArithmetic,
-            Some(2),
+            Imm8Gpr::new(Imm8Reg::Imm8 { imm: 2 }).unwrap(),
             w_r8,
         ),
         "49C1F802",
@@ -2971,54 +3064,99 @@ fn test_x64_emit() {
         Inst::shift_r(
             OperandSize::Size64,
             ShiftKind::ShiftRightArithmetic,
-            Some(63),
+            Imm8Gpr::new(Imm8Reg::Imm8 { imm: 63 }).unwrap(),
             w_r13,
         ),
         "49C1FD3F",
         "sarq    $63, %r13, %r13",
     ));
     insns.push((
-        Inst::shift_r(OperandSize::Size64, ShiftKind::RotateLeft, None, w_r8),
+        Inst::shift_r(
+            OperandSize::Size64,
+            ShiftKind::RotateLeft,
+            Imm8Gpr::new(Imm8Reg::Reg { reg: regs::rcx() }).unwrap(),
+            w_r8,
+        ),
         "49D3C0",
         "rolq    %cl, %r8, %r8",
     ));
     insns.push((
-        Inst::shift_r(OperandSize::Size32, ShiftKind::RotateLeft, Some(3), w_r9),
+        Inst::shift_r(
+            OperandSize::Size32,
+            ShiftKind::RotateLeft,
+            Imm8Gpr::new(Imm8Reg::Imm8 { imm: 3 }).unwrap(),
+            w_r9,
+        ),
         "41C1C103",
         "roll    $3, %r9d, %r9d",
     ));
     insns.push((
-        Inst::shift_r(OperandSize::Size32, ShiftKind::RotateRight, None, w_rsi),
+        Inst::shift_r(
+            OperandSize::Size32,
+            ShiftKind::RotateRight,
+            Imm8Gpr::new(Imm8Reg::Reg { reg: regs::rcx() }).unwrap(),
+            w_rsi,
+        ),
         "D3CE",
         "rorl    %cl, %esi, %esi",
     ));
     insns.push((
-        Inst::shift_r(OperandSize::Size64, ShiftKind::RotateRight, Some(5), w_r15),
+        Inst::shift_r(
+            OperandSize::Size64,
+            ShiftKind::RotateRight,
+            Imm8Gpr::new(Imm8Reg::Imm8 { imm: 5 }).unwrap(),
+            w_r15,
+        ),
         "49C1CF05",
         "rorq    $5, %r15, %r15",
     ));
     insns.push((
-        Inst::shift_r(OperandSize::Size8, ShiftKind::RotateRight, None, w_rsi),
+        Inst::shift_r(
+            OperandSize::Size8,
+            ShiftKind::RotateRight,
+            Imm8Gpr::new(Imm8Reg::Reg { reg: regs::rcx() }).unwrap(),
+            w_rsi,
+        ),
         "40D2CE",
         "rorb    %cl, %sil, %sil",
     ));
     insns.push((
-        Inst::shift_r(OperandSize::Size8, ShiftKind::RotateRight, None, w_rax),
+        Inst::shift_r(
+            OperandSize::Size8,
+            ShiftKind::RotateRight,
+            Imm8Gpr::new(Imm8Reg::Reg { reg: regs::rcx() }).unwrap(),
+            w_rax,
+        ),
         "D2C8",
         "rorb    %cl, %al, %al",
     ));
     insns.push((
-        Inst::shift_r(OperandSize::Size8, ShiftKind::RotateRight, Some(5), w_r15),
+        Inst::shift_r(
+            OperandSize::Size8,
+            ShiftKind::RotateRight,
+            Imm8Gpr::new(Imm8Reg::Imm8 { imm: 5 }).unwrap(),
+            w_r15,
+        ),
         "41C0CF05",
         "rorb    $5, %r15b, %r15b",
     ));
     insns.push((
-        Inst::shift_r(OperandSize::Size16, ShiftKind::RotateRight, None, w_rsi),
+        Inst::shift_r(
+            OperandSize::Size16,
+            ShiftKind::RotateRight,
+            Imm8Gpr::new(Imm8Reg::Reg { reg: regs::rcx() }).unwrap(),
+            w_rsi,
+        ),
         "66D3CE",
         "rorw    %cl, %si, %si",
     ));
     insns.push((
-        Inst::shift_r(OperandSize::Size16, ShiftKind::RotateRight, Some(5), w_r15),
+        Inst::shift_r(
+            OperandSize::Size16,
+            ShiftKind::RotateRight,
+            Imm8Gpr::new(Imm8Reg::Imm8 { imm: 5 }).unwrap(),
+            w_r15,
+        ),
         "6641C1CF05",
         "rorw    $5, %r15w, %r15w",
     ));
diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs
index 9d7f1bd0f4..09b5993298 100644
--- a/cranelift/codegen/src/isa/x64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/x64/inst/mod.rs
@@ -131,7 +131,9 @@ impl Inst {
             | Inst::XmmToGpr { op, .. }
             | Inst::XmmUnaryRmR { op, .. } => smallvec![op.available_from()],
 
-            Inst::XmmUnaryRmREvex { op, .. } | Inst::XmmRmREvex { op, .. } => op.available_from(),
+            Inst::XmmUnaryRmREvex { op, .. }
+            | Inst::XmmRmREvex { op, .. }
+            | Inst::XmmRmREvex3 { op, .. } => op.available_from(),
 
             Inst::XmmRmRVex { op, .. } => op.available_from(),
         }
@@ -195,47 +197,55 @@ impl Inst {
         }
     }
 
-    pub(crate) fn div(size: OperandSize, signed: bool, divisor: RegMem) -> Inst {
+    pub(crate) fn div(
+        size: OperandSize,
+        signed: bool,
+        divisor: RegMem,
+        dividend_lo: Gpr,
+        dividend_hi: Gpr,
+        dst_quotient: WritableGpr,
+        dst_remainder: WritableGpr,
+    ) -> Inst {
         divisor.assert_regclass_is(RegClass::Int);
         Inst::Div {
             size,
             signed,
             divisor: GprMem::new(divisor).unwrap(),
-            dividend_lo: Gpr::new(regs::rax()).unwrap(),
-            dividend_hi: Gpr::new(regs::rdx()).unwrap(),
-            dst_quotient: WritableGpr::from_reg(Gpr::new(regs::rax()).unwrap()),
-            dst_remainder: Writable::from_reg(Gpr::new(regs::rdx()).unwrap()),
+            dividend_lo,
+            dividend_hi,
+            dst_quotient,
+            dst_remainder,
         }
     }
 
     pub(crate) fn checked_div_or_rem_seq(
         kind: DivOrRemKind,
         size: OperandSize,
-        divisor: Writable<Reg>,
+        divisor: Reg,
+        dividend_lo: Gpr,
+        dividend_hi: Gpr,
+        dst_quotient: WritableGpr,
+        dst_remainder: WritableGpr,
         tmp: Option<Writable<Reg>>,
     ) -> Inst {
-        debug_assert!(divisor.to_reg().class() == RegClass::Int);
+        debug_assert!(divisor.class() == RegClass::Int);
         debug_assert!(tmp
             .map(|tmp| tmp.to_reg().class() == RegClass::Int)
             .unwrap_or(true));
         Inst::CheckedDivOrRemSeq {
             kind,
             size,
-            divisor: WritableGpr::from_writable_reg(divisor).unwrap(),
-            dividend_lo: Gpr::new(regs::rax()).unwrap(),
-            dividend_hi: Gpr::new(regs::rdx()).unwrap(),
-            dst_quotient: Writable::from_reg(Gpr::new(regs::rax()).unwrap()),
-            dst_remainder: Writable::from_reg(Gpr::new(regs::rdx()).unwrap()),
+            divisor: Gpr::new(divisor).unwrap(),
+            dividend_lo,
+            dividend_hi,
+            dst_quotient,
+            dst_remainder,
             tmp: tmp.map(|tmp| WritableGpr::from_writable_reg(tmp).unwrap()),
         }
     }
 
-    pub(crate) fn sign_extend_data(size: OperandSize) -> Inst {
-        Inst::SignExtendData {
-            size,
-            src: Gpr::new(regs::rax()).unwrap(),
-            dst: Writable::from_reg(Gpr::new(regs::rdx()).unwrap()),
-        }
+    pub(crate) fn sign_extend_data(size: OperandSize, src: Gpr, dst: WritableGpr) -> Inst {
+        Inst::SignExtendData { size, src, dst }
     }
 
     pub(crate) fn imm(dst_size: OperandSize, simm64: u64, dst: Writable<Reg>) -> Inst {
@@ -415,24 +425,18 @@ impl Inst {
     pub(crate) fn shift_r(
         size: OperandSize,
         kind: ShiftKind,
-        num_bits: Option<u8>,
+        num_bits: Imm8Gpr,
         dst: Writable<Reg>,
     ) -> Inst {
-        debug_assert!(if let Some(num_bits) = num_bits {
-            num_bits < size.to_bits()
-        } else {
-            true
-        });
+        if let Imm8Reg::Imm8 { imm: num_bits } = num_bits.clone().to_imm8_reg() {
+            debug_assert!(num_bits < size.to_bits());
+        }
         debug_assert!(dst.to_reg().class() == RegClass::Int);
         Inst::ShiftR {
             size,
             kind,
             src: Gpr::new(dst.to_reg()).unwrap(),
-            num_bits: Imm8Gpr::new(match num_bits {
-                Some(imm) => Imm8Reg::Imm8 { imm },
-                None => Imm8Reg::Reg { reg: regs::rcx() },
-            })
-            .unwrap(),
+            num_bits,
             dst: WritableGpr::from_writable_reg(dst).unwrap(),
         }
     }
@@ -781,8 +785,11 @@ impl PrettyPrint for Inst {
                 let dividend_lo = pretty_print_reg(dividend_lo.to_reg(), size.to_bytes(), allocs);
                 let dst_quotient =
                     pretty_print_reg(dst_quotient.to_reg().to_reg(), size.to_bytes(), allocs);
-                let dst_remainder =
-                    pretty_print_reg(dst_remainder.to_reg().to_reg(), size.to_bytes(), allocs);
+                let dst_remainder = if size.to_bits() > 8 {
+                    pretty_print_reg(dst_remainder.to_reg().to_reg(), size.to_bytes(), allocs)
+                } else {
+                    "(none)".to_string()
+                };
                 let dividend_hi = if size.to_bits() > 8 {
                     pretty_print_reg(dividend_hi.to_reg(), size.to_bytes(), allocs)
                 } else {
@@ -842,7 +849,7 @@ impl PrettyPrint for Inst {
             } => {
                 let dividend_lo = pretty_print_reg(dividend_lo.to_reg(), size.to_bytes(), allocs);
                 let dividend_hi = pretty_print_reg(dividend_hi.to_reg(), size.to_bytes(), allocs);
-                let divisor = pretty_print_reg(divisor.to_reg().to_reg(), size.to_bytes(), allocs);
+                let divisor = pretty_print_reg(divisor.to_reg(), size.to_bytes(), allocs);
                 let dst_quotient =
                     pretty_print_reg(dst_quotient.to_reg().to_reg(), size.to_bytes(), allocs);
                 let dst_remainder =
@@ -949,12 +956,34 @@ impl PrettyPrint for Inst {
                 dst,
                 ..
             } => {
-                let src2 = pretty_print_reg(src2.to_reg(), 8, allocs);
                 let dst = pretty_print_reg(dst.to_reg().to_reg(), 8, allocs);
+                let src2 = pretty_print_reg(src2.to_reg(), 8, allocs);
                 let src1 = src1.pretty_print(8, allocs);
                 format!("{} {}, {}, {}", ljustify(op.to_string()), src1, src2, dst)
             }
 
+            Inst::XmmRmREvex3 {
+                op,
+                src1,
+                src2,
+                src3,
+                dst,
+                ..
+            } => {
+                let dst = pretty_print_reg(dst.to_reg().to_reg(), 8, allocs);
+                let src2 = pretty_print_reg(src2.to_reg(), 8, allocs);
+                let src3 = pretty_print_reg(src3.to_reg(), 8, allocs);
+                let src1 = src1.pretty_print(8, allocs);
+                format!(
+                    "{} {}, {}, {}, {}",
+                    ljustify(op.to_string()),
+                    src1,
+                    src2,
+                    src3,
+                    dst
+                )
+            }
+
             Inst::XmmMinMaxSeq {
                 lhs,
                 rhs,
@@ -1084,7 +1113,7 @@ impl PrettyPrint for Inst {
                 tmp_gpr2,
                 ..
             } => {
-                let src = pretty_print_reg(src.to_reg().to_reg(), 8, allocs);
+                let src = pretty_print_reg(src.to_reg(), 8, allocs);
                 let dst = pretty_print_reg(dst.to_reg().to_reg(), dst_size.to_bytes(), allocs);
                 let tmp_gpr1 = pretty_print_reg(tmp_gpr1.to_reg().to_reg(), 8, allocs);
                 let tmp_gpr2 = pretty_print_reg(tmp_gpr2.to_reg().to_reg(), 8, allocs);
@@ -1114,7 +1143,7 @@ impl PrettyPrint for Inst {
                 tmp_gpr,
                 is_saturating,
             } => {
-                let src = pretty_print_reg(src.to_reg().to_reg(), src_size.to_bytes(), allocs);
+                let src = pretty_print_reg(src.to_reg(), src_size.to_bytes(), allocs);
                 let dst = pretty_print_reg(dst.to_reg().to_reg(), dst_size.to_bytes(), allocs);
                 let tmp_gpr = pretty_print_reg(tmp_gpr.to_reg().to_reg(), 8, allocs);
                 let tmp_xmm = pretty_print_reg(tmp_xmm.to_reg().to_reg(), 8, allocs);
@@ -1142,7 +1171,7 @@ impl PrettyPrint for Inst {
                 tmp_xmm,
                 is_saturating,
             } => {
-                let src = pretty_print_reg(src.to_reg().to_reg(), src_size.to_bytes(), allocs);
+                let src = pretty_print_reg(src.to_reg(), src_size.to_bytes(), allocs);
                 let dst = pretty_print_reg(dst.to_reg().to_reg(), dst_size.to_bytes(), allocs);
                 let tmp_gpr = pretty_print_reg(tmp_gpr.to_reg().to_reg(), 8, allocs);
                 let tmp_xmm = pretty_print_reg(tmp_xmm.to_reg().to_reg(), 8, allocs);
@@ -1424,9 +1453,19 @@ impl PrettyPrint for Inst {
                 not_taken.to_string()
             ),
 
-            Inst::JmpTableSeq { idx, .. } => {
+            Inst::JmpTableSeq {
+                idx, tmp1, tmp2, ..
+            } => {
                 let idx = pretty_print_reg(*idx, 8, allocs);
-                format!("{} {}", ljustify("br_table".into()), idx)
+                let tmp1 = pretty_print_reg(tmp1.to_reg(), 8, allocs);
+                let tmp2 = pretty_print_reg(tmp2.to_reg(), 8, allocs);
+                format!(
+                    "{} {}, {}, {}",
+                    ljustify("br_table".into()),
+                    idx,
+                    tmp1,
+                    tmp2
+                )
             }
 
             Inst::JmpUnknown { target } => {
@@ -1605,8 +1644,8 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
         } => {
             collector.reg_fixed_use(dividend_lo.to_reg(), regs::rax());
             collector.reg_fixed_def(dst_quotient.to_writable_reg(), regs::rax());
-            collector.reg_fixed_def(dst_remainder.to_writable_reg(), regs::rdx());
             if size.to_bits() > 8 {
+                collector.reg_fixed_def(dst_remainder.to_writable_reg(), regs::rdx());
                 collector.reg_fixed_use(dividend_hi.to_reg(), regs::rdx());
             }
             divisor.get_operands(collector);
@@ -1634,10 +1673,12 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
         } => {
             collector.reg_fixed_use(dividend_lo.to_reg(), regs::rax());
             collector.reg_fixed_use(dividend_hi.to_reg(), regs::rdx());
-            collector.reg_mod(divisor.to_writable_reg());
+            collector.reg_use(divisor.to_reg());
             collector.reg_fixed_def(dst_quotient.to_writable_reg(), regs::rax());
             collector.reg_fixed_def(dst_remainder.to_writable_reg(), regs::rdx());
             if let Some(tmp) = tmp {
+                // Early def so that the temporary register does not
+                // conflict with inputs or outputs.
                 collector.reg_early_def(tmp.to_writable_reg());
             }
         }
@@ -1718,13 +1759,25 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
             dst,
             ..
         } => {
-            match *op {
-                Avx512Opcode::Vpermi2b => collector.reg_mod(dst.to_writable_reg()),
-                _ => collector.reg_def(dst.to_writable_reg()),
-            }
+            assert_ne!(*op, Avx512Opcode::Vpermi2b);
+            collector.reg_def(dst.to_writable_reg());
             collector.reg_use(src2.to_reg());
             src1.get_operands(collector);
         }
+        Inst::XmmRmREvex3 {
+            op,
+            src1,
+            src2,
+            src3,
+            dst,
+            ..
+        } => {
+            assert_eq!(*op, Avx512Opcode::Vpermi2b);
+            collector.reg_reuse_def(dst.to_writable_reg(), 2); // Reuse `src3`.
+            collector.reg_use(src2.to_reg());
+            collector.reg_use(src3.to_reg());
+            src1.get_operands(collector);
+        }
         Inst::XmmRmRImm {
             op,
             src1,
@@ -1795,7 +1848,7 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
             tmp_gpr2,
             ..
         } => {
-            collector.reg_mod(src.to_writable_reg());
+            collector.reg_use(src.to_reg());
             collector.reg_def(dst.to_writable_reg());
             collector.reg_early_def(tmp_gpr1.to_writable_reg());
             collector.reg_early_def(tmp_gpr2.to_writable_reg());
@@ -1814,7 +1867,7 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
             tmp_xmm,
             ..
         } => {
-            collector.reg_mod(src.to_writable_reg());
+            collector.reg_use(src.to_reg());
             collector.reg_def(dst.to_writable_reg());
             collector.reg_early_def(tmp_gpr.to_writable_reg());
             collector.reg_early_def(tmp_xmm.to_writable_reg());
@@ -1911,7 +1964,7 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
             ..
         } => {
             collector.reg_use(*idx);
-            collector.reg_mod(*tmp1);
+            collector.reg_early_def(*tmp1);
             collector.reg_early_def(*tmp2);
         }
 
diff --git a/cranelift/codegen/src/isa/x64/lower/isle.rs b/cranelift/codegen/src/isa/x64/lower/isle.rs
index d76e72f88c..2148d4f400 100644
--- a/cranelift/codegen/src/isa/x64/lower/isle.rs
+++ b/cranelift/codegen/src/isa/x64/lower/isle.rs
@@ -955,40 +955,34 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> {
         let is_div = kind.is_div();
         let size = OperandSize::from_ty(ty);
 
-        self.lower_ctx.emit(MInst::gen_move(
-            Writable::from_reg(regs::rax()),
-            dividend.to_reg(),
-            ty,
-        ));
+        let dst_quotient = self.lower_ctx.alloc_tmp(types::I64).only_reg().unwrap();
+        let dst_remainder = self.lower_ctx.alloc_tmp(types::I64).only_reg().unwrap();
 
         // Always do explicit checks for `srem`: otherwise, INT_MIN % -1 is not handled properly.
         if self.flags.avoid_div_traps() || *kind == DivOrRemKind::SignedRem {
             // A vcode meta-instruction is used to lower the inline checks, since they embed
             // pc-relative offsets that must not change, thus requiring regalloc to not
             // interfere by introducing spills and reloads.
-            //
-            // Note it keeps the result in $rax (for divide) or $rdx (for rem), so that
-            // regalloc is aware of the coalescing opportunity between rax/rdx and the
-            // destination register.
-            let divisor_copy = self.lower_ctx.alloc_tmp(types::I64).only_reg().unwrap();
-            self.lower_ctx
-                .emit(MInst::gen_move(divisor_copy, divisor.to_reg(), types::I64));
-
             let tmp = if *kind == DivOrRemKind::SignedDiv && size == OperandSize::Size64 {
                 Some(self.lower_ctx.alloc_tmp(types::I64).only_reg().unwrap())
             } else {
                 None
             };
-            // TODO use xor
-            self.lower_ctx.emit(MInst::imm(
+            let dividend_hi = self.lower_ctx.alloc_tmp(types::I64).only_reg().unwrap();
+            self.lower_ctx.emit(MInst::alu_rmi_r(
                 OperandSize::Size32,
-                0,
-                Writable::from_reg(regs::rdx()),
+                AluRmiROpcode::Xor,
+                RegMemImm::reg(dividend_hi.to_reg()),
+                dividend_hi,
             ));
             self.lower_ctx.emit(MInst::checked_div_or_rem_seq(
                 kind.clone(),
                 size,
-                divisor_copy,
+                divisor.to_reg(),
+                Gpr::new(dividend.to_reg()).unwrap(),
+                Gpr::new(dividend_hi.to_reg()).unwrap(),
+                WritableGpr::from_reg(Gpr::new(dst_quotient.to_reg()).unwrap()),
+                WritableGpr::from_reg(Gpr::new(dst_remainder.to_reg()).unwrap()),
                 tmp,
             ));
         } else {
@@ -997,51 +991,89 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> {
             // divisor into a register instead.
             let divisor = RegMem::reg(divisor.to_reg());
 
+            let dividend_hi = self.lower_ctx.alloc_tmp(types::I64).only_reg().unwrap();
+
             // Fill in the high parts:
-            if kind.is_signed() {
-                // sign-extend the sign-bit of al into ah for size 1, or rax into rdx, for
-                // signed opcodes.
-                self.lower_ctx.emit(MInst::sign_extend_data(size));
+            let dividend_lo = if kind.is_signed() && ty == types::I8 {
+                let dividend_lo = self.lower_ctx.alloc_tmp(types::I64).only_reg().unwrap();
+                // 8-bit div takes its entire dividend in the `lo` reg alone.
+                self.lower_ctx.emit(MInst::sign_extend_data(
+                    size,
+                    Gpr::new(dividend.to_reg()).unwrap(),
+                    WritableGpr::from_reg(Gpr::new(dividend_lo.to_reg()).unwrap()),
+                ));
+                // `dividend_hi` is not used by the Div below, so we
+                // don't def it here.
+
+                dividend_lo.to_reg()
+            } else if kind.is_signed() {
+                // 16-bit and wider div takes its dividend in hi:lo,
+                // with half in each (64:64, 32:32 or 16:16).
+                self.lower_ctx.emit(MInst::sign_extend_data(
+                    size,
+                    Gpr::new(dividend.to_reg()).unwrap(),
+                    WritableGpr::from_reg(Gpr::new(dividend_hi.to_reg()).unwrap()),
+                ));
+
+                dividend.to_reg()
             } else if ty == types::I8 {
+                let dividend_lo = self.lower_ctx.alloc_tmp(types::I64).only_reg().unwrap();
                 self.lower_ctx.emit(MInst::movzx_rm_r(
                     ExtMode::BL,
-                    RegMem::reg(regs::rax()),
-                    Writable::from_reg(regs::rax()),
+                    RegMem::reg(dividend.to_reg()),
+                    dividend_lo,
                 ));
+
+                dividend_lo.to_reg()
             } else {
                 // zero for unsigned opcodes.
-                self.lower_ctx.emit(MInst::imm(
-                    OperandSize::Size64,
-                    0,
-                    Writable::from_reg(regs::rdx()),
-                ));
-            }
+                self.lower_ctx
+                    .emit(MInst::imm(OperandSize::Size64, 0, dividend_hi));
+
+                dividend.to_reg()
+            };
 
             // Emit the actual idiv.
-            self.lower_ctx
-                .emit(MInst::div(size, kind.is_signed(), divisor));
+            self.lower_ctx.emit(MInst::div(
+                size,
+                kind.is_signed(),
+                divisor,
+                Gpr::new(dividend_lo).unwrap(),
+                Gpr::new(dividend_hi.to_reg()).unwrap(),
+                WritableGpr::from_reg(Gpr::new(dst_quotient.to_reg()).unwrap()),
+                WritableGpr::from_reg(Gpr::new(dst_remainder.to_reg()).unwrap()),
+            ));
         }
 
         // Move the result back into the destination reg.
         if is_div {
             // The quotient is in rax.
-            self.lower_ctx
-                .emit(MInst::gen_move(dst.to_writable_reg(), regs::rax(), ty));
+            self.lower_ctx.emit(MInst::gen_move(
+                dst.to_writable_reg(),
+                dst_quotient.to_reg(),
+                ty,
+            ));
         } else {
             if size == OperandSize::Size8 {
                 // The remainder is in AH. Right-shift by 8 bits then move from rax.
                 self.lower_ctx.emit(MInst::shift_r(
                     OperandSize::Size64,
                     ShiftKind::ShiftRightLogical,
-                    Some(8),
-                    Writable::from_reg(regs::rax()),
+                    Imm8Gpr::new(Imm8Reg::Imm8 { imm: 8 }).unwrap(),
+                    dst_quotient,
+                ));
+                self.lower_ctx.emit(MInst::gen_move(
+                    dst.to_writable_reg(),
+                    dst_quotient.to_reg(),
+                    ty,
                 ));
-                self.lower_ctx
-                    .emit(MInst::gen_move(dst.to_writable_reg(), regs::rax(), ty));
             } else {
                 // The remainder is in rdx.
-                self.lower_ctx
-                    .emit(MInst::gen_move(dst.to_writable_reg(), regs::rdx(), ty));
+                self.lower_ctx.emit(MInst::gen_move(
+                    dst.to_writable_reg(),
+                    dst_remainder.to_reg(),
+                    ty,
+                ));
             }
         }
     }
diff --git a/cranelift/codegen/src/isa/x64/mod.rs b/cranelift/codegen/src/isa/x64/mod.rs
index c6093e5b71..303b90d3ab 100644
--- a/cranelift/codegen/src/isa/x64/mod.rs
+++ b/cranelift/codegen/src/isa/x64/mod.rs
@@ -427,37 +427,34 @@ mod test {
 
         // 00000000  55                push rbp
         // 00000001  4889E5            mov rbp,rsp
-        // 00000004  41B900000000      mov r9d,0x0
-        // 0000000A  83FF02            cmp edi,byte +0x2
-        // 0000000D  0F8320000000      jnc near 0x33
-        // 00000013  8BF7              mov esi,edi
-        // 00000015  490F43F1          cmovnc rsi,r9
-        // 00000019  4C8D0D0B000000    lea r9,[rel 0x2b]
-        // 00000020  496374B100        movsxd rsi,dword [r9+rsi*4+0x0]
-        // 00000025  4901F1            add r9,rsi
-        // 00000028  41FFE1            jmp r9
-        // 0000002B  1200              adc al,[rax]
-        // 0000002D  0000              add [rax],al
-        // 0000002F  1C00              sbb al,0x0
-        // 00000031  0000              add [rax],al
-        // 00000033  B803000000        mov eax,0x3
-        // 00000038  4889EC            mov rsp,rbp
-        // 0000003B  5D                pop rbp
-        // 0000003C  C3                ret
-        // 0000003D  B801000000        mov eax,0x1
-        // 00000042  4889EC            mov rsp,rbp
-        // 00000045  5D                pop rbp
-        // 00000046  C3                ret
-        // 00000047  B802000000        mov eax,0x2
-        // 0000004C  4889EC            mov rsp,rbp
-        // 0000004F  5D                pop rbp
-        // 00000050  C3                ret
+        // 00000004  83FF02            cmp edi,byte +0x2
+        // 00000007  0F8327000000      jnc near 0x34
+        // 0000000D  448BDF            mov r11d,edi
+        // 00000010  41BA00000000      mov r10d,0x0
+        // 00000016  4D0F43DA          cmovnc r11,r10
+        // 0000001A  4C8D150B000000    lea r10,[rel 0x2c]
+        // 00000021  4F635C9A00        movsxd r11,dword [r10+r11*4+0x0]
+        // 00000026  4D01DA            add r10,r11
+        // 00000029  41FFE2            jmp r10
+        // 0000002C  120000001C000000  (jumptable data)
+        // 00000034  B803000000        mov eax,0x3
+        // 00000039  4889EC            mov rsp,rbp
+        // 0000003C  5D                pop rbp
+        // 0000003D  C3                ret
+        // 0000003E  B801000000        mov eax,0x1
+        // 00000043  4889EC            mov rsp,rbp
+        // 00000046  5D                pop rbp
+        // 00000047  C3                ret
+        // 00000048  B802000000        mov eax,0x2
+        // 0000004D  4889EC            mov rsp,rbp
+        // 00000050  5D                pop rbp
+        // 00000051  C3                ret
 
         let golden = vec![
-            85, 72, 137, 229, 65, 185, 0, 0, 0, 0, 131, 255, 2, 15, 131, 32, 0, 0, 0, 139, 247, 73,
-            15, 67, 241, 76, 141, 13, 11, 0, 0, 0, 73, 99, 116, 177, 0, 73, 1, 241, 65, 255, 225,
-            18, 0, 0, 0, 28, 0, 0, 0, 184, 3, 0, 0, 0, 72, 137, 236, 93, 195, 184, 1, 0, 0, 0, 72,
-            137, 236, 93, 195, 184, 2, 0, 0, 0, 72, 137, 236, 93, 195,
+            85, 72, 137, 229, 131, 255, 2, 15, 131, 39, 0, 0, 0, 68, 139, 223, 65, 186, 0, 0, 0, 0,
+            77, 15, 67, 218, 76, 141, 21, 11, 0, 0, 0, 79, 99, 92, 154, 0, 77, 1, 218, 65, 255,
+            226, 18, 0, 0, 0, 28, 0, 0, 0, 184, 3, 0, 0, 0, 72, 137, 236, 93, 195, 184, 1, 0, 0, 0,
+            72, 137, 236, 93, 195, 184, 2, 0, 0, 0, 72, 137, 236, 93, 195,
         ];
 
         assert_eq!(code, &golden[..]);
diff --git a/cranelift/filetests/filetests/isa/x64/branches.clif b/cranelift/filetests/filetests/isa/x64/branches.clif
index ecb8800842..9bdd14e2b7 100644
--- a/cranelift/filetests/filetests/isa/x64/branches.clif
+++ b/cranelift/filetests/filetests/isa/x64/branches.clif
@@ -205,9 +205,8 @@ block2:
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
-;   movl    $0, %r8d
 ;   cmpl    $2, %edi
-;   br_table %rdi
+;   br_table %rdi, %r9, %r10
 ; block1:
 ;   jmp     label3
 ; block2:
diff --git a/cranelift/filetests/filetests/isa/x64/div-checks.clif b/cranelift/filetests/filetests/isa/x64/div-checks.clif
index 132f939818..8361e2880d 100644
--- a/cranelift/filetests/filetests/isa/x64/div-checks.clif
+++ b/cranelift/filetests/filetests/isa/x64/div-checks.clif
@@ -10,8 +10,9 @@ target x86_64
 function %i8(i8, i8) -> i8 {
 block0(v0: i8, v1: i8):
   v2 = srem.i8 v0, v1
-; check:  movq    %rdi, %rax
-; nextln: movl    $$0, %edx
+; check:  xorl    %r11d, %r11d, %r11d
+; nextln: movq    %rdi, %rax
+; nextln: movq    %r11, %rdx
 ; nextln: srem_seq %al, %dl, %sil, %al, %dl, tmp=(none)
 ; nextln: shrq    $$8, %rax, %rax
 
@@ -21,8 +22,9 @@ block0(v0: i8, v1: i8):
 function %i16(i16, i16) -> i16 {
 block0(v0: i16, v1: i16):
   v2 = srem.i16 v0, v1
-; check:  movq    %rdi, %rax
-; nextln: movl    $$0, %edx
+; check:  xorl    %r11d, %r11d, %r11d
+; nextln: movq    %rdi, %rax
+; nextln: movq    %r11, %rdx
 ; nextln: srem_seq %ax, %dx, %si, %ax, %dx, tmp=(none)
 ; nextln: movq    %rdx, %rax
 
@@ -32,8 +34,9 @@ block0(v0: i16, v1: i16):
 function %i32(i32, i32) -> i32 {
 block0(v0: i32, v1: i32):
   v2 = srem.i32 v0, v1
-; check:  movq    %rdi, %rax
-; nextln: movl    $$0, %edx
+; check:  xorl    %r11d, %r11d, %r11d
+; nextln: movq    %rdi, %rax
+; nextln: movq    %r11, %rdx
 ; nextln: srem_seq %eax, %edx, %esi, %eax, %edx, tmp=(none)
 ; nextln: movq    %rdx, %rax
 
@@ -43,8 +46,9 @@ block0(v0: i32, v1: i32):
 function %i64(i64, i64) -> i64 {
 block0(v0: i64, v1: i64):
   v2 = srem.i64 v0, v1
-; check:  movq    %rdi, %rax
-; nextln: movl    $$0, %edx
+; check:  xorl    %r11d, %r11d, %r11d
+; nextln: movq    %rdi, %rax
+; nextln: movq    %r11, %rdx
 ; nextln: srem_seq %rax, %rdx, %rsi, %rax, %rdx, tmp=(none)
 ; nextln: movq    %rdx, %rax
 
diff --git a/cranelift/filetests/filetests/isa/x64/fcvt.clif b/cranelift/filetests/filetests/isa/x64/fcvt.clif
index 09c6093c54..3429078f59 100644
--- a/cranelift/filetests/filetests/isa/x64/fcvt.clif
+++ b/cranelift/filetests/filetests/isa/x64/fcvt.clif
@@ -146,16 +146,16 @@ block0(v0: i8, v1: i16, v2: i32, v3: i64):
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
-;   movzbq  %dil, %rax
-;   cvtsi2ss %rax, %xmm0
-;   movzwq  %si, %rax
-;   cvtsi2ss %rax, %xmm6
-;   movl    %edx, %eax
-;   cvtsi2ss %rax, %xmm7
-;   u64_to_f32_seq %rcx, %xmm4, %r8, %rdx
+;   movzbq  %dil, %rdi
+;   cvtsi2ss %rdi, %xmm0
+;   movzwq  %si, %rdi
+;   cvtsi2ss %rdi, %xmm5
+;   movl    %edx, %edi
+;   cvtsi2ss %rdi, %xmm6
+;   u64_to_f32_seq %rcx, %xmm2, %rdi, %rax
+;   addss   %xmm0, %xmm5, %xmm0
 ;   addss   %xmm0, %xmm6, %xmm0
-;   addss   %xmm0, %xmm7, %xmm0
-;   addss   %xmm0, %xmm4, %xmm0
+;   addss   %xmm0, %xmm2, %xmm0
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
@@ -209,7 +209,7 @@ block0(v0: f32):
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
-;   cvt_float32_to_uint32_seq %xmm0, %eax, %r10, %xmm6
+;   cvt_float32_to_uint32_seq %xmm0, %eax, %r8, %xmm4
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
@@ -223,7 +223,7 @@ block0(v0: f32):
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
-;   cvt_float32_to_uint64_seq %xmm0, %rax, %r10, %xmm6
+;   cvt_float32_to_uint64_seq %xmm0, %rax, %r8, %xmm4
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
@@ -237,7 +237,7 @@ block0(v0: f64):
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
-;   cvt_float64_to_uint32_seq %xmm0, %eax, %r10, %xmm6
+;   cvt_float64_to_uint32_seq %xmm0, %eax, %r8, %xmm4
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
@@ -251,7 +251,7 @@ block0(v0: f64):
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
-;   cvt_float64_to_uint64_seq %xmm0, %rax, %r10, %xmm6
+;   cvt_float64_to_uint64_seq %xmm0, %rax, %r8, %xmm4
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
@@ -265,7 +265,7 @@ block0(v0: f32):
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
-;   cvt_float32_to_uint32_sat_seq %xmm0, %eax, %r10, %xmm6
+;   cvt_float32_to_uint32_sat_seq %xmm0, %eax, %r8, %xmm4
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
@@ -279,7 +279,7 @@ block0(v0: f32):
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
-;   cvt_float32_to_uint64_sat_seq %xmm0, %rax, %r10, %xmm6
+;   cvt_float32_to_uint64_sat_seq %xmm0, %rax, %r8, %xmm4
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
@@ -293,7 +293,7 @@ block0(v0: f64):
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
-;   cvt_float64_to_uint32_sat_seq %xmm0, %eax, %r10, %xmm6
+;   cvt_float64_to_uint32_sat_seq %xmm0, %eax, %r8, %xmm4
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
@@ -307,7 +307,7 @@ block0(v0: f64):
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
-;   cvt_float64_to_uint64_sat_seq %xmm0, %rax, %r10, %xmm6
+;   cvt_float64_to_uint64_sat_seq %xmm0, %rax, %r8, %xmm4
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
@@ -321,7 +321,7 @@ block0(v0: f32):
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
-;   cvt_float32_to_sint32_seq %xmm0, %eax, %r10, %xmm6
+;   cvt_float32_to_sint32_seq %xmm0, %eax, %r8, %xmm4
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
@@ -335,7 +335,7 @@ block0(v0: f32):
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
-;   cvt_float32_to_sint64_seq %xmm0, %rax, %r10, %xmm6
+;   cvt_float32_to_sint64_seq %xmm0, %rax, %r8, %xmm4
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
@@ -349,7 +349,7 @@ block0(v0: f64):
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
-;   cvt_float64_to_sint32_seq %xmm0, %eax, %r10, %xmm6
+;   cvt_float64_to_sint32_seq %xmm0, %eax, %r8, %xmm4
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
@@ -363,7 +363,7 @@ block0(v0: f64):
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
-;   cvt_float64_to_sint64_seq %xmm0, %rax, %r10, %xmm6
+;   cvt_float64_to_sint64_seq %xmm0, %rax, %r8, %xmm4
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
@@ -377,7 +377,7 @@ block0(v0: f32):
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
-;   cvt_float32_to_sint32_sat_seq %xmm0, %eax, %r10, %xmm6
+;   cvt_float32_to_sint32_sat_seq %xmm0, %eax, %r8, %xmm4
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
@@ -391,7 +391,7 @@ block0(v0: f32):
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
-;   cvt_float32_to_sint64_sat_seq %xmm0, %rax, %r10, %xmm6
+;   cvt_float32_to_sint64_sat_seq %xmm0, %rax, %r8, %xmm4
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
@@ -405,7 +405,7 @@ block0(v0: f64):
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
-;   cvt_float64_to_sint32_sat_seq %xmm0, %eax, %r10, %xmm6
+;   cvt_float64_to_sint32_sat_seq %xmm0, %eax, %r8, %xmm4
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
@@ -419,7 +419,7 @@ block0(v0: f64):
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
-;   cvt_float64_to_sint64_sat_seq %xmm0, %rax, %r10, %xmm6
+;   cvt_float64_to_sint64_sat_seq %xmm0, %rax, %r8, %xmm4
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
diff --git a/cranelift/filetests/filetests/isa/x64/sdiv.clif b/cranelift/filetests/filetests/isa/x64/sdiv.clif
index c0f486c71f..6c13154db7 100644
--- a/cranelift/filetests/filetests/isa/x64/sdiv.clif
+++ b/cranelift/filetests/filetests/isa/x64/sdiv.clif
@@ -11,8 +11,9 @@ block0(v0: i8, v1: i8):
 ;   movq    %rsp, %rbp
 ; block0:
 ;   movq    %rdi, %rax
-;   cbw %al, %dl
-;   idiv    %al, (none), %sil, %al, %dl
+;   cbw %al, %al
+;   movq    %rax, %rdi
+;   idiv    %al, (none), %sil, %al, (none)
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
@@ -28,6 +29,7 @@ block0(v0: i16, v1: i16):
 ; block0:
 ;   movq    %rdi, %rax
 ;   cwd %ax, %dx
+;   movq    %rdx, %r8
 ;   idiv    %ax, %dx, %si, %ax, %dx
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
@@ -44,6 +46,7 @@ block0(v0: i32, v1: i32):
 ; block0:
 ;   movq    %rdi, %rax
 ;   cdq %eax, %edx
+;   movq    %rdx, %r8
 ;   idiv    %eax, %edx, %esi, %eax, %edx
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
@@ -60,6 +63,7 @@ block0(v0: i64, v1: i64):
 ; block0:
 ;   movq    %rdi, %rax
 ;   cqo %rax, %rdx
+;   movq    %rdx, %r8
 ;   idiv    %rax, %rdx, %rsi, %rax, %rdx
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
diff --git a/cranelift/filetests/filetests/isa/x64/shuffle-avx512.clif b/cranelift/filetests/filetests/isa/x64/shuffle-avx512.clif
index 29221415ca..827c80ffe2 100644
--- a/cranelift/filetests/filetests/isa/x64/shuffle-avx512.clif
+++ b/cranelift/filetests/filetests/isa/x64/shuffle-avx512.clif
@@ -12,9 +12,10 @@ block0(v0: i8x16, v1: i8x16):
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
-;   movdqa  %xmm0, %xmm9
+;   movdqa  %xmm0, %xmm6
 ;   load_const VCodeConstant(0), %xmm0
-;   vpermi2b %xmm1, %xmm0, %xmm9
+;   movdqa  %xmm6, %xmm8
+;   vpermi2b %xmm1, %xmm8, %xmm0, %xmm0
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
@@ -31,11 +32,12 @@ block0(v0: i8x16, v1: i8x16):
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
-;   movdqa  %xmm0, %xmm12
+;   movdqa  %xmm0, %xmm9
 ;   load_const VCodeConstant(1), %xmm0
-;   load_const VCodeConstant(0), %xmm7
-;   vpermi2b %xmm1, %xmm7, %xmm12
-;   andps   %xmm0, %xmm7, %xmm0
+;   load_const VCodeConstant(0), %xmm8
+;   movdqa  %xmm9, %xmm11
+;   vpermi2b %xmm1, %xmm11, %xmm8, %xmm8
+;   andps   %xmm0, %xmm8, %xmm0
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
@@ -49,9 +51,10 @@ block0(v0: i8x16, v1: i8x16):
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
-;   movdqa  %xmm0, %xmm9
+;   movdqa  %xmm0, %xmm6
 ;   load_const VCodeConstant(0), %xmm0
-;   vpermi2b %xmm1, %xmm0, %xmm9
+;   movdqa  %xmm6, %xmm8
+;   vpermi2b %xmm1, %xmm8, %xmm0, %xmm0
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
diff --git a/cranelift/filetests/filetests/isa/x64/srem.clif b/cranelift/filetests/filetests/isa/x64/srem.clif
index 99b137d566..fa7ee252fe 100644
--- a/cranelift/filetests/filetests/isa/x64/srem.clif
+++ b/cranelift/filetests/filetests/isa/x64/srem.clif
@@ -10,8 +10,9 @@ block0(v0: i8, v1: i8):
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
+;   xorl    %r11d, %r11d, %r11d
 ;   movq    %rdi, %rax
-;   movl    $0, %edx
+;   movq    %r11, %rdx
 ;   srem_seq %al, %dl, %sil, %al, %dl, tmp=(none)
 ;   shrq    $8, %rax, %rax
 ;   movq    %rbp, %rsp
@@ -27,8 +28,9 @@ block0(v0: i16, v1: i16):
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
+;   xorl    %r11d, %r11d, %r11d
 ;   movq    %rdi, %rax
-;   movl    $0, %edx
+;   movq    %r11, %rdx
 ;   srem_seq %ax, %dx, %si, %ax, %dx, tmp=(none)
 ;   movq    %rdx, %rax
 ;   movq    %rbp, %rsp
@@ -44,8 +46,9 @@ block0(v0: i32, v1: i32):
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
+;   xorl    %r11d, %r11d, %r11d
 ;   movq    %rdi, %rax
-;   movl    $0, %edx
+;   movq    %r11, %rdx
 ;   srem_seq %eax, %edx, %esi, %eax, %edx, tmp=(none)
 ;   movq    %rdx, %rax
 ;   movq    %rbp, %rsp
@@ -61,8 +64,9 @@ block0(v0: i64, v1: i64):
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
+;   xorl    %r11d, %r11d, %r11d
 ;   movq    %rdi, %rax
-;   movl    $0, %edx
+;   movq    %r11, %rdx
 ;   srem_seq %rax, %rdx, %rsi, %rax, %rdx, tmp=(none)
 ;   movq    %rdx, %rax
 ;   movq    %rbp, %rsp
diff --git a/cranelift/filetests/filetests/isa/x64/udiv.clif b/cranelift/filetests/filetests/isa/x64/udiv.clif
index a49b5a027e..75efb2d9d7 100644
--- a/cranelift/filetests/filetests/isa/x64/udiv.clif
+++ b/cranelift/filetests/filetests/isa/x64/udiv.clif
@@ -10,9 +10,9 @@ block0(v0: i8, v1: i8):
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
-;   movq    %rdi, %rax
-;   movzbl  %al, %eax
-;   div     %al, (none), %sil, %al, %dl
+;   movzbl  %dil, %r10d
+;   movq    %r10, %rax
+;   div     %al, (none), %sil, %al, (none)
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
@@ -26,8 +26,9 @@ block0(v0: i16, v1: i16):
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
+;   movl    $0, %r11d
 ;   movq    %rdi, %rax
-;   movl    $0, %edx
+;   movq    %r11, %rdx
 ;   div     %ax, %dx, %si, %ax, %dx
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
@@ -42,8 +43,9 @@ block0(v0: i32, v1: i32):
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
+;   movl    $0, %r11d
 ;   movq    %rdi, %rax
-;   movl    $0, %edx
+;   movq    %r11, %rdx
 ;   div     %eax, %edx, %esi, %eax, %edx
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
@@ -58,8 +60,9 @@ block0(v0: i64, v1: i64):
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
+;   movl    $0, %r11d
 ;   movq    %rdi, %rax
-;   movl    $0, %edx
+;   movq    %r11, %rdx
 ;   div     %rax, %rdx, %rsi, %rax, %rdx
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
diff --git a/cranelift/filetests/filetests/isa/x64/urem.clif b/cranelift/filetests/filetests/isa/x64/urem.clif
index 5f4e80251f..dc21776f6a 100644
--- a/cranelift/filetests/filetests/isa/x64/urem.clif
+++ b/cranelift/filetests/filetests/isa/x64/urem.clif
@@ -10,9 +10,9 @@ block0(v0: i8, v1: i8):
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
-;   movq    %rdi, %rax
-;   movzbl  %al, %eax
-;   div     %al, (none), %sil, %al, %dl
+;   movzbl  %dil, %r10d
+;   movq    %r10, %rax
+;   div     %al, (none), %sil, %al, (none)
 ;   shrq    $8, %rax, %rax
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
@@ -27,8 +27,9 @@ block0(v0: i16, v1: i16):
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
+;   movl    $0, %r11d
 ;   movq    %rdi, %rax
-;   movl    $0, %edx
+;   movq    %r11, %rdx
 ;   div     %ax, %dx, %si, %ax, %dx
 ;   movq    %rdx, %rax
 ;   movq    %rbp, %rsp
@@ -44,8 +45,9 @@ block0(v0: i32, v1: i32):
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
+;   movl    $0, %r11d
 ;   movq    %rdi, %rax
-;   movl    $0, %edx
+;   movq    %r11, %rdx
 ;   div     %eax, %edx, %esi, %eax, %edx
 ;   movq    %rdx, %rax
 ;   movq    %rbp, %rsp
@@ -61,8 +63,9 @@ block0(v0: i64, v1: i64):
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
+;   movl    $0, %r11d
 ;   movq    %rdi, %rax
-;   movl    $0, %edx
+;   movq    %r11, %rdx
 ;   div     %rax, %rdx, %rsi, %rax, %rdx
 ;   movq    %rdx, %rax
 ;   movq    %rbp, %rsp