From 2b325a1878ad5da36c06469046ecf61fddde6c0c Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Mon, 4 Jan 2021 14:50:42 -0600 Subject: [PATCH 01/10] Try to fix CI (#2544) This is an attempt to work around rust-lang/rust#80703 to get CI green again. --- .github/actions/binary-compatible-builds/main.js | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/actions/binary-compatible-builds/main.js b/.github/actions/binary-compatible-builds/main.js index d2d774f4eb..a70cd76e83 100755 --- a/.github/actions/binary-compatible-builds/main.js +++ b/.github/actions/binary-compatible-builds/main.js @@ -51,6 +51,8 @@ child_process.execFileSync('docker', [ '-v', `${process.cwd()}:${process.cwd()}`, '-v', `${child_process.execSync('rustc --print sysroot').toString().trim()}:/rust:ro`, '--env', `PATH=${path}`, + // FIXME(rust-lang/rust#80703) this shouldn't be necessary + '--env', `LD_LIBRARY_PATH=/rust/lib`, 'centos:7', ], stdio); From dbd2241b6066060c359318e945098998656c704e Mon Sep 17 00:00:00 2001 From: Chris Fallin Date: Mon, 4 Jan 2021 16:42:24 -0800 Subject: [PATCH 02/10] x64: handle tests of b1 values correctly (only LSB is defined). Previously, `select` and `brz`/`brnz` instructions, when given a `b1` boolean argument, would test whether that boolean argument was nonzero, rather than whether its LSB was nonzero. Since our invariant for mapping CLIF state to machine state is that bits beyond the width of a value are undefined, the proper lowering is to test only the LSB. (aarch64 does not have the same issue because its `Extend` pseudoinst already properly handles masking of b1 values when a zero-extend is requested, as it is for select/brz/brnz.) Found by Nathan Ringo on Zulip [1] (thanks!). [1] https://bytecodealliance.zulipchat.com/#narrow/stream/217117-cranelift/topic/bnot.20on.20b1s --- cranelift/codegen/src/isa/x64/inst/args.rs | 8 ++ cranelift/codegen/src/isa/x64/inst/emit.rs | 45 +++++++--- .../codegen/src/isa/x64/inst/emit_tests.rs | 82 +++++++++++++++++++ cranelift/codegen/src/isa/x64/inst/mod.rs | 52 ++++++++++-- .../src/isa/x64/inst/unwind/systemv.rs | 2 +- cranelift/codegen/src/isa/x64/lower.rs | 38 +++++++-- cranelift/filetests/filetests/isa/x64/b1.clif | 73 +++++++++++++++++ 7 files changed, 271 insertions(+), 29 deletions(-) create mode 100644 cranelift/filetests/filetests/isa/x64/b1.clif diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs index 4c61954630..680d0921ff 100644 --- a/cranelift/codegen/src/isa/x64/inst/args.rs +++ b/cranelift/codegen/src/isa/x64/inst/args.rs @@ -397,6 +397,14 @@ impl fmt::Display for UnaryRmROpcode { } } +#[derive(Clone, Copy, PartialEq)] +pub enum CmpOpcode { + /// CMP instruction: compute `a - b` and set flags from result. + Cmp, + /// TEST instruction: compute `a & b` and set flags from result. 
+ Test, +} + pub(crate) enum InstructionSet { SSE, SSE2, diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index fb32635a92..0bd8af840f 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -1311,7 +1311,13 @@ pub(crate) fn emit( size, src: src_e, dst: reg_g, + opcode, } => { + let is_cmp = match opcode { + CmpOpcode::Cmp => true, + CmpOpcode::Test => false, + }; + let mut prefix = LegacyPrefixes::None; if *size == 2 { prefix = LegacyPrefixes::_66; @@ -1342,16 +1348,26 @@ pub(crate) fn emit( } } - // Use the swapped operands encoding, to stay consistent with the output of + // Use the swapped operands encoding for CMP, to stay consistent with the output of // gcc/llvm. - let opcode = if *size == 1 { 0x38 } else { 0x39 }; + let opcode = match (*size, is_cmp) { + (1, true) => 0x38, + (_, true) => 0x39, + (1, false) => 0x84, + (_, false) => 0x85, + }; emit_std_reg_reg(sink, prefix, opcode, 1, *reg_e, *reg_g, rex); } RegMemImm::Mem { addr } => { let addr = &addr.finalize(state, sink); - // Whereas here we revert to the "normal" G-E ordering. - let opcode = if *size == 1 { 0x3A } else { 0x3B }; + // Whereas here we revert to the "normal" G-E ordering for CMP. + let opcode = match (*size, is_cmp) { + (1, true) => 0x3A, + (_, true) => 0x3B, + (1, false) => 0x84, + (_, false) => 0x85, + }; emit_std_reg_mem(sink, state, info, prefix, opcode, 1, *reg_g, addr, rex); } @@ -1361,16 +1377,25 @@ pub(crate) fn emit( let use_imm8 = low8_will_sign_extend_to_32(*simm32); // And also here we use the "normal" G-E ordering. - let opcode = if *size == 1 { - 0x80 - } else if use_imm8 { - 0x83 + let opcode = if is_cmp { + if *size == 1 { + 0x80 + } else if use_imm8 { + 0x83 + } else { + 0x81 + } } else { - 0x81 + if *size == 1 { + 0xF6 + } else { + 0xF7 + } }; + let subopcode = if is_cmp { 7 } else { 0 }; let enc_g = int_reg_enc(*reg_g); - emit_std_enc_enc(sink, prefix, opcode, 1, 7 /*subopcode*/, enc_g, rex); + emit_std_enc_enc(sink, prefix, opcode, 1, subopcode, enc_g, rex); emit_simm(sink, if use_imm8 { 1 } else { *size }, *simm32); } } diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs index e2afa80e2a..48e831b2d8 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs @@ -2648,6 +2648,88 @@ fn test_x64_emit() { "cmpb %r13b, %r14b", )); + // ======================================================== + // TestRmiR + insns.push(( + Inst::test_rmi_r(8, RegMemImm::reg(r15), rdx), + "4C85FA", + "testq %r15, %rdx", + )); + insns.push(( + Inst::test_rmi_r(8, RegMemImm::mem(Amode::imm_reg(99, rdi)), rdx), + "48855763", + "testq 99(%rdi), %rdx", + )); + insns.push(( + Inst::test_rmi_r(8, RegMemImm::imm(76543210), rdx), + "48F7C2EAF48F04", + "testq $76543210, %rdx", + )); + // + insns.push(( + Inst::test_rmi_r(4, RegMemImm::reg(r15), rdx), + "4485FA", + "testl %r15d, %edx", + )); + insns.push(( + Inst::test_rmi_r(4, RegMemImm::mem(Amode::imm_reg(99, rdi)), rdx), + "855763", + "testl 99(%rdi), %edx", + )); + insns.push(( + Inst::test_rmi_r(4, RegMemImm::imm(76543210), rdx), + "F7C2EAF48F04", + "testl $76543210, %edx", + )); + // + insns.push(( + Inst::test_rmi_r(2, RegMemImm::reg(r15), rdx), + "664485FA", + "testw %r15w, %dx", + )); + insns.push(( + Inst::test_rmi_r(2, RegMemImm::mem(Amode::imm_reg(99, rdi)), rdx), + "66855763", + "testw 99(%rdi), %dx", + )); + insns.push(( + Inst::test_rmi_r(2, 
RegMemImm::imm(23210), rdx), + "66F7C2AA5A", + "testw $23210, %dx", + )); + // + insns.push(( + Inst::test_rmi_r(1, RegMemImm::reg(r15), rdx), + "4484FA", + "testb %r15b, %dl", + )); + insns.push(( + Inst::test_rmi_r(1, RegMemImm::mem(Amode::imm_reg(99, rdi)), rdx), + "845763", + "testb 99(%rdi), %dl", + )); + insns.push(( + Inst::test_rmi_r(1, RegMemImm::imm(70), rdx), + "F6C246", + "testb $70, %dl", + )); + // Extra byte-cases (paranoia!) for test_rmi_r for first operand = R + insns.push(( + Inst::test_rmi_r(1, RegMemImm::reg(rax), rbx), + "84C3", + "testb %al, %bl", + )); + insns.push(( + Inst::test_rmi_r(1, RegMemImm::reg(rcx), rsi), + "4084CE", + "testb %cl, %sil", + )); + insns.push(( + Inst::test_rmi_r(1, RegMemImm::reg(rcx), r10), + "4184CA", + "testb %cl, %r10b", + )); + // ======================================================== // SetCC insns.push((Inst::setcc(CC::O, w_rsi), "400F90C6", "seto %sil")); diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs index 806e8f276e..0f0866c813 100644 --- a/cranelift/codegen/src/isa/x64/inst/mod.rs +++ b/cranelift/codegen/src/isa/x64/inst/mod.rs @@ -168,9 +168,10 @@ pub enum Inst { dst: Writable, }, - /// Integer comparisons/tests: cmp (b w l q) (reg addr imm) reg. + /// Integer comparisons/tests: cmp or test (b w l q) (reg addr imm) reg. CmpRmiR { size: u8, // 1, 2, 4 or 8 + opcode: CmpOpcode, src: RegMemImm, dst: Reg, }, @@ -913,8 +914,30 @@ impl Inst { ) -> Inst { src.assert_regclass_is(RegClass::I64); debug_assert!(size == 8 || size == 4 || size == 2 || size == 1); - debug_assert!(dst.get_class() == RegClass::I64); - Inst::CmpRmiR { size, src, dst } + debug_assert_eq!(dst.get_class(), RegClass::I64); + Inst::CmpRmiR { + size, + src, + dst, + opcode: CmpOpcode::Cmp, + } + } + + /// Does a comparison of dst & src for operands of size `size`. 
+ pub(crate) fn test_rmi_r( + size: u8, // 1, 2, 4 or 8 + src: RegMemImm, + dst: Reg, + ) -> Inst { + src.assert_regclass_is(RegClass::I64); + debug_assert!(size == 8 || size == 4 || size == 2 || size == 1); + debug_assert_eq!(dst.get_class(), RegClass::I64); + Inst::CmpRmiR { + size, + src, + dst, + opcode: CmpOpcode::Test, + } } pub(crate) fn trap(trap_code: TrapCode) -> Inst { @@ -1597,12 +1620,23 @@ impl PrettyPrint for Inst { dst.to_reg().show_rru(mb_rru) ), - Inst::CmpRmiR { size, src, dst } => format!( - "{} {}, {}", - ljustify2("cmp".to_string(), suffix_bwlq(*size)), - src.show_rru_sized(mb_rru, *size), - show_ireg_sized(*dst, mb_rru, *size) - ), + Inst::CmpRmiR { + size, + src, + dst, + opcode, + } => { + let op = match opcode { + CmpOpcode::Cmp => "cmp", + CmpOpcode::Test => "test", + }; + format!( + "{} {}, {}", + ljustify2(op.to_string(), suffix_bwlq(*size)), + src.show_rru_sized(mb_rru, *size), + show_ireg_sized(*dst, mb_rru, *size) + ) + } Inst::Setcc { cc, dst } => format!( "{} {}", diff --git a/cranelift/codegen/src/isa/x64/inst/unwind/systemv.rs b/cranelift/codegen/src/isa/x64/inst/unwind/systemv.rs index 68473a8afb..e89b8a24ff 100644 --- a/cranelift/codegen/src/isa/x64/inst/unwind/systemv.rs +++ b/cranelift/codegen/src/isa/x64/inst/unwind/systemv.rs @@ -175,7 +175,7 @@ mod tests { _ => panic!("expected unwind information"), }; - assert_eq!(format!("{:?}", fde), "FrameDescriptionEntry { address: Constant(4321), length: 23, lsda: None, instructions: [(1, CfaOffset(16)), (1, Offset(Register(6), -16)), (4, CfaRegister(Register(6))), (16, RememberState), (18, RestoreState)] }"); + assert_eq!(format!("{:?}", fde), "FrameDescriptionEntry { address: Constant(4321), length: 22, lsda: None, instructions: [(1, CfaOffset(16)), (1, Offset(Register(6), -16)), (4, CfaRegister(Register(6))), (15, RememberState), (17, RestoreState)] }"); } fn create_multi_return_function(call_conv: CallConv) -> Function { diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index 6e6198c44b..9299fce738 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -248,12 +248,6 @@ fn non_reg_input_to_sext_imm(input: NonRegInput, input_ty: Type) -> Option }) } -fn input_to_sext_imm>(ctx: &mut C, spec: InsnInput) -> Option { - let input = ctx.get_input_as_source_or_const(spec.insn, spec.input); - let input_ty = ctx.input_ty(spec.insn, spec.input); - non_reg_input_to_sext_imm(input, input_ty) -} - fn input_to_imm>(ctx: &mut C, spec: InsnInput) -> Option { ctx.get_input_as_source_or_const(spec.insn, spec.input) .constant @@ -3731,10 +3725,25 @@ fn lower_insn_to_regs>( let cond_code = ctx.data(icmp).cond_code().unwrap(); CC::from_intcc(cond_code) } else { - // The input is a boolean value, compare it against zero. + let sel_ty = ctx.input_ty(insn, 0); let size = ctx.input_ty(insn, 0).bytes() as u8; let test = put_input_in_reg(ctx, flag_input); - ctx.emit(Inst::cmp_rmi_r(size, RegMemImm::imm(0), test)); + let test_input = if sel_ty == types::B1 { + // The input is a boolean value; test the LSB for nonzero with: + // test reg, 1 + RegMemImm::imm(1) + } else { + // The input is an integer; test the whole value for + // nonzero with: + // test reg, reg + // + // (It doesn't make sense to have a boolean wider than + // one bit here -- which bit would cause us to select an + // input?) 
+ assert!(!is_bool_ty(sel_ty)); + RegMemImm::reg(test) + }; + ctx.emit(Inst::test_rmi_r(size, test_input, test)); CC::NZ }; @@ -4355,7 +4364,18 @@ impl LowerBackend for X64Backend { _ => unreachable!(), }; let size_bytes = src_ty.bytes() as u8; - ctx.emit(Inst::cmp_rmi_r(size_bytes, RegMemImm::imm(0), src)); + // See case for `Opcode::Select` above re: testing the + // boolean input. + let test_input = if src_ty == types::B1 { + // test src, 1 + RegMemImm::imm(1) + } else { + assert!(!is_bool_ty(src_ty)); + // test src, src + RegMemImm::reg(src) + }; + + ctx.emit(Inst::test_rmi_r(size_bytes, test_input, src)); ctx.emit(Inst::jmp_cond(cc, taken, not_taken)); } else { unimplemented!("brz/brnz with non-int type {:?}", src_ty); diff --git a/cranelift/filetests/filetests/isa/x64/b1.clif b/cranelift/filetests/filetests/isa/x64/b1.clif new file mode 100644 index 0000000000..7b65fa4e55 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/b1.clif @@ -0,0 +1,73 @@ +test compile +target x86_64 +feature "experimental_x64" + +function %f0(b1, i32, i32) -> i32 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: b1, v1: i32, v2: i32): + v3 = select.i32 v0, v1, v2 +; nextln: testb $$1, %dil +; nextln: cmovnzl %esi, %edx + + return v3 +; nextln: movq %rdx, %rax +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f1(b1) -> i32 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: b1): + brnz v0, block1 + jump block2 +; nextln: testb $$1, %dil +; nextln: jnz label1; j label2 + +block1: + v1 = iconst.i32 1 + return v1 +; check: movl $$1, %eax +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret + +block2: + v2 = iconst.i32 2 + return v2 +; check: movl $$2, %eax +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f2(b1) -> i32 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: b1): + brz v0, block1 + jump block2 +; nextln: testb $$1, %dil +; nextln: jz label1; j label2 + +block1: + v1 = iconst.i32 1 + return v1 +; check: movl $$1, %eax +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret + +block2: + v2 = iconst.i32 2 + return v2 +; check: movl $$2, %eax +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} From aac3751025e63a2ea978df97eac81f3fe73563eb Mon Sep 17 00:00:00 2001 From: Chris Fallin Date: Tue, 5 Jan 2021 15:48:07 -0800 Subject: [PATCH 03/10] aarch64: fix reg/imm `sub` insts that read `SP`, not the zero register. On AArch64, the zero register (xzr) and the stack pointer (xsp) are alternately named by the same index `31` in machine code depending on context. In particular, in the reg-reg-immediate ALU instruction form, add/subtract will use the stack pointer, not the zero register, if index 31 is given for the first (register) source arg. In a few places, we were emitting subtract instructions with the zero register as an argument and a reg/immediate as the second argument. When an immediate could be incorporated directly (we have the `iconst` definition visible), this would result in incorrect code being generated. This issue was found in `ineg` and in the sequence for vector right-shifts. Reported by Ian Cullinan; thanks! 
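To make the failure mode concrete, here is a minimal sketch of the corrected
`ineg` lowering (it simply mirrors the `lower_inst.rs` hunk below; all helper
names are existing Cranelift ones, nothing new). The point is to force the
register-register ALU form whenever the first source is the zero register,
because the reg-immediate form reads index 31 as `sp`:

    let rd = get_output_reg(ctx, outputs[0]);
    let rn = zero_reg(); // index 31: must be read as xzr, not sp
    let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
    let alu_op = choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64);
    // AluRRR always uses the register-register encoding; the previous
    // alu_inst_imm12 path could fold a visible iconst into the imm12 form,
    // where index 31 in the rn slot selects the stack pointer instead.
    ctx.emit(Inst::AluRRR { alu_op, rd, rn, rm });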
--- .../codegen/src/isa/aarch64/lower_inst.rs | 13 +++++--- .../filetests/isa/aarch64/arithmetic.clif | 32 +++++++++++++++++++ 2 files changed, 41 insertions(+), 4 deletions(-) diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs index 89bcd517f4..504fcba438 100644 --- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs +++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs @@ -215,9 +215,9 @@ pub(crate) fn lower_insn_to_regs>( let ty = ty.unwrap(); if !ty.is_vector() { let rn = zero_reg(); - let rm = put_input_in_rse_imm12(ctx, inputs[0], NarrowValueMode::None); + let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); let alu_op = choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64); - ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm)); + ctx.emit(Inst::AluRRR { alu_op, rd, rn, rm }); } else { let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); ctx.emit(Inst::VecMisc { @@ -693,9 +693,14 @@ pub(crate) fn lower_insn_to_regs>( let rm = if is_right_shift { // Right shifts are implemented with a negative left shift. let tmp = ctx.alloc_tmp(RegClass::I64, I32); - let rm = put_input_in_rse_imm12(ctx, inputs[1], NarrowValueMode::None); + let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); let rn = zero_reg(); - ctx.emit(alu_inst_imm12(ALUOp::Sub32, tmp, rn, rm)); + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::Sub32, + rd: tmp, + rn, + rm, + }); tmp.to_reg() } else { put_input_in_reg(ctx, inputs[1], NarrowValueMode::None) diff --git a/cranelift/filetests/filetests/isa/aarch64/arithmetic.clif b/cranelift/filetests/filetests/isa/aarch64/arithmetic.clif index aa52cb7436..2ffc63ce71 100644 --- a/cranelift/filetests/filetests/isa/aarch64/arithmetic.clif +++ b/cranelift/filetests/filetests/isa/aarch64/arithmetic.clif @@ -422,3 +422,35 @@ block0(v0: i64): ; nextln: mov sp, fp ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret + +function %f29(i64) -> i64 { +block0(v0: i64): + v1 = iconst.i64 1 + v2 = ineg v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: movz x0, #1 +; nextln: sub x0, xzr, x0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f30(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = iconst.i64 1 + v2 = ushr.i8x16 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: movz x0, #1 +; nextln: sub w0, wzr, w0 +; nextln: dup v1.16b, w0 +; nextln: ushl v0.16b, v0.16b, v1.16b +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret From 46b1864c9e12b83c00843db30858c5389d41f67a Mon Sep 17 00:00:00 2001 From: Pat Hickey Date: Tue, 5 Jan 2021 17:28:03 -0800 Subject: [PATCH 04/10] wiggle-wasmtime: get rid of "missing_memory" error code, we can Trap now the missing memory behavior was always a silly thing, that we generate a function for wasmtime which is Result<_, Trap> we can just Err(Trap) when the memory export is missing. 
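For reference, a minimal sketch of the new behavior of the generated shim (it
mirrors the `lib.rs` hunk below): when the calling module does not export its
linear memory under the name "memory", the wrapper now returns a
`wasmtime::Trap` instead of the per-invocation `missing_memory` errno:

    let mem = match caller.get_export("memory") {
        Some(wasmtime::Extern::Memory(m)) => m,
        // Previously: return the configured `missing_memory` errno (or panic
        // for functions with no return value); now we simply trap.
        _ => return Err(wasmtime::Trap::new("missing required memory export")),
    };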
--- crates/wiggle/wasmtime/macro/src/config.rs | 32 +--------------------- crates/wiggle/wasmtime/macro/src/lib.rs | 31 +++++---------------- 2 files changed, 8 insertions(+), 55 deletions(-) diff --git a/crates/wiggle/wasmtime/macro/src/config.rs b/crates/wiggle/wasmtime/macro/src/config.rs index 8f2d9bbc0f..6359ed0ec4 100644 --- a/crates/wiggle/wasmtime/macro/src/config.rs +++ b/crates/wiggle/wasmtime/macro/src/config.rs @@ -1,5 +1,5 @@ use { - proc_macro2::{Span, TokenStream}, + proc_macro2::Span, std::collections::HashMap, syn::{ braced, @@ -16,7 +16,6 @@ pub struct Config { pub witx: WitxConf, pub ctx: CtxConf, pub modules: ModulesConf, - pub missing_memory: MissingMemoryConf, } #[derive(Debug, Clone)] @@ -25,7 +24,6 @@ pub enum ConfigField { Witx(WitxConf), Ctx(CtxConf), Modules(ModulesConf), - MissingMemory(MissingMemoryConf), } mod kw { @@ -36,7 +34,6 @@ mod kw { syn::custom_keyword!(modules); syn::custom_keyword!(name); syn::custom_keyword!(docs); - syn::custom_keyword!(missing_memory); syn::custom_keyword!(function_override); } @@ -63,10 +60,6 @@ impl Parse for ConfigField { input.parse::()?; input.parse::()?; Ok(ConfigField::Modules(input.parse()?)) - } else if lookahead.peek(kw::missing_memory) { - input.parse::()?; - input.parse::()?; - Ok(ConfigField::MissingMemory(input.parse()?)) } else { Err(lookahead.error()) } @@ -79,7 +72,6 @@ impl Config { let mut witx = None; let mut ctx = None; let mut modules = None; - let mut missing_memory = None; for f in fields { match f { ConfigField::Target(c) => { @@ -106,12 +98,6 @@ impl Config { } modules = Some(c); } - ConfigField::MissingMemory(c) => { - if missing_memory.is_some() { - return Err(Error::new(err_loc, "duplicate `missing_memory` field")); - } - missing_memory = Some(c); - } } } Ok(Config { @@ -119,8 +105,6 @@ impl Config { witx: witx.ok_or_else(|| Error::new(err_loc, "`witx` field required"))?, ctx: ctx.ok_or_else(|| Error::new(err_loc, "`ctx` field required"))?, modules: modules.ok_or_else(|| Error::new(err_loc, "`modules` field required"))?, - missing_memory: missing_memory - .ok_or_else(|| Error::new(err_loc, "`missing_memory` field required"))?, }) } @@ -265,20 +249,6 @@ impl Parse for ModulesConf { } } -#[derive(Debug, Clone)] -pub struct MissingMemoryConf { - pub err: TokenStream, -} -impl Parse for MissingMemoryConf { - fn parse(input: ParseStream) -> Result { - let contents; - let _lbrace = braced!(contents in input); - Ok(MissingMemoryConf { - err: contents.parse()?, - }) - } -} - #[derive(Debug, Clone, Default)] pub struct FunctionOverrideConf { pub funcs: Vec, diff --git a/crates/wiggle/wasmtime/macro/src/lib.rs b/crates/wiggle/wasmtime/macro/src/lib.rs index 71b37862c6..095dcd16c5 100644 --- a/crates/wiggle/wasmtime/macro/src/lib.rs +++ b/crates/wiggle/wasmtime/macro/src/lib.rs @@ -6,7 +6,7 @@ use wiggle_generate::Names; mod config; -use config::{MissingMemoryConf, ModuleConf, TargetConf}; +use config::{ModuleConf, TargetConf}; /// Define the structs required to integrate a Wiggle implementation with Wasmtime. /// @@ -41,9 +41,6 @@ use config::{MissingMemoryConf, ModuleConf, TargetConf}; /// Example: /// `modules: { some_module => { name: SomeTypeName, docs: "Doc string for definition of /// SomeTypeName here", function_override: { foo => my_own_foo } }`. -/// * `missing_memory`: Describes the error value to return in case the calling module does not -/// export a Memory as `"memory"`. This value is given in braces, e.g. `missing_memory: { -/// wasi_common::wasi::Errno::Inval }`. 
/// #[proc_macro] pub fn wasmtime_integration(args: TokenStream) -> TokenStream { @@ -55,13 +52,7 @@ pub fn wasmtime_integration(args: TokenStream) -> TokenStream { let module = doc .module(&witx::Id::new(name)) .unwrap_or_else(|| panic!("witx document did not contain module named '{}'", name)); - generate_module( - &module, - &module_conf, - &names, - &config.target, - &config.missing_memory, - ) + generate_module(&module, &module_conf, &names, &config.target) }); quote!( #(#modules)* ).into() } @@ -71,7 +62,6 @@ fn generate_module( module_conf: &ModuleConf, names: &Names, target_conf: &TargetConf, - missing_mem_conf: &MissingMemoryConf, ) -> TokenStream2 { let fields = module.funcs().map(|f| { let name_ident = names.func(&f.name); @@ -103,7 +93,7 @@ fn generate_module( let name_ident = names.func(&f.name); quote! { let #name_ident = wasmtime::Func::wrap(store, #func_override); } } else { - generate_func(&f, names, missing_mem_conf, &target_module) + generate_func(&f, names, &target_module) } }); @@ -165,10 +155,8 @@ contained in the `cx` parameter.", fn generate_func( func: &witx::InterfaceFunc, names: &Names, - missing_mem_conf: &MissingMemoryConf, target_module: &TokenStream2, ) -> TokenStream2 { - let missing_mem_err = &missing_mem_conf.err; let name_ident = names.func(&func.name); let coretype = func.core_type(); @@ -180,17 +168,14 @@ fn generate_func( }); let arg_names = coretype.args.iter().map(|arg| names.func_core_arg(arg)); - let (ret_ty, handle_early_error) = if let Some(ret) = &coretype.ret { + let ret_ty = if let Some(ret) = &coretype.ret { let ret_ty = match ret.signifies { witx::CoreParamSignifies::Value(atom) => names.atom_type(atom), _ => unreachable!("coretype ret should always be passed by value"), }; - (quote! { #ret_ty }, quote! { return Ok(e.into()); }) + quote! { #ret_ty } } else { - ( - quote! {()}, - quote! { panic!("unrecoverable error in {}: {}", stringify!(#name_ident), e) }, - ) + quote! 
{()} }; let runtime = names.runtime_mod(); @@ -204,9 +189,7 @@ fn generate_func( let mem = match caller.get_export("memory") { Some(wasmtime::Extern::Memory(m)) => m, _ => { - wasmtime_wiggle::tracing::warn!("callee does not export a memory as \"memory\""); - let e = { #missing_mem_err }; - #handle_early_error + return Err(wasmtime::Trap::new("missing required memory export")); } }; let mem = #runtime::WasmtimeGuestMemory::new(mem); From bf2371c8afd99c0f6ae50694f4fb3029c0a654d9 Mon Sep 17 00:00:00 2001 From: Pat Hickey Date: Tue, 5 Jan 2021 17:29:34 -0800 Subject: [PATCH 05/10] wasi: get rid of missing_memory config --- crates/wasi/src/lib.rs | 2 -- crates/wasi/src/old/snapshot_0.rs | 2 -- 2 files changed, 4 deletions(-) diff --git a/crates/wasi/src/lib.rs b/crates/wasi/src/lib.rs index 166a0e1ef9..ba4f7b6c2e 100644 --- a/crates/wasi/src/lib.rs +++ b/crates/wasi/src/lib.rs @@ -33,8 +33,6 @@ resolution.", } }, }, - // Error to return when caller module is missing memory export: - missing_memory: { wasi_common::wasi::types::Errno::Inval }, }); pub fn is_wasi_module(name: &str) -> bool { diff --git a/crates/wasi/src/old/snapshot_0.rs b/crates/wasi/src/old/snapshot_0.rs index 2b054c011b..826acb995a 100644 --- a/crates/wasi/src/old/snapshot_0.rs +++ b/crates/wasi/src/old/snapshot_0.rs @@ -32,8 +32,6 @@ resolution.", } }, }, - // Error to return when caller module is missing memory export: - missing_memory: { wasi_common::wasi::types::Errno::Inval }, }); pub fn is_wasi_module(name: &str) -> bool { From 6eea015d6c512882e426ffc670b180a971afad05 Mon Sep 17 00:00:00 2001 From: Chris Fallin Date: Sat, 12 Dec 2020 20:48:56 -0800 Subject: [PATCH 06/10] Multi-register value support: framework for Values wider than machine regs. This will allow for support for `I128` values everywhere, and `I64` values on 32-bit targets (e.g., ARM32 and x86-32). It does not alter the machine backends to build such support; it just adds the framework for the MachInst backends to *reason* about a `Value` residing in more than one register. --- cranelift/codegen/src/isa/aarch64/abi.rs | 24 +- .../codegen/src/isa/aarch64/inst/args.rs | 2 +- cranelift/codegen/src/isa/aarch64/inst/mod.rs | 56 ++-- cranelift/codegen/src/isa/aarch64/lower.rs | 38 ++- .../codegen/src/isa/aarch64/lower_inst.rs | 193 +++++------ cranelift/codegen/src/isa/arm32/abi.rs | 16 +- cranelift/codegen/src/isa/arm32/inst/mod.rs | 17 +- cranelift/codegen/src/isa/arm32/lower.rs | 22 +- cranelift/codegen/src/isa/arm32/lower_inst.rs | 11 +- cranelift/codegen/src/isa/x64/abi.rs | 20 +- cranelift/codegen/src/isa/x64/inst/mod.rs | 56 ++-- cranelift/codegen/src/isa/x64/lower.rs | 313 +++++++++--------- cranelift/codegen/src/machinst/abi.rs | 62 +++- cranelift/codegen/src/machinst/abi_impl.rs | 304 ++++++++++++----- cranelift/codegen/src/machinst/helpers.rs | 4 +- cranelift/codegen/src/machinst/lower.rs | 238 +++++++------ cranelift/codegen/src/machinst/mod.rs | 24 +- cranelift/codegen/src/machinst/valueregs.rs | 185 +++++++++++ 18 files changed, 1024 insertions(+), 561 deletions(-) create mode 100644 cranelift/codegen/src/machinst/valueregs.rs diff --git a/cranelift/codegen/src/isa/aarch64/abi.rs b/cranelift/codegen/src/isa/aarch64/abi.rs index 69335c10cd..d5d88e7770 100644 --- a/cranelift/codegen/src/isa/aarch64/abi.rs +++ b/cranelift/codegen/src/isa/aarch64/abi.rs @@ -77,7 +77,7 @@ fn try_fill_baldrdash_reg(call_conv: isa::CallConv, param: &ir::AbiParam) -> Opt &ir::ArgumentPurpose::VMContext => { // This is SpiderMonkey's `WasmTlsReg`. 
Some(ABIArg::Reg( - xreg(BALDRDASH_TLS_REG).to_real_reg(), + ValueRegs::one(xreg(BALDRDASH_TLS_REG).to_real_reg()), ir::types::I64, param.extension, param.purpose, @@ -86,7 +86,7 @@ fn try_fill_baldrdash_reg(call_conv: isa::CallConv, param: &ir::AbiParam) -> Opt &ir::ArgumentPurpose::SignatureId => { // This is SpiderMonkey's `WasmTableCallSigReg`. Some(ABIArg::Reg( - xreg(BALDRDASH_SIG_REG).to_real_reg(), + ValueRegs::one(xreg(BALDRDASH_SIG_REG).to_real_reg()), ir::types::I64, param.extension, param.purpose, @@ -220,7 +220,9 @@ impl ABIMachineSpec for AArch64MachineDeps { "Invalid type for AArch64: {:?}", param.value_type ); - let rc = Inst::rc_for_type(param.value_type).unwrap(); + let (rcs, _) = Inst::rc_for_type(param.value_type).unwrap(); + assert!(rcs.len() == 1, "Multi-reg values not supported yet"); + let rc = rcs[0]; let next_reg = match rc { RegClass::I64 => &mut next_xreg, @@ -238,7 +240,7 @@ impl ABIMachineSpec for AArch64MachineDeps { _ => unreachable!(), }; ret.push(ABIArg::Reg( - reg.to_real_reg(), + ValueRegs::one(reg.to_real_reg()), param.value_type, param.extension, param.purpose, @@ -271,7 +273,7 @@ impl ABIMachineSpec for AArch64MachineDeps { debug_assert!(args_or_rets == ArgsOrRets::Args); if next_xreg < max_per_class_reg_vals && remaining_reg_vals > 0 { ret.push(ABIArg::Reg( - xreg(next_xreg).to_real_reg(), + ValueRegs::one(xreg(next_xreg).to_real_reg()), I64, ir::ArgumentExtension::None, ir::ArgumentPurpose::Normal, @@ -345,7 +347,7 @@ impl ABIMachineSpec for AArch64MachineDeps { Inst::Ret } - fn gen_add_imm(into_reg: Writable, from_reg: Reg, imm: u32) -> SmallVec<[Inst; 4]> { + fn gen_add_imm(into_reg: Writable, from_reg: Reg, imm: u32) -> SmallInstVec { let imm = imm as u64; let mut insts = SmallVec::new(); if let Some(imm12) = Imm12::maybe_from_u64(imm) { @@ -370,7 +372,7 @@ impl ABIMachineSpec for AArch64MachineDeps { insts } - fn gen_stack_lower_bound_trap(limit_reg: Reg) -> SmallVec<[Inst; 2]> { + fn gen_stack_lower_bound_trap(limit_reg: Reg) -> SmallInstVec { let mut insts = SmallVec::new(); insts.push(Inst::AluRRRExtend { alu_op: ALUOp::SubS64, @@ -411,7 +413,7 @@ impl ABIMachineSpec for AArch64MachineDeps { Inst::gen_store(mem, from_reg, ty, MemFlags::trusted()) } - fn gen_sp_reg_adjust(amount: i32) -> SmallVec<[Inst; 2]> { + fn gen_sp_reg_adjust(amount: i32) -> SmallInstVec { if amount == 0 { return SmallVec::new(); } @@ -455,7 +457,7 @@ impl ABIMachineSpec for AArch64MachineDeps { } } - fn gen_prologue_frame_setup() -> SmallVec<[Inst; 2]> { + fn gen_prologue_frame_setup() -> SmallInstVec { let mut insts = SmallVec::new(); // stp fp (x29), lr (x30), [sp, #-16]! insts.push(Inst::StoreP64 { @@ -481,7 +483,7 @@ impl ABIMachineSpec for AArch64MachineDeps { insts } - fn gen_epilogue_frame_restore() -> SmallVec<[Inst; 2]> { + fn gen_epilogue_frame_restore() -> SmallInstVec { let mut insts = SmallVec::new(); // MOV (alias of ORR) interprets x31 as XZR, so use an ADD here. 
@@ -508,7 +510,7 @@ impl ABIMachineSpec for AArch64MachineDeps { insts } - fn gen_probestack(_: u32) -> SmallVec<[Self::I; 2]> { + fn gen_probestack(_: u32) -> SmallInstVec { // TODO: implement if we ever require stack probes on an AArch64 host // (unlikely unless Lucet is ported) smallvec![] diff --git a/cranelift/codegen/src/isa/aarch64/inst/args.rs b/cranelift/codegen/src/isa/aarch64/inst/args.rs index 66f0d071d4..738495714a 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/args.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/args.rs @@ -3,7 +3,7 @@ // Some variants are never constructed, but we still want them as options in the future. #![allow(dead_code)] -use crate::ir::types::{F32X2, F32X4, F64X2, I16X4, I16X8, I32X2, I32X4, I64X2, I8X16, I8X8}; +use crate::ir::types::*; use crate::ir::Type; use crate::isa::aarch64::inst::*; use crate::machinst::{ty_bits, MachLabel}; diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs index a8aa47c2a7..38b6e29ce4 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs @@ -5,9 +5,7 @@ use crate::binemit::CodeOffset; use crate::ir::types::{ - B1, B16, B16X4, B16X8, B32, B32X2, B32X4, B64, B64X2, B8, B8X16, B8X8, F32, F32X2, F32X4, F64, - F64X2, FFLAGS, I16, I16X4, I16X8, I32, I32X2, I32X4, I64, I64X2, I8, I8X16, I8X8, IFLAGS, R32, - R64, + B1, B128, B16, B32, B64, B8, F32, F64, FFLAGS, I128, I16, I32, I64, I8, I8X16, IFLAGS, R32, R64, }; use crate::ir::{ExternalName, MemFlags, Opcode, SourceLoc, TrapCode, Type}; use crate::isa::CallConv; @@ -1304,7 +1302,7 @@ impl Inst { } /// Create instructions that load a 32-bit floating-point constant. - pub fn load_fp_constant32 Writable>( + pub fn load_fp_constant32 Writable>( rd: Writable, value: u32, mut alloc_tmp: F, @@ -1322,7 +1320,7 @@ impl Inst { } else { // TODO: use FMOV immediate form when `value` has sufficiently few mantissa/exponent // bits. - let tmp = alloc_tmp(RegClass::I64, I32); + let tmp = alloc_tmp(I32); let mut insts = Inst::load_constant(tmp, value as u64); insts.push(Inst::MovToFpu { @@ -1336,7 +1334,7 @@ impl Inst { } /// Create instructions that load a 64-bit floating-point constant. - pub fn load_fp_constant64 Writable>( + pub fn load_fp_constant64 Writable>( rd: Writable, const_data: u64, mut alloc_tmp: F, @@ -1350,7 +1348,7 @@ impl Inst { // bits. Also, treat it as half of a 128-bit vector and consider replicated // patterns. Scalar MOVI might also be an option. } else if const_data & (u32::MAX as u64) == 0 { - let tmp = alloc_tmp(RegClass::I64, I64); + let tmp = alloc_tmp(I64); let mut insts = Inst::load_constant(tmp, const_data); insts.push(Inst::MovToFpu { @@ -1366,7 +1364,7 @@ impl Inst { } /// Create instructions that load a 128-bit vector constant. - pub fn load_fp_constant128 Writable>( + pub fn load_fp_constant128 Writable>( rd: Writable, const_data: u128, alloc_tmp: F, @@ -1416,7 +1414,7 @@ impl Inst { /// Create instructions that load a vector constant consisting of elements with /// the same value. 
- pub fn load_replicated_vector_pattern Writable>( + pub fn load_replicated_vector_pattern Writable>( rd: Writable, pattern: u64, size: VectorSize, @@ -1472,7 +1470,7 @@ impl Inst { } else if let Some(imm) = ASIMDFPModImm::maybe_from_u64(pattern, lane_size) { smallvec![Inst::VecDupFPImm { rd, imm, size }] } else { - let tmp = alloc_tmp(RegClass::I64, I64); + let tmp = alloc_tmp(I64); let mut insts = SmallVec::from(&Inst::load_constant(tmp, pattern)[..]); insts.push(Inst::VecDup { @@ -2862,12 +2860,16 @@ impl MachInst for Inst { } } - fn gen_constant Writable>( - to_reg: Writable, - value: u64, + fn gen_constant Writable>( + to_regs: ValueRegs>, + value: u128, ty: Type, alloc_tmp: F, ) -> SmallVec<[Inst; 4]> { + let to_reg = to_regs + .only_reg() + .expect("multi-reg values not supported yet"); + let value = value as u64; if ty == F64 { Inst::load_fp_constant64(to_reg, value, alloc_tmp) } else if ty == F32 { @@ -2905,14 +2907,28 @@ impl MachInst for Inst { None } - fn rc_for_type(ty: Type) -> CodegenResult { + fn rc_for_type(ty: Type) -> CodegenResult<(&'static [RegClass], &'static [Type])> { match ty { - I8 | I16 | I32 | I64 | B1 | B8 | B16 | B32 | B64 | R32 | R64 => Ok(RegClass::I64), - F32 | F64 => Ok(RegClass::V128), - IFLAGS | FFLAGS => Ok(RegClass::I64), - B8X8 | B8X16 | B16X4 | B16X8 | B32X2 | B32X4 | B64X2 => Ok(RegClass::V128), - F32X2 | I8X8 | I16X4 | I32X2 => Ok(RegClass::V128), - F32X4 | F64X2 | I8X16 | I16X8 | I32X4 | I64X2 => Ok(RegClass::V128), + I8 => Ok((&[RegClass::I64], &[I8])), + I16 => Ok((&[RegClass::I64], &[I16])), + I32 => Ok((&[RegClass::I64], &[I32])), + I64 => Ok((&[RegClass::I64], &[I64])), + B1 => Ok((&[RegClass::I64], &[B1])), + B8 => Ok((&[RegClass::I64], &[B8])), + B16 => Ok((&[RegClass::I64], &[B16])), + B32 => Ok((&[RegClass::I64], &[B32])), + B64 => Ok((&[RegClass::I64], &[B64])), + R32 => panic!("32-bit reftype pointer should never be seen on AArch64"), + R64 => Ok((&[RegClass::I64], &[R64])), + F32 => Ok((&[RegClass::V128], &[F32])), + F64 => Ok((&[RegClass::V128], &[F64])), + I128 => Ok((&[RegClass::I64, RegClass::I64], &[I64, I64])), + B128 => Ok((&[RegClass::I64, RegClass::I64], &[B64, B64])), + _ if ty.is_vector() => { + assert!(ty.bits() <= 128); + Ok((&[RegClass::V128], &[I8X16])) + } + IFLAGS | FFLAGS => Ok((&[RegClass::I64], &[I64])), _ => Err(CodegenError::Unsupported(format!( "Unexpected SSA-value type: {}", ty diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs index 1c8f407b7b..37c5e79c8d 100644 --- a/cranelift/codegen/src/isa/aarch64/lower.rs +++ b/cranelift/codegen/src/isa/aarch64/lower.rs @@ -22,7 +22,7 @@ use super::lower_inst; use crate::data_value::DataValue; use log::{debug, trace}; -use regalloc::{Reg, RegClass, Writable}; +use regalloc::{Reg, Writable}; use smallvec::SmallVec; //============================================================================ @@ -179,9 +179,9 @@ pub(crate) fn put_input_in_reg>( } else { c }; - let to_reg = ctx.alloc_tmp(Inst::rc_for_type(ty).unwrap(), ty); - for inst in Inst::gen_constant(to_reg, masked, ty, |reg_class, ty| { - ctx.alloc_tmp(reg_class, ty) + let to_reg = ctx.alloc_tmp(ty).only_reg().unwrap(); + for inst in Inst::gen_constant(ValueRegs::one(to_reg), masked as u128, ty, |ty| { + ctx.alloc_tmp(ty).only_reg().unwrap() }) .into_iter() { @@ -189,13 +189,15 @@ pub(crate) fn put_input_in_reg>( } to_reg.to_reg() } else { - ctx.put_input_in_reg(input.insn, input.input) + ctx.put_input_in_regs(input.insn, input.input) + .only_reg() + .unwrap() }; 
match (narrow_mode, from_bits) { (NarrowValueMode::None, _) => in_reg, (NarrowValueMode::ZeroExtend32, n) if n < 32 => { - let tmp = ctx.alloc_tmp(RegClass::I64, I32); + let tmp = ctx.alloc_tmp(I32).only_reg().unwrap(); ctx.emit(Inst::Extend { rd: tmp, rn: in_reg, @@ -206,7 +208,7 @@ pub(crate) fn put_input_in_reg>( tmp.to_reg() } (NarrowValueMode::SignExtend32, n) if n < 32 => { - let tmp = ctx.alloc_tmp(RegClass::I64, I32); + let tmp = ctx.alloc_tmp(I32).only_reg().unwrap(); ctx.emit(Inst::Extend { rd: tmp, rn: in_reg, @@ -223,7 +225,7 @@ pub(crate) fn put_input_in_reg>( // Constants are zero-extended to full 64-bit width on load already. in_reg } else { - let tmp = ctx.alloc_tmp(RegClass::I64, I32); + let tmp = ctx.alloc_tmp(I32).only_reg().unwrap(); ctx.emit(Inst::Extend { rd: tmp, rn: in_reg, @@ -235,7 +237,7 @@ pub(crate) fn put_input_in_reg>( } } (NarrowValueMode::SignExtend64, n) if n < 64 => { - let tmp = ctx.alloc_tmp(RegClass::I64, I32); + let tmp = ctx.alloc_tmp(I32).only_reg().unwrap(); ctx.emit(Inst::Extend { rd: tmp, rn: in_reg, @@ -696,7 +698,7 @@ pub(crate) fn lower_address>( /* addends64.len() == 0 */ { if addends32.len() > 0 { - let tmp = ctx.alloc_tmp(RegClass::I64, I64); + let tmp = ctx.alloc_tmp(I64).only_reg().unwrap(); let (reg1, extendop) = addends32.pop().unwrap(); let signed = match extendop { ExtendOp::SXTW => true, @@ -718,7 +720,7 @@ pub(crate) fn lower_address>( } else /* addends32.len() == 0 */ { - let off_reg = ctx.alloc_tmp(RegClass::I64, I64); + let off_reg = ctx.alloc_tmp(I64).only_reg().unwrap(); lower_constant_u64(ctx, off_reg, offset as u64); offset = 0; AMode::reg(off_reg.to_reg()) @@ -734,7 +736,7 @@ pub(crate) fn lower_address>( } // Allocate the temp and shoehorn it into the AMode. - let addr = ctx.alloc_tmp(RegClass::I64, I64); + let addr = ctx.alloc_tmp(I64).only_reg().unwrap(); let (reg, memarg) = match memarg { AMode::RegExtended(r1, r2, extendop) => { (r1, AMode::RegExtended(addr.to_reg(), r2, extendop)) @@ -782,7 +784,7 @@ pub(crate) fn lower_address>( // If the register is the stack reg, we must move it to another reg // before adding it. 
let reg = if reg == stack_reg() { - let tmp = ctx.alloc_tmp(RegClass::I64, I64); + let tmp = ctx.alloc_tmp(I64).only_reg().unwrap(); ctx.emit(Inst::gen_move(tmp, stack_reg(), I64)); tmp.to_reg() } else { @@ -824,7 +826,7 @@ pub(crate) fn lower_constant_f32>( rd: Writable, value: f32, ) { - let alloc_tmp = |class, ty| ctx.alloc_tmp(class, ty); + let alloc_tmp = |ty| ctx.alloc_tmp(ty).only_reg().unwrap(); for inst in Inst::load_fp_constant32(rd, value.to_bits(), alloc_tmp) { ctx.emit(inst); @@ -836,7 +838,7 @@ pub(crate) fn lower_constant_f64>( rd: Writable, value: f64, ) { - let alloc_tmp = |class, ty| ctx.alloc_tmp(class, ty); + let alloc_tmp = |ty| ctx.alloc_tmp(ty).only_reg().unwrap(); for inst in Inst::load_fp_constant64(rd, value.to_bits(), alloc_tmp) { ctx.emit(inst); @@ -858,7 +860,7 @@ pub(crate) fn lower_constant_f128>( size: VectorSize::Size8x16, }); } else { - let alloc_tmp = |class, ty| ctx.alloc_tmp(class, ty); + let alloc_tmp = |ty| ctx.alloc_tmp(ty).only_reg().unwrap(); for inst in Inst::load_fp_constant128(rd, value, alloc_tmp) { ctx.emit(inst); } @@ -885,7 +887,7 @@ pub(crate) fn lower_splat_const>( ), None => (value, size), }; - let alloc_tmp = |class, ty| ctx.alloc_tmp(class, ty); + let alloc_tmp = |ty| ctx.alloc_tmp(ty).only_reg().unwrap(); for inst in Inst::load_replicated_vector_pattern(rd, value, size, alloc_tmp) { ctx.emit(inst); @@ -1217,7 +1219,7 @@ pub(crate) fn lower_load, F: FnMut(&mut C, Writable, let off = ctx.data(ir_inst).load_store_offset().unwrap(); let mem = lower_address(ctx, elem_ty, &inputs[..], off); - let rd = get_output_reg(ctx, output); + let rd = get_output_reg(ctx, output).only_reg().unwrap(); f(ctx, rd, elem_ty, mem); } diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs index 504fcba438..1c4e3d7e99 100644 --- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs +++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs @@ -12,7 +12,7 @@ use crate::{CodegenError, CodegenResult}; use crate::isa::aarch64::abi::*; use crate::isa::aarch64::inst::*; -use regalloc::{RegClass, Writable}; +use regalloc::Writable; use alloc::boxed::Box; use alloc::vec::Vec; @@ -46,21 +46,21 @@ pub(crate) fn lower_insn_to_regs>( ty if ty.is_bool() => value, ty => unreachable!("Unknown type for const: {}", ty), }; - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); lower_constant_u64(ctx, rd, value); } Opcode::F32const => { let value = f32::from_bits(ctx.get_constant(insn).unwrap() as u32); - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); lower_constant_f32(ctx, rd, value); } Opcode::F64const => { let value = f64::from_bits(ctx.get_constant(insn).unwrap()); - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); lower_constant_f64(ctx, rd, value); } Opcode::Iadd => { - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let ty = ty.unwrap(); if !ty.is_vector() { let mul_insn = @@ -116,7 +116,7 @@ pub(crate) fn lower_insn_to_regs>( } } Opcode::Isub => { - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); let ty = ty.unwrap(); if !ty.is_vector() { @@ -148,7 +148,7 @@ pub(crate) fn lower_insn_to_regs>( // (SQADD / UQADD / SQSUB / UQSUB), which require scalar FP registers. 
let is_signed = op == Opcode::SaddSat || op == Opcode::SsubSat; let ty = ty.unwrap(); - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); if !ty.is_vector() { let narrow_mode = if is_signed { NarrowValueMode::SignExtend64 @@ -162,8 +162,8 @@ pub(crate) fn lower_insn_to_regs>( Opcode::SsubSat => FPUOp2::Sqsub64, _ => unreachable!(), }; - let va = ctx.alloc_tmp(RegClass::V128, I128); - let vb = ctx.alloc_tmp(RegClass::V128, I128); + let va = ctx.alloc_tmp(I8X16).only_reg().unwrap(); + let vb = ctx.alloc_tmp(I8X16).only_reg().unwrap(); let ra = put_input_in_reg(ctx, inputs[0], narrow_mode); let rb = put_input_in_reg(ctx, inputs[1], narrow_mode); ctx.emit(Inst::MovToFpu { @@ -211,7 +211,7 @@ pub(crate) fn lower_insn_to_regs>( } Opcode::Ineg => { - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let ty = ty.unwrap(); if !ty.is_vector() { let rn = zero_reg(); @@ -230,7 +230,7 @@ pub(crate) fn lower_insn_to_regs>( } Opcode::Imul => { - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); let ty = ty.unwrap(); @@ -245,8 +245,8 @@ pub(crate) fn lower_insn_to_regs>( }); } else { if ty == I64X2 { - let tmp1 = ctx.alloc_tmp(RegClass::V128, I64X2); - let tmp2 = ctx.alloc_tmp(RegClass::V128, I64X2); + let tmp1 = ctx.alloc_tmp(I64X2).only_reg().unwrap(); + let tmp2 = ctx.alloc_tmp(I64X2).only_reg().unwrap(); // This I64X2 multiplication is performed with several 32-bit // operations. @@ -362,7 +362,7 @@ pub(crate) fn lower_insn_to_regs>( } Opcode::Umulhi | Opcode::Smulhi => { - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let is_signed = op == Opcode::Smulhi; let input_ty = ctx.input_ty(insn, 0); assert!(ctx.input_ty(insn, 1) == input_ty); @@ -443,7 +443,7 @@ pub(crate) fn lower_insn_to_regs>( ALUOp::UDiv64 }; - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let rn = put_input_in_reg(ctx, inputs[0], narrow_mode); let rm = put_input_in_reg(ctx, inputs[1], narrow_mode); // The div instruction does not trap on divide by zero or signed overflow @@ -550,7 +550,7 @@ pub(crate) fn lower_insn_to_regs>( assert!(from_bits <= to_bits); if from_bits < to_bits { let signed = op == Opcode::Sextend; - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); if let Some(extract_insn) = maybe_input_insn(ctx, inputs[0], Opcode::Extractlane) { let idx = @@ -596,7 +596,7 @@ pub(crate) fn lower_insn_to_regs>( } Opcode::Bnot => { - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let ty = ty.unwrap(); if !ty.is_vector() { let rm = put_input_in_rs_immlogic(ctx, inputs[0], NarrowValueMode::None); @@ -620,7 +620,7 @@ pub(crate) fn lower_insn_to_regs>( | Opcode::BandNot | Opcode::BorNot | Opcode::BxorNot => { - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let ty = ty.unwrap(); if !ty.is_vector() { let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); @@ -646,7 +646,7 @@ pub(crate) fn lower_insn_to_regs>( let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); - let rd 
= get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); ctx.emit(Inst::VecRRR { alu_op, @@ -660,7 +660,7 @@ pub(crate) fn lower_insn_to_regs>( Opcode::Ishl | Opcode::Ushr | Opcode::Sshr => { let ty = ty.unwrap(); - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); if !ty.is_vector() { let size = OperandSize::from_bits(ty_bits(ty)); let narrow_mode = match (op, size) { @@ -692,7 +692,7 @@ pub(crate) fn lower_insn_to_regs>( let rm = if is_right_shift { // Right shifts are implemented with a negative left shift. - let tmp = ctx.alloc_tmp(RegClass::I64, I32); + let tmp = ctx.alloc_tmp(I32).only_reg().unwrap(); let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); let rn = zero_reg(); ctx.emit(Inst::AluRRR { @@ -751,7 +751,7 @@ pub(crate) fn lower_insn_to_regs>( let ty = ty.unwrap(); let ty_bits_size = ty_bits(ty) as u8; - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let rn = put_input_in_reg( ctx, inputs[0], @@ -785,7 +785,7 @@ pub(crate) fn lower_insn_to_regs>( // ignored (because of the implicit masking done by the instruction), // so this is equivalent to negating the input. let alu_op = choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64); - let tmp = ctx.alloc_tmp(RegClass::I64, ty); + let tmp = ctx.alloc_tmp(ty).only_reg().unwrap(); ctx.emit(Inst::AluRRR { alu_op, rd: tmp, @@ -808,7 +808,7 @@ pub(crate) fn lower_insn_to_regs>( // Really ty_bits_size - rn, but the upper bits of the result are // ignored (because of the implicit masking done by the instruction), // so this is equivalent to negating the input. - let tmp = ctx.alloc_tmp(RegClass::I64, I32); + let tmp = ctx.alloc_tmp(I32).only_reg().unwrap(); ctx.emit(Inst::AluRRR { alu_op: ALUOp::Sub32, rd: tmp, @@ -821,7 +821,7 @@ pub(crate) fn lower_insn_to_regs>( }; // Explicitly mask the rotation count. - let tmp_masked_rm = ctx.alloc_tmp(RegClass::I64, I32); + let tmp_masked_rm = ctx.alloc_tmp(I32).only_reg().unwrap(); ctx.emit(Inst::AluRRImmLogic { alu_op: ALUOp::And32, rd: tmp_masked_rm, @@ -830,8 +830,8 @@ pub(crate) fn lower_insn_to_regs>( }); let tmp_masked_rm = tmp_masked_rm.to_reg(); - let tmp1 = ctx.alloc_tmp(RegClass::I64, I32); - let tmp2 = ctx.alloc_tmp(RegClass::I64, I32); + let tmp1 = ctx.alloc_tmp(I32).only_reg().unwrap(); + let tmp2 = ctx.alloc_tmp(I32).only_reg().unwrap(); ctx.emit(Inst::AluRRImm12 { alu_op: ALUOp::Sub32, rd: tmp1, @@ -870,7 +870,7 @@ pub(crate) fn lower_insn_to_regs>( } immshift.imm &= ty_bits_size - 1; - let tmp1 = ctx.alloc_tmp(RegClass::I64, I32); + let tmp1 = ctx.alloc_tmp(I32).only_reg().unwrap(); ctx.emit(Inst::AluRRImmShift { alu_op: ALUOp::Lsr32, rd: tmp1, @@ -900,7 +900,7 @@ pub(crate) fn lower_insn_to_regs>( } Opcode::Bitrev | Opcode::Clz | Opcode::Cls | Opcode::Ctz => { - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let needs_zext = match op { Opcode::Bitrev | Opcode::Ctz => false, Opcode::Clz | Opcode::Cls => true, @@ -970,12 +970,12 @@ pub(crate) fn lower_insn_to_regs>( // x += x << 32 // x >> 56 let ty = ty.unwrap(); - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); // FIXME(#1537): zero-extend 8/16/32-bit operands only to 32 bits, // and fix the sequence below to work properly for this. 
let narrow_mode = NarrowValueMode::ZeroExtend64; let rn = put_input_in_reg(ctx, inputs[0], narrow_mode); - let tmp = ctx.alloc_tmp(RegClass::I64, I64); + let tmp = ctx.alloc_tmp(I64).only_reg().unwrap(); // If this is a 32-bit Popcnt, use Lsr32 to clear the top 32 bits of the register, then // the rest of the code is identical to the 64-bit version. @@ -1236,7 +1236,7 @@ pub(crate) fn lower_insn_to_regs>( } => (stack_slot, offset), _ => unreachable!(), }; - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let offset: i32 = offset.into(); let inst = ctx .abi() @@ -1245,7 +1245,7 @@ pub(crate) fn lower_insn_to_regs>( } Opcode::AtomicRmw => { - let r_dst = get_output_reg(ctx, outputs[0]); + let r_dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let mut r_addr = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); let mut r_arg2 = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); let ty_access = ty.unwrap(); @@ -1270,7 +1270,7 @@ pub(crate) fn lower_insn_to_regs>( // This is very similar to, but not identical to, the AtomicRmw case. Note // that the AtomicCAS sequence does its own masking, so we don't need to worry // about zero-extending narrow (I8/I16/I32) values here. - let r_dst = get_output_reg(ctx, outputs[0]); + let r_dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let mut r_addr = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); let mut r_expected = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); let mut r_replacement = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None); @@ -1301,7 +1301,7 @@ pub(crate) fn lower_insn_to_regs>( } Opcode::AtomicLoad => { - let r_data = get_output_reg(ctx, outputs[0]); + let r_data = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let r_addr = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); let ty_access = ty.unwrap(); assert!(is_valid_atomic_transaction_ty(ty_access)); @@ -1382,7 +1382,7 @@ pub(crate) fn lower_insn_to_regs>( }; // csel.cond rd, rn, rm - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); let rm = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None); let ty = ctx.output_ty(insn, 0); @@ -1409,7 +1409,7 @@ pub(crate) fn lower_insn_to_regs>( lower_icmp_or_ifcmp_to_flags(ctx, ifcmp_insn, is_signed); // csel.COND rd, rn, rm - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); let rm = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None); let ty = ctx.output_ty(insn, 0); @@ -1428,8 +1428,8 @@ pub(crate) fn lower_insn_to_regs>( let ty = ty.unwrap(); if !ty.is_vector() { debug_assert_ne!(Opcode::Vselect, op); - let tmp = ctx.alloc_tmp(RegClass::I64, I64); - let rd = get_output_reg(ctx, outputs[0]); + let tmp = ctx.alloc_tmp(I64).only_reg().unwrap(); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let rcond = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); let rm = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None); @@ -1458,7 +1458,7 @@ pub(crate) fn lower_insn_to_regs>( let rcond = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); let rm = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None); - let rd = 
get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); ctx.emit(Inst::gen_move(rd, rcond, ty)); ctx.emit(Inst::VecRRR { @@ -1479,7 +1479,7 @@ pub(crate) fn lower_insn_to_regs>( // single-def ifcmp. let ifcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ifcmp).unwrap(); lower_icmp_or_ifcmp_to_flags(ctx, ifcmp_insn, is_signed); - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); materialize_bool_result(ctx, insn, rd, cond); } @@ -1488,7 +1488,7 @@ pub(crate) fn lower_insn_to_regs>( let cond = lower_fp_condcode(condcode); let ffcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ffcmp).unwrap(); lower_fcmp_or_ffcmp_to_flags(ctx, ffcmp_insn); - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); materialize_bool_result(ctx, insn, rd, cond); } @@ -1496,7 +1496,7 @@ pub(crate) fn lower_insn_to_regs>( // Null references are represented by the constant value 0; invalid references are // represented by the constant value -1. See `define_reftypes()` in // `meta/src/isa/x86/encodings.rs` to confirm. - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); let ty = ctx.input_ty(insn, 0); let (alu_op, const_value) = match op { @@ -1516,7 +1516,7 @@ pub(crate) fn lower_insn_to_regs>( } Opcode::Copy => { - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); let ty = ctx.input_ty(insn, 0); ctx.emit(Inst::gen_move(rd, rn, ty)); @@ -1526,7 +1526,7 @@ pub(crate) fn lower_insn_to_regs>( // Smaller integers/booleans are stored with high-order bits // undefined, so we can simply do a copy. let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let ty = ctx.input_ty(insn, 0); ctx.emit(Inst::gen_move(rd, rn, ty)); } @@ -1553,7 +1553,7 @@ pub(crate) fn lower_insn_to_regs>( // Nothing. } else { let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let to_bits = if to_bits == 64 { 64 } else { @@ -1575,7 +1575,7 @@ pub(crate) fn lower_insn_to_regs>( // Booleans are stored as all-zeroes (0) or all-ones (-1). We AND // out the LSB to give a 0 / 1-valued integer result. let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let output_bits = ty_bits(ctx.output_ty(insn, 0)); let (imm_ty, alu_op) = if output_bits > 32 { @@ -1592,7 +1592,7 @@ pub(crate) fn lower_insn_to_regs>( } Opcode::Bitcast => { - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let ity = ctx.input_ty(insn, 0); let oty = ctx.output_ty(insn, 0); let ity_bits = ty_bits(ity); @@ -1644,7 +1644,7 @@ pub(crate) fn lower_insn_to_regs>( // (above the bits for the value's type) are undefined, so we // need not extend the return values. 
let reg = put_input_in_reg(ctx, *input, NarrowValueMode::None); - let retval_reg = ctx.retval(i); + let retval_reg = ctx.retval(i).only_reg().unwrap(); let ty = ctx.input_ty(insn, i); ctx.emit(Inst::gen_move(retval_reg, reg, ty)); } @@ -1663,7 +1663,7 @@ pub(crate) fn lower_insn_to_regs>( let condcode = ctx.data(insn).cond_code().unwrap(); let cond = lower_condcode(condcode); let is_signed = condcode_is_signed(condcode); - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let ty = ctx.input_ty(insn, 0); let bits = ty_bits(ty); let narrow_mode = match (bits <= 32, is_signed) { @@ -1691,7 +1691,7 @@ pub(crate) fn lower_insn_to_regs>( let ty = ctx.input_ty(insn, 0); let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); if !ty.is_vector() { match ty_bits(ty) { @@ -1768,7 +1768,7 @@ pub(crate) fn lower_insn_to_regs>( } Opcode::FuncAddr => { - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let (extname, _) = ctx.call_target(insn).unwrap(); let extname = extname.clone(); ctx.emit(Inst::LoadExtName { @@ -1783,7 +1783,7 @@ pub(crate) fn lower_insn_to_regs>( } Opcode::SymbolValue => { - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let (extname, _, offset) = ctx.symbol_value(insn).unwrap(); let extname = extname.clone(); ctx.emit(Inst::LoadExtName { @@ -1824,18 +1824,18 @@ pub(crate) fn lower_insn_to_regs>( assert!(inputs.len() == abi.num_args()); for (i, input) in inputs.iter().enumerate() { let arg_reg = put_input_in_reg(ctx, *input, NarrowValueMode::None); - abi.emit_copy_reg_to_arg(ctx, i, arg_reg); + abi.emit_copy_regs_to_arg(ctx, i, ValueRegs::one(arg_reg)); } abi.emit_call(ctx); for (i, output) in outputs.iter().enumerate() { - let retval_reg = get_output_reg(ctx, *output); - abi.emit_copy_retval_to_reg(ctx, i, retval_reg); + let retval_reg = get_output_reg(ctx, *output).only_reg().unwrap(); + abi.emit_copy_retval_to_regs(ctx, i, ValueRegs::one(retval_reg)); } abi.emit_stack_post_adjust(ctx); } Opcode::GetPinnedReg => { - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); ctx.emit(Inst::gen_move(rd, xreg(PINNED_REG), I64)); } @@ -1874,13 +1874,13 @@ pub(crate) fn lower_insn_to_regs>( Opcode::Vconst => { let value = const_param_to_u128(ctx, insn).expect("Invalid immediate bytes"); - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); lower_constant_f128(ctx, rd, value); } Opcode::RawBitcast => { let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let ty = ctx.input_ty(insn, 0); ctx.emit(Inst::gen_move(rd, rm, ty)); } @@ -1888,7 +1888,7 @@ pub(crate) fn lower_insn_to_regs>( Opcode::Extractlane => { if let InstructionData::BinaryImm8 { imm, .. 
} = ctx.data(insn) { let idx = *imm; - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); let size = VectorSize::from_ty(ctx.input_ty(insn, 0)); let ty = ty.unwrap(); @@ -1913,7 +1913,7 @@ pub(crate) fn lower_insn_to_regs>( unreachable!(); }; let input_ty = ctx.input_ty(insn, 1); - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); let ty = ty.unwrap(); @@ -1935,7 +1935,7 @@ pub(crate) fn lower_insn_to_regs>( } Opcode::Splat => { - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let size = VectorSize::from_ty(ty.unwrap()); if let Some((_, insn)) = maybe_input_insn_multi( @@ -1979,7 +1979,7 @@ pub(crate) fn lower_insn_to_regs>( &load_inputs[..], load_outputs[0], |ctx, _rd, _elem_ty, mem| { - let tmp = ctx.alloc_tmp(RegClass::I64, I64); + let tmp = ctx.alloc_tmp(I64).only_reg().unwrap(); let (addr, addr_inst) = Inst::gen_load_addr(tmp, mem); if let Some(addr_inst) = addr_inst { ctx.emit(addr_inst); @@ -2002,7 +2002,7 @@ pub(crate) fn lower_insn_to_regs>( Opcode::ScalarToVector => { let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let input_ty = ctx.input_ty(insn, 0); if (input_ty == I32 && ty.unwrap() == I32X4) || (input_ty == I64 && ty.unwrap() == I64X2) @@ -2021,9 +2021,10 @@ pub(crate) fn lower_insn_to_regs>( } Opcode::VanyTrue | Opcode::VallTrue => { - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); - let tmp = ctx.alloc_tmp(RegClass::V128, ty.unwrap()); + let src_ty = ctx.input_ty(insn, 0); + let tmp = ctx.alloc_tmp(src_ty).only_reg().unwrap(); // This operation is implemented by using umaxp or uminv to // create a scalar value, which is then compared against zero. @@ -2070,7 +2071,7 @@ pub(crate) fn lower_insn_to_regs>( } Opcode::VhighBits => { - let dst_r = get_output_reg(ctx, outputs[0]); + let dst_r = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let src_v = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); let ty = ctx.input_ty(insn, 0); // All three sequences use one integer temporary and two vector temporaries. The @@ -2080,9 +2081,9 @@ pub(crate) fn lower_insn_to_regs>( // derivation of these sequences. Alternative sequences are discussed in // https://github.com/bytecodealliance/wasmtime/issues/2296, although they are not // used here. 
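// Illustrative sketch, not part of this patch: the value `vhigh_bits` must compute
// for an i8x16 input, namely one result bit per lane taken from that lane's most
// significant bit, written here as plain scalar Rust instead of the vector
// sequences described above. `vhigh_bits_i8x16` is an assumed name used purely for
// illustration.
fn vhigh_bits_i8x16(lanes: [u8; 16]) -> u16 {
    lanes
        .iter()
        .enumerate()
        .fold(0u16, |mask, (i, &lane)| mask | (((lane >> 7) as u16) << i))
}

#[test]
fn vhigh_bits_sketch_collects_lane_msbs() {
    let mut v = [0u8; 16];
    v[0] = 0x80;
    v[15] = 0xff;
    assert_eq!(vhigh_bits_i8x16(v), 0b1000_0000_0000_0001);
}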
- let tmp_r0 = ctx.alloc_tmp(RegClass::I64, I64); - let tmp_v0 = ctx.alloc_tmp(RegClass::V128, I8X16); - let tmp_v1 = ctx.alloc_tmp(RegClass::V128, I8X16); + let tmp_r0 = ctx.alloc_tmp(I64).only_reg().unwrap(); + let tmp_v0 = ctx.alloc_tmp(I8X16).only_reg().unwrap(); + let tmp_v1 = ctx.alloc_tmp(I8X16).only_reg().unwrap(); match ty { I8X16 => { // sshr tmp_v1.16b, src_v.16b, #7 @@ -2255,7 +2256,7 @@ pub(crate) fn lower_insn_to_regs>( Opcode::Shuffle => { let mask = const_param_to_u128(ctx, insn).expect("Invalid immediate mask bytes"); - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); let rn2 = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); // 2 register table vector lookups require consecutive table registers; @@ -2283,7 +2284,7 @@ pub(crate) fn lower_insn_to_regs>( } Opcode::Swizzle => { - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); @@ -2310,7 +2311,7 @@ pub(crate) fn lower_insn_to_regs>( Opcode::Imax => VecALUOp::Smax, _ => unreachable!(), }; - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); let ty = ty.unwrap(); @@ -2324,12 +2325,12 @@ pub(crate) fn lower_insn_to_regs>( } Opcode::WideningPairwiseDotProductS => { - let r_y = get_output_reg(ctx, outputs[0]); + let r_y = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let r_a = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); let r_b = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); let ty = ty.unwrap(); if ty == I32X4 { - let tmp = ctx.alloc_tmp(RegClass::V128, I8X16); + let tmp = ctx.alloc_tmp(I8X16).only_reg().unwrap(); // The args have type I16X8. 
// "y = i32x4.dot_i16x8_s(a, b)" // => smull tmp, a, b @@ -2369,7 +2370,7 @@ pub(crate) fn lower_insn_to_regs>( let bits = ty_bits(ty); let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); if !ty.is_vector() { let fpu_op = match (op, bits) { (Opcode::Fadd, 32) => FPUOp2::Add32, @@ -2413,7 +2414,7 @@ pub(crate) fn lower_insn_to_regs>( if ty == F32X4 || ty == F64X2 { // pmin(a,b) => bitsel(b, a, cmpgt(a, b)) // pmax(a,b) => bitsel(b, a, cmpgt(b, a)) - let r_dst = get_output_reg(ctx, outputs[0]); + let r_dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let r_a = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); let r_b = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); // Since we're going to write the output register `r_dst` anyway, we might as @@ -2449,7 +2450,7 @@ pub(crate) fn lower_insn_to_regs>( let ty = ty.unwrap(); let bits = ty_bits(ty); let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); if !ty.is_vector() { let fpu_op = match (op, bits) { (Opcode::Sqrt, 32) => FPUOp1::Sqrt32, @@ -2498,7 +2499,7 @@ pub(crate) fn lower_insn_to_regs>( _ => panic!("Unknown op/bits combination (scalar)"), }; let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); ctx.emit(Inst::FpuRound { op, rd, rn }); } else { let (op, size) = match (op, ty) { @@ -2513,7 +2514,7 @@ pub(crate) fn lower_insn_to_regs>( _ => panic!("Unknown op/ty combination (vector){:?}", ty), }; let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); ctx.emit(Inst::VecMisc { op, rd, rn, size }); } } @@ -2528,7 +2529,7 @@ pub(crate) fn lower_insn_to_regs>( let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); let ra = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None); - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); ctx.emit(Inst::FpuRRRR { fpu_op, rn, @@ -2554,8 +2555,8 @@ pub(crate) fn lower_insn_to_regs>( assert!(bits == 32 || bits == 64); let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); - let rd = get_output_reg(ctx, outputs[0]); - let tmp = ctx.alloc_tmp(RegClass::V128, F64); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + let tmp = ctx.alloc_tmp(F64).only_reg().unwrap(); // Copy LHS to rd. ctx.emit(Inst::gen_move(rd, rn, ty)); @@ -2594,7 +2595,7 @@ pub(crate) fn lower_insn_to_regs>( }; let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); // First, check the output: it's important to carry the NaN conversion before the // in-bounds conversion, per wasm semantics. 
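// Illustrative sketch, not part of this patch: the check ordering described above
// for a trapping float-to-int conversion, i.e. reject NaN before any range check,
// and apply the range check to the truncated (round-toward-zero) value, which is
// what allows inputs that are only fractionally out of range. The function name
// and error strings below are assumptions made for illustration only.
fn fcvt_f64_to_i32(x: f64) -> Result<i32, &'static str> {
    if x.is_nan() {
        return Err("invalid conversion to integer"); // NaN is rejected first
    }
    let t = x.trunc();
    if t < -2_147_483_648.0 || t > 2_147_483_647.0 {
        return Err("integer overflow");
    }
    Ok(t as i32)
}

#[test]
fn fcvt_sketch_orders_checks() {
    assert!(fcvt_f64_to_i32(f64::NAN).is_err());
    assert_eq!(fcvt_f64_to_i32(2_147_483_647.75), Ok(2_147_483_647));
    assert!(fcvt_f64_to_i32(2_147_483_648.0).is_err());
}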
@@ -2611,7 +2612,7 @@ pub(crate) fn lower_insn_to_regs>( kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::Unordered)), }); - let tmp = ctx.alloc_tmp(RegClass::V128, I128); + let tmp = ctx.alloc_tmp(I8X16).only_reg().unwrap(); // Check that the input is in range, with "truncate towards zero" semantics. This means // we allow values that are slightly out of range: @@ -2736,7 +2737,7 @@ pub(crate) fn lower_insn_to_regs>( Opcode::FcvtFromUint | Opcode::FcvtFromSint => { let ty = ty.unwrap(); let signed = op == Opcode::FcvtFromSint; - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); if ty.is_vector() { let op = if signed { @@ -2782,7 +2783,7 @@ pub(crate) fn lower_insn_to_regs>( let ty = ty.unwrap(); let out_signed = op == Opcode::FcvtToSintSat; let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); if ty.is_vector() { let op = if out_signed { @@ -2829,8 +2830,8 @@ pub(crate) fn lower_insn_to_regs>( _ => unreachable!(), }; - let rtmp1 = ctx.alloc_tmp(RegClass::V128, in_ty); - let rtmp2 = ctx.alloc_tmp(RegClass::V128, in_ty); + let rtmp1 = ctx.alloc_tmp(in_ty).only_reg().unwrap(); + let rtmp2 = ctx.alloc_tmp(in_ty).only_reg().unwrap(); if in_bits == 32 { lower_constant_f32(ctx, rtmp1, max as f32); @@ -2920,7 +2921,7 @@ pub(crate) fn lower_insn_to_regs>( // Now handle the iadd as above, except use an AddS opcode that sets // flags. - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); let rm = put_input_in_rse_imm12(ctx, inputs[1], NarrowValueMode::None); let ty = ty.unwrap(); @@ -3001,7 +3002,7 @@ pub(crate) fn lower_insn_to_regs>( Opcode::DummySargT => unreachable!(), Opcode::Iabs => { - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); let ty = ty.unwrap(); ctx.emit(Inst::VecMisc { @@ -3012,7 +3013,7 @@ pub(crate) fn lower_insn_to_regs>( }); } Opcode::AvgRound => { - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); let ty = ty.unwrap(); @@ -3031,7 +3032,7 @@ pub(crate) fn lower_insn_to_regs>( } else { VecMiscNarrowOp::Sqxtun }; - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); let rn2 = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); let ty = ty.unwrap(); @@ -3054,7 +3055,7 @@ pub(crate) fn lower_insn_to_regs>( Opcode::SwidenLow | Opcode::SwidenHigh | Opcode::UwidenLow | Opcode::UwidenHigh => { let lane_type = ty.unwrap().lane_type(); - let rd = get_output_reg(ctx, outputs[0]); + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); let (t, high_half) = match (lane_type, op) { (I16, Opcode::SwidenLow) => (VecExtendOp::Sxtl8, false), @@ -3313,8 +3314,8 @@ pub(crate) fn lower_branch>( NarrowValueMode::ZeroExtend32, ); - let rtmp1 = ctx.alloc_tmp(RegClass::I64, I32); - let rtmp2 = ctx.alloc_tmp(RegClass::I64, I32); + let rtmp1 = ctx.alloc_tmp(I32).only_reg().unwrap(); + let rtmp2 = 
ctx.alloc_tmp(I32).only_reg().unwrap(); // Bounds-check, leaving condition codes for JTSequence's // branch to default target below. diff --git a/cranelift/codegen/src/isa/arm32/abi.rs b/cranelift/codegen/src/isa/arm32/abi.rs index f09dd7dced..9e92a7b7aa 100644 --- a/cranelift/codegen/src/isa/arm32/abi.rs +++ b/cranelift/codegen/src/isa/arm32/abi.rs @@ -82,7 +82,7 @@ impl ABIMachineSpec for Arm32MachineDeps { let reg = rreg(next_rreg); ret.push(ABIArg::Reg( - reg.to_real_reg(), + ValueRegs::one(reg.to_real_reg()), param.value_type, param.extension, param.purpose, @@ -102,7 +102,7 @@ impl ABIMachineSpec for Arm32MachineDeps { debug_assert!(args_or_rets == ArgsOrRets::Args); if next_rreg < max_reg_val { ret.push(ABIArg::Reg( - rreg(next_rreg).to_real_reg(), + ValueRegs::one(rreg(next_rreg).to_real_reg()), I32, ir::ArgumentExtension::None, ir::ArgumentPurpose::Normal, @@ -185,7 +185,7 @@ impl ABIMachineSpec for Arm32MachineDeps { Inst::EpiloguePlaceholder } - fn gen_add_imm(into_reg: Writable, from_reg: Reg, imm: u32) -> SmallVec<[Inst; 4]> { + fn gen_add_imm(into_reg: Writable, from_reg: Reg, imm: u32) -> SmallInstVec { let mut insts = SmallVec::new(); if let Some(imm12) = UImm12::maybe_from_i64(imm as i64) { @@ -209,7 +209,7 @@ impl ABIMachineSpec for Arm32MachineDeps { insts } - fn gen_stack_lower_bound_trap(limit_reg: Reg) -> SmallVec<[Inst; 2]> { + fn gen_stack_lower_bound_trap(limit_reg: Reg) -> SmallInstVec { let mut insts = SmallVec::new(); insts.push(Inst::Cmp { rn: sp_reg(), @@ -243,7 +243,7 @@ impl ABIMachineSpec for Arm32MachineDeps { Inst::gen_store(from_reg, mem, ty) } - fn gen_sp_reg_adjust(amount: i32) -> SmallVec<[Inst; 2]> { + fn gen_sp_reg_adjust(amount: i32) -> SmallInstVec { let mut ret = SmallVec::new(); if amount == 0 { @@ -283,7 +283,7 @@ impl ABIMachineSpec for Arm32MachineDeps { Inst::VirtualSPOffsetAdj { offset } } - fn gen_prologue_frame_setup() -> SmallVec<[Inst; 2]> { + fn gen_prologue_frame_setup() -> SmallInstVec { let mut ret = SmallVec::new(); let reg_list = vec![fp_reg(), lr_reg()]; ret.push(Inst::Push { reg_list }); @@ -294,7 +294,7 @@ impl ABIMachineSpec for Arm32MachineDeps { ret } - fn gen_epilogue_frame_restore() -> SmallVec<[Inst; 2]> { + fn gen_epilogue_frame_restore() -> SmallInstVec { let mut ret = SmallVec::new(); ret.push(Inst::Mov { rd: writable_sp_reg(), @@ -305,7 +305,7 @@ impl ABIMachineSpec for Arm32MachineDeps { ret } - fn gen_probestack(_: u32) -> SmallVec<[Self::I; 2]> { + fn gen_probestack(_: u32) -> SmallInstVec { // TODO: implement if we ever require stack probes on ARM32 (unlikely // unless Lucet is ported) smallvec![] diff --git a/cranelift/codegen/src/isa/arm32/inst/mod.rs b/cranelift/codegen/src/isa/arm32/inst/mod.rs index fff01b7d82..309aa43102 100644 --- a/cranelift/codegen/src/isa/arm32/inst/mod.rs +++ b/cranelift/codegen/src/isa/arm32/inst/mod.rs @@ -807,12 +807,17 @@ impl MachInst for Inst { Inst::mov(to_reg, from_reg) } - fn gen_constant Writable>( - to_reg: Writable, - value: u64, + fn gen_constant Writable>( + to_regs: ValueRegs>, + value: u128, ty: Type, _alloc_tmp: F, ) -> SmallVec<[Inst; 4]> { + let to_reg = to_regs + .only_reg() + .expect("multi-reg values not supported yet"); + let value = value as u64; + match ty { B1 | I8 | B8 | I16 | B16 | I32 | B32 => { let v: i64 = value as i64; @@ -839,10 +844,10 @@ impl MachInst for Inst { None } - fn rc_for_type(ty: Type) -> CodegenResult { + fn rc_for_type(ty: Type) -> CodegenResult<(&'static [RegClass], &'static [Type])> { match ty { - I8 | I16 | I32 | B1 | B8 | B16 | B32 => 
Ok(RegClass::I32), - IFLAGS => Ok(RegClass::I32), + I8 | I16 | I32 | B1 | B8 | B16 | B32 => Ok((&[RegClass::I32], &[I32])), + IFLAGS => Ok((&[RegClass::I32], &[I32])), _ => Err(CodegenError::Unsupported(format!( "Unexpected SSA-value type: {}", ty diff --git a/cranelift/codegen/src/isa/arm32/lower.rs b/cranelift/codegen/src/isa/arm32/lower.rs index a148f333d2..372c18b8e9 100644 --- a/cranelift/codegen/src/isa/arm32/lower.rs +++ b/cranelift/codegen/src/isa/arm32/lower.rs @@ -13,7 +13,7 @@ use crate::isa::arm32::Arm32Backend; use super::lower_inst; -use regalloc::{Reg, RegClass, Writable}; +use regalloc::{Reg, Writable}; //============================================================================ // Lowering: convert instruction outputs to result types. @@ -55,7 +55,7 @@ pub(crate) enum NarrowValueMode { /// Lower an instruction output to a reg. pub(crate) fn output_to_reg>(ctx: &mut C, out: InsnOutput) -> Writable { - ctx.get_output(out.insn, out.output) + ctx.get_output(out.insn, out.output).only_reg().unwrap() } /// Lower an instruction input to a reg. @@ -70,21 +70,25 @@ pub(crate) fn input_to_reg>( let from_bits = ty.bits() as u8; let inputs = ctx.get_input_as_source_or_const(input.insn, input.input); let in_reg = if let Some(c) = inputs.constant { - let to_reg = ctx.alloc_tmp(Inst::rc_for_type(ty).unwrap(), ty); - for inst in Inst::gen_constant(to_reg, c, ty, |reg_class, ty| ctx.alloc_tmp(reg_class, ty)) - .into_iter() + let to_reg = ctx.alloc_tmp(ty).only_reg().unwrap(); + for inst in Inst::gen_constant(ValueRegs::one(to_reg), c as u128, ty, |ty| { + ctx.alloc_tmp(ty).only_reg().unwrap() + }) + .into_iter() { ctx.emit(inst); } to_reg.to_reg() } else { - ctx.put_input_in_reg(input.insn, input.input) + ctx.put_input_in_regs(input.insn, input.input) + .only_reg() + .unwrap() }; match (narrow_mode, from_bits) { (NarrowValueMode::None, _) => in_reg, (NarrowValueMode::ZeroExtend, 1) => { - let tmp = ctx.alloc_tmp(RegClass::I32, I32); + let tmp = ctx.alloc_tmp(I32).only_reg().unwrap(); ctx.emit(Inst::AluRRImm8 { alu_op: ALUOp::And, rd: tmp, @@ -94,7 +98,7 @@ pub(crate) fn input_to_reg>( tmp.to_reg() } (NarrowValueMode::ZeroExtend, n) if n < 32 => { - let tmp = ctx.alloc_tmp(RegClass::I32, I32); + let tmp = ctx.alloc_tmp(I32).only_reg().unwrap(); ctx.emit(Inst::Extend { rd: tmp, rm: in_reg, @@ -104,7 +108,7 @@ pub(crate) fn input_to_reg>( tmp.to_reg() } (NarrowValueMode::SignExtend, n) if n < 32 => { - let tmp = ctx.alloc_tmp(RegClass::I32, I32); + let tmp = ctx.alloc_tmp(I32).only_reg().unwrap(); ctx.emit(Inst::Extend { rd: tmp, rm: in_reg, diff --git a/cranelift/codegen/src/isa/arm32/lower_inst.rs b/cranelift/codegen/src/isa/arm32/lower_inst.rs index eb04af14d4..dd453d772a 100644 --- a/cranelift/codegen/src/isa/arm32/lower_inst.rs +++ b/cranelift/codegen/src/isa/arm32/lower_inst.rs @@ -10,7 +10,6 @@ use crate::CodegenResult; use crate::isa::arm32::abi::*; use crate::isa::arm32::inst::*; -use regalloc::RegClass; use smallvec::SmallVec; use super::lower::*; @@ -143,7 +142,7 @@ pub(crate) fn lower_insn_to_regs>( let rd = output_to_reg(ctx, outputs[0]); let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); let rm = input_to_reg(ctx, inputs[1], NarrowValueMode::None); - let tmp = ctx.alloc_tmp(RegClass::I32, I32); + let tmp = ctx.alloc_tmp(I32).only_reg().unwrap(); // ror rd, rn, 32 - (rm & 31) ctx.emit(Inst::AluRRImm8 { @@ -171,7 +170,7 @@ pub(crate) fn lower_insn_to_regs>( match ty { I32 => { let rd_hi = output_to_reg(ctx, outputs[0]); - let rd_lo = ctx.alloc_tmp(RegClass::I32, ty); 
+ let rd_lo = ctx.alloc_tmp(ty).only_reg().unwrap(); let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); let rm = input_to_reg(ctx, inputs[1], NarrowValueMode::None); @@ -487,7 +486,7 @@ pub(crate) fn lower_insn_to_regs>( Opcode::FallthroughReturn | Opcode::Return => { for (i, input) in inputs.iter().enumerate() { let reg = input_to_reg(ctx, *input, NarrowValueMode::None); - let retval_reg = ctx.retval(i); + let retval_reg = ctx.retval(i).only_reg().unwrap(); let ty = ctx.input_ty(insn, i); ctx.emit(Inst::gen_move(retval_reg, reg, ty)); @@ -522,12 +521,12 @@ pub(crate) fn lower_insn_to_regs>( assert_eq!(inputs.len(), abi.num_args()); for (i, input) in inputs.iter().enumerate().filter(|(i, _)| *i <= 3) { let arg_reg = input_to_reg(ctx, *input, NarrowValueMode::None); - abi.emit_copy_reg_to_arg(ctx, i, arg_reg); + abi.emit_copy_regs_to_arg(ctx, i, ValueRegs::one(arg_reg)); } abi.emit_call(ctx); for (i, output) in outputs.iter().enumerate() { let retval_reg = output_to_reg(ctx, *output); - abi.emit_copy_retval_to_reg(ctx, i, retval_reg); + abi.emit_copy_retval_to_regs(ctx, i, ValueRegs::one(retval_reg)); } } _ => panic!("lowering {} unimplemented!", op), diff --git a/cranelift/codegen/src/isa/x64/abi.rs b/cranelift/codegen/src/isa/x64/abi.rs index dc9592908b..74dca6c3ec 100644 --- a/cranelift/codegen/src/isa/x64/abi.rs +++ b/cranelift/codegen/src/isa/x64/abi.rs @@ -32,7 +32,7 @@ fn try_fill_baldrdash_reg(call_conv: CallConv, param: &ir::AbiParam) -> Option { // This is SpiderMonkey's `WasmTlsReg`. Some(ABIArg::Reg( - regs::r14().to_real_reg(), + ValueRegs::one(regs::r14().to_real_reg()), types::I64, param.extension, param.purpose, @@ -41,7 +41,7 @@ fn try_fill_baldrdash_reg(call_conv: CallConv, param: &ir::AbiParam) -> Option { // This is SpiderMonkey's `WasmTableCallSigReg`. 
Some(ABIArg::Reg( - regs::r10().to_real_reg(), + ValueRegs::one(regs::r10().to_real_reg()), types::I64, param.extension, param.purpose, @@ -168,7 +168,7 @@ impl ABIMachineSpec for X64ABIMachineSpec { ret.push(param); } else if let Some(reg) = candidate { ret.push(ABIArg::Reg( - reg.to_real_reg(), + ValueRegs::one(reg.to_real_reg()), param.value_type, param.extension, param.purpose, @@ -200,7 +200,7 @@ impl ABIMachineSpec for X64ABIMachineSpec { debug_assert!(args_or_rets == ArgsOrRets::Args); if let Some(reg) = get_intreg_for_arg_systemv(&call_conv, next_gpr) { ret.push(ABIArg::Reg( - reg.to_real_reg(), + ValueRegs::one(reg.to_real_reg()), types::I64, ir::ArgumentExtension::None, ir::ArgumentPurpose::Normal, @@ -288,7 +288,7 @@ impl ABIMachineSpec for X64ABIMachineSpec { Inst::epilogue_placeholder() } - fn gen_add_imm(into_reg: Writable, from_reg: Reg, imm: u32) -> SmallVec<[Self::I; 4]> { + fn gen_add_imm(into_reg: Writable, from_reg: Reg, imm: u32) -> SmallInstVec { let mut ret = SmallVec::new(); if from_reg != into_reg.to_reg() { ret.push(Inst::gen_move(into_reg, from_reg, I64)); @@ -302,7 +302,7 @@ impl ABIMachineSpec for X64ABIMachineSpec { ret } - fn gen_stack_lower_bound_trap(limit_reg: Reg) -> SmallVec<[Self::I; 2]> { + fn gen_stack_lower_bound_trap(limit_reg: Reg) -> SmallInstVec { smallvec![ Inst::cmp_rmi_r(/* bytes = */ 8, RegMemImm::reg(regs::rsp()), limit_reg), Inst::TrapIf { @@ -343,7 +343,7 @@ impl ABIMachineSpec for X64ABIMachineSpec { Inst::store(ty, from_reg, mem) } - fn gen_sp_reg_adjust(amount: i32) -> SmallVec<[Self::I; 2]> { + fn gen_sp_reg_adjust(amount: i32) -> SmallInstVec { let (alu_op, amount) = if amount >= 0 { (AluRmiROpcode::Add, amount) } else { @@ -366,7 +366,7 @@ impl ABIMachineSpec for X64ABIMachineSpec { } } - fn gen_prologue_frame_setup() -> SmallVec<[Self::I; 2]> { + fn gen_prologue_frame_setup() -> SmallInstVec { let r_rsp = regs::rsp(); let r_rbp = regs::rbp(); let w_rbp = Writable::from_reg(r_rbp); @@ -378,7 +378,7 @@ impl ABIMachineSpec for X64ABIMachineSpec { insts } - fn gen_epilogue_frame_restore() -> SmallVec<[Self::I; 2]> { + fn gen_epilogue_frame_restore() -> SmallInstVec { let mut insts = SmallVec::new(); insts.push(Inst::mov_r_r( true, @@ -389,7 +389,7 @@ impl ABIMachineSpec for X64ABIMachineSpec { insts } - fn gen_probestack(frame_size: u32) -> SmallVec<[Self::I; 2]> { + fn gen_probestack(frame_size: u32) -> SmallInstVec { let mut insts = SmallVec::new(); insts.push(Inst::imm( OperandSize::Size32, diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs index 0f0866c813..66a68a001b 100644 --- a/cranelift/codegen/src/isa/x64/inst/mod.rs +++ b/cranelift/codegen/src/isa/x64/inst/mod.rs @@ -2506,22 +2506,28 @@ impl MachInst for Inst { None } - fn rc_for_type(ty: Type) -> CodegenResult { + fn rc_for_type(ty: Type) -> CodegenResult<(&'static [RegClass], &'static [Type])> { match ty { - types::I8 - | types::I16 - | types::I32 - | types::I64 - | types::B1 - | types::B8 - | types::B16 - | types::B32 - | types::B64 - | types::R32 - | types::R64 => Ok(RegClass::I64), - types::F32 | types::F64 => Ok(RegClass::V128), - _ if ty.bits() == 128 => Ok(RegClass::V128), - types::IFLAGS | types::FFLAGS => Ok(RegClass::I64), + types::I8 => Ok((&[RegClass::I64], &[types::I8])), + types::I16 => Ok((&[RegClass::I64], &[types::I16])), + types::I32 => Ok((&[RegClass::I64], &[types::I32])), + types::I64 => Ok((&[RegClass::I64], &[types::I64])), + types::B1 => Ok((&[RegClass::I64], &[types::B1])), + types::B8 => 
Ok((&[RegClass::I64], &[types::B8])), + types::B16 => Ok((&[RegClass::I64], &[types::B16])), + types::B32 => Ok((&[RegClass::I64], &[types::B32])), + types::B64 => Ok((&[RegClass::I64], &[types::B64])), + types::R32 => panic!("32-bit reftype pointer should never be seen on x86-64"), + types::R64 => Ok((&[RegClass::I64], &[types::R64])), + types::F32 => Ok((&[RegClass::V128], &[types::F32])), + types::F64 => Ok((&[RegClass::V128], &[types::F64])), + types::I128 => Ok((&[RegClass::I64, RegClass::I64], &[types::I64, types::I64])), + types::B128 => Ok((&[RegClass::I64, RegClass::I64], &[types::B64, types::B64])), + _ if ty.is_vector() => { + assert!(ty.bits() <= 128); + Ok((&[RegClass::V128], &[types::I8X16])) + } + types::IFLAGS | types::FFLAGS => Ok((&[RegClass::I64], &[types::I64])), _ => Err(CodegenError::Unsupported(format!( "Unexpected SSA-value type: {}", ty @@ -2533,13 +2539,18 @@ impl MachInst for Inst { Inst::jmp_known(label) } - fn gen_constant Writable>( - to_reg: Writable, - value: u64, + fn gen_constant Writable>( + to_regs: ValueRegs>, + value: u128, ty: Type, mut alloc_tmp: F, ) -> SmallVec<[Self; 4]> { + // We don't support 128-bit constants. + assert!(value <= u64::MAX as u128); let mut ret = SmallVec::new(); + let to_reg = to_regs + .only_reg() + .expect("multi-reg values not supported on x64"); if ty == types::F32 { if value == 0 { ret.push(Inst::xmm_rm_r( @@ -2548,8 +2559,8 @@ impl MachInst for Inst { to_reg, )); } else { - let tmp = alloc_tmp(RegClass::I64, types::I32); - ret.push(Inst::imm(OperandSize::Size32, value, tmp)); + let tmp = alloc_tmp(types::I32); + ret.push(Inst::imm(OperandSize::Size32, value as u64, tmp)); ret.push(Inst::gpr_to_xmm( SseOpcode::Movd, @@ -2566,8 +2577,8 @@ impl MachInst for Inst { to_reg, )); } else { - let tmp = alloc_tmp(RegClass::I64, types::I64); - ret.push(Inst::imm(OperandSize::Size64, value, tmp)); + let tmp = alloc_tmp(types::I64); + ret.push(Inst::imm(OperandSize::Size64, value as u64, tmp)); ret.push(Inst::gpr_to_xmm( SseOpcode::Movq, @@ -2599,6 +2610,7 @@ impl MachInst for Inst { to_reg, )); } else { + let value = value as u64; ret.push(Inst::imm( OperandSize::from_bytes(ty.bytes()), value.into(), diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index 9299fce738..6b779a81d8 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -88,7 +88,7 @@ fn matches_input_any>( /// Emits instruction(s) to generate the given 64-bit constant value into a newly-allocated /// temporary register, returning that register. -fn generate_constant>(ctx: &mut C, ty: Type, c: u64) -> Reg { +fn generate_constant>(ctx: &mut C, ty: Type, c: u64) -> ValueRegs { let from_bits = ty_bits(ty); let masked = if from_bits < 64 { c & ((1u64 << from_bits) - 1) @@ -96,15 +96,15 @@ fn generate_constant>(ctx: &mut C, ty: Type, c: u64) -> Re c }; - let cst_copy = ctx.alloc_tmp(Inst::rc_for_type(ty).unwrap(), ty); - for inst in Inst::gen_constant(cst_copy, masked, ty, |reg_class, ty| { - ctx.alloc_tmp(reg_class, ty) + let cst_copy = ctx.alloc_tmp(ty); + for inst in Inst::gen_constant(cst_copy, masked as u128, ty, |ty| { + ctx.alloc_tmp(ty).only_reg().unwrap() }) .into_iter() { ctx.emit(inst); } - cst_copy.to_reg() + non_writable_value_regs(cst_copy) } /// Put the given input into a register, and mark it as used (side-effect). 
@@ -115,8 +115,12 @@ fn put_input_in_reg>(ctx: &mut C, spec: InsnInput) -> Reg if let Some(c) = input.constant { // Generate constants fresh at each use to minimize long-range register pressure. generate_constant(ctx, ty, c) + .only_reg() + .expect("multi-reg values not supported yet") } else { - ctx.put_input_in_reg(spec.insn, spec.input) + ctx.put_input_in_regs(spec.insn, spec.input) + .only_reg() + .expect("multi-reg values not supported yet") } } @@ -172,7 +176,7 @@ fn input_to_reg_mem>(ctx: &mut C, spec: InsnInput) -> RegM if let Some(c) = inputs.constant { // Generate constants fresh at each use to minimize long-range register pressure. let ty = ctx.input_ty(spec.insn, spec.input); - return RegMem::reg(generate_constant(ctx, ty, c)); + return RegMem::reg(generate_constant(ctx, ty, c).only_reg().unwrap()); } if let Some((src_insn, 0)) = inputs.inst { @@ -183,7 +187,11 @@ fn input_to_reg_mem>(ctx: &mut C, spec: InsnInput) -> RegM } } - RegMem::reg(ctx.put_input_in_reg(spec.insn, spec.input)) + RegMem::reg( + ctx.put_input_in_regs(spec.insn, spec.input) + .only_reg() + .unwrap(), + ) } /// An extension specification for `extend_input_to_reg`. @@ -221,7 +229,7 @@ fn extend_input_to_reg>( }; let src = input_to_reg_mem(ctx, spec); - let dst = ctx.alloc_tmp(RegClass::I64, requested_ty); + let dst = ctx.alloc_tmp(requested_ty).only_reg().unwrap(); match ext_spec { ExtSpec::ZeroExtendTo32 | ExtSpec::ZeroExtendTo64 => { ctx.emit(Inst::movzx_rm_r(ext_mode, src, dst)) @@ -524,19 +532,19 @@ fn emit_vm_call>( for (i, input) in inputs.iter().enumerate() { let arg_reg = put_input_in_reg(ctx, *input); - abi.emit_copy_reg_to_arg(ctx, i, arg_reg); + abi.emit_copy_regs_to_arg(ctx, i, ValueRegs::one(arg_reg)); } if call_conv.extends_baldrdash() { let vm_context_vreg = ctx .get_vm_context() .expect("should have a VMContext to pass to libcall funcs"); - abi.emit_copy_reg_to_arg(ctx, inputs.len(), vm_context_vreg); + abi.emit_copy_regs_to_arg(ctx, inputs.len(), ValueRegs::one(vm_context_vreg)); } abi.emit_call(ctx); for (i, output) in outputs.iter().enumerate() { - let retval_reg = get_output_reg(ctx, *output); - abi.emit_copy_retval_to_reg(ctx, i, retval_reg); + let retval_reg = get_output_reg(ctx, *output).only_reg().unwrap(); + abi.emit_copy_retval_to_regs(ctx, i, ValueRegs::one(retval_reg)); } abi.emit_stack_post_adjust(ctx); @@ -690,8 +698,8 @@ fn lower_insn_to_regs>( .get_constant(insn) .expect("constant value for iconst et al"); let dst = get_output_reg(ctx, outputs[0]); - for inst in Inst::gen_constant(dst, value, ty.unwrap(), |reg_class, ty| { - ctx.alloc_tmp(reg_class, ty) + for inst in Inst::gen_constant(dst, value as u128, ty.unwrap(), |ty| { + ctx.alloc_tmp(ty).only_reg().unwrap() }) { ctx.emit(inst); } @@ -787,10 +795,10 @@ fn lower_insn_to_regs>( // Get inputs rhs=A and lhs=B and the dst register let lhs = put_input_in_reg(ctx, inputs[0]); let rhs = put_input_in_reg(ctx, inputs[1]); - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); // A' = A - let rhs_1 = ctx.alloc_tmp(RegClass::V128, types::I64X2); + let rhs_1 = ctx.alloc_tmp(types::I64X2).only_reg().unwrap(); ctx.emit(Inst::gen_move(rhs_1, rhs, ty)); // A' = A' >> 32 @@ -807,7 +815,7 @@ fn lower_insn_to_regs>( )); // B' = B - let lhs_1 = ctx.alloc_tmp(RegClass::V128, types::I64X2); + let lhs_1 = ctx.alloc_tmp(types::I64X2).only_reg().unwrap(); ctx.emit(Inst::gen_move(lhs_1, lhs, ty)); // B' = B' >> 32 @@ -876,7 +884,7 @@ fn lower_insn_to_regs>( }; let lhs = put_input_in_reg(ctx, 
inputs[0]); let rhs = input_to_reg_mem(ctx, inputs[1]); - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); // Move the `lhs` to the same register as `dst`. ctx.emit(Inst::gen_move(dst, lhs, ty)); @@ -920,7 +928,7 @@ fn lower_insn_to_regs>( _ => unreachable!(), }; - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); ctx.emit(Inst::mov_r_r(true, lhs, dst)); ctx.emit(Inst::alu_rmi_r(is_64, alu_op, rhs, dst)); } @@ -931,7 +939,7 @@ fn lower_insn_to_regs>( debug_assert!(ty.is_vector() && ty.bytes() == 16); let lhs = input_to_reg_mem(ctx, inputs[0]); let rhs = put_input_in_reg(ctx, inputs[1]); - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let sse_op = match ty { types::F32X4 => SseOpcode::Andnps, types::F64X2 => SseOpcode::Andnpd, @@ -945,7 +953,7 @@ fn lower_insn_to_regs>( Opcode::Iabs => { let src = input_to_reg_mem(ctx, inputs[0]); - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let ty = ty.unwrap(); if ty.is_vector() { let opcode = match ty { @@ -963,7 +971,7 @@ fn lower_insn_to_regs>( Opcode::Imax | Opcode::Umax | Opcode::Imin | Opcode::Umin => { let lhs = put_input_in_reg(ctx, inputs[0]); let rhs = input_to_reg_mem(ctx, inputs[1]); - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let ty = ty.unwrap(); if ty.is_vector() { let sse_op = match op { @@ -1006,11 +1014,11 @@ fn lower_insn_to_regs>( let ty = ty.unwrap(); let size = ty.bytes() as u8; let src = put_input_in_reg(ctx, inputs[0]); - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); ctx.emit(Inst::gen_move(dst, src, ty)); if ty.is_vector() { - let tmp = ctx.alloc_tmp(RegClass::V128, ty); + let tmp = ctx.alloc_tmp(ty).only_reg().unwrap(); ctx.emit(Inst::equals(ty, RegMem::from(tmp), tmp)); ctx.emit(Inst::xor(ty, RegMem::from(tmp), dst)); } else if ty.is_bool() { @@ -1025,14 +1033,14 @@ fn lower_insn_to_regs>( let condition = put_input_in_reg(ctx, inputs[0]); let if_true = put_input_in_reg(ctx, inputs[1]); let if_false = input_to_reg_mem(ctx, inputs[2]); - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); if ty.is_vector() { - let tmp1 = ctx.alloc_tmp(RegClass::V128, ty); + let tmp1 = ctx.alloc_tmp(ty).only_reg().unwrap(); ctx.emit(Inst::gen_move(tmp1, if_true, ty)); ctx.emit(Inst::and(ty, RegMem::reg(condition.clone()), tmp1)); - let tmp2 = ctx.alloc_tmp(RegClass::V128, ty); + let tmp2 = ctx.alloc_tmp(ty).only_reg().unwrap(); ctx.emit(Inst::gen_move(tmp2, condition, ty)); ctx.emit(Inst::and_not(ty, if_false, tmp2)); @@ -1084,7 +1092,7 @@ fn lower_insn_to_regs>( (None, Some(put_input_in_reg(ctx, inputs[1]))) }; - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let shift_kind = match op { Opcode::Ishl => ShiftKind::ShiftLeft, @@ -1108,13 +1116,13 @@ fn lower_insn_to_regs>( // incorrect bits to 0s (see below for handling signs in `sshr.i8x16`). let src = put_input_in_reg(ctx, inputs[0]); let shift_by = input_to_reg_mem_imm(ctx, inputs[1]); - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); // If necessary, move the shift index into the lowest bits of a vector register. 
let shift_by_moved = match &shift_by { RegMemImm::Imm { .. } => shift_by.clone(), RegMemImm::Reg { reg } => { - let tmp_shift_by = ctx.alloc_tmp(RegClass::V128, dst_ty); + let tmp_shift_by = ctx.alloc_tmp(dst_ty).only_reg().unwrap(); ctx.emit(Inst::gpr_to_xmm( SseOpcode::Movd, RegMem::reg(*reg), @@ -1187,8 +1195,8 @@ fn lower_insn_to_regs>( // Otherwise, we must emit the entire mask table and dynamically (i.e. at run time) find the correct // mask offset in the table. We do this use LEA to find the base address of the mask table and then // complex addressing to offset to the right mask: `base_address + shift_by * 4` - let base_mask_address = ctx.alloc_tmp(RegClass::I64, types::I64); - let mask_offset = ctx.alloc_tmp(RegClass::I64, types::I64); + let base_mask_address = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let mask_offset = ctx.alloc_tmp(types::I64).only_reg().unwrap(); let mask_constant = ctx.use_constant(VCodeConstantData::WellKnown(mask)); ctx.emit(Inst::lea( SyntheticAmode::ConstantOffset(mask_constant), @@ -1208,7 +1216,7 @@ fn lower_insn_to_regs>( }; // Load the mask into a temporary register, `mask_value`. - let mask_value = ctx.alloc_tmp(RegClass::V128, dst_ty); + let mask_value = ctx.alloc_tmp(dst_ty).only_reg().unwrap(); ctx.emit(Inst::load(dst_ty, mask_address, mask_value, ExtKind::None)); // Remove the bits that would have disappeared in a true 8x16 shift. TODO in the future, @@ -1232,7 +1240,7 @@ fn lower_insn_to_regs>( let src = put_input_in_reg(ctx, inputs[0]); let shift_by = input_to_reg_mem_imm(ctx, inputs[1]); let shift_by_ty = ctx.input_ty(insn, 1); - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); // In order for PACKSSWB later to only use the high byte of each 16x8 lane, we shift right an extra 8 // bits, relying on PSRAW to fill in the upper bits appropriately. @@ -1242,7 +1250,7 @@ fn lower_insn_to_regs>( // Otherwise we add instructions to add the extra shift amount and move the value into an XMM // register. RegMemImm::Reg { reg } => { - let bigger_shift_by_gpr = ctx.alloc_tmp(RegClass::I64, shift_by_ty); + let bigger_shift_by_gpr = ctx.alloc_tmp(shift_by_ty).only_reg().unwrap(); ctx.emit(Inst::mov_r_r(true, reg, bigger_shift_by_gpr)); let is_64 = shift_by_ty == types::I64; @@ -1254,7 +1262,7 @@ fn lower_insn_to_regs>( bigger_shift_by_gpr, )); - let bigger_shift_by_xmm = ctx.alloc_tmp(RegClass::V128, dst_ty); + let bigger_shift_by_xmm = ctx.alloc_tmp(dst_ty).only_reg().unwrap(); ctx.emit(Inst::gpr_to_xmm( SseOpcode::Movd, RegMem::from(bigger_shift_by_gpr), @@ -1276,7 +1284,7 @@ fn lower_insn_to_regs>( )); // Unpack and shift the upper lanes of `src` into a temporary register, `upper_lanes`. - let upper_lanes = ctx.alloc_tmp(RegClass::V128, dst_ty); + let upper_lanes = ctx.alloc_tmp(dst_ty).only_reg().unwrap(); ctx.emit(Inst::gen_move(upper_lanes, src, dst_ty)); ctx.emit(Inst::xmm_rm_r( SseOpcode::Punpckhbw, @@ -1302,13 +1310,13 @@ fn lower_insn_to_regs>( // alternate lowering here). To remedy this, we extract each 64-bit lane to a GPR, shift each using a // scalar instruction, and insert the shifted values back in the `dst` XMM register. let src = put_input_in_reg(ctx, inputs[0]); - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); ctx.emit(Inst::gen_move(dst, src, dst_ty)); // Extract the upper and lower lanes into temporary GPRs. 
- let lower_lane = ctx.alloc_tmp(RegClass::I64, types::I64); + let lower_lane = ctx.alloc_tmp(types::I64).only_reg().unwrap(); emit_extract_lane(ctx, src, lower_lane, 0, types::I64); - let upper_lane = ctx.alloc_tmp(RegClass::I64, types::I64); + let upper_lane = ctx.alloc_tmp(types::I64).only_reg().unwrap(); emit_extract_lane(ctx, src, upper_lane, 1, types::I64); // Shift each value. @@ -1337,7 +1345,7 @@ fn lower_insn_to_regs>( // - shift using a dynamic value given in the lower bits of another XMM register. let src = put_input_in_reg(ctx, inputs[0]); let shift_by = input_to_reg_mem_imm(ctx, inputs[1]); - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let sse_op = match dst_ty { types::I16X8 => match op { Opcode::Ishl => SseOpcode::Psllw, @@ -1363,7 +1371,7 @@ fn lower_insn_to_regs>( let shift_by = match shift_by { RegMemImm::Imm { .. } => shift_by, RegMemImm::Reg { reg } => { - let tmp_shift_by = ctx.alloc_tmp(RegClass::V128, dst_ty); + let tmp_shift_by = ctx.alloc_tmp(dst_ty).only_reg().unwrap(); ctx.emit(Inst::gpr_to_xmm( SseOpcode::Movd, RegMem::reg(reg), @@ -1383,7 +1391,7 @@ fn lower_insn_to_regs>( } Opcode::Ineg => { - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let ty = ty.unwrap(); if ty.is_vector() { @@ -1391,7 +1399,7 @@ fn lower_insn_to_regs>( // of the input from the register. let src = input_to_reg_mem(ctx, inputs[0]); - let tmp = ctx.alloc_tmp(RegClass::V128, types::I32X4); + let tmp = ctx.alloc_tmp(types::I32X4).only_reg().unwrap(); let subtract_opcode = match ty { types::I8X16 => SseOpcode::Psubb, @@ -1443,9 +1451,9 @@ fn lower_insn_to_regs>( } else { input_to_reg_mem(ctx, inputs[0]) }; - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - let tmp = ctx.alloc_tmp(RegClass::I64, ty); + let tmp = ctx.alloc_tmp(ty).only_reg().unwrap(); ctx.emit(Inst::imm( OperandSize::from_bytes(ty.bytes()), u64::max_value(), @@ -1492,9 +1500,9 @@ fn lower_insn_to_regs>( debug_assert!(ty == types::I32 || ty == types::I64); let src = input_to_reg_mem(ctx, inputs[0]); - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - let tmp = ctx.alloc_tmp(RegClass::I64, ty); + let tmp = ctx.alloc_tmp(ty).only_reg().unwrap(); ctx.emit(Inst::imm(OperandSize::Size32, ty.bits() as u64, tmp)); ctx.emit(Inst::unary_rm_r( @@ -1529,14 +1537,14 @@ fn lower_insn_to_regs>( // it won't for i32.popcnt), and we don't want a larger than necessary load. 
RegMem::reg(put_input_in_reg(ctx, inputs[0])) }; - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); if ty == types::I64 { let is_64 = true; - let tmp1 = ctx.alloc_tmp(RegClass::I64, types::I64); - let tmp2 = ctx.alloc_tmp(RegClass::I64, types::I64); - let cst = ctx.alloc_tmp(RegClass::I64, types::I64); + let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let cst = ctx.alloc_tmp(types::I64).only_reg().unwrap(); // mov src, tmp1 ctx.emit(Inst::mov64_rm_r(src.clone(), tmp1)); @@ -1666,8 +1674,8 @@ fn lower_insn_to_regs>( assert_eq!(ty, types::I32); let is_64 = false; - let tmp1 = ctx.alloc_tmp(RegClass::I64, types::I64); - let tmp2 = ctx.alloc_tmp(RegClass::I64, types::I64); + let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); // mov src, tmp1 ctx.emit(Inst::mov64_rm_r(src.clone(), tmp1)); @@ -1792,7 +1800,7 @@ fn lower_insn_to_regs>( // represented by the constant value -1. See `define_reftypes()` in // `meta/src/isa/x86/encodings.rs` to confirm. let src = put_input_in_reg(ctx, inputs[0]); - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let ty = ctx.input_ty(insn, 0); let imm = match op { Opcode::IsNull => { @@ -1850,14 +1858,14 @@ fn lower_insn_to_regs>( ], ) { let src = put_input_in_reg(ctx, inputs[0]); - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); ctx.emit(Inst::gen_move(dst, src, types::I64)); return Ok(()); } } let src = input_to_reg_mem(ctx, inputs[0]); - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let ext_mode = ExtMode::new(src_ty.bits(), dst_ty.bits()); assert_eq!( @@ -1881,7 +1889,7 @@ fn lower_insn_to_regs>( Opcode::Icmp => { let condcode = ctx.data(insn).cond_code().unwrap(); - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let ty = ctx.input_ty(insn, 0); if !ty.is_vector() { emit_cmp(ctx, insn); @@ -1964,7 +1972,7 @@ fn lower_insn_to_regs>( IntCC::NotEqual => { ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst)); // Emit all 1s into the `tmp` register. - let tmp = ctx.alloc_tmp(RegClass::V128, ty); + let tmp = ctx.alloc_tmp(ty).only_reg().unwrap(); ctx.emit(Inst::xmm_rm_r(eq(ty), RegMem::from(tmp), tmp)); // Invert the result of the `PCMPEQ*`. ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), dst)); @@ -1980,7 +1988,7 @@ fn lower_insn_to_regs>( ctx.emit(Inst::xmm_rm_r(maxu(ty), input.clone(), dst)); ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst)); // Emit all 1s into the `tmp` register. - let tmp = ctx.alloc_tmp(RegClass::V128, ty); + let tmp = ctx.alloc_tmp(ty).only_reg().unwrap(); ctx.emit(Inst::xmm_rm_r(eq(ty), RegMem::from(tmp), tmp)); // Invert the result of the `PCMPEQ*`. ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), dst)); @@ -2013,14 +2021,14 @@ fn lower_insn_to_regs>( // set, then both the ZF and CF flag bits must also be set we can get away with using // one setcc for most condition codes. 
- let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); match emit_fcmp(ctx, insn, cond_code, FcmpSpec::Normal) { FcmpCondResult::Condition(cc) => { ctx.emit(Inst::setcc(cc, dst)); } FcmpCondResult::AndConditions(cc1, cc2) => { - let tmp = ctx.alloc_tmp(RegClass::I64, types::I32); + let tmp = ctx.alloc_tmp(types::I32).only_reg().unwrap(); ctx.emit(Inst::setcc(cc1, tmp)); ctx.emit(Inst::setcc(cc2, dst)); ctx.emit(Inst::alu_rmi_r( @@ -2031,7 +2039,7 @@ fn lower_insn_to_regs>( )); } FcmpCondResult::OrConditions(cc1, cc2) => { - let tmp = ctx.alloc_tmp(RegClass::I64, types::I32); + let tmp = ctx.alloc_tmp(types::I32).only_reg().unwrap(); ctx.emit(Inst::setcc(cc1, tmp)); ctx.emit(Inst::setcc(cc2, dst)); ctx.emit(Inst::alu_rmi_r( @@ -2081,7 +2089,7 @@ fn lower_insn_to_regs>( // Move the `lhs` to the same register as `dst`; this may not emit an actual move // but ensures that the registers are the same to match x86's read-write operand // encoding. - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); ctx.emit(Inst::gen_move(dst, lhs, input_ty)); // Emit the comparison. @@ -2094,7 +2102,7 @@ fn lower_insn_to_regs>( let src_reg = put_input_in_reg(ctx, inputs[i]); let retval_reg = ctx.retval(i); let ty = ctx.input_ty(insn, i); - ctx.emit(Inst::gen_move(retval_reg, src_reg, ty)); + ctx.emit(Inst::gen_move(retval_reg.only_reg().unwrap(), src_reg, ty)); } // N.B.: the Ret itself is generated by the ABI. } @@ -2131,12 +2139,12 @@ fn lower_insn_to_regs>( assert_eq!(inputs.len(), abi.num_args()); for (i, input) in inputs.iter().enumerate() { let arg_reg = put_input_in_reg(ctx, *input); - abi.emit_copy_reg_to_arg(ctx, i, arg_reg); + abi.emit_copy_regs_to_arg(ctx, i, ValueRegs::one(arg_reg)); } abi.emit_call(ctx); for (i, output) in outputs.iter().enumerate() { - let retval_reg = get_output_reg(ctx, *output); - abi.emit_copy_retval_to_reg(ctx, i, retval_reg); + let retval_reg = get_output_reg(ctx, *output).only_reg().unwrap(); + abi.emit_copy_retval_to_regs(ctx, i, ValueRegs::one(retval_reg)); } abi.emit_stack_post_adjust(ctx); } @@ -2183,8 +2191,8 @@ fn lower_insn_to_regs>( FcmpCondResult::AndConditions(cc1, cc2) => { // A bit unfortunate, but materialize the flags in their own register, and // check against this. - let tmp = ctx.alloc_tmp(RegClass::I64, types::I32); - let tmp2 = ctx.alloc_tmp(RegClass::I64, types::I32); + let tmp = ctx.alloc_tmp(types::I32).only_reg().unwrap(); + let tmp2 = ctx.alloc_tmp(types::I32).only_reg().unwrap(); ctx.emit(Inst::setcc(cc1, tmp)); ctx.emit(Inst::setcc(cc2, tmp2)); ctx.emit(Inst::alu_rmi_r( @@ -2211,8 +2219,8 @@ fn lower_insn_to_regs>( // TODO use cmpeqpd for all 1s. let value = ctx.get_constant(insn).unwrap(); let dst = get_output_reg(ctx, outputs[0]); - for inst in Inst::gen_constant(dst, value, types::F64, |reg_class, ty| { - ctx.alloc_tmp(reg_class, ty) + for inst in Inst::gen_constant(dst, value as u128, types::F64, |ty| { + ctx.alloc_tmp(ty).only_reg().unwrap() }) { ctx.emit(inst); } @@ -2222,8 +2230,8 @@ fn lower_insn_to_regs>( // TODO use cmpeqps for all 1s. 
let value = ctx.get_constant(insn).unwrap(); let dst = get_output_reg(ctx, outputs[0]); - for inst in Inst::gen_constant(dst, value, types::F32, |reg_class, ty| { - ctx.alloc_tmp(reg_class, ty) + for inst in Inst::gen_constant(dst, value as u128, types::F32, |ty| { + ctx.alloc_tmp(ty).only_reg().unwrap() }) { ctx.emit(inst); } @@ -2232,7 +2240,7 @@ fn lower_insn_to_regs>( Opcode::WideningPairwiseDotProductS => { let lhs = put_input_in_reg(ctx, inputs[0]); let rhs = input_to_reg_mem(ctx, inputs[1]); - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let ty = ty.unwrap(); ctx.emit(Inst::gen_move(dst, lhs, ty)); @@ -2250,7 +2258,7 @@ fn lower_insn_to_regs>( Opcode::Fadd | Opcode::Fsub | Opcode::Fmul | Opcode::Fdiv => { let lhs = put_input_in_reg(ctx, inputs[0]); let rhs = input_to_reg_mem(ctx, inputs[1]); - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let ty = ty.unwrap(); // Move the `lhs` to the same register as `dst`; this may not emit an actual move @@ -2301,7 +2309,7 @@ fn lower_insn_to_regs>( Opcode::Fmin | Opcode::Fmax => { let lhs = put_input_in_reg(ctx, inputs[0]); let rhs = put_input_in_reg(ctx, inputs[1]); - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let is_min = op == Opcode::Fmin; let output_ty = ty.unwrap(); ctx.emit(Inst::gen_move(dst, rhs, output_ty)); @@ -2380,7 +2388,7 @@ fn lower_insn_to_regs>( }; // Copy lhs into tmp - let tmp_xmm1 = ctx.alloc_tmp(RegClass::V128, output_ty); + let tmp_xmm1 = ctx.alloc_tmp(output_ty).only_reg().unwrap(); ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(lhs), tmp_xmm1)); // Perform min in reverse direction @@ -2457,7 +2465,7 @@ fn lower_insn_to_regs>( }; // Copy lhs into tmp. - let tmp_xmm1 = ctx.alloc_tmp(RegClass::V128, types::F32); + let tmp_xmm1 = ctx.alloc_tmp(types::F32).only_reg().unwrap(); ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(lhs), tmp_xmm1)); // Perform max in reverse direction. 
@@ -2508,7 +2516,7 @@ fn lower_insn_to_regs>( Opcode::FminPseudo | Opcode::FmaxPseudo => { let lhs = input_to_reg_mem(ctx, inputs[0]); let rhs = put_input_in_reg(ctx, inputs[1]); - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let ty = ty.unwrap(); ctx.emit(Inst::gen_move(dst, rhs, ty)); let sse_opcode = match (ty, op) { @@ -2523,7 +2531,7 @@ fn lower_insn_to_regs>( Opcode::Sqrt => { let src = input_to_reg_mem(ctx, inputs[0]); - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let ty = ty.unwrap(); let sse_op = match ty { @@ -2542,13 +2550,13 @@ fn lower_insn_to_regs>( Opcode::Fpromote => { let src = input_to_reg_mem(ctx, inputs[0]); - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); ctx.emit(Inst::xmm_unary_rm_r(SseOpcode::Cvtss2sd, src, dst)); } Opcode::Fdemote => { let src = input_to_reg_mem(ctx, inputs[0]); - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); ctx.emit(Inst::xmm_unary_rm_r(SseOpcode::Cvtsd2ss, src, dst)); } @@ -2573,12 +2581,12 @@ fn lower_insn_to_regs>( assert_eq!(output_ty, types::F64); SseOpcode::Cvtsi2sd }; - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); ctx.emit(Inst::gpr_to_xmm(opcode, src, src_size, dst)); } else { let ty = ty.unwrap(); let src = put_input_in_reg(ctx, inputs[0]); - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let opcode = match ctx.input_ty(insn, 0) { types::I32X4 => SseOpcode::Cvtdq2ps, _ => { @@ -2591,7 +2599,7 @@ fn lower_insn_to_regs>( } Opcode::FcvtFromUint => { - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let ty = ty.unwrap(); let input_ty = ctx.input_ty(insn, 0); @@ -2618,11 +2626,11 @@ fn lower_insn_to_regs>( types::I64 => { let src = put_input_in_reg(ctx, inputs[0]); - let src_copy = ctx.alloc_tmp(RegClass::I64, types::I64); + let src_copy = ctx.alloc_tmp(types::I64).only_reg().unwrap(); ctx.emit(Inst::gen_move(src_copy, src, types::I64)); - let tmp_gpr1 = ctx.alloc_tmp(RegClass::I64, types::I64); - let tmp_gpr2 = ctx.alloc_tmp(RegClass::I64, types::I64); + let tmp_gpr1 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let tmp_gpr2 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); ctx.emit(Inst::cvt_u64_to_float_seq( ty == types::F64, src_copy, @@ -2656,10 +2664,10 @@ fn lower_insn_to_regs>( assert_eq!(ctx.input_ty(insn, 0), types::I32X4); let src = put_input_in_reg(ctx, inputs[0]); - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); // Create a temporary register - let tmp = ctx.alloc_tmp(RegClass::V128, types::I32X4); + let tmp = ctx.alloc_tmp(types::I32X4).only_reg().unwrap(); ctx.emit(Inst::xmm_unary_rm_r( SseOpcode::Movapd, RegMem::reg(src), @@ -2697,7 +2705,7 @@ fn lower_insn_to_regs>( Opcode::FcvtToUint | Opcode::FcvtToUintSat | Opcode::FcvtToSint | Opcode::FcvtToSintSat => { let src = put_input_in_reg(ctx, inputs[0]); - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let input_ty = ctx.input_ty(insn, 0); if !input_ty.is_vector() { @@ -2719,11 +2727,11 @@ fn lower_insn_to_regs>( let to_signed = op == Opcode::FcvtToSint || op == Opcode::FcvtToSintSat; let is_sat = op == Opcode::FcvtToUintSat || op 
== Opcode::FcvtToSintSat; - let src_copy = ctx.alloc_tmp(RegClass::V128, input_ty); + let src_copy = ctx.alloc_tmp(input_ty).only_reg().unwrap(); ctx.emit(Inst::gen_move(src_copy, src, input_ty)); - let tmp_xmm = ctx.alloc_tmp(RegClass::V128, input_ty); - let tmp_gpr = ctx.alloc_tmp(RegClass::I64, output_ty); + let tmp_xmm = ctx.alloc_tmp(input_ty).only_reg().unwrap(); + let tmp_gpr = ctx.alloc_tmp(output_ty).only_reg().unwrap(); if to_signed { ctx.emit(Inst::cvt_float_to_sint_seq( @@ -2738,7 +2746,7 @@ fn lower_insn_to_regs>( if op == Opcode::FcvtToSintSat { // Sets destination to zero if float is NaN assert_eq!(types::F32X4, ctx.input_ty(insn, 0)); - let tmp = ctx.alloc_tmp(RegClass::V128, types::I32X4); + let tmp = ctx.alloc_tmp(types::I32X4).only_reg().unwrap(); ctx.emit(Inst::xmm_unary_rm_r( SseOpcode::Movapd, RegMem::reg(src), @@ -2843,8 +2851,8 @@ fn lower_insn_to_regs>( // Create temporaries assert_eq!(types::F32X4, ctx.input_ty(insn, 0)); - let tmp1 = ctx.alloc_tmp(RegClass::V128, types::I32X4); - let tmp2 = ctx.alloc_tmp(RegClass::V128, types::I32X4); + let tmp1 = ctx.alloc_tmp(types::I32X4).only_reg().unwrap(); + let tmp2 = ctx.alloc_tmp(types::I32X4).only_reg().unwrap(); // Converting to unsigned int so if float src is negative or NaN // will first set to zero. @@ -2917,7 +2925,7 @@ fn lower_insn_to_regs>( let input_ty = ctx.input_ty(insn, 0); let output_ty = ctx.output_ty(insn, 0); let src = put_input_in_reg(ctx, inputs[0]); - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); if output_ty.is_vector() { match op { Opcode::SwidenLow => match (input_ty, output_ty) { @@ -2999,7 +3007,7 @@ fn lower_insn_to_regs>( let output_ty = ctx.output_ty(insn, 0); let src1 = put_input_in_reg(ctx, inputs[0]); let src2 = put_input_in_reg(ctx, inputs[1]); - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); if output_ty.is_vector() { match op { Opcode::Snarrow => match (input_ty, output_ty) { @@ -3036,7 +3044,7 @@ fn lower_insn_to_regs>( match (input_ty, output_ty) { (types::F32, types::I32) => { let src = put_input_in_reg(ctx, inputs[0]); - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); ctx.emit(Inst::xmm_to_gpr( SseOpcode::Movd, src, @@ -3046,7 +3054,7 @@ fn lower_insn_to_regs>( } (types::I32, types::F32) => { let src = input_to_reg_mem(ctx, inputs[0]); - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); ctx.emit(Inst::gpr_to_xmm( SseOpcode::Movd, src, @@ -3056,7 +3064,7 @@ fn lower_insn_to_regs>( } (types::F64, types::I64) => { let src = put_input_in_reg(ctx, inputs[0]); - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); ctx.emit(Inst::xmm_to_gpr( SseOpcode::Movq, src, @@ -3066,7 +3074,7 @@ fn lower_insn_to_regs>( } (types::I64, types::F64) => { let src = input_to_reg_mem(ctx, inputs[0]); - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); ctx.emit(Inst::gpr_to_xmm( SseOpcode::Movq, src, @@ -3080,7 +3088,7 @@ fn lower_insn_to_regs>( Opcode::Fabs | Opcode::Fneg => { let src = input_to_reg_mem(ctx, inputs[0]); - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); // In both cases, generate a constant and apply a single binary instruction: // - to compute the absolute value, set all bits to 1 but 
the MSB to 0, and bit-AND the @@ -3089,7 +3097,7 @@ fn lower_insn_to_regs>( // src with it. let output_ty = ty.unwrap(); if !output_ty.is_vector() { - let (val, opcode) = match output_ty { + let (val, opcode): (u64, _) = match output_ty { types::F32 => match op { Opcode::Fabs => (0x7fffffff, SseOpcode::Andps), Opcode::Fneg => (0x80000000, SseOpcode::Xorps), @@ -3103,8 +3111,8 @@ fn lower_insn_to_regs>( _ => panic!("unexpected type {:?} for Fabs", output_ty), }; - for inst in Inst::gen_constant(dst, val, output_ty, |reg_class, ty| { - ctx.alloc_tmp(reg_class, ty) + for inst in Inst::gen_constant(ValueRegs::one(dst), val as u128, output_ty, |ty| { + ctx.alloc_tmp(ty).only_reg().unwrap() }) { ctx.emit(inst); } @@ -3122,7 +3130,7 @@ fn lower_insn_to_regs>( // Generate an all 1s constant in an XMM register. This uses CMPPS but could // have used CMPPD with the same effect. - let tmp = ctx.alloc_tmp(RegClass::V128, output_ty); + let tmp = ctx.alloc_tmp(output_ty).only_reg().unwrap(); let cond = FcmpImm::from(FloatCC::Equal); let cmpps = Inst::xmm_rm_r_imm( SseOpcode::Cmpps, @@ -3158,7 +3166,7 @@ fn lower_insn_to_regs>( } Opcode::Fcopysign => { - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let lhs = put_input_in_reg(ctx, inputs[0]); let rhs = put_input_in_reg(ctx, inputs[1]); @@ -3174,8 +3182,8 @@ fn lower_insn_to_regs>( // andp{s,d} tmp_xmm1, tmp_xmm2 // orp{s,d} tmp_xmm2, dst - let tmp_xmm1 = ctx.alloc_tmp(RegClass::V128, types::F32); - let tmp_xmm2 = ctx.alloc_tmp(RegClass::V128, types::F32); + let tmp_xmm1 = ctx.alloc_tmp(types::F32).only_reg().unwrap(); + let tmp_xmm2 = ctx.alloc_tmp(types::F32).only_reg().unwrap(); let (sign_bit_cst, mov_op, and_not_op, and_op, or_op) = match ty { types::F32 => ( @@ -3197,8 +3205,8 @@ fn lower_insn_to_regs>( } }; - for inst in Inst::gen_constant(tmp_xmm1, sign_bit_cst, ty, |reg_class, ty| { - ctx.alloc_tmp(reg_class, ty) + for inst in Inst::gen_constant(ValueRegs::one(tmp_xmm1), sign_bit_cst, ty, |ty| { + ctx.alloc_tmp(ty).only_reg().unwrap() }) { ctx.emit(inst); } @@ -3247,7 +3255,7 @@ fn lower_insn_to_regs>( _ => panic!("Unknown op/ty combination (vector){:?}", ty), }; let src = put_input_in_reg(ctx, inputs[0]); - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); ctx.emit(Inst::gen_move(dst, src, ty)); ctx.emit(Inst::xmm_rm_r_imm( op, @@ -3372,7 +3380,7 @@ fn lower_insn_to_regs>( _ => unreachable!(), }; - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let is_xmm = elem_ty.is_float() || elem_ty.is_vector(); match (sign_extend, is_xmm) { @@ -3494,7 +3502,7 @@ fn lower_insn_to_regs>( // use the single instruction `lock xadd`. However, those improvements have been // left for another day. // TODO: filed as https://github.com/bytecodealliance/wasmtime/issues/2153 - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let mut addr = put_input_in_reg(ctx, inputs[0]); let mut arg2 = put_input_in_reg(ctx, inputs[1]); let ty_access = ty.unwrap(); @@ -3531,7 +3539,7 @@ fn lower_insn_to_regs>( Opcode::AtomicCas => { // This is very similar to, but not identical to, the `AtomicRmw` case. As with // `AtomicRmw`, there's no need to zero-extend narrow values here. 
- let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let addr = lower_to_amode(ctx, inputs[0], 0); let expected = put_input_in_reg(ctx, inputs[1]); let replacement = put_input_in_reg(ctx, inputs[2]); @@ -3559,7 +3567,7 @@ fn lower_insn_to_regs>( // This is a normal load. The x86-TSO memory model provides sufficient sequencing // to satisfy the CLIF synchronisation requirements for `AtomicLoad` without the // need for any fence instructions. - let data = get_output_reg(ctx, outputs[0]); + let data = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let addr = lower_to_amode(ctx, inputs[0], 0); let ty_access = ty.unwrap(); assert!(is_valid_atomic_transaction_ty(ty_access)); @@ -3597,7 +3605,7 @@ fn lower_insn_to_regs>( } Opcode::FuncAddr => { - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let (extname, _) = ctx.call_target(insn).unwrap(); let extname = extname.clone(); ctx.emit(Inst::LoadExtName { @@ -3608,7 +3616,7 @@ fn lower_insn_to_regs>( } Opcode::SymbolValue => { - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let (extname, _, offset) = ctx.symbol_value(insn).unwrap(); let extname = extname.clone(); ctx.emit(Inst::LoadExtName { @@ -3627,7 +3635,7 @@ fn lower_insn_to_regs>( } => (stack_slot, offset), _ => unreachable!(), }; - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let offset: i32 = offset.into(); let inst = ctx .abi() @@ -3649,7 +3657,7 @@ fn lower_insn_to_regs>( let ty = ctx.output_ty(insn, 0); let rhs = put_input_in_reg(ctx, rhs_input); - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let lhs = if is_int_or_ref_ty(ty) && ty.bytes() < 4 { // Special case: since the higher bits are undefined per CLIF semantics, we // can just apply a 32-bit cmove here. Force inputs into registers, to @@ -3718,7 +3726,7 @@ fn lower_insn_to_regs>( }; let rhs = put_input_in_reg(ctx, inputs[2]); - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let cc = if let Some(icmp) = matches_input(ctx, flag_input, Opcode::Icmp) { emit_cmp(ctx, icmp); @@ -3762,7 +3770,7 @@ fn lower_insn_to_regs>( Opcode::Selectif | Opcode::SelectifSpectreGuard => { let lhs = input_to_reg_mem(ctx, inputs[1]); let rhs = put_input_in_reg(ctx, inputs[2]); - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let ty = ctx.output_ty(insn, 0); // Verification ensures that the input is always a single-def ifcmp. @@ -3780,7 +3788,7 @@ fn lower_insn_to_regs>( let size = ty.bytes() as u8; if size == 1 { // Sign-extend operands to 32, then do a cmove of size 4. - let lhs_se = ctx.alloc_tmp(RegClass::I64, types::I32); + let lhs_se = ctx.alloc_tmp(types::I32).only_reg().unwrap(); ctx.emit(Inst::movsx_rm_r(ExtMode::BL, lhs, lhs_se)); ctx.emit(Inst::movsx_rm_r(ExtMode::BL, RegMem::reg(rhs), dst)); ctx.emit(Inst::cmove(4, cc, RegMem::reg(lhs_se.to_reg()), dst)); @@ -3809,7 +3817,7 @@ fn lower_insn_to_regs>( let size = input_ty.bytes() as u8; let dividend = put_input_in_reg(ctx, inputs[0]); - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); ctx.emit(Inst::gen_move( Writable::from_reg(regs::rax()), @@ -3827,11 +3835,11 @@ fn lower_insn_to_regs>( // destination register. 
let divisor = put_input_in_reg(ctx, inputs[1]); - let divisor_copy = ctx.alloc_tmp(RegClass::I64, types::I64); + let divisor_copy = ctx.alloc_tmp(types::I64).only_reg().unwrap(); ctx.emit(Inst::gen_move(divisor_copy, divisor, types::I64)); let tmp = if op == Opcode::Sdiv && size == 8 { - Some(ctx.alloc_tmp(RegClass::I64, types::I64)) + Some(ctx.alloc_tmp(types::I64).only_reg().unwrap()) } else { None }; @@ -3885,7 +3893,7 @@ fn lower_insn_to_regs>( let lhs = put_input_in_reg(ctx, inputs[0]); let rhs = input_to_reg_mem(ctx, inputs[1]); - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); // Move lhs in %rax. ctx.emit(Inst::gen_move( @@ -3903,7 +3911,7 @@ fn lower_insn_to_regs>( } Opcode::GetPinnedReg => { - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); ctx.emit(Inst::gen_move(dst, regs::pinned_reg(), types::I64)); } @@ -3929,7 +3937,7 @@ fn lower_insn_to_regs>( unreachable!("vconst should always have unary_const format") }; // TODO use Inst::gen_constant() instead. - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let ty = ty.unwrap(); ctx.emit(Inst::xmm_load_const(used_constant, dst, ty)); } @@ -3940,14 +3948,14 @@ fn lower_insn_to_regs>( // instruction should emit no machine code but a move is necessary to give the register // allocator a definition for the output virtual register. let src = put_input_in_reg(ctx, inputs[0]); - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let ty = ty.unwrap(); ctx.emit(Inst::gen_move(dst, src, ty)); } Opcode::Shuffle => { let ty = ty.unwrap(); - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let lhs_ty = ctx.input_ty(insn, 0); let lhs = put_input_in_reg(ctx, inputs[0]); let rhs = put_input_in_reg(ctx, inputs[1]); @@ -3973,7 +3981,7 @@ fn lower_insn_to_regs>( .map(zero_unknown_lane_index) .collect(); let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask)); - let tmp = ctx.alloc_tmp(RegClass::V128, types::I8X16); + let tmp = ctx.alloc_tmp(types::I8X16).only_reg().unwrap(); ctx.emit(Inst::xmm_load_const(constant, tmp, ty)); // After loading the constructed mask in a temporary register, we use this to // shuffle the `dst` register (remember that, in this case, it is the same as @@ -3985,11 +3993,11 @@ fn lower_insn_to_regs>( // we build the `constructed_mask` for each case statically. // PSHUFB the `lhs` argument into `tmp0`, placing zeroes for unused lanes. 
- let tmp0 = ctx.alloc_tmp(RegClass::V128, lhs_ty); + let tmp0 = ctx.alloc_tmp(lhs_ty).only_reg().unwrap(); ctx.emit(Inst::gen_move(tmp0, lhs, lhs_ty)); let constructed_mask = mask.iter().cloned().map(zero_unknown_lane_index).collect(); let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask)); - let tmp1 = ctx.alloc_tmp(RegClass::V128, types::I8X16); + let tmp1 = ctx.alloc_tmp(types::I8X16).only_reg().unwrap(); ctx.emit(Inst::xmm_load_const(constant, tmp1, ty)); ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp1), tmp0)); @@ -4000,7 +4008,7 @@ fn lower_insn_to_regs>( .map(zero_unknown_lane_index) .collect(); let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask)); - let tmp2 = ctx.alloc_tmp(RegClass::V128, types::I8X16); + let tmp2 = ctx.alloc_tmp(types::I8X16).only_reg().unwrap(); ctx.emit(Inst::xmm_load_const(constant, tmp2, ty)); ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp2), dst)); @@ -4019,7 +4027,7 @@ fn lower_insn_to_regs>( // semantics match the Wasm SIMD semantics for this instruction. // The instruction format maps to variables like: %dst = swizzle %src, %mask let ty = ty.unwrap(); - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let src = put_input_in_reg(ctx, inputs[0]); let swizzle_mask = put_input_in_reg(ctx, inputs[1]); @@ -4027,7 +4035,7 @@ fn lower_insn_to_regs>( ctx.emit(Inst::gen_move(dst, src, ty)); // Create a mask for zeroing out-of-bounds lanes of the swizzle mask. - let zero_mask = ctx.alloc_tmp(RegClass::V128, types::I8X16); + let zero_mask = ctx.alloc_tmp(types::I8X16).only_reg().unwrap(); static ZERO_MASK_VALUE: [u8; 16] = [ 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, @@ -4054,7 +4062,7 @@ fn lower_insn_to_regs>( Opcode::Insertlane => { // The instruction format maps to variables like: %dst = insertlane %in_vec, %src, %lane let ty = ty.unwrap(); - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let in_vec = put_input_in_reg(ctx, inputs[0]); let src_ty = ctx.input_ty(insn, 1); debug_assert!(!src_ty.is_vector()); @@ -4073,7 +4081,7 @@ fn lower_insn_to_regs>( Opcode::Extractlane => { // The instruction format maps to variables like: %dst = extractlane %src, %lane let ty = ty.unwrap(); - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let src_ty = ctx.input_ty(insn, 0); assert_eq!(src_ty.bits(), 128); let src = put_input_in_reg(ctx, inputs[0]); @@ -4094,7 +4102,7 @@ fn lower_insn_to_regs>( assert!(src_ty.bits() < 128); let src = input_to_reg_mem(ctx, inputs[0]); - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); // We know that splat will overwrite all of the lanes of `dst` but it takes several // instructions to do so. Because of the multiple instructions, there is no good way to @@ -4107,7 +4115,7 @@ fn lower_insn_to_regs>( 8 => { emit_insert_lane(ctx, src, dst, 0, ty.lane_type()); // Initialize a register with all 0s. - let tmp = ctx.alloc_tmp(RegClass::V128, ty); + let tmp = ctx.alloc_tmp(ty).only_reg().unwrap(); ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), tmp)); // Shuffle the lowest byte lane to all other lanes. 
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp), dst)) @@ -4144,7 +4152,7 @@ fn lower_insn_to_regs>( } Opcode::VanyTrue => { - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let src_ty = ctx.input_ty(insn, 0); assert_eq!(src_ty.bits(), 128); let src = put_input_in_reg(ctx, inputs[0]); @@ -4155,8 +4163,7 @@ fn lower_insn_to_regs>( } Opcode::VallTrue => { - let ty = ty.unwrap(); - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let src_ty = ctx.input_ty(insn, 0); assert_eq!(src_ty.bits(), 128); let src = input_to_reg_mem(ctx, inputs[0]); @@ -4170,7 +4177,7 @@ fn lower_insn_to_regs>( }; // Initialize a register with all 0s. - let tmp = ctx.alloc_tmp(RegClass::V128, ty); + let tmp = ctx.alloc_tmp(src_ty).only_reg().unwrap(); ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), tmp)); // Compare to see what lanes are filled with all 1s. ctx.emit(Inst::xmm_rm_r(eq(src_ty), src, tmp)); @@ -4188,7 +4195,7 @@ fn lower_insn_to_regs>( let src = put_input_in_reg(ctx, inputs[0]); let src_ty = ctx.input_ty(insn, 0); debug_assert!(src_ty.is_vector() && src_ty.bits() == 128); - let dst = get_output_reg(ctx, outputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); debug_assert!(dst.to_reg().get_class() == RegClass::I64); // The Intel specification allows using both 32-bit and 64-bit GPRs as destination for @@ -4216,7 +4223,7 @@ fn lower_insn_to_regs>( // PACKSSWB([x1, x2, ...], [x1, x2, ...]) = [x1', x2', ..., x1', x2', ...] // - use PMOVMSKB to gather the high bits; now we have duplicates, though // - shift away the bottom 8 high bits to remove the duplicates. - let tmp = ctx.alloc_tmp(RegClass::V128, src_ty); + let tmp = ctx.alloc_tmp(src_ty).only_reg().unwrap(); ctx.emit(Inst::gen_move(tmp, src, src_ty)); ctx.emit(Inst::xmm_rm_r(SseOpcode::Packsswb, RegMem::reg(src), tmp)); ctx.emit(Inst::xmm_to_gpr( @@ -4515,12 +4522,12 @@ impl LowerBackend for X64Backend { // worse.) // This temporary is used as a signed integer of 64-bits (to hold addresses). - let tmp1 = ctx.alloc_tmp(RegClass::I64, types::I64); + let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); // This temporary is used as a signed integer of 32-bits (for the wasm-table // index) and then 64-bits (address addend). The small lie about the I64 type // is benign, since the temporary is dead after this instruction (and its // Cranelift type is thus unused). - let tmp2 = ctx.alloc_tmp(RegClass::I64, types::I64); + let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); let targets_for_term: Vec = targets.to_vec(); let default_target = targets[0]; diff --git a/cranelift/codegen/src/machinst/abi.rs b/cranelift/codegen/src/machinst/abi.rs index c72a81dcc2..59738bd3a5 100644 --- a/cranelift/codegen/src/machinst/abi.rs +++ b/cranelift/codegen/src/machinst/abi.rs @@ -5,8 +5,12 @@ use crate::ir::StackSlot; use crate::isa::CallConv; use crate::machinst::*; use crate::settings; - use regalloc::{Reg, Set, SpillSlot, Writable}; +use smallvec::SmallVec; + +/// A small vector of instructions (with some reasonable size); appropriate for +/// a small fixed sequence implementing one operation. +pub type SmallInstVec = SmallVec<[I; 4]>; /// Trait implemented by an object that tracks ABI-related state (e.g., stack /// layout) and can generate code while emitting the *body* of a function. @@ -14,9 +18,9 @@ pub trait ABICallee { /// The instruction type for the ISA associated with this ABI. 
type I: VCodeInst; - /// Does the ABI-body code need a temp reg? One will be provided to `init()` - /// as the `maybe_tmp` arg if so. - fn temp_needed(&self) -> bool; + /// Does the ABI-body code need a temp reg (and if so, of what type)? One + /// will be provided to `init()` as the `maybe_tmp` arg if so. + fn temp_needed(&self) -> Option; /// Initialize. This is called after the ABICallee is constructed because it /// may be provided with a temp vreg, which can only be allocated once the @@ -52,7 +56,11 @@ pub trait ABICallee { /// Generate an instruction which copies an argument to a destination /// register. - fn gen_copy_arg_to_reg(&self, idx: usize, into_reg: Writable) -> Self::I; + fn gen_copy_arg_to_regs( + &self, + idx: usize, + into_reg: ValueRegs>, + ) -> SmallInstVec; /// Is the given argument needed in the body (as opposed to, e.g., serving /// only as a special ABI-specific placeholder)? This controls whether @@ -67,7 +75,11 @@ pub trait ABICallee { fn gen_retval_area_setup(&self) -> Option; /// Generate an instruction which copies a source register to a return value slot. - fn gen_copy_reg_to_retval(&self, idx: usize, from_reg: Writable) -> Vec; + fn gen_copy_regs_to_retval( + &self, + idx: usize, + from_reg: ValueRegs>, + ) -> SmallInstVec; /// Generate a return instruction. fn gen_ret(&self) -> Self::I; @@ -99,17 +111,33 @@ pub trait ABICallee { slot: StackSlot, offset: u32, ty: Type, - into_reg: Writable, - ) -> Self::I; + into_reg: ValueRegs>, + ) -> SmallInstVec; /// Store to a stackslot. - fn store_stackslot(&self, slot: StackSlot, offset: u32, ty: Type, from_reg: Reg) -> Self::I; + fn store_stackslot( + &self, + slot: StackSlot, + offset: u32, + ty: Type, + from_reg: ValueRegs, + ) -> SmallInstVec; /// Load from a spillslot. - fn load_spillslot(&self, slot: SpillSlot, ty: Type, into_reg: Writable) -> Self::I; + fn load_spillslot( + &self, + slot: SpillSlot, + ty: Type, + into_reg: ValueRegs>, + ) -> SmallInstVec; /// Store to a spillslot. - fn store_spillslot(&self, slot: SpillSlot, ty: Type, from_reg: Reg) -> Self::I; + fn store_spillslot( + &self, + slot: SpillSlot, + ty: Type, + from_reg: ValueRegs, + ) -> SmallInstVec; /// Generate a stack map, given a list of spillslots and the emission state /// at a given program point (prior to emission fo the safepointing @@ -125,13 +153,13 @@ pub trait ABICallee { /// `store_retval`, and spillslot accesses.) `self` is mutable so that we /// can store information in it which will be useful when creating the /// epilogue. - fn gen_prologue(&mut self) -> Vec; + fn gen_prologue(&mut self) -> SmallInstVec; /// Generate an epilogue, post-regalloc. Note that this must generate the /// actual return instruction (rather than emitting this in the lowering /// logic), because the epilogue code comes before the return and the two are /// likely closely related. - fn gen_epilogue(&self) -> Vec; + fn gen_epilogue(&self) -> SmallInstVec; /// Returns the full frame size for the given function, after prologue /// emission has run. This comprises the spill slots and stack-storage slots @@ -188,19 +216,19 @@ pub trait ABICaller { fn num_args(&self) -> usize; /// Emit a copy of an argument value from a source register, prior to the call. - fn emit_copy_reg_to_arg>( + fn emit_copy_regs_to_arg>( &self, ctx: &mut C, idx: usize, - from_reg: Reg, + from_reg: ValueRegs, ); /// Emit a copy a return value into a destination register, after the call returns. 
- fn emit_copy_retval_to_reg>( + fn emit_copy_retval_to_regs>( &self, ctx: &mut C, idx: usize, - into_reg: Writable, + into_reg: ValueRegs>, ); /// Emit code to pre-adjust the stack, prior to argument copies and call. diff --git a/cranelift/codegen/src/machinst/abi_impl.rs b/cranelift/codegen/src/machinst/abi_impl.rs index 4b21c2d946..d315c3defb 100644 --- a/cranelift/codegen/src/machinst/abi_impl.rs +++ b/cranelift/codegen/src/machinst/abi_impl.rs @@ -119,6 +119,7 @@ use crate::{ir, isa}; use alloc::vec::Vec; use log::{debug, trace}; use regalloc::{RealReg, Reg, RegClass, Set, SpillSlot, Writable}; +use smallvec::{smallvec, SmallVec}; use std::convert::TryFrom; use std::marker::PhantomData; use std::mem; @@ -126,9 +127,9 @@ use std::mem; /// A location for an argument or return value. #[derive(Clone, Copy, Debug)] pub enum ABIArg { - /// In a real register. + /// In a real register (or set of registers). Reg( - RealReg, + ValueRegs, ir::Type, ir::ArgumentExtension, ir::ArgumentPurpose, @@ -183,6 +184,17 @@ pub enum StackAMode { SPOffset(i64, ir::Type), } +impl StackAMode { + /// Offset by an addend. + pub fn offset(self, addend: i64) -> Self { + match self { + StackAMode::FPOffset(off, ty) => StackAMode::FPOffset(off + addend, ty), + StackAMode::NominalSPOffset(off, ty) => StackAMode::NominalSPOffset(off + addend, ty), + StackAMode::SPOffset(off, ty) => StackAMode::SPOffset(off + addend, ty), + } + } +} + /// Trait implemented by machine-specific backend to provide information about /// register assignments and to allow generating the specific instructions for /// stack loads/saves, prologues/epilogues, etc. @@ -270,12 +282,12 @@ pub trait ABIMachineSpec { /// /// - The add-imm sequence must work correctly when `from_reg` and/or /// `into_reg` are the register returned by `get_stacklimit_reg()`. - fn gen_add_imm(into_reg: Writable, from_reg: Reg, imm: u32) -> SmallVec<[Self::I; 4]>; + fn gen_add_imm(into_reg: Writable, from_reg: Reg, imm: u32) -> SmallInstVec; /// Generate a sequence that traps with a `TrapCode::StackOverflow` code if /// the stack pointer is less than the given limit register (assuming the /// stack grows downward). - fn gen_stack_lower_bound_trap(limit_reg: Reg) -> SmallVec<[Self::I; 2]>; + fn gen_stack_lower_bound_trap(limit_reg: Reg) -> SmallInstVec; /// Generate an instruction to compute an address of a stack slot (FP- or /// SP-based offset). @@ -301,7 +313,7 @@ pub trait ABIMachineSpec { fn gen_store_base_offset(base: Reg, offset: i32, from_reg: Reg, ty: Type) -> Self::I; /// Adjust the stack pointer up or down. - fn gen_sp_reg_adjust(amount: i32) -> SmallVec<[Self::I; 2]>; + fn gen_sp_reg_adjust(amount: i32) -> SmallInstVec; /// Generate a meta-instruction that adjusts the nominal SP offset. fn gen_nominal_sp_adj(amount: i32) -> Self::I; @@ -309,13 +321,13 @@ pub trait ABIMachineSpec { /// Generate the usual frame-setup sequence for this architecture: e.g., /// `push rbp / mov rbp, rsp` on x86-64, or `stp fp, lr, [sp, #-16]!` on /// AArch64. - fn gen_prologue_frame_setup() -> SmallVec<[Self::I; 2]>; + fn gen_prologue_frame_setup() -> SmallInstVec; /// Generate the usual frame-restore sequence for this architecture. - fn gen_epilogue_frame_restore() -> SmallVec<[Self::I; 2]>; + fn gen_epilogue_frame_restore() -> SmallInstVec; /// Generate a probestack call. - fn gen_probestack(_frame_size: u32) -> SmallVec<[Self::I; 2]>; + fn gen_probestack(_frame_size: u32) -> SmallInstVec; /// Generate a clobber-save sequence. 
This takes the list of *all* registers /// written/modified by the function body. The implementation here is @@ -483,7 +495,7 @@ pub struct ABICalleeImpl { /// need to be extremely careful with each instruction. The instructions are /// manually register-allocated and carefully only use caller-saved /// registers and keep nothing live after this sequence of instructions. - stack_limit: Option<(Reg, Vec)>, + stack_limit: Option<(Reg, SmallInstVec)>, /// Are we to invoke the probestack function in the prologue? If so, /// what is the minimum size at which we must invoke it? probestack_min_frame: Option, @@ -498,7 +510,7 @@ fn get_special_purpose_param_register( ) -> Option { let idx = f.signature.special_param_index(purpose)?; match abi.args[idx] { - ABIArg::Reg(reg, ..) => Some(reg.to_reg()), + ABIArg::Reg(regs, ..) => Some(regs.only_reg().unwrap().to_reg()), ABIArg::Stack(..) => None, } } @@ -539,7 +551,7 @@ impl ABICalleeImpl { // from the arguments. let stack_limit = get_special_purpose_param_register(f, &sig, ir::ArgumentPurpose::StackLimit) - .map(|reg| (reg, Vec::new())) + .map(|reg| (reg, smallvec![])) .or_else(|| f.stack_limit.map(|gv| gen_stack_limit::(f, &sig, gv))); // Determine whether a probestack call is required for large enough @@ -596,7 +608,12 @@ impl ABICalleeImpl { /// No values can be live after the prologue, but in this case that's ok /// because we just need to perform a stack check before progressing with /// the rest of the function. - fn insert_stack_check(&self, stack_limit: Reg, stack_size: u32, insts: &mut Vec) { + fn insert_stack_check( + &self, + stack_limit: Reg, + stack_size: u32, + insts: &mut SmallInstVec, + ) { // With no explicit stack allocated we can just emit the simple check of // the stack registers against the stack limit register, and trap if // it's out of bounds. @@ -649,8 +666,8 @@ fn gen_stack_limit( f: &ir::Function, abi: &ABISig, gv: ir::GlobalValue, -) -> (Reg, Vec) { - let mut insts = Vec::new(); +) -> (Reg, SmallInstVec) { + let mut insts = smallvec![]; let reg = generate_gv::(f, abi, gv, &mut insts); return (reg, insts); } @@ -659,7 +676,7 @@ fn generate_gv( f: &ir::Function, abi: &ABISig, gv: ir::GlobalValue, - insts: &mut Vec, + insts: &mut SmallInstVec, ) -> Reg { match f.global_values[gv] { // Return the direct register the vmcontext is in @@ -709,11 +726,76 @@ fn ty_from_ty_hint_or_reg_class(r: Reg, ty: Option) -> } } +fn gen_move_multi( + dst: ValueRegs>, + src: ValueRegs, + ty: Type, +) -> SmallInstVec { + let mut ret = smallvec![]; + let (_, tys) = M::I::rc_for_type(ty).unwrap(); + for ((&dst, &src), &ty) in dst.regs().iter().zip(src.regs().iter()).zip(tys.iter()) { + ret.push(M::gen_move(dst, src, ty)); + } + ret +} + +fn gen_load_stack_multi( + from: StackAMode, + dst: ValueRegs>, + ty: Type, +) -> SmallInstVec { + let mut ret = smallvec![]; + let (_, tys) = M::I::rc_for_type(ty).unwrap(); + let mut offset = 0; + // N.B.: registers are given in the `ValueRegs` in target endian order. + for (&dst, &ty) in dst.regs().iter().zip(tys.iter()) { + ret.push(M::gen_load_stack(from.offset(offset), dst, ty)); + offset += ty.bytes() as i64; + } + ret +} + +fn gen_store_stack_multi( + from: StackAMode, + src: ValueRegs, + ty: Type, +) -> SmallInstVec { + let mut ret = smallvec![]; + let (_, tys) = M::I::rc_for_type(ty).unwrap(); + let mut offset = 0; + // N.B.: registers are given in the `ValueRegs` in target endian order. 
+ for (&src, &ty) in src.regs().iter().zip(tys.iter()) { + ret.push(M::gen_store_stack(from.offset(offset), src, ty)); + offset += ty.bytes() as i64; + } + ret +} + +fn gen_store_base_offset_multi( + base: Reg, + mut offset: i32, + src: ValueRegs, + ty: Type, +) -> SmallInstVec { + let mut ret = smallvec![]; + let (_, tys) = M::I::rc_for_type(ty).unwrap(); + // N.B.: registers are given in the `ValueRegs` in target endian order. + for (&src, &ty) in src.regs().iter().zip(tys.iter()) { + ret.push(M::gen_store_base_offset(base, offset, src, ty)); + offset += ty.bytes() as i32; + } + ret +} + impl ABICallee for ABICalleeImpl { type I = M::I; - fn temp_needed(&self) -> bool { - self.sig.stack_ret_arg.is_some() + fn temp_needed(&self) -> Option { + if self.sig.stack_ret_arg.is_some() { + Some(M::word_type()) + } else { + None + } } fn init(&mut self, maybe_tmp: Option>) { @@ -740,8 +822,10 @@ impl ABICallee for ABICalleeImpl { fn liveins(&self) -> Set { let mut set: Set = Set::empty(); for &arg in &self.sig.args { - if let ABIArg::Reg(r, ..) = arg { - set.insert(r); + if let ABIArg::Reg(regs, ..) = arg { + for &r in regs.regs() { + set.insert(r); + } } } set @@ -750,8 +834,10 @@ impl ABICallee for ABICalleeImpl { fn liveouts(&self) -> Set { let mut set: Set = Set::empty(); for &ret in &self.sig.rets { - if let ABIArg::Reg(r, ..) = ret { - set.insert(r); + if let ABIArg::Reg(regs, ..) = ret { + for &r in regs.regs() { + set.insert(r); + } } } set @@ -769,14 +855,20 @@ impl ABICallee for ABICalleeImpl { self.stackslots.len() } - fn gen_copy_arg_to_reg(&self, idx: usize, into_reg: Writable) -> Self::I { + fn gen_copy_arg_to_regs( + &self, + idx: usize, + into_regs: ValueRegs>, + ) -> SmallInstVec { match &self.sig.args[idx] { // Extension mode doesn't matter (we're copying out, not in; we // ignore high bits by convention). - &ABIArg::Reg(r, ty, ..) => M::gen_move(into_reg, r.to_reg(), ty), - &ABIArg::Stack(off, ty, ..) => M::gen_load_stack( + &ABIArg::Reg(regs, ty, ..) => { + gen_move_multi::(into_regs, regs.map(|r| r.to_reg()), ty) + } + &ABIArg::Stack(off, ty, ..) => gen_load_stack_multi::( StackAMode::FPOffset(M::fp_to_arg_offset(self.call_conv, &self.flags) + off, ty), - into_reg, + into_regs, ty, ), } @@ -792,19 +884,29 @@ impl ABICallee for ABICalleeImpl { } } - fn gen_copy_reg_to_retval(&self, idx: usize, from_reg: Writable) -> Vec { - let mut ret = Vec::new(); + fn gen_copy_regs_to_retval( + &self, + idx: usize, + from_regs: ValueRegs>, + ) -> SmallInstVec { + let mut ret = smallvec![]; let word_bits = M::word_bits() as u8; match &self.sig.rets[idx] { - &ABIArg::Reg(r, ty, ext, ..) => { + &ABIArg::Reg(regs, ty, ext, ..) 
=> { let from_bits = ty_bits(ty) as u8; - let dest_reg = Writable::from_reg(r.to_reg()); + let dest_regs = writable_value_regs(regs.map(|r| r.to_reg())); let ext = M::get_ext_mode(self.sig.call_conv, ext); match (ext, from_bits) { (ArgumentExtension::Uext, n) | (ArgumentExtension::Sext, n) if n < word_bits => { let signed = ext == ArgumentExtension::Sext; + let dest_reg = dest_regs + .only_reg() + .expect("extension only possible from one-reg value"); + let from_reg = from_regs + .only_reg() + .expect("extension only possible from one-reg value"); ret.push(M::gen_extend( dest_reg, from_reg.to_reg(), @@ -813,7 +915,10 @@ impl ABICallee for ABICalleeImpl { /* to_bits = */ word_bits, )); } - _ => ret.push(M::gen_move(dest_reg, from_reg.to_reg(), ty)), + _ => ret.extend( + gen_move_multi::(dest_regs, non_writable_value_regs(from_regs), ty) + .into_iter(), + ), }; } &ABIArg::Stack(off, mut ty, ext, ..) => { @@ -829,6 +934,9 @@ impl ABICallee for ABICalleeImpl { (ArgumentExtension::Uext, n) | (ArgumentExtension::Sext, n) if n < word_bits => { + let from_reg = from_regs + .only_reg() + .expect("extension only possible from one-reg value"); assert_eq!(M::word_reg_class(), from_reg.to_reg().get_class()); let signed = ext == ArgumentExtension::Sext; ret.push(M::gen_extend( @@ -843,12 +951,15 @@ impl ABICallee for ABICalleeImpl { } _ => {} }; - ret.push(M::gen_store_base_offset( - self.ret_area_ptr.unwrap().to_reg(), - off, - from_reg.to_reg(), - ty, - )); + ret.extend( + gen_store_base_offset_multi::( + self.ret_area_ptr.unwrap().to_reg(), + off, + non_writable_value_regs(from_regs), + ty, + ) + .into_iter(), + ); } } ret @@ -856,7 +967,8 @@ impl ABICallee for ABICalleeImpl { fn gen_retval_area_setup(&self) -> Option { if let Some(i) = self.sig.stack_ret_arg { - let inst = self.gen_copy_arg_to_reg(i, self.ret_area_ptr.unwrap()); + let insts = self.gen_copy_arg_to_regs(i, ValueRegs::one(self.ret_area_ptr.unwrap())); + let inst = insts.into_iter().next().unwrap(); trace!( "gen_retval_area_setup: inst {:?}; ptr reg is {:?}", inst, @@ -891,24 +1003,30 @@ impl ABICallee for ABICalleeImpl { slot: StackSlot, offset: u32, ty: Type, - into_reg: Writable, - ) -> Self::I { + into_regs: ValueRegs>, + ) -> SmallInstVec { // Offset from beginning of stackslot area, which is at nominal SP (see // [MemArg::NominalSPOffset] for more details on nominal SP tracking). let stack_off = self.stackslots[slot.as_u32() as usize] as i64; let sp_off: i64 = stack_off + (offset as i64); trace!("load_stackslot: slot {} -> sp_off {}", slot, sp_off); - M::gen_load_stack(StackAMode::NominalSPOffset(sp_off, ty), into_reg, ty) + gen_load_stack_multi::(StackAMode::NominalSPOffset(sp_off, ty), into_regs, ty) } /// Store to a stackslot. - fn store_stackslot(&self, slot: StackSlot, offset: u32, ty: Type, from_reg: Reg) -> Self::I { + fn store_stackslot( + &self, + slot: StackSlot, + offset: u32, + ty: Type, + from_regs: ValueRegs, + ) -> SmallInstVec { // Offset from beginning of stackslot area, which is at nominal SP (see // [MemArg::NominalSPOffset] for more details on nominal SP tracking). let stack_off = self.stackslots[slot.as_u32() as usize] as i64; let sp_off: i64 = stack_off + (offset as i64); trace!("store_stackslot: slot {} -> sp_off {}", slot, sp_off); - M::gen_store_stack(StackAMode::NominalSPOffset(sp_off, ty), from_reg, ty) + gen_store_stack_multi::(StackAMode::NominalSPOffset(sp_off, ty), from_regs, ty) } /// Produce an instruction that computes a stackslot address. 
@@ -921,23 +1039,33 @@ impl ABICallee for ABICalleeImpl { } /// Load from a spillslot. - fn load_spillslot(&self, slot: SpillSlot, ty: Type, into_reg: Writable) -> Self::I { + fn load_spillslot( + &self, + slot: SpillSlot, + ty: Type, + into_regs: ValueRegs>, + ) -> SmallInstVec { // Offset from beginning of spillslot area, which is at nominal SP + stackslots_size. let islot = slot.get() as i64; let spill_off = islot * M::word_bytes() as i64; let sp_off = self.stackslots_size as i64 + spill_off; trace!("load_spillslot: slot {:?} -> sp_off {}", slot, sp_off); - M::gen_load_stack(StackAMode::NominalSPOffset(sp_off, ty), into_reg, ty) + gen_load_stack_multi::(StackAMode::NominalSPOffset(sp_off, ty), into_regs, ty) } /// Store to a spillslot. - fn store_spillslot(&self, slot: SpillSlot, ty: Type, from_reg: Reg) -> Self::I { + fn store_spillslot( + &self, + slot: SpillSlot, + ty: Type, + from_regs: ValueRegs, + ) -> SmallInstVec { // Offset from beginning of spillslot area, which is at nominal SP + stackslots_size. let islot = slot.get() as i64; let spill_off = islot * M::word_bytes() as i64; let sp_off = self.stackslots_size as i64 + spill_off; trace!("store_spillslot: slot {:?} -> sp_off {}", slot, sp_off); - M::gen_store_stack(StackAMode::NominalSPOffset(sp_off, ty), from_reg, ty) + gen_store_stack_multi::(StackAMode::NominalSPOffset(sp_off, ty), from_regs, ty) } fn spillslots_to_stack_map( @@ -970,8 +1098,8 @@ impl ABICallee for ABICalleeImpl { StackMap::from_slice(&bits[..]) } - fn gen_prologue(&mut self) -> Vec { - let mut insts = vec![]; + fn gen_prologue(&mut self) -> SmallInstVec { + let mut insts = smallvec![]; if !self.call_conv.extends_baldrdash() { // set up frame insts.extend(M::gen_prologue_frame_setup().into_iter()); @@ -994,7 +1122,7 @@ impl ABICallee for ABICalleeImpl { // specified, otherwise always insert the stack check. if total_stacksize > 0 || !self.is_leaf { if let Some((reg, stack_limit_load)) = &self.stack_limit { - insts.extend_from_slice(stack_limit_load); + insts.extend(stack_limit_load.clone()); self.insert_stack_check(*reg, total_stacksize, &mut insts); } if let Some(min_frame) = &self.probestack_min_frame { @@ -1037,8 +1165,8 @@ impl ABICallee for ABICalleeImpl { insts } - fn gen_epilogue(&self) -> Vec { - let mut insts = vec![]; + fn gen_epilogue(&self) -> SmallInstVec { + let mut insts = smallvec![]; // Restore clobbered registers. insts.extend(M::gen_clobber_restore( @@ -1079,7 +1207,10 @@ impl ABICallee for ABICalleeImpl { fn gen_spill(&self, to_slot: SpillSlot, from_reg: RealReg, ty: Option) -> Self::I { let ty = ty_from_ty_hint_or_reg_class::(from_reg.to_reg(), ty); - self.store_spillslot(to_slot, ty, from_reg.to_reg()) + self.store_spillslot(to_slot, ty, ValueRegs::one(from_reg.to_reg())) + .into_iter() + .next() + .unwrap() } fn gen_reload( @@ -1089,7 +1220,14 @@ impl ABICallee for ABICalleeImpl { ty: Option, ) -> Self::I { let ty = ty_from_ty_hint_or_reg_class::(to_reg.to_reg().to_reg(), ty); - self.load_spillslot(from_slot, ty, to_reg.map(|r| r.to_reg())) + self.load_spillslot( + from_slot, + ty, + writable_value_regs(ValueRegs::one(to_reg.to_reg().to_reg())), + ) + .into_iter() + .next() + .unwrap() } fn unwind_info_kind(&self) -> UnwindInfoKind { @@ -1110,7 +1248,7 @@ fn abisig_to_uses_and_defs(sig: &ABISig) -> (Vec, Vec uses.push(reg.to_reg()), + &ABIArg::Reg(regs, ..) 
=> uses.extend(regs.regs().iter().map(|r| r.to_reg())), _ => {} } } @@ -1119,7 +1257,9 @@ fn abisig_to_uses_and_defs(sig: &ABISig) -> (Vec, Vec defs.push(Writable::from_reg(reg.to_reg())), + &ABIArg::Reg(regs, ..) => { + defs.extend(regs.regs().iter().map(|r| Writable::from_reg(r.to_reg()))) + } _ => {} } } @@ -1238,18 +1378,19 @@ impl ABICaller for ABICallerImpl { adjust_stack_and_nominal_sp::(ctx, off as i32, /* is_sub = */ false) } - fn emit_copy_reg_to_arg>( + fn emit_copy_regs_to_arg>( &self, ctx: &mut C, idx: usize, - from_reg: Reg, + from_regs: ValueRegs, ) { let word_rc = M::word_reg_class(); let word_bits = M::word_bits() as usize; match &self.sig.args[idx] { - &ABIArg::Reg(reg, ty, ext, _) => { + &ABIArg::Reg(regs, ty, ext, _) => { let ext = M::get_ext_mode(self.sig.call_conv, ext); if ext != ir::ArgumentExtension::None && ty_bits(ty) < word_bits { + let reg = regs.only_reg().unwrap(); assert_eq!(word_rc, reg.get_class()); let signed = match ext { ir::ArgumentExtension::Uext => false, @@ -1258,18 +1399,27 @@ impl ABICaller for ABICallerImpl { }; ctx.emit(M::gen_extend( Writable::from_reg(reg.to_reg()), - from_reg, + from_regs.only_reg().unwrap(), signed, ty_bits(ty) as u8, word_bits as u8, )); } else { - ctx.emit(M::gen_move(Writable::from_reg(reg.to_reg()), from_reg, ty)); + for insn in gen_move_multi::( + writable_value_regs(regs.map(|r| r.to_reg())), + from_regs, + ty, + ) { + ctx.emit(insn); + } } } &ABIArg::Stack(off, mut ty, ext, _) => { let ext = M::get_ext_mode(self.sig.call_conv, ext); if ext != ir::ArgumentExtension::None && ty_bits(ty) < word_bits { + let from_reg = from_regs + .only_reg() + .expect("only one reg for sub-word value width"); assert_eq!(word_rc, from_reg.get_class()); let signed = match ext { ir::ArgumentExtension::Uext => false, @@ -1289,32 +1439,37 @@ impl ABICaller for ABICallerImpl { // Store the extended version. ty = M::word_type(); } - ctx.emit(M::gen_store_stack( - StackAMode::SPOffset(off, ty), - from_reg, - ty, - )); + for insn in gen_store_stack_multi::(StackAMode::SPOffset(off, ty), from_regs, ty) + { + ctx.emit(insn); + } } } } - fn emit_copy_retval_to_reg>( + fn emit_copy_retval_to_regs>( &self, ctx: &mut C, idx: usize, - into_reg: Writable, + into_regs: ValueRegs>, ) { match &self.sig.rets[idx] { // Extension mode doesn't matter because we're copying out, not in, // and we ignore high bits in our own registers by convention. 
- &ABIArg::Reg(reg, ty, _, _) => ctx.emit(M::gen_move(into_reg, reg.to_reg(), ty)), + &ABIArg::Reg(regs, ty, _, _) => { + for insn in gen_move_multi::(into_regs, regs.map(|r| r.to_reg()), ty) { + ctx.emit(insn); + } + } &ABIArg::Stack(off, ty, _, _) => { let ret_area_base = self.sig.stack_arg_space; - ctx.emit(M::gen_load_stack( + for insn in gen_load_stack_multi::( StackAMode::SPOffset(off + ret_area_base, ty), - into_reg, + into_regs, ty, - )); + ) { + ctx.emit(insn); + } } } } @@ -1324,19 +1479,18 @@ impl ABICaller for ABICallerImpl { mem::replace(&mut self.uses, Default::default()), mem::replace(&mut self.defs, Default::default()), ); - let word_rc = M::word_reg_class(); let word_type = M::word_type(); if let Some(i) = self.sig.stack_ret_arg { - let rd = ctx.alloc_tmp(word_rc, word_type); + let rd = ctx.alloc_tmp(word_type).only_reg().unwrap(); let ret_area_base = self.sig.stack_arg_space; ctx.emit(M::gen_get_stack_addr( StackAMode::SPOffset(ret_area_base, I8), rd, I8, )); - self.emit_copy_reg_to_arg(ctx, i, rd.to_reg()); + self.emit_copy_regs_to_arg(ctx, i, ValueRegs::one(rd.to_reg())); } - let tmp = ctx.alloc_tmp(word_rc, word_type); + let tmp = ctx.alloc_tmp(word_type).only_reg().unwrap(); for (is_safepoint, inst) in M::gen_call( &self.dest, uses, diff --git a/cranelift/codegen/src/machinst/helpers.rs b/cranelift/codegen/src/machinst/helpers.rs index 0138da7670..b61d9560dc 100644 --- a/cranelift/codegen/src/machinst/helpers.rs +++ b/cranelift/codegen/src/machinst/helpers.rs @@ -1,6 +1,6 @@ //! Miscellaneous helpers for machine backends. -use super::{InsnOutput, LowerCtx, VCodeInst}; +use super::{InsnOutput, LowerCtx, VCodeInst, ValueRegs}; use crate::ir::Type; use regalloc::{Reg, Writable}; @@ -23,6 +23,6 @@ pub(crate) fn ty_has_float_or_vec_representation(ty: Type) -> bool { pub(crate) fn get_output_reg>( ctx: &mut C, spec: InsnOutput, -) -> Writable { +) -> ValueRegs> { ctx.get_output(spec.insn, spec.output) } diff --git a/cranelift/codegen/src/machinst/lower.rs b/cranelift/codegen/src/machinst/lower.rs index 6a42f3a2b6..46b5fc1685 100644 --- a/cranelift/codegen/src/machinst/lower.rs +++ b/cranelift/codegen/src/machinst/lower.rs @@ -5,29 +5,27 @@ // TODO: separate the IR-query core of `LowerCtx` from the lowering logic built // on top of it, e.g. the side-effect/coloring analysis and the scan support. 
+use crate::data_value::DataValue; use crate::entity::SecondaryMap; use crate::fx::{FxHashMap, FxHashSet}; use crate::inst_predicates::{has_lowering_side_effect, is_constant_64bit}; use crate::ir::instructions::BranchInfo; -use crate::ir::types::I64; use crate::ir::{ ArgumentPurpose, Block, Constant, ConstantData, ExternalName, Function, GlobalValueData, Inst, InstructionData, MemFlags, Opcode, Signature, SourceLoc, Type, Value, ValueDef, }; use crate::machinst::{ - ABICallee, BlockIndex, BlockLoweringOrder, LoweredBlock, MachLabel, VCode, VCodeBuilder, - VCodeConstant, VCodeConstantData, VCodeConstants, VCodeInst, + writable_value_regs, ABICallee, BlockIndex, BlockLoweringOrder, LoweredBlock, MachLabel, VCode, + VCodeBuilder, VCodeConstant, VCodeConstantData, VCodeConstants, VCodeInst, ValueRegs, }; use crate::CodegenResult; - -use regalloc::{Reg, RegClass, StackmapRequestInfo, VirtualReg, Writable}; - -use crate::data_value::DataValue; use alloc::boxed::Box; use alloc::vec::Vec; use core::convert::TryInto; use log::debug; +use regalloc::{Reg, StackmapRequestInfo, Writable}; use smallvec::SmallVec; +use std::fmt::Debug; /// An "instruction color" partitions CLIF instructions by side-effecting ops. /// All instructions with the same "color" are guaranteed not to be separated by @@ -71,7 +69,7 @@ pub trait LowerCtx { /// instruction should lower into a sequence that fills this register. (Why /// not allow the backend to specify its own result register for the return? /// Because there may be multiple return points.) - fn retval(&self, idx: usize) -> Writable; + fn retval(&self, idx: usize) -> ValueRegs>; /// Returns the vreg containing the VmContext parameter, if there's one. fn get_vm_context(&self) -> Option; @@ -118,7 +116,7 @@ pub trait LowerCtx { /// /// The instruction input may be available in either of these forms. It may /// be available in neither form, if the conditions are not met; if so, use - /// `put_input_in_reg()` instead to get it in a register. + /// `put_input_in_regs()` instead to get it in a register. /// /// If the backend merges the effect of a side-effecting instruction, it /// must call `sink_inst()`. When this is called, it indicates that the @@ -126,29 +124,29 @@ pub trait LowerCtx { /// instruction's result(s) must have *no* uses remaining, because it will /// not be codegen'd (it has been integrated into the current instruction). fn get_input_as_source_or_const(&self, ir_inst: Inst, idx: usize) -> NonRegInput; - /// Put the `idx`th input into a register and return the assigned register. - fn put_input_in_reg(&mut self, ir_inst: Inst, idx: usize) -> Reg; - /// Get the `idx`th output register of the given IR instruction. When + /// Put the `idx`th input into register(s) and return the assigned register. + fn put_input_in_regs(&mut self, ir_inst: Inst, idx: usize) -> ValueRegs; + /// Get the `idx`th output register(s) of the given IR instruction. When /// `backend.lower_inst_to_regs(ctx, inst)` is called, it is expected that /// the backend will write results to these output register(s). This /// register will always be "fresh"; it is guaranteed not to overlap with /// any of the inputs, and can be freely used as a scratch register within /// the lowered instruction sequence, as long as its final value is the /// result of the computation. 
- fn get_output(&self, ir_inst: Inst, idx: usize) -> Writable; + fn get_output(&self, ir_inst: Inst, idx: usize) -> ValueRegs>; // Codegen primitives: allocate temps, emit instructions, set result registers, // ask for an input to be gen'd into a register. /// Get a new temp. - fn alloc_tmp(&mut self, rc: RegClass, ty: Type) -> Writable; + fn alloc_tmp(&mut self, ty: Type) -> ValueRegs>; /// Emit a machine instruction. fn emit(&mut self, mach_inst: Self::I); /// Emit a machine instruction that is a safepoint. fn emit_safepoint(&mut self, mach_inst: Self::I); /// Indicate that the side-effect of an instruction has been sunk to the /// current scan location. This should only be done with the instruction's - /// original results are not used (i.e., `put_input_in_reg` is not invoked + /// original results are not used (i.e., `put_input_in_regs` is not invoked /// for the input produced by the sunk instruction), otherwise the /// side-effect will occur twice. fn sink_inst(&mut self, ir_inst: Inst); @@ -234,10 +232,10 @@ pub struct Lower<'func, I: VCodeInst> { vcode: VCodeBuilder, /// Mapping from `Value` (SSA value in IR) to virtual register. - value_regs: SecondaryMap, + value_regs: SecondaryMap>, /// Return-value vregs. - retval_regs: Vec, + retval_regs: Vec>, /// Instruction colors at block exits. From this map, we can recover all /// instruction colors by scanning backward from the block end and @@ -306,20 +304,30 @@ pub enum RelocDistance { Far, } -fn alloc_vreg( - value_regs: &mut SecondaryMap, - regclass: RegClass, - value: Value, +fn alloc_vregs( + ty: Type, next_vreg: &mut u32, -) -> VirtualReg { - if value_regs[value].is_invalid() { - // default value in map. - let v = *next_vreg; - *next_vreg += 1; - value_regs[value] = Reg::new_virtual(regclass, v); - debug!("value {} gets vreg {:?}", value, v); + vcode: &mut VCodeBuilder, +) -> CodegenResult> { + let v = *next_vreg; + let (regclasses, tys) = I::rc_for_type(ty)?; + *next_vreg += regclasses.len() as u32; + let regs = match regclasses { + &[rc0] => ValueRegs::one(Reg::new_virtual(rc0, v)), + &[rc0, rc1] => ValueRegs::two(Reg::new_virtual(rc0, v), Reg::new_virtual(rc1, v + 1)), + #[cfg(feature = "arm32")] + &[rc0, rc1, rc2, rc3] => ValueRegs::four( + Reg::new_virtual(rc0, v), + Reg::new_virtual(rc1, v + 1), + Reg::new_virtual(rc2, v + 2), + Reg::new_virtual(rc3, v + 3), + ), + _ => panic!("Value must reside in 1, 2 or 4 registers"), + }; + for (®_ty, ®) in tys.iter().zip(regs.regs().iter()) { + vcode.set_vreg_type(reg.to_virtual_reg(), reg_ty); } - value_regs[value].as_virtual_reg().unwrap() + Ok(regs) } enum GenerateReturn { @@ -340,26 +348,29 @@ impl<'func, I: VCodeInst> Lower<'func, I> { let mut next_vreg: u32 = 0; - let mut value_regs = SecondaryMap::with_default(Reg::invalid()); + let mut value_regs = SecondaryMap::with_default(ValueRegs::invalid()); // Assign a vreg to each block param and each inst result. 
for bb in f.layout.blocks() { for ¶m in f.dfg.block_params(bb) { let ty = f.dfg.value_type(param); - let vreg = alloc_vreg(&mut value_regs, I::rc_for_type(ty)?, param, &mut next_vreg); - vcode.set_vreg_type(vreg, ty); - debug!("bb {} param {}: vreg {:?}", bb, param, vreg); + if value_regs[param].is_invalid() { + let regs = alloc_vregs(ty, &mut next_vreg, &mut vcode)?; + value_regs[param] = regs; + debug!("bb {} param {}: regs {:?}", bb, param, regs); + } } for inst in f.layout.block_insts(bb) { for &result in f.dfg.inst_results(inst) { let ty = f.dfg.value_type(result); - let vreg = - alloc_vreg(&mut value_regs, I::rc_for_type(ty)?, result, &mut next_vreg); - vcode.set_vreg_type(vreg, ty); - debug!( - "bb {} inst {} ({:?}): result vreg {:?}", - bb, inst, f.dfg[inst], vreg - ); + if value_regs[result].is_invalid() { + let regs = alloc_vregs(ty, &mut next_vreg, &mut vcode)?; + value_regs[result] = regs; + debug!( + "bb {} inst {} ({:?}): result regs {:?}", + bb, inst, f.dfg[inst], regs, + ); + } } } } @@ -370,18 +381,15 @@ impl<'func, I: VCodeInst> Lower<'func, I> { .map(|vm_context_index| { let entry_block = f.layout.entry_block().unwrap(); let param = f.dfg.block_params(entry_block)[vm_context_index]; - value_regs[param] + value_regs[param].only_reg().unwrap() }); - // Assign a vreg to each return value. + // Assign vreg(s) to each return value. let mut retval_regs = vec![]; for ret in &f.signature.returns { - let v = next_vreg; - next_vreg += 1; - let regclass = I::rc_for_type(ret.value_type)?; - let vreg = Reg::new_virtual(regclass, v); - retval_regs.push(vreg); - vcode.set_vreg_type(vreg.as_virtual_reg().unwrap(), ret.value_type); + let regs = alloc_vregs(ret.value_type, &mut next_vreg, &mut vcode)?; + retval_regs.push(regs); + debug!("retval gets regs {:?}", regs); } // Compute instruction colors, find constant instructions, and find instructions with @@ -453,9 +461,10 @@ impl<'func, I: VCodeInst> Lower<'func, I> { if !self.vcode.abi().arg_is_needed_in_body(i) { continue; } - let reg = Writable::from_reg(self.value_regs[*param]); - let insn = self.vcode.abi().gen_copy_arg_to_reg(i, reg); - self.emit(insn); + let regs = writable_value_regs(self.value_regs[*param]); + for insn in self.vcode.abi().gen_copy_arg_to_regs(i, regs).into_iter() { + self.emit(insn); + } } if let Some(insn) = self.vcode.abi().gen_retval_area_setup() { self.emit(insn); @@ -465,10 +474,14 @@ impl<'func, I: VCodeInst> Lower<'func, I> { fn gen_retval_setup(&mut self, gen_ret_inst: GenerateReturn) { let retval_regs = self.retval_regs.clone(); - for (i, reg) in retval_regs.into_iter().enumerate() { - let reg = Writable::from_reg(reg); - let insns = self.vcode.abi().gen_copy_reg_to_retval(i, reg); - for insn in insns { + for (i, regs) in retval_regs.into_iter().enumerate() { + let regs = writable_value_regs(regs); + for insn in self + .vcode + .abi() + .gen_copy_regs_to_retval(i, regs) + .into_iter() + { self.emit(insn); } } @@ -499,8 +512,8 @@ impl<'func, I: VCodeInst> Lower<'func, I> { // // * one for dsts whose sources are non-constants. 
- let mut const_bundles = SmallVec::<[(Type, Writable, u64); 16]>::new(); - let mut var_bundles = SmallVec::<[(Type, Writable, Reg); 16]>::new(); + let mut const_bundles: SmallVec<[_; 16]> = SmallVec::new(); + let mut var_bundles: SmallVec<[_; 16]> = SmallVec::new(); let mut i = 0; for (dst_val, src_val) in self @@ -514,7 +527,7 @@ impl<'func, I: VCodeInst> Lower<'func, I> { let ty = self.f.dfg.value_type(src_val); debug_assert!(ty == self.f.dfg.value_type(*dst_val)); - let dst_reg = self.value_regs[*dst_val]; + let dst_regs = self.value_regs[*dst_val]; let input = self.get_value_as_source_or_const(src_val); debug!("jump arg {} is {}", i, src_val); @@ -522,15 +535,15 @@ impl<'func, I: VCodeInst> Lower<'func, I> { if let Some(c) = input.constant { debug!(" -> constant {}", c); - const_bundles.push((ty, Writable::from_reg(dst_reg), c)); + const_bundles.push((ty, writable_value_regs(dst_regs), c)); } else { - let src_reg = self.put_value_in_reg(src_val); - debug!(" -> reg {:?}", src_reg); + let src_regs = self.put_value_in_regs(src_val); + debug!(" -> reg {:?}", src_regs); // Skip self-assignments. Not only are they pointless, they falsely trigger the // overlap-check below and hence can cause a lot of unnecessary copying through // temporaries. - if dst_reg != src_reg { - var_bundles.push((ty, Writable::from_reg(dst_reg), src_reg)); + if dst_regs != src_regs { + var_bundles.push((ty, writable_value_regs(dst_regs), src_regs)); } } } @@ -541,41 +554,69 @@ impl<'func, I: VCodeInst> Lower<'func, I> { // for cases of up to circa 16 args. Currently not possible because regalloc.rs // does not export it. let mut src_reg_set = FxHashSet::::default(); - for (_, _, src_reg) in &var_bundles { - src_reg_set.insert(*src_reg); + for (_, _, src_regs) in &var_bundles { + for ® in src_regs.regs() { + src_reg_set.insert(reg); + } } let mut overlaps = false; - for (_, dst_reg, _) in &var_bundles { - if src_reg_set.contains(&dst_reg.to_reg()) { - overlaps = true; - break; + 'outer: for (_, dst_regs, _) in &var_bundles { + for ® in dst_regs.regs() { + if src_reg_set.contains(®.to_reg()) { + overlaps = true; + break 'outer; + } } } // If, as is mostly the case, the source and destination register sets are non // overlapping, then we can copy directly, so as to save the register allocator work. if !overlaps { - for (ty, dst_reg, src_reg) in &var_bundles { - self.emit(I::gen_move(*dst_reg, *src_reg, *ty)); + for (ty, dst_regs, src_regs) in &var_bundles { + let (_, reg_tys) = I::rc_for_type(*ty)?; + for ((dst, src), reg_ty) in dst_regs + .regs() + .iter() + .zip(src_regs.regs().iter()) + .zip(reg_tys.iter()) + { + self.emit(I::gen_move(*dst, *src, *reg_ty)); + } } } else { // There's some overlap, so play safe and copy via temps. 
- let mut tmp_regs = SmallVec::<[Writable; 16]>::new(); + let mut tmp_regs = SmallVec::<[ValueRegs>; 16]>::new(); for (ty, _, _) in &var_bundles { - tmp_regs.push(self.alloc_tmp(I::rc_for_type(*ty)?, *ty)); + tmp_regs.push(self.alloc_tmp(*ty)); } for ((ty, _, src_reg), tmp_reg) in var_bundles.iter().zip(tmp_regs.iter()) { - self.emit(I::gen_move(*tmp_reg, *src_reg, *ty)); + let (_, reg_tys) = I::rc_for_type(*ty)?; + for ((tmp, src), reg_ty) in tmp_reg + .regs() + .iter() + .zip(src_reg.regs().iter()) + .zip(reg_tys.iter()) + { + self.emit(I::gen_move(*tmp, *src, *reg_ty)); + } } for ((ty, dst_reg, _), tmp_reg) in var_bundles.iter().zip(tmp_regs.iter()) { - self.emit(I::gen_move(*dst_reg, (*tmp_reg).to_reg(), *ty)); + let (_, reg_tys) = I::rc_for_type(*ty)?; + for ((dst, tmp), reg_ty) in dst_reg + .regs() + .iter() + .zip(tmp_reg.regs().iter()) + .zip(reg_tys.iter()) + { + self.emit(I::gen_move(*dst, tmp.to_reg(), *reg_ty)); + } } } // Now, finally, deal with the moves whose sources are constants. - for (ty, dst_reg, const_u64) in &const_bundles { - for inst in I::gen_constant(*dst_reg, *const_u64, *ty, |reg_class, ty| { - self.alloc_tmp(reg_class, ty) + for (ty, dst_reg, const_val) in &const_bundles { + for inst in I::gen_constant(*dst_reg, *const_val as u128, *ty, |ty| { + self.alloc_tmp(ty).only_reg().unwrap() }) .into_iter() { @@ -766,8 +807,8 @@ impl<'func, I: VCodeInst> Lower<'func, I> { debug!("about to lower function: {:?}", self.f); // Initialize the ABI object, giving it a temp if requested. - let maybe_tmp = if self.vcode.abi().temp_needed() { - Some(self.alloc_tmp(RegClass::I64, I64)) + let maybe_tmp = if let Some(temp_ty) = self.vcode.abi().temp_needed() { + Some(self.alloc_tmp(temp_ty).only_reg().unwrap()) } else { None }; @@ -848,11 +889,11 @@ impl<'func, I: VCodeInst> Lower<'func, I> { Ok((vcode, stack_map_info)) } - fn put_value_in_reg(&mut self, val: Value) -> Reg { - debug!("put_value_in_reg: val {}", val,); - let mut reg = self.value_regs[val]; - debug!(" -> reg {:?}", reg); - assert!(reg.is_valid()); + fn put_value_in_regs(&mut self, val: Value) -> ValueRegs { + debug!("put_value_in_reg: val {}", val); + let mut regs = self.value_regs[val]; + debug!(" -> regs {:?}", regs); + assert!(regs.is_valid()); self.value_lowered_uses[val] += 1; @@ -864,12 +905,12 @@ impl<'func, I: VCodeInst> Lower<'func, I> { if let ValueDef::Result(i, 0) = self.f.dfg.value_def(val) { if self.f.dfg[i].opcode() == Opcode::GetPinnedReg { if let Some(pr) = self.pinned_reg { - reg = pr; + regs = ValueRegs::one(pr); } } } - reg + regs } /// Get the actual inputs for a value. 
This is the implementation for @@ -944,8 +985,8 @@ impl<'func, I: VCodeInst> LowerCtx for Lower<'func, I> { self.vcode.abi() } - fn retval(&self, idx: usize) -> Writable { - Writable::from_reg(self.retval_regs[idx]) + fn retval(&self, idx: usize) -> ValueRegs> { + writable_value_regs(self.retval_regs[idx]) } fn get_vm_context(&self) -> Option { @@ -1050,23 +1091,19 @@ impl<'func, I: VCodeInst> LowerCtx for Lower<'func, I> { self.get_value_as_source_or_const(val) } - fn put_input_in_reg(&mut self, ir_inst: Inst, idx: usize) -> Reg { + fn put_input_in_regs(&mut self, ir_inst: Inst, idx: usize) -> ValueRegs { let val = self.f.dfg.inst_args(ir_inst)[idx]; let val = self.f.dfg.resolve_aliases(val); - self.put_value_in_reg(val) + self.put_value_in_regs(val) } - fn get_output(&self, ir_inst: Inst, idx: usize) -> Writable { + fn get_output(&self, ir_inst: Inst, idx: usize) -> ValueRegs> { let val = self.f.dfg.inst_results(ir_inst)[idx]; - Writable::from_reg(self.value_regs[val]) + writable_value_regs(self.value_regs[val]) } - fn alloc_tmp(&mut self, rc: RegClass, ty: Type) -> Writable { - let v = self.next_vreg; - self.next_vreg += 1; - let vreg = Reg::new_virtual(rc, v); - self.vcode.set_vreg_type(vreg.as_virtual_reg().unwrap(), ty); - Writable::from_reg(vreg) + fn alloc_tmp(&mut self, ty: Type) -> ValueRegs> { + writable_value_regs(alloc_vregs(ty, &mut self.next_vreg, &mut self.vcode).unwrap()) } fn emit(&mut self, mach_inst: I) { @@ -1131,8 +1168,7 @@ impl<'func, I: VCodeInst> LowerCtx for Lower<'func, I> { if reg.is_virtual() { reg } else { - let rc = reg.get_class(); - let new_reg = self.alloc_tmp(rc, ty); + let new_reg = self.alloc_tmp(ty).only_reg().unwrap(); self.emit(I::gen_move(new_reg, reg, ty)); new_reg.to_reg() } diff --git a/cranelift/codegen/src/machinst/mod.rs b/cranelift/codegen/src/machinst/mod.rs index 4b12f2fd1d..764531d54f 100644 --- a/cranelift/codegen/src/machinst/mod.rs +++ b/cranelift/codegen/src/machinst/mod.rs @@ -135,6 +135,8 @@ pub mod helpers; pub use helpers::*; pub mod inst_common; pub use inst_common::*; +pub mod valueregs; +pub use valueregs::*; /// A machine instruction. pub trait MachInst: Clone + Debug { @@ -165,9 +167,9 @@ pub trait MachInst: Clone + Debug { fn gen_move(to_reg: Writable, from_reg: Reg, ty: Type) -> Self; /// Generate a constant into a reg. - fn gen_constant Writable>( - to_reg: Writable, - value: u64, + fn gen_constant Writable>( + to_regs: ValueRegs>, + value: u128, ty: Type, alloc_tmp: F, ) -> SmallVec<[Self; 4]>; @@ -180,9 +182,19 @@ pub trait MachInst: Clone + Debug { /// (e.g., add directly from or directly to memory), like x86. fn maybe_direct_reload(&self, reg: VirtualReg, slot: SpillSlot) -> Option; - /// Determine a register class to store the given Cranelift type. - /// May return an error if the type isn't supported by this backend. - fn rc_for_type(ty: Type) -> CodegenResult; + /// Determine register class(es) to store the given Cranelift type, and the + /// Cranelift type actually stored in the underlying register(s). May return + /// an error if the type isn't supported by this backend. + /// + /// If the type requires multiple registers, then the list of registers is + /// returned in little-endian order. + /// + /// Note that the type actually stored in the register(s) may differ in the + /// case that a value is split across registers: for example, on a 32-bit + /// target, an I64 may be stored in two registers, each of which holds an + /// I32. 
The actually-stored types are used only to inform the backend when
+    /// generating spills and reloads for individual registers.
+    fn rc_for_type(ty: Type) -> CodegenResult<(&'static [RegClass], &'static [Type])>;
 
     /// Generate a jump to another target. Used during lowering of
     /// control flow.
diff --git a/cranelift/codegen/src/machinst/valueregs.rs b/cranelift/codegen/src/machinst/valueregs.rs
new file mode 100644
index 0000000000..1f9f0f05dd
--- /dev/null
+++ b/cranelift/codegen/src/machinst/valueregs.rs
@@ -0,0 +1,185 @@
+//! Data structure for tracking the (possibly multiple) registers that hold one
+//! SSA `Value`.
+
+use regalloc::{RealReg, Reg, VirtualReg, Writable};
+use std::fmt::Debug;
+
+#[cfg(feature = "arm32")]
+const VALUE_REGS_PARTS: usize = 4;
+
+#[cfg(not(feature = "arm32"))]
+const VALUE_REGS_PARTS: usize = 2;
+
+/// Location at which a `Value` is stored in register(s): the value is located
+/// in one or more registers, depending on its width. A value may be stored in
+/// more than one register if the machine has no registers wide enough
+/// otherwise: for example, on a 32-bit architecture, we may store `I64` values
+/// in two registers, and `I128` values in four.
+///
+/// By convention, the register parts are kept in machine-endian order here.
+///
+/// N.B.: we cap the capacity of this at four (when any 32-bit target is
+/// enabled) or two (otherwise), and we use special in-band sentinel `Reg`
+/// values (`Reg::invalid()`) to avoid the need to carry a separate length. This
+/// allows the struct to be `Copy` (no heap or drop overhead) and be only 16 or
+/// 8 bytes, which is important for compiler performance.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub struct ValueRegs<R: Clone + Copy + Debug + PartialEq + Eq + InvalidSentinel> {
+    parts: [R; VALUE_REGS_PARTS],
+}
+
+/// A type with an "invalid" sentinel value.
+pub trait InvalidSentinel: Copy + Eq {
+    /// The invalid sentinel value.
+    fn invalid_sentinel() -> Self;
+    /// Is this the invalid sentinel?
+    fn is_invalid_sentinel(self) -> bool {
+        self == Self::invalid_sentinel()
+    }
+}
+impl InvalidSentinel for Reg {
+    fn invalid_sentinel() -> Self {
+        Reg::invalid()
+    }
+}
+impl InvalidSentinel for VirtualReg {
+    fn invalid_sentinel() -> Self {
+        VirtualReg::invalid()
+    }
+}
+impl InvalidSentinel for RealReg {
+    fn invalid_sentinel() -> Self {
+        RealReg::invalid()
+    }
+}
+impl InvalidSentinel for Writable<Reg> {
+    fn invalid_sentinel() -> Self {
+        Writable::from_reg(Reg::invalid_sentinel())
+    }
+}
+
+impl<R: Clone + Copy + Debug + PartialEq + Eq + InvalidSentinel> ValueRegs<R> {
+    /// Create an invalid Value-in-Reg.
+    pub fn invalid() -> Self {
+        ValueRegs {
+            parts: [R::invalid_sentinel(); VALUE_REGS_PARTS],
+        }
+    }
+
+    /// Is this Value-to-Reg mapping valid?
+    pub fn is_valid(self) -> bool {
+        !self.parts[0].is_invalid_sentinel()
+    }
+    /// Is this Value-to-Reg mapping invalid?
+    pub fn is_invalid(self) -> bool {
+        self.parts[0].is_invalid_sentinel()
+    }
+
+    /// Return the single register used for this value, if any.
+    pub fn only_reg(self) -> Option<R> {
+        if self.len() == 1 {
+            Some(self.parts[0])
+        } else {
+            None
+        }
+    }
+
+    /// Return an iterator over the registers storing this value.
+    pub fn regs(&self) -> &[R] {
+        &self.parts[0..self.len()]
+    }
+}
+
+#[cfg(feature = "arm32")]
+impl<R: Clone + Copy + Debug + PartialEq + Eq + InvalidSentinel> ValueRegs<R> {
+    /// Create a Value-in-R location for a value stored in one register.
+    pub fn one(reg: R) -> Self {
+        ValueRegs {
+            parts: [
+                reg,
+                R::invalid_sentinel(),
+                R::invalid_sentinel(),
+                R::invalid_sentinel(),
+            ],
+        }
+    }
+    /// Create a Value-in-R location for a value stored in two registers.
+ pub fn two(r1: R, r2: R) -> Self { + ValueRegs { + parts: [r1, r2, R::invalid_sentinel(), R::invalid_sentinel()], + } + } + /// Create a Value-in-R location for a value stored in four registers. + pub fn four(r1: R, r2: R, r3: R, r4: R) -> Self { + ValueRegs { + parts: [r1, r2, r3, r4], + } + } + + /// Return the number of registers used. + pub fn len(self) -> usize { + // If rustc/LLVM is smart enough, this might even be vectorized... + (self.parts[0] != R::invalid_sentinel()) as usize + + (self.parts[1] != R::invalid_sentinel()) as usize + + (self.parts[2] != R::invalid_sentinel()) as usize + + (self.parts[3] != R::invalid_sentinel()) as usize + } + + /// Map individual registers via a map function. + pub fn map(self, f: F) -> ValueRegs + where + NewR: Clone + Copy + Debug + PartialEq + Eq + InvalidSentinel, + F: Fn(R) -> NewR, + { + ValueRegs { + parts: [ + f(self.parts[0]), + f(self.parts[1]), + f(self.parts[2]), + f(self.parts[3]), + ], + } + } +} + +#[cfg(not(feature = "arm32"))] +impl ValueRegs { + /// Create a Value-in-R location for a value stored in one register. + pub fn one(reg: R) -> Self { + ValueRegs { + parts: [reg, R::invalid_sentinel()], + } + } + /// Create a Value-in-R location for a value stored in two registers. + pub fn two(r1: R, r2: R) -> Self { + ValueRegs { parts: [r1, r2] } + } + + /// Return the number of registers used. + pub fn len(self) -> usize { + // If rustc/LLVM is smart enough, this might even be vectorized... + (self.parts[0] != R::invalid_sentinel()) as usize + + (self.parts[1] != R::invalid_sentinel()) as usize + } + + /// Map individual registers via a map function. + pub fn map(self, f: F) -> ValueRegs + where + NewR: Clone + Copy + Debug + PartialEq + Eq + InvalidSentinel, + F: Fn(R) -> NewR, + { + ValueRegs { + parts: [f(self.parts[0]), f(self.parts[1])], + } + } +} + +/// Create a writable ValueRegs. +pub(crate) fn writable_value_regs(regs: ValueRegs) -> ValueRegs> { + regs.map(|r| Writable::from_reg(r)) +} + +/// Strip a writable ValueRegs down to a readonly ValueRegs. 
+pub(crate) fn non_writable_value_regs(regs: ValueRegs>) -> ValueRegs { + regs.map(|r| r.to_reg()) +} From d4aaae3e863f4558ee965371dc87d9ce8a1190ff Mon Sep 17 00:00:00 2001 From: Pat Hickey Date: Wed, 6 Jan 2021 10:59:29 -0800 Subject: [PATCH 07/10] wasi-nn: remove missing_memory from wasmtime_integration --- crates/wasi-nn/src/lib.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/crates/wasi-nn/src/lib.rs b/crates/wasi-nn/src/lib.rs index e604ec768a..2e2c08b037 100644 --- a/crates/wasi-nn/src/lib.rs +++ b/crates/wasi-nn/src/lib.rs @@ -21,6 +21,4 @@ wasmtime_wiggle::wasmtime_integration!({ function_override: {} } }, - // Error to return when caller module is missing memory export: - missing_memory: { witx::types::Errno::MissingMemory }, }); From c0c4834c643ea8e90e5ad5e539db7b5ec60e9690 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=A9o=20Gaspard?= Date: Wed, 6 Jan 2021 23:48:18 +0100 Subject: [PATCH 08/10] wasi-nn: rebuild if the witx files change --- Cargo.lock | 1 + crates/wasi-nn/Cargo.toml | 3 +++ crates/wasi-nn/build.rs | 5 +++++ 3 files changed, 9 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index dc2ec4c24e..3df60a0186 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2727,6 +2727,7 @@ dependencies = [ "log", "openvino", "thiserror", + "walkdir", "wasmtime", "wasmtime-runtime", "wasmtime-wasi", diff --git a/crates/wasi-nn/Cargo.toml b/crates/wasi-nn/Cargo.toml index c05fd1d052..6811ded172 100644 --- a/crates/wasi-nn/Cargo.toml +++ b/crates/wasi-nn/Cargo.toml @@ -25,5 +25,8 @@ wiggle = { path = "../wiggle", version = "0.21.0" } openvino = "0.1.5" thiserror = "1.0" +[build-dependencies] +walkdir = "2.3" + [badges] maintenance = { status = "experimental" } diff --git a/crates/wasi-nn/build.rs b/crates/wasi-nn/build.rs index aeced29a0f..189b9513a6 100644 --- a/crates/wasi-nn/build.rs +++ b/crates/wasi-nn/build.rs @@ -7,4 +7,9 @@ fn main() { // This is necessary for Wiggle/Witx macros. let wasi_root = PathBuf::from("./spec").canonicalize().unwrap(); println!("cargo:rustc-env=WASI_ROOT={}", wasi_root.display()); + + // Also automatically rebuild if the Witx files change + for entry in walkdir::WalkDir::new(wasi_root) { + println!("cargo:rerun-if-changed={}", entry.unwrap().path().display()); + } } From 6354edc7bd9f23b716d10f180849dfa3bc6c2442 Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Thu, 7 Jan 2021 11:40:25 -0800 Subject: [PATCH 09/10] Document that the module linking proposal is implemented Forgot to do this earlier! --- docs/stability-wasm-proposals-support.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/stability-wasm-proposals-support.md b/docs/stability-wasm-proposals-support.md index d2c6d808a2..3080869774 100644 --- a/docs/stability-wasm-proposals-support.md +++ b/docs/stability-wasm-proposals-support.md @@ -22,6 +22,7 @@ vetted](./contributing-implementing-wasm-proposals.html). 
| **[Fixed-Width SIMD]** | **In progress.** | `--enable-simd` | [`wasm_simd`](https://docs.rs/wasmtime/*/wasmtime/struct.Config.html#method.wasm_simd) | | **[Threads and Atomics]** | **In progress.** | `--enable-threads` | [`wasm_threads`](https://docs.rs/wasmtime/*/wasmtime/struct.Config.html#method.wasm_threads) | | **[Multi-Memory]** | **Yes.** | `--enable-multi-memory`| [`wasm_multi_memory`](https://docs.rs/wasmtime/*/wasmtime/struct.Config.html#method.wasm_multi_memory) | +| **[Module Linking]** | **Yes.** | `--enable-module-linking` | [`wasm_module_linking`](https://docs.rs/wasmtime/*/wasmtime/struct.Config.html#method.wasm_module_linking) | [config]: https://docs.rs/wasmtime/*/wasmtime/struct.Config.html [Multi-Value]: https://github.com/WebAssembly/spec/blob/master/proposals/multi-value/Overview.md @@ -34,3 +35,4 @@ vetted](./contributing-implementing-wasm-proposals.html). [phases]: https://github.com/WebAssembly/meetings/blob/master/process/phases.md [Threads and Atomics]: https://github.com/WebAssembly/threads/blob/master/proposals/threads/Overview.md [Multi-Memory]: https://github.com/WebAssembly/multi-memory/blob/master/proposals/multi-memory/Overview.md +[Module Linking]: https://github.com/WebAssembly/module-linking/blob/master/proposals/module-linking/Explainer.md From 79aaeb5eda8f76707c8531225653a94fab35fbc6 Mon Sep 17 00:00:00 2001 From: Nick Fitzgerald Date: Thu, 7 Jan 2021 12:11:42 -0800 Subject: [PATCH 10/10] docs: Add `wasm-{smith,encoder}` crates to Wasm proposal checklist (#2554) --- docs/contributing-implementing-wasm-proposals.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/contributing-implementing-wasm-proposals.md b/docs/contributing-implementing-wasm-proposals.md index bce9db61db..ce42e7e8fb 100644 --- a/docs/contributing-implementing-wasm-proposals.md +++ b/docs/contributing-implementing-wasm-proposals.md @@ -20,6 +20,14 @@ multiple pull requests. [`wasmprinter`](https://github.com/bytecodealliance/wasm-tools/tree/main/crates/wasmprinter) crate. +* Add support to the + [`wasm-encoder`](https://github.com/bytecodealliance/wasm-tools/tree/main/crates/wasm-encoder) + crate. + +* Add support to the + [`wasm-smith`](https://github.com/bytecodealliance/wasm-tools/tree/main/crates/wasm-smith) + crate. + * Add a `wasmtime::Config::enable_foo_bar` method to the `wasmtime` crate.
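
The sentinel-based `ValueRegs` container added in `valueregs.rs` earlier in this series is small enough to illustrate on its own. What follows is a minimal standalone sketch of the same idea, not part of the patches above: the toy `Reg(u32)` type and the `INVALID` constant are illustrative stand-ins for the real `regalloc` register types, and the two-slot capacity mirrors the non-arm32 configuration. The point it demonstrates is how an in-band invalid sentinel lets the container stay `Copy` and fixed-size with no separate length field.

// Illustrative sketch only; `Reg` and `INVALID` are hypothetical stand-ins,
// not types from the patch series or the `regalloc` crate.

/// Toy register: just an index, with u32::MAX as the in-band invalid sentinel.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
struct Reg(u32);

const INVALID: Reg = Reg(u32::MAX);

/// Fixed-capacity, `Copy` container for the registers holding one SSA value.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
struct ValueRegs {
    parts: [Reg; 2],
}

impl ValueRegs {
    /// Value stored in a single register.
    fn one(r: Reg) -> Self {
        ValueRegs { parts: [r, INVALID] }
    }
    /// Value split across two registers (e.g., an I64 on a 32-bit target).
    fn two(r1: Reg, r2: Reg) -> Self {
        ValueRegs { parts: [r1, r2] }
    }
    /// Number of valid registers, recovered from the sentinel slots.
    fn len(self) -> usize {
        (self.parts[0] != INVALID) as usize + (self.parts[1] != INVALID) as usize
    }
    /// The single register used, if the value occupies exactly one.
    fn only_reg(self) -> Option<Reg> {
        if self.len() == 1 { Some(self.parts[0]) } else { None }
    }
}

fn main() {
    let narrow = ValueRegs::one(Reg(3));
    let wide = ValueRegs::two(Reg(4), Reg(5));
    assert_eq!(narrow.only_reg(), Some(Reg(3)));
    assert_eq!(wide.len(), 2);
    assert_eq!(wide.only_reg(), None);
    println!("narrow = {:?}, wide = {:?}", narrow, wide);
}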