From 71ead6e31d5736cb35cd2f5d979f5dd3b22b5470 Mon Sep 17 00:00:00 2001
From: Chris Fallin
Date: Sat, 12 Dec 2020 22:21:39 -0800
Subject: [PATCH] x64 backend: implement 128-bit ops and misc fixes.

This implements all of the I128 ops that are implemented by the legacy
x86 backend, and includes all that are required by at least one major
use case (the cg_clif rustc backend). The sequences are open-coded where
necessary; for e.g. the bit operations this can be somewhat complex, but
these sequences have been tested carefully.

This PR also includes a drive-by fix of clz/ctz for the 8- and 16-bit
cases, where they were previously incorrect.

It also includes ride-along fixes developed while bringing up cg_clif
support; they are difficult to separate cleanly because of other
refactors that occurred in this PR:

- Fix REX prefix logic for some 8-bit instructions. When using an 8-bit
  register in 64-bit mode on x86-64, the REX prefix semantics are
  somewhat subtle: without a REX prefix, register numbers 4--7 refer to
  the second-to-lowest byte of the first four registers (AH, CH, DH,
  BH), whereas with a REX prefix, these register numbers correspond to
  the usual encoding (SPL, BPL, SIL, DIL). We could always emit a REX
  byte for instructions with 8-bit forms (this is harmless even when
  unneeded), but that would unnecessarily inflate code size; instead,
  the usual approach is to emit it only for these registers. This logic
  was present for some instructions but missing for others: divide,
  not, negate, and shifts. Fixes #2508.

- Avoid unaligned SSE loads on some f64 ops. The implementations of
  several FP ops, such as fabs/fneg, use SSE instructions. This is not
  a problem per se, except that load-op merging did not take
  *alignment* into account: if an op on an f64 loaded from memory
  happened to merge that load, and the instruction it was merged into
  was an SSE instruction, then that SSE instruction imposes a stricter
  (128-bit) alignment requirement than the original load.f64 did. This
  PR simply forces any instruction lowerings that could use SSE
  instructions to implement non-SIMD operations to take their inputs in
  registers only, avoiding load-op merging. Fixes #2507.

- Two bugfixes exposed by cg_clif: urem/srem.i8 and select.b1.

  - urem/srem.i8: the 8-bit form of the DIV instruction on x86-64
    places the remainder in AH, not RDX, unlike all other width-forms
    of this instruction.

  - select.b1: we were not recognizing selects of boolean values as
    integer-typed operations, so we were generating XMM moves
    instead (!).
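
Note on the 128-bit multiply lowering: the new code in lower.rs splits
the product into dst_lo = lhs_lo * rhs_lo and dst_hi =
umulhi(lhs_lo, rhs_lo) + lhs_lo * rhs_hi + lhs_hi * rhs_lo. The
following is a minimal standalone Rust sketch (not part of the patch;
the names mul128, lhs_lo, etc. are purely illustrative) that checks
this decomposition against native 128-bit arithmetic, assuming only the
low 128 bits of the product are kept:

    // Rebuild a truncating 128-bit multiply from 64-bit halves,
    // mirroring the decomposition used by the lowering.
    fn mul128(lhs: u128, rhs: u128) -> u128 {
        let (lhs_lo, lhs_hi) = (lhs as u64, (lhs >> 64) as u64);
        let (rhs_lo, rhs_hi) = (rhs as u64, (rhs >> 64) as u64);
        let dst_lo = lhs_lo.wrapping_mul(rhs_lo);
        // umulhi(lhs_lo, rhs_lo): the upper 64 bits of the full
        // 64x64 -> 128 product of the low halves.
        let umulhi = ((lhs_lo as u128 * rhs_lo as u128) >> 64) as u64;
        let dst_hi = umulhi
            .wrapping_add(lhs_lo.wrapping_mul(rhs_hi))
            .wrapping_add(lhs_hi.wrapping_mul(rhs_lo));
        ((dst_hi as u128) << 64) | dst_lo as u128
    }

    fn main() {
        let a = 0x1234_5678_9abc_def0_0fed_cba9_8765_4321u128;
        let b = 0xdead_beef_cafe_f00d_1234_0000_ffff_aaaau128;
        assert_eq!(mul128(a, b), a.wrapping_mul(b));
        assert_eq!(mul128(u128::MAX, u128::MAX),
                   u128::MAX.wrapping_mul(u128::MAX));
    }

The cross terms lhs_lo * rhs_hi and lhs_hi * rhs_lo only need their low
64 bits because anything above bit 127 is discarded, which is why plain
64-bit imul is enough for them in the emitted sequence.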
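
Note on the 128-bit icmp lowering: emit_cmp combines a comparison of
the high halves with an *unsigned* comparison of the low halves that
only matters when the high halves are equal. A minimal Rust sketch of
that identity for the signed less-than case follows (a simplification
of the general condition-code handling in emit_cmp; i128_slt and the
test values are illustrative only):

    // Signed 128-bit less-than from 64-bit halves: the high half is
    // compared signed, the low half unsigned (it carries no sign).
    fn i128_slt(lhs: i128, rhs: i128) -> bool {
        let (lhs_lo, lhs_hi) = (lhs as u64, (lhs >> 64) as i64);
        let (rhs_lo, rhs_hi) = (rhs as u64, (rhs >> 64) as i64);
        lhs_hi < rhs_hi || (lhs_hi == rhs_hi && lhs_lo < rhs_lo)
    }

    fn main() {
        let vals = [i128::MIN, -1, 0, 1, i128::MAX, 1 << 70, -(1 << 70)];
        for &a in &vals {
            for &b in &vals {
                assert_eq!(i128_slt(a, b), a < b);
            }
        }
    }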
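
Note on the REX fix: the rule that the new always_emit_if_8bit_needed
helper applies is that hardware register encodings 4 through 7 name
AH/CH/DH/BH when no REX prefix is present and SPL/BPL/SIL/DIL when one
is, so a REX byte must be forced for exactly those encodings whenever
an 8-bit operand uses them. A minimal standalone sketch of that check
(not the patch's actual helper; needs_forced_rex_for_8bit is an
illustrative name):

    // Which hardware register encodings force a REX prefix for 8-bit
    // operands. Encodings 0-3 (AL/CL/DL/BL) need nothing special, and
    // encodings 8-15 (R8B-R15B) already carry a REX bit; only 4-7 must
    // have the prefix forced.
    fn needs_forced_rex_for_8bit(enc: u8) -> bool {
        (4..=7).contains(&enc)
    }

    fn main() {
        assert!(needs_forced_rex_for_8bit(6));   // %sil (or %dh without REX)
        assert!(!needs_forced_rex_for_8bit(0));  // %al
        assert!(!needs_forced_rex_for_8bit(15)); // %r15b (REX.B already set)
    }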
--- cranelift/codegen/src/isa/x64/abi.rs | 103 +- cranelift/codegen/src/isa/x64/inst/args.rs | 24 +- cranelift/codegen/src/isa/x64/inst/emit.rs | 94 +- .../codegen/src/isa/x64/inst/emit_tests.rs | 87 +- cranelift/codegen/src/isa/x64/inst/mod.rs | 182 +- cranelift/codegen/src/isa/x64/lower.rs | 2081 +++++++++++++---- .../filetests/isa/x64/bitops-i128-run.clif | 27 + .../filetests/isa/x64/bitrev-i128-run.clif | 47 + .../filetests/isa/x64/floating-point.clif | 26 + .../filetests/filetests/isa/x64/i128.clif | 1082 +++++++++ .../filetests/isa/x64/select-i128.clif | 29 + .../filetests/isa/x64/shift-i128-run.clif | 106 + 12 files changed, 3213 insertions(+), 675 deletions(-) create mode 100644 cranelift/filetests/filetests/isa/x64/bitops-i128-run.clif create mode 100644 cranelift/filetests/filetests/isa/x64/bitrev-i128-run.clif create mode 100644 cranelift/filetests/filetests/isa/x64/floating-point.clif create mode 100644 cranelift/filetests/filetests/isa/x64/i128.clif create mode 100644 cranelift/filetests/filetests/isa/x64/select-i128.clif create mode 100644 cranelift/filetests/filetests/isa/x64/shift-i128-run.clif diff --git a/cranelift/codegen/src/isa/x64/abi.rs b/cranelift/codegen/src/isa/x64/abi.rs index 74dca6c3ec..aa757392e3 100644 --- a/cranelift/codegen/src/isa/x64/abi.rs +++ b/cranelift/codegen/src/isa/x64/abi.rs @@ -138,42 +138,62 @@ impl ABIMachineSpec for X64ABIMachineSpec { ), } - let intreg = in_int_reg(param.value_type); - let vecreg = in_vec_reg(param.value_type); - debug_assert!(intreg || vecreg); - debug_assert!(!(intreg && vecreg)); - - let (next_reg, candidate) = if intreg { - let candidate = match args_or_rets { - ArgsOrRets::Args => get_intreg_for_arg_systemv(&call_conv, next_gpr), - ArgsOrRets::Rets => get_intreg_for_retval_systemv(&call_conv, next_gpr, i), - }; - debug_assert!(candidate - .map(|r| r.get_class() == RegClass::I64) - .unwrap_or(true)); - (&mut next_gpr, candidate) - } else { - let candidate = match args_or_rets { - ArgsOrRets::Args => get_fltreg_for_arg_systemv(&call_conv, next_vreg), - ArgsOrRets::Rets => get_fltreg_for_retval_systemv(&call_conv, next_vreg, i), - }; - debug_assert!(candidate - .map(|r| r.get_class() == RegClass::V128) - .unwrap_or(true)); - (&mut next_vreg, candidate) - }; - if let Some(param) = try_fill_baldrdash_reg(call_conv, param) { - assert!(intreg); ret.push(param); - } else if let Some(reg) = candidate { + continue; + } + + // Find regclass(es) of the register(s) used to store a value of this type. 
+ let (rcs, _) = Inst::rc_for_type(param.value_type)?; + let intreg = rcs[0] == RegClass::I64; + let num_regs = rcs.len(); + assert!(num_regs <= 2); + if num_regs == 2 { + assert_eq!(rcs[0], rcs[1]); + } + + let mut regs: SmallVec<[RealReg; 2]> = smallvec![]; + for j in 0..num_regs { + let nextreg = if intreg { + match args_or_rets { + ArgsOrRets::Args => get_intreg_for_arg_systemv(&call_conv, next_gpr + j), + ArgsOrRets::Rets => { + get_intreg_for_retval_systemv(&call_conv, next_gpr + j, i + j) + } + } + } else { + match args_or_rets { + ArgsOrRets::Args => get_fltreg_for_arg_systemv(&call_conv, next_vreg + j), + ArgsOrRets::Rets => { + get_fltreg_for_retval_systemv(&call_conv, next_vreg + j, i + j) + } + } + }; + if let Some(reg) = nextreg { + regs.push(reg.to_real_reg()); + } else { + regs.clear(); + break; + } + } + + if regs.len() > 0 { + let regs = match num_regs { + 1 => ValueRegs::one(regs[0]), + 2 => ValueRegs::two(regs[0], regs[1]), + _ => panic!("More than two registers unexpected"), + }; ret.push(ABIArg::Reg( - ValueRegs::one(reg.to_real_reg()), + regs, param.value_type, param.extension, param.purpose, )); - *next_reg += 1; + if intreg { + next_gpr += num_regs; + } else { + next_vreg += num_regs; + } } else { // Compute size. Every arg takes a minimum slot of 8 bytes. (16-byte // stack alignment happens separately after all args.) @@ -658,31 +678,6 @@ impl From for SyntheticAmode { } } -fn in_int_reg(ty: types::Type) -> bool { - match ty { - types::I8 - | types::I16 - | types::I32 - | types::I64 - | types::B1 - | types::B8 - | types::B16 - | types::B32 - | types::B64 - | types::R64 => true, - types::R32 => panic!("unexpected 32-bits refs on x64!"), - _ => false, - } -} - -fn in_vec_reg(ty: types::Type) -> bool { - match ty { - types::F32 | types::F64 => true, - _ if ty.is_vector() => true, - _ => false, - } -} - fn get_intreg_for_arg_systemv(call_conv: &CallConv, idx: usize) -> Option { match call_conv { CallConv::Fast diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs index 898134644f..39ca25d060 100644 --- a/cranelift/codegen/src/isa/x64/inst/args.rs +++ b/cranelift/codegen/src/isa/x64/inst/args.rs @@ -346,23 +346,35 @@ impl PrettyPrintSized for RegMem { #[derive(Copy, Clone, PartialEq)] pub enum AluRmiROpcode { Add, + Adc, Sub, + Sbb, And, Or, Xor, /// The signless, non-extending (N x N -> N, for N in {32,64}) variant. Mul, + /// 8-bit form of And. Handled separately as we don't have full 8-bit op + /// support (we just use wider instructions). Used only with some sequences + /// with SETcc. + And8, + /// 8-bit form of Or. + Or8, } impl fmt::Debug for AluRmiROpcode { fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { let name = match self { AluRmiROpcode::Add => "add", + AluRmiROpcode::Adc => "adc", AluRmiROpcode::Sub => "sub", + AluRmiROpcode::Sbb => "sbb", AluRmiROpcode::And => "and", AluRmiROpcode::Or => "or", AluRmiROpcode::Xor => "xor", AluRmiROpcode::Mul => "imul", + AluRmiROpcode::And8 => "and", + AluRmiROpcode::Or8 => "or", }; write!(fmt, "{}", name) } @@ -374,6 +386,16 @@ impl fmt::Display for AluRmiROpcode { } } +impl AluRmiROpcode { + /// Is this a special-cased 8-bit ALU op? + pub fn is_8bit(self) -> bool { + match self { + AluRmiROpcode::And8 | AluRmiROpcode::Or8 => true, + _ => false, + } + } +} + #[derive(Clone, PartialEq)] pub enum UnaryRmROpcode { /// Bit-scan reverse. 
@@ -1010,7 +1032,7 @@ impl fmt::Display for ExtMode { } /// These indicate the form of a scalar shift/rotate: left, signed right, unsigned right. -#[derive(Clone)] +#[derive(Clone, Copy)] pub enum ShiftKind { ShiftLeft, /// Inserts zeros in the most significant bits. diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index 580d469b8d..075724d493 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -83,6 +83,14 @@ impl RexFlags { self } + #[inline(always)] + fn always_emit_if_8bit_needed(&mut self, reg: u8) -> &mut Self { + if reg >= 4 && reg <= 7 { + self.always_emit(); + } + self + } + #[inline(always)] fn must_clear_w(&self) -> bool { (self.0 & 1) != 0 @@ -527,7 +535,7 @@ pub(crate) fn emit( src, dst: reg_g, } => { - let rex = if *is_64 { + let mut rex = if *is_64 { RexFlags::set_w() } else { RexFlags::clear_w() @@ -581,17 +589,26 @@ pub(crate) fn emit( } } } else { - let (opcode_r, opcode_m, subopcode_i) = match op { - AluRmiROpcode::Add => (0x01, 0x03, 0), - AluRmiROpcode::Sub => (0x29, 0x2B, 5), - AluRmiROpcode::And => (0x21, 0x23, 4), - AluRmiROpcode::Or => (0x09, 0x0B, 1), - AluRmiROpcode::Xor => (0x31, 0x33, 6), + let (opcode_r, opcode_m, subopcode_i, is_8bit) = match op { + AluRmiROpcode::Add => (0x01, 0x03, 0, false), + AluRmiROpcode::Adc => (0x11, 0x03, 0, false), + AluRmiROpcode::Sub => (0x29, 0x2B, 5, false), + AluRmiROpcode::Sbb => (0x19, 0x2B, 5, false), + AluRmiROpcode::And => (0x21, 0x23, 4, false), + AluRmiROpcode::Or => (0x09, 0x0B, 1, false), + AluRmiROpcode::Xor => (0x31, 0x33, 6, false), + AluRmiROpcode::And8 => (0x20, 0x22, 4, true), + AluRmiROpcode::Or8 => (0x08, 0x0A, 1, true), AluRmiROpcode::Mul => panic!("unreachable"), }; + assert!(!(is_8bit && *is_64)); match src { RegMemImm::Reg { reg: reg_e } => { + if is_8bit { + rex.always_emit_if_8bit_needed(int_reg_enc(*reg_e)); + rex.always_emit_if_8bit_needed(int_reg_enc(reg_g.to_reg())); + } // GCC/llvm use the swapped operand encoding (viz., the R/RM vs RM/R // duality). Do this too, so as to be able to compare generated machine // code easily. @@ -604,11 +621,12 @@ pub(crate) fn emit( reg_g.to_reg(), rex, ); - // NB: if this is ever extended to handle byte size ops, be sure to retain - // redundant REX prefixes. } RegMemImm::Mem { addr } => { + if is_8bit { + rex.always_emit_if_8bit_needed(int_reg_enc(reg_g.to_reg())); + } // Here we revert to the "normal" G-E ordering. let amode = addr.finalize(state, sink); emit_std_reg_mem( @@ -625,6 +643,7 @@ pub(crate) fn emit( } RegMemImm::Imm { simm32 } => { + assert!(!is_8bit); let use_imm8 = low8_will_sign_extend_to_32(*simm32); let opcode = if use_imm8 { 0x83 } else { 0x81 }; // And also here we use the "normal" G-E ordering. 
@@ -685,8 +704,13 @@ pub(crate) fn emit( } Inst::Not { size, src } => { + let src = int_reg_enc(src.to_reg()); let (opcode, prefix, rex_flags) = match size { - 1 => (0xF6, LegacyPrefixes::None, RexFlags::clear_w()), + 1 => ( + 0xF6, + LegacyPrefixes::None, + *RexFlags::clear_w().always_emit_if_8bit_needed(src), + ), 2 => (0xF7, LegacyPrefixes::_66, RexFlags::clear_w()), 4 => (0xF7, LegacyPrefixes::None, RexFlags::clear_w()), 8 => (0xF7, LegacyPrefixes::None, RexFlags::set_w()), @@ -694,13 +718,17 @@ pub(crate) fn emit( }; let subopcode = 2; - let src = int_reg_enc(src.to_reg()); emit_std_enc_enc(sink, prefix, opcode, 1, subopcode, src, rex_flags) } Inst::Neg { size, src } => { + let src = int_reg_enc(src.to_reg()); let (opcode, prefix, rex_flags) = match size { - 1 => (0xF6, LegacyPrefixes::None, RexFlags::clear_w()), + 1 => ( + 0xF6, + LegacyPrefixes::None, + *RexFlags::clear_w().always_emit_if_8bit_needed(src), + ), 2 => (0xF7, LegacyPrefixes::_66, RexFlags::clear_w()), 4 => (0xF7, LegacyPrefixes::None, RexFlags::clear_w()), 8 => (0xF7, LegacyPrefixes::None, RexFlags::set_w()), @@ -708,7 +736,6 @@ pub(crate) fn emit( }; let subopcode = 3; - let src = int_reg_enc(src.to_reg()); emit_std_enc_enc(sink, prefix, opcode, 1, subopcode, src, rex_flags) } @@ -717,7 +744,7 @@ pub(crate) fn emit( signed, divisor, } => { - let (opcode, prefix, rex_flags) = match size { + let (opcode, prefix, mut rex_flags) = match size { 1 => (0xF6, LegacyPrefixes::None, RexFlags::clear_w()), 2 => (0xF7, LegacyPrefixes::_66, RexFlags::clear_w()), 4 => (0xF7, LegacyPrefixes::None, RexFlags::clear_w()), @@ -732,6 +759,9 @@ pub(crate) fn emit( match divisor { RegMem::Reg { reg } => { let src = int_reg_enc(*reg); + if *size == 1 { + rex_flags.always_emit_if_8bit_needed(src); + } emit_std_enc_enc(sink, prefix, opcode, 1, subopcode, src, rex_flags) } RegMem::Mem { addr: src } => { @@ -987,9 +1017,7 @@ pub(crate) fn emit( ExtMode::BL | ExtMode::BQ => { // A redundant REX prefix must be emitted for certain register inputs. let enc_src = int_reg_enc(*src); - if enc_src >= 4 && enc_src <= 7 { - rex_flags.always_emit(); - }; + rex_flags.always_emit_if_8bit_needed(enc_src); } _ => {} } @@ -1084,9 +1112,7 @@ pub(crate) fn emit( ExtMode::BL | ExtMode::BQ => { // A redundant REX prefix must be emitted for certain register inputs. 
let enc_src = int_reg_enc(*src); - if enc_src >= 4 && enc_src <= 7 { - rex_flags.always_emit(); - }; + rex_flags.always_emit_if_8bit_needed(enc_src); } _ => {} } @@ -1130,9 +1156,7 @@ pub(crate) fn emit( let mut rex = RexFlags::clear_w(); let enc_src = int_reg_enc(*src); - if enc_src >= 4 && enc_src <= 7 { - rex.always_emit(); - }; + rex.always_emit_if_8bit_needed(enc_src); // MOV r8, r/m8 is (REX.W==0) 88 /r emit_std_reg_mem( @@ -1215,7 +1239,11 @@ pub(crate) fn emit( match num_bits { None => { let (opcode, prefix, rex_flags) = match size { - 1 => (0xD2, LegacyPrefixes::None, RexFlags::clear_w()), + 1 => ( + 0xD2, + LegacyPrefixes::None, + *RexFlags::clear_w().always_emit_if_8bit_needed(enc_dst), + ), 2 => (0xD3, LegacyPrefixes::_66, RexFlags::clear_w()), 4 => (0xD3, LegacyPrefixes::None, RexFlags::clear_w()), 8 => (0xD3, LegacyPrefixes::None, RexFlags::set_w()), @@ -1231,7 +1259,11 @@ pub(crate) fn emit( Some(num_bits) => { let (opcode, prefix, rex_flags) = match size { - 1 => (0xC0, LegacyPrefixes::None, RexFlags::clear_w()), + 1 => ( + 0xC0, + LegacyPrefixes::None, + *RexFlags::clear_w().always_emit_if_8bit_needed(enc_dst), + ), 2 => (0xC1, LegacyPrefixes::_66, RexFlags::clear_w()), 4 => (0xC1, LegacyPrefixes::None, RexFlags::clear_w()), 8 => (0xC1, LegacyPrefixes::None, RexFlags::set_w()), @@ -1330,9 +1362,7 @@ pub(crate) fn emit( let mut rex = RexFlags::clear_w(); // Here, a redundant REX prefix changes the meaning of the instruction. let enc_g = int_reg_enc(*reg_g); - if enc_g >= 4 && enc_g <= 7 { - rex.always_emit(); - } + rex.always_emit_if_8bit_needed(enc_g); rex } _ => panic!("x64::Inst::Cmp_RMI_R::emit: unreachable"), @@ -1343,9 +1373,7 @@ pub(crate) fn emit( if *size == 1 { // Check whether the E register forces the use of a redundant REX. 
let enc_e = int_reg_enc(*reg_e); - if enc_e >= 4 && enc_e <= 7 { - rex.always_emit(); - } + rex.always_emit_if_8bit_needed(enc_e); } // Use the swapped operands encoding for CMP, to stay consistent with the output of @@ -2761,9 +2789,7 @@ pub(crate) fn emit( types::I8 => { let mut rex_flags = RexFlags::clear_w(); let enc_src = int_reg_enc(*src); - if enc_src >= 4 && enc_src <= 7 { - rex_flags.always_emit(); - }; + rex_flags.always_emit_if_8bit_needed(enc_src); (LegacyPrefixes::_F0, rex_flags, 0x0FB0) } types::I16 => (LegacyPrefixes::_66F0, RexFlags::clear_w(), 0x0FB1), diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs index c3489089b9..42e38c9cd5 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs @@ -1025,6 +1025,56 @@ fn test_x64_emit() { "4C09FA", "orq %r15, %rdx", )); + insns.push(( + Inst::alu_rmi_r(false, AluRmiROpcode::And8, RegMemImm::reg(r15), w_rdx), + "4420FA", + "andb %r15b, %dl", + )); + insns.push(( + Inst::alu_rmi_r(false, AluRmiROpcode::And8, RegMemImm::reg(rax), w_rsi), + "4020C6", + "andb %al, %sil", + )); + insns.push(( + Inst::alu_rmi_r(false, AluRmiROpcode::And8, RegMemImm::reg(rax), w_rbx), + "20C3", + "andb %al, %bl", + )); + insns.push(( + Inst::alu_rmi_r( + false, + AluRmiROpcode::And8, + RegMemImm::mem(Amode::imm_reg(0, rax)), + w_rbx, + ), + "2218", + "andb 0(%rax), %bl", + )); + insns.push(( + Inst::alu_rmi_r(false, AluRmiROpcode::Or8, RegMemImm::reg(r15), w_rdx), + "4408FA", + "orb %r15b, %dl", + )); + insns.push(( + Inst::alu_rmi_r(false, AluRmiROpcode::Or8, RegMemImm::reg(rax), w_rsi), + "4008C6", + "orb %al, %sil", + )); + insns.push(( + Inst::alu_rmi_r(false, AluRmiROpcode::Or8, RegMemImm::reg(rax), w_rbx), + "08C3", + "orb %al, %bl", + )); + insns.push(( + Inst::alu_rmi_r( + false, + AluRmiROpcode::Or8, + RegMemImm::mem(Amode::imm_reg(0, rax)), + w_rbx, + ), + "0A18", + "orb 0(%rax), %bl", + )); insns.push(( Inst::alu_rmi_r(true, AluRmiROpcode::Xor, RegMemImm::reg(r15), w_rdx), "4C31FA", @@ -1193,6 +1243,16 @@ fn test_x64_emit() { "66F7D7", "notw %di", )); + insns.push(( + Inst::not(1, Writable::from_reg(regs::rdi())), + "40F6D7", + "notb %dil", + )); + insns.push(( + Inst::not(1, Writable::from_reg(regs::rax())), + "F6D0", + "notb %al", + )); // ======================================================== // Neg @@ -1216,6 +1276,16 @@ fn test_x64_emit() { "66F7DF", "negw %di", )); + insns.push(( + Inst::neg(1, Writable::from_reg(regs::rdi())), + "40F6DF", + "negb %dil", + )); + insns.push(( + Inst::neg(1, Writable::from_reg(regs::rax())), + "F6D8", + "negb %al", + )); // ======================================================== // Div @@ -1239,6 +1309,16 @@ fn test_x64_emit() { "48F7F7", "div %rdi", )); + insns.push(( + Inst::div(1, false, RegMem::reg(regs::rax())), + "F6F0", + "div %al", + )); + insns.push(( + Inst::div(1, false, RegMem::reg(regs::rsi())), + "40F6F6", + "div %sil", + )); // ======================================================== // MulHi @@ -2352,9 +2432,14 @@ fn test_x64_emit() { )); insns.push(( Inst::shift_r(1, ShiftKind::RotateRight, None, w_rsi), - "D2CE", + "40D2CE", "rorb %cl, %sil", )); + insns.push(( + Inst::shift_r(1, ShiftKind::RotateRight, None, w_rax), + "D2C8", + "rorb %cl, %al", + )); insns.push(( Inst::shift_r(1, ShiftKind::RotateRight, Some(5), w_r15), "41C0CF05", diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs index 09c469498d..979c264231 100644 
--- a/cranelift/codegen/src/isa/x64/inst/mod.rs +++ b/cranelift/codegen/src/isa/x64/inst/mod.rs @@ -1243,6 +1243,14 @@ impl PrettyPrint for Inst { (if is_64 { "q" } else { "l" }).to_string() } + fn suffix_lqb(is_64: bool, is_8: bool) -> String { + match (is_64, is_8) { + (_, true) => "b".to_string(), + (true, false) => "q".to_string(), + (false, false) => "l".to_string(), + } + } + fn size_lq(is_64: bool) -> u8 { if is_64 { 8 @@ -1251,6 +1259,16 @@ impl PrettyPrint for Inst { } } + fn size_lqb(is_64: bool, is_8: bool) -> u8 { + if is_8 { + 1 + } else if is_64 { + 8 + } else { + 4 + } + } + fn suffix_bwlq(size: u8) -> String { match size { 1 => "b".to_string(), @@ -1271,9 +1289,9 @@ impl PrettyPrint for Inst { dst, } => format!( "{} {}, {}", - ljustify2(op.to_string(), suffix_lq(*is_64)), - src.show_rru_sized(mb_rru, size_lq(*is_64)), - show_ireg_sized(dst.to_reg(), mb_rru, size_lq(*is_64)), + ljustify2(op.to_string(), suffix_lqb(*is_64, op.is_8bit())), + src.show_rru_sized(mb_rru, size_lqb(*is_64, op.is_8bit())), + show_ireg_sized(dst.to_reg(), mb_rru, size_lqb(*is_64, op.is_8bit())), ), Inst::UnaryRmR { src, dst, op, size } => format!( @@ -2065,6 +2083,17 @@ impl Amode { } } } + + /// Offset the amode by a fixed offset. + pub(crate) fn offset(&self, offset: u32) -> Self { + let mut ret = self.clone(); + match &mut ret { + &mut Amode::ImmReg { ref mut simm32, .. } => *simm32 += offset, + &mut Amode::ImmRegRegShift { ref mut simm32, .. } => *simm32 += offset, + _ => panic!("Cannot offset amode: {:?}", self), + } + ret + } } impl RegMemImm { @@ -2548,77 +2577,88 @@ impl MachInst for Inst { ty: Type, mut alloc_tmp: F, ) -> SmallVec<[Self; 4]> { - // We don't support 128-bit constants. - assert!(value <= u64::MAX as u128); let mut ret = SmallVec::new(); - let to_reg = to_regs - .only_reg() - .expect("multi-reg values not supported on x64"); - if ty == types::F32 { - if value == 0 { - ret.push(Inst::xmm_rm_r( - SseOpcode::Xorps, - RegMem::reg(to_reg.to_reg()), - to_reg, - )); - } else { - let tmp = alloc_tmp(types::I32); - ret.push(Inst::imm(OperandSize::Size32, value as u64, tmp)); - - ret.push(Inst::gpr_to_xmm( - SseOpcode::Movd, - RegMem::reg(tmp.to_reg()), - OperandSize::Size32, - to_reg, - )); - } - } else if ty == types::F64 { - if value == 0 { - ret.push(Inst::xmm_rm_r( - SseOpcode::Xorpd, - RegMem::reg(to_reg.to_reg()), - to_reg, - )); - } else { - let tmp = alloc_tmp(types::I64); - ret.push(Inst::imm(OperandSize::Size64, value as u64, tmp)); - - ret.push(Inst::gpr_to_xmm( - SseOpcode::Movq, - RegMem::reg(tmp.to_reg()), - OperandSize::Size64, - to_reg, - )); - } + if ty == types::I128 { + ret.push(Inst::imm( + OperandSize::Size64, + value as u64, + to_regs.regs()[0], + )); + ret.push(Inst::imm( + OperandSize::Size64, + (value >> 64) as u64, + to_regs.regs()[1], + )); } else { - // Must be an integer type. 
- debug_assert!( - ty == types::B1 - || ty == types::I8 - || ty == types::B8 - || ty == types::I16 - || ty == types::B16 - || ty == types::I32 - || ty == types::B32 - || ty == types::I64 - || ty == types::B64 - || ty == types::R32 - || ty == types::R64 - ); - if value == 0 { - ret.push(Inst::alu_rmi_r( - ty == types::I64, - AluRmiROpcode::Xor, - RegMemImm::reg(to_reg.to_reg()), - to_reg, - )); + let to_reg = to_regs + .only_reg() + .expect("multi-reg values not supported on x64"); + if ty == types::F32 { + if value == 0 { + ret.push(Inst::xmm_rm_r( + SseOpcode::Xorps, + RegMem::reg(to_reg.to_reg()), + to_reg, + )); + } else { + let tmp = alloc_tmp(types::I32); + ret.push(Inst::imm(OperandSize::Size32, value as u64, tmp)); + + ret.push(Inst::gpr_to_xmm( + SseOpcode::Movd, + RegMem::reg(tmp.to_reg()), + OperandSize::Size32, + to_reg, + )); + } + } else if ty == types::F64 { + if value == 0 { + ret.push(Inst::xmm_rm_r( + SseOpcode::Xorpd, + RegMem::reg(to_reg.to_reg()), + to_reg, + )); + } else { + let tmp = alloc_tmp(types::I64); + ret.push(Inst::imm(OperandSize::Size64, value as u64, tmp)); + + ret.push(Inst::gpr_to_xmm( + SseOpcode::Movq, + RegMem::reg(tmp.to_reg()), + OperandSize::Size64, + to_reg, + )); + } } else { - let value = value as u64; - ret.push(Inst::imm( - OperandSize::from_bytes(ty.bytes()), - value.into(), - to_reg, - )); + // Must be an integer type. + debug_assert!( + ty == types::B1 + || ty == types::I8 + || ty == types::B8 + || ty == types::I16 + || ty == types::B16 + || ty == types::I32 + || ty == types::B32 + || ty == types::I64 + || ty == types::B64 + || ty == types::R32 + || ty == types::R64 + ); + if value == 0 { + ret.push(Inst::alu_rmi_r( + ty == types::I64, + AluRmiROpcode::Xor, + RegMemImm::reg(to_reg.to_reg()), + to_reg, + )); + } else { + let value = value as u64; + ret.push(Inst::imm( + OperandSize::from_bytes(ty.bytes()), + value.into(), + to_reg, + )); + } } } ret diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index 9293221de5..a25da666b3 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -18,7 +18,7 @@ use alloc::vec::Vec; use cranelift_codegen_shared::condcodes::CondCode; use log::trace; use regalloc::{Reg, RegClass, Writable}; -use smallvec::SmallVec; +use smallvec::{smallvec, SmallVec}; use std::convert::TryFrom; use target_lexicon::Triple; @@ -28,6 +28,7 @@ use target_lexicon::Triple; fn is_int_or_ref_ty(ty: Type) -> bool { match ty { types::I8 | types::I16 | types::I32 | types::I64 | types::R64 => true, + types::B1 | types::B8 | types::B16 | types::B32 | types::B64 => true, types::R32 => panic!("shouldn't have 32-bits refs on x64"), _ => false, } @@ -107,23 +108,26 @@ fn generate_constant>(ctx: &mut C, ty: Type, c: u64) -> Va non_writable_value_regs(cst_copy) } -/// Put the given input into a register, and mark it as used (side-effect). -fn put_input_in_reg>(ctx: &mut C, spec: InsnInput) -> Reg { +/// Put the given input into possibly multiple registers, and mark it as used (side-effect). +fn put_input_in_regs>(ctx: &mut C, spec: InsnInput) -> ValueRegs { let ty = ctx.input_ty(spec.insn, spec.input); let input = ctx.get_input_as_source_or_const(spec.insn, spec.input); if let Some(c) = input.constant { // Generate constants fresh at each use to minimize long-range register pressure. 
generate_constant(ctx, ty, c) - .only_reg() - .expect("multi-reg values not supported yet") } else { ctx.put_input_in_regs(spec.insn, spec.input) - .only_reg() - .expect("multi-reg values not supported yet") } } +/// Put the given input into a register, and mark it as used (side-effect). +fn put_input_in_reg>(ctx: &mut C, spec: InsnInput) -> Reg { + put_input_in_regs(ctx, spec) + .only_reg() + .expect("Multi-register value not expected") +} + /// Determines whether a load operation (indicated by `src_insn`) can be merged /// into the current lowering point. If so, returns the address-base source (as /// an `InsnInput`) and an offset from that address from which to perform the @@ -373,25 +377,120 @@ fn emit_extract_lane>( /// /// Note: make sure that there are no instructions modifying the flags between a call to this /// function and the use of the flags! -fn emit_cmp>(ctx: &mut C, insn: IRInst) { +/// +/// Takes the condition code that will be tested, and returns +/// the condition code that should be used. This allows us to +/// synthesize comparisons out of multiple instructions for +/// special cases (e.g., 128-bit integers). +fn emit_cmp>(ctx: &mut C, insn: IRInst, cc: IntCC) -> IntCC { let ty = ctx.input_ty(insn, 0); let inputs = [InsnInput { insn, input: 0 }, InsnInput { insn, input: 1 }]; - // TODO Try to commute the operands (and invert the condition) if one is an immediate. - let lhs = put_input_in_reg(ctx, inputs[0]); - // We force the RHS into a register, and disallow load-op fusion, because we - // do not have a transitive guarantee that this cmp-site will be the sole - // user of the value. Consider: the icmp might be the only user of a load, - // but there may be multiple users of the icmp (e.g. select or bint - // instructions) that each invoke `emit_cmp()`. If we were to allow a load - // to sink to the *latest* one, but other sites did not permit sinking, then - // we would be missing the load for other cmp-sites. - let rhs = put_input_in_reg(ctx, inputs[1]); + if ty == types::I128 { + // We need to compare both halves and combine the results appropriately. 
+ let cmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let cmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let lhs = put_input_in_regs(ctx, inputs[0]); + let lhs_lo = lhs.regs()[0]; + let lhs_hi = lhs.regs()[1]; + let rhs = put_input_in_regs(ctx, inputs[1]); + let rhs_lo = RegMemImm::reg(rhs.regs()[0]); + let rhs_hi = RegMemImm::reg(rhs.regs()[1]); + match cc { + IntCC::Equal => { + ctx.emit(Inst::cmp_rmi_r(8, rhs_hi, lhs_hi)); + ctx.emit(Inst::setcc(CC::Z, cmp1)); + ctx.emit(Inst::cmp_rmi_r(8, rhs_lo, lhs_lo)); + ctx.emit(Inst::setcc(CC::Z, cmp2)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::reg(cmp1.to_reg()), + cmp2, + )); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::imm(1), + cmp2, + )); + IntCC::NotEqual + } + IntCC::NotEqual => { + ctx.emit(Inst::cmp_rmi_r(8, rhs_hi, lhs_hi)); + ctx.emit(Inst::setcc(CC::NZ, cmp1)); + ctx.emit(Inst::cmp_rmi_r(8, rhs_lo, lhs_lo)); + ctx.emit(Inst::setcc(CC::NZ, cmp2)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Or, + RegMemImm::reg(cmp1.to_reg()), + cmp2, + )); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::imm(1), + cmp2, + )); + IntCC::NotEqual + } + IntCC::SignedLessThan + | IntCC::SignedLessThanOrEqual + | IntCC::SignedGreaterThan + | IntCC::SignedGreaterThanOrEqual + | IntCC::UnsignedLessThan + | IntCC::UnsignedLessThanOrEqual + | IntCC::UnsignedGreaterThan + | IntCC::UnsignedGreaterThanOrEqual => { + // Result = (lhs_hi <> rhs_hi) || + // (lhs_hi == rhs_hi && lhs_lo <> rhs_lo) + let cmp3 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + ctx.emit(Inst::cmp_rmi_r(8, rhs_hi, lhs_hi)); + ctx.emit(Inst::setcc(CC::from_intcc(cc.without_equal()), cmp1)); + ctx.emit(Inst::setcc(CC::Z, cmp2)); + ctx.emit(Inst::cmp_rmi_r(8, rhs_lo, lhs_lo)); + ctx.emit(Inst::setcc(CC::from_intcc(cc.unsigned()), cmp3)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::reg(cmp2.to_reg()), + cmp3, + )); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Or, + RegMemImm::reg(cmp1.to_reg()), + cmp3, + )); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::imm(1), + cmp3, + )); + IntCC::NotEqual + } + _ => panic!("Unhandled IntCC in I128 comparison: {:?}", cc), + } + } else { + // TODO Try to commute the operands (and invert the condition) if one is an immediate. + let lhs = put_input_in_reg(ctx, inputs[0]); + // We force the RHS into a register, and disallow load-op fusion, because we + // do not have a transitive guarantee that this cmp-site will be the sole + // user of the value. Consider: the icmp might be the only user of a load, + // but there may be multiple users of the icmp (e.g. select or bint + // instructions) that each invoke `emit_cmp()`. If we were to allow a load + // to sink to the *latest* one, but other sites did not permit sinking, then + // we would be missing the load for other cmp-sites. + let rhs = put_input_in_reg(ctx, inputs[1]); - // Cranelift's icmp semantics want to compare lhs - rhs, while Intel gives - // us dst - src at the machine instruction level, so invert operands. - ctx.emit(Inst::cmp_rmi_r(ty.bytes() as u8, RegMemImm::reg(rhs), lhs)); + // Cranelift's icmp semantics want to compare lhs - rhs, while Intel gives + // us dst - src at the machine instruction level, so invert operands. + ctx.emit(Inst::cmp_rmi_r(ty.bytes() as u8, RegMemImm::reg(rhs), lhs)); + cc + } } /// A specification for a fcmp emission. 
@@ -489,6 +588,458 @@ fn emit_fcmp>( cond_result } +fn emit_bitrev>(ctx: &mut C, src: Reg, dst: Writable, ty: Type) { + let bits = ty.bits(); + let const_mask = if bits == 64 { + 0xffff_ffff_ffff_ffff + } else { + (1u64 << bits) - 1 + }; + let tmp0 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + + ctx.emit(Inst::gen_move(tmp0, src, types::I64)); + + // Swap 1-bit units. + // tmp1 = src + ctx.emit(Inst::gen_move(tmp1, tmp0.to_reg(), types::I64)); + // tmp2 = 0b0101.. + ctx.emit(Inst::imm( + OperandSize::Size64, + 0x5555_5555_5555_5555 & const_mask, + tmp2, + )); + // tmp1 = src >> 1 + ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightLogical, + Some(1), + tmp1, + )); + // tmp1 = (src >> 1) & 0b0101.. + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::reg(tmp2.to_reg()), + tmp1, + )); + // tmp2 = src & 0b0101.. + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::reg(tmp0.to_reg()), + tmp2, + )); + // tmp2 = (src & 0b0101..) << 1 + ctx.emit(Inst::shift_r(8, ShiftKind::ShiftLeft, Some(1), tmp2)); + // tmp0 = (src >> 1) & 0b0101.. | (src & 0b0101..) << 1 + ctx.emit(Inst::gen_move(tmp0, tmp2.to_reg(), types::I64)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Or, + RegMemImm::reg(tmp1.to_reg()), + tmp0, + )); + + // Swap 2-bit units. + ctx.emit(Inst::gen_move(tmp1, tmp0.to_reg(), types::I64)); + ctx.emit(Inst::imm( + OperandSize::Size64, + 0x3333_3333_3333_3333 & const_mask, + tmp2, + )); + ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightLogical, + Some(2), + tmp1, + )); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::reg(tmp2.to_reg()), + tmp1, + )); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::reg(tmp0.to_reg()), + tmp2, + )); + ctx.emit(Inst::shift_r(8, ShiftKind::ShiftLeft, Some(2), tmp2)); + ctx.emit(Inst::gen_move(tmp0, tmp2.to_reg(), types::I64)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Or, + RegMemImm::reg(tmp1.to_reg()), + tmp0, + )); + + // Swap 4-bit units. + ctx.emit(Inst::gen_move(tmp1, tmp0.to_reg(), types::I64)); + ctx.emit(Inst::imm( + OperandSize::Size64, + 0x0f0f_0f0f_0f0f_0f0f & const_mask, + tmp2, + )); + ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightLogical, + Some(4), + tmp1, + )); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::reg(tmp2.to_reg()), + tmp1, + )); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::reg(tmp0.to_reg()), + tmp2, + )); + ctx.emit(Inst::shift_r(8, ShiftKind::ShiftLeft, Some(4), tmp2)); + ctx.emit(Inst::gen_move(tmp0, tmp2.to_reg(), types::I64)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Or, + RegMemImm::reg(tmp1.to_reg()), + tmp0, + )); + + if bits > 8 { + // Swap 8-bit units. 
+ ctx.emit(Inst::gen_move(tmp1, tmp0.to_reg(), types::I64)); + ctx.emit(Inst::imm( + OperandSize::Size64, + 0x00ff_00ff_00ff_00ff & const_mask, + tmp2, + )); + ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightLogical, + Some(8), + tmp1, + )); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::reg(tmp2.to_reg()), + tmp1, + )); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::reg(tmp0.to_reg()), + tmp2, + )); + ctx.emit(Inst::shift_r(8, ShiftKind::ShiftLeft, Some(8), tmp2)); + ctx.emit(Inst::gen_move(tmp0, tmp2.to_reg(), types::I64)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Or, + RegMemImm::reg(tmp1.to_reg()), + tmp0, + )); + } + + if bits > 16 { + // Swap 16-bit units. + ctx.emit(Inst::gen_move(tmp1, tmp0.to_reg(), types::I64)); + ctx.emit(Inst::imm( + OperandSize::Size64, + 0x0000_ffff_0000_ffff & const_mask, + tmp2, + )); + ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightLogical, + Some(16), + tmp1, + )); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::reg(tmp2.to_reg()), + tmp1, + )); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::reg(tmp0.to_reg()), + tmp2, + )); + ctx.emit(Inst::shift_r(8, ShiftKind::ShiftLeft, Some(16), tmp2)); + ctx.emit(Inst::gen_move(tmp0, tmp2.to_reg(), types::I64)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Or, + RegMemImm::reg(tmp1.to_reg()), + tmp0, + )); + } + + if bits > 32 { + // Swap 32-bit units. + ctx.emit(Inst::gen_move(tmp1, tmp0.to_reg(), types::I64)); + ctx.emit(Inst::imm( + OperandSize::Size64, + 0x0000_0000_ffff_ffff & const_mask, + tmp2, + )); + ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightLogical, + Some(32), + tmp1, + )); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::reg(tmp2.to_reg()), + tmp1, + )); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::reg(tmp0.to_reg()), + tmp2, + )); + ctx.emit(Inst::shift_r(8, ShiftKind::ShiftLeft, Some(32), tmp2)); + ctx.emit(Inst::gen_move(tmp0, tmp2.to_reg(), types::I64)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Or, + RegMemImm::reg(tmp1.to_reg()), + tmp0, + )); + } + + ctx.emit(Inst::gen_move(dst, tmp0.to_reg(), types::I64)); +} + +fn emit_shl_i128>( + ctx: &mut C, + src: ValueRegs, + dst: ValueRegs>, + amt_src: Reg, +) { + let src_lo = src.regs()[0]; + let src_hi = src.regs()[1]; + let dst_lo = dst.regs()[0]; + let dst_hi = dst.regs()[1]; + + // mov tmp1, src_lo + // shl tmp1, amt_src + // mov tmp2, src_hi + // shl tmp2, amt_src + // mov amt, 64 + // sub amt, amt_src + // mov tmp3, src_lo + // shr tmp3, amt + // or tmp3, tmp2 + // xor dst_lo, dst_lo + // mov amt, amt_src + // and amt, 64 + // cmovz dst_hi, tmp3 + // cmovz dst_lo, tmp1 + // cmovnz dst_hi, tmp1 + + let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let tmp3 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let amt = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + + ctx.emit(Inst::gen_move(tmp1, src_lo, types::I64)); + ctx.emit(Inst::gen_move( + Writable::from_reg(regs::rcx()), + amt_src, + types::I64, + )); + ctx.emit(Inst::shift_r(8, ShiftKind::ShiftLeft, None, tmp1)); + + ctx.emit(Inst::gen_move(tmp2, src_hi, types::I64)); + ctx.emit(Inst::gen_move( + Writable::from_reg(regs::rcx()), + amt_src, + types::I64, + )); + ctx.emit(Inst::shift_r(8, ShiftKind::ShiftLeft, None, tmp2)); + + ctx.emit(Inst::imm(OperandSize::Size64, 64, amt)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Sub, + 
RegMemImm::reg(amt_src), + amt, + )); + + ctx.emit(Inst::gen_move(tmp3, src_lo, types::I64)); + ctx.emit(Inst::gen_move( + Writable::from_reg(regs::rcx()), + amt.to_reg(), + types::I64, + )); + ctx.emit(Inst::shift_r(8, ShiftKind::ShiftRightLogical, None, tmp3)); + + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Or, + RegMemImm::reg(tmp2.to_reg()), + tmp3, + )); + + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Xor, + RegMemImm::reg(dst_lo.to_reg()), + dst_lo, + )); + // This isn't semantically necessary, but it keeps the + // register allocator happy, because it cannot otherwise + // infer that cmovz + cmovnz always defines dst_hi. + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Xor, + RegMemImm::reg(dst_hi.to_reg()), + dst_hi, + )); + + ctx.emit(Inst::gen_move(amt, amt_src, types::I64)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::imm(64), + amt, + )); + ctx.emit(Inst::cmove(8, CC::Z, RegMem::reg(tmp3.to_reg()), dst_hi)); + ctx.emit(Inst::cmove(8, CC::Z, RegMem::reg(tmp1.to_reg()), dst_lo)); + ctx.emit(Inst::cmove(8, CC::NZ, RegMem::reg(tmp1.to_reg()), dst_hi)); +} + +fn emit_shr_i128>( + ctx: &mut C, + src: ValueRegs, + dst: ValueRegs>, + amt_src: Reg, + is_signed: bool, +) { + let src_lo = src.regs()[0]; + let src_hi = src.regs()[1]; + let dst_lo = dst.regs()[0]; + let dst_hi = dst.regs()[1]; + + // mov tmp1, src_hi + // {u,s}shr tmp1, amt_src + // mov tmp2, src_lo + // {u,s}shr tmp2, amt_src + // mov amt, 64 + // sub amt, amt_src + // mov tmp3, src_hi + // shl tmp3, amt + // or tmp3, tmp2 + // if is_signed: + // mov dst_hi, src_hi + // sshr dst_hi, 63 // get the sign bit + // else: + // xor dst_hi, dst_hi + // mov amt, amt_src + // and amt, 64 + // cmovz dst_hi, tmp1 + // cmovz dst_lo, tmp3 + // cmovnz dst_lo, tmp1 + + let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let tmp3 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let amt = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + + let shift_kind = if is_signed { + ShiftKind::ShiftRightArithmetic + } else { + ShiftKind::ShiftRightLogical + }; + + ctx.emit(Inst::gen_move(tmp1, src_hi, types::I64)); + ctx.emit(Inst::gen_move( + Writable::from_reg(regs::rcx()), + amt_src, + types::I64, + )); + ctx.emit(Inst::shift_r(8, shift_kind, None, tmp1)); + + ctx.emit(Inst::gen_move(tmp2, src_lo, types::I64)); + ctx.emit(Inst::gen_move( + Writable::from_reg(regs::rcx()), + amt_src, + types::I64, + )); + ctx.emit(Inst::shift_r(8, shift_kind, None, tmp2)); + + ctx.emit(Inst::imm(OperandSize::Size64, 64, amt)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Sub, + RegMemImm::reg(amt_src), + amt, + )); + + ctx.emit(Inst::gen_move(tmp3, src_hi, types::I64)); + ctx.emit(Inst::gen_move( + Writable::from_reg(regs::rcx()), + amt.to_reg(), + types::I64, + )); + ctx.emit(Inst::shift_r(8, ShiftKind::ShiftLeft, None, tmp3)); + + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Or, + RegMemImm::reg(tmp2.to_reg()), + tmp3, + )); + + if is_signed { + ctx.emit(Inst::gen_move(dst_hi, src_hi, types::I64)); + ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightArithmetic, + Some(63), + dst_hi, + )); + } else { + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Xor, + RegMemImm::reg(dst_hi.to_reg()), + dst_hi, + )); + } + // This isn't semantically necessary, but it keeps the + // register allocator happy, because it cannot otherwise + // infer that cmovz + cmovnz always defines dst_lo. 
+ ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Xor, + RegMemImm::reg(dst_lo.to_reg()), + dst_lo, + )); + + ctx.emit(Inst::gen_move(amt, amt_src, types::I64)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::imm(64), + amt, + )); + ctx.emit(Inst::cmove(8, CC::Z, RegMem::reg(tmp1.to_reg()), dst_hi)); + ctx.emit(Inst::cmove(8, CC::Z, RegMem::reg(tmp3.to_reg()), dst_lo)); + ctx.emit(Inst::cmove(8, CC::NZ, RegMem::reg(tmp1.to_reg()), dst_lo)); +} + fn make_libcall_sig>( ctx: &mut C, insn: IRInst, @@ -676,6 +1227,101 @@ fn lower_to_amode>(ctx: &mut C, spec: InsnInput, offset: i Amode::imm_reg(offset as u32, input).with_flags(flags) } +fn emit_moves>( + ctx: &mut C, + dst: ValueRegs>, + src: ValueRegs, + ty: Type, +) { + let (_, tys) = Inst::rc_for_type(ty).unwrap(); + for ((dst, src), ty) in dst.regs().iter().zip(src.regs().iter()).zip(tys.iter()) { + ctx.emit(Inst::gen_move(*dst, *src, *ty)); + } +} + +fn emit_cmoves>( + ctx: &mut C, + size: u8, + cc: CC, + src: ValueRegs, + dst: ValueRegs>, +) { + let size = size / src.len() as u8; + let size = u8::max(size, 4); // at least 32 bits + for (dst, src) in dst.regs().iter().zip(src.regs().iter()) { + ctx.emit(Inst::cmove(size, cc, RegMem::reg(*src), *dst)); + } +} + +fn emit_clz>( + ctx: &mut C, + orig_ty: Type, + ty: Type, + src: Reg, + dst: Writable, +) { + let src = RegMem::reg(src); + let tmp = ctx.alloc_tmp(ty).only_reg().unwrap(); + ctx.emit(Inst::imm( + OperandSize::from_bytes(ty.bytes()), + u64::max_value(), + dst, + )); + + ctx.emit(Inst::unary_rm_r( + ty.bytes() as u8, + UnaryRmROpcode::Bsr, + src, + tmp, + )); + + ctx.emit(Inst::cmove( + ty.bytes() as u8, + CC::Z, + RegMem::reg(dst.to_reg()), + tmp, + )); + + ctx.emit(Inst::imm( + OperandSize::from_bytes(ty.bytes()), + orig_ty.bits() as u64 - 1, + dst, + )); + + ctx.emit(Inst::alu_rmi_r( + ty == types::I64, + AluRmiROpcode::Sub, + RegMemImm::reg(tmp.to_reg()), + dst, + )); +} + +fn emit_ctz>( + ctx: &mut C, + orig_ty: Type, + ty: Type, + src: Reg, + dst: Writable, +) { + let src = RegMem::reg(src); + let tmp = ctx.alloc_tmp(ty).only_reg().unwrap(); + ctx.emit(Inst::imm(OperandSize::Size32, orig_ty.bits() as u64, tmp)); + + ctx.emit(Inst::unary_rm_r( + ty.bytes() as u8, + UnaryRmROpcode::Bsf, + src, + dst, + )); + + ctx.emit(Inst::cmove( + ty.bytes() as u8, + CC::Z, + RegMem::reg(tmp.to_reg()), + dst, + )); +} + //============================================================================= // Top-level instruction lowering entry point, for one instruction. @@ -898,6 +1544,102 @@ fn lower_insn_to_regs>( // Move the `lhs` to the same register as `dst`. 
ctx.emit(Inst::gen_move(dst, lhs, ty)); ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst)); + } else if ty == types::I128 || ty == types::B128 { + let alu_ops = match op { + Opcode::Iadd => (AluRmiROpcode::Add, AluRmiROpcode::Adc), + Opcode::Isub => (AluRmiROpcode::Sub, AluRmiROpcode::Sbb), + // multiply handled specially below + Opcode::Imul => (AluRmiROpcode::Mul, AluRmiROpcode::Mul), + Opcode::Band => (AluRmiROpcode::And, AluRmiROpcode::And), + Opcode::Bor => (AluRmiROpcode::Or, AluRmiROpcode::Or), + Opcode::Bxor => (AluRmiROpcode::Xor, AluRmiROpcode::Xor), + _ => panic!("Unsupported opcode with 128-bit integers: {:?}", op), + }; + let lhs = put_input_in_regs(ctx, inputs[0]); + let rhs = put_input_in_regs(ctx, inputs[1]); + let dst = get_output_reg(ctx, outputs[0]); + assert_eq!(lhs.len(), 2); + assert_eq!(rhs.len(), 2); + assert_eq!(dst.len(), 2); + + if op != Opcode::Imul { + // add, sub, and, or, xor: just do ops on lower then upper half. Carry-flag + // propagation is implicit (add/adc, sub/sbb). + ctx.emit(Inst::gen_move(dst.regs()[0], lhs.regs()[0], types::I64)); + ctx.emit(Inst::gen_move(dst.regs()[1], lhs.regs()[1], types::I64)); + ctx.emit(Inst::alu_rmi_r( + /* is_64 = */ true, + alu_ops.0, + RegMemImm::reg(rhs.regs()[0]), + dst.regs()[0], + )); + ctx.emit(Inst::alu_rmi_r( + /* is_64 = */ true, + alu_ops.1, + RegMemImm::reg(rhs.regs()[1]), + dst.regs()[1], + )); + } else { + // mul: + // dst_lo = lhs_lo * rhs_lo + // dst_hi = umulhi(lhs_lo, rhs_lo) + lhs_lo * rhs_hi + lhs_hi * rhs_lo + // + // so we emit: + // mov dst_lo, lhs_lo + // mul dst_lo, rhs_lo + // mov dst_hi, lhs_lo + // mul dst_hi, rhs_hi + // mov tmp, lhs_hi + // mul tmp, rhs_lo + // add dst_hi, tmp + // mov rax, lhs_lo + // umulhi rhs_lo // implicit rax arg/dst + // add dst_hi, rax + let tmp = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + ctx.emit(Inst::gen_move(dst.regs()[0], lhs.regs()[0], types::I64)); + ctx.emit(Inst::alu_rmi_r( + /* is_64 = */ true, + AluRmiROpcode::Mul, + RegMemImm::reg(rhs.regs()[0]), + dst.regs()[0], + )); + ctx.emit(Inst::gen_move(dst.regs()[1], lhs.regs()[0], types::I64)); + ctx.emit(Inst::alu_rmi_r( + /* is_64 = */ true, + AluRmiROpcode::Mul, + RegMemImm::reg(rhs.regs()[1]), + dst.regs()[1], + )); + ctx.emit(Inst::gen_move(tmp, lhs.regs()[1], types::I64)); + ctx.emit(Inst::alu_rmi_r( + /* is_64 = */ true, + AluRmiROpcode::Mul, + RegMemImm::reg(rhs.regs()[0]), + tmp, + )); + ctx.emit(Inst::alu_rmi_r( + /* is_64 = */ true, + AluRmiROpcode::Add, + RegMemImm::reg(tmp.to_reg()), + dst.regs()[1], + )); + ctx.emit(Inst::gen_move( + Writable::from_reg(regs::rax()), + lhs.regs()[0], + types::I64, + )); + ctx.emit(Inst::mul_hi( + /* size = */ 8, + /* signed = */ false, + RegMem::reg(rhs.regs()[0]), + )); + ctx.emit(Inst::alu_rmi_r( + /* is_64 = */ true, + AluRmiROpcode::Add, + RegMemImm::reg(regs::rdx()), + dst.regs()[1], + )); + } } else { let is_64 = ty == types::I64; let alu_op = match op { @@ -1022,17 +1764,27 @@ fn lower_insn_to_regs>( Opcode::Bnot => { let ty = ty.unwrap(); let size = ty.bytes() as u8; - let src = put_input_in_reg(ctx, inputs[0]); - let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - ctx.emit(Inst::gen_move(dst, src, ty)); if ty.is_vector() { + let src = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + ctx.emit(Inst::gen_move(dst, src, ty)); let tmp = ctx.alloc_tmp(ty).only_reg().unwrap(); ctx.emit(Inst::equals(ty, RegMem::from(tmp), tmp)); ctx.emit(Inst::xor(ty, RegMem::from(tmp), dst)); + } else if ty == 
types::I128 || ty == types::B128 { + let src = put_input_in_regs(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::gen_move(dst.regs()[0], src.regs()[0], types::I64)); + ctx.emit(Inst::not(8, dst.regs()[0])); + ctx.emit(Inst::gen_move(dst.regs()[1], src.regs()[1], types::I64)); + ctx.emit(Inst::not(8, dst.regs()[1])); } else if ty.is_bool() { unimplemented!("bool bnot") } else { + let src = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + ctx.emit(Inst::gen_move(dst, src, ty)); ctx.emit(Inst::not(size, dst)); } } @@ -1064,7 +1816,7 @@ fn lower_insn_to_regs>( let dst_ty = ctx.output_ty(insn, 0); debug_assert_eq!(ctx.input_ty(insn, 0), dst_ty); - if !dst_ty.is_vector() { + if !dst_ty.is_vector() && dst_ty.bits() <= 64 { // Scalar shifts on x86 have various encodings: // - shift by one bit, e.g. `SAL r/m8, 1` (not used here) // - shift by an immediate amount, e.g. `SAL r/m8, imm8` @@ -1118,6 +1870,89 @@ fn lower_insn_to_regs>( ctx.emit(Inst::mov_r_r(true, rhs.unwrap(), w_rcx)); } ctx.emit(Inst::shift_r(size, shift_kind, count, dst)); + } else if dst_ty == types::I128 { + let amt_src = put_input_in_reg(ctx, inputs[1]); + let src = put_input_in_regs(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + + match op { + Opcode::Ishl => { + emit_shl_i128(ctx, src, dst, amt_src); + } + Opcode::Ushr => { + emit_shr_i128(ctx, src, dst, amt_src, /* is_signed = */ false); + } + Opcode::Sshr => { + emit_shr_i128(ctx, src, dst, amt_src, /* is_signed = */ true); + } + Opcode::Rotl => { + // (mov tmp, src) + // (shl.i128 tmp, amt) + // (mov dst, src) + // (ushr.i128 dst, 128-amt) + // (or dst, tmp) + let tmp = ctx.alloc_tmp(types::I128); + emit_shl_i128(ctx, src, tmp, amt_src); + let inv_amt = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + ctx.emit(Inst::imm(OperandSize::Size64, 128, inv_amt)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Sub, + RegMemImm::reg(amt_src), + inv_amt, + )); + emit_shr_i128( + ctx, + src, + dst, + inv_amt.to_reg(), + /* is_signed = */ false, + ); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Or, + RegMemImm::reg(tmp.regs()[0].to_reg()), + dst.regs()[0], + )); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Or, + RegMemImm::reg(tmp.regs()[1].to_reg()), + dst.regs()[1], + )); + } + Opcode::Rotr => { + // (mov tmp, src) + // (ushr.i128 tmp, amt) + // (mov dst, src) + // (shl.i128 dst, 128-amt) + // (or dst, tmp) + let tmp = ctx.alloc_tmp(types::I128); + emit_shr_i128(ctx, src, tmp, amt_src, /* is_signed = */ false); + let inv_amt = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + ctx.emit(Inst::imm(OperandSize::Size64, 128, inv_amt)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Sub, + RegMemImm::reg(amt_src), + inv_amt, + )); + emit_shl_i128(ctx, src, dst, inv_amt.to_reg()); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Or, + RegMemImm::reg(tmp.regs()[0].to_reg()), + dst.regs()[0], + )); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Or, + RegMemImm::reg(tmp.regs()[1].to_reg()), + dst.regs()[1], + )); + } + _ => unreachable!(), + } } else if dst_ty == types::I8X16 && (op == Opcode::Ishl || op == Opcode::Ushr) { // Since the x86 instruction set does not have any 8x16 shift instructions (even in higher feature sets // like AVX), we lower the `ishl.i8x16` and `ushr.i8x16` to a sequence of instructions. 
The basic idea, @@ -1449,52 +2284,50 @@ fn lower_insn_to_regs>( // mov $(size_bits - 1), %dst // sub %tmp, %dst - let (ext_spec, ty) = match ctx.input_ty(insn, 0) { - types::I8 | types::I16 => (Some(ExtSpec::ZeroExtendTo32), types::I32), - a if a == types::I32 || a == types::I64 => (None, a), - _ => unreachable!(), - }; - - let src = if let Some(ext_spec) = ext_spec { - RegMem::reg(extend_input_to_reg(ctx, inputs[0], ext_spec)) + let orig_ty = ty.unwrap(); + if orig_ty == types::I128 { + // clz upper, tmp1 + // clz lower, dst + // add dst, 64 + // cmp tmp1, 64 + // cmovnz tmp1, dst + let dsts = get_output_reg(ctx, outputs[0]); + let dst = dsts.regs()[0]; + let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let srcs = put_input_in_regs(ctx, inputs[0]); + let src_lo = srcs.regs()[0]; + let src_hi = srcs.regs()[1]; + emit_clz(ctx, types::I64, types::I64, src_hi, tmp1); + emit_clz(ctx, types::I64, types::I64, src_lo, dst); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Add, + RegMemImm::imm(64), + dst, + )); + ctx.emit(Inst::cmp_rmi_r(8, RegMemImm::imm(64), tmp1.to_reg())); + ctx.emit(Inst::cmove(8, CC::NZ, RegMem::reg(tmp1.to_reg()), dst)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Xor, + RegMemImm::reg(dsts.regs()[1].to_reg()), + dsts.regs()[1], + )); } else { - input_to_reg_mem(ctx, inputs[0]) - }; - let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + let (ext_spec, ty) = match orig_ty { + types::I8 | types::I16 => (Some(ExtSpec::ZeroExtendTo32), types::I32), + a if a == types::I32 || a == types::I64 => (None, a), + _ => unreachable!(), + }; + let src = if let Some(ext_spec) = ext_spec { + extend_input_to_reg(ctx, inputs[0], ext_spec) + } else { + put_input_in_reg(ctx, inputs[0]) + }; - let tmp = ctx.alloc_tmp(ty).only_reg().unwrap(); - ctx.emit(Inst::imm( - OperandSize::from_bytes(ty.bytes()), - u64::max_value(), - dst, - )); - - ctx.emit(Inst::unary_rm_r( - ty.bytes() as u8, - UnaryRmROpcode::Bsr, - src, - tmp, - )); - - ctx.emit(Inst::cmove( - ty.bytes() as u8, - CC::Z, - RegMem::reg(dst.to_reg()), - tmp, - )); - - ctx.emit(Inst::imm( - OperandSize::from_bytes(ty.bytes()), - ty.bits() as u64 - 1, - dst, - )); - - ctx.emit(Inst::alu_rmi_r( - ty == types::I64, - AluRmiROpcode::Sub, - RegMemImm::reg(tmp.to_reg()), - dst, - )); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + emit_clz(ctx, orig_ty, ty, src, dst); + } } Opcode::Ctz => { @@ -1504,29 +2337,47 @@ fn lower_insn_to_regs>( // bsf %src, %dst // mov $(size_bits), %tmp // cmovz %tmp, %dst - let ty = ctx.input_ty(insn, 0); - let ty = if ty.bits() < 32 { types::I32 } else { ty }; - debug_assert!(ty == types::I32 || ty == types::I64); + let orig_ty = ctx.input_ty(insn, 0); + if orig_ty == types::I128 { + // ctz src_lo, dst + // ctz src_hi, tmp1 + // add tmp1, 64 + // cmp dst, 64 + // cmovz tmp1, dst + let dsts = get_output_reg(ctx, outputs[0]); + let dst = dsts.regs()[0]; + let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let srcs = put_input_in_regs(ctx, inputs[0]); + let src_lo = srcs.regs()[0]; + let src_hi = srcs.regs()[1]; + emit_ctz(ctx, types::I64, types::I64, src_lo, dst); + emit_ctz(ctx, types::I64, types::I64, src_hi, tmp1); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Add, + RegMemImm::imm(64), + tmp1, + )); + ctx.emit(Inst::cmp_rmi_r(8, RegMemImm::imm(64), dst.to_reg())); + ctx.emit(Inst::cmove(8, CC::Z, RegMem::reg(tmp1.to_reg()), dst)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Xor, + RegMemImm::reg(dsts.regs()[1].to_reg()), + 
dsts.regs()[1], + )); + } else { + let ty = if orig_ty.bits() < 32 { + types::I32 + } else { + orig_ty + }; + debug_assert!(ty == types::I32 || ty == types::I64); - let src = input_to_reg_mem(ctx, inputs[0]); - let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - - let tmp = ctx.alloc_tmp(ty).only_reg().unwrap(); - ctx.emit(Inst::imm(OperandSize::Size32, ty.bits() as u64, tmp)); - - ctx.emit(Inst::unary_rm_r( - ty.bytes() as u8, - UnaryRmROpcode::Bsf, - src, - dst, - )); - - ctx.emit(Inst::cmove( - ty.bytes() as u8, - CC::Z, - RegMem::reg(tmp.to_reg()), - dst, - )); + let src = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + emit_ctz(ctx, orig_ty, ty, src, dst); + } } Opcode::Popcnt => { @@ -1535,272 +2386,329 @@ fn lower_insn_to_regs>( let (ext_spec, ty) = match ctx.input_ty(insn, 0) { types::I8 | types::I16 => (Some(ExtSpec::ZeroExtendTo32), types::I32), a if a == types::I32 || a == types::I64 => (None, a), + types::I128 => (None, types::I128), _ => unreachable!(), }; - let src = if let Some(ext_spec) = ext_spec { - RegMem::reg(extend_input_to_reg(ctx, inputs[0], ext_spec)) + let (srcs, ty): (SmallVec<[RegMem; 2]>, Type) = if let Some(ext_spec) = ext_spec { + ( + smallvec![RegMem::reg(extend_input_to_reg(ctx, inputs[0], ext_spec))], + ty, + ) + } else if ty == types::I128 { + let regs = put_input_in_regs(ctx, inputs[0]); + ( + smallvec![RegMem::reg(regs.regs()[0]), RegMem::reg(regs.regs()[1])], + types::I64, + ) } else { // N.B.: explicitly put input in a reg here because the width of the instruction // into which this RM op goes may not match the width of the input type (in fact, // it won't for i32.popcnt), and we don't want a larger than necessary load. - RegMem::reg(put_input_in_reg(ctx, inputs[0])) + (smallvec![RegMem::reg(put_input_in_reg(ctx, inputs[0]))], ty) }; - let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - if ty == types::I64 { - let is_64 = true; + let mut dsts: SmallVec<[Reg; 2]> = smallvec![]; + for src in srcs { + let dst = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + dsts.push(dst.to_reg()); + if ty == types::I64 { + let is_64 = true; - let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); - let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); - let cst = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let cst = ctx.alloc_tmp(types::I64).only_reg().unwrap(); - // mov src, tmp1 - ctx.emit(Inst::mov64_rm_r(src.clone(), tmp1)); + // mov src, tmp1 + ctx.emit(Inst::mov64_rm_r(src.clone(), tmp1)); - // shr $1, tmp1 - ctx.emit(Inst::shift_r( - 8, - ShiftKind::ShiftRightLogical, - Some(1), - tmp1, - )); + // shr $1, tmp1 + ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightLogical, + Some(1), + tmp1, + )); - // mov 0x7777_7777_7777_7777, cst - ctx.emit(Inst::imm(OperandSize::Size64, 0x7777777777777777, cst)); + // mov 0x7777_7777_7777_7777, cst + ctx.emit(Inst::imm(OperandSize::Size64, 0x7777777777777777, cst)); - // andq cst, tmp1 - ctx.emit(Inst::alu_rmi_r( - is_64, - AluRmiROpcode::And, - RegMemImm::reg(cst.to_reg()), - tmp1, - )); + // andq cst, tmp1 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::And, + RegMemImm::reg(cst.to_reg()), + tmp1, + )); - // mov src, tmp2 - ctx.emit(Inst::mov64_rm_r(src, tmp2)); + // mov src, tmp2 + ctx.emit(Inst::mov64_rm_r(src, tmp2)); - // sub tmp1, tmp2 - ctx.emit(Inst::alu_rmi_r( - is_64, - AluRmiROpcode::Sub, - 
RegMemImm::reg(tmp1.to_reg()), - tmp2, - )); + // sub tmp1, tmp2 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Sub, + RegMemImm::reg(tmp1.to_reg()), + tmp2, + )); - // shr $1, tmp1 - ctx.emit(Inst::shift_r( - 8, - ShiftKind::ShiftRightLogical, - Some(1), - tmp1, - )); + // shr $1, tmp1 + ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightLogical, + Some(1), + tmp1, + )); - // and cst, tmp1 - ctx.emit(Inst::alu_rmi_r( - is_64, - AluRmiROpcode::And, - RegMemImm::reg(cst.to_reg()), - tmp1, - )); + // and cst, tmp1 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::And, + RegMemImm::reg(cst.to_reg()), + tmp1, + )); - // sub tmp1, tmp2 - ctx.emit(Inst::alu_rmi_r( - is_64, - AluRmiROpcode::Sub, - RegMemImm::reg(tmp1.to_reg()), - tmp2, - )); + // sub tmp1, tmp2 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Sub, + RegMemImm::reg(tmp1.to_reg()), + tmp2, + )); - // shr $1, tmp1 - ctx.emit(Inst::shift_r( - 8, - ShiftKind::ShiftRightLogical, - Some(1), - tmp1, - )); + // shr $1, tmp1 + ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightLogical, + Some(1), + tmp1, + )); - // and cst, tmp1 - ctx.emit(Inst::alu_rmi_r( - is_64, - AluRmiROpcode::And, - RegMemImm::reg(cst.to_reg()), - tmp1, - )); + // and cst, tmp1 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::And, + RegMemImm::reg(cst.to_reg()), + tmp1, + )); - // sub tmp1, tmp2 - ctx.emit(Inst::alu_rmi_r( - is_64, - AluRmiROpcode::Sub, - RegMemImm::reg(tmp1.to_reg()), - tmp2, - )); + // sub tmp1, tmp2 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Sub, + RegMemImm::reg(tmp1.to_reg()), + tmp2, + )); - // mov tmp2, dst - ctx.emit(Inst::mov64_rm_r(RegMem::reg(tmp2.to_reg()), dst)); + // mov tmp2, dst + ctx.emit(Inst::mov64_rm_r(RegMem::reg(tmp2.to_reg()), dst)); - // shr $4, dst - ctx.emit(Inst::shift_r(8, ShiftKind::ShiftRightLogical, Some(4), dst)); + // shr $4, dst + ctx.emit(Inst::shift_r(8, ShiftKind::ShiftRightLogical, Some(4), dst)); - // add tmp2, dst - ctx.emit(Inst::alu_rmi_r( - is_64, - AluRmiROpcode::Add, - RegMemImm::reg(tmp2.to_reg()), - dst, - )); + // add tmp2, dst + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Add, + RegMemImm::reg(tmp2.to_reg()), + dst, + )); - // mov $0x0F0F_0F0F_0F0F_0F0F, cst - ctx.emit(Inst::imm(OperandSize::Size64, 0x0F0F0F0F0F0F0F0F, cst)); + // mov $0x0F0F_0F0F_0F0F_0F0F, cst + ctx.emit(Inst::imm(OperandSize::Size64, 0x0F0F0F0F0F0F0F0F, cst)); - // and cst, dst - ctx.emit(Inst::alu_rmi_r( - is_64, - AluRmiROpcode::And, - RegMemImm::reg(cst.to_reg()), - dst, - )); + // and cst, dst + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::And, + RegMemImm::reg(cst.to_reg()), + dst, + )); - // mov $0x0101_0101_0101_0101, cst - ctx.emit(Inst::imm(OperandSize::Size64, 0x0101010101010101, cst)); + // mov $0x0101_0101_0101_0101, cst + ctx.emit(Inst::imm(OperandSize::Size64, 0x0101010101010101, cst)); - // mul cst, dst - ctx.emit(Inst::alu_rmi_r( - is_64, - AluRmiROpcode::Mul, - RegMemImm::reg(cst.to_reg()), - dst, - )); + // mul cst, dst + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Mul, + RegMemImm::reg(cst.to_reg()), + dst, + )); - // shr $56, dst - ctx.emit(Inst::shift_r( - 8, - ShiftKind::ShiftRightLogical, - Some(56), - dst, - )); + // shr $56, dst + ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightLogical, + Some(56), + dst, + )); + } else { + assert_eq!(ty, types::I32); + let is_64 = false; + + let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + + // mov src, tmp1 + 
ctx.emit(Inst::mov64_rm_r(src.clone(), tmp1)); + + // shr $1, tmp1 + ctx.emit(Inst::shift_r( + 4, + ShiftKind::ShiftRightLogical, + Some(1), + tmp1, + )); + + // andq $0x7777_7777, tmp1 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::And, + RegMemImm::imm(0x77777777), + tmp1, + )); + + // mov src, tmp2 + ctx.emit(Inst::mov64_rm_r(src, tmp2)); + + // sub tmp1, tmp2 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Sub, + RegMemImm::reg(tmp1.to_reg()), + tmp2, + )); + + // shr $1, tmp1 + ctx.emit(Inst::shift_r( + 4, + ShiftKind::ShiftRightLogical, + Some(1), + tmp1, + )); + + // and 0x7777_7777, tmp1 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::And, + RegMemImm::imm(0x77777777), + tmp1, + )); + + // sub tmp1, tmp2 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Sub, + RegMemImm::reg(tmp1.to_reg()), + tmp2, + )); + + // shr $1, tmp1 + ctx.emit(Inst::shift_r( + 4, + ShiftKind::ShiftRightLogical, + Some(1), + tmp1, + )); + + // and $0x7777_7777, tmp1 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::And, + RegMemImm::imm(0x77777777), + tmp1, + )); + + // sub tmp1, tmp2 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Sub, + RegMemImm::reg(tmp1.to_reg()), + tmp2, + )); + + // mov tmp2, dst + ctx.emit(Inst::mov64_rm_r(RegMem::reg(tmp2.to_reg()), dst)); + + // shr $4, dst + ctx.emit(Inst::shift_r(4, ShiftKind::ShiftRightLogical, Some(4), dst)); + + // add tmp2, dst + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Add, + RegMemImm::reg(tmp2.to_reg()), + dst, + )); + + // and $0x0F0F_0F0F, dst + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::And, + RegMemImm::imm(0x0F0F0F0F), + dst, + )); + + // mul $0x0101_0101, dst + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Mul, + RegMemImm::imm(0x01010101), + dst, + )); + + // shr $24, dst + ctx.emit(Inst::shift_r( + 4, + ShiftKind::ShiftRightLogical, + Some(24), + dst, + )); + } + } + + if dsts.len() == 1 { + let final_dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + ctx.emit(Inst::gen_move(final_dst, dsts[0], types::I64)); } else { - assert_eq!(ty, types::I32); - let is_64 = false; - - let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); - let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); - - // mov src, tmp1 - ctx.emit(Inst::mov64_rm_r(src.clone(), tmp1)); - - // shr $1, tmp1 - ctx.emit(Inst::shift_r( - 4, - ShiftKind::ShiftRightLogical, - Some(1), - tmp1, - )); - - // andq $0x7777_7777, tmp1 + assert!(dsts.len() == 2); + let final_dst = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::gen_move(final_dst.regs()[0], dsts[0], types::I64)); ctx.emit(Inst::alu_rmi_r( - is_64, - AluRmiROpcode::And, - RegMemImm::imm(0x77777777), - tmp1, - )); - - // mov src, tmp2 - ctx.emit(Inst::mov64_rm_r(src, tmp2)); - - // sub tmp1, tmp2 - ctx.emit(Inst::alu_rmi_r( - is_64, - AluRmiROpcode::Sub, - RegMemImm::reg(tmp1.to_reg()), - tmp2, - )); - - // shr $1, tmp1 - ctx.emit(Inst::shift_r( - 4, - ShiftKind::ShiftRightLogical, - Some(1), - tmp1, - )); - - // and 0x7777_7777, tmp1 - ctx.emit(Inst::alu_rmi_r( - is_64, - AluRmiROpcode::And, - RegMemImm::imm(0x77777777), - tmp1, - )); - - // sub tmp1, tmp2 - ctx.emit(Inst::alu_rmi_r( - is_64, - AluRmiROpcode::Sub, - RegMemImm::reg(tmp1.to_reg()), - tmp2, - )); - - // shr $1, tmp1 - ctx.emit(Inst::shift_r( - 4, - ShiftKind::ShiftRightLogical, - Some(1), - tmp1, - )); - - // and $0x7777_7777, tmp1 - ctx.emit(Inst::alu_rmi_r( - is_64, - AluRmiROpcode::And, - RegMemImm::imm(0x77777777), - tmp1, - )); - - // sub tmp1, tmp2 - ctx.emit(Inst::alu_rmi_r( - is_64, - 
AluRmiROpcode::Sub, - RegMemImm::reg(tmp1.to_reg()), - tmp2, - )); - - // mov tmp2, dst - ctx.emit(Inst::mov64_rm_r(RegMem::reg(tmp2.to_reg()), dst)); - - // shr $4, dst - ctx.emit(Inst::shift_r(4, ShiftKind::ShiftRightLogical, Some(4), dst)); - - // add tmp2, dst - ctx.emit(Inst::alu_rmi_r( - is_64, + true, AluRmiROpcode::Add, - RegMemImm::reg(tmp2.to_reg()), - dst, + RegMemImm::reg(dsts[1]), + final_dst.regs()[0], )); - - // and $0x0F0F_0F0F, dst ctx.emit(Inst::alu_rmi_r( - is_64, - AluRmiROpcode::And, - RegMemImm::imm(0x0F0F0F0F), - dst, + true, + AluRmiROpcode::Xor, + RegMemImm::reg(final_dst.regs()[1].to_reg()), + final_dst.regs()[1], )); + } + } - // mul $0x0101_0101, dst - ctx.emit(Inst::alu_rmi_r( - is_64, - AluRmiROpcode::Mul, - RegMemImm::imm(0x01010101), - dst, - )); + Opcode::Bitrev => { + let ty = ctx.input_ty(insn, 0); + assert!( + ty == types::I8 + || ty == types::I16 + || ty == types::I32 + || ty == types::I64 + || ty == types::I128 + ); - // shr $24, dst - ctx.emit(Inst::shift_r( - 4, - ShiftKind::ShiftRightLogical, - Some(24), - dst, - )); + if ty == types::I128 { + let src = put_input_in_regs(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + emit_bitrev(ctx, src.regs()[0], dst.regs()[1], types::I64); + emit_bitrev(ctx, src.regs()[1], dst.regs()[0], types::I64); + } else { + let src = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + emit_bitrev(ctx, src, dst, ty); } } @@ -1836,63 +2744,112 @@ fn lower_insn_to_regs>( let src_ty = ctx.input_ty(insn, 0); let dst_ty = ctx.output_ty(insn, 0); - // Sextend requires a sign-extended move, but all the other opcodes are simply a move - // from a zero-extended source. Here is why this works, in each case: - // - // - Bint: Bool-to-int. We always represent a bool as a 0 or 1, so we merely need to - // zero-extend here. - // - // - Breduce, Bextend: changing width of a boolean. We represent a bool as a 0 or 1, so - // again, this is a zero-extend / no-op. - // - // - Ireduce: changing width of an integer. Smaller ints are stored with undefined - // high-order bits, so we can simply do a copy. + if src_ty == types::I128 { + assert!(dst_ty.bits() <= 64); + assert!(op == Opcode::Ireduce); + let src = put_input_in_regs(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + ctx.emit(Inst::gen_move(dst, src.regs()[0], types::I64)); + } else if dst_ty == types::I128 { + assert!(src_ty.bits() <= 64); + let src = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + assert!(op == Opcode::Uextend || op == Opcode::Sextend || op == Opcode::Bint); + // Extend to 64 bits first. - if src_ty == types::I32 && dst_ty == types::I64 && op != Opcode::Sextend { - // As a particular x64 extra-pattern matching opportunity, all the ALU opcodes on - // 32-bits will zero-extend the upper 32-bits, so we can even not generate a - // zero-extended move in this case. - // TODO add loads and shifts here. 
- if let Some(_) = matches_input_any( - ctx, - inputs[0], - &[ - Opcode::Iadd, - Opcode::IaddIfcout, - Opcode::Isub, - Opcode::Imul, - Opcode::Band, - Opcode::Bor, - Opcode::Bxor, - ], - ) { - let src = put_input_in_reg(ctx, inputs[0]); - let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - ctx.emit(Inst::gen_move(dst, src, types::I64)); - return Ok(()); - } - } - - let src = input_to_reg_mem(ctx, inputs[0]); - let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - - let ext_mode = ExtMode::new(src_ty.bits(), dst_ty.bits()); - assert_eq!( - src_ty.bits() < dst_ty.bits(), - ext_mode.is_some(), - "unexpected extension: {} -> {}", - src_ty, - dst_ty - ); - - if let Some(ext_mode) = ext_mode { - if op == Opcode::Sextend { - ctx.emit(Inst::movsx_rm_r(ext_mode, src, dst)); + let ext_mode = ExtMode::new(src_ty.bits(), /* dst bits = */ 64); + if let Some(ext_mode) = ext_mode { + if op == Opcode::Sextend { + ctx.emit(Inst::movsx_rm_r(ext_mode, RegMem::reg(src), dst.regs()[0])); + } else { + ctx.emit(Inst::movzx_rm_r(ext_mode, RegMem::reg(src), dst.regs()[0])); + } } else { - ctx.emit(Inst::movzx_rm_r(ext_mode, src, dst)); + ctx.emit(Inst::mov64_rm_r(RegMem::reg(src), dst.regs()[0])); + } + + // Now generate the top 64 bits. + if op == Opcode::Sextend { + // Sign-extend: move dst[0] into dst[1] and arithmetic-shift right by 63 bits + // to spread the sign bit across all bits. + ctx.emit(Inst::gen_move( + dst.regs()[1], + dst.regs()[0].to_reg(), + types::I64, + )); + ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightArithmetic, + Some(63), + dst.regs()[1], + )); + } else { + // Zero-extend: just zero the top word. + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Xor, + RegMemImm::reg(dst.regs()[1].to_reg()), + dst.regs()[1], + )); } } else { - ctx.emit(Inst::mov64_rm_r(src, dst)); + // Sextend requires a sign-extended move, but all the other opcodes are simply a move + // from a zero-extended source. Here is why this works, in each case: + // + // - Bint: Bool-to-int. We always represent a bool as a 0 or 1, so we merely need to + // zero-extend here. + // + // - Breduce, Bextend: changing width of a boolean. We represent a bool as a 0 or 1, so + // again, this is a zero-extend / no-op. + // + // - Ireduce: changing width of an integer. Smaller ints are stored with undefined + // high-order bits, so we can simply do a copy. + if src_ty == types::I32 && dst_ty == types::I64 && op != Opcode::Sextend { + // As a particular x64 extra-pattern matching opportunity, all the ALU opcodes on + // 32-bits will zero-extend the upper 32-bits, so we can even not generate a + // zero-extended move in this case. + // TODO add loads and shifts here. 
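+ // (For example, `addl %esi, %edi` already leaves the upper 32 bits of %rdi cleared, so the extend reduces to a plain 64-bit register move.)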
+ if let Some(_) = matches_input_any( + ctx, + inputs[0], + &[ + Opcode::Iadd, + Opcode::IaddIfcout, + Opcode::Isub, + Opcode::Imul, + Opcode::Band, + Opcode::Bor, + Opcode::Bxor, + ], + ) { + let src = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + ctx.emit(Inst::gen_move(dst, src, types::I64)); + return Ok(()); + } + } + + let src = input_to_reg_mem(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + + let ext_mode = ExtMode::new(src_ty.bits(), dst_ty.bits()); + assert_eq!( + src_ty.bits() < dst_ty.bits(), + ext_mode.is_some(), + "unexpected extension: {} -> {}", + src_ty, + dst_ty + ); + + if let Some(ext_mode) = ext_mode { + if op == Opcode::Sextend { + ctx.emit(Inst::movsx_rm_r(ext_mode, src, dst)); + } else { + ctx.emit(Inst::movzx_rm_r(ext_mode, src, dst)); + } + } else { + ctx.emit(Inst::mov64_rm_r(src, dst)); + } } } @@ -1901,7 +2858,7 @@ fn lower_insn_to_regs>( let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let ty = ctx.input_ty(insn, 0); if !ty.is_vector() { - emit_cmp(ctx, insn); + let condcode = emit_cmp(ctx, insn, condcode); let cc = CC::from_intcc(condcode); ctx.emit(Inst::setcc(cc, dst)); } else { @@ -2108,10 +3065,19 @@ fn lower_insn_to_regs>( Opcode::FallthroughReturn | Opcode::Return => { for i in 0..ctx.num_inputs(insn) { - let src_reg = put_input_in_reg(ctx, inputs[i]); + let src_reg = put_input_in_regs(ctx, inputs[i]); let retval_reg = ctx.retval(i); let ty = ctx.input_ty(insn, i); - ctx.emit(Inst::gen_move(retval_reg.only_reg().unwrap(), src_reg, ty)); + assert!(src_reg.len() == retval_reg.len()); + let (_, tys) = Inst::rc_for_type(ty)?; + for ((&src, &dst), &ty) in src_reg + .regs() + .iter() + .zip(retval_reg.regs().iter()) + .zip(tys.iter()) + { + ctx.emit(Inst::gen_move(dst, src, ty)); + } } // N.B.: the Ret itself is generated by the ABI. } @@ -2147,13 +3113,13 @@ fn lower_insn_to_regs>( abi.emit_stack_pre_adjust(ctx); assert_eq!(inputs.len(), abi.num_args()); for (i, input) in inputs.iter().enumerate() { - let arg_reg = put_input_in_reg(ctx, *input); - abi.emit_copy_regs_to_arg(ctx, i, ValueRegs::one(arg_reg)); + let arg_regs = put_input_in_regs(ctx, *input); + abi.emit_copy_regs_to_arg(ctx, i, arg_regs); } abi.emit_call(ctx); for (i, output) in outputs.iter().enumerate() { - let retval_reg = get_output_reg(ctx, *output).only_reg().unwrap(); - abi.emit_copy_retval_to_regs(ctx, i, ValueRegs::one(retval_reg)); + let retval_regs = get_output_reg(ctx, *output); + abi.emit_copy_retval_to_regs(ctx, i, retval_regs); } abi.emit_stack_post_adjust(ctx); } @@ -2180,11 +3146,11 @@ fn lower_insn_to_regs>( ctx.emit_safepoint(Inst::TrapIf { trap_code, cc }); } else if op == Opcode::Trapif { let cond_code = ctx.data(insn).cond_code().unwrap(); - let cc = CC::from_intcc(cond_code); // Verification ensures that the input is always a single-def ifcmp. let ifcmp = matches_input(ctx, inputs[0], Opcode::Ifcmp).unwrap(); - emit_cmp(ctx, ifcmp); + let cond_code = emit_cmp(ctx, ifcmp, cond_code); + let cc = CC::from_intcc(cond_code); ctx.emit_safepoint(Inst::TrapIf { trap_code, cc }); } else { @@ -2266,7 +3232,9 @@ fn lower_insn_to_regs>( Opcode::Fadd | Opcode::Fsub | Opcode::Fmul | Opcode::Fdiv => { let lhs = put_input_in_reg(ctx, inputs[0]); - let rhs = input_to_reg_mem(ctx, inputs[1]); + // We can't guarantee the RHS (if a load) is 128-bit aligned, so we + // must avoid merging a load here. 
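+ // N.B.: this trades a possible load-op merge for an explicit load into a register, i.e. at most one extra instruction, in exchange for not having to reason about the alignment of the merged access.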
+ let rhs = RegMem::reg(put_input_in_reg(ctx, inputs[1])); let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let ty = ty.unwrap(); @@ -2523,7 +3491,9 @@ fn lower_insn_to_regs>( } Opcode::FminPseudo | Opcode::FmaxPseudo => { - let lhs = input_to_reg_mem(ctx, inputs[0]); + // We can't guarantee the RHS (if a load) is 128-bit aligned, so we + // must avoid merging a load here. + let lhs = RegMem::reg(put_input_in_reg(ctx, inputs[0])); let rhs = put_input_in_reg(ctx, inputs[1]); let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let ty = ty.unwrap(); @@ -2539,7 +3509,9 @@ fn lower_insn_to_regs>( } Opcode::Sqrt => { - let src = input_to_reg_mem(ctx, inputs[0]); + // We can't guarantee the RHS (if a load) is 128-bit aligned, so we + // must avoid merging a load here. + let src = RegMem::reg(put_input_in_reg(ctx, inputs[0])); let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let ty = ty.unwrap(); @@ -2558,13 +3530,17 @@ fn lower_insn_to_regs>( } Opcode::Fpromote => { - let src = input_to_reg_mem(ctx, inputs[0]); + // We can't guarantee the RHS (if a load) is 128-bit aligned, so we + // must avoid merging a load here. + let src = RegMem::reg(put_input_in_reg(ctx, inputs[0])); let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); ctx.emit(Inst::xmm_unary_rm_r(SseOpcode::Cvtss2sd, src, dst)); } Opcode::Fdemote => { - let src = input_to_reg_mem(ctx, inputs[0]); + // We can't guarantee the RHS (if a load) is 128-bit aligned, so we + // must avoid merging a load here. + let src = RegMem::reg(put_input_in_reg(ctx, inputs[0])); let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); ctx.emit(Inst::xmm_unary_rm_r(SseOpcode::Cvtsd2ss, src, dst)); } @@ -2581,7 +3557,7 @@ fn lower_insn_to_regs>( let src = match ext_spec { Some(ext_spec) => RegMem::reg(extend_input_to_reg(ctx, inputs[0], ext_spec)), - None => input_to_reg_mem(ctx, inputs[0]), + None => RegMem::reg(put_input_in_reg(ctx, inputs[0])), }; let opcode = if output_ty == types::F32 { @@ -3096,7 +4072,7 @@ fn lower_insn_to_regs>( } Opcode::Fabs | Opcode::Fneg => { - let src = input_to_reg_mem(ctx, inputs[0]); + let src = RegMem::reg(put_input_in_reg(ctx, inputs[0])); let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); // In both cases, generate a constant and apply a single binary instruction: @@ -3392,59 +4368,64 @@ fn lower_insn_to_regs>( _ => unreachable!(), }; - let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - let is_xmm = elem_ty.is_float() || elem_ty.is_vector(); - - match (sign_extend, is_xmm) { - (true, false) => { - // The load is sign-extended only when the output size is lower than 64 bits, - // so ext-mode is defined in this case. - ctx.emit(Inst::movsx_rm_r(ext_mode.unwrap(), RegMem::mem(amode), dst)); - } - (false, false) => { - if elem_ty.bytes() == 8 { - // Use a plain load. - ctx.emit(Inst::mov64_m_r(amode, dst)) - } else { - // Use a zero-extended load. - ctx.emit(Inst::movzx_rm_r(ext_mode.unwrap(), RegMem::mem(amode), dst)) + if elem_ty == types::I128 { + let dsts = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::mov64_m_r(amode.clone(), dsts.regs()[0])); + ctx.emit(Inst::mov64_m_r(amode.offset(8), dsts.regs()[1])); + } else { + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + let is_xmm = elem_ty.is_float() || elem_ty.is_vector(); + match (sign_extend, is_xmm) { + (true, false) => { + // The load is sign-extended only when the output size is lower than 64 bits, + // so ext-mode is defined in this case. 
+ ctx.emit(Inst::movsx_rm_r(ext_mode.unwrap(), RegMem::mem(amode), dst)); } - } - (_, true) => { - ctx.emit(match elem_ty { - types::F32 => Inst::xmm_mov(SseOpcode::Movss, RegMem::mem(amode), dst), - types::F64 => Inst::xmm_mov(SseOpcode::Movsd, RegMem::mem(amode), dst), - types::I8X8 => { - if sign_extend == true { - Inst::xmm_mov(SseOpcode::Pmovsxbw, RegMem::mem(amode), dst) - } else { - Inst::xmm_mov(SseOpcode::Pmovzxbw, RegMem::mem(amode), dst) + (false, false) => { + if elem_ty.bytes() == 8 { + // Use a plain load. + ctx.emit(Inst::mov64_m_r(amode, dst)) + } else { + // Use a zero-extended load. + ctx.emit(Inst::movzx_rm_r(ext_mode.unwrap(), RegMem::mem(amode), dst)) + } + } + (_, true) => { + ctx.emit(match elem_ty { + types::F32 => Inst::xmm_mov(SseOpcode::Movss, RegMem::mem(amode), dst), + types::F64 => Inst::xmm_mov(SseOpcode::Movsd, RegMem::mem(amode), dst), + types::I8X8 => { + if sign_extend == true { + Inst::xmm_mov(SseOpcode::Pmovsxbw, RegMem::mem(amode), dst) + } else { + Inst::xmm_mov(SseOpcode::Pmovzxbw, RegMem::mem(amode), dst) + } } - } - types::I16X4 => { - if sign_extend == true { - Inst::xmm_mov(SseOpcode::Pmovsxwd, RegMem::mem(amode), dst) - } else { - Inst::xmm_mov(SseOpcode::Pmovzxwd, RegMem::mem(amode), dst) + types::I16X4 => { + if sign_extend == true { + Inst::xmm_mov(SseOpcode::Pmovsxwd, RegMem::mem(amode), dst) + } else { + Inst::xmm_mov(SseOpcode::Pmovzxwd, RegMem::mem(amode), dst) + } } - } - types::I32X2 => { - if sign_extend == true { - Inst::xmm_mov(SseOpcode::Pmovsxdq, RegMem::mem(amode), dst) - } else { - Inst::xmm_mov(SseOpcode::Pmovzxdq, RegMem::mem(amode), dst) + types::I32X2 => { + if sign_extend == true { + Inst::xmm_mov(SseOpcode::Pmovsxdq, RegMem::mem(amode), dst) + } else { + Inst::xmm_mov(SseOpcode::Pmovzxdq, RegMem::mem(amode), dst) + } } - } - _ if elem_ty.is_vector() && elem_ty.bits() == 128 => { - Inst::xmm_mov(SseOpcode::Movups, RegMem::mem(amode), dst) - } - // TODO Specialize for different types: MOVUPD, MOVDQU - _ => unreachable!( - "unexpected type for load: {:?} - {:?}", - elem_ty, - elem_ty.bits() - ), - }); + _ if elem_ty.is_vector() && elem_ty.bits() == 128 => { + Inst::xmm_mov(SseOpcode::Movups, RegMem::mem(amode), dst) + } + // TODO Specialize for different types: MOVUPD, MOVDQU + _ => unreachable!( + "unexpected type for load: {:?} - {:?}", + elem_ty, + elem_ty.bits() + ), + }); + } } } } @@ -3491,17 +4472,23 @@ fn lower_insn_to_regs>( _ => unreachable!(), }; - let src = put_input_in_reg(ctx, inputs[0]); + if elem_ty == types::I128 { + let srcs = put_input_in_regs(ctx, inputs[0]); + ctx.emit(Inst::mov_r_m(8, srcs.regs()[0], addr.clone())); + ctx.emit(Inst::mov_r_m(8, srcs.regs()[1], addr.offset(8))); + } else { + let src = put_input_in_reg(ctx, inputs[0]); - ctx.emit(match elem_ty { - types::F32 => Inst::xmm_mov_r_m(SseOpcode::Movss, src, addr), - types::F64 => Inst::xmm_mov_r_m(SseOpcode::Movsd, src, addr), - _ if elem_ty.is_vector() && elem_ty.bits() == 128 => { - // TODO Specialize for different types: MOVUPD, MOVDQU, etc. - Inst::xmm_mov_r_m(SseOpcode::Movups, src, addr) - } - _ => Inst::mov_r_m(elem_ty.bytes() as u8, src, addr), - }); + ctx.emit(match elem_ty { + types::F32 => Inst::xmm_mov_r_m(SseOpcode::Movss, src, addr), + types::F64 => Inst::xmm_mov_r_m(SseOpcode::Movsd, src, addr), + _ if elem_ty.is_vector() && elem_ty.bits() == 128 => { + // TODO Specialize for different types: MOVUPD, MOVDQU, etc. 
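+ // N.B.: movups tolerates unaligned addresses, so it is a safe default for any 128-bit vector store until the type-specialized forms are added.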
+ Inst::xmm_mov_r_m(SseOpcode::Movups, src, addr) + } + _ => Inst::mov_r_m(elem_ty.bytes() as u8, src, addr), + }); + } } Opcode::AtomicRmw => { @@ -3668,17 +4655,9 @@ fn lower_insn_to_regs>( }; let ty = ctx.output_ty(insn, 0); - let rhs = put_input_in_reg(ctx, rhs_input); - let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - let lhs = if is_int_or_ref_ty(ty) && ty.bytes() < 4 { - // Special case: since the higher bits are undefined per CLIF semantics, we - // can just apply a 32-bit cmove here. Force inputs into registers, to - // avoid partial spilling out-of-bounds with memory accesses, though. - // Sign-extend operands to 32, then do a cmove of size 4. - RegMem::reg(put_input_in_reg(ctx, lhs_input)) - } else { - input_to_reg_mem(ctx, lhs_input) - }; + let rhs = put_input_in_regs(ctx, rhs_input); + let dst = get_output_reg(ctx, outputs[0]); + let lhs = put_input_in_regs(ctx, lhs_input); // We request inversion of Equal to NotEqual here: taking LHS if equal would mean // take it if both CC::NP and CC::Z are set, the conjunction of which can't be @@ -3691,15 +4670,20 @@ fn lower_insn_to_regs>( assert_eq!(cond_code, FloatCC::Equal); } - ctx.emit(Inst::gen_move(dst, rhs, ty)); + emit_moves(ctx, dst, rhs, ty); match fcmp_results { FcmpCondResult::Condition(cc) => { - if is_int_or_ref_ty(ty) { - let size = u8::max(ty.bytes() as u8, 4); - ctx.emit(Inst::cmove(size, cc, lhs, dst)); + if is_int_or_ref_ty(ty) || ty == types::I128 || ty == types::B128 { + let size = ty.bytes() as u8; + emit_cmoves(ctx, size, cc, lhs, dst); } else { - ctx.emit(Inst::xmm_cmove(ty == types::F64, cc, lhs, dst)); + ctx.emit(Inst::xmm_cmove( + ty == types::F64, + cc, + RegMem::reg(lhs.only_reg().unwrap()), + dst.only_reg().unwrap(), + )); } } FcmpCondResult::AndConditions(_, _) => { @@ -3709,40 +4693,37 @@ fn lower_insn_to_regs>( } FcmpCondResult::InvertedEqualOrConditions(cc1, cc2) | FcmpCondResult::OrConditions(cc1, cc2) => { - if is_int_or_ref_ty(ty) { - let size = u8::max(ty.bytes() as u8, 4); - ctx.emit(Inst::cmove(size, cc1, lhs.clone(), dst)); - ctx.emit(Inst::cmove(size, cc2, lhs, dst)); + if is_int_or_ref_ty(ty) || ty == types::I128 { + let size = ty.bytes() as u8; + emit_cmoves(ctx, size, cc1, lhs.clone(), dst); + emit_cmoves(ctx, size, cc2, lhs, dst); } else { - ctx.emit(Inst::xmm_cmove(ty == types::F64, cc1, lhs.clone(), dst)); - ctx.emit(Inst::xmm_cmove(ty == types::F64, cc2, lhs, dst)); + ctx.emit(Inst::xmm_cmove( + ty == types::F64, + cc1, + RegMem::reg(lhs.only_reg().unwrap()), + dst.only_reg().unwrap(), + )); + ctx.emit(Inst::xmm_cmove( + ty == types::F64, + cc2, + RegMem::reg(lhs.only_reg().unwrap()), + dst.only_reg().unwrap(), + )); } } } } else { let ty = ty.unwrap(); - let mut size = ty.bytes() as u8; - let lhs = if is_int_or_ref_ty(ty) { - if size < 4 { - // Special case: since the higher bits are undefined per CLIF semantics, we - // can just apply a 32-bit cmove here. Force inputs into registers, to - // avoid partial spilling out-of-bounds with memory accesses, though. 
- size = 4; - RegMem::reg(put_input_in_reg(ctx, inputs[1])) - } else { - input_to_reg_mem(ctx, inputs[1]) - } - } else { - input_to_reg_mem(ctx, inputs[1]) - }; - - let rhs = put_input_in_reg(ctx, inputs[2]); - let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + let size = ty.bytes() as u8; + let lhs = put_input_in_regs(ctx, inputs[1]); + let rhs = put_input_in_regs(ctx, inputs[2]); + let dst = get_output_reg(ctx, outputs[0]); let cc = if let Some(icmp) = matches_input(ctx, flag_input, Opcode::Icmp) { - emit_cmp(ctx, icmp); let cond_code = ctx.data(icmp).cond_code().unwrap(); + let cond_code = emit_cmp(ctx, icmp, cond_code); CC::from_intcc(cond_code) } else { let sel_ty = ctx.input_ty(insn, 0); @@ -3768,21 +4749,26 @@ fn lower_insn_to_regs>( }; // This doesn't affect the flags. - ctx.emit(Inst::gen_move(dst, rhs, ty)); + emit_moves(ctx, dst, rhs, ty); - if is_int_or_ref_ty(ty) { - ctx.emit(Inst::cmove(size, cc, lhs, dst)); + if is_int_or_ref_ty(ty) || ty == types::I128 { + emit_cmoves(ctx, size, cc, lhs, dst); } else { debug_assert!(ty == types::F32 || ty == types::F64); - ctx.emit(Inst::xmm_cmove(ty == types::F64, cc, lhs, dst)); + ctx.emit(Inst::xmm_cmove( + ty == types::F64, + cc, + RegMem::reg(lhs.only_reg().unwrap()), + dst.only_reg().unwrap(), + )); } } } Opcode::Selectif | Opcode::SelectifSpectreGuard => { - let lhs = input_to_reg_mem(ctx, inputs[1]); - let rhs = put_input_in_reg(ctx, inputs[2]); - let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + let lhs = put_input_in_regs(ctx, inputs[1]); + let rhs = put_input_in_regs(ctx, inputs[2]); + let dst = get_output_reg(ctx, outputs[0]); let ty = ctx.output_ty(insn, 0); // Verification ensures that the input is always a single-def ifcmp. @@ -3792,26 +4778,24 @@ fn lower_insn_to_regs>( .unwrap() .0; debug_assert_eq!(ctx.data(cmp_insn).opcode(), Opcode::Ifcmp); - emit_cmp(ctx, cmp_insn); + let cond_code = ctx.data(insn).cond_code().unwrap(); + let cond_code = emit_cmp(ctx, cmp_insn, cond_code); - let cc = CC::from_intcc(ctx.data(insn).cond_code().unwrap()); + let cc = CC::from_intcc(cond_code); - if is_int_or_ref_ty(ty) { + if is_int_or_ref_ty(ty) || ty == types::I128 { let size = ty.bytes() as u8; - if size == 1 { - // Sign-extend operands to 32, then do a cmove of size 4. - let lhs_se = ctx.alloc_tmp(types::I32).only_reg().unwrap(); - ctx.emit(Inst::movsx_rm_r(ExtMode::BL, lhs, lhs_se)); - ctx.emit(Inst::movsx_rm_r(ExtMode::BL, RegMem::reg(rhs), dst)); - ctx.emit(Inst::cmove(4, cc, RegMem::reg(lhs_se.to_reg()), dst)); - } else { - ctx.emit(Inst::gen_move(dst, rhs, ty)); - ctx.emit(Inst::cmove(size, cc, lhs, dst)); - } + emit_moves(ctx, dst, rhs, ty); + emit_cmoves(ctx, size, cc, lhs, dst); } else { debug_assert!(ty == types::F32 || ty == types::F64); - ctx.emit(Inst::gen_move(dst, rhs, ty)); - ctx.emit(Inst::xmm_cmove(ty == types::F64, cc, lhs, dst)); + emit_moves(ctx, dst, rhs, ty); + ctx.emit(Inst::xmm_cmove( + ty == types::F64, + cc, + RegMem::reg(lhs.only_reg().unwrap()), + dst.only_reg().unwrap(), + )); } } @@ -3894,8 +4878,19 @@ fn lower_insn_to_regs>( // The quotient is in rax. ctx.emit(Inst::gen_move(dst, regs::rax(), input_ty)); } else { - // The remainder is in rdx. - ctx.emit(Inst::gen_move(dst, regs::rdx(), input_ty)); + if size == 1 { + // The remainder is in AH. Right-shift by 8 bits then move from rax. 
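+ // (After the 8-bit divide, the quotient is in AL and the remainder in AH, so a logical right shift of %rax by 8 brings the remainder down into AL, where the generic move below expects it.)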
+ ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightLogical, + Some(8), + Writable::from_reg(regs::rax()), + )); + ctx.emit(Inst::gen_move(dst, regs::rax(), input_ty)); + } else { + // The remainder is in rdx. + ctx.emit(Inst::gen_move(dst, regs::rdx(), input_ty)); + } } } @@ -4297,6 +5292,38 @@ fn lower_insn_to_regs>( } } + Opcode::Iconcat => { + let ty = ctx.output_ty(insn, 0); + assert_eq!( + ty, + types::I128, + "Iconcat not expected to be used for non-128-bit type" + ); + assert_eq!(ctx.input_ty(insn, 0), types::I64); + assert_eq!(ctx.input_ty(insn, 1), types::I64); + let lo = put_input_in_reg(ctx, inputs[0]); + let hi = put_input_in_reg(ctx, inputs[1]); + let dst = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::gen_move(dst.regs()[0], lo, types::I64)); + ctx.emit(Inst::gen_move(dst.regs()[1], hi, types::I64)); + } + + Opcode::Isplit => { + let ty = ctx.input_ty(insn, 0); + assert_eq!( + ty, + types::I128, + "Iconcat not expected to be used for non-128-bit type" + ); + assert_eq!(ctx.output_ty(insn, 0), types::I64); + assert_eq!(ctx.output_ty(insn, 1), types::I64); + let src = put_input_in_regs(ctx, inputs[0]); + let dst_lo = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + let dst_hi = get_output_reg(ctx, outputs[1]).only_reg().unwrap(); + ctx.emit(Inst::gen_move(dst_lo, src.regs()[0], types::I64)); + ctx.emit(Inst::gen_move(dst_hi, src.regs()[1], types::I64)); + } + Opcode::IaddImm | Opcode::ImulImm | Opcode::UdivImm @@ -4384,9 +5411,9 @@ impl LowerBackend for X64Backend { let src_ty = ctx.input_ty(branches[0], 0); if let Some(icmp) = matches_input(ctx, flag_input, Opcode::Icmp) { - emit_cmp(ctx, icmp); - let cond_code = ctx.data(icmp).cond_code().unwrap(); + let cond_code = emit_cmp(ctx, icmp, cond_code); + let cond_code = if op0 == Opcode::Brz { cond_code.inverse() } else { @@ -4416,6 +5443,32 @@ impl LowerBackend for X64Backend { } FcmpCondResult::InvertedEqualOrConditions(_, _) => unreachable!(), } + } else if src_ty == types::I128 { + let src = put_input_in_regs( + ctx, + InsnInput { + insn: branches[0], + input: 0, + }, + ); + let (half_cc, comb_op) = match op0 { + Opcode::Brz => (CC::Z, AluRmiROpcode::And8), + Opcode::Brnz => (CC::NZ, AluRmiROpcode::Or8), + _ => unreachable!(), + }; + let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + ctx.emit(Inst::cmp_rmi_r(8, RegMemImm::imm(0), src.regs()[0])); + ctx.emit(Inst::setcc(half_cc, tmp1)); + ctx.emit(Inst::cmp_rmi_r(8, RegMemImm::imm(0), src.regs()[1])); + ctx.emit(Inst::setcc(half_cc, tmp2)); + ctx.emit(Inst::alu_rmi_r( + false, + comb_op, + RegMemImm::reg(tmp1.to_reg()), + tmp2, + )); + ctx.emit(Inst::jmp_cond(CC::NZ, taken, not_taken)); } else if is_int_or_ref_ty(src_ty) || is_bool_ty(src_ty) { let src = put_input_in_reg( ctx, @@ -4483,8 +5536,8 @@ impl LowerBackend for X64Backend { }; if let Some(ifcmp) = matches_input(ctx, flag_input, Opcode::Ifcmp) { - emit_cmp(ctx, ifcmp); let cond_code = ctx.data(branches[0]).cond_code().unwrap(); + let cond_code = emit_cmp(ctx, ifcmp, cond_code); let cc = CC::from_intcc(cond_code); ctx.emit(Inst::jmp_cond(cc, taken, not_taken)); } else if let Some(ifcmp_sp) = matches_input(ctx, flag_input, Opcode::IfcmpSp) { diff --git a/cranelift/filetests/filetests/isa/x64/bitops-i128-run.clif b/cranelift/filetests/filetests/isa/x64/bitops-i128-run.clif new file mode 100644 index 0000000000..5795900438 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/bitops-i128-run.clif @@ -0,0 +1,27 @@ +test run +target x86_64 
+feature "experimental_x64" + +function %ctz(i64, i64) -> i8 { +block0(v0: i64, v1: i64): + v2 = iconcat v0, v1 + v3 = ctz.i128 v2 + v4 = ireduce.i8 v3 + return v4 +} +; run: %ctz(0x00000000_00000000, 0x00000001_00000000) == 96 +; run: %ctz(0x00000000_00010000, 0x00000001_00000000) == 16 +; run: %ctz(0x00000000_00010000, 0x00000000_00000000) == 16 +; run: %ctz(0x00000000_00000000, 0x00000000_00000000) == 128 + +function %clz(i64, i64) -> i8 { +block0(v0: i64, v1: i64): + v2 = iconcat v0, v1 + v3 = clz.i128 v2 + v4 = ireduce.i8 v3 + return v4 +} +; run: %clz(0x00000000_00000000, 0x00000001_00000000) == 31 +; run: %clz(0x00000000_00010000, 0x00000001_00000000) == 31 +; run: %clz(0x00000000_00010000, 0x00000000_00000000) == 111 +; run: %clz(0x00000000_00000000, 0x00000000_00000000) == 128 diff --git a/cranelift/filetests/filetests/isa/x64/bitrev-i128-run.clif b/cranelift/filetests/filetests/isa/x64/bitrev-i128-run.clif new file mode 100644 index 0000000000..64ea96716c --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/bitrev-i128-run.clif @@ -0,0 +1,47 @@ +test run +target x86_64 +feature "experimental_x64" + +function %reverse_bits_zero() -> b1 { +block0: + v0 = iconst.i64 0 + v1 = iconcat v0, v0 + v2 = bitrev.i128 v1 + v3 = icmp eq v2, v1 + return v3 +} +; run + +function %reverse_bits_one() -> b1 { +block0: + v0 = iconst.i64 0 + v1 = iconst.i64 1 + v2 = iconcat v0, v1 + + v3 = bitrev.i128 v2 + + v4 = iconst.i64 0x8000_0000_0000_0000 + v5 = iconst.i64 0 + v6 = iconcat v4, v5 + + v7 = icmp eq v3, v6 + return v7 +} +; run + +function %reverse_bits() -> b1 { +block0: + v0 = iconst.i64 0x06AD_8667_69EC_41BA + v1 = iconst.i64 0x6C83_D81A_6E28_83AB + v2 = iconcat v0, v1 + + v3 = bitrev.i128 v2 + + v4 = iconst.i64 0xD5C11476581BC136 + v5 = iconst.i64 0x5D823796E661B560 + v6 = iconcat v4, v5 + + v7 = icmp eq v3, v6 + return v7 +} +; run diff --git a/cranelift/filetests/filetests/isa/x64/floating-point.clif b/cranelift/filetests/filetests/isa/x64/floating-point.clif new file mode 100644 index 0000000000..b3b5907210 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/floating-point.clif @@ -0,0 +1,26 @@ +test compile +target x86_64 +feature "experimental_x64" + +function %f(f64) -> f64 { +block0(v0: f64): + v1 = fabs.f64 v0 + return v1 +} +; check: movabsq $$9223372036854775807, %rsi +; nextln: movq %rsi, %xmm1 +; nextln: andpd %xmm0, %xmm1 +; nextln: movaps %xmm1, %xmm0 + + +function %f(i64) -> f64 { +block0(v0: i64): + v1 = load.f64 v0 + v2 = fabs.f64 v1 + return v2 +} +; check: movsd 0(%rdi), %xmm0 +; nextln: movabsq $$9223372036854775807, %rsi +; nextln: movq %rsi, %xmm1 +; nextln: andpd %xmm0, %xmm1 +; nextln: movaps %xmm1, %xmm0 diff --git a/cranelift/filetests/filetests/isa/x64/i128.clif b/cranelift/filetests/filetests/isa/x64/i128.clif new file mode 100644 index 0000000000..e7ee34f283 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/i128.clif @@ -0,0 +1,1082 @@ +test compile +target x86_64 +feature "experimental_x64" + +function %f0(i128, i128) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128, v1: i128): + + v2 = iadd v0, v1 +; nextln: addq %rdx, %rdi +; nextln: adcq %rcx, %rsi + + return v2 +; nextln: movq %rdi, %rax +; nextln: movq %rsi, %rdx +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f1(i128, i128) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128, v1: i128): + + v2 = isub v0, v1 +; nextln: subq %rdx, %rdi +; nextln: sbbq %rcx, %rsi + + return v2 +; nextln: movq %rdi, %rax +; 
nextln: movq %rsi, %rdx +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f2(i128, i128) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128, v1: i128): + + v2 = band v0, v1 +; nextln: andq %rdx, %rdi +; nextln: andq %rcx, %rsi + + return v2 +; nextln: movq %rdi, %rax +; nextln: movq %rsi, %rdx +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f3(i128, i128) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128, v1: i128): + + v2 = bor v0, v1 +; nextln: orq %rdx, %rdi +; nextln: orq %rcx, %rsi + + return v2 +; nextln: movq %rdi, %rax +; nextln: movq %rsi, %rdx +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f4(i128, i128) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128, v1: i128): + + v2 = bxor v0, v1 +; nextln: xorq %rdx, %rdi +; nextln: xorq %rcx, %rsi + + return v2 +; nextln: movq %rdi, %rax +; nextln: movq %rsi, %rdx +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f5(i128) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128): + + v1 = bnot v0 +; nextln: notq %rdi +; nextln: notq %rsi + + return v1 +; nextln: movq %rdi, %rax +; nextln: movq %rsi, %rdx +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f6(i128, i128) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128, v1: i128): +; v0 in rdi:rsi, v1 in rdx:rcx + + v2 = imul v0, v1 +; nextln: movq %rsi, %rax +; nextln: movq %rcx, %r8 +; nextln: movq %rdi, %rsi +; nextln: imulq %rdx, %rsi +; nextln: movq %rdi, %rcx +; nextln: imulq %r8, %rcx +; nextln: imulq %rdx, %rax +; nextln: addq %rax, %rcx +; nextln: movq %rdi, %rax +; nextln: mul %rdx +; nextln: addq %rdx, %rcx +; nextln: movq %rsi, %rax +; nextln: movq %rcx, %rdx + + return v2 +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f7(i64, i64) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i64, v1: i64): + v2 = iconcat.i64 v0, v1 +; nextln: movq %rdi, %rax +; nextln: movq %rsi, %rdx + + return v2 +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f8(i128) -> i64, i64 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128): + v1, v2 = isplit.i128 v0 +; nextln: movq %rdi, %rax +; nextln: movq %rsi, %rdx + + return v1, v2 +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f9(i128, i128) -> b1 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128, v1: i128): + v2 = icmp eq v0, v1 +; check: cmpq %rcx, %rsi +; nextln: setz %al +; nextln: cmpq %rdx, %rdi +; nextln: setz %r8b +; nextln: andq %rax, %r8 +; nextln: andq $$1, %r8 +; nextln: setnz %al + + v3 = icmp ne v0, v1 +; check: cmpq %rcx, %rsi +; nextln: setnz %al +; nextln: cmpq %rdx, %rdi +; nextln: setnz %r8b +; nextln: orq %rax, %r8 +; nextln: andq $$1, %r8 +; nextln: setnz %r8b + + v4 = icmp slt v0, v1 +; check: cmpq %rcx, %rsi +; nextln: setl %r9b +; nextln: setz %al +; nextln: cmpq %rdx, %rdi +; nextln: setb %r10b +; nextln: andq %rax, %r10 +; nextln: orq %r9, %r10 +; nextln: andq $$1, %r10 +; nextln: setnz %r9b + + v5 = icmp sle v0, v1 +; check: cmpq %rcx, %rsi +; nextln: setl %r10b +; nextln: setz %al +; nextln: cmpq %rdx, %rdi +; nextln: setbe %r11b +; nextln: andq %rax, %r11 +; nextln: orq %r10, %r11 +; nextln: andq $$1, %r11 +; nextln: setnz %r10b + + v6 = icmp sgt v0, v1 +; check: cmpq %rcx, %rsi +; nextln: setnle %r11b +; 
nextln: setz %al +; nextln: cmpq %rdx, %rdi +; nextln: setnbe %r12b +; nextln: andq %rax, %r12 +; nextln: orq %r11, %r12 +; nextln: andq $$1, %r12 +; nextln: setnz %r11b + + v7 = icmp sge v0, v1 +; check: cmpq %rcx, %rsi +; nextln: setnle %r12b +; nextln: setz %al +; nextln: cmpq %rdx, %rdi +; nextln: setnb %r13b +; nextln: andq %rax, %r13 +; nextln: orq %r12, %r13 +; nextln: andq $$1, %r13 +; nextln: setnz %r12b + + v8 = icmp ult v0, v1 +; check: cmpq %rcx, %rsi +; nextln: setb %r13b +; nextln: setz %al +; nextln: cmpq %rdx, %rdi +; nextln: setb %r14b +; nextln: andq %rax, %r14 +; nextln: orq %r13, %r14 +; nextln: andq $$1, %r14 +; nextln: setnz %r13b + + v9 = icmp ule v0, v1 +; check: cmpq %rcx, %rsi +; nextln: setb %r14b +; nextln: setz %al +; nextln: cmpq %rdx, %rdi +; nextln: setbe %bl +; nextln: andq %rax, %rbx +; nextln: orq %r14, %rbx +; nextln: andq $$1, %rbx +; nextln: setnz %r14b + + v10 = icmp ugt v0, v1 +; check: cmpq %rcx, %rsi +; nextln: setnbe %bl +; nextln: setz %r15b +; nextln: cmpq %rdx, %rdi +; nextln: setnbe %al +; nextln: andq %r15, %rax +; nextln: orq %rbx, %rax +; nextln: andq $$1, %rax +; nextln: setnz %bl + + v11 = icmp uge v0, v1 +; check: cmpq %rcx, %rsi +; nextln: setnbe %sil +; nextln: setz %cl +; nextln: cmpq %rdx, %rdi +; nextln: setnb %dil +; nextln: andq %rcx, %rdi +; nextln: orq %rsi, %rdi +; nextln: andq $$1, %rdi +; nextln: setnz %sil + + v12 = band v2, v3 + v13 = band v4, v5 + v14 = band v6, v7 + v15 = band v8, v9 + v16 = band v10, v11 + v17 = band v12, v13 + v18 = band v14, v15 + v19 = band v17, v18 + v20 = band v19, v16 + + return v20 +; check: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f10(i128) -> i32 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128): + brz v0, block1 +; check: cmpq $$0, %rdi +; nextln: setz %dil +; nextln: cmpq $$0, %rsi +; nextln: setz %sil +; nextln: andb %dil, %sil +; nextln: jnz label1; j label2 + + jump block2 + +block1: + v1 = iconst.i32 1 + return v1 + +block2: + v2 = iconst.i32 2 + return v2 + +; check: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f11(i128) -> i32 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128): + brnz v0, block1 +; check: cmpq $$0, %rdi +; nextln: setnz %dil +; nextln: cmpq $$0, %rsi +; nextln: setnz %sil +; nextln: orb %dil, %sil +; nextln: jnz label1; j label2 + jump block2 + +block1: + v1 = iconst.i32 1 + return v1 + +block2: + v2 = iconst.i32 2 + return v2 + +; check: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f12(i64) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i64): + v1 = uextend.i128 v0 + return v1 + +; nextln: movq %rdi, %rsi +; nextln: xorq %rdi, %rdi +; nextln: movq %rsi, %rax +; nextln: movq %rdi, %rdx + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f13(i64) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i64): + v1 = sextend.i128 v0 + return v1 + +; nextln: movq %rdi, %rsi +; nextln: movq %rsi, %rdi +; nextln: sarq $$63, %rdi +; nextln: movq %rsi, %rax +; nextln: movq %rdi, %rdx + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f14(i8) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i8): + v1 = sextend.i128 v0 + return v1 + +; nextln: movsbq %dil, %rsi +; nextln: movq %rsi, %rdi +; nextln: sarq $$63, %rdi +; nextln: movq %rsi, %rax +; nextln: movq %rdi, %rdx + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function 
%f15(i8) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i8): + v1 = uextend.i128 v0 + return v1 + +; nextln: movzbq %dil, %rsi +; nextln: xorq %rdi, %rdi +; nextln: movq %rsi, %rax +; nextln: movq %rdi, %rdx + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret + +} + +function %f16(i128) -> i64 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128): + v1 = ireduce.i64 v0 + return v1 + +; nextln: movq %rdi, %rax + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f17(i128) -> i8 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128): + v1 = ireduce.i8 v0 + return v1 + +; nextln: movq %rdi, %rax + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f18(b1) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: b1): + v1 = bint.i128 v0 + return v1 + +; check: movzbq %dil, %rsi +; nextln: xorq %rdi, %rdi +; nextln: movq %rsi, %rax +; nextln: movq %rdi, %rdx + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f19(i128) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128): + v1 = popcnt.i128 v0 + return v1 + +; check: movq %rsi, %rdx +; nextln: movq %rdi, %rsi +; nextln: shrq $$1, %rsi +; nextln: movabsq $$8608480567731124087, %rcx +; nextln: andq %rcx, %rsi +; nextln: movq %rdi, %rax +; nextln: subq %rsi, %rax +; nextln: shrq $$1, %rsi +; nextln: andq %rcx, %rsi +; nextln: subq %rsi, %rax +; nextln: shrq $$1, %rsi +; nextln: andq %rcx, %rsi +; nextln: subq %rsi, %rax +; nextln: movq %rax, %rsi +; nextln: shrq $$4, %rsi +; nextln: addq %rax, %rsi +; nextln: movabsq $$1085102592571150095, %rdi +; nextln: andq %rdi, %rsi +; nextln: movabsq $$72340172838076673, %rdi +; nextln: imulq %rdi, %rsi +; nextln: shrq $$56, %rsi +; nextln: movq %rdx, %rax +; nextln: shrq $$1, %rax +; nextln: movabsq $$8608480567731124087, %rcx +; nextln: andq %rcx, %rax +; nextln: movq %rdx, %rdi +; nextln: subq %rax, %rdi +; nextln: shrq $$1, %rax +; nextln: andq %rcx, %rax +; nextln: subq %rax, %rdi +; nextln: shrq $$1, %rax +; nextln: andq %rcx, %rax +; nextln: subq %rax, %rdi +; nextln: movq %rdi, %rax +; nextln: shrq $$4, %rax +; nextln: addq %rdi, %rax +; nextln: movabsq $$1085102592571150095, %rdi +; nextln: andq %rdi, %rax +; nextln: movabsq $$72340172838076673, %rdi +; nextln: imulq %rdi, %rax +; nextln: shrq $$56, %rax +; nextln: addq %rax, %rsi +; nextln: xorq %rdi, %rdi +; nextln: movq %rsi, %rax +; nextln: movq %rdi, %rdx + + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + + +function %f20(i128) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128): + v1 = bitrev.i128 v0 + return v1 + +; check: movq %rdi, %rcx +; nextln: movq %rcx, %rdi +; nextln: movabsq $$6148914691236517205, %rax +; nextln: shrq $$1, %rdi +; nextln: andq %rax, %rdi +; nextln: andq %rcx, %rax +; nextln: shlq $$1, %rax +; nextln: movq %rax, %rcx +; nextln: orq %rdi, %rcx +; nextln: movq %rcx, %rdi +; nextln: movabsq $$3689348814741910323, %rax +; nextln: shrq $$2, %rdi +; nextln: andq %rax, %rdi +; nextln: andq %rcx, %rax +; nextln: shlq $$2, %rax +; nextln: movq %rax, %rcx +; nextln: orq %rdi, %rcx +; nextln: movq %rcx, %rdi +; nextln: movabsq $$1085102592571150095, %rax +; nextln: shrq $$4, %rdi +; nextln: andq %rax, %rdi +; nextln: andq %rcx, %rax +; nextln: shlq $$4, %rax +; nextln: movq %rax, %rcx +; nextln: orq %rdi, %rcx +; nextln: movq %rcx, %rdi +; nextln: movabsq $$71777214294589695, %rax +; nextln: 
shrq $$8, %rdi +; nextln: andq %rax, %rdi +; nextln: andq %rcx, %rax +; nextln: shlq $$8, %rax +; nextln: movq %rax, %rcx +; nextln: orq %rdi, %rcx +; nextln: movq %rcx, %rdi +; nextln: movabsq $$281470681808895, %rax +; nextln: shrq $$16, %rdi +; nextln: andq %rax, %rdi +; nextln: andq %rcx, %rax +; nextln: shlq $$16, %rax +; nextln: orq %rdi, %rax +; nextln: movq %rax, %rcx +; nextln: movl $$-1, %edi +; nextln: shrq $$32, %rcx +; nextln: andq %rdi, %rcx +; nextln: andq %rax, %rdi +; nextln: shlq $$32, %rdi +; nextln: orq %rcx, %rdi +; nextln: movq %rsi, %rcx +; nextln: movq %rcx, %rsi +; nextln: movabsq $$6148914691236517205, %rax +; nextln: shrq $$1, %rsi +; nextln: andq %rax, %rsi +; nextln: andq %rcx, %rax +; nextln: shlq $$1, %rax +; nextln: movq %rax, %rcx +; nextln: orq %rsi, %rcx +; nextln: movq %rcx, %rsi +; nextln: movabsq $$3689348814741910323, %rax +; nextln: shrq $$2, %rsi +; nextln: andq %rax, %rsi +; nextln: andq %rcx, %rax +; nextln: shlq $$2, %rax +; nextln: movq %rax, %rcx +; nextln: orq %rsi, %rcx +; nextln: movq %rcx, %rsi +; nextln: movabsq $$1085102592571150095, %rax +; nextln: shrq $$4, %rsi +; nextln: andq %rax, %rsi +; nextln: andq %rcx, %rax +; nextln: shlq $$4, %rax +; nextln: movq %rax, %rcx +; nextln: orq %rsi, %rcx +; nextln: movq %rcx, %rsi +; nextln: movabsq $$71777214294589695, %rax +; nextln: shrq $$8, %rsi +; nextln: andq %rax, %rsi +; nextln: andq %rcx, %rax +; nextln: shlq $$8, %rax +; nextln: movq %rax, %rcx +; nextln: orq %rsi, %rcx +; nextln: movq %rcx, %rsi +; nextln: movabsq $$281470681808895, %rax +; nextln: shrq $$16, %rsi +; nextln: andq %rax, %rsi +; nextln: andq %rcx, %rax +; nextln: shlq $$16, %rax +; nextln: orq %rsi, %rax +; nextln: movq %rax, %rsi +; nextln: movl $$-1, %ecx +; nextln: shrq $$32, %rsi +; nextln: andq %rcx, %rsi +; nextln: andq %rax, %rcx +; nextln: shlq $$32, %rcx +; nextln: orq %rsi, %rcx +; nextln: movq %rcx, %rax +; nextln: movq %rdi, %rdx + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f21(i128, i32) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128, v1: i32): + v2 = ushr v0, v1 + return v2 + +; check: movq %rdi, %rax +; nextln: movq %rsi, %rdi +; nextln: movq %rdi, %rsi +; nextln: movq %rdx, %rcx +; nextln: shrq %cl, %rsi +; nextln: movq %rdx, %rcx +; nextln: shrq %cl, %rax +; nextln: movl $$64, %ecx +; nextln: subq %rdx, %rcx +; nextln: shlq %cl, %rdi +; nextln: orq %rax, %rdi +; nextln: xorq %rax, %rax +; nextln: xorq %rcx, %rcx +; nextln: andq $$64, %rdx +; nextln: cmovzq %rsi, %rax +; nextln: cmovzq %rdi, %rcx +; nextln: cmovnzq %rsi, %rcx +; nextln: movq %rax, %rdx +; nextln: movq %rcx, %rax + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f22(i128, i32) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128, v1: i32): + v2 = ishl v0, v1 + return v2 + +; check: movq %rsi, %rax +; nextln: movq %rdi, %rsi +; nextln: movq %rdx, %rcx +; nextln: shlq %cl, %rsi +; nextln: movq %rdx, %rcx +; nextln: shlq %cl, %rax +; nextln: movl $$64, %ecx +; nextln: subq %rdx, %rcx +; nextln: shrq %cl, %rdi +; nextln: orq %rax, %rdi +; nextln: xorq %rax, %rax +; nextln: xorq %rcx, %rcx +; nextln: andq $$64, %rdx +; nextln: cmovzq %rdi, %rcx +; nextln: cmovzq %rsi, %rax +; nextln: cmovnzq %rsi, %rcx +; nextln: movq %rcx, %rdx + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f23(i128, i32) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128, v1: i32): + v2 = sshr v0, 
v1 + return v2 + +; check: movq %rdi, %r8 +; nextln: movq %rsi, %rdi +; nextln: movq %rdi, %rsi +; nextln: movq %rdx, %rcx +; nextln: sarq %cl, %rsi +; nextln: movq %rdx, %rcx +; nextln: sarq %cl, %r8 +; nextln: movl $$64, %ecx +; nextln: subq %rdx, %rcx +; nextln: movq %rdi, %rax +; nextln: shlq %cl, %rax +; nextln: orq %r8, %rax +; nextln: sarq $$63, %rdi +; nextln: xorq %rcx, %rcx +; nextln: andq $$64, %rdx +; nextln: cmovzq %rsi, %rdi +; nextln: cmovzq %rax, %rcx +; nextln: cmovnzq %rsi, %rcx +; nextln: movq %rcx, %rax +; nextln: movq %rdi, %rdx + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f24(i128, i32) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128, v1: i32): + v2 = rotr.i128 v0, v1 + return v2 + +; check: movq %rsi, %r9 +; nextln: movq %rdx, %rcx +; nextln: shrq %cl, %r9 +; nextln: movq %rdi, %rax +; nextln: movq %rdx, %rcx +; nextln: shrq %cl, %rax +; nextln: movl $$64, %ecx +; nextln: subq %rdx, %rcx +; nextln: movq %rsi, %r10 +; nextln: shlq %cl, %r10 +; nextln: orq %rax, %r10 +; nextln: xorq %r8, %r8 +; nextln: xorq %rax, %rax +; nextln: movq %rdx, %rcx +; nextln: andq $$64, %rcx +; nextln: cmovzq %r9, %r8 +; nextln: cmovzq %r10, %rax +; nextln: cmovnzq %r9, %rax +; nextln: movl $$128, %r9d +; nextln: subq %rdx, %r9 +; nextln: movq %rdi, %rdx +; nextln: movq %r9, %rcx +; nextln: shlq %cl, %rdx +; nextln: movq %r9, %rcx +; nextln: shlq %cl, %rsi +; nextln: movl $$64, %ecx +; nextln: subq %r9, %rcx +; nextln: movq %rdi, %r10 +; nextln: shrq %cl, %r10 +; nextln: orq %rsi, %r10 +; nextln: xorq %rsi, %rsi +; nextln: xorq %rdi, %rdi +; nextln: andq $$64, %r9 +; nextln: cmovzq %r10, %rdi +; nextln: cmovzq %rdx, %rsi +; nextln: cmovnzq %rdx, %rdi +; nextln: orq %rax, %rsi +; nextln: orq %r8, %rdi +; nextln: movq %rsi, %rax +; nextln: movq %rdi, %rdx + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f25(i128, i32) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128, v1: i32): + v2 = rotl.i128 v0, v1 + return v2 + +; check: movq %rdi, %r9 +; nextln: movq %rdx, %rcx +; nextln: shlq %cl, %r9 +; nextln: movq %rsi, %rax +; nextln: movq %rdx, %rcx +; nextln: shlq %cl, %rax +; nextln: movl $$64, %ecx +; nextln: subq %rdx, %rcx +; nextln: movq %rdi, %r10 +; nextln: shrq %cl, %r10 +; nextln: orq %rax, %r10 +; nextln: xorq %r8, %r8 +; nextln: xorq %rax, %rax +; nextln: movq %rdx, %rcx +; nextln: andq $$64, %rcx +; nextln: cmovzq %r10, %rax +; nextln: cmovzq %r9, %r8 +; nextln: cmovnzq %r9, %rax +; nextln: movl $$128, %r9d +; nextln: subq %rdx, %r9 +; nextln: movq %rsi, %rdx +; nextln: movq %r9, %rcx +; nextln: shrq %cl, %rdx +; nextln: movq %r9, %rcx +; nextln: shrq %cl, %rdi +; nextln: movl $$64, %ecx +; nextln: subq %r9, %rcx +; nextln: shlq %cl, %rsi +; nextln: orq %rdi, %rsi +; nextln: xorq %rdi, %rdi +; nextln: xorq %rcx, %rcx +; nextln: andq $$64, %r9 +; nextln: cmovzq %rdx, %rdi +; nextln: cmovzq %rsi, %rcx +; nextln: cmovnzq %rdx, %rcx +; nextln: orq %r8, %rcx +; nextln: orq %rax, %rdi +; nextln: movq %rcx, %rax +; nextln: movq %rdi, %rdx + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f26(i128, i64) { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128, v1: i64): + store.i128 v0, v1 + return + +; check: movq %rdi, 0(%rdx) +; nextln: movq %rsi, 8(%rdx) + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f27(i64) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i64): + v1 
= load.i128 v0 + return v1 + +; check: movq 0(%rdi), %rsi +; nextln: movq 8(%rdi), %rdi +; nextln: movq %rsi, %rax +; nextln: movq %rdi, %rdx + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f28(i128, b1) -> i128 { +block0(v0: i128, v1: b1): + v2 = iconst.i128 0 + brnz v1, block1(v2) + jump block2(v2) + +block1(v3: i128): + v4 = iconst.i128 1 + v5 = iadd.i128 v3, v4 + return v5 + +block2(v6: i128): + v7 = iconst.i128 2 + v8 = iadd.i128 v6, v7 + return v8 + +; check: pushq %rbp +; nextln: movq %rsp, %rbp +; nextln: testb $$1, %dl +; nextln: jnz label1; j label2 +; check: Block 1: +; check: movl $$0, %esi +; nextln: movl $$0, %edi +; nextln: movl $$1, %eax +; nextln: movl $$0, %ecx +; nextln: addq %rax, %rsi +; nextln: adcq %rcx, %rdi +; nextln: movq %rsi, %rax +; nextln: movq %rdi, %rdx +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +; check: Block 2: +; check: movl $$0, %esi +; nextln: movl $$0, %edi +; nextln: movl $$2, %eax +; nextln: movl $$0, %ecx +; nextln: addq %rax, %rsi +; nextln: adcq %rcx, %rdi +; nextln: movq %rsi, %rax +; nextln: movq %rdi, %rdx +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret + +} + +function %f29(i128, i128, i64, i128, i128, i128) -> i128 { + +block0(v0: i128, v1: i128, v2: i64, v3: i128, v4: i128, v5: i128): + v6 = iadd.i128 v0, v1 + v7 = uextend.i128 v2 + v8 = iadd.i128 v3, v7 + v9 = iadd.i128 v4, v5 + v10 = iadd.i128 v6, v8 + v11 = iadd.i128 v9, v10 + return v11 + +; check: movq %rsp, %rbp +; nextln: subq $$16, %rsp +; nextln: movq %r12, 0(%rsp) +; nextln: movq %r13, 8(%rsp) +; nextln: virtual_sp_offset_adjust 16 +; nextln: movq 16(%rbp), %r9 +; nextln: movq 24(%rbp), %r10 +; nextln: movq 32(%rbp), %r12 +; nextln: movq 40(%rbp), %r11 +; nextln: movq 48(%rbp), %rax +; nextln: movq 56(%rbp), %r13 +; nextln: addq %rdx, %rdi +; nextln: adcq %rcx, %rsi +; nextln: xorq %rcx, %rcx +; nextln: addq %r8, %r9 +; nextln: adcq %rcx, %r10 +; nextln: addq %rax, %r12 +; nextln: adcq %r13, %r11 +; nextln: addq %r9, %rdi +; nextln: adcq %r10, %rsi +; nextln: addq %rdi, %r12 +; nextln: adcq %rsi, %r11 +; nextln: movq %r12, %rax +; nextln: movq %r11, %rdx +; nextln: movq 0(%rsp), %r12 +; nextln: movq 8(%rsp), %r13 +; nextln: addq $$16, %rsp +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret + +} + +function %f30(i128) -> i128, i128, i128, i64, i128, i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128): + v1 = ireduce.i64 v0 + return v0, v0, v0, v1, v0, v0 + +; likely to change with regalloc -- just check the stores into the retval area: + +; check: movq %r8, 0(%rsi) +; nextln: movq %r9, 8(%rsi) +; nextln: movq %r10, 16(%rsi) +; nextln: movq %r11, 24(%rsi) +; nextln: movq %r12, 32(%rsi) +; nextln: movq %r13, 48(%rsi) +; nextln: movq %r14, 56(%rsi) +; nextln: movq %rdi, 64(%rsi) +; nextln: movq %rbx, 72(%rsi) + +} + +function %f31(i128, i128) -> i128, i128 { + fn0 = %g(i128, i128) -> i128, i128 +block0(v0: i128, v1: i128): + v2, v3 = call fn0(v0, v1) + return v2, v3 + +; check: pushq %rbp +; nextln: movq %rsp, %rbp +; nextln: subq $$16, %rsp +; nextln: movq %r12, 0(%rsp) +; nextln: virtual_sp_offset_adjust 8 +; nextln: movq %r8, %r12 +; nextln: subq $$16, %rsp +; nextln: virtual_sp_offset_adjust 16 +; nextln: lea 0(%rsp), %r8 +; nextln: load_ext_name %g+0, %rax +; nextln: call *%rax +; nextln: movq 0(%rsp), %rsi +; nextln: movq 8(%rsp), %rdi +; nextln: addq $$16, %rsp +; nextln: virtual_sp_offset_adjust -16 +; nextln: movq %rsi, 0(%r12) +; nextln: movq %rdi, 8(%r12) +; nextln: movq 
0(%rsp), %r12 +; nextln: addq $$16, %rsp +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret + +} + +function %f32(i128) -> i128 { +block0(v0: i128): + v1 = clz.i128 v0 + return v1 + +; check: pushq %rbp +; nextln: movq %rsp, %rbp +; nextln: movabsq $$-1, %rcx +; nextln: bsrq %rsi, %rax +; nextln: cmovzq %rcx, %rax +; nextln: movl $$63, %esi +; nextln: subq %rax, %rsi +; nextln: movabsq $$-1, %rcx +; nextln: bsrq %rdi, %rax +; nextln: cmovzq %rcx, %rax +; nextln: movl $$63, %edi +; nextln: subq %rax, %rdi +; nextln: addq $$64, %rdi +; nextln: cmpq $$64, %rsi +; nextln: cmovnzq %rsi, %rdi +; nextln: xorq %rsi, %rsi +; nextln: movq %rdi, %rax +; nextln: movq %rsi, %rdx +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret + +} + +function %f33(i128) -> i128 { +block0(v0: i128): + v1 = ctz.i128 v0 + return v1 +} + +; check: pushq %rbp +; nextln: movq %rsp, %rbp +; nextln: movq %rsi, %rax +; nextln: movl $$64, %ecx +; nextln: bsfq %rdi, %rsi +; nextln: cmovzq %rcx, %rsi +; nextln: movl $$64, %ecx +; nextln: bsfq %rax, %rdi +; nextln: cmovzq %rcx, %rdi +; nextln: addq $$64, %rdi +; nextln: cmpq $$64, %rsi +; nextln: cmovzq %rdi, %rsi +; nextln: xorq %rdi, %rdi +; nextln: movq %rsi, %rax +; nextln: movq %rdi, %rdx +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret diff --git a/cranelift/filetests/filetests/isa/x64/select-i128.clif b/cranelift/filetests/filetests/isa/x64/select-i128.clif new file mode 100644 index 0000000000..3492a71997 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/select-i128.clif @@ -0,0 +1,29 @@ +test compile +target x86_64 +feature "experimental_x64" + +function %f0(i32, i128, i128) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i32, v1: i128, v2: i128): + + v3 = iconst.i32 42 + v4 = icmp.i32 eq v0, v3 +; nextln: movl $$42, %eax +; nextln: cmpl %eax, %edi + + v5 = select.i128 v4, v1, v2 +; nextln: cmovzq %rsi, %rcx +; nextln: cmovzq %rdx, %r8 + + return v5 +; nextln: movq %rcx, %rax +; nextln: movq %r8, %rdx + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret + +} + diff --git a/cranelift/filetests/filetests/isa/x64/shift-i128-run.clif b/cranelift/filetests/filetests/isa/x64/shift-i128-run.clif new file mode 100644 index 0000000000..37bc4667e7 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/shift-i128-run.clif @@ -0,0 +1,106 @@ +test run +target x86_64 +feature "experimental_x64" + +function %ishl1() -> b1 { +block0: + v0 = iconst.i64 0x01010101_01010101 + v1 = iconcat v0, v0 + v2 = iconst.i32 2 + v3 = ishl.i128 v1, v2 + v4 = iconst.i64 0x04040404_04040404 + v5 = iconcat v4, v4 + v6 = icmp eq v3, v5 + return v6 +} +; run + +function %ishl2() -> b1 { +block0: + v0 = iconst.i64 0x01010101_01010101 + v1 = iconst.i64 0x01010101_01010101 + v2 = iconcat v0, v1 + v3 = iconst.i32 9 + v4 = ishl.i128 v2, v3 + v5 = iconst.i64 0x02020202_02020200 + v6 = iconst.i64 0x02020202_02020202 + v7 = iconcat v5, v6 + v8 = icmp eq v4, v7 + return v8 +} +; run + +function %ishl3() -> b1 { +block0: + v0 = iconst.i64 0x01010101_01010101 + v1 = iconst.i64 0xffffffff_ffffffff + v2 = iconcat v0, v1 + v3 = iconst.i32 66 + v4 = ishl.i128 v2, v3 + v5 = iconst.i64 0x00000000_00000000 + v6 = iconst.i64 0x04040404_04040404 + v7 = iconcat v5, v6 + v8 = icmp eq v4, v7 + return v8 +} +; run + +function %ushr1() -> b1 { +block0: + v0 = iconst.i64 0x01010101_01010101 + v1 = iconst.i64 0x01010101_01010101 + v2 = iconcat v0, v1 + v3 = iconst.i32 2 + v4 = ushr.i128 v2, v3 + v5 = iconst.i64 0x40404040_40404040 + v6 = iconst.i64 
0x00404040_40404040 + v7 = iconcat v5, v6 + v8 = icmp eq v4, v7 + return v8 +} +; run + +function %ushr2() -> b1 { +block0: + v0 = iconst.i64 0x01010101_01010101 + v1 = iconst.i64 0x01010101_01010101 + v2 = iconcat v0, v1 + v3 = iconst.i32 66 + v4 = ushr.i128 v2, v3 + v5 = iconst.i64 0x00404040_40404040 + v6 = iconst.i64 0x00000000_00000000 + v7 = iconcat v5, v6 + v8 = icmp eq v4, v7 + return v8 +} +; run + +function %sshr1() -> b1 { +block0: + v0 = iconst.i64 0x01010101_01010101 + v1 = iconst.i64 0x81010101_01010101 + v2 = iconcat v0, v1 + v3 = iconst.i32 2 + v4 = sshr.i128 v2, v3 + v5 = iconst.i64 0x40404040_40404040 + v6 = iconst.i64 0xe0404040_40404040 + v7 = iconcat v5, v6 + v8 = icmp eq v4, v7 + return v8 +} +; run + +function %sshr2() -> b1 { +block0: + v0 = iconst.i64 0x12345678_9abcdef0 + v1 = iconst.i64 0x80101010_10101010 + v2 = iconcat v0, v1 + v3 = iconst.i32 66 + v4 = sshr.i128 v2, v3 + v5 = iconst.i64 0xe0040404_04040404 + v6 = iconst.i64 0xffffffff_ffffffff + v7 = iconcat v5, v6 + v8 = icmp eq v4, v7 + return v8 +} +; run
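
A note on the i128 shift/rotate sequences checked above: the lowerings split the
128-bit value into 64-bit lo/hi halves, perform 64-bit shifts on each half (the
hardware masks the shift count to 6 bits), and then use the cmovz/cmovnz pairs on
`amt & 64` to select between the "amount < 64" and "amount >= 64" results; a rotate
is then composed as (x >> amt) | (x << (128 - amt)). Below is a minimal reference
sketch of that decomposition in Rust, for illustration only: it is not the backend's
lowering code, and the names rotr128/ushr128/ishl128 are invented for this sketch.

    // Reference model of the half-word decomposition (illustrative only).
    fn ushr128(lo: u64, hi: u64, amt: u64) -> (u64, u64) {
        let a = (amt & 63) as u32;
        if amt & 64 == 0 {
            // amt < 64: the low half receives bits shifted out of the high half.
            let carry = if a == 0 { 0 } else { hi << (64 - a) };
            ((lo >> a) | carry, hi >> a)
        } else {
            // amt >= 64: the high half shifts entirely into the low half.
            (hi >> a, 0)
        }
    }

    fn ishl128(lo: u64, hi: u64, amt: u64) -> (u64, u64) {
        let a = (amt & 63) as u32;
        if amt & 64 == 0 {
            // amt < 64: the high half receives bits shifted out of the low half.
            let carry = if a == 0 { 0 } else { lo >> (64 - a) };
            (lo << a, (hi << a) | carry)
        } else {
            // amt >= 64: the low half shifts entirely into the high half.
            (0, lo << a)
        }
    }

    fn rotr128(lo: u64, hi: u64, amt: u32) -> (u64, u64) {
        let amt = (amt % 128) as u64;
        // rotr(x, n) == (x >> n) | (x << (128 - n)), with both shifts taken mod 128.
        let shr = ushr128(lo, hi, amt);
        let shl = ishl128(lo, hi, (128 - amt) % 128);
        (shr.0 | shl.0, shr.1 | shl.1)
    }

    fn main() {
        // Cross-check the reference model against u128::rotate_right.
        let x: u128 = 0x0123_4567_89ab_cdef_0011_2233_4455_6677;
        for amt in [0u32, 1, 9, 63, 64, 66, 127] {
            let (lo, hi) = rotr128(x as u64, (x >> 64) as u64, amt);
            assert_eq!((lo as u128) | ((hi as u128) << 64), x.rotate_right(amt));
        }
        println!("reference rotr128 matches u128::rotate_right");
    }

rotl follows the same composition with the shift directions swapped, and the clz/ctz
sequences rely on the analogous idea: count within one half, and if that half is all
zeros (detected by comparing the per-half count against 64 before the final cmov),
add 64 to the count taken from the other half.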