diff --git a/cranelift/codegen/src/isa/x64/abi.rs b/cranelift/codegen/src/isa/x64/abi.rs index 74dca6c3ec..aa757392e3 100644 --- a/cranelift/codegen/src/isa/x64/abi.rs +++ b/cranelift/codegen/src/isa/x64/abi.rs @@ -138,42 +138,62 @@ impl ABIMachineSpec for X64ABIMachineSpec { ), } - let intreg = in_int_reg(param.value_type); - let vecreg = in_vec_reg(param.value_type); - debug_assert!(intreg || vecreg); - debug_assert!(!(intreg && vecreg)); - - let (next_reg, candidate) = if intreg { - let candidate = match args_or_rets { - ArgsOrRets::Args => get_intreg_for_arg_systemv(&call_conv, next_gpr), - ArgsOrRets::Rets => get_intreg_for_retval_systemv(&call_conv, next_gpr, i), - }; - debug_assert!(candidate - .map(|r| r.get_class() == RegClass::I64) - .unwrap_or(true)); - (&mut next_gpr, candidate) - } else { - let candidate = match args_or_rets { - ArgsOrRets::Args => get_fltreg_for_arg_systemv(&call_conv, next_vreg), - ArgsOrRets::Rets => get_fltreg_for_retval_systemv(&call_conv, next_vreg, i), - }; - debug_assert!(candidate - .map(|r| r.get_class() == RegClass::V128) - .unwrap_or(true)); - (&mut next_vreg, candidate) - }; - if let Some(param) = try_fill_baldrdash_reg(call_conv, param) { - assert!(intreg); ret.push(param); - } else if let Some(reg) = candidate { + continue; + } + + // Find regclass(es) of the register(s) used to store a value of this type. + let (rcs, _) = Inst::rc_for_type(param.value_type)?; + let intreg = rcs[0] == RegClass::I64; + let num_regs = rcs.len(); + assert!(num_regs <= 2); + if num_regs == 2 { + assert_eq!(rcs[0], rcs[1]); + } + + let mut regs: SmallVec<[RealReg; 2]> = smallvec![]; + for j in 0..num_regs { + let nextreg = if intreg { + match args_or_rets { + ArgsOrRets::Args => get_intreg_for_arg_systemv(&call_conv, next_gpr + j), + ArgsOrRets::Rets => { + get_intreg_for_retval_systemv(&call_conv, next_gpr + j, i + j) + } + } + } else { + match args_or_rets { + ArgsOrRets::Args => get_fltreg_for_arg_systemv(&call_conv, next_vreg + j), + ArgsOrRets::Rets => { + get_fltreg_for_retval_systemv(&call_conv, next_vreg + j, i + j) + } + } + }; + if let Some(reg) = nextreg { + regs.push(reg.to_real_reg()); + } else { + regs.clear(); + break; + } + } + + if regs.len() > 0 { + let regs = match num_regs { + 1 => ValueRegs::one(regs[0]), + 2 => ValueRegs::two(regs[0], regs[1]), + _ => panic!("More than two registers unexpected"), + }; ret.push(ABIArg::Reg( - ValueRegs::one(reg.to_real_reg()), + regs, param.value_type, param.extension, param.purpose, )); - *next_reg += 1; + if intreg { + next_gpr += num_regs; + } else { + next_vreg += num_regs; + } } else { // Compute size. Every arg takes a minimum slot of 8 bytes. (16-byte // stack alignment happens separately after all args.) 
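// Reviewer sketch (not part of the patch): a standalone model of the register
// assignment policy the hunk above implements for the SysV calling convention.
// An i128 argument needs two consecutive GPRs; if not enough registers remain,
// the whole value falls back to the stack (that is what `regs.clear()` above
// achieves). Names here (`Slot`, `assign_i128`) are hypothetical and only
// illustrate the policy, not the cranelift API.
#[derive(Debug, PartialEq)]
enum Slot {
    Regs(&'static str, &'static str),
    Stack { offset: u32 },
}

const SYSV_INT_ARG_REGS: [&str; 6] = ["rdi", "rsi", "rdx", "rcx", "r8", "r9"];

fn assign_i128(next_gpr: &mut usize, next_stack: &mut u32) -> Slot {
    if *next_gpr + 2 <= SYSV_INT_ARG_REGS.len() {
        let lo = SYSV_INT_ARG_REGS[*next_gpr];
        let hi = SYSV_INT_ARG_REGS[*next_gpr + 1];
        *next_gpr += 2; // both halves consume GPRs, mirroring `next_gpr += num_regs`
        Slot::Regs(lo, hi)
    } else {
        // Like the code above: if either half fails to get a register, the
        // partially collected registers are dropped and the value goes to stack.
        let offset = *next_stack;
        *next_stack += 16; // an i128 occupies two 8-byte stack slots (sketch assumption)
        Slot::Stack { offset }
    }
}

fn main() {
    let (mut gpr, mut stack) = (5, 0); // only %r9 left: not enough for both halves
    assert_eq!(assign_i128(&mut gpr, &mut stack), Slot::Stack { offset: 0 });
    let (mut gpr, mut stack) = (0, 0);
    assert_eq!(assign_i128(&mut gpr, &mut stack), Slot::Regs("rdi", "rsi"));
}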
@@ -658,31 +678,6 @@ impl From for SyntheticAmode { } } -fn in_int_reg(ty: types::Type) -> bool { - match ty { - types::I8 - | types::I16 - | types::I32 - | types::I64 - | types::B1 - | types::B8 - | types::B16 - | types::B32 - | types::B64 - | types::R64 => true, - types::R32 => panic!("unexpected 32-bits refs on x64!"), - _ => false, - } -} - -fn in_vec_reg(ty: types::Type) -> bool { - match ty { - types::F32 | types::F64 => true, - _ if ty.is_vector() => true, - _ => false, - } -} - fn get_intreg_for_arg_systemv(call_conv: &CallConv, idx: usize) -> Option { match call_conv { CallConv::Fast diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs index 898134644f..39ca25d060 100644 --- a/cranelift/codegen/src/isa/x64/inst/args.rs +++ b/cranelift/codegen/src/isa/x64/inst/args.rs @@ -346,23 +346,35 @@ impl PrettyPrintSized for RegMem { #[derive(Copy, Clone, PartialEq)] pub enum AluRmiROpcode { Add, + Adc, Sub, + Sbb, And, Or, Xor, /// The signless, non-extending (N x N -> N, for N in {32,64}) variant. Mul, + /// 8-bit form of And. Handled separately as we don't have full 8-bit op + /// support (we just use wider instructions). Used only with some sequences + /// with SETcc. + And8, + /// 8-bit form of Or. + Or8, } impl fmt::Debug for AluRmiROpcode { fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { let name = match self { AluRmiROpcode::Add => "add", + AluRmiROpcode::Adc => "adc", AluRmiROpcode::Sub => "sub", + AluRmiROpcode::Sbb => "sbb", AluRmiROpcode::And => "and", AluRmiROpcode::Or => "or", AluRmiROpcode::Xor => "xor", AluRmiROpcode::Mul => "imul", + AluRmiROpcode::And8 => "and", + AluRmiROpcode::Or8 => "or", }; write!(fmt, "{}", name) } @@ -374,6 +386,16 @@ impl fmt::Display for AluRmiROpcode { } } +impl AluRmiROpcode { + /// Is this a special-cased 8-bit ALU op? + pub fn is_8bit(self) -> bool { + match self { + AluRmiROpcode::And8 | AluRmiROpcode::Or8 => true, + _ => false, + } + } +} + #[derive(Clone, PartialEq)] pub enum UnaryRmROpcode { /// Bit-scan reverse. @@ -1010,7 +1032,7 @@ impl fmt::Display for ExtMode { } /// These indicate the form of a scalar shift/rotate: left, signed right, unsigned right. -#[derive(Clone)] +#[derive(Clone, Copy)] pub enum ShiftKind { ShiftLeft, /// Inserts zeros in the most significant bits. 
diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index 580d469b8d..075724d493 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -83,6 +83,14 @@ impl RexFlags { self } + #[inline(always)] + fn always_emit_if_8bit_needed(&mut self, reg: u8) -> &mut Self { + if reg >= 4 && reg <= 7 { + self.always_emit(); + } + self + } + #[inline(always)] fn must_clear_w(&self) -> bool { (self.0 & 1) != 0 @@ -527,7 +535,7 @@ pub(crate) fn emit( src, dst: reg_g, } => { - let rex = if *is_64 { + let mut rex = if *is_64 { RexFlags::set_w() } else { RexFlags::clear_w() @@ -581,17 +589,26 @@ pub(crate) fn emit( } } } else { - let (opcode_r, opcode_m, subopcode_i) = match op { - AluRmiROpcode::Add => (0x01, 0x03, 0), - AluRmiROpcode::Sub => (0x29, 0x2B, 5), - AluRmiROpcode::And => (0x21, 0x23, 4), - AluRmiROpcode::Or => (0x09, 0x0B, 1), - AluRmiROpcode::Xor => (0x31, 0x33, 6), + let (opcode_r, opcode_m, subopcode_i, is_8bit) = match op { + AluRmiROpcode::Add => (0x01, 0x03, 0, false), + AluRmiROpcode::Adc => (0x11, 0x03, 0, false), + AluRmiROpcode::Sub => (0x29, 0x2B, 5, false), + AluRmiROpcode::Sbb => (0x19, 0x2B, 5, false), + AluRmiROpcode::And => (0x21, 0x23, 4, false), + AluRmiROpcode::Or => (0x09, 0x0B, 1, false), + AluRmiROpcode::Xor => (0x31, 0x33, 6, false), + AluRmiROpcode::And8 => (0x20, 0x22, 4, true), + AluRmiROpcode::Or8 => (0x08, 0x0A, 1, true), AluRmiROpcode::Mul => panic!("unreachable"), }; + assert!(!(is_8bit && *is_64)); match src { RegMemImm::Reg { reg: reg_e } => { + if is_8bit { + rex.always_emit_if_8bit_needed(int_reg_enc(*reg_e)); + rex.always_emit_if_8bit_needed(int_reg_enc(reg_g.to_reg())); + } // GCC/llvm use the swapped operand encoding (viz., the R/RM vs RM/R // duality). Do this too, so as to be able to compare generated machine // code easily. @@ -604,11 +621,12 @@ pub(crate) fn emit( reg_g.to_reg(), rex, ); - // NB: if this is ever extended to handle byte size ops, be sure to retain - // redundant REX prefixes. } RegMemImm::Mem { addr } => { + if is_8bit { + rex.always_emit_if_8bit_needed(int_reg_enc(reg_g.to_reg())); + } // Here we revert to the "normal" G-E ordering. let amode = addr.finalize(state, sink); emit_std_reg_mem( @@ -625,6 +643,7 @@ pub(crate) fn emit( } RegMemImm::Imm { simm32 } => { + assert!(!is_8bit); let use_imm8 = low8_will_sign_extend_to_32(*simm32); let opcode = if use_imm8 { 0x83 } else { 0x81 }; // And also here we use the "normal" G-E ordering. 
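// Reviewer sketch (not part of the patch): the opcode table above wires up
// adc (0x11) and sbb (0x19) so that 128-bit add/sub can be split into a low-half
// op that sets the carry flag and a high-half op that consumes it. This is what
// the emitted add/adc and sub/sbb pairs compute, in plain Rust; `add128`/`sub128`
// are illustrative names, not cranelift helpers.
fn add128(lhs: (u64, u64), rhs: (u64, u64)) -> (u64, u64) {
    let (lo, carry) = lhs.0.overflowing_add(rhs.0); // addq: sets CF on unsigned overflow
    let hi = lhs.1.wrapping_add(rhs.1).wrapping_add(carry as u64); // adcq: adds CF back in
    (lo, hi)
}

fn sub128(lhs: (u64, u64), rhs: (u64, u64)) -> (u64, u64) {
    let (lo, borrow) = lhs.0.overflowing_sub(rhs.0); // subq: sets CF on borrow
    let hi = lhs.1.wrapping_sub(rhs.1).wrapping_sub(borrow as u64); // sbbq: subtracts CF
    (lo, hi)
}

fn main() {
    // 0x1_ffff_ffff_ffff_ffff + 1 = 0x2_0000_0000_0000_0000: carry propagates.
    assert_eq!(add128((u64::MAX, 1), (1, 0)), (0, 2));
    assert_eq!(sub128((0, 2), (1, 0)), (u64::MAX, 1));
}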
@@ -685,8 +704,13 @@ pub(crate) fn emit( } Inst::Not { size, src } => { + let src = int_reg_enc(src.to_reg()); let (opcode, prefix, rex_flags) = match size { - 1 => (0xF6, LegacyPrefixes::None, RexFlags::clear_w()), + 1 => ( + 0xF6, + LegacyPrefixes::None, + *RexFlags::clear_w().always_emit_if_8bit_needed(src), + ), 2 => (0xF7, LegacyPrefixes::_66, RexFlags::clear_w()), 4 => (0xF7, LegacyPrefixes::None, RexFlags::clear_w()), 8 => (0xF7, LegacyPrefixes::None, RexFlags::set_w()), @@ -694,13 +718,17 @@ pub(crate) fn emit( }; let subopcode = 2; - let src = int_reg_enc(src.to_reg()); emit_std_enc_enc(sink, prefix, opcode, 1, subopcode, src, rex_flags) } Inst::Neg { size, src } => { + let src = int_reg_enc(src.to_reg()); let (opcode, prefix, rex_flags) = match size { - 1 => (0xF6, LegacyPrefixes::None, RexFlags::clear_w()), + 1 => ( + 0xF6, + LegacyPrefixes::None, + *RexFlags::clear_w().always_emit_if_8bit_needed(src), + ), 2 => (0xF7, LegacyPrefixes::_66, RexFlags::clear_w()), 4 => (0xF7, LegacyPrefixes::None, RexFlags::clear_w()), 8 => (0xF7, LegacyPrefixes::None, RexFlags::set_w()), @@ -708,7 +736,6 @@ pub(crate) fn emit( }; let subopcode = 3; - let src = int_reg_enc(src.to_reg()); emit_std_enc_enc(sink, prefix, opcode, 1, subopcode, src, rex_flags) } @@ -717,7 +744,7 @@ pub(crate) fn emit( signed, divisor, } => { - let (opcode, prefix, rex_flags) = match size { + let (opcode, prefix, mut rex_flags) = match size { 1 => (0xF6, LegacyPrefixes::None, RexFlags::clear_w()), 2 => (0xF7, LegacyPrefixes::_66, RexFlags::clear_w()), 4 => (0xF7, LegacyPrefixes::None, RexFlags::clear_w()), @@ -732,6 +759,9 @@ pub(crate) fn emit( match divisor { RegMem::Reg { reg } => { let src = int_reg_enc(*reg); + if *size == 1 { + rex_flags.always_emit_if_8bit_needed(src); + } emit_std_enc_enc(sink, prefix, opcode, 1, subopcode, src, rex_flags) } RegMem::Mem { addr: src } => { @@ -987,9 +1017,7 @@ pub(crate) fn emit( ExtMode::BL | ExtMode::BQ => { // A redundant REX prefix must be emitted for certain register inputs. let enc_src = int_reg_enc(*src); - if enc_src >= 4 && enc_src <= 7 { - rex_flags.always_emit(); - }; + rex_flags.always_emit_if_8bit_needed(enc_src); } _ => {} } @@ -1084,9 +1112,7 @@ pub(crate) fn emit( ExtMode::BL | ExtMode::BQ => { // A redundant REX prefix must be emitted for certain register inputs. 
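// Reviewer sketch (not part of the patch): why `always_emit_if_8bit_needed`
// forces a REX byte for encodings 4..=7. Without any REX prefix, ModRM register
// fields 4..7 in a byte instruction select the legacy high-byte registers
// AH/CH/DH/BH; with a REX prefix present (even an otherwise empty 0x40), the
// same fields select SPL/BPL/SIL/DIL instead. Encodings 0..=3 (AL/CL/DL/BL) are
// unaffected, and 8..=15 (R8B..R15B) already require REX.B/REX.R anyway, so the
// prefix is only *forced* for 4..=7.
fn needs_forced_rex_for_8bit(hw_enc: u8) -> bool {
    (4..=7).contains(&hw_enc)
}

fn main() {
    assert!(!needs_forced_rex_for_8bit(0)); // %al: "notb %al" encodes as F6 D0
    assert!(needs_forced_rex_for_8bit(7));  // %dil: "notb %dil" needs 40 F6 D7
    assert!(!needs_forced_rex_for_8bit(15)); // %r15b carries REX.B/REX.R regardless
}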
let enc_src = int_reg_enc(*src); - if enc_src >= 4 && enc_src <= 7 { - rex_flags.always_emit(); - }; + rex_flags.always_emit_if_8bit_needed(enc_src); } _ => {} } @@ -1130,9 +1156,7 @@ pub(crate) fn emit( let mut rex = RexFlags::clear_w(); let enc_src = int_reg_enc(*src); - if enc_src >= 4 && enc_src <= 7 { - rex.always_emit(); - }; + rex.always_emit_if_8bit_needed(enc_src); // MOV r8, r/m8 is (REX.W==0) 88 /r emit_std_reg_mem( @@ -1215,7 +1239,11 @@ pub(crate) fn emit( match num_bits { None => { let (opcode, prefix, rex_flags) = match size { - 1 => (0xD2, LegacyPrefixes::None, RexFlags::clear_w()), + 1 => ( + 0xD2, + LegacyPrefixes::None, + *RexFlags::clear_w().always_emit_if_8bit_needed(enc_dst), + ), 2 => (0xD3, LegacyPrefixes::_66, RexFlags::clear_w()), 4 => (0xD3, LegacyPrefixes::None, RexFlags::clear_w()), 8 => (0xD3, LegacyPrefixes::None, RexFlags::set_w()), @@ -1231,7 +1259,11 @@ pub(crate) fn emit( Some(num_bits) => { let (opcode, prefix, rex_flags) = match size { - 1 => (0xC0, LegacyPrefixes::None, RexFlags::clear_w()), + 1 => ( + 0xC0, + LegacyPrefixes::None, + *RexFlags::clear_w().always_emit_if_8bit_needed(enc_dst), + ), 2 => (0xC1, LegacyPrefixes::_66, RexFlags::clear_w()), 4 => (0xC1, LegacyPrefixes::None, RexFlags::clear_w()), 8 => (0xC1, LegacyPrefixes::None, RexFlags::set_w()), @@ -1330,9 +1362,7 @@ pub(crate) fn emit( let mut rex = RexFlags::clear_w(); // Here, a redundant REX prefix changes the meaning of the instruction. let enc_g = int_reg_enc(*reg_g); - if enc_g >= 4 && enc_g <= 7 { - rex.always_emit(); - } + rex.always_emit_if_8bit_needed(enc_g); rex } _ => panic!("x64::Inst::Cmp_RMI_R::emit: unreachable"), @@ -1343,9 +1373,7 @@ pub(crate) fn emit( if *size == 1 { // Check whether the E register forces the use of a redundant REX. 
let enc_e = int_reg_enc(*reg_e); - if enc_e >= 4 && enc_e <= 7 { - rex.always_emit(); - } + rex.always_emit_if_8bit_needed(enc_e); } // Use the swapped operands encoding for CMP, to stay consistent with the output of @@ -2761,9 +2789,7 @@ pub(crate) fn emit( types::I8 => { let mut rex_flags = RexFlags::clear_w(); let enc_src = int_reg_enc(*src); - if enc_src >= 4 && enc_src <= 7 { - rex_flags.always_emit(); - }; + rex_flags.always_emit_if_8bit_needed(enc_src); (LegacyPrefixes::_F0, rex_flags, 0x0FB0) } types::I16 => (LegacyPrefixes::_66F0, RexFlags::clear_w(), 0x0FB1), diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs index c3489089b9..42e38c9cd5 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs @@ -1025,6 +1025,56 @@ fn test_x64_emit() { "4C09FA", "orq %r15, %rdx", )); + insns.push(( + Inst::alu_rmi_r(false, AluRmiROpcode::And8, RegMemImm::reg(r15), w_rdx), + "4420FA", + "andb %r15b, %dl", + )); + insns.push(( + Inst::alu_rmi_r(false, AluRmiROpcode::And8, RegMemImm::reg(rax), w_rsi), + "4020C6", + "andb %al, %sil", + )); + insns.push(( + Inst::alu_rmi_r(false, AluRmiROpcode::And8, RegMemImm::reg(rax), w_rbx), + "20C3", + "andb %al, %bl", + )); + insns.push(( + Inst::alu_rmi_r( + false, + AluRmiROpcode::And8, + RegMemImm::mem(Amode::imm_reg(0, rax)), + w_rbx, + ), + "2218", + "andb 0(%rax), %bl", + )); + insns.push(( + Inst::alu_rmi_r(false, AluRmiROpcode::Or8, RegMemImm::reg(r15), w_rdx), + "4408FA", + "orb %r15b, %dl", + )); + insns.push(( + Inst::alu_rmi_r(false, AluRmiROpcode::Or8, RegMemImm::reg(rax), w_rsi), + "4008C6", + "orb %al, %sil", + )); + insns.push(( + Inst::alu_rmi_r(false, AluRmiROpcode::Or8, RegMemImm::reg(rax), w_rbx), + "08C3", + "orb %al, %bl", + )); + insns.push(( + Inst::alu_rmi_r( + false, + AluRmiROpcode::Or8, + RegMemImm::mem(Amode::imm_reg(0, rax)), + w_rbx, + ), + "0A18", + "orb 0(%rax), %bl", + )); insns.push(( Inst::alu_rmi_r(true, AluRmiROpcode::Xor, RegMemImm::reg(r15), w_rdx), "4C31FA", @@ -1193,6 +1243,16 @@ fn test_x64_emit() { "66F7D7", "notw %di", )); + insns.push(( + Inst::not(1, Writable::from_reg(regs::rdi())), + "40F6D7", + "notb %dil", + )); + insns.push(( + Inst::not(1, Writable::from_reg(regs::rax())), + "F6D0", + "notb %al", + )); // ======================================================== // Neg @@ -1216,6 +1276,16 @@ fn test_x64_emit() { "66F7DF", "negw %di", )); + insns.push(( + Inst::neg(1, Writable::from_reg(regs::rdi())), + "40F6DF", + "negb %dil", + )); + insns.push(( + Inst::neg(1, Writable::from_reg(regs::rax())), + "F6D8", + "negb %al", + )); // ======================================================== // Div @@ -1239,6 +1309,16 @@ fn test_x64_emit() { "48F7F7", "div %rdi", )); + insns.push(( + Inst::div(1, false, RegMem::reg(regs::rax())), + "F6F0", + "div %al", + )); + insns.push(( + Inst::div(1, false, RegMem::reg(regs::rsi())), + "40F6F6", + "div %sil", + )); // ======================================================== // MulHi @@ -2352,9 +2432,14 @@ fn test_x64_emit() { )); insns.push(( Inst::shift_r(1, ShiftKind::RotateRight, None, w_rsi), - "D2CE", + "40D2CE", "rorb %cl, %sil", )); + insns.push(( + Inst::shift_r(1, ShiftKind::RotateRight, None, w_rax), + "D2C8", + "rorb %cl, %al", + )); insns.push(( Inst::shift_r(1, ShiftKind::RotateRight, Some(5), w_r15), "41C0CF05", diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs index 09c469498d..979c264231 100644 
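// Reviewer sketch (not part of the patch): hand-assembling the first new test
// case above, `andb %r15b, %dl` => "4420FA", to make the expected byte strings
// easier to audit. This mirrors only the byte layout; it is not the emitter.
fn encode_and8_reg_reg(src_enc: u8, dst_enc: u8) -> Vec<u8> {
    let mut out = Vec::new();
    // REX is 0100WRXB. W=0 for a byte op; R extends ModRM.reg (the source, since
    // opcode 0x20 is AND r/m8, r8); B extends ModRM.rm (the destination).
    let rex: u8 = 0x40 | (((src_enc >> 3) & 1) << 2) | ((dst_enc >> 3) & 1);
    let forced = (4..=7).contains(&src_enc) || (4..=7).contains(&dst_enc);
    if rex != 0x40 || forced {
        out.push(rex);
    }
    out.push(0x20); // AND r/m8, r8
    out.push(0xC0 | ((src_enc & 7) << 3) | (dst_enc & 7)); // ModRM with mod=0b11
    out
}

fn main() {
    // %r15b is encoding 15, %dl is encoding 2: REX.R set => "4420FA".
    assert_eq!(encode_and8_reg_reg(15, 2), vec![0x44, 0x20, 0xFA]);
    // %al (0) into %bl (3): no REX needed => "20C3".
    assert_eq!(encode_and8_reg_reg(0, 3), vec![0x20, 0xC3]);
    // %al (0) into %sil (6): the destination forces a bare 0x40 REX => "4020C6".
    assert_eq!(encode_and8_reg_reg(0, 6), vec![0x40, 0x20, 0xC6]);
}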
--- a/cranelift/codegen/src/isa/x64/inst/mod.rs +++ b/cranelift/codegen/src/isa/x64/inst/mod.rs @@ -1243,6 +1243,14 @@ impl PrettyPrint for Inst { (if is_64 { "q" } else { "l" }).to_string() } + fn suffix_lqb(is_64: bool, is_8: bool) -> String { + match (is_64, is_8) { + (_, true) => "b".to_string(), + (true, false) => "q".to_string(), + (false, false) => "l".to_string(), + } + } + fn size_lq(is_64: bool) -> u8 { if is_64 { 8 @@ -1251,6 +1259,16 @@ impl PrettyPrint for Inst { } } + fn size_lqb(is_64: bool, is_8: bool) -> u8 { + if is_8 { + 1 + } else if is_64 { + 8 + } else { + 4 + } + } + fn suffix_bwlq(size: u8) -> String { match size { 1 => "b".to_string(), @@ -1271,9 +1289,9 @@ impl PrettyPrint for Inst { dst, } => format!( "{} {}, {}", - ljustify2(op.to_string(), suffix_lq(*is_64)), - src.show_rru_sized(mb_rru, size_lq(*is_64)), - show_ireg_sized(dst.to_reg(), mb_rru, size_lq(*is_64)), + ljustify2(op.to_string(), suffix_lqb(*is_64, op.is_8bit())), + src.show_rru_sized(mb_rru, size_lqb(*is_64, op.is_8bit())), + show_ireg_sized(dst.to_reg(), mb_rru, size_lqb(*is_64, op.is_8bit())), ), Inst::UnaryRmR { src, dst, op, size } => format!( @@ -2065,6 +2083,17 @@ impl Amode { } } } + + /// Offset the amode by a fixed offset. + pub(crate) fn offset(&self, offset: u32) -> Self { + let mut ret = self.clone(); + match &mut ret { + &mut Amode::ImmReg { ref mut simm32, .. } => *simm32 += offset, + &mut Amode::ImmRegRegShift { ref mut simm32, .. } => *simm32 += offset, + _ => panic!("Cannot offset amode: {:?}", self), + } + ret + } } impl RegMemImm { @@ -2548,77 +2577,88 @@ impl MachInst for Inst { ty: Type, mut alloc_tmp: F, ) -> SmallVec<[Self; 4]> { - // We don't support 128-bit constants. - assert!(value <= u64::MAX as u128); let mut ret = SmallVec::new(); - let to_reg = to_regs - .only_reg() - .expect("multi-reg values not supported on x64"); - if ty == types::F32 { - if value == 0 { - ret.push(Inst::xmm_rm_r( - SseOpcode::Xorps, - RegMem::reg(to_reg.to_reg()), - to_reg, - )); - } else { - let tmp = alloc_tmp(types::I32); - ret.push(Inst::imm(OperandSize::Size32, value as u64, tmp)); - - ret.push(Inst::gpr_to_xmm( - SseOpcode::Movd, - RegMem::reg(tmp.to_reg()), - OperandSize::Size32, - to_reg, - )); - } - } else if ty == types::F64 { - if value == 0 { - ret.push(Inst::xmm_rm_r( - SseOpcode::Xorpd, - RegMem::reg(to_reg.to_reg()), - to_reg, - )); - } else { - let tmp = alloc_tmp(types::I64); - ret.push(Inst::imm(OperandSize::Size64, value as u64, tmp)); - - ret.push(Inst::gpr_to_xmm( - SseOpcode::Movq, - RegMem::reg(tmp.to_reg()), - OperandSize::Size64, - to_reg, - )); - } + if ty == types::I128 { + ret.push(Inst::imm( + OperandSize::Size64, + value as u64, + to_regs.regs()[0], + )); + ret.push(Inst::imm( + OperandSize::Size64, + (value >> 64) as u64, + to_regs.regs()[1], + )); } else { - // Must be an integer type. 
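// Reviewer sketch (not part of the patch): the I128 branch of `gen_constant`
// above loads the constant as two 64-bit immediates, the low half into
// regs()[0] and the high half into regs()[1]. The split itself is just:
fn split_u128(value: u128) -> (u64, u64) {
    let lo = value as u64;         // goes into the first (low) register
    let hi = (value >> 64) as u64; // goes into the second (high) register
    (lo, hi)
}

fn main() {
    let (lo, hi) = split_u128(0x0123_4567_89ab_cdef_fedc_ba98_7654_3210);
    assert_eq!((lo, hi), (0xfedc_ba98_7654_3210, 0x0123_4567_89ab_cdef));
}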
- debug_assert!( - ty == types::B1 - || ty == types::I8 - || ty == types::B8 - || ty == types::I16 - || ty == types::B16 - || ty == types::I32 - || ty == types::B32 - || ty == types::I64 - || ty == types::B64 - || ty == types::R32 - || ty == types::R64 - ); - if value == 0 { - ret.push(Inst::alu_rmi_r( - ty == types::I64, - AluRmiROpcode::Xor, - RegMemImm::reg(to_reg.to_reg()), - to_reg, - )); + let to_reg = to_regs + .only_reg() + .expect("multi-reg values not supported on x64"); + if ty == types::F32 { + if value == 0 { + ret.push(Inst::xmm_rm_r( + SseOpcode::Xorps, + RegMem::reg(to_reg.to_reg()), + to_reg, + )); + } else { + let tmp = alloc_tmp(types::I32); + ret.push(Inst::imm(OperandSize::Size32, value as u64, tmp)); + + ret.push(Inst::gpr_to_xmm( + SseOpcode::Movd, + RegMem::reg(tmp.to_reg()), + OperandSize::Size32, + to_reg, + )); + } + } else if ty == types::F64 { + if value == 0 { + ret.push(Inst::xmm_rm_r( + SseOpcode::Xorpd, + RegMem::reg(to_reg.to_reg()), + to_reg, + )); + } else { + let tmp = alloc_tmp(types::I64); + ret.push(Inst::imm(OperandSize::Size64, value as u64, tmp)); + + ret.push(Inst::gpr_to_xmm( + SseOpcode::Movq, + RegMem::reg(tmp.to_reg()), + OperandSize::Size64, + to_reg, + )); + } } else { - let value = value as u64; - ret.push(Inst::imm( - OperandSize::from_bytes(ty.bytes()), - value.into(), - to_reg, - )); + // Must be an integer type. + debug_assert!( + ty == types::B1 + || ty == types::I8 + || ty == types::B8 + || ty == types::I16 + || ty == types::B16 + || ty == types::I32 + || ty == types::B32 + || ty == types::I64 + || ty == types::B64 + || ty == types::R32 + || ty == types::R64 + ); + if value == 0 { + ret.push(Inst::alu_rmi_r( + ty == types::I64, + AluRmiROpcode::Xor, + RegMemImm::reg(to_reg.to_reg()), + to_reg, + )); + } else { + let value = value as u64; + ret.push(Inst::imm( + OperandSize::from_bytes(ty.bytes()), + value.into(), + to_reg, + )); + } } } ret diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index 9293221de5..a25da666b3 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -18,7 +18,7 @@ use alloc::vec::Vec; use cranelift_codegen_shared::condcodes::CondCode; use log::trace; use regalloc::{Reg, RegClass, Writable}; -use smallvec::SmallVec; +use smallvec::{smallvec, SmallVec}; use std::convert::TryFrom; use target_lexicon::Triple; @@ -28,6 +28,7 @@ use target_lexicon::Triple; fn is_int_or_ref_ty(ty: Type) -> bool { match ty { types::I8 | types::I16 | types::I32 | types::I64 | types::R64 => true, + types::B1 | types::B8 | types::B16 | types::B32 | types::B64 => true, types::R32 => panic!("shouldn't have 32-bits refs on x64"), _ => false, } @@ -107,23 +108,26 @@ fn generate_constant>(ctx: &mut C, ty: Type, c: u64) -> Va non_writable_value_regs(cst_copy) } -/// Put the given input into a register, and mark it as used (side-effect). -fn put_input_in_reg>(ctx: &mut C, spec: InsnInput) -> Reg { +/// Put the given input into possibly multiple registers, and mark it as used (side-effect). +fn put_input_in_regs>(ctx: &mut C, spec: InsnInput) -> ValueRegs { let ty = ctx.input_ty(spec.insn, spec.input); let input = ctx.get_input_as_source_or_const(spec.insn, spec.input); if let Some(c) = input.constant { // Generate constants fresh at each use to minimize long-range register pressure. 
generate_constant(ctx, ty, c) - .only_reg() - .expect("multi-reg values not supported yet") } else { ctx.put_input_in_regs(spec.insn, spec.input) - .only_reg() - .expect("multi-reg values not supported yet") } } +/// Put the given input into a register, and mark it as used (side-effect). +fn put_input_in_reg>(ctx: &mut C, spec: InsnInput) -> Reg { + put_input_in_regs(ctx, spec) + .only_reg() + .expect("Multi-register value not expected") +} + /// Determines whether a load operation (indicated by `src_insn`) can be merged /// into the current lowering point. If so, returns the address-base source (as /// an `InsnInput`) and an offset from that address from which to perform the @@ -373,25 +377,120 @@ fn emit_extract_lane>( /// /// Note: make sure that there are no instructions modifying the flags between a call to this /// function and the use of the flags! -fn emit_cmp>(ctx: &mut C, insn: IRInst) { +/// +/// Takes the condition code that will be tested, and returns +/// the condition code that should be used. This allows us to +/// synthesize comparisons out of multiple instructions for +/// special cases (e.g., 128-bit integers). +fn emit_cmp>(ctx: &mut C, insn: IRInst, cc: IntCC) -> IntCC { let ty = ctx.input_ty(insn, 0); let inputs = [InsnInput { insn, input: 0 }, InsnInput { insn, input: 1 }]; - // TODO Try to commute the operands (and invert the condition) if one is an immediate. - let lhs = put_input_in_reg(ctx, inputs[0]); - // We force the RHS into a register, and disallow load-op fusion, because we - // do not have a transitive guarantee that this cmp-site will be the sole - // user of the value. Consider: the icmp might be the only user of a load, - // but there may be multiple users of the icmp (e.g. select or bint - // instructions) that each invoke `emit_cmp()`. If we were to allow a load - // to sink to the *latest* one, but other sites did not permit sinking, then - // we would be missing the load for other cmp-sites. - let rhs = put_input_in_reg(ctx, inputs[1]); + if ty == types::I128 { + // We need to compare both halves and combine the results appropriately. 
+ let cmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let cmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let lhs = put_input_in_regs(ctx, inputs[0]); + let lhs_lo = lhs.regs()[0]; + let lhs_hi = lhs.regs()[1]; + let rhs = put_input_in_regs(ctx, inputs[1]); + let rhs_lo = RegMemImm::reg(rhs.regs()[0]); + let rhs_hi = RegMemImm::reg(rhs.regs()[1]); + match cc { + IntCC::Equal => { + ctx.emit(Inst::cmp_rmi_r(8, rhs_hi, lhs_hi)); + ctx.emit(Inst::setcc(CC::Z, cmp1)); + ctx.emit(Inst::cmp_rmi_r(8, rhs_lo, lhs_lo)); + ctx.emit(Inst::setcc(CC::Z, cmp2)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::reg(cmp1.to_reg()), + cmp2, + )); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::imm(1), + cmp2, + )); + IntCC::NotEqual + } + IntCC::NotEqual => { + ctx.emit(Inst::cmp_rmi_r(8, rhs_hi, lhs_hi)); + ctx.emit(Inst::setcc(CC::NZ, cmp1)); + ctx.emit(Inst::cmp_rmi_r(8, rhs_lo, lhs_lo)); + ctx.emit(Inst::setcc(CC::NZ, cmp2)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Or, + RegMemImm::reg(cmp1.to_reg()), + cmp2, + )); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::imm(1), + cmp2, + )); + IntCC::NotEqual + } + IntCC::SignedLessThan + | IntCC::SignedLessThanOrEqual + | IntCC::SignedGreaterThan + | IntCC::SignedGreaterThanOrEqual + | IntCC::UnsignedLessThan + | IntCC::UnsignedLessThanOrEqual + | IntCC::UnsignedGreaterThan + | IntCC::UnsignedGreaterThanOrEqual => { + // Result = (lhs_hi <> rhs_hi) || + // (lhs_hi == rhs_hi && lhs_lo <> rhs_lo) + let cmp3 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + ctx.emit(Inst::cmp_rmi_r(8, rhs_hi, lhs_hi)); + ctx.emit(Inst::setcc(CC::from_intcc(cc.without_equal()), cmp1)); + ctx.emit(Inst::setcc(CC::Z, cmp2)); + ctx.emit(Inst::cmp_rmi_r(8, rhs_lo, lhs_lo)); + ctx.emit(Inst::setcc(CC::from_intcc(cc.unsigned()), cmp3)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::reg(cmp2.to_reg()), + cmp3, + )); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Or, + RegMemImm::reg(cmp1.to_reg()), + cmp3, + )); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::imm(1), + cmp3, + )); + IntCC::NotEqual + } + _ => panic!("Unhandled IntCC in I128 comparison: {:?}", cc), + } + } else { + // TODO Try to commute the operands (and invert the condition) if one is an immediate. + let lhs = put_input_in_reg(ctx, inputs[0]); + // We force the RHS into a register, and disallow load-op fusion, because we + // do not have a transitive guarantee that this cmp-site will be the sole + // user of the value. Consider: the icmp might be the only user of a load, + // but there may be multiple users of the icmp (e.g. select or bint + // instructions) that each invoke `emit_cmp()`. If we were to allow a load + // to sink to the *latest* one, but other sites did not permit sinking, then + // we would be missing the load for other cmp-sites. + let rhs = put_input_in_reg(ctx, inputs[1]); - // Cranelift's icmp semantics want to compare lhs - rhs, while Intel gives - // us dst - src at the machine instruction level, so invert operands. - ctx.emit(Inst::cmp_rmi_r(ty.bytes() as u8, RegMemImm::reg(rhs), lhs)); + // Cranelift's icmp semantics want to compare lhs - rhs, while Intel gives + // us dst - src at the machine instruction level, so invert operands. + ctx.emit(Inst::cmp_rmi_r(ty.bytes() as u8, RegMemImm::reg(rhs), lhs)); + cc + } } /// A specification for a fcmp emission. 
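// Reviewer sketch (not part of the patch): the boolean the I128 branch of
// `emit_cmp` above materializes, written in plain Rust. The low halves are
// always compared unsigned (`cc.unsigned()`), only the high-half comparison
// keeps the original signedness, and equality is the AND of the two per-half
// equalities. The lowering then returns `IntCC::NotEqual` so callers simply
// test the materialized 0/1 result.
fn i128_eq(lhs: (u64, i64), rhs: (u64, i64)) -> bool {
    lhs.1 == rhs.1 && lhs.0 == rhs.0
}

fn i128_slt(lhs: (u64, i64), rhs: (u64, i64)) -> bool {
    // (lhs_hi < rhs_hi) || (lhs_hi == rhs_hi && lhs_lo <u rhs_lo)
    lhs.1 < rhs.1 || (lhs.1 == rhs.1 && lhs.0 < rhs.0)
}

fn main() {
    // -1 (hi = -1, lo = u64::MAX) is less than 0.
    assert!(i128_slt((u64::MAX, -1), (0, 0)));
    // Same high half: the low halves decide, compared as unsigned.
    assert!(i128_slt((1, 5), (2, 5)));
    assert!(i128_eq((7, -3), (7, -3)));
    // Cross-check against native i128 arithmetic.
    let to_i128 = |(lo, hi): (u64, i64)| ((hi as i128) << 64) | lo as i128;
    assert_eq!(
        i128_slt((3, -4), (u64::MAX, -4)),
        to_i128((3, -4)) < to_i128((u64::MAX, -4))
    );
}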
@@ -489,6 +588,458 @@ fn emit_fcmp>( cond_result } +fn emit_bitrev>(ctx: &mut C, src: Reg, dst: Writable, ty: Type) { + let bits = ty.bits(); + let const_mask = if bits == 64 { + 0xffff_ffff_ffff_ffff + } else { + (1u64 << bits) - 1 + }; + let tmp0 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + + ctx.emit(Inst::gen_move(tmp0, src, types::I64)); + + // Swap 1-bit units. + // tmp1 = src + ctx.emit(Inst::gen_move(tmp1, tmp0.to_reg(), types::I64)); + // tmp2 = 0b0101.. + ctx.emit(Inst::imm( + OperandSize::Size64, + 0x5555_5555_5555_5555 & const_mask, + tmp2, + )); + // tmp1 = src >> 1 + ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightLogical, + Some(1), + tmp1, + )); + // tmp1 = (src >> 1) & 0b0101.. + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::reg(tmp2.to_reg()), + tmp1, + )); + // tmp2 = src & 0b0101.. + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::reg(tmp0.to_reg()), + tmp2, + )); + // tmp2 = (src & 0b0101..) << 1 + ctx.emit(Inst::shift_r(8, ShiftKind::ShiftLeft, Some(1), tmp2)); + // tmp0 = (src >> 1) & 0b0101.. | (src & 0b0101..) << 1 + ctx.emit(Inst::gen_move(tmp0, tmp2.to_reg(), types::I64)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Or, + RegMemImm::reg(tmp1.to_reg()), + tmp0, + )); + + // Swap 2-bit units. + ctx.emit(Inst::gen_move(tmp1, tmp0.to_reg(), types::I64)); + ctx.emit(Inst::imm( + OperandSize::Size64, + 0x3333_3333_3333_3333 & const_mask, + tmp2, + )); + ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightLogical, + Some(2), + tmp1, + )); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::reg(tmp2.to_reg()), + tmp1, + )); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::reg(tmp0.to_reg()), + tmp2, + )); + ctx.emit(Inst::shift_r(8, ShiftKind::ShiftLeft, Some(2), tmp2)); + ctx.emit(Inst::gen_move(tmp0, tmp2.to_reg(), types::I64)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Or, + RegMemImm::reg(tmp1.to_reg()), + tmp0, + )); + + // Swap 4-bit units. + ctx.emit(Inst::gen_move(tmp1, tmp0.to_reg(), types::I64)); + ctx.emit(Inst::imm( + OperandSize::Size64, + 0x0f0f_0f0f_0f0f_0f0f & const_mask, + tmp2, + )); + ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightLogical, + Some(4), + tmp1, + )); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::reg(tmp2.to_reg()), + tmp1, + )); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::reg(tmp0.to_reg()), + tmp2, + )); + ctx.emit(Inst::shift_r(8, ShiftKind::ShiftLeft, Some(4), tmp2)); + ctx.emit(Inst::gen_move(tmp0, tmp2.to_reg(), types::I64)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Or, + RegMemImm::reg(tmp1.to_reg()), + tmp0, + )); + + if bits > 8 { + // Swap 8-bit units. 
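// Reviewer sketch (not part of the patch): the mask-and-shift schedule
// `emit_bitrev` builds, run directly on a u64. Each round swaps adjacent groups
// of k bits (k = 1, 2, 4, 8, 16, 32); narrower types stop after the rounds they
// need and mask the constants with `(1 << bits) - 1`, which is what
// `const_mask` is for above.
fn bitrev64(mut x: u64) -> u64 {
    let rounds: [(u64, u32); 6] = [
        (0x5555_5555_5555_5555, 1),
        (0x3333_3333_3333_3333, 2),
        (0x0f0f_0f0f_0f0f_0f0f, 4),
        (0x00ff_00ff_00ff_00ff, 8),
        (0x0000_ffff_0000_ffff, 16),
        (0x0000_0000_ffff_ffff, 32),
    ];
    for (mask, shift) in rounds {
        // ((x >> k) & mask) picks the high member of each pair of k-bit groups,
        // ((x & mask) << k) moves the low member up; OR-ing swaps them. This is
        // the same tmp1/tmp2 dance as the emitted code.
        x = ((x >> shift) & mask) | ((x & mask) << shift);
    }
    x
}

fn main() {
    assert_eq!(bitrev64(1), 1u64 << 63);
    assert_eq!(bitrev64(0x8000_0000_0000_0000), 1);
    assert_eq!(bitrev64(0xF0F0_0000_0000_0000), 0x0000_0000_0000_0F0F);
    // Cross-check against the standard library.
    assert_eq!(bitrev64(0x0123_4567_89ab_cdef), 0x0123_4567_89ab_cdef_u64.reverse_bits());
}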
+ ctx.emit(Inst::gen_move(tmp1, tmp0.to_reg(), types::I64)); + ctx.emit(Inst::imm( + OperandSize::Size64, + 0x00ff_00ff_00ff_00ff & const_mask, + tmp2, + )); + ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightLogical, + Some(8), + tmp1, + )); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::reg(tmp2.to_reg()), + tmp1, + )); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::reg(tmp0.to_reg()), + tmp2, + )); + ctx.emit(Inst::shift_r(8, ShiftKind::ShiftLeft, Some(8), tmp2)); + ctx.emit(Inst::gen_move(tmp0, tmp2.to_reg(), types::I64)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Or, + RegMemImm::reg(tmp1.to_reg()), + tmp0, + )); + } + + if bits > 16 { + // Swap 16-bit units. + ctx.emit(Inst::gen_move(tmp1, tmp0.to_reg(), types::I64)); + ctx.emit(Inst::imm( + OperandSize::Size64, + 0x0000_ffff_0000_ffff & const_mask, + tmp2, + )); + ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightLogical, + Some(16), + tmp1, + )); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::reg(tmp2.to_reg()), + tmp1, + )); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::reg(tmp0.to_reg()), + tmp2, + )); + ctx.emit(Inst::shift_r(8, ShiftKind::ShiftLeft, Some(16), tmp2)); + ctx.emit(Inst::gen_move(tmp0, tmp2.to_reg(), types::I64)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Or, + RegMemImm::reg(tmp1.to_reg()), + tmp0, + )); + } + + if bits > 32 { + // Swap 32-bit units. + ctx.emit(Inst::gen_move(tmp1, tmp0.to_reg(), types::I64)); + ctx.emit(Inst::imm( + OperandSize::Size64, + 0x0000_0000_ffff_ffff & const_mask, + tmp2, + )); + ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightLogical, + Some(32), + tmp1, + )); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::reg(tmp2.to_reg()), + tmp1, + )); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::reg(tmp0.to_reg()), + tmp2, + )); + ctx.emit(Inst::shift_r(8, ShiftKind::ShiftLeft, Some(32), tmp2)); + ctx.emit(Inst::gen_move(tmp0, tmp2.to_reg(), types::I64)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Or, + RegMemImm::reg(tmp1.to_reg()), + tmp0, + )); + } + + ctx.emit(Inst::gen_move(dst, tmp0.to_reg(), types::I64)); +} + +fn emit_shl_i128>( + ctx: &mut C, + src: ValueRegs, + dst: ValueRegs>, + amt_src: Reg, +) { + let src_lo = src.regs()[0]; + let src_hi = src.regs()[1]; + let dst_lo = dst.regs()[0]; + let dst_hi = dst.regs()[1]; + + // mov tmp1, src_lo + // shl tmp1, amt_src + // mov tmp2, src_hi + // shl tmp2, amt_src + // mov amt, 64 + // sub amt, amt_src + // mov tmp3, src_lo + // shr tmp3, amt + // or tmp3, tmp2 + // xor dst_lo, dst_lo + // mov amt, amt_src + // and amt, 64 + // cmovz dst_hi, tmp3 + // cmovz dst_lo, tmp1 + // cmovnz dst_hi, tmp1 + + let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let tmp3 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let amt = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + + ctx.emit(Inst::gen_move(tmp1, src_lo, types::I64)); + ctx.emit(Inst::gen_move( + Writable::from_reg(regs::rcx()), + amt_src, + types::I64, + )); + ctx.emit(Inst::shift_r(8, ShiftKind::ShiftLeft, None, tmp1)); + + ctx.emit(Inst::gen_move(tmp2, src_hi, types::I64)); + ctx.emit(Inst::gen_move( + Writable::from_reg(regs::rcx()), + amt_src, + types::I64, + )); + ctx.emit(Inst::shift_r(8, ShiftKind::ShiftLeft, None, tmp2)); + + ctx.emit(Inst::imm(OperandSize::Size64, 64, amt)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Sub, + 
RegMemImm::reg(amt_src), + amt, + )); + + ctx.emit(Inst::gen_move(tmp3, src_lo, types::I64)); + ctx.emit(Inst::gen_move( + Writable::from_reg(regs::rcx()), + amt.to_reg(), + types::I64, + )); + ctx.emit(Inst::shift_r(8, ShiftKind::ShiftRightLogical, None, tmp3)); + + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Or, + RegMemImm::reg(tmp2.to_reg()), + tmp3, + )); + + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Xor, + RegMemImm::reg(dst_lo.to_reg()), + dst_lo, + )); + // This isn't semantically necessary, but it keeps the + // register allocator happy, because it cannot otherwise + // infer that cmovz + cmovnz always defines dst_hi. + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Xor, + RegMemImm::reg(dst_hi.to_reg()), + dst_hi, + )); + + ctx.emit(Inst::gen_move(amt, amt_src, types::I64)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::imm(64), + amt, + )); + ctx.emit(Inst::cmove(8, CC::Z, RegMem::reg(tmp3.to_reg()), dst_hi)); + ctx.emit(Inst::cmove(8, CC::Z, RegMem::reg(tmp1.to_reg()), dst_lo)); + ctx.emit(Inst::cmove(8, CC::NZ, RegMem::reg(tmp1.to_reg()), dst_hi)); +} + +fn emit_shr_i128>( + ctx: &mut C, + src: ValueRegs, + dst: ValueRegs>, + amt_src: Reg, + is_signed: bool, +) { + let src_lo = src.regs()[0]; + let src_hi = src.regs()[1]; + let dst_lo = dst.regs()[0]; + let dst_hi = dst.regs()[1]; + + // mov tmp1, src_hi + // {u,s}shr tmp1, amt_src + // mov tmp2, src_lo + // {u,s}shr tmp2, amt_src + // mov amt, 64 + // sub amt, amt_src + // mov tmp3, src_hi + // shl tmp3, amt + // or tmp3, tmp2 + // if is_signed: + // mov dst_hi, src_hi + // sshr dst_hi, 63 // get the sign bit + // else: + // xor dst_hi, dst_hi + // mov amt, amt_src + // and amt, 64 + // cmovz dst_hi, tmp1 + // cmovz dst_lo, tmp3 + // cmovnz dst_lo, tmp1 + + let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let tmp3 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let amt = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + + let shift_kind = if is_signed { + ShiftKind::ShiftRightArithmetic + } else { + ShiftKind::ShiftRightLogical + }; + + ctx.emit(Inst::gen_move(tmp1, src_hi, types::I64)); + ctx.emit(Inst::gen_move( + Writable::from_reg(regs::rcx()), + amt_src, + types::I64, + )); + ctx.emit(Inst::shift_r(8, shift_kind, None, tmp1)); + + ctx.emit(Inst::gen_move(tmp2, src_lo, types::I64)); + ctx.emit(Inst::gen_move( + Writable::from_reg(regs::rcx()), + amt_src, + types::I64, + )); + ctx.emit(Inst::shift_r(8, shift_kind, None, tmp2)); + + ctx.emit(Inst::imm(OperandSize::Size64, 64, amt)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Sub, + RegMemImm::reg(amt_src), + amt, + )); + + ctx.emit(Inst::gen_move(tmp3, src_hi, types::I64)); + ctx.emit(Inst::gen_move( + Writable::from_reg(regs::rcx()), + amt.to_reg(), + types::I64, + )); + ctx.emit(Inst::shift_r(8, ShiftKind::ShiftLeft, None, tmp3)); + + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Or, + RegMemImm::reg(tmp2.to_reg()), + tmp3, + )); + + if is_signed { + ctx.emit(Inst::gen_move(dst_hi, src_hi, types::I64)); + ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightArithmetic, + Some(63), + dst_hi, + )); + } else { + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Xor, + RegMemImm::reg(dst_hi.to_reg()), + dst_hi, + )); + } + // This isn't semantically necessary, but it keeps the + // register allocator happy, because it cannot otherwise + // infer that cmovz + cmovnz always defines dst_lo. 
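// Reviewer sketch (not part of the patch): the value the shl/shr_i128 sequences
// are aiming for, composed from 64-bit halves the same way the emitted code is
// structured: compute the "< 64" result (shift plus carry bits from the other
// half) and the ">= 64" result, then select on bit 6 of the amount, which is
// what the `and amt, 64` + cmovz/cmovnz pair does. The rotl/rotr lowering later
// in this file is then just the OR of a left shift and a right shift by
// 128 - amt.
fn shl128(lo: u64, hi: u64, amt: u32) -> (u64, u64) {
    let amt = amt & 127;
    if amt == 0 {
        (lo, hi)
    } else if amt < 64 {
        (lo << amt, (hi << amt) | (lo >> (64 - amt)))
    } else {
        (0, lo << (amt - 64))
    }
}

fn ushr128(lo: u64, hi: u64, amt: u32) -> (u64, u64) {
    let amt = amt & 127;
    if amt == 0 {
        (lo, hi)
    } else if amt < 64 {
        ((lo >> amt) | (hi << (64 - amt)), hi >> amt)
    } else {
        (hi >> (amt - 64), 0)
    }
}

fn main() {
    // Cross-check against native u128 shifts for a few amounts.
    let x: u128 = 0x0123_4567_89ab_cdef_0011_2233_4455_6677;
    let (lo, hi) = (x as u64, (x >> 64) as u64);
    for amt in [1u32, 7, 63, 64, 65, 100, 127] {
        let (slo, shi) = shl128(lo, hi, amt);
        assert_eq!(((shi as u128) << 64) | slo as u128, x << amt);
        let (rlo, rhi) = ushr128(lo, hi, amt);
        assert_eq!(((rhi as u128) << 64) | rlo as u128, x >> amt);
    }
}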
+ ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Xor, + RegMemImm::reg(dst_lo.to_reg()), + dst_lo, + )); + + ctx.emit(Inst::gen_move(amt, amt_src, types::I64)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::imm(64), + amt, + )); + ctx.emit(Inst::cmove(8, CC::Z, RegMem::reg(tmp1.to_reg()), dst_hi)); + ctx.emit(Inst::cmove(8, CC::Z, RegMem::reg(tmp3.to_reg()), dst_lo)); + ctx.emit(Inst::cmove(8, CC::NZ, RegMem::reg(tmp1.to_reg()), dst_lo)); +} + fn make_libcall_sig>( ctx: &mut C, insn: IRInst, @@ -676,6 +1227,101 @@ fn lower_to_amode>(ctx: &mut C, spec: InsnInput, offset: i Amode::imm_reg(offset as u32, input).with_flags(flags) } +fn emit_moves>( + ctx: &mut C, + dst: ValueRegs>, + src: ValueRegs, + ty: Type, +) { + let (_, tys) = Inst::rc_for_type(ty).unwrap(); + for ((dst, src), ty) in dst.regs().iter().zip(src.regs().iter()).zip(tys.iter()) { + ctx.emit(Inst::gen_move(*dst, *src, *ty)); + } +} + +fn emit_cmoves>( + ctx: &mut C, + size: u8, + cc: CC, + src: ValueRegs, + dst: ValueRegs>, +) { + let size = size / src.len() as u8; + let size = u8::max(size, 4); // at least 32 bits + for (dst, src) in dst.regs().iter().zip(src.regs().iter()) { + ctx.emit(Inst::cmove(size, cc, RegMem::reg(*src), *dst)); + } +} + +fn emit_clz>( + ctx: &mut C, + orig_ty: Type, + ty: Type, + src: Reg, + dst: Writable, +) { + let src = RegMem::reg(src); + let tmp = ctx.alloc_tmp(ty).only_reg().unwrap(); + ctx.emit(Inst::imm( + OperandSize::from_bytes(ty.bytes()), + u64::max_value(), + dst, + )); + + ctx.emit(Inst::unary_rm_r( + ty.bytes() as u8, + UnaryRmROpcode::Bsr, + src, + tmp, + )); + + ctx.emit(Inst::cmove( + ty.bytes() as u8, + CC::Z, + RegMem::reg(dst.to_reg()), + tmp, + )); + + ctx.emit(Inst::imm( + OperandSize::from_bytes(ty.bytes()), + orig_ty.bits() as u64 - 1, + dst, + )); + + ctx.emit(Inst::alu_rmi_r( + ty == types::I64, + AluRmiROpcode::Sub, + RegMemImm::reg(tmp.to_reg()), + dst, + )); +} + +fn emit_ctz>( + ctx: &mut C, + orig_ty: Type, + ty: Type, + src: Reg, + dst: Writable, +) { + let src = RegMem::reg(src); + let tmp = ctx.alloc_tmp(ty).only_reg().unwrap(); + ctx.emit(Inst::imm(OperandSize::Size32, orig_ty.bits() as u64, tmp)); + + ctx.emit(Inst::unary_rm_r( + ty.bytes() as u8, + UnaryRmROpcode::Bsf, + src, + dst, + )); + + ctx.emit(Inst::cmove( + ty.bytes() as u8, + CC::Z, + RegMem::reg(tmp.to_reg()), + dst, + )); +} + //============================================================================= // Top-level instruction lowering entry point, for one instruction. @@ -898,6 +1544,102 @@ fn lower_insn_to_regs>( // Move the `lhs` to the same register as `dst`. 
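// Reviewer sketch (not part of the patch): what `emit_clz`/`emit_ctz` above
// compute. BSR/BSF return the index of the highest/lowest set bit and leave ZF
// set for a zero input, so the cmov supplies the "all bits zero" answer; the
// 128-bit versions later in this file combine two 64-bit results ("use the high
// half's count unless that half was all zero, in which case add 64 to the low
// half's count", and symmetrically for ctz).
fn clz_via_bsr(bits: u32, src: u64) -> u64 {
    // bsr(src) = index of the most significant set bit; -1 stands in for the
    // "input was zero" cmov path so that (bits - 1) - (-1) == bits.
    let bsr: i64 = if src == 0 { -1 } else { 63 - src.leading_zeros() as i64 };
    (bits as i64 - 1 - bsr) as u64
}

fn ctz_via_bsf(bits: u32, src: u64) -> u64 {
    if src == 0 { bits as u64 } else { src.trailing_zeros() as u64 }
}

fn clz128(lo: u64, hi: u64) -> u64 {
    let hi_clz = clz_via_bsr(64, hi);
    if hi_clz != 64 { hi_clz } else { 64 + clz_via_bsr(64, lo) }
}

fn main() {
    assert_eq!(clz_via_bsr(64, 1), 63);
    assert_eq!(clz_via_bsr(32, 0), 32);
    assert_eq!(ctz_via_bsf(64, 0b1000), 3);
    assert_eq!(clz128(0xffff, 0), 64 + 48);
    assert_eq!(clz128(0, 1u64 << 63), 0);
}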
ctx.emit(Inst::gen_move(dst, lhs, ty)); ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst)); + } else if ty == types::I128 || ty == types::B128 { + let alu_ops = match op { + Opcode::Iadd => (AluRmiROpcode::Add, AluRmiROpcode::Adc), + Opcode::Isub => (AluRmiROpcode::Sub, AluRmiROpcode::Sbb), + // multiply handled specially below + Opcode::Imul => (AluRmiROpcode::Mul, AluRmiROpcode::Mul), + Opcode::Band => (AluRmiROpcode::And, AluRmiROpcode::And), + Opcode::Bor => (AluRmiROpcode::Or, AluRmiROpcode::Or), + Opcode::Bxor => (AluRmiROpcode::Xor, AluRmiROpcode::Xor), + _ => panic!("Unsupported opcode with 128-bit integers: {:?}", op), + }; + let lhs = put_input_in_regs(ctx, inputs[0]); + let rhs = put_input_in_regs(ctx, inputs[1]); + let dst = get_output_reg(ctx, outputs[0]); + assert_eq!(lhs.len(), 2); + assert_eq!(rhs.len(), 2); + assert_eq!(dst.len(), 2); + + if op != Opcode::Imul { + // add, sub, and, or, xor: just do ops on lower then upper half. Carry-flag + // propagation is implicit (add/adc, sub/sbb). + ctx.emit(Inst::gen_move(dst.regs()[0], lhs.regs()[0], types::I64)); + ctx.emit(Inst::gen_move(dst.regs()[1], lhs.regs()[1], types::I64)); + ctx.emit(Inst::alu_rmi_r( + /* is_64 = */ true, + alu_ops.0, + RegMemImm::reg(rhs.regs()[0]), + dst.regs()[0], + )); + ctx.emit(Inst::alu_rmi_r( + /* is_64 = */ true, + alu_ops.1, + RegMemImm::reg(rhs.regs()[1]), + dst.regs()[1], + )); + } else { + // mul: + // dst_lo = lhs_lo * rhs_lo + // dst_hi = umulhi(lhs_lo, rhs_lo) + lhs_lo * rhs_hi + lhs_hi * rhs_lo + // + // so we emit: + // mov dst_lo, lhs_lo + // mul dst_lo, rhs_lo + // mov dst_hi, lhs_lo + // mul dst_hi, rhs_hi + // mov tmp, lhs_hi + // mul tmp, rhs_lo + // add dst_hi, tmp + // mov rax, lhs_lo + // umulhi rhs_lo // implicit rax arg/dst + // add dst_hi, rax + let tmp = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + ctx.emit(Inst::gen_move(dst.regs()[0], lhs.regs()[0], types::I64)); + ctx.emit(Inst::alu_rmi_r( + /* is_64 = */ true, + AluRmiROpcode::Mul, + RegMemImm::reg(rhs.regs()[0]), + dst.regs()[0], + )); + ctx.emit(Inst::gen_move(dst.regs()[1], lhs.regs()[0], types::I64)); + ctx.emit(Inst::alu_rmi_r( + /* is_64 = */ true, + AluRmiROpcode::Mul, + RegMemImm::reg(rhs.regs()[1]), + dst.regs()[1], + )); + ctx.emit(Inst::gen_move(tmp, lhs.regs()[1], types::I64)); + ctx.emit(Inst::alu_rmi_r( + /* is_64 = */ true, + AluRmiROpcode::Mul, + RegMemImm::reg(rhs.regs()[0]), + tmp, + )); + ctx.emit(Inst::alu_rmi_r( + /* is_64 = */ true, + AluRmiROpcode::Add, + RegMemImm::reg(tmp.to_reg()), + dst.regs()[1], + )); + ctx.emit(Inst::gen_move( + Writable::from_reg(regs::rax()), + lhs.regs()[0], + types::I64, + )); + ctx.emit(Inst::mul_hi( + /* size = */ 8, + /* signed = */ false, + RegMem::reg(rhs.regs()[0]), + )); + ctx.emit(Inst::alu_rmi_r( + /* is_64 = */ true, + AluRmiROpcode::Add, + RegMemImm::reg(regs::rdx()), + dst.regs()[1], + )); + } } else { let is_64 = ty == types::I64; let alu_op = match op { @@ -1022,17 +1764,27 @@ fn lower_insn_to_regs>( Opcode::Bnot => { let ty = ty.unwrap(); let size = ty.bytes() as u8; - let src = put_input_in_reg(ctx, inputs[0]); - let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - ctx.emit(Inst::gen_move(dst, src, ty)); if ty.is_vector() { + let src = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + ctx.emit(Inst::gen_move(dst, src, ty)); let tmp = ctx.alloc_tmp(ty).only_reg().unwrap(); ctx.emit(Inst::equals(ty, RegMem::from(tmp), tmp)); ctx.emit(Inst::xor(ty, RegMem::from(tmp), dst)); + } else if ty == 
types::I128 || ty == types::B128 { + let src = put_input_in_regs(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::gen_move(dst.regs()[0], src.regs()[0], types::I64)); + ctx.emit(Inst::not(8, dst.regs()[0])); + ctx.emit(Inst::gen_move(dst.regs()[1], src.regs()[1], types::I64)); + ctx.emit(Inst::not(8, dst.regs()[1])); } else if ty.is_bool() { unimplemented!("bool bnot") } else { + let src = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + ctx.emit(Inst::gen_move(dst, src, ty)); ctx.emit(Inst::not(size, dst)); } } @@ -1064,7 +1816,7 @@ fn lower_insn_to_regs>( let dst_ty = ctx.output_ty(insn, 0); debug_assert_eq!(ctx.input_ty(insn, 0), dst_ty); - if !dst_ty.is_vector() { + if !dst_ty.is_vector() && dst_ty.bits() <= 64 { // Scalar shifts on x86 have various encodings: // - shift by one bit, e.g. `SAL r/m8, 1` (not used here) // - shift by an immediate amount, e.g. `SAL r/m8, imm8` @@ -1118,6 +1870,89 @@ fn lower_insn_to_regs>( ctx.emit(Inst::mov_r_r(true, rhs.unwrap(), w_rcx)); } ctx.emit(Inst::shift_r(size, shift_kind, count, dst)); + } else if dst_ty == types::I128 { + let amt_src = put_input_in_reg(ctx, inputs[1]); + let src = put_input_in_regs(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + + match op { + Opcode::Ishl => { + emit_shl_i128(ctx, src, dst, amt_src); + } + Opcode::Ushr => { + emit_shr_i128(ctx, src, dst, amt_src, /* is_signed = */ false); + } + Opcode::Sshr => { + emit_shr_i128(ctx, src, dst, amt_src, /* is_signed = */ true); + } + Opcode::Rotl => { + // (mov tmp, src) + // (shl.i128 tmp, amt) + // (mov dst, src) + // (ushr.i128 dst, 128-amt) + // (or dst, tmp) + let tmp = ctx.alloc_tmp(types::I128); + emit_shl_i128(ctx, src, tmp, amt_src); + let inv_amt = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + ctx.emit(Inst::imm(OperandSize::Size64, 128, inv_amt)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Sub, + RegMemImm::reg(amt_src), + inv_amt, + )); + emit_shr_i128( + ctx, + src, + dst, + inv_amt.to_reg(), + /* is_signed = */ false, + ); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Or, + RegMemImm::reg(tmp.regs()[0].to_reg()), + dst.regs()[0], + )); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Or, + RegMemImm::reg(tmp.regs()[1].to_reg()), + dst.regs()[1], + )); + } + Opcode::Rotr => { + // (mov tmp, src) + // (ushr.i128 tmp, amt) + // (mov dst, src) + // (shl.i128 dst, 128-amt) + // (or dst, tmp) + let tmp = ctx.alloc_tmp(types::I128); + emit_shr_i128(ctx, src, tmp, amt_src, /* is_signed = */ false); + let inv_amt = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + ctx.emit(Inst::imm(OperandSize::Size64, 128, inv_amt)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Sub, + RegMemImm::reg(amt_src), + inv_amt, + )); + emit_shl_i128(ctx, src, dst, inv_amt.to_reg()); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Or, + RegMemImm::reg(tmp.regs()[0].to_reg()), + dst.regs()[0], + )); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Or, + RegMemImm::reg(tmp.regs()[1].to_reg()), + dst.regs()[1], + )); + } + _ => unreachable!(), + } } else if dst_ty == types::I8X16 && (op == Opcode::Ishl || op == Opcode::Ushr) { // Since the x86 instruction set does not have any 8x16 shift instructions (even in higher feature sets // like AVX), we lower the `ishl.i8x16` and `ushr.i8x16` to a sequence of instructions. 
The basic idea, @@ -1449,52 +2284,50 @@ fn lower_insn_to_regs>( // mov $(size_bits - 1), %dst // sub %tmp, %dst - let (ext_spec, ty) = match ctx.input_ty(insn, 0) { - types::I8 | types::I16 => (Some(ExtSpec::ZeroExtendTo32), types::I32), - a if a == types::I32 || a == types::I64 => (None, a), - _ => unreachable!(), - }; - - let src = if let Some(ext_spec) = ext_spec { - RegMem::reg(extend_input_to_reg(ctx, inputs[0], ext_spec)) + let orig_ty = ty.unwrap(); + if orig_ty == types::I128 { + // clz upper, tmp1 + // clz lower, dst + // add dst, 64 + // cmp tmp1, 64 + // cmovnz tmp1, dst + let dsts = get_output_reg(ctx, outputs[0]); + let dst = dsts.regs()[0]; + let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let srcs = put_input_in_regs(ctx, inputs[0]); + let src_lo = srcs.regs()[0]; + let src_hi = srcs.regs()[1]; + emit_clz(ctx, types::I64, types::I64, src_hi, tmp1); + emit_clz(ctx, types::I64, types::I64, src_lo, dst); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Add, + RegMemImm::imm(64), + dst, + )); + ctx.emit(Inst::cmp_rmi_r(8, RegMemImm::imm(64), tmp1.to_reg())); + ctx.emit(Inst::cmove(8, CC::NZ, RegMem::reg(tmp1.to_reg()), dst)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Xor, + RegMemImm::reg(dsts.regs()[1].to_reg()), + dsts.regs()[1], + )); } else { - input_to_reg_mem(ctx, inputs[0]) - }; - let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + let (ext_spec, ty) = match orig_ty { + types::I8 | types::I16 => (Some(ExtSpec::ZeroExtendTo32), types::I32), + a if a == types::I32 || a == types::I64 => (None, a), + _ => unreachable!(), + }; + let src = if let Some(ext_spec) = ext_spec { + extend_input_to_reg(ctx, inputs[0], ext_spec) + } else { + put_input_in_reg(ctx, inputs[0]) + }; - let tmp = ctx.alloc_tmp(ty).only_reg().unwrap(); - ctx.emit(Inst::imm( - OperandSize::from_bytes(ty.bytes()), - u64::max_value(), - dst, - )); - - ctx.emit(Inst::unary_rm_r( - ty.bytes() as u8, - UnaryRmROpcode::Bsr, - src, - tmp, - )); - - ctx.emit(Inst::cmove( - ty.bytes() as u8, - CC::Z, - RegMem::reg(dst.to_reg()), - tmp, - )); - - ctx.emit(Inst::imm( - OperandSize::from_bytes(ty.bytes()), - ty.bits() as u64 - 1, - dst, - )); - - ctx.emit(Inst::alu_rmi_r( - ty == types::I64, - AluRmiROpcode::Sub, - RegMemImm::reg(tmp.to_reg()), - dst, - )); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + emit_clz(ctx, orig_ty, ty, src, dst); + } } Opcode::Ctz => { @@ -1504,29 +2337,47 @@ fn lower_insn_to_regs>( // bsf %src, %dst // mov $(size_bits), %tmp // cmovz %tmp, %dst - let ty = ctx.input_ty(insn, 0); - let ty = if ty.bits() < 32 { types::I32 } else { ty }; - debug_assert!(ty == types::I32 || ty == types::I64); + let orig_ty = ctx.input_ty(insn, 0); + if orig_ty == types::I128 { + // ctz src_lo, dst + // ctz src_hi, tmp1 + // add tmp1, 64 + // cmp dst, 64 + // cmovz tmp1, dst + let dsts = get_output_reg(ctx, outputs[0]); + let dst = dsts.regs()[0]; + let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let srcs = put_input_in_regs(ctx, inputs[0]); + let src_lo = srcs.regs()[0]; + let src_hi = srcs.regs()[1]; + emit_ctz(ctx, types::I64, types::I64, src_lo, dst); + emit_ctz(ctx, types::I64, types::I64, src_hi, tmp1); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Add, + RegMemImm::imm(64), + tmp1, + )); + ctx.emit(Inst::cmp_rmi_r(8, RegMemImm::imm(64), dst.to_reg())); + ctx.emit(Inst::cmove(8, CC::Z, RegMem::reg(tmp1.to_reg()), dst)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Xor, + RegMemImm::reg(dsts.regs()[1].to_reg()), + 
dsts.regs()[1], + )); + } else { + let ty = if orig_ty.bits() < 32 { + types::I32 + } else { + orig_ty + }; + debug_assert!(ty == types::I32 || ty == types::I64); - let src = input_to_reg_mem(ctx, inputs[0]); - let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - - let tmp = ctx.alloc_tmp(ty).only_reg().unwrap(); - ctx.emit(Inst::imm(OperandSize::Size32, ty.bits() as u64, tmp)); - - ctx.emit(Inst::unary_rm_r( - ty.bytes() as u8, - UnaryRmROpcode::Bsf, - src, - dst, - )); - - ctx.emit(Inst::cmove( - ty.bytes() as u8, - CC::Z, - RegMem::reg(tmp.to_reg()), - dst, - )); + let src = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + emit_ctz(ctx, orig_ty, ty, src, dst); + } } Opcode::Popcnt => { @@ -1535,272 +2386,329 @@ fn lower_insn_to_regs>( let (ext_spec, ty) = match ctx.input_ty(insn, 0) { types::I8 | types::I16 => (Some(ExtSpec::ZeroExtendTo32), types::I32), a if a == types::I32 || a == types::I64 => (None, a), + types::I128 => (None, types::I128), _ => unreachable!(), }; - let src = if let Some(ext_spec) = ext_spec { - RegMem::reg(extend_input_to_reg(ctx, inputs[0], ext_spec)) + let (srcs, ty): (SmallVec<[RegMem; 2]>, Type) = if let Some(ext_spec) = ext_spec { + ( + smallvec![RegMem::reg(extend_input_to_reg(ctx, inputs[0], ext_spec))], + ty, + ) + } else if ty == types::I128 { + let regs = put_input_in_regs(ctx, inputs[0]); + ( + smallvec![RegMem::reg(regs.regs()[0]), RegMem::reg(regs.regs()[1])], + types::I64, + ) } else { // N.B.: explicitly put input in a reg here because the width of the instruction // into which this RM op goes may not match the width of the input type (in fact, // it won't for i32.popcnt), and we don't want a larger than necessary load. - RegMem::reg(put_input_in_reg(ctx, inputs[0])) + (smallvec![RegMem::reg(put_input_in_reg(ctx, inputs[0]))], ty) }; - let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - if ty == types::I64 { - let is_64 = true; + let mut dsts: SmallVec<[Reg; 2]> = smallvec![]; + for src in srcs { + let dst = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + dsts.push(dst.to_reg()); + if ty == types::I64 { + let is_64 = true; - let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); - let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); - let cst = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let cst = ctx.alloc_tmp(types::I64).only_reg().unwrap(); - // mov src, tmp1 - ctx.emit(Inst::mov64_rm_r(src.clone(), tmp1)); + // mov src, tmp1 + ctx.emit(Inst::mov64_rm_r(src.clone(), tmp1)); - // shr $1, tmp1 - ctx.emit(Inst::shift_r( - 8, - ShiftKind::ShiftRightLogical, - Some(1), - tmp1, - )); + // shr $1, tmp1 + ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightLogical, + Some(1), + tmp1, + )); - // mov 0x7777_7777_7777_7777, cst - ctx.emit(Inst::imm(OperandSize::Size64, 0x7777777777777777, cst)); + // mov 0x7777_7777_7777_7777, cst + ctx.emit(Inst::imm(OperandSize::Size64, 0x7777777777777777, cst)); - // andq cst, tmp1 - ctx.emit(Inst::alu_rmi_r( - is_64, - AluRmiROpcode::And, - RegMemImm::reg(cst.to_reg()), - tmp1, - )); + // andq cst, tmp1 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::And, + RegMemImm::reg(cst.to_reg()), + tmp1, + )); - // mov src, tmp2 - ctx.emit(Inst::mov64_rm_r(src, tmp2)); + // mov src, tmp2 + ctx.emit(Inst::mov64_rm_r(src, tmp2)); - // sub tmp1, tmp2 - ctx.emit(Inst::alu_rmi_r( - is_64, - AluRmiROpcode::Sub, - 
RegMemImm::reg(tmp1.to_reg()), - tmp2, - )); + // sub tmp1, tmp2 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Sub, + RegMemImm::reg(tmp1.to_reg()), + tmp2, + )); - // shr $1, tmp1 - ctx.emit(Inst::shift_r( - 8, - ShiftKind::ShiftRightLogical, - Some(1), - tmp1, - )); + // shr $1, tmp1 + ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightLogical, + Some(1), + tmp1, + )); - // and cst, tmp1 - ctx.emit(Inst::alu_rmi_r( - is_64, - AluRmiROpcode::And, - RegMemImm::reg(cst.to_reg()), - tmp1, - )); + // and cst, tmp1 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::And, + RegMemImm::reg(cst.to_reg()), + tmp1, + )); - // sub tmp1, tmp2 - ctx.emit(Inst::alu_rmi_r( - is_64, - AluRmiROpcode::Sub, - RegMemImm::reg(tmp1.to_reg()), - tmp2, - )); + // sub tmp1, tmp2 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Sub, + RegMemImm::reg(tmp1.to_reg()), + tmp2, + )); - // shr $1, tmp1 - ctx.emit(Inst::shift_r( - 8, - ShiftKind::ShiftRightLogical, - Some(1), - tmp1, - )); + // shr $1, tmp1 + ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightLogical, + Some(1), + tmp1, + )); - // and cst, tmp1 - ctx.emit(Inst::alu_rmi_r( - is_64, - AluRmiROpcode::And, - RegMemImm::reg(cst.to_reg()), - tmp1, - )); + // and cst, tmp1 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::And, + RegMemImm::reg(cst.to_reg()), + tmp1, + )); - // sub tmp1, tmp2 - ctx.emit(Inst::alu_rmi_r( - is_64, - AluRmiROpcode::Sub, - RegMemImm::reg(tmp1.to_reg()), - tmp2, - )); + // sub tmp1, tmp2 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Sub, + RegMemImm::reg(tmp1.to_reg()), + tmp2, + )); - // mov tmp2, dst - ctx.emit(Inst::mov64_rm_r(RegMem::reg(tmp2.to_reg()), dst)); + // mov tmp2, dst + ctx.emit(Inst::mov64_rm_r(RegMem::reg(tmp2.to_reg()), dst)); - // shr $4, dst - ctx.emit(Inst::shift_r(8, ShiftKind::ShiftRightLogical, Some(4), dst)); + // shr $4, dst + ctx.emit(Inst::shift_r(8, ShiftKind::ShiftRightLogical, Some(4), dst)); - // add tmp2, dst - ctx.emit(Inst::alu_rmi_r( - is_64, - AluRmiROpcode::Add, - RegMemImm::reg(tmp2.to_reg()), - dst, - )); + // add tmp2, dst + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Add, + RegMemImm::reg(tmp2.to_reg()), + dst, + )); - // mov $0x0F0F_0F0F_0F0F_0F0F, cst - ctx.emit(Inst::imm(OperandSize::Size64, 0x0F0F0F0F0F0F0F0F, cst)); + // mov $0x0F0F_0F0F_0F0F_0F0F, cst + ctx.emit(Inst::imm(OperandSize::Size64, 0x0F0F0F0F0F0F0F0F, cst)); - // and cst, dst - ctx.emit(Inst::alu_rmi_r( - is_64, - AluRmiROpcode::And, - RegMemImm::reg(cst.to_reg()), - dst, - )); + // and cst, dst + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::And, + RegMemImm::reg(cst.to_reg()), + dst, + )); - // mov $0x0101_0101_0101_0101, cst - ctx.emit(Inst::imm(OperandSize::Size64, 0x0101010101010101, cst)); + // mov $0x0101_0101_0101_0101, cst + ctx.emit(Inst::imm(OperandSize::Size64, 0x0101010101010101, cst)); - // mul cst, dst - ctx.emit(Inst::alu_rmi_r( - is_64, - AluRmiROpcode::Mul, - RegMemImm::reg(cst.to_reg()), - dst, - )); + // mul cst, dst + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Mul, + RegMemImm::reg(cst.to_reg()), + dst, + )); - // shr $56, dst - ctx.emit(Inst::shift_r( - 8, - ShiftKind::ShiftRightLogical, - Some(56), - dst, - )); + // shr $56, dst + ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightLogical, + Some(56), + dst, + )); + } else { + assert_eq!(ty, types::I32); + let is_64 = false; + + let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + + // mov src, tmp1 + 
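// Reviewer sketch (not part of the patch): the branch-free popcount the
// sequence above implements (64-bit variant; the 32-bit path is the same with
// narrower constants and a final shift of 24, and the i128 case runs it once
// per 64-bit half before combining the two counts).
fn popcnt64(x: u64) -> u64 {
    const M: u64 = 0x7777_7777_7777_7777;
    // Per-nibble popcount: n - (n>>1) - (n>>2) - (n>>3), done with the repeated
    // "shift right by one and mask" steps, exactly like the tmp1/tmp2 dance.
    let mut t1 = (x >> 1) & M;
    let mut t2 = x.wrapping_sub(t1);
    t1 = (t1 >> 1) & M;
    t2 = t2.wrapping_sub(t1);
    t1 = (t1 >> 1) & M;
    t2 = t2.wrapping_sub(t1);
    // Fold nibble counts into byte counts, then sum all bytes via the
    // 0x0101...-multiply trick; the total lands in the top byte.
    let bytes = t2.wrapping_add(t2 >> 4) & 0x0f0f_0f0f_0f0f_0f0f;
    bytes.wrapping_mul(0x0101_0101_0101_0101) >> 56
}

fn main() {
    for x in [0u64, 1, u64::MAX, 0xdead_beef_cafe_f00d] {
        assert_eq!(popcnt64(x), x.count_ones() as u64);
    }
}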
ctx.emit(Inst::mov64_rm_r(src.clone(), tmp1)); + + // shr $1, tmp1 + ctx.emit(Inst::shift_r( + 4, + ShiftKind::ShiftRightLogical, + Some(1), + tmp1, + )); + + // andq $0x7777_7777, tmp1 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::And, + RegMemImm::imm(0x77777777), + tmp1, + )); + + // mov src, tmp2 + ctx.emit(Inst::mov64_rm_r(src, tmp2)); + + // sub tmp1, tmp2 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Sub, + RegMemImm::reg(tmp1.to_reg()), + tmp2, + )); + + // shr $1, tmp1 + ctx.emit(Inst::shift_r( + 4, + ShiftKind::ShiftRightLogical, + Some(1), + tmp1, + )); + + // and 0x7777_7777, tmp1 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::And, + RegMemImm::imm(0x77777777), + tmp1, + )); + + // sub tmp1, tmp2 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Sub, + RegMemImm::reg(tmp1.to_reg()), + tmp2, + )); + + // shr $1, tmp1 + ctx.emit(Inst::shift_r( + 4, + ShiftKind::ShiftRightLogical, + Some(1), + tmp1, + )); + + // and $0x7777_7777, tmp1 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::And, + RegMemImm::imm(0x77777777), + tmp1, + )); + + // sub tmp1, tmp2 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Sub, + RegMemImm::reg(tmp1.to_reg()), + tmp2, + )); + + // mov tmp2, dst + ctx.emit(Inst::mov64_rm_r(RegMem::reg(tmp2.to_reg()), dst)); + + // shr $4, dst + ctx.emit(Inst::shift_r(4, ShiftKind::ShiftRightLogical, Some(4), dst)); + + // add tmp2, dst + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Add, + RegMemImm::reg(tmp2.to_reg()), + dst, + )); + + // and $0x0F0F_0F0F, dst + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::And, + RegMemImm::imm(0x0F0F0F0F), + dst, + )); + + // mul $0x0101_0101, dst + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Mul, + RegMemImm::imm(0x01010101), + dst, + )); + + // shr $24, dst + ctx.emit(Inst::shift_r( + 4, + ShiftKind::ShiftRightLogical, + Some(24), + dst, + )); + } + } + + if dsts.len() == 1 { + let final_dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + ctx.emit(Inst::gen_move(final_dst, dsts[0], types::I64)); } else { - assert_eq!(ty, types::I32); - let is_64 = false; - - let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); - let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); - - // mov src, tmp1 - ctx.emit(Inst::mov64_rm_r(src.clone(), tmp1)); - - // shr $1, tmp1 - ctx.emit(Inst::shift_r( - 4, - ShiftKind::ShiftRightLogical, - Some(1), - tmp1, - )); - - // andq $0x7777_7777, tmp1 + assert!(dsts.len() == 2); + let final_dst = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::gen_move(final_dst.regs()[0], dsts[0], types::I64)); ctx.emit(Inst::alu_rmi_r( - is_64, - AluRmiROpcode::And, - RegMemImm::imm(0x77777777), - tmp1, - )); - - // mov src, tmp2 - ctx.emit(Inst::mov64_rm_r(src, tmp2)); - - // sub tmp1, tmp2 - ctx.emit(Inst::alu_rmi_r( - is_64, - AluRmiROpcode::Sub, - RegMemImm::reg(tmp1.to_reg()), - tmp2, - )); - - // shr $1, tmp1 - ctx.emit(Inst::shift_r( - 4, - ShiftKind::ShiftRightLogical, - Some(1), - tmp1, - )); - - // and 0x7777_7777, tmp1 - ctx.emit(Inst::alu_rmi_r( - is_64, - AluRmiROpcode::And, - RegMemImm::imm(0x77777777), - tmp1, - )); - - // sub tmp1, tmp2 - ctx.emit(Inst::alu_rmi_r( - is_64, - AluRmiROpcode::Sub, - RegMemImm::reg(tmp1.to_reg()), - tmp2, - )); - - // shr $1, tmp1 - ctx.emit(Inst::shift_r( - 4, - ShiftKind::ShiftRightLogical, - Some(1), - tmp1, - )); - - // and $0x7777_7777, tmp1 - ctx.emit(Inst::alu_rmi_r( - is_64, - AluRmiROpcode::And, - RegMemImm::imm(0x77777777), - tmp1, - )); - - // sub tmp1, tmp2 - ctx.emit(Inst::alu_rmi_r( - is_64, - 
AluRmiROpcode::Sub, - RegMemImm::reg(tmp1.to_reg()), - tmp2, - )); - - // mov tmp2, dst - ctx.emit(Inst::mov64_rm_r(RegMem::reg(tmp2.to_reg()), dst)); - - // shr $4, dst - ctx.emit(Inst::shift_r(4, ShiftKind::ShiftRightLogical, Some(4), dst)); - - // add tmp2, dst - ctx.emit(Inst::alu_rmi_r( - is_64, + true, AluRmiROpcode::Add, - RegMemImm::reg(tmp2.to_reg()), - dst, + RegMemImm::reg(dsts[1]), + final_dst.regs()[0], )); - - // and $0x0F0F_0F0F, dst ctx.emit(Inst::alu_rmi_r( - is_64, - AluRmiROpcode::And, - RegMemImm::imm(0x0F0F0F0F), - dst, + true, + AluRmiROpcode::Xor, + RegMemImm::reg(final_dst.regs()[1].to_reg()), + final_dst.regs()[1], )); + } + } - // mul $0x0101_0101, dst - ctx.emit(Inst::alu_rmi_r( - is_64, - AluRmiROpcode::Mul, - RegMemImm::imm(0x01010101), - dst, - )); + Opcode::Bitrev => { + let ty = ctx.input_ty(insn, 0); + assert!( + ty == types::I8 + || ty == types::I16 + || ty == types::I32 + || ty == types::I64 + || ty == types::I128 + ); - // shr $24, dst - ctx.emit(Inst::shift_r( - 4, - ShiftKind::ShiftRightLogical, - Some(24), - dst, - )); + if ty == types::I128 { + let src = put_input_in_regs(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + emit_bitrev(ctx, src.regs()[0], dst.regs()[1], types::I64); + emit_bitrev(ctx, src.regs()[1], dst.regs()[0], types::I64); + } else { + let src = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + emit_bitrev(ctx, src, dst, ty); } } @@ -1836,63 +2744,112 @@ fn lower_insn_to_regs>( let src_ty = ctx.input_ty(insn, 0); let dst_ty = ctx.output_ty(insn, 0); - // Sextend requires a sign-extended move, but all the other opcodes are simply a move - // from a zero-extended source. Here is why this works, in each case: - // - // - Bint: Bool-to-int. We always represent a bool as a 0 or 1, so we merely need to - // zero-extend here. - // - // - Breduce, Bextend: changing width of a boolean. We represent a bool as a 0 or 1, so - // again, this is a zero-extend / no-op. - // - // - Ireduce: changing width of an integer. Smaller ints are stored with undefined - // high-order bits, so we can simply do a copy. + if src_ty == types::I128 { + assert!(dst_ty.bits() <= 64); + assert!(op == Opcode::Ireduce); + let src = put_input_in_regs(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + ctx.emit(Inst::gen_move(dst, src.regs()[0], types::I64)); + } else if dst_ty == types::I128 { + assert!(src_ty.bits() <= 64); + let src = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + assert!(op == Opcode::Uextend || op == Opcode::Sextend || op == Opcode::Bint); + // Extend to 64 bits first. - if src_ty == types::I32 && dst_ty == types::I64 && op != Opcode::Sextend { - // As a particular x64 extra-pattern matching opportunity, all the ALU opcodes on - // 32-bits will zero-extend the upper 32-bits, so we can even not generate a - // zero-extended move in this case. - // TODO add loads and shifts here. 
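The Popcnt lowering above is the classic SWAR bit trick: subtract three shifted-and-masked copies of the value to get per-nibble counts, fold nibbles into bytes, then multiply by 0x0101_0101_0101_0101 so the byte sum collects in the top byte; for i128 the same 64-bit sequence runs on each half and the two counts are added, with the upper result register zeroed. A minimal standalone Rust sketch of that arithmetic, for reference only (names here are illustrative and not part of the patch):

fn popcnt64_swar(x: u64) -> u64 {
    // Per-nibble popcount: x - (x>>1 & 0x7..7) - (x>>2 & 0x3..3) - (x>>3 & 0x1..1).
    // Re-shifting and re-masking the same temporary with 0x7777... produces
    // exactly those three terms, matching the emitted instruction sequence.
    let mut t = (x >> 1) & 0x7777_7777_7777_7777;
    let mut n = x.wrapping_sub(t);
    t = (t >> 1) & 0x7777_7777_7777_7777;
    n = n.wrapping_sub(t);
    t = (t >> 1) & 0x7777_7777_7777_7777;
    n = n.wrapping_sub(t);
    // Fold nibble counts into byte counts, then sum all bytes into the top byte.
    let b = n.wrapping_add(n >> 4) & 0x0F0F_0F0F_0F0F_0F0F;
    b.wrapping_mul(0x0101_0101_0101_0101) >> 56
}

fn popcnt128_swar(lo: u64, hi: u64) -> (u64, u64) {
    // i128: popcount each 64-bit half, add, and zero the upper half of the result.
    (popcnt64_swar(lo) + popcnt64_swar(hi), 0)
}

fn main() {
    assert_eq!(popcnt64_swar(0), 0);
    assert_eq!(popcnt64_swar(u64::MAX), 64);
    assert_eq!(popcnt64_swar(0x8000_0000_0000_0001), 2);
    assert_eq!(popcnt128_swar(u64::MAX, 1), (65, 0));
}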
- if let Some(_) = matches_input_any( - ctx, - inputs[0], - &[ - Opcode::Iadd, - Opcode::IaddIfcout, - Opcode::Isub, - Opcode::Imul, - Opcode::Band, - Opcode::Bor, - Opcode::Bxor, - ], - ) { - let src = put_input_in_reg(ctx, inputs[0]); - let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - ctx.emit(Inst::gen_move(dst, src, types::I64)); - return Ok(()); - } - } - - let src = input_to_reg_mem(ctx, inputs[0]); - let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - - let ext_mode = ExtMode::new(src_ty.bits(), dst_ty.bits()); - assert_eq!( - src_ty.bits() < dst_ty.bits(), - ext_mode.is_some(), - "unexpected extension: {} -> {}", - src_ty, - dst_ty - ); - - if let Some(ext_mode) = ext_mode { - if op == Opcode::Sextend { - ctx.emit(Inst::movsx_rm_r(ext_mode, src, dst)); + let ext_mode = ExtMode::new(src_ty.bits(), /* dst bits = */ 64); + if let Some(ext_mode) = ext_mode { + if op == Opcode::Sextend { + ctx.emit(Inst::movsx_rm_r(ext_mode, RegMem::reg(src), dst.regs()[0])); + } else { + ctx.emit(Inst::movzx_rm_r(ext_mode, RegMem::reg(src), dst.regs()[0])); + } } else { - ctx.emit(Inst::movzx_rm_r(ext_mode, src, dst)); + ctx.emit(Inst::mov64_rm_r(RegMem::reg(src), dst.regs()[0])); + } + + // Now generate the top 64 bits. + if op == Opcode::Sextend { + // Sign-extend: move dst[0] into dst[1] and arithmetic-shift right by 63 bits + // to spread the sign bit across all bits. + ctx.emit(Inst::gen_move( + dst.regs()[1], + dst.regs()[0].to_reg(), + types::I64, + )); + ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightArithmetic, + Some(63), + dst.regs()[1], + )); + } else { + // Zero-extend: just zero the top word. + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Xor, + RegMemImm::reg(dst.regs()[1].to_reg()), + dst.regs()[1], + )); } } else { - ctx.emit(Inst::mov64_rm_r(src, dst)); + // Sextend requires a sign-extended move, but all the other opcodes are simply a move + // from a zero-extended source. Here is why this works, in each case: + // + // - Bint: Bool-to-int. We always represent a bool as a 0 or 1, so we merely need to + // zero-extend here. + // + // - Breduce, Bextend: changing width of a boolean. We represent a bool as a 0 or 1, so + // again, this is a zero-extend / no-op. + // + // - Ireduce: changing width of an integer. Smaller ints are stored with undefined + // high-order bits, so we can simply do a copy. + if src_ty == types::I32 && dst_ty == types::I64 && op != Opcode::Sextend { + // As a particular x64 extra-pattern matching opportunity, all the ALU opcodes on + // 32-bits will zero-extend the upper 32-bits, so we can even not generate a + // zero-extended move in this case. + // TODO add loads and shifts here. 
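The i128 Uextend/Sextend/Bint path above first widens the source to 64 bits and then synthesizes the upper half: Sextend copies the low half and arithmetic-shifts it right by 63 so that every bit of the upper word is a copy of the sign bit, while Uextend and Bint simply zero the upper register with a self-xor. A small Rust model of that computation (illustrative only, not the emitted code itself):

fn sextend_to_i128(lo: i64) -> (u64, u64) {
    // Arithmetic shift by 63 spreads the sign bit across the whole upper word.
    let hi = (lo >> 63) as u64;
    (lo as u64, hi)
}

fn uextend_to_i128(lo: u64) -> (u64, u64) {
    // Zero-extend: the upper word is cleared (xor dst_hi, dst_hi in the patch).
    (lo, 0)
}

fn main() {
    assert_eq!(sextend_to_i128(-1), (u64::MAX, u64::MAX));
    assert_eq!(sextend_to_i128(5), (5, 0));
    assert_eq!(uextend_to_i128(u64::MAX), (u64::MAX, 0));
}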
+ if let Some(_) = matches_input_any( + ctx, + inputs[0], + &[ + Opcode::Iadd, + Opcode::IaddIfcout, + Opcode::Isub, + Opcode::Imul, + Opcode::Band, + Opcode::Bor, + Opcode::Bxor, + ], + ) { + let src = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + ctx.emit(Inst::gen_move(dst, src, types::I64)); + return Ok(()); + } + } + + let src = input_to_reg_mem(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + + let ext_mode = ExtMode::new(src_ty.bits(), dst_ty.bits()); + assert_eq!( + src_ty.bits() < dst_ty.bits(), + ext_mode.is_some(), + "unexpected extension: {} -> {}", + src_ty, + dst_ty + ); + + if let Some(ext_mode) = ext_mode { + if op == Opcode::Sextend { + ctx.emit(Inst::movsx_rm_r(ext_mode, src, dst)); + } else { + ctx.emit(Inst::movzx_rm_r(ext_mode, src, dst)); + } + } else { + ctx.emit(Inst::mov64_rm_r(src, dst)); + } } } @@ -1901,7 +2858,7 @@ fn lower_insn_to_regs>( let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let ty = ctx.input_ty(insn, 0); if !ty.is_vector() { - emit_cmp(ctx, insn); + let condcode = emit_cmp(ctx, insn, condcode); let cc = CC::from_intcc(condcode); ctx.emit(Inst::setcc(cc, dst)); } else { @@ -2108,10 +3065,19 @@ fn lower_insn_to_regs>( Opcode::FallthroughReturn | Opcode::Return => { for i in 0..ctx.num_inputs(insn) { - let src_reg = put_input_in_reg(ctx, inputs[i]); + let src_reg = put_input_in_regs(ctx, inputs[i]); let retval_reg = ctx.retval(i); let ty = ctx.input_ty(insn, i); - ctx.emit(Inst::gen_move(retval_reg.only_reg().unwrap(), src_reg, ty)); + assert!(src_reg.len() == retval_reg.len()); + let (_, tys) = Inst::rc_for_type(ty)?; + for ((&src, &dst), &ty) in src_reg + .regs() + .iter() + .zip(retval_reg.regs().iter()) + .zip(tys.iter()) + { + ctx.emit(Inst::gen_move(dst, src, ty)); + } } // N.B.: the Ret itself is generated by the ABI. } @@ -2147,13 +3113,13 @@ fn lower_insn_to_regs>( abi.emit_stack_pre_adjust(ctx); assert_eq!(inputs.len(), abi.num_args()); for (i, input) in inputs.iter().enumerate() { - let arg_reg = put_input_in_reg(ctx, *input); - abi.emit_copy_regs_to_arg(ctx, i, ValueRegs::one(arg_reg)); + let arg_regs = put_input_in_regs(ctx, *input); + abi.emit_copy_regs_to_arg(ctx, i, arg_regs); } abi.emit_call(ctx); for (i, output) in outputs.iter().enumerate() { - let retval_reg = get_output_reg(ctx, *output).only_reg().unwrap(); - abi.emit_copy_retval_to_regs(ctx, i, ValueRegs::one(retval_reg)); + let retval_regs = get_output_reg(ctx, *output); + abi.emit_copy_retval_to_regs(ctx, i, retval_regs); } abi.emit_stack_post_adjust(ctx); } @@ -2180,11 +3146,11 @@ fn lower_insn_to_regs>( ctx.emit_safepoint(Inst::TrapIf { trap_code, cc }); } else if op == Opcode::Trapif { let cond_code = ctx.data(insn).cond_code().unwrap(); - let cc = CC::from_intcc(cond_code); // Verification ensures that the input is always a single-def ifcmp. let ifcmp = matches_input(ctx, inputs[0], Opcode::Ifcmp).unwrap(); - emit_cmp(ctx, ifcmp); + let cond_code = emit_cmp(ctx, ifcmp, cond_code); + let cc = CC::from_intcc(cond_code); ctx.emit_safepoint(Inst::TrapIf { trap_code, cc }); } else { @@ -2266,7 +3232,9 @@ fn lower_insn_to_regs>( Opcode::Fadd | Opcode::Fsub | Opcode::Fmul | Opcode::Fdiv => { let lhs = put_input_in_reg(ctx, inputs[0]); - let rhs = input_to_reg_mem(ctx, inputs[1]); + // We can't guarantee the RHS (if a load) is 128-bit aligned, so we + // must avoid merging a load here. 
+ let rhs = RegMem::reg(put_input_in_reg(ctx, inputs[1])); let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let ty = ty.unwrap(); @@ -2523,7 +3491,9 @@ fn lower_insn_to_regs>( } Opcode::FminPseudo | Opcode::FmaxPseudo => { - let lhs = input_to_reg_mem(ctx, inputs[0]); + // We can't guarantee the RHS (if a load) is 128-bit aligned, so we + // must avoid merging a load here. + let lhs = RegMem::reg(put_input_in_reg(ctx, inputs[0])); let rhs = put_input_in_reg(ctx, inputs[1]); let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let ty = ty.unwrap(); @@ -2539,7 +3509,9 @@ fn lower_insn_to_regs>( } Opcode::Sqrt => { - let src = input_to_reg_mem(ctx, inputs[0]); + // We can't guarantee the RHS (if a load) is 128-bit aligned, so we + // must avoid merging a load here. + let src = RegMem::reg(put_input_in_reg(ctx, inputs[0])); let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let ty = ty.unwrap(); @@ -2558,13 +3530,17 @@ fn lower_insn_to_regs>( } Opcode::Fpromote => { - let src = input_to_reg_mem(ctx, inputs[0]); + // We can't guarantee the RHS (if a load) is 128-bit aligned, so we + // must avoid merging a load here. + let src = RegMem::reg(put_input_in_reg(ctx, inputs[0])); let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); ctx.emit(Inst::xmm_unary_rm_r(SseOpcode::Cvtss2sd, src, dst)); } Opcode::Fdemote => { - let src = input_to_reg_mem(ctx, inputs[0]); + // We can't guarantee the RHS (if a load) is 128-bit aligned, so we + // must avoid merging a load here. + let src = RegMem::reg(put_input_in_reg(ctx, inputs[0])); let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); ctx.emit(Inst::xmm_unary_rm_r(SseOpcode::Cvtsd2ss, src, dst)); } @@ -2581,7 +3557,7 @@ fn lower_insn_to_regs>( let src = match ext_spec { Some(ext_spec) => RegMem::reg(extend_input_to_reg(ctx, inputs[0], ext_spec)), - None => input_to_reg_mem(ctx, inputs[0]), + None => RegMem::reg(put_input_in_reg(ctx, inputs[0])), }; let opcode = if output_ty == types::F32 { @@ -3096,7 +4072,7 @@ fn lower_insn_to_regs>( } Opcode::Fabs | Opcode::Fneg => { - let src = input_to_reg_mem(ctx, inputs[0]); + let src = RegMem::reg(put_input_in_reg(ctx, inputs[0])); let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); // In both cases, generate a constant and apply a single binary instruction: @@ -3392,59 +4368,64 @@ fn lower_insn_to_regs>( _ => unreachable!(), }; - let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - let is_xmm = elem_ty.is_float() || elem_ty.is_vector(); - - match (sign_extend, is_xmm) { - (true, false) => { - // The load is sign-extended only when the output size is lower than 64 bits, - // so ext-mode is defined in this case. - ctx.emit(Inst::movsx_rm_r(ext_mode.unwrap(), RegMem::mem(amode), dst)); - } - (false, false) => { - if elem_ty.bytes() == 8 { - // Use a plain load. - ctx.emit(Inst::mov64_m_r(amode, dst)) - } else { - // Use a zero-extended load. - ctx.emit(Inst::movzx_rm_r(ext_mode.unwrap(), RegMem::mem(amode), dst)) + if elem_ty == types::I128 { + let dsts = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::mov64_m_r(amode.clone(), dsts.regs()[0])); + ctx.emit(Inst::mov64_m_r(amode.offset(8), dsts.regs()[1])); + } else { + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + let is_xmm = elem_ty.is_float() || elem_ty.is_vector(); + match (sign_extend, is_xmm) { + (true, false) => { + // The load is sign-extended only when the output size is lower than 64 bits, + // so ext-mode is defined in this case. 
+ ctx.emit(Inst::movsx_rm_r(ext_mode.unwrap(), RegMem::mem(amode), dst)); } - } - (_, true) => { - ctx.emit(match elem_ty { - types::F32 => Inst::xmm_mov(SseOpcode::Movss, RegMem::mem(amode), dst), - types::F64 => Inst::xmm_mov(SseOpcode::Movsd, RegMem::mem(amode), dst), - types::I8X8 => { - if sign_extend == true { - Inst::xmm_mov(SseOpcode::Pmovsxbw, RegMem::mem(amode), dst) - } else { - Inst::xmm_mov(SseOpcode::Pmovzxbw, RegMem::mem(amode), dst) + (false, false) => { + if elem_ty.bytes() == 8 { + // Use a plain load. + ctx.emit(Inst::mov64_m_r(amode, dst)) + } else { + // Use a zero-extended load. + ctx.emit(Inst::movzx_rm_r(ext_mode.unwrap(), RegMem::mem(amode), dst)) + } + } + (_, true) => { + ctx.emit(match elem_ty { + types::F32 => Inst::xmm_mov(SseOpcode::Movss, RegMem::mem(amode), dst), + types::F64 => Inst::xmm_mov(SseOpcode::Movsd, RegMem::mem(amode), dst), + types::I8X8 => { + if sign_extend == true { + Inst::xmm_mov(SseOpcode::Pmovsxbw, RegMem::mem(amode), dst) + } else { + Inst::xmm_mov(SseOpcode::Pmovzxbw, RegMem::mem(amode), dst) + } } - } - types::I16X4 => { - if sign_extend == true { - Inst::xmm_mov(SseOpcode::Pmovsxwd, RegMem::mem(amode), dst) - } else { - Inst::xmm_mov(SseOpcode::Pmovzxwd, RegMem::mem(amode), dst) + types::I16X4 => { + if sign_extend == true { + Inst::xmm_mov(SseOpcode::Pmovsxwd, RegMem::mem(amode), dst) + } else { + Inst::xmm_mov(SseOpcode::Pmovzxwd, RegMem::mem(amode), dst) + } } - } - types::I32X2 => { - if sign_extend == true { - Inst::xmm_mov(SseOpcode::Pmovsxdq, RegMem::mem(amode), dst) - } else { - Inst::xmm_mov(SseOpcode::Pmovzxdq, RegMem::mem(amode), dst) + types::I32X2 => { + if sign_extend == true { + Inst::xmm_mov(SseOpcode::Pmovsxdq, RegMem::mem(amode), dst) + } else { + Inst::xmm_mov(SseOpcode::Pmovzxdq, RegMem::mem(amode), dst) + } } - } - _ if elem_ty.is_vector() && elem_ty.bits() == 128 => { - Inst::xmm_mov(SseOpcode::Movups, RegMem::mem(amode), dst) - } - // TODO Specialize for different types: MOVUPD, MOVDQU - _ => unreachable!( - "unexpected type for load: {:?} - {:?}", - elem_ty, - elem_ty.bits() - ), - }); + _ if elem_ty.is_vector() && elem_ty.bits() == 128 => { + Inst::xmm_mov(SseOpcode::Movups, RegMem::mem(amode), dst) + } + // TODO Specialize for different types: MOVUPD, MOVDQU + _ => unreachable!( + "unexpected type for load: {:?} - {:?}", + elem_ty, + elem_ty.bits() + ), + }); + } } } } @@ -3491,17 +4472,23 @@ fn lower_insn_to_regs>( _ => unreachable!(), }; - let src = put_input_in_reg(ctx, inputs[0]); + if elem_ty == types::I128 { + let srcs = put_input_in_regs(ctx, inputs[0]); + ctx.emit(Inst::mov_r_m(8, srcs.regs()[0], addr.clone())); + ctx.emit(Inst::mov_r_m(8, srcs.regs()[1], addr.offset(8))); + } else { + let src = put_input_in_reg(ctx, inputs[0]); - ctx.emit(match elem_ty { - types::F32 => Inst::xmm_mov_r_m(SseOpcode::Movss, src, addr), - types::F64 => Inst::xmm_mov_r_m(SseOpcode::Movsd, src, addr), - _ if elem_ty.is_vector() && elem_ty.bits() == 128 => { - // TODO Specialize for different types: MOVUPD, MOVDQU, etc. - Inst::xmm_mov_r_m(SseOpcode::Movups, src, addr) - } - _ => Inst::mov_r_m(elem_ty.bytes() as u8, src, addr), - }); + ctx.emit(match elem_ty { + types::F32 => Inst::xmm_mov_r_m(SseOpcode::Movss, src, addr), + types::F64 => Inst::xmm_mov_r_m(SseOpcode::Movsd, src, addr), + _ if elem_ty.is_vector() && elem_ty.bits() == 128 => { + // TODO Specialize for different types: MOVUPD, MOVDQU, etc. 
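For i128, the load and store lowerings above split the access into two 64-bit moves, with the low half at offset 0 and the high half at offset 8 (little-endian word order). A short Rust sketch of the equivalent in-memory layout (the helper names are illustrative, not from the patch):

use std::convert::TryInto;

fn store_i128(buf: &mut [u8; 16], value: u128) {
    // mov lo, 0(addr) / mov hi, 8(addr)
    buf[0..8].copy_from_slice(&(value as u64).to_le_bytes());
    buf[8..16].copy_from_slice(&((value >> 64) as u64).to_le_bytes());
}

fn load_i128(buf: &[u8; 16]) -> u128 {
    // mov 0(addr), lo / mov 8(addr), hi
    let lo = u64::from_le_bytes(buf[0..8].try_into().unwrap());
    let hi = u64::from_le_bytes(buf[8..16].try_into().unwrap());
    (lo as u128) | ((hi as u128) << 64)
}

fn main() {
    let mut buf = [0u8; 16];
    store_i128(&mut buf, 0x1122_3344_5566_7788_99aa_bbcc_ddee_ff00);
    assert_eq!(load_i128(&buf), 0x1122_3344_5566_7788_99aa_bbcc_ddee_ff00);
}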
+ Inst::xmm_mov_r_m(SseOpcode::Movups, src, addr) + } + _ => Inst::mov_r_m(elem_ty.bytes() as u8, src, addr), + }); + } } Opcode::AtomicRmw => { @@ -3668,17 +4655,9 @@ fn lower_insn_to_regs>( }; let ty = ctx.output_ty(insn, 0); - let rhs = put_input_in_reg(ctx, rhs_input); - let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - let lhs = if is_int_or_ref_ty(ty) && ty.bytes() < 4 { - // Special case: since the higher bits are undefined per CLIF semantics, we - // can just apply a 32-bit cmove here. Force inputs into registers, to - // avoid partial spilling out-of-bounds with memory accesses, though. - // Sign-extend operands to 32, then do a cmove of size 4. - RegMem::reg(put_input_in_reg(ctx, lhs_input)) - } else { - input_to_reg_mem(ctx, lhs_input) - }; + let rhs = put_input_in_regs(ctx, rhs_input); + let dst = get_output_reg(ctx, outputs[0]); + let lhs = put_input_in_regs(ctx, lhs_input); // We request inversion of Equal to NotEqual here: taking LHS if equal would mean // take it if both CC::NP and CC::Z are set, the conjunction of which can't be @@ -3691,15 +4670,20 @@ fn lower_insn_to_regs>( assert_eq!(cond_code, FloatCC::Equal); } - ctx.emit(Inst::gen_move(dst, rhs, ty)); + emit_moves(ctx, dst, rhs, ty); match fcmp_results { FcmpCondResult::Condition(cc) => { - if is_int_or_ref_ty(ty) { - let size = u8::max(ty.bytes() as u8, 4); - ctx.emit(Inst::cmove(size, cc, lhs, dst)); + if is_int_or_ref_ty(ty) || ty == types::I128 || ty == types::B128 { + let size = ty.bytes() as u8; + emit_cmoves(ctx, size, cc, lhs, dst); } else { - ctx.emit(Inst::xmm_cmove(ty == types::F64, cc, lhs, dst)); + ctx.emit(Inst::xmm_cmove( + ty == types::F64, + cc, + RegMem::reg(lhs.only_reg().unwrap()), + dst.only_reg().unwrap(), + )); } } FcmpCondResult::AndConditions(_, _) => { @@ -3709,40 +4693,37 @@ fn lower_insn_to_regs>( } FcmpCondResult::InvertedEqualOrConditions(cc1, cc2) | FcmpCondResult::OrConditions(cc1, cc2) => { - if is_int_or_ref_ty(ty) { - let size = u8::max(ty.bytes() as u8, 4); - ctx.emit(Inst::cmove(size, cc1, lhs.clone(), dst)); - ctx.emit(Inst::cmove(size, cc2, lhs, dst)); + if is_int_or_ref_ty(ty) || ty == types::I128 { + let size = ty.bytes() as u8; + emit_cmoves(ctx, size, cc1, lhs.clone(), dst); + emit_cmoves(ctx, size, cc2, lhs, dst); } else { - ctx.emit(Inst::xmm_cmove(ty == types::F64, cc1, lhs.clone(), dst)); - ctx.emit(Inst::xmm_cmove(ty == types::F64, cc2, lhs, dst)); + ctx.emit(Inst::xmm_cmove( + ty == types::F64, + cc1, + RegMem::reg(lhs.only_reg().unwrap()), + dst.only_reg().unwrap(), + )); + ctx.emit(Inst::xmm_cmove( + ty == types::F64, + cc2, + RegMem::reg(lhs.only_reg().unwrap()), + dst.only_reg().unwrap(), + )); } } } } else { let ty = ty.unwrap(); - let mut size = ty.bytes() as u8; - let lhs = if is_int_or_ref_ty(ty) { - if size < 4 { - // Special case: since the higher bits are undefined per CLIF semantics, we - // can just apply a 32-bit cmove here. Force inputs into registers, to - // avoid partial spilling out-of-bounds with memory accesses, though. 
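The select/fcmp changes above route through emit_moves and emit_cmoves so that multi-register (i128) values are handled: the false arm is moved into the destination registers unconditionally, then each 64-bit half is overwritten by a conditional move under the same condition code (or under two condition codes in the OrConditions case). A rough Rust model of the assumed semantics, for illustration only (emit_cmoves itself is defined elsewhere in the patch and is not shown here):

fn select_i128(cond: bool, lhs: (u64, u64), rhs: (u64, u64)) -> (u64, u64) {
    // Unconditional moves of the false arm first (emit_moves), then one cmov
    // per 64-bit half under the same flags (emit_cmoves).
    let mut dst = rhs;
    if cond {
        dst.0 = lhs.0;
        dst.1 = lhs.1;
    }
    dst
}

fn main() {
    assert_eq!(select_i128(true, (1, 2), (3, 4)), (1, 2));
    assert_eq!(select_i128(false, (1, 2), (3, 4)), (3, 4));
}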
- size = 4; - RegMem::reg(put_input_in_reg(ctx, inputs[1])) - } else { - input_to_reg_mem(ctx, inputs[1]) - } - } else { - input_to_reg_mem(ctx, inputs[1]) - }; - - let rhs = put_input_in_reg(ctx, inputs[2]); - let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + let size = ty.bytes() as u8; + let lhs = put_input_in_regs(ctx, inputs[1]); + let rhs = put_input_in_regs(ctx, inputs[2]); + let dst = get_output_reg(ctx, outputs[0]); let cc = if let Some(icmp) = matches_input(ctx, flag_input, Opcode::Icmp) { - emit_cmp(ctx, icmp); let cond_code = ctx.data(icmp).cond_code().unwrap(); + let cond_code = emit_cmp(ctx, icmp, cond_code); CC::from_intcc(cond_code) } else { let sel_ty = ctx.input_ty(insn, 0); @@ -3768,21 +4749,26 @@ fn lower_insn_to_regs>( }; // This doesn't affect the flags. - ctx.emit(Inst::gen_move(dst, rhs, ty)); + emit_moves(ctx, dst, rhs, ty); - if is_int_or_ref_ty(ty) { - ctx.emit(Inst::cmove(size, cc, lhs, dst)); + if is_int_or_ref_ty(ty) || ty == types::I128 { + emit_cmoves(ctx, size, cc, lhs, dst); } else { debug_assert!(ty == types::F32 || ty == types::F64); - ctx.emit(Inst::xmm_cmove(ty == types::F64, cc, lhs, dst)); + ctx.emit(Inst::xmm_cmove( + ty == types::F64, + cc, + RegMem::reg(lhs.only_reg().unwrap()), + dst.only_reg().unwrap(), + )); } } } Opcode::Selectif | Opcode::SelectifSpectreGuard => { - let lhs = input_to_reg_mem(ctx, inputs[1]); - let rhs = put_input_in_reg(ctx, inputs[2]); - let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + let lhs = put_input_in_regs(ctx, inputs[1]); + let rhs = put_input_in_regs(ctx, inputs[2]); + let dst = get_output_reg(ctx, outputs[0]); let ty = ctx.output_ty(insn, 0); // Verification ensures that the input is always a single-def ifcmp. @@ -3792,26 +4778,24 @@ fn lower_insn_to_regs>( .unwrap() .0; debug_assert_eq!(ctx.data(cmp_insn).opcode(), Opcode::Ifcmp); - emit_cmp(ctx, cmp_insn); + let cond_code = ctx.data(insn).cond_code().unwrap(); + let cond_code = emit_cmp(ctx, cmp_insn, cond_code); - let cc = CC::from_intcc(ctx.data(insn).cond_code().unwrap()); + let cc = CC::from_intcc(cond_code); - if is_int_or_ref_ty(ty) { + if is_int_or_ref_ty(ty) || ty == types::I128 { let size = ty.bytes() as u8; - if size == 1 { - // Sign-extend operands to 32, then do a cmove of size 4. - let lhs_se = ctx.alloc_tmp(types::I32).only_reg().unwrap(); - ctx.emit(Inst::movsx_rm_r(ExtMode::BL, lhs, lhs_se)); - ctx.emit(Inst::movsx_rm_r(ExtMode::BL, RegMem::reg(rhs), dst)); - ctx.emit(Inst::cmove(4, cc, RegMem::reg(lhs_se.to_reg()), dst)); - } else { - ctx.emit(Inst::gen_move(dst, rhs, ty)); - ctx.emit(Inst::cmove(size, cc, lhs, dst)); - } + emit_moves(ctx, dst, rhs, ty); + emit_cmoves(ctx, size, cc, lhs, dst); } else { debug_assert!(ty == types::F32 || ty == types::F64); - ctx.emit(Inst::gen_move(dst, rhs, ty)); - ctx.emit(Inst::xmm_cmove(ty == types::F64, cc, lhs, dst)); + emit_moves(ctx, dst, rhs, ty); + ctx.emit(Inst::xmm_cmove( + ty == types::F64, + cc, + RegMem::reg(lhs.only_reg().unwrap()), + dst.only_reg().unwrap(), + )); } } @@ -3894,8 +4878,19 @@ fn lower_insn_to_regs>( // The quotient is in rax. ctx.emit(Inst::gen_move(dst, regs::rax(), input_ty)); } else { - // The remainder is in rdx. - ctx.emit(Inst::gen_move(dst, regs::rdx(), input_ty)); + if size == 1 { + // The remainder is in AH. Right-shift by 8 bits then move from rax. 
+ ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightLogical, + Some(8), + Writable::from_reg(regs::rax()), + )); + ctx.emit(Inst::gen_move(dst, regs::rax(), input_ty)); + } else { + // The remainder is in rdx. + ctx.emit(Inst::gen_move(dst, regs::rdx(), input_ty)); + } } } @@ -4297,6 +5292,38 @@ fn lower_insn_to_regs>( } } + Opcode::Iconcat => { + let ty = ctx.output_ty(insn, 0); + assert_eq!( + ty, + types::I128, + "Iconcat not expected to be used for non-128-bit type" + ); + assert_eq!(ctx.input_ty(insn, 0), types::I64); + assert_eq!(ctx.input_ty(insn, 1), types::I64); + let lo = put_input_in_reg(ctx, inputs[0]); + let hi = put_input_in_reg(ctx, inputs[1]); + let dst = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::gen_move(dst.regs()[0], lo, types::I64)); + ctx.emit(Inst::gen_move(dst.regs()[1], hi, types::I64)); + } + + Opcode::Isplit => { + let ty = ctx.input_ty(insn, 0); + assert_eq!( + ty, + types::I128, + "Iconcat not expected to be used for non-128-bit type" + ); + assert_eq!(ctx.output_ty(insn, 0), types::I64); + assert_eq!(ctx.output_ty(insn, 1), types::I64); + let src = put_input_in_regs(ctx, inputs[0]); + let dst_lo = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + let dst_hi = get_output_reg(ctx, outputs[1]).only_reg().unwrap(); + ctx.emit(Inst::gen_move(dst_lo, src.regs()[0], types::I64)); + ctx.emit(Inst::gen_move(dst_hi, src.regs()[1], types::I64)); + } + Opcode::IaddImm | Opcode::ImulImm | Opcode::UdivImm @@ -4384,9 +5411,9 @@ impl LowerBackend for X64Backend { let src_ty = ctx.input_ty(branches[0], 0); if let Some(icmp) = matches_input(ctx, flag_input, Opcode::Icmp) { - emit_cmp(ctx, icmp); - let cond_code = ctx.data(icmp).cond_code().unwrap(); + let cond_code = emit_cmp(ctx, icmp, cond_code); + let cond_code = if op0 == Opcode::Brz { cond_code.inverse() } else { @@ -4416,6 +5443,32 @@ impl LowerBackend for X64Backend { } FcmpCondResult::InvertedEqualOrConditions(_, _) => unreachable!(), } + } else if src_ty == types::I128 { + let src = put_input_in_regs( + ctx, + InsnInput { + insn: branches[0], + input: 0, + }, + ); + let (half_cc, comb_op) = match op0 { + Opcode::Brz => (CC::Z, AluRmiROpcode::And8), + Opcode::Brnz => (CC::NZ, AluRmiROpcode::Or8), + _ => unreachable!(), + }; + let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + ctx.emit(Inst::cmp_rmi_r(8, RegMemImm::imm(0), src.regs()[0])); + ctx.emit(Inst::setcc(half_cc, tmp1)); + ctx.emit(Inst::cmp_rmi_r(8, RegMemImm::imm(0), src.regs()[1])); + ctx.emit(Inst::setcc(half_cc, tmp2)); + ctx.emit(Inst::alu_rmi_r( + false, + comb_op, + RegMemImm::reg(tmp1.to_reg()), + tmp2, + )); + ctx.emit(Inst::jmp_cond(CC::NZ, taken, not_taken)); } else if is_int_or_ref_ty(src_ty) || is_bool_ty(src_ty) { let src = put_input_in_reg( ctx, @@ -4483,8 +5536,8 @@ impl LowerBackend for X64Backend { }; if let Some(ifcmp) = matches_input(ctx, flag_input, Opcode::Ifcmp) { - emit_cmp(ctx, ifcmp); let cond_code = ctx.data(branches[0]).cond_code().unwrap(); + let cond_code = emit_cmp(ctx, ifcmp, cond_code); let cc = CC::from_intcc(cond_code); ctx.emit(Inst::jmp_cond(cc, taken, not_taken)); } else if let Some(ifcmp_sp) = matches_input(ctx, flag_input, Opcode::IfcmpSp) { diff --git a/cranelift/filetests/filetests/isa/x64/bitops-i128-run.clif b/cranelift/filetests/filetests/isa/x64/bitops-i128-run.clif new file mode 100644 index 0000000000..5795900438 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/bitops-i128-run.clif @@ -0,0 +1,27 @@ +test run +target x86_64 
+feature "experimental_x64" + +function %ctz(i64, i64) -> i8 { +block0(v0: i64, v1: i64): + v2 = iconcat v0, v1 + v3 = ctz.i128 v2 + v4 = ireduce.i8 v3 + return v4 +} +; run: %ctz(0x00000000_00000000, 0x00000001_00000000) == 96 +; run: %ctz(0x00000000_00010000, 0x00000001_00000000) == 16 +; run: %ctz(0x00000000_00010000, 0x00000000_00000000) == 16 +; run: %ctz(0x00000000_00000000, 0x00000000_00000000) == 128 + +function %clz(i64, i64) -> i8 { +block0(v0: i64, v1: i64): + v2 = iconcat v0, v1 + v3 = clz.i128 v2 + v4 = ireduce.i8 v3 + return v4 +} +; run: %clz(0x00000000_00000000, 0x00000001_00000000) == 31 +; run: %clz(0x00000000_00010000, 0x00000001_00000000) == 31 +; run: %clz(0x00000000_00010000, 0x00000000_00000000) == 111 +; run: %clz(0x00000000_00000000, 0x00000000_00000000) == 128 diff --git a/cranelift/filetests/filetests/isa/x64/bitrev-i128-run.clif b/cranelift/filetests/filetests/isa/x64/bitrev-i128-run.clif new file mode 100644 index 0000000000..64ea96716c --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/bitrev-i128-run.clif @@ -0,0 +1,47 @@ +test run +target x86_64 +feature "experimental_x64" + +function %reverse_bits_zero() -> b1 { +block0: + v0 = iconst.i64 0 + v1 = iconcat v0, v0 + v2 = bitrev.i128 v1 + v3 = icmp eq v2, v1 + return v3 +} +; run + +function %reverse_bits_one() -> b1 { +block0: + v0 = iconst.i64 0 + v1 = iconst.i64 1 + v2 = iconcat v0, v1 + + v3 = bitrev.i128 v2 + + v4 = iconst.i64 0x8000_0000_0000_0000 + v5 = iconst.i64 0 + v6 = iconcat v4, v5 + + v7 = icmp eq v3, v6 + return v7 +} +; run + +function %reverse_bits() -> b1 { +block0: + v0 = iconst.i64 0x06AD_8667_69EC_41BA + v1 = iconst.i64 0x6C83_D81A_6E28_83AB + v2 = iconcat v0, v1 + + v3 = bitrev.i128 v2 + + v4 = iconst.i64 0xD5C11476581BC136 + v5 = iconst.i64 0x5D823796E661B560 + v6 = iconcat v4, v5 + + v7 = icmp eq v3, v6 + return v7 +} +; run diff --git a/cranelift/filetests/filetests/isa/x64/floating-point.clif b/cranelift/filetests/filetests/isa/x64/floating-point.clif new file mode 100644 index 0000000000..b3b5907210 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/floating-point.clif @@ -0,0 +1,26 @@ +test compile +target x86_64 +feature "experimental_x64" + +function %f(f64) -> f64 { +block0(v0: f64): + v1 = fabs.f64 v0 + return v1 +} +; check: movabsq $$9223372036854775807, %rsi +; nextln: movq %rsi, %xmm1 +; nextln: andpd %xmm0, %xmm1 +; nextln: movaps %xmm1, %xmm0 + + +function %f(i64) -> f64 { +block0(v0: i64): + v1 = load.f64 v0 + v2 = fabs.f64 v1 + return v2 +} +; check: movsd 0(%rdi), %xmm0 +; nextln: movabsq $$9223372036854775807, %rsi +; nextln: movq %rsi, %xmm1 +; nextln: andpd %xmm0, %xmm1 +; nextln: movaps %xmm1, %xmm0 diff --git a/cranelift/filetests/filetests/isa/x64/i128.clif b/cranelift/filetests/filetests/isa/x64/i128.clif new file mode 100644 index 0000000000..e7ee34f283 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/i128.clif @@ -0,0 +1,1082 @@ +test compile +target x86_64 +feature "experimental_x64" + +function %f0(i128, i128) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128, v1: i128): + + v2 = iadd v0, v1 +; nextln: addq %rdx, %rdi +; nextln: adcq %rcx, %rsi + + return v2 +; nextln: movq %rdi, %rax +; nextln: movq %rsi, %rdx +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f1(i128, i128) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128, v1: i128): + + v2 = isub v0, v1 +; nextln: subq %rdx, %rdi +; nextln: sbbq %rcx, %rsi + + return v2 +; nextln: movq %rdi, %rax +; 
nextln: movq %rsi, %rdx +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f2(i128, i128) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128, v1: i128): + + v2 = band v0, v1 +; nextln: andq %rdx, %rdi +; nextln: andq %rcx, %rsi + + return v2 +; nextln: movq %rdi, %rax +; nextln: movq %rsi, %rdx +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f3(i128, i128) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128, v1: i128): + + v2 = bor v0, v1 +; nextln: orq %rdx, %rdi +; nextln: orq %rcx, %rsi + + return v2 +; nextln: movq %rdi, %rax +; nextln: movq %rsi, %rdx +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f4(i128, i128) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128, v1: i128): + + v2 = bxor v0, v1 +; nextln: xorq %rdx, %rdi +; nextln: xorq %rcx, %rsi + + return v2 +; nextln: movq %rdi, %rax +; nextln: movq %rsi, %rdx +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f5(i128) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128): + + v1 = bnot v0 +; nextln: notq %rdi +; nextln: notq %rsi + + return v1 +; nextln: movq %rdi, %rax +; nextln: movq %rsi, %rdx +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f6(i128, i128) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128, v1: i128): +; v0 in rdi:rsi, v1 in rdx:rcx + + v2 = imul v0, v1 +; nextln: movq %rsi, %rax +; nextln: movq %rcx, %r8 +; nextln: movq %rdi, %rsi +; nextln: imulq %rdx, %rsi +; nextln: movq %rdi, %rcx +; nextln: imulq %r8, %rcx +; nextln: imulq %rdx, %rax +; nextln: addq %rax, %rcx +; nextln: movq %rdi, %rax +; nextln: mul %rdx +; nextln: addq %rdx, %rcx +; nextln: movq %rsi, %rax +; nextln: movq %rcx, %rdx + + return v2 +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f7(i64, i64) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i64, v1: i64): + v2 = iconcat.i64 v0, v1 +; nextln: movq %rdi, %rax +; nextln: movq %rsi, %rdx + + return v2 +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f8(i128) -> i64, i64 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128): + v1, v2 = isplit.i128 v0 +; nextln: movq %rdi, %rax +; nextln: movq %rsi, %rdx + + return v1, v2 +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f9(i128, i128) -> b1 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128, v1: i128): + v2 = icmp eq v0, v1 +; check: cmpq %rcx, %rsi +; nextln: setz %al +; nextln: cmpq %rdx, %rdi +; nextln: setz %r8b +; nextln: andq %rax, %r8 +; nextln: andq $$1, %r8 +; nextln: setnz %al + + v3 = icmp ne v0, v1 +; check: cmpq %rcx, %rsi +; nextln: setnz %al +; nextln: cmpq %rdx, %rdi +; nextln: setnz %r8b +; nextln: orq %rax, %r8 +; nextln: andq $$1, %r8 +; nextln: setnz %r8b + + v4 = icmp slt v0, v1 +; check: cmpq %rcx, %rsi +; nextln: setl %r9b +; nextln: setz %al +; nextln: cmpq %rdx, %rdi +; nextln: setb %r10b +; nextln: andq %rax, %r10 +; nextln: orq %r9, %r10 +; nextln: andq $$1, %r10 +; nextln: setnz %r9b + + v5 = icmp sle v0, v1 +; check: cmpq %rcx, %rsi +; nextln: setl %r10b +; nextln: setz %al +; nextln: cmpq %rdx, %rdi +; nextln: setbe %r11b +; nextln: andq %rax, %r11 +; nextln: orq %r10, %r11 +; nextln: andq $$1, %r11 +; nextln: setnz %r10b + + v6 = icmp sgt v0, v1 +; check: cmpq %rcx, %rsi +; nextln: setnle %r11b +; 
nextln: setz %al +; nextln: cmpq %rdx, %rdi +; nextln: setnbe %r12b +; nextln: andq %rax, %r12 +; nextln: orq %r11, %r12 +; nextln: andq $$1, %r12 +; nextln: setnz %r11b + + v7 = icmp sge v0, v1 +; check: cmpq %rcx, %rsi +; nextln: setnle %r12b +; nextln: setz %al +; nextln: cmpq %rdx, %rdi +; nextln: setnb %r13b +; nextln: andq %rax, %r13 +; nextln: orq %r12, %r13 +; nextln: andq $$1, %r13 +; nextln: setnz %r12b + + v8 = icmp ult v0, v1 +; check: cmpq %rcx, %rsi +; nextln: setb %r13b +; nextln: setz %al +; nextln: cmpq %rdx, %rdi +; nextln: setb %r14b +; nextln: andq %rax, %r14 +; nextln: orq %r13, %r14 +; nextln: andq $$1, %r14 +; nextln: setnz %r13b + + v9 = icmp ule v0, v1 +; check: cmpq %rcx, %rsi +; nextln: setb %r14b +; nextln: setz %al +; nextln: cmpq %rdx, %rdi +; nextln: setbe %bl +; nextln: andq %rax, %rbx +; nextln: orq %r14, %rbx +; nextln: andq $$1, %rbx +; nextln: setnz %r14b + + v10 = icmp ugt v0, v1 +; check: cmpq %rcx, %rsi +; nextln: setnbe %bl +; nextln: setz %r15b +; nextln: cmpq %rdx, %rdi +; nextln: setnbe %al +; nextln: andq %r15, %rax +; nextln: orq %rbx, %rax +; nextln: andq $$1, %rax +; nextln: setnz %bl + + v11 = icmp uge v0, v1 +; check: cmpq %rcx, %rsi +; nextln: setnbe %sil +; nextln: setz %cl +; nextln: cmpq %rdx, %rdi +; nextln: setnb %dil +; nextln: andq %rcx, %rdi +; nextln: orq %rsi, %rdi +; nextln: andq $$1, %rdi +; nextln: setnz %sil + + v12 = band v2, v3 + v13 = band v4, v5 + v14 = band v6, v7 + v15 = band v8, v9 + v16 = band v10, v11 + v17 = band v12, v13 + v18 = band v14, v15 + v19 = band v17, v18 + v20 = band v19, v16 + + return v20 +; check: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f10(i128) -> i32 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128): + brz v0, block1 +; check: cmpq $$0, %rdi +; nextln: setz %dil +; nextln: cmpq $$0, %rsi +; nextln: setz %sil +; nextln: andb %dil, %sil +; nextln: jnz label1; j label2 + + jump block2 + +block1: + v1 = iconst.i32 1 + return v1 + +block2: + v2 = iconst.i32 2 + return v2 + +; check: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f11(i128) -> i32 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128): + brnz v0, block1 +; check: cmpq $$0, %rdi +; nextln: setnz %dil +; nextln: cmpq $$0, %rsi +; nextln: setnz %sil +; nextln: orb %dil, %sil +; nextln: jnz label1; j label2 + jump block2 + +block1: + v1 = iconst.i32 1 + return v1 + +block2: + v2 = iconst.i32 2 + return v2 + +; check: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f12(i64) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i64): + v1 = uextend.i128 v0 + return v1 + +; nextln: movq %rdi, %rsi +; nextln: xorq %rdi, %rdi +; nextln: movq %rsi, %rax +; nextln: movq %rdi, %rdx + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f13(i64) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i64): + v1 = sextend.i128 v0 + return v1 + +; nextln: movq %rdi, %rsi +; nextln: movq %rsi, %rdi +; nextln: sarq $$63, %rdi +; nextln: movq %rsi, %rax +; nextln: movq %rdi, %rdx + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f14(i8) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i8): + v1 = sextend.i128 v0 + return v1 + +; nextln: movsbq %dil, %rsi +; nextln: movq %rsi, %rdi +; nextln: sarq $$63, %rdi +; nextln: movq %rsi, %rax +; nextln: movq %rdi, %rdx + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function 
%f15(i8) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i8): + v1 = uextend.i128 v0 + return v1 + +; nextln: movzbq %dil, %rsi +; nextln: xorq %rdi, %rdi +; nextln: movq %rsi, %rax +; nextln: movq %rdi, %rdx + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret + +} + +function %f16(i128) -> i64 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128): + v1 = ireduce.i64 v0 + return v1 + +; nextln: movq %rdi, %rax + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f17(i128) -> i8 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128): + v1 = ireduce.i8 v0 + return v1 + +; nextln: movq %rdi, %rax + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f18(b1) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: b1): + v1 = bint.i128 v0 + return v1 + +; check: movzbq %dil, %rsi +; nextln: xorq %rdi, %rdi +; nextln: movq %rsi, %rax +; nextln: movq %rdi, %rdx + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f19(i128) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128): + v1 = popcnt.i128 v0 + return v1 + +; check: movq %rsi, %rdx +; nextln: movq %rdi, %rsi +; nextln: shrq $$1, %rsi +; nextln: movabsq $$8608480567731124087, %rcx +; nextln: andq %rcx, %rsi +; nextln: movq %rdi, %rax +; nextln: subq %rsi, %rax +; nextln: shrq $$1, %rsi +; nextln: andq %rcx, %rsi +; nextln: subq %rsi, %rax +; nextln: shrq $$1, %rsi +; nextln: andq %rcx, %rsi +; nextln: subq %rsi, %rax +; nextln: movq %rax, %rsi +; nextln: shrq $$4, %rsi +; nextln: addq %rax, %rsi +; nextln: movabsq $$1085102592571150095, %rdi +; nextln: andq %rdi, %rsi +; nextln: movabsq $$72340172838076673, %rdi +; nextln: imulq %rdi, %rsi +; nextln: shrq $$56, %rsi +; nextln: movq %rdx, %rax +; nextln: shrq $$1, %rax +; nextln: movabsq $$8608480567731124087, %rcx +; nextln: andq %rcx, %rax +; nextln: movq %rdx, %rdi +; nextln: subq %rax, %rdi +; nextln: shrq $$1, %rax +; nextln: andq %rcx, %rax +; nextln: subq %rax, %rdi +; nextln: shrq $$1, %rax +; nextln: andq %rcx, %rax +; nextln: subq %rax, %rdi +; nextln: movq %rdi, %rax +; nextln: shrq $$4, %rax +; nextln: addq %rdi, %rax +; nextln: movabsq $$1085102592571150095, %rdi +; nextln: andq %rdi, %rax +; nextln: movabsq $$72340172838076673, %rdi +; nextln: imulq %rdi, %rax +; nextln: shrq $$56, %rax +; nextln: addq %rax, %rsi +; nextln: xorq %rdi, %rdi +; nextln: movq %rsi, %rax +; nextln: movq %rdi, %rdx + + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + + +function %f20(i128) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128): + v1 = bitrev.i128 v0 + return v1 + +; check: movq %rdi, %rcx +; nextln: movq %rcx, %rdi +; nextln: movabsq $$6148914691236517205, %rax +; nextln: shrq $$1, %rdi +; nextln: andq %rax, %rdi +; nextln: andq %rcx, %rax +; nextln: shlq $$1, %rax +; nextln: movq %rax, %rcx +; nextln: orq %rdi, %rcx +; nextln: movq %rcx, %rdi +; nextln: movabsq $$3689348814741910323, %rax +; nextln: shrq $$2, %rdi +; nextln: andq %rax, %rdi +; nextln: andq %rcx, %rax +; nextln: shlq $$2, %rax +; nextln: movq %rax, %rcx +; nextln: orq %rdi, %rcx +; nextln: movq %rcx, %rdi +; nextln: movabsq $$1085102592571150095, %rax +; nextln: shrq $$4, %rdi +; nextln: andq %rax, %rdi +; nextln: andq %rcx, %rax +; nextln: shlq $$4, %rax +; nextln: movq %rax, %rcx +; nextln: orq %rdi, %rcx +; nextln: movq %rcx, %rdi +; nextln: movabsq $$71777214294589695, %rax +; nextln: 
shrq $$8, %rdi +; nextln: andq %rax, %rdi +; nextln: andq %rcx, %rax +; nextln: shlq $$8, %rax +; nextln: movq %rax, %rcx +; nextln: orq %rdi, %rcx +; nextln: movq %rcx, %rdi +; nextln: movabsq $$281470681808895, %rax +; nextln: shrq $$16, %rdi +; nextln: andq %rax, %rdi +; nextln: andq %rcx, %rax +; nextln: shlq $$16, %rax +; nextln: orq %rdi, %rax +; nextln: movq %rax, %rcx +; nextln: movl $$-1, %edi +; nextln: shrq $$32, %rcx +; nextln: andq %rdi, %rcx +; nextln: andq %rax, %rdi +; nextln: shlq $$32, %rdi +; nextln: orq %rcx, %rdi +; nextln: movq %rsi, %rcx +; nextln: movq %rcx, %rsi +; nextln: movabsq $$6148914691236517205, %rax +; nextln: shrq $$1, %rsi +; nextln: andq %rax, %rsi +; nextln: andq %rcx, %rax +; nextln: shlq $$1, %rax +; nextln: movq %rax, %rcx +; nextln: orq %rsi, %rcx +; nextln: movq %rcx, %rsi +; nextln: movabsq $$3689348814741910323, %rax +; nextln: shrq $$2, %rsi +; nextln: andq %rax, %rsi +; nextln: andq %rcx, %rax +; nextln: shlq $$2, %rax +; nextln: movq %rax, %rcx +; nextln: orq %rsi, %rcx +; nextln: movq %rcx, %rsi +; nextln: movabsq $$1085102592571150095, %rax +; nextln: shrq $$4, %rsi +; nextln: andq %rax, %rsi +; nextln: andq %rcx, %rax +; nextln: shlq $$4, %rax +; nextln: movq %rax, %rcx +; nextln: orq %rsi, %rcx +; nextln: movq %rcx, %rsi +; nextln: movabsq $$71777214294589695, %rax +; nextln: shrq $$8, %rsi +; nextln: andq %rax, %rsi +; nextln: andq %rcx, %rax +; nextln: shlq $$8, %rax +; nextln: movq %rax, %rcx +; nextln: orq %rsi, %rcx +; nextln: movq %rcx, %rsi +; nextln: movabsq $$281470681808895, %rax +; nextln: shrq $$16, %rsi +; nextln: andq %rax, %rsi +; nextln: andq %rcx, %rax +; nextln: shlq $$16, %rax +; nextln: orq %rsi, %rax +; nextln: movq %rax, %rsi +; nextln: movl $$-1, %ecx +; nextln: shrq $$32, %rsi +; nextln: andq %rcx, %rsi +; nextln: andq %rax, %rcx +; nextln: shlq $$32, %rcx +; nextln: orq %rsi, %rcx +; nextln: movq %rcx, %rax +; nextln: movq %rdi, %rdx + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f21(i128, i32) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128, v1: i32): + v2 = ushr v0, v1 + return v2 + +; check: movq %rdi, %rax +; nextln: movq %rsi, %rdi +; nextln: movq %rdi, %rsi +; nextln: movq %rdx, %rcx +; nextln: shrq %cl, %rsi +; nextln: movq %rdx, %rcx +; nextln: shrq %cl, %rax +; nextln: movl $$64, %ecx +; nextln: subq %rdx, %rcx +; nextln: shlq %cl, %rdi +; nextln: orq %rax, %rdi +; nextln: xorq %rax, %rax +; nextln: xorq %rcx, %rcx +; nextln: andq $$64, %rdx +; nextln: cmovzq %rsi, %rax +; nextln: cmovzq %rdi, %rcx +; nextln: cmovnzq %rsi, %rcx +; nextln: movq %rax, %rdx +; nextln: movq %rcx, %rax + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f22(i128, i32) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128, v1: i32): + v2 = ishl v0, v1 + return v2 + +; check: movq %rsi, %rax +; nextln: movq %rdi, %rsi +; nextln: movq %rdx, %rcx +; nextln: shlq %cl, %rsi +; nextln: movq %rdx, %rcx +; nextln: shlq %cl, %rax +; nextln: movl $$64, %ecx +; nextln: subq %rdx, %rcx +; nextln: shrq %cl, %rdi +; nextln: orq %rax, %rdi +; nextln: xorq %rax, %rax +; nextln: xorq %rcx, %rcx +; nextln: andq $$64, %rdx +; nextln: cmovzq %rdi, %rcx +; nextln: cmovzq %rsi, %rax +; nextln: cmovnzq %rsi, %rcx +; nextln: movq %rcx, %rdx + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f23(i128, i32) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128, v1: i32): + v2 = sshr v0, 
v1 + return v2 + +; check: movq %rdi, %r8 +; nextln: movq %rsi, %rdi +; nextln: movq %rdi, %rsi +; nextln: movq %rdx, %rcx +; nextln: sarq %cl, %rsi +; nextln: movq %rdx, %rcx +; nextln: sarq %cl, %r8 +; nextln: movl $$64, %ecx +; nextln: subq %rdx, %rcx +; nextln: movq %rdi, %rax +; nextln: shlq %cl, %rax +; nextln: orq %r8, %rax +; nextln: sarq $$63, %rdi +; nextln: xorq %rcx, %rcx +; nextln: andq $$64, %rdx +; nextln: cmovzq %rsi, %rdi +; nextln: cmovzq %rax, %rcx +; nextln: cmovnzq %rsi, %rcx +; nextln: movq %rcx, %rax +; nextln: movq %rdi, %rdx + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f24(i128, i32) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128, v1: i32): + v2 = rotr.i128 v0, v1 + return v2 + +; check: movq %rsi, %r9 +; nextln: movq %rdx, %rcx +; nextln: shrq %cl, %r9 +; nextln: movq %rdi, %rax +; nextln: movq %rdx, %rcx +; nextln: shrq %cl, %rax +; nextln: movl $$64, %ecx +; nextln: subq %rdx, %rcx +; nextln: movq %rsi, %r10 +; nextln: shlq %cl, %r10 +; nextln: orq %rax, %r10 +; nextln: xorq %r8, %r8 +; nextln: xorq %rax, %rax +; nextln: movq %rdx, %rcx +; nextln: andq $$64, %rcx +; nextln: cmovzq %r9, %r8 +; nextln: cmovzq %r10, %rax +; nextln: cmovnzq %r9, %rax +; nextln: movl $$128, %r9d +; nextln: subq %rdx, %r9 +; nextln: movq %rdi, %rdx +; nextln: movq %r9, %rcx +; nextln: shlq %cl, %rdx +; nextln: movq %r9, %rcx +; nextln: shlq %cl, %rsi +; nextln: movl $$64, %ecx +; nextln: subq %r9, %rcx +; nextln: movq %rdi, %r10 +; nextln: shrq %cl, %r10 +; nextln: orq %rsi, %r10 +; nextln: xorq %rsi, %rsi +; nextln: xorq %rdi, %rdi +; nextln: andq $$64, %r9 +; nextln: cmovzq %r10, %rdi +; nextln: cmovzq %rdx, %rsi +; nextln: cmovnzq %rdx, %rdi +; nextln: orq %rax, %rsi +; nextln: orq %r8, %rdi +; nextln: movq %rsi, %rax +; nextln: movq %rdi, %rdx + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f25(i128, i32) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128, v1: i32): + v2 = rotl.i128 v0, v1 + return v2 + +; check: movq %rdi, %r9 +; nextln: movq %rdx, %rcx +; nextln: shlq %cl, %r9 +; nextln: movq %rsi, %rax +; nextln: movq %rdx, %rcx +; nextln: shlq %cl, %rax +; nextln: movl $$64, %ecx +; nextln: subq %rdx, %rcx +; nextln: movq %rdi, %r10 +; nextln: shrq %cl, %r10 +; nextln: orq %rax, %r10 +; nextln: xorq %r8, %r8 +; nextln: xorq %rax, %rax +; nextln: movq %rdx, %rcx +; nextln: andq $$64, %rcx +; nextln: cmovzq %r10, %rax +; nextln: cmovzq %r9, %r8 +; nextln: cmovnzq %r9, %rax +; nextln: movl $$128, %r9d +; nextln: subq %rdx, %r9 +; nextln: movq %rsi, %rdx +; nextln: movq %r9, %rcx +; nextln: shrq %cl, %rdx +; nextln: movq %r9, %rcx +; nextln: shrq %cl, %rdi +; nextln: movl $$64, %ecx +; nextln: subq %r9, %rcx +; nextln: shlq %cl, %rsi +; nextln: orq %rdi, %rsi +; nextln: xorq %rdi, %rdi +; nextln: xorq %rcx, %rcx +; nextln: andq $$64, %r9 +; nextln: cmovzq %rdx, %rdi +; nextln: cmovzq %rsi, %rcx +; nextln: cmovnzq %rdx, %rcx +; nextln: orq %r8, %rcx +; nextln: orq %rax, %rdi +; nextln: movq %rcx, %rax +; nextln: movq %rdi, %rdx + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f26(i128, i64) { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128, v1: i64): + store.i128 v0, v1 + return + +; check: movq %rdi, 0(%rdx) +; nextln: movq %rsi, 8(%rdx) + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f27(i64) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i64): + v1 
= load.i128 v0 + return v1 + +; check: movq 0(%rdi), %rsi +; nextln: movq 8(%rdi), %rdi +; nextln: movq %rsi, %rax +; nextln: movq %rdi, %rdx + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f28(i128, b1) -> i128 { +block0(v0: i128, v1: b1): + v2 = iconst.i128 0 + brnz v1, block1(v2) + jump block2(v2) + +block1(v3: i128): + v4 = iconst.i128 1 + v5 = iadd.i128 v3, v4 + return v5 + +block2(v6: i128): + v7 = iconst.i128 2 + v8 = iadd.i128 v6, v7 + return v8 + +; check: pushq %rbp +; nextln: movq %rsp, %rbp +; nextln: testb $$1, %dl +; nextln: jnz label1; j label2 +; check: Block 1: +; check: movl $$0, %esi +; nextln: movl $$0, %edi +; nextln: movl $$1, %eax +; nextln: movl $$0, %ecx +; nextln: addq %rax, %rsi +; nextln: adcq %rcx, %rdi +; nextln: movq %rsi, %rax +; nextln: movq %rdi, %rdx +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +; check: Block 2: +; check: movl $$0, %esi +; nextln: movl $$0, %edi +; nextln: movl $$2, %eax +; nextln: movl $$0, %ecx +; nextln: addq %rax, %rsi +; nextln: adcq %rcx, %rdi +; nextln: movq %rsi, %rax +; nextln: movq %rdi, %rdx +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret + +} + +function %f29(i128, i128, i64, i128, i128, i128) -> i128 { + +block0(v0: i128, v1: i128, v2: i64, v3: i128, v4: i128, v5: i128): + v6 = iadd.i128 v0, v1 + v7 = uextend.i128 v2 + v8 = iadd.i128 v3, v7 + v9 = iadd.i128 v4, v5 + v10 = iadd.i128 v6, v8 + v11 = iadd.i128 v9, v10 + return v11 + +; check: movq %rsp, %rbp +; nextln: subq $$16, %rsp +; nextln: movq %r12, 0(%rsp) +; nextln: movq %r13, 8(%rsp) +; nextln: virtual_sp_offset_adjust 16 +; nextln: movq 16(%rbp), %r9 +; nextln: movq 24(%rbp), %r10 +; nextln: movq 32(%rbp), %r12 +; nextln: movq 40(%rbp), %r11 +; nextln: movq 48(%rbp), %rax +; nextln: movq 56(%rbp), %r13 +; nextln: addq %rdx, %rdi +; nextln: adcq %rcx, %rsi +; nextln: xorq %rcx, %rcx +; nextln: addq %r8, %r9 +; nextln: adcq %rcx, %r10 +; nextln: addq %rax, %r12 +; nextln: adcq %r13, %r11 +; nextln: addq %r9, %rdi +; nextln: adcq %r10, %rsi +; nextln: addq %rdi, %r12 +; nextln: adcq %rsi, %r11 +; nextln: movq %r12, %rax +; nextln: movq %r11, %rdx +; nextln: movq 0(%rsp), %r12 +; nextln: movq 8(%rsp), %r13 +; nextln: addq $$16, %rsp +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret + +} + +function %f30(i128) -> i128, i128, i128, i64, i128, i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128): + v1 = ireduce.i64 v0 + return v0, v0, v0, v1, v0, v0 + +; likely to change with regalloc -- just check the stores into the retval area: + +; check: movq %r8, 0(%rsi) +; nextln: movq %r9, 8(%rsi) +; nextln: movq %r10, 16(%rsi) +; nextln: movq %r11, 24(%rsi) +; nextln: movq %r12, 32(%rsi) +; nextln: movq %r13, 48(%rsi) +; nextln: movq %r14, 56(%rsi) +; nextln: movq %rdi, 64(%rsi) +; nextln: movq %rbx, 72(%rsi) + +} + +function %f31(i128, i128) -> i128, i128 { + fn0 = %g(i128, i128) -> i128, i128 +block0(v0: i128, v1: i128): + v2, v3 = call fn0(v0, v1) + return v2, v3 + +; check: pushq %rbp +; nextln: movq %rsp, %rbp +; nextln: subq $$16, %rsp +; nextln: movq %r12, 0(%rsp) +; nextln: virtual_sp_offset_adjust 8 +; nextln: movq %r8, %r12 +; nextln: subq $$16, %rsp +; nextln: virtual_sp_offset_adjust 16 +; nextln: lea 0(%rsp), %r8 +; nextln: load_ext_name %g+0, %rax +; nextln: call *%rax +; nextln: movq 0(%rsp), %rsi +; nextln: movq 8(%rsp), %rdi +; nextln: addq $$16, %rsp +; nextln: virtual_sp_offset_adjust -16 +; nextln: movq %rsi, 0(%r12) +; nextln: movq %rdi, 8(%r12) +; nextln: movq 
0(%rsp), %r12 +; nextln: addq $$16, %rsp +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret + +} + +function %f32(i128) -> i128 { +block0(v0: i128): + v1 = clz.i128 v0 + return v1 + +; check: pushq %rbp +; nextln: movq %rsp, %rbp +; nextln: movabsq $$-1, %rcx +; nextln: bsrq %rsi, %rax +; nextln: cmovzq %rcx, %rax +; nextln: movl $$63, %esi +; nextln: subq %rax, %rsi +; nextln: movabsq $$-1, %rcx +; nextln: bsrq %rdi, %rax +; nextln: cmovzq %rcx, %rax +; nextln: movl $$63, %edi +; nextln: subq %rax, %rdi +; nextln: addq $$64, %rdi +; nextln: cmpq $$64, %rsi +; nextln: cmovnzq %rsi, %rdi +; nextln: xorq %rsi, %rsi +; nextln: movq %rdi, %rax +; nextln: movq %rsi, %rdx +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret + +} + +function %f33(i128) -> i128 { +block0(v0: i128): + v1 = ctz.i128 v0 + return v1 +} + +; check: pushq %rbp +; nextln: movq %rsp, %rbp +; nextln: movq %rsi, %rax +; nextln: movl $$64, %ecx +; nextln: bsfq %rdi, %rsi +; nextln: cmovzq %rcx, %rsi +; nextln: movl $$64, %ecx +; nextln: bsfq %rax, %rdi +; nextln: cmovzq %rcx, %rdi +; nextln: addq $$64, %rdi +; nextln: cmpq $$64, %rsi +; nextln: cmovzq %rdi, %rsi +; nextln: xorq %rdi, %rdi +; nextln: movq %rsi, %rax +; nextln: movq %rdi, %rdx +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret diff --git a/cranelift/filetests/filetests/isa/x64/select-i128.clif b/cranelift/filetests/filetests/isa/x64/select-i128.clif new file mode 100644 index 0000000000..3492a71997 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/select-i128.clif @@ -0,0 +1,29 @@ +test compile +target x86_64 +feature "experimental_x64" + +function %f0(i32, i128, i128) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i32, v1: i128, v2: i128): + + v3 = iconst.i32 42 + v4 = icmp.i32 eq v0, v3 +; nextln: movl $$42, %eax +; nextln: cmpl %eax, %edi + + v5 = select.i128 v4, v1, v2 +; nextln: cmovzq %rsi, %rcx +; nextln: cmovzq %rdx, %r8 + + return v5 +; nextln: movq %rcx, %rax +; nextln: movq %r8, %rdx + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret + +} + diff --git a/cranelift/filetests/filetests/isa/x64/shift-i128-run.clif b/cranelift/filetests/filetests/isa/x64/shift-i128-run.clif new file mode 100644 index 0000000000..37bc4667e7 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/shift-i128-run.clif @@ -0,0 +1,106 @@ +test run +target x86_64 +feature "experimental_x64" + +function %ishl1() -> b1 { +block0: + v0 = iconst.i64 0x01010101_01010101 + v1 = iconcat v0, v0 + v2 = iconst.i32 2 + v3 = ishl.i128 v1, v2 + v4 = iconst.i64 0x04040404_04040404 + v5 = iconcat v4, v4 + v6 = icmp eq v3, v5 + return v6 +} +; run + +function %ishl2() -> b1 { +block0: + v0 = iconst.i64 0x01010101_01010101 + v1 = iconst.i64 0x01010101_01010101 + v2 = iconcat v0, v1 + v3 = iconst.i32 9 + v4 = ishl.i128 v2, v3 + v5 = iconst.i64 0x02020202_02020200 + v6 = iconst.i64 0x02020202_02020202 + v7 = iconcat v5, v6 + v8 = icmp eq v4, v7 + return v8 +} +; run + +function %ishl3() -> b1 { +block0: + v0 = iconst.i64 0x01010101_01010101 + v1 = iconst.i64 0xffffffff_ffffffff + v2 = iconcat v0, v1 + v3 = iconst.i32 66 + v4 = ishl.i128 v2, v3 + v5 = iconst.i64 0x00000000_00000000 + v6 = iconst.i64 0x04040404_04040404 + v7 = iconcat v5, v6 + v8 = icmp eq v4, v7 + return v8 +} +; run + +function %ushr1() -> b1 { +block0: + v0 = iconst.i64 0x01010101_01010101 + v1 = iconst.i64 0x01010101_01010101 + v2 = iconcat v0, v1 + v3 = iconst.i32 2 + v4 = ushr.i128 v2, v3 + v5 = iconst.i64 0x40404040_40404040 + v6 = iconst.i64 
0x00404040_40404040 + v7 = iconcat v5, v6 + v8 = icmp eq v4, v7 + return v8 +} +; run + +function %ushr2() -> b1 { +block0: + v0 = iconst.i64 0x01010101_01010101 + v1 = iconst.i64 0x01010101_01010101 + v2 = iconcat v0, v1 + v3 = iconst.i32 66 + v4 = ushr.i128 v2, v3 + v5 = iconst.i64 0x00404040_40404040 + v6 = iconst.i64 0x00000000_00000000 + v7 = iconcat v5, v6 + v8 = icmp eq v4, v7 + return v8 +} +; run + +function %sshr1() -> b1 { +block0: + v0 = iconst.i64 0x01010101_01010101 + v1 = iconst.i64 0x81010101_01010101 + v2 = iconcat v0, v1 + v3 = iconst.i32 2 + v4 = sshr.i128 v2, v3 + v5 = iconst.i64 0x40404040_40404040 + v6 = iconst.i64 0xe0404040_40404040 + v7 = iconcat v5, v6 + v8 = icmp eq v4, v7 + return v8 +} +; run + +function %sshr2() -> b1 { +block0: + v0 = iconst.i64 0x12345678_9abcdef0 + v1 = iconst.i64 0x80101010_10101010 + v2 = iconcat v0, v1 + v3 = iconst.i32 66 + v4 = sshr.i128 v2, v3 + v5 = iconst.i64 0xe0040404_04040404 + v6 = iconst.i64 0xffffffff_ffffffff + v7 = iconcat v5, v6 + v8 = icmp eq v4, v7 + return v8 +} +; run
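The run tests above exercise the two-word shift decomposition visible in the i128.clif expected code: amounts below 64 combine the shifted word with the bits carried across from the other word, amounts of 64 and above select the cross-word result via cmov, and arithmetic right shifts fill the upper word with copies of the sign bit. A standalone Rust reference model (illustrative only; shift amounts are taken modulo 128, as in the lowering) that reproduces some of the cases checked above:

fn ishl128(lo: u64, hi: u64, amt: u32) -> (u64, u64) {
    let amt = amt & 127;
    if amt == 0 {
        (lo, hi)
    } else if amt < 64 {
        // High word takes its own shifted bits plus the bits carried out of the low word.
        (lo << amt, (hi << amt) | (lo >> (64 - amt)))
    } else {
        // 64..127: the low word becomes zero; the shifted low word lands in the high position.
        (0, lo << (amt - 64))
    }
}

fn ushr128(lo: u64, hi: u64, amt: u32) -> (u64, u64) {
    let amt = amt & 127;
    if amt == 0 {
        (lo, hi)
    } else if amt < 64 {
        ((lo >> amt) | (hi << (64 - amt)), hi >> amt)
    } else {
        (hi >> (amt - 64), 0)
    }
}

fn sshr128(lo: u64, hi: u64, amt: u32) -> (u64, u64) {
    let amt = amt & 127;
    // The upper word of the result is sign-filled when shifting by 64 or more.
    let sign = ((hi as i64) >> 63) as u64;
    if amt == 0 {
        (lo, hi)
    } else if amt < 64 {
        ((lo >> amt) | (hi << (64 - amt)), ((hi as i64) >> amt) as u64)
    } else {
        (((hi as i64) >> (amt - 64)) as u64, sign)
    }
}

fn main() {
    // Mirrors %ishl3: shifting lo=0x0101..., hi=0xffff... left by 66.
    assert_eq!(
        ishl128(0x0101_0101_0101_0101, 0xffff_ffff_ffff_ffff, 66),
        (0, 0x0404_0404_0404_0404)
    );
    // Mirrors %sshr1: arithmetic right shift by 2 with a negative high word.
    assert_eq!(
        sshr128(0x0101_0101_0101_0101, 0x8101_0101_0101_0101, 2),
        (0x4040_4040_4040_4040, 0xe040_4040_4040_4040)
    );
    // Mirrors %ushr2: logical right shift by 66.
    assert_eq!(
        ushr128(0x0101_0101_0101_0101, 0x0101_0101_0101_0101, 66),
        (0x0040_4040_4040_4040, 0)
    );
}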