Migrate clz, ctz, popcnt, bitrev, is_null, is_invalid on x64 to ISLE. (#3848)

2022-02-28 09:45:13 -08:00
parent 2a6969d2bd
commit 24f145cd1e
19 changed files with 2812 additions and 1990 deletions
--- a/cranelift/codegen/src/isa/aarch64/lower/isle/generated_code.manifest
+++ b/cranelift/codegen/src/isa/aarch64/lower/isle/generated_code.manifest
@@ -1,4 +1,4 @@
 src/clif.isle 9ea75a6f790b5c03
-src/prelude.isle 9830498351ddf6a3
+src/prelude.isle 6b0160bfcac86902
 src/isa/aarch64/inst.isle 3678d0a37bdb4cff
 src/isa/aarch64/lower.isle 90accbfcadaea46d
--- a/cranelift/codegen/src/isa/aarch64/lower/isle/generated_code.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower/isle/generated_code.rs
@@ -39,8 +39,14 @@ pub trait Context {
    fn u8_as_u64(&mut self, arg0: u8) -> u64;
    fn u16_as_u64(&mut self, arg0: u16) -> u64;
    fn u32_as_u64(&mut self, arg0: u32) -> u64;
+    fn i64_as_u64(&mut self, arg0: i64) -> u64;
+    fn u64_add(&mut self, arg0: u64, arg1: u64) -> u64;
+    fn u64_sub(&mut self, arg0: u64, arg1: u64) -> u64;
+    fn u64_and(&mut self, arg0: u64, arg1: u64) -> u64;
    fn ty_bits(&mut self, arg0: Type) -> u8;
    fn ty_bits_u16(&mut self, arg0: Type) -> u16;
+    fn ty_bits_u64(&mut self, arg0: Type) -> u64;
+    fn ty_mask(&mut self, arg0: Type) -> u64;
    fn ty_bytes(&mut self, arg0: Type) -> u16;
    fn lane_type(&mut self, arg0: Type) -> Type;
    fn fits_in_16(&mut self, arg0: Type) -> Option<Type>;
@@ -110,13 +116,13 @@ pub trait Context {
    fn rotr_opposite_amount(&mut self, arg0: Type, arg1: ImmShift) -> ImmShift;
 }

-/// Internal type SideEffectNoResult: defined at src/prelude.isle line 363.
+/// Internal type SideEffectNoResult: defined at src/prelude.isle line 385.
 #[derive(Clone, Debug)]
 pub enum SideEffectNoResult {
    Inst { inst: MInst },
 }

-/// Internal type ProducesFlags: defined at src/prelude.isle line 385.
+/// Internal type ProducesFlags: defined at src/prelude.isle line 407.
 #[derive(Clone, Debug)]
 pub enum ProducesFlags {
    ProducesFlagsSideEffect { inst: MInst },
@@ -124,7 +130,7 @@ pub enum ProducesFlags {
    ProducesFlagsReturnsResultWithConsumer { inst: MInst, result: Reg },
 }

-/// Internal type ConsumesFlags: defined at src/prelude.isle line 396.
+/// Internal type ConsumesFlags: defined at src/prelude.isle line 418.
 #[derive(Clone, Debug)]
 pub enum ConsumesFlags {
    ConsumesFlagsReturnsResultWithProducer {
@@ -140,6 +146,13 @@ pub enum ConsumesFlags {
        inst2: MInst,
        result: ValueRegs,
    },
+    ConsumesFlagsFourTimesReturnsValueRegs {
+        inst1: MInst,
+        inst2: MInst,
+        inst3: MInst,
+        inst4: MInst,
+        result: ValueRegs,
+    },
 }

 /// Internal type MInst: defined at src/isa/aarch64/inst.isle line 2.
@@ -1050,7 +1063,7 @@ pub fn constructor_side_effect<C: Context>(
        inst: ref pattern1_0,
    } = pattern0_0
    {
-        // Rule at src/prelude.isle line 368.
+        // Rule at src/prelude.isle line 390.
        let expr0_0 = C::emit(ctx, pattern1_0);
        let expr1_0 = C::output_none(ctx);
        return Some(expr1_0);
@@ -1068,7 +1081,7 @@ pub fn constructor_safepoint<C: Context>(
        inst: ref pattern1_0,
    } = pattern0_0
    {
-        // Rule at src/prelude.isle line 374.
+        // Rule at src/prelude.isle line 396.
        let expr0_0 = C::emit_safepoint(ctx, pattern1_0);
        let expr1_0 = C::output_none(ctx);
        return Some(expr1_0);
@@ -1076,6 +1089,55 @@ pub fn constructor_safepoint<C: Context>(
    return None;
 }

+// Generated as internal constructor for term produces_flags_get_reg.
+pub fn constructor_produces_flags_get_reg<C: Context>(
+    ctx: &mut C,
+    arg0: &ProducesFlags,
+) -> Option<Reg> {
+    let pattern0_0 = arg0;
+    if let &ProducesFlags::ProducesFlagsReturnsReg {
+        inst: ref pattern1_0,
+        result: pattern1_1,
+    } = pattern0_0
+    {
+        // Rule at src/prelude.isle line 434.
+        return Some(pattern1_1);
+    }
+    return None;
+}
+
+// Generated as internal constructor for term produces_flags_ignore.
+pub fn constructor_produces_flags_ignore<C: Context>(
+    ctx: &mut C,
+    arg0: &ProducesFlags,
+) -> Option<ProducesFlags> {
+    let pattern0_0 = arg0;
+    match pattern0_0 {
+        &ProducesFlags::ProducesFlagsReturnsReg {
+            inst: ref pattern1_0,
+            result: pattern1_1,
+        } => {
+            // Rule at src/prelude.isle line 439.
+            let expr0_0 = ProducesFlags::ProducesFlagsSideEffect {
+                inst: pattern1_0.clone(),
+            };
+            return Some(expr0_0);
+        }
+        &ProducesFlags::ProducesFlagsReturnsResultWithConsumer {
+            inst: ref pattern1_0,
+            result: pattern1_1,
+        } => {
+            // Rule at src/prelude.isle line 441.
+            let expr0_0 = ProducesFlags::ProducesFlagsSideEffect {
+                inst: pattern1_0.clone(),
+            };
+            return Some(expr0_0);
+        }
+        _ => {}
+    }
+    return None;
+}
+
 // Generated as internal constructor for term consumes_flags_concat.
 pub fn constructor_consumes_flags_concat<C: Context>(
    ctx: &mut C,
@@ -1094,7 +1156,7 @@ pub fn constructor_consumes_flags_concat<C: Context>(
            result: pattern3_1,
        } = pattern2_0
        {
-            // Rule at src/prelude.isle line 408.
+            // Rule at src/prelude.isle line 448.
            let expr0_0 = C::value_regs(ctx, pattern1_1, pattern3_1);
            let expr1_0 = ConsumesFlags::ConsumesFlagsTwiceReturnsValueRegs {
                inst1: pattern1_0.clone(),
@@ -1124,7 +1186,7 @@ pub fn constructor_with_flags<C: Context>(
                    inst: ref pattern3_0,
                    result: pattern3_1,
                } => {
-                    // Rule at src/prelude.isle line 433.
+                    // Rule at src/prelude.isle line 473.
                    let expr0_0 = C::emit(ctx, pattern1_0);
                    let expr1_0 = C::emit(ctx, pattern3_0);
                    let expr2_0 = C::value_reg(ctx, pattern3_1);
@@ -1135,12 +1197,27 @@ pub fn constructor_with_flags<C: Context>(
                    inst2: ref pattern3_1,
                    result: pattern3_2,
                } => {
-                    // Rule at src/prelude.isle line 439.
+                    // Rule at src/prelude.isle line 479.
                    let expr0_0 = C::emit(ctx, pattern1_0);
-                    let expr1_0 = C::emit(ctx, pattern3_1);
-                    let expr2_0 = C::emit(ctx, pattern3_0);
+                    let expr1_0 = C::emit(ctx, pattern3_0);
+                    let expr2_0 = C::emit(ctx, pattern3_1);
                    return Some(pattern3_2);
                }
+                &ConsumesFlags::ConsumesFlagsFourTimesReturnsValueRegs {
+                    inst1: ref pattern3_0,
+                    inst2: ref pattern3_1,
+                    inst3: ref pattern3_2,
+                    inst4: ref pattern3_3,
+                    result: pattern3_4,
+                } => {
+                    // Rule at src/prelude.isle line 491.
+                    let expr0_0 = C::emit(ctx, pattern1_0);
+                    let expr1_0 = C::emit(ctx, pattern3_0);
+                    let expr2_0 = C::emit(ctx, pattern3_1);
+                    let expr3_0 = C::emit(ctx, pattern3_2);
+                    let expr4_0 = C::emit(ctx, pattern3_3);
+                    return Some(pattern3_4);
+                }
                _ => {}
            }
        }
@@ -1154,7 +1231,7 @@ pub fn constructor_with_flags<C: Context>(
                result: pattern3_1,
            } = pattern2_0
            {
-                // Rule at src/prelude.isle line 427.
+                // Rule at src/prelude.isle line 467.
                let expr0_0 = C::emit(ctx, pattern1_0);
                let expr1_0 = C::emit(ctx, pattern3_0);
                let expr2_0 = C::value_regs(ctx, pattern1_1, pattern3_1);
@@ -1174,7 +1251,7 @@ pub fn constructor_with_flags_reg<C: Context>(
 ) -> Option<Reg> {
    let pattern0_0 = arg0;
    let pattern1_0 = arg1;
-    // Rule at src/prelude.isle line 452.
+    // Rule at src/prelude.isle line 508.
    let expr0_0 = constructor_with_flags(ctx, pattern0_0, pattern1_0)?;
    let expr1_0: usize = 0;
    let expr2_0 = C::value_regs_get(ctx, expr0_0, expr1_0);
--- a/cranelift/codegen/src/isa/s390x/lower/isle/generated_code.manifest
+++ b/cranelift/codegen/src/isa/s390x/lower/isle/generated_code.manifest
@@ -1,4 +1,4 @@
 src/clif.isle 9ea75a6f790b5c03
-src/prelude.isle 9830498351ddf6a3
+src/prelude.isle 6b0160bfcac86902
 src/isa/s390x/inst.isle d91a16074ab186a8
 src/isa/s390x/lower.isle 1cc5a12adc8c75f9
--- a/cranelift/codegen/src/isa/s390x/lower/isle/generated_code.rs
+++ b/cranelift/codegen/src/isa/s390x/lower/isle/generated_code.rs
@@ -39,8 +39,14 @@ pub trait Context {
    fn u8_as_u64(&mut self, arg0: u8) -> u64;
    fn u16_as_u64(&mut self, arg0: u16) -> u64;
    fn u32_as_u64(&mut self, arg0: u32) -> u64;
+    fn i64_as_u64(&mut self, arg0: i64) -> u64;
+    fn u64_add(&mut self, arg0: u64, arg1: u64) -> u64;
+    fn u64_sub(&mut self, arg0: u64, arg1: u64) -> u64;
+    fn u64_and(&mut self, arg0: u64, arg1: u64) -> u64;
    fn ty_bits(&mut self, arg0: Type) -> u8;
    fn ty_bits_u16(&mut self, arg0: Type) -> u16;
+    fn ty_bits_u64(&mut self, arg0: Type) -> u64;
+    fn ty_mask(&mut self, arg0: Type) -> u64;
    fn ty_bytes(&mut self, arg0: Type) -> u16;
    fn lane_type(&mut self, arg0: Type) -> Type;
    fn fits_in_16(&mut self, arg0: Type) -> Option<Type>;
@@ -144,13 +150,13 @@ pub trait Context {
    fn same_reg(&mut self, arg0: Reg, arg1: WritableReg) -> Option<()>;
 }

-/// Internal type SideEffectNoResult: defined at src/prelude.isle line 363.
+/// Internal type SideEffectNoResult: defined at src/prelude.isle line 385.
 #[derive(Clone, Debug)]
 pub enum SideEffectNoResult {
    Inst { inst: MInst },
 }

-/// Internal type ProducesFlags: defined at src/prelude.isle line 385.
+/// Internal type ProducesFlags: defined at src/prelude.isle line 407.
 #[derive(Clone, Debug)]
 pub enum ProducesFlags {
    ProducesFlagsSideEffect { inst: MInst },
@@ -158,7 +164,7 @@ pub enum ProducesFlags {
    ProducesFlagsReturnsResultWithConsumer { inst: MInst, result: Reg },
 }

-/// Internal type ConsumesFlags: defined at src/prelude.isle line 396.
+/// Internal type ConsumesFlags: defined at src/prelude.isle line 418.
 #[derive(Clone, Debug)]
 pub enum ConsumesFlags {
    ConsumesFlagsReturnsResultWithProducer {
@@ -174,6 +180,13 @@ pub enum ConsumesFlags {
        inst2: MInst,
        result: ValueRegs,
    },
+    ConsumesFlagsFourTimesReturnsValueRegs {
+        inst1: MInst,
+        inst2: MInst,
+        inst3: MInst,
+        inst4: MInst,
+        result: ValueRegs,
+    },
 }

 /// Internal type MInst: defined at src/isa/s390x/inst.isle line 2.
@@ -941,7 +954,7 @@ pub fn constructor_side_effect<C: Context>(
        inst: ref pattern1_0,
    } = pattern0_0
    {
-        // Rule at src/prelude.isle line 368.
+        // Rule at src/prelude.isle line 390.
        let expr0_0 = C::emit(ctx, pattern1_0);
        let expr1_0 = C::output_none(ctx);
        return Some(expr1_0);
@@ -959,7 +972,7 @@ pub fn constructor_safepoint<C: Context>(
        inst: ref pattern1_0,
    } = pattern0_0
    {
-        // Rule at src/prelude.isle line 374.
+        // Rule at src/prelude.isle line 396.
        let expr0_0 = C::emit_safepoint(ctx, pattern1_0);
        let expr1_0 = C::output_none(ctx);
        return Some(expr1_0);
@@ -967,6 +980,55 @@ pub fn constructor_safepoint<C: Context>(
    return None;
 }

+// Generated as internal constructor for term produces_flags_get_reg.
+pub fn constructor_produces_flags_get_reg<C: Context>(
+    ctx: &mut C,
+    arg0: &ProducesFlags,
+) -> Option<Reg> {
+    let pattern0_0 = arg0;
+    if let &ProducesFlags::ProducesFlagsReturnsReg {
+        inst: ref pattern1_0,
+        result: pattern1_1,
+    } = pattern0_0
+    {
+        // Rule at src/prelude.isle line 434.
+        return Some(pattern1_1);
+    }
+    return None;
+}
+
+// Generated as internal constructor for term produces_flags_ignore.
+pub fn constructor_produces_flags_ignore<C: Context>(
+    ctx: &mut C,
+    arg0: &ProducesFlags,
+) -> Option<ProducesFlags> {
+    let pattern0_0 = arg0;
+    match pattern0_0 {
+        &ProducesFlags::ProducesFlagsReturnsReg {
+            inst: ref pattern1_0,
+            result: pattern1_1,
+        } => {
+            // Rule at src/prelude.isle line 439.
+            let expr0_0 = ProducesFlags::ProducesFlagsSideEffect {
+                inst: pattern1_0.clone(),
+            };
+            return Some(expr0_0);
+        }
+        &ProducesFlags::ProducesFlagsReturnsResultWithConsumer {
+            inst: ref pattern1_0,
+            result: pattern1_1,
+        } => {
+            // Rule at src/prelude.isle line 441.
+            let expr0_0 = ProducesFlags::ProducesFlagsSideEffect {
+                inst: pattern1_0.clone(),
+            };
+            return Some(expr0_0);
+        }
+        _ => {}
+    }
+    return None;
+}
+
 // Generated as internal constructor for term consumes_flags_concat.
 pub fn constructor_consumes_flags_concat<C: Context>(
    ctx: &mut C,
@@ -985,7 +1047,7 @@ pub fn constructor_consumes_flags_concat<C: Context>(
            result: pattern3_1,
        } = pattern2_0
        {
-            // Rule at src/prelude.isle line 408.
+            // Rule at src/prelude.isle line 448.
            let expr0_0 = C::value_regs(ctx, pattern1_1, pattern3_1);
            let expr1_0 = ConsumesFlags::ConsumesFlagsTwiceReturnsValueRegs {
                inst1: pattern1_0.clone(),
@@ -1015,7 +1077,7 @@ pub fn constructor_with_flags<C: Context>(
                    inst: ref pattern3_0,
                    result: pattern3_1,
                } => {
-                    // Rule at src/prelude.isle line 433.
+                    // Rule at src/prelude.isle line 473.
                    let expr0_0 = C::emit(ctx, pattern1_0);
                    let expr1_0 = C::emit(ctx, pattern3_0);
                    let expr2_0 = C::value_reg(ctx, pattern3_1);
@@ -1026,12 +1088,27 @@ pub fn constructor_with_flags<C: Context>(
                    inst2: ref pattern3_1,
                    result: pattern3_2,
                } => {
-                    // Rule at src/prelude.isle line 439.
+                    // Rule at src/prelude.isle line 479.
                    let expr0_0 = C::emit(ctx, pattern1_0);
-                    let expr1_0 = C::emit(ctx, pattern3_1);
-                    let expr2_0 = C::emit(ctx, pattern3_0);
+                    let expr1_0 = C::emit(ctx, pattern3_0);
+                    let expr2_0 = C::emit(ctx, pattern3_1);
                    return Some(pattern3_2);
                }
+                &ConsumesFlags::ConsumesFlagsFourTimesReturnsValueRegs {
+                    inst1: ref pattern3_0,
+                    inst2: ref pattern3_1,
+                    inst3: ref pattern3_2,
+                    inst4: ref pattern3_3,
+                    result: pattern3_4,
+                } => {
+                    // Rule at src/prelude.isle line 491.
+                    let expr0_0 = C::emit(ctx, pattern1_0);
+                    let expr1_0 = C::emit(ctx, pattern3_0);
+                    let expr2_0 = C::emit(ctx, pattern3_1);
+                    let expr3_0 = C::emit(ctx, pattern3_2);
+                    let expr4_0 = C::emit(ctx, pattern3_3);
+                    return Some(pattern3_4);
+                }
                _ => {}
            }
        }
@@ -1045,7 +1122,7 @@ pub fn constructor_with_flags<C: Context>(
                result: pattern3_1,
            } = pattern2_0
            {
-                // Rule at src/prelude.isle line 427.
+                // Rule at src/prelude.isle line 467.
                let expr0_0 = C::emit(ctx, pattern1_0);
                let expr1_0 = C::emit(ctx, pattern3_0);
                let expr2_0 = C::value_regs(ctx, pattern1_1, pattern3_1);
@@ -1065,7 +1142,7 @@ pub fn constructor_with_flags_reg<C: Context>(
 ) -> Option<Reg> {
    let pattern0_0 = arg0;
    let pattern1_0 = arg1;
-    // Rule at src/prelude.isle line 452.
+    // Rule at src/prelude.isle line 508.
    let expr0_0 = constructor_with_flags(ctx, pattern0_0, pattern1_0)?;
    let expr1_0: usize = 0;
    let expr2_0 = C::value_regs_get(ctx, expr0_0, expr1_0);
--- a/cranelift/codegen/src/isa/x64/inst.isle
+++ b/cranelift/codegen/src/isa/x64/inst.isle
@@ -159,15 +159,6 @@
              (alternative Gpr)
              (dst WritableGpr))

-       ;; GPR conditional move with the `OR` of two conditions; overwrites
-       ;; the destination register.
-       (CmoveOr (size OperandSize)
-                (cc1 CC)
-                (cc2 CC)
-                (consequent GprMem)
-                (alternative Gpr)
-                (dst WritableGpr))
-
       ;; XMM conditional move; overwrites the destination register.
       (XmmCmove (size OperandSize)
                 (cc CC)
@@ -175,15 +166,6 @@
                 (alternative Xmm)
                 (dst WritableXmm))

-       ;; XMM conditional move with the `OR` of two conditions; overwrites
-       ;; the destination register.
-       (XmmCmoveOr (size OperandSize)
-                   (cc1 CC)
-                   (cc2 CC)
-                   (consequent XmmMem)
-                   (alternative Xmm)
-                   (dst WritableXmm))
-
       ;; =========================================
       ;; Stack manipulation.

@@ -1074,6 +1056,18 @@
 (decl avx512f_enabled () Type)
 (extern extractor avx512f_enabled avx512f_enabled)

+(decl avx512bitalg_enabled () Type)
+(extern extractor avx512bitalg_enabled avx512bitalg_enabled)
+
+(decl use_lzcnt () Type)
+(extern extractor use_lzcnt use_lzcnt)
+
+(decl use_bmi1 () Type)
+(extern extractor use_bmi1 use_bmi1)
+
+(decl use_popcnt () Type)
+(extern extractor use_popcnt use_popcnt)
+
 ;;;; Helpers for Merging and Sinking Immediates/Loads  ;;;;;;;;;;;;;;;;;;;;;;;;;

 ;; Extract a constant `Imm8Reg.Imm8` from a value operand.
@@ -1266,6 +1260,13 @@
      (xmm_unary_rm_r (SseOpcode.Movdqu)
                      addr))

+;; Load a constant into an XMM register.
+(decl xmm_load_const (Type VCodeConstant) Xmm)
+(rule (xmm_load_const ty const)
+      (let ((dst WritableXmm (temp_writable_xmm))
+            (_ Unit (emit (MInst.XmmLoadConst const dst ty))))
+        dst))
+
 ;;;; Instruction Constructors ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;
 ;; These constructors create SSA-style `MInst`s. It is their responsibility to
@@ -1398,6 +1399,13 @@
                  (imm $I64 bits)
                  (OperandSize.Size64)))

+;; Helper for emitting immediates with an `i64` value. Note that
+;; integer constants in ISLE are always parsed as `i64`s; this enables
+;; negative numbers to be used as immediates.
+(decl imm_i64 (Type i64) Reg)
+(rule (imm_i64 ty value)
+      (imm ty (i64_as_u64 value)))
+
 (decl nonzero_u64_fits_in_u32 (u64) u64)
 (extern extractor nonzero_u64_fits_in_u32 nonzero_u64_fits_in_u32)

@@ -1504,6 +1512,11 @@
 (rule (cmp size src1 src2)
      (cmp_rmi_r size (CmpOpcode.Cmp) src1 src2))

+;; Helper for creating `cmp` instructions with an immediate.
+(decl cmp_imm (OperandSize u32 Gpr) ProducesFlags)
+(rule (cmp_imm size src1 src2)
+      (cmp_rmi_r size (CmpOpcode.Cmp) (RegMemImm.Imm src1) src2))
+
 ;; Helper for creating `MInst.XmmCmpRmR` instructions.
 (decl xmm_cmp_rm_r (SseOpcode XmmMem Xmm) ProducesFlags)
 (rule (xmm_cmp_rm_r opcode src1 src2)
@@ -1579,17 +1592,25 @@
 (decl cmove_or (Type CC CC GprMem Gpr) ConsumesFlags)
 (rule (cmove_or ty cc1 cc2 consequent alternative)
      (let ((dst WritableGpr (temp_writable_gpr))
-            (size OperandSize (operand_size_of_type_32_64 ty)))
-        (ConsumesFlags.ConsumesFlagsReturnsReg
-         (MInst.CmoveOr size cc1 cc2 consequent alternative dst)
+            (tmp WritableGpr (temp_writable_gpr))
+            (size OperandSize (operand_size_of_type_32_64 ty))
+            (cmove1 MInst (MInst.Cmove size cc1 consequent alternative tmp))
+            (cmove2 MInst (MInst.Cmove size cc2 consequent tmp dst)))
+        (ConsumesFlags.ConsumesFlagsTwiceReturnsValueRegs
+         cmove1
+         cmove2
         dst)))

 (decl cmove_or_xmm (Type CC CC XmmMem Xmm) ConsumesFlags)
 (rule (cmove_or_xmm ty cc1 cc2 consequent alternative)
      (let ((dst WritableXmm (temp_writable_xmm))
-            (size OperandSize (operand_size_of_type_32_64 ty)))
-        (ConsumesFlags.ConsumesFlagsReturnsReg
-         (MInst.XmmCmoveOr size cc1 cc2 consequent alternative dst)
+            (tmp WritableXmm (temp_writable_xmm))
+            (size OperandSize (operand_size_of_type_32_64 ty))
+            (cmove1 MInst (MInst.XmmCmove size cc1 consequent alternative tmp))
+            (cmove2 MInst (MInst.XmmCmove size cc2 consequent tmp dst)))
+        (ConsumesFlags.ConsumesFlagsTwiceReturnsValueRegs
+         cmove1
+         cmove2
         dst)))

 ;; Helper for creating `cmove_or` instructions directly from values. This allows
@@ -1601,12 +1622,18 @@
            (alt ValueRegs alternative)
            (dst1 WritableGpr (temp_writable_gpr))
            (dst2 WritableGpr (temp_writable_gpr))
+            (tmp1 WritableGpr (temp_writable_gpr))
+            (tmp2 WritableGpr (temp_writable_gpr))
            (size OperandSize (OperandSize.Size64))
-            (lower_cmove MInst (MInst.CmoveOr size cc1 cc2 (value_regs_get_gpr cons 0) (value_regs_get_gpr alt 0) dst1))
-            (upper_cmove MInst (MInst.CmoveOr size cc1 cc2 (value_regs_get_gpr cons 1) (value_regs_get_gpr alt 1) dst2)))
-        (ConsumesFlags.ConsumesFlagsTwiceReturnsValueRegs
-         lower_cmove
-         upper_cmove
+            (cmove1 MInst (MInst.Cmove size cc1 (value_regs_get_gpr cons 0) (value_regs_get_gpr alt 0) tmp1))
+            (cmove2 MInst (MInst.Cmove size cc2 (value_regs_get_gpr cons 0) tmp1 dst1))
+            (cmove3 MInst (MInst.Cmove size cc1 (value_regs_get_gpr cons 1) (value_regs_get_gpr alt 1) tmp2))
+            (cmove4 MInst (MInst.Cmove size cc2 (value_regs_get_gpr cons 1) tmp2 dst2)))
+        (ConsumesFlags.ConsumesFlagsFourTimesReturnsValueRegs
+         cmove1
+         cmove2
+         cmove3
+         cmove4
         (value_regs dst1 dst2))))

 (rule (cmove_or_from_values (is_gpr_type (is_single_register_type ty)) cc1 cc2 consequent alternative)
@@ -1615,6 +1642,14 @@
 (rule (cmove_or_from_values (is_xmm_type (is_single_register_type ty)) cc1 cc2 consequent alternative)
      (cmove_or_xmm ty cc1 cc2 consequent alternative))

+;; Helper for creating `MInst.Setcc` instructions.
+(decl setcc (CC) ConsumesFlags)
+(rule (setcc cc)
+      (let ((dst WritableGpr (temp_writable_gpr)))
+        (ConsumesFlags.ConsumesFlagsReturnsReg
+         (MInst.Setcc cc dst)
+         dst)))
+
 ;; Helper for creating `MInst.MovzxRmR` instructions.
 (decl movzx (Type ExtMode GprMem) Gpr)
 (rule (movzx ty mode src)
@@ -2027,6 +2062,16 @@
                                           size))))
        dst))

+;; Helper for creating `pshufb` instructions.
+(decl pshufb (Xmm XmmMem) Xmm)
+(rule (pshufb src1 src2)
+      (let ((dst WritableXmm (temp_writable_xmm))
+            (_ Unit (emit (MInst.XmmRmR (SseOpcode.Pshufb)
+                                        src1
+                                        src2
+                                        dst))))
+        dst))
+
 ;; Helper for creating `MInst.XmmUnaryRmR` instructions.
 (decl xmm_unary_rm_r (SseOpcode XmmMem) Xmm)
 (rule (xmm_unary_rm_r op src)
@@ -2071,6 +2116,11 @@
 (rule (vpabsq src)
      (xmm_unary_rm_r_evex (Avx512Opcode.Vpabsq) src))

+;; Helper for creating `vpopcntb` instructions.
+(decl vpopcntb (XmmMem) Xmm)
+(rule (vpopcntb src)
+      (xmm_unary_rm_r_evex (Avx512Opcode.Vpopcntb) src))
+
 ;; Helper for creating `MInst.XmmRmREvex` instructions.
 (decl xmm_rm_r_evex (Avx512Opcode XmmMem Xmm) Xmm)
 (rule (xmm_rm_r_evex op src1 src2)
@@ -2221,6 +2271,70 @@
 (rule (ud2 code)
      (SideEffectNoResult.Inst (MInst.Ud2 code)))

+;; Helper for creating `lzcnt` instructions.
+(decl lzcnt (Type Gpr) Gpr)
+(rule (lzcnt ty src)
+      (let ((dst WritableGpr (temp_writable_gpr))
+            (size OperandSize (operand_size_of_type_32_64 ty))
+            (_ Unit (emit (MInst.UnaryRmR size (UnaryRmROpcode.Lzcnt) src dst))))
+        dst))
+
+;; Helper for creating `tzcnt` instructions.
+(decl tzcnt (Type Gpr) Gpr)
+(rule (tzcnt ty src)
+      (let ((dst WritableGpr (temp_writable_gpr))
+            (size OperandSize (operand_size_of_type_32_64 ty))
+            (_ Unit (emit (MInst.UnaryRmR size (UnaryRmROpcode.Tzcnt) src dst))))
+        dst))
+
+;; Helper for creating `bsr` instructions.
+(decl bsr (Type Gpr) ProducesFlags)
+(rule (bsr ty src)
+      (let ((dst WritableGpr (temp_writable_gpr))
+            (size OperandSize (operand_size_of_type_32_64 ty))
+            (inst MInst (MInst.UnaryRmR size (UnaryRmROpcode.Bsr) src dst)))
+        (ProducesFlags.ProducesFlagsReturnsReg inst dst)))
+
+;; Helper for creating `bsr + cmov` instruction pairs that produce the
+;; result of the `bsr`, or `alt` if the input was zero.
+(decl bsr_or_else (Type Gpr Gpr) Gpr)
+(rule (bsr_or_else ty src alt)
+      (let ((bsr ProducesFlags (bsr ty src))
+            ;; Manually extract the result from the bsr, then ignore
+            ;; it below, since we need to thread it into the cmove
+            ;; before we pass the cmove to with_flags_reg.
+            (bsr_result Gpr (produces_flags_get_reg bsr))
+            (cmove ConsumesFlags (cmove ty (CC.Z) alt bsr_result)))
+        (with_flags_reg (produces_flags_ignore bsr) cmove)))
+
+;; Helper for creating `bsf` instructions.
+(decl bsf (Type Gpr) ProducesFlags)
+(rule (bsf ty src)
+      (let ((dst WritableGpr (temp_writable_gpr))
+            (size OperandSize (operand_size_of_type_32_64 ty))
+            (inst MInst (MInst.UnaryRmR size (UnaryRmROpcode.Bsf) src dst)))
+        (ProducesFlags.ProducesFlagsReturnsReg inst dst)))
+
+;; Helper for creating `bsf + cmov` instruction pairs that produce the
+;; result of the `bsf`, or `alt` if the input was zero.
+(decl bsf_or_else (Type Gpr Gpr) Gpr)
+(rule (bsf_or_else ty src alt)
+      (let ((bsf ProducesFlags (bsf ty src))
+            ;; Manually extract the result from the bsf, then ignore
+            ;; it below, since we need to thread it into the cmove
+            ;; before we pass the cmove to with_flags_reg.
+            (bsf_result Gpr (produces_flags_get_reg bsf))
+            (cmove ConsumesFlags (cmove ty (CC.Z) alt bsf_result)))
+        (with_flags_reg (produces_flags_ignore bsf) cmove)))
+
+;; Helper for creating `popcnt` instructions.
+(decl x64_popcnt (Type Gpr) Gpr)
+(rule (x64_popcnt ty src)
+      (let ((dst WritableGpr (temp_writable_gpr))
+            (size OperandSize (operand_size_of_type_32_64 ty))
+            (_ Unit (emit (MInst.UnaryRmR size (UnaryRmROpcode.Popcnt) src dst))))
+        dst))
+
 ;;;; Automatic conversions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

 (convert Gpr InstOutput output_gpr)
@@ -2241,6 +2355,8 @@
 (convert Reg GprMemImm reg_to_gpr_mem_imm)
 (convert WritableGpr WritableReg writable_gpr_to_reg)
 (convert WritableGpr Reg writable_gpr_to_r_reg)
+(convert WritableGpr GprMem writable_gpr_to_gpr_mem)
+(convert WritableGpr ValueRegs writable_gpr_to_value_regs)

 (convert Xmm InstOutput output_xmm)
 (convert Value Xmm put_in_xmm)
@@ -2259,8 +2375,10 @@
 (convert WritableXmm WritableReg writable_xmm_to_reg)
 (convert WritableXmm Reg writable_xmm_to_r_reg)
 (convert WritableXmm XmmMem writable_xmm_to_xmm_mem)
+(convert WritableXmm ValueRegs writable_xmm_to_value_regs)

 (convert Gpr Imm8Gpr gpr_to_imm8_gpr)
+(convert Imm8Reg Imm8Gpr imm8_reg_to_imm8_gpr)

 (convert Amode SyntheticAmode amode_to_synthetic_amode)
 (convert SyntheticAmode GprMem synthetic_amode_to_gpr_mem)
@@ -2276,12 +2394,21 @@
 (decl writable_gpr_to_r_reg (WritableGpr) Reg)
 (rule (writable_gpr_to_r_reg w_gpr)
      (writable_reg_to_reg (writable_gpr_to_reg w_gpr)))
+(decl writable_gpr_to_gpr_mem (WritableGpr) GprMem)
+(rule (writable_gpr_to_gpr_mem w_gpr)
+      (gpr_to_gpr_mem w_gpr))
+(decl writable_gpr_to_value_regs (WritableGpr) ValueRegs)
+(rule (writable_gpr_to_value_regs w_gpr)
+      (value_reg w_gpr))
 (decl writable_xmm_to_r_reg (WritableXmm) Reg)
 (rule (writable_xmm_to_r_reg w_xmm)
      (writable_reg_to_reg (writable_xmm_to_reg w_xmm)))
 (decl writable_xmm_to_xmm_mem (WritableXmm) XmmMem)
 (rule (writable_xmm_to_xmm_mem w_xmm)
      (xmm_to_xmm_mem (writable_xmm_to_xmm w_xmm)))
+(decl writable_xmm_to_value_regs (WritableXmm) ValueRegs)
+(rule (writable_xmm_to_value_regs w_xmm)
+      (value_reg w_xmm))

 (decl synthetic_amode_to_gpr_mem (SyntheticAmode) GprMem)
 (rule (synthetic_amode_to_gpr_mem amode)
--- a/cranelift/codegen/src/isa/x64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -1104,33 +1104,6 @@ pub(crate) fn emit(
            }
        }

-        Inst::CmoveOr {
-            size,
-            cc1,
-            cc2,
-            consequent,
-            alternative,
-            dst,
-        } => {
-            let first_cmove = Inst::Cmove {
-                cc: *cc1,
-                size: *size,
-                consequent: consequent.clone(),
-                alternative: alternative.clone(),
-                dst: dst.clone(),
-            };
-            first_cmove.emit(sink, info, state);
-
-            let second_cmove = Inst::Cmove {
-                cc: *cc2,
-                size: *size,
-                consequent: consequent.clone(),
-                alternative: alternative.clone(),
-                dst: dst.clone(),
-            };
-            second_cmove.emit(sink, info, state);
-        }
-
        Inst::XmmCmove {
            size,
            cc,
@@ -1159,39 +1132,6 @@ pub(crate) fn emit(
            sink.bind_label(next);
        }

-        Inst::XmmCmoveOr {
-            size,
-            cc1,
-            cc2,
-            consequent,
-            alternative,
-            dst,
-        } => {
-            debug_assert_eq!(*alternative, dst.to_reg());
-
-            let op = if *size == OperandSize::Size64 {
-                SseOpcode::Movsd
-            } else {
-                SseOpcode::Movss
-            };
-            let second_test = sink.get_label();
-            let next_instruction = sink.get_label();
-
-            // Jump to second test if `cc1` is *not* set.
-            one_way_jmp(sink, cc1.invert(), next_instruction);
-            let inst =
-                Inst::xmm_unary_rm_r(op, consequent.clone().to_reg_mem(), dst.to_writable_reg());
-            inst.emit(sink, info, state);
-            sink.bind_label(second_test);
-
-            // Jump to next instruction if `cc2` is *not* set.
-            one_way_jmp(sink, cc2.invert(), next_instruction);
-            let inst =
-                Inst::xmm_unary_rm_r(op, consequent.clone().to_reg_mem(), dst.to_writable_reg());
-            inst.emit(sink, info, state);
-            sink.bind_label(next_instruction);
-        }
-
        Inst::Push64 { src } => {
            if info.flags.enable_probestack() {
                sink.add_trap(state.cur_srcloc(), TrapCode::StackOverflow);
--- a/cranelift/codegen/src/isa/x64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/x64/inst/mod.rs
@@ -52,7 +52,6 @@ impl Inst {
            | Inst::CallUnknown { .. }
            | Inst::CheckedDivOrRemSeq { .. }
            | Inst::Cmove { .. }
-            | Inst::CmoveOr { .. }
            | Inst::CmpRmiR { .. }
            | Inst::CvtFloatToSintSeq { .. }
            | Inst::CvtFloatToUintSeq { .. }
@@ -89,7 +88,6 @@ impl Inst {
            | Inst::Ud2 { .. }
            | Inst::VirtualSPOffsetAdj { .. }
            | Inst::XmmCmove { .. }
-            | Inst::XmmCmoveOr { .. }
            | Inst::XmmCmpRmR { .. }
            | Inst::XmmLoadConst { .. }
            | Inst::XmmMinMaxSeq { .. }
@@ -141,6 +139,7 @@ impl Inst {
        }
    }

+    #[allow(dead_code)]
    pub(crate) fn unary_rm_r(
        size: OperandSize,
        op: UnaryRmROpcode,
@@ -906,12 +905,6 @@ impl Inst {
                alternative,
                dst,
                ..
-            }
-            | Inst::CmoveOr {
-                size,
-                alternative,
-                dst,
-                ..
            } => {
                if *alternative != dst.to_reg() {
                    debug_assert!(alternative.is_virtual());
@@ -926,9 +919,6 @@ impl Inst {
            }
            Inst::XmmCmove {
                alternative, dst, ..
-            }
-            | Inst::XmmCmoveOr {
-                alternative, dst, ..
            } => {
                if *alternative != dst.to_reg() {
                    debug_assert!(alternative.is_virtual());
@@ -1619,27 +1609,6 @@ impl PrettyPrint for Inst {
                show_ireg_sized(dst.to_reg().to_reg(), mb_rru, size.to_bytes())
            ),

-            Inst::CmoveOr {
-                size,
-                cc1,
-                cc2,
-                consequent: src,
-                alternative: _,
-                dst,
-            } => {
-                let src = src.show_rru_sized(mb_rru, size.to_bytes());
-                let dst = show_ireg_sized(dst.to_reg().to_reg(), mb_rru, size.to_bytes());
-                format!(
-                    "{} {}, {}; {} {}, {}",
-                    ljustify(format!("cmov{}{}", cc1.to_string(), suffix_bwlq(*size))),
-                    src,
-                    dst,
-                    ljustify(format!("cmov{}{}", cc2.to_string(), suffix_bwlq(*size))),
-                    src,
-                    dst,
-                )
-            }
-
            Inst::XmmCmove {
                size,
                cc,
@@ -1660,34 +1629,6 @@ impl PrettyPrint for Inst {
                )
            }

-            Inst::XmmCmoveOr {
-                size,
-                cc1,
-                cc2,
-                consequent: src,
-                dst,
-                ..
-            } => {
-                let suffix = if *size == OperandSize::Size64 {
-                    "sd"
-                } else {
-                    "ss"
-                };
-                let src = src.show_rru_sized(mb_rru, size.to_bytes());
-                let dst = show_ireg_sized(dst.to_reg().to_reg(), mb_rru, size.to_bytes());
-                format!(
-                    "j{} $check; mov{} {}, {}; $check: j{} $next; mov{} {}, {}; $next",
-                    cc1.invert().to_string(),
-                    suffix,
-                    src,
-                    dst,
-                    cc2.invert().to_string(),
-                    suffix,
-                    src,
-                    dst,
-                )
-            }
-
            Inst::Push64 { src } => {
                format!("{} {}", ljustify("pushq".to_string()), src.show_rru(mb_rru))
            }
@@ -2086,11 +2027,6 @@ fn x64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
            consequent: src,
            dst,
            ..
-        }
-        | Inst::CmoveOr {
-            consequent: src,
-            dst,
-            ..
        } => {
            src.get_regs_as_uses(collector);
            collector.add_mod(dst.to_writable_reg());
@@ -2099,11 +2035,6 @@ fn x64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
            consequent: src,
            dst,
            ..
-        }
-        | Inst::XmmCmoveOr {
-            consequent: src,
-            dst,
-            ..
        } => {
            src.get_regs_as_uses(collector);
            collector.add_mod(dst.to_writable_reg());
@@ -2554,12 +2485,6 @@ pub(crate) fn x64_map_regs<RM: RegMapper>(inst: &mut Inst, mapper: &RM) {
            ref mut dst,
            ref mut alternative,
            ..
-        }
-        | Inst::CmoveOr {
-            consequent: ref mut src,
-            ref mut dst,
-            ref mut alternative,
-            ..
        } => {
            src.map_uses(mapper);
            dst.map_mod(mapper);
@@ -2570,12 +2495,6 @@ pub(crate) fn x64_map_regs<RM: RegMapper>(inst: &mut Inst, mapper: &RM) {
            ref mut dst,
            ref mut alternative,
            ..
-        }
-        | Inst::XmmCmoveOr {
-            consequent: ref mut src,
-            ref mut dst,
-            ref mut alternative,
-            ..
        } => {
            src.map_uses(mapper);
            dst.map_mod(mapper);
--- a/cranelift/codegen/src/isa/x64/lower.isle
+++ b/cranelift/codegen/src/isa/x64/lower.isle
@@ -1467,22 +1467,22 @@
 ;;  - `CC.BE -> C = 1 OR Z = 1` (below or equal)
 ;;  - `CC.NBE -> C = 0 AND Z = 0` (not below or equal)

-(rule (lower (has_type ty (select (def_inst (fcmp (FloatCC.Ordered) a b)) x y)))
+(rule (lower (has_type ty (select (fcmp (FloatCC.Ordered) a b) x y)))
      (with_flags (fpcmp b a) (cmove_from_values ty (CC.NP) x y)))

-(rule (lower (has_type ty (select (def_inst (fcmp (FloatCC.Unordered) a b)) x y)))
+(rule (lower (has_type ty (select (fcmp (FloatCC.Unordered) a b) x y)))
      (with_flags (fpcmp b a) (cmove_from_values ty (CC.P) x y)))

-(rule (lower (has_type ty (select (def_inst (fcmp (FloatCC.GreaterThan) a b)) x y)))
+(rule (lower (has_type ty (select (fcmp (FloatCC.GreaterThan) a b) x y)))
      (with_flags (fpcmp b a) (cmove_from_values ty (CC.NBE) x y)))

-(rule (lower (has_type ty (select (def_inst (fcmp (FloatCC.GreaterThanOrEqual) a b)) x y)))
+(rule (lower (has_type ty (select (fcmp (FloatCC.GreaterThanOrEqual) a b) x y)))
      (with_flags (fpcmp b a) (cmove_from_values ty (CC.NB) x y)))

-(rule (lower (has_type ty (select (def_inst (fcmp (FloatCC.UnorderedOrLessThan) a b)) x y)))
+(rule (lower (has_type ty (select (fcmp (FloatCC.UnorderedOrLessThan) a b) x y)))
      (with_flags (fpcmp b a) (cmove_from_values ty (CC.B) x y)))

-(rule (lower (has_type ty (select (def_inst (fcmp (FloatCC.UnorderedOrLessThanOrEqual) a b)) x y)))
+(rule (lower (has_type ty (select (fcmp (FloatCC.UnorderedOrLessThanOrEqual) a b) x y)))
      (with_flags (fpcmp b a) (cmove_from_values ty (CC.BE) x y)))

 ;; Certain FloatCC variants are implemented by flipping the operands of the
@@ -1496,16 +1496,16 @@
 ;; not `LT | UNO`. By flipping the operands AND inverting the comparison (e.g.,
 ;; to `CC.NBE`), we also avoid these unordered cases.

-(rule (lower (has_type ty (select (def_inst (fcmp (FloatCC.LessThan) a b)) x y)))
+(rule (lower (has_type ty (select (fcmp (FloatCC.LessThan) a b) x y)))
      (with_flags (fpcmp a b) (cmove_from_values ty (CC.NBE) x y)))

-(rule (lower (has_type ty (select (def_inst (fcmp (FloatCC.LessThanOrEqual) a b)) x y)))
+(rule (lower (has_type ty (select (fcmp (FloatCC.LessThanOrEqual) a b) x y)))
      (with_flags (fpcmp a b) (cmove_from_values ty (CC.NB) x y)))

-(rule (lower (has_type ty (select (def_inst (fcmp (FloatCC.UnorderedOrGreaterThan) a b)) x y)))
+(rule (lower (has_type ty (select (fcmp (FloatCC.UnorderedOrGreaterThan) a b) x y)))
      (with_flags (fpcmp a b) (cmove_from_values ty (CC.B) x y)))

-(rule (lower (has_type ty (select (def_inst (fcmp (FloatCC.UnorderedOrGreaterThanOrEqual) a b)) x y)))
+(rule (lower (has_type ty (select (fcmp (FloatCC.UnorderedOrGreaterThanOrEqual) a b) x y)))
      (with_flags (fpcmp a b) (cmove_from_values ty (CC.BE) x y)))

 ;; `FloatCC.Equal` and `FloatCC.NotEqual` can only be implemented with multiple
@@ -1521,8 +1521,341 @@
 ;; More details about the CLIF semantics for `fcmp` are available at
 ;; https://docs.rs/cranelift-codegen/latest/cranelift_codegen/ir/trait.InstBuilder.html#method.fcmp.

-(rule (lower (has_type ty (select (def_inst (fcmp (FloatCC.Equal) a b)) x y)))
+(rule (lower (has_type ty (select (fcmp (FloatCC.Equal) a b) x y)))
      (with_flags (fpcmp a b) (cmove_or_from_values ty (CC.NZ) (CC.P) y x)))

-(rule (lower (has_type ty (select (def_inst (fcmp (FloatCC.NotEqual) a b)) x y)))
+(rule (lower (has_type ty (select (fcmp (FloatCC.NotEqual) a b) x y)))
      (with_flags (fpcmp a b) (cmove_or_from_values ty (CC.NZ) (CC.P) x y)))
+
+;; Rules for `clz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; If available, we can use a plain lzcnt instruction here. Note no
+;; special handling is required for zero inputs, because the machine
+;; instruction does what the CLIF expects for zero, i.e. it returns
+;; the operand width in bits.
+(rule 1 (lower
+         (has_type (and
+                    (ty_32_or_64 ty)
+                    (use_lzcnt))
+                   (clz src)))
+      (lzcnt ty src))
+
+(rule (lower
+       (has_type (ty_32_or_64 ty)
+                 (clz src)))
+      (do_clz ty ty src))
+
+(rule (lower
+       (has_type (ty_8_or_16 ty)
+                 (clz src)))
+      (do_clz $I32 ty (extend_to_gpr src $I32 (ExtendKind.Zero))))
+
+(rule (lower
+       (has_type $I128
+                 (clz src)))
+      (let ((upper Gpr (do_clz $I64 $I64 (value_regs_get_gpr src 1)))
+            (lower Gpr (add $I64
+                            (do_clz $I64 $I64 (value_regs_get_gpr src 0))
+                            (RegMemImm.Imm 64)))
+            (result_lo Gpr
+              (with_flags_reg
+               (cmp_imm (OperandSize.Size64) 64 upper)
+               (cmove $I64 (CC.NZ) upper lower))))
+        (value_regs result_lo (imm $I64 0))))
+
+;; Implementation helper for clz; operates on 32 or 64-bit units.
+(decl do_clz (Type Type Gpr) Gpr)
+(rule (do_clz ty orig_ty src)
+      (let ((highest_bit_index Reg (bsr_or_else ty src (imm_i64 $I64 -1)))
+            (bits_minus_1 Reg (imm ty (u64_sub (ty_bits_u64 orig_ty) 1))))
+        (sub ty bits_minus_1 highest_bit_index)))
+
+;; Rules for `ctz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Analogous to `clz` cases above, but using mirror instructions
+;; (tzcnt vs lzcnt, bsf vs bsr).
+
+(rule 1 (lower
+         (has_type (and
+                    (ty_32_or_64 ty)
+                    (use_bmi1))
+                   (ctz src)))
+      (tzcnt ty src))
+
+(rule (lower
+       (has_type (ty_32_or_64 ty)
+                 (ctz src)))
+      (do_ctz ty ty src))
+
+(rule (lower
+       (has_type (ty_8_or_16 ty)
+                 (ctz src)))
+      (do_ctz $I32 ty (extend_to_gpr src $I32 (ExtendKind.Zero))))
+
+(rule (lower
+       (has_type $I128
+                 (ctz src)))
+      (let ((lower Gpr (do_ctz $I64 $I64 (value_regs_get_gpr src 0)))
+            (upper Gpr (add $I64
+                            (do_ctz $I64 $I64 (value_regs_get_gpr src 1))
+                            (RegMemImm.Imm 64)))
+            (result_lo Gpr
+              (with_flags_reg
+               (cmp_imm (OperandSize.Size64) 64 lower)
+               (cmove $I64 (CC.Z) upper lower))))
+        (value_regs result_lo (imm $I64 0))))
+
+(decl do_ctz (Type Type Gpr) Gpr)
+(rule (do_ctz ty orig_ty src)
+      (bsf_or_else ty src (imm $I64 (ty_bits_u64 orig_ty))))
+
+;; Rules for `popcnt` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule 1 (lower
+         (has_type (and
+                    (ty_32_or_64 ty)
+                    (use_popcnt))
+                   (popcnt src)))
+      (x64_popcnt ty src))
+
+(rule 1 (lower
+         (has_type (and
+                    (ty_8_or_16 ty)
+                    (use_popcnt))
+                   (popcnt src)))
+      (x64_popcnt $I32 (extend_to_gpr src $I32 (ExtendKind.Zero))))
+
+(rule 1 (lower
+         (has_type (and
+                    $I128
+                    (use_popcnt))
+                   (popcnt src)))
+      (let ((lo_count Gpr (x64_popcnt $I64 (value_regs_get_gpr src 0)))
+            (hi_count Gpr (x64_popcnt $I64 (value_regs_get_gpr src 1))))
+        (value_regs (add $I64 lo_count hi_count) (imm $I64 0))))
+
+(rule (lower
+       (has_type (ty_32_or_64 ty)
+                 (popcnt src)))
+      (do_popcnt ty src))
+
+(rule (lower
+       (has_type (ty_8_or_16 ty)
+                 (popcnt src)))
+      (do_popcnt $I32 (extend_to_gpr src $I32 (ExtendKind.Zero))))
+
+(rule (lower
+       (has_type $I128
+                 (popcnt src)))
+      (let ((lo_count Gpr (do_popcnt $I64 (value_regs_get_gpr src 0)))
+            (hi_count Gpr (do_popcnt $I64 (value_regs_get_gpr src 1))))
+        (value_regs (add $I64 lo_count hi_count) (imm $I64 0))))
+
+;; Implementation of popcount when we don't have a native popcount
+;; instruction.
+(decl do_popcnt (Type Gpr) Gpr)
+(rule (do_popcnt $I64 src)
+      (let ((shifted1 Gpr (shr $I64 src (Imm8Reg.Imm8 1)))
+            (sevens Gpr (imm $I64 0x7777777777777777))
+            (masked1 Gpr (x64_and $I64 shifted1 sevens))
+            ;; diff1 := src - ((src >> 1) & 0b0111_0111_0111...)
+            (diff1 Gpr (sub $I64 src masked1))
+            (shifted2 Gpr (shr $I64 masked1 (Imm8Reg.Imm8 1)))
+            (masked2 Gpr (x64_and $I64 shifted2 sevens))
+            ;; diff2 := diff1 - ((masked1 >> 1) & 0b0111_0111_0111...)
+            (diff2 Gpr (sub $I64 diff1 masked2))
+            (shifted3 Gpr (shr $I64 masked2 (Imm8Reg.Imm8 1)))
+            (masked3 Gpr (x64_and $I64 shifted3 sevens))
+            ;; diff3 := diff2 - ((masked2 >> 1) & 0b0111_0111_0111...)
+            ;;
+            ;; At this point, each nibble of diff3 is the popcount of
+            ;; that nibble. This works because at each step above, we
+            ;; are basically subtracting floor(value / 2) from the
+            ;; running value; the leftover remainder is 1 if the LSB
+            ;; was 1. After three steps, we have (nibble / 8) -- 0 or
+            ;; 1 for the MSB of the nibble -- plus three possible
+            ;; additions for the three other bits.
+            (diff3 Gpr (sub $I64 diff2 masked3))
+            ;; Add the two nibbles of each byte together.
+            (sum1 Gpr (add $I64
+                           (shr $I64 diff3 (Imm8Reg.Imm8 4))
+                           diff3))
+            ;; Mask the above sum to have the popcount for each byte
+            ;; in the lower nibble of that byte.
+            (ofof Gpr (imm $I64 0x0f0f0f0f0f0f0f0f))
+            (masked4 Gpr (x64_and $I64 sum1 ofof))
+            (ones Gpr (imm $I64 0x0101010101010101))
+            ;; Use a multiply to sum all of the bytes' popcounts into
+            ;; the top byte. Consider the binomial expansion for the
+            ;; top byte: it is the sum of the bytes (masked4 >> 56) *
+            ;; 0x01 + (masked4 >> 48) * 0x01 + (masked4 >> 40) * 0x01
+            ;; + ... + (masked4 >> 0).
+            (mul Gpr (mul $I64 masked4 ones))
+            ;; Now take that top byte and return it as the popcount.
+            (final Gpr (shr $I64 mul (Imm8Reg.Imm8 56))))
+        final))
+
+;; This is the 32-bit version of the above; the steps for each nibble
+;; are the same, we just use constants half as wide.
+(rule (do_popcnt $I32 src)
+      (let ((shifted1 Gpr (shr $I32 src (Imm8Reg.Imm8 1)))
+            (sevens Gpr (imm $I32 0x77777777))
+            (masked1 Gpr (x64_and $I32 shifted1 sevens))
+            (diff1 Gpr (sub $I32 src masked1))
+            (shifted2 Gpr (shr $I32 masked1 (Imm8Reg.Imm8 1)))
+            (masked2 Gpr (x64_and $I32 shifted2 sevens))
+            (diff2 Gpr (sub $I32 diff1 masked2))
+            (shifted3 Gpr (shr $I32 masked2 (Imm8Reg.Imm8 1)))
+            (masked3 Gpr (x64_and $I32 shifted3 sevens))
+            (diff3 Gpr (sub $I32 diff2 masked3))
+            (sum1 Gpr (add $I32
+                           (shr $I32 diff3 (Imm8Reg.Imm8 4))
+                           diff3))
+            (masked4 Gpr (x64_and $I32 sum1 (RegMemImm.Imm 0x0f0f0f0f)))
+            (mul Gpr (mul $I32 masked4 (RegMemImm.Imm 0x01010101)))
+            (final Gpr (shr $I32 mul (Imm8Reg.Imm8 24))))
+        final))
+                       
+            
+(rule 1 (lower (has_type (and
+                          $I8X16
+                          (avx512vl_enabled)
+                          (avx512bitalg_enabled))
+                         (popcnt src)))
+      (vpopcntb src))
+
+
+      
+;; For SSE 4.2 we use Mula's algorithm (https://arxiv.org/pdf/1611.07612.pdf):
+;;
+;; __m128i count_bytes ( __m128i v) {
+;;     __m128i lookup = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
+;;     __m128i low_mask = _mm_set1_epi8 (0x0f);
+;;     __m128i lo = _mm_and_si128 (v, low_mask);
+;;     __m128i hi = _mm_and_si128 (_mm_srli_epi16 (v, 4), low_mask);
+;;     __m128i cnt1 = _mm_shuffle_epi8 (lookup, lo);
+;;     __m128i cnt2 = _mm_shuffle_epi8 (lookup, hi);
+;;     return _mm_add_epi8 (cnt1, cnt2);
+;; }
+;;
+;; Details of the above algorithm can be found in the reference noted above, but the basic
+;; idea is to create a lookup table holding the precomputed popcnt value of each number in
+;; [0, 15]. The algorithm uses shifts and masks to isolate the 4-bit sections of the vector,
+;; uses pshufb to perform the table lookups, and adds together the results.
+;;
+;; __m128i lookup = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
+
+(decl popcount_4bit_table () VCodeConstant)  ;; bits-per-nibble table `lookup` above
+(extern constructor popcount_4bit_table popcount_4bit_table)
+
+(decl popcount_low_mask () VCodeConstant)    ;; mask for low nibbles: 0x0f * 16
+(extern constructor popcount_low_mask popcount_low_mask)
+
+(rule (lower (has_type $I8X16
+                       (popcnt src)))
+      (let ((nibble_table_const VCodeConstant (popcount_4bit_table))
+            (low_mask Xmm (xmm_load_const $I8X16 (popcount_low_mask)))
+            (low_nibbles Xmm (sse_and $I8X16 src low_mask))
+            ;; Note that this is a 16x8 shift, but that's OK; we mask
+            ;; off anything that traverses from one byte to the next
+            ;; with the low_mask below.
+            (shifted_src Xmm (psrlw src (RegMemImm.Imm 4)))
+            (high_nibbles Xmm (sse_and $I8X16 shifted_src low_mask))
+            (lookup Xmm (xmm_load_const $I8X16 (popcount_4bit_table)))
+            (bit_counts_low Xmm (pshufb lookup low_nibbles))
+            (bit_counts_high Xmm (pshufb lookup high_nibbles)))
+        (paddb bit_counts_low bit_counts_high)))
+
+;; Rules for `bitrev` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower (has_type $I8 (bitrev src)))
+      (do_bitrev8 $I32 src))
+
+(rule (lower (has_type $I16 (bitrev src)))
+      (do_bitrev16 $I32 src))
+
+(rule (lower (has_type $I32 (bitrev src)))
+      (do_bitrev32 $I32 src))
+
+(rule (lower (has_type $I64 (bitrev src)))
+      (do_bitrev64 $I64 src))
+
+(rule (lower (has_type $I128 (bitrev src)))
+      (value_regs
+       (do_bitrev64 $I64 (value_regs_get_gpr src 1))
+       (do_bitrev64 $I64 (value_regs_get_gpr src 0))))
+
+(decl do_bitrev8 (Type Gpr) Gpr)
+(rule (do_bitrev8 ty src)
+      (let ((tymask u64 (ty_mask ty))
+            (mask1 Gpr (imm ty (u64_and tymask 0x5555555555555555)))
+            (lo1 Gpr (x64_and ty src mask1))
+            (hi1 Gpr (x64_and ty (shr ty src (Imm8Reg.Imm8 1)) mask1))
+            (swap1 Gpr (or ty
+                           (shl ty lo1 (Imm8Reg.Imm8 1))
+                           hi1))
+            (mask2 Gpr (imm ty (u64_and tymask 0x3333333333333333)))
+            (lo2 Gpr (x64_and ty swap1 mask2))
+            (hi2 Gpr (x64_and ty (shr ty swap1 (Imm8Reg.Imm8 2)) mask2))
+            (swap2 Gpr (or ty
+                           (shl ty lo2 (Imm8Reg.Imm8 2))
+                           hi2))
+            (mask4 Gpr (imm ty (u64_and tymask 0x0f0f0f0f0f0f0f0f)))
+            (lo4 Gpr (x64_and ty swap2 mask4))
+            (hi4 Gpr (x64_and ty (shr ty swap2 (Imm8Reg.Imm8 4)) mask4))
+            (swap4 Gpr (or ty
+                           (shl ty lo4 (Imm8Reg.Imm8 4))
+                           hi4)))
+        swap4))
+                       
+(decl do_bitrev16 (Type Gpr) Gpr)
+(rule (do_bitrev16 ty src)
+      (let ((src_ Gpr (do_bitrev8 ty src))
+            (tymask u64 (ty_mask ty))
+            (mask8 Gpr (imm ty (u64_and tymask 0x00ff00ff00ff00ff)))
+            (lo8 Gpr (x64_and ty src_ mask8))
+            (hi8 Gpr (x64_and ty (shr ty src_ (Imm8Reg.Imm8 8)) mask8))
+            (swap8 Gpr (or ty
+                           (shl ty lo8 (Imm8Reg.Imm8 8))
+                           hi8)))
+        swap8))
+      
+(decl do_bitrev32 (Type Gpr) Gpr)
+(rule (do_bitrev32 ty src)
+      (let ((src_ Gpr (do_bitrev16 ty src))
+            (tymask u64 (ty_mask ty))
+            (mask16 Gpr (imm ty (u64_and tymask 0x0000ffff0000ffff)))
+            (lo16 Gpr (x64_and ty src_ mask16))
+            (hi16 Gpr (x64_and ty (shr ty src_ (Imm8Reg.Imm8 16)) mask16))
+            (swap16 Gpr (or ty
+                            (shl ty lo16 (Imm8Reg.Imm8 16))
+                            hi16)))
+        swap16))
+
+(decl do_bitrev64 (Type Gpr) Gpr)
+(rule (do_bitrev64 ty @ $I64 src)
+      (let ((src_ Gpr (do_bitrev32 ty src))
+            (mask32 Gpr (imm ty 0xffffffff))
+            (lo32 Gpr (x64_and ty src_ mask32))
+            (hi32 Gpr (shr ty src_ (Imm8Reg.Imm8 32)))
+            (swap32 Gpr (or ty
+                            (shl ty lo32 (Imm8Reg.Imm8 32))
+                            hi32)))
+        swap32))
+
+;; Rules for `is_null` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Null references are represented by the constant value `0`.
+(rule (lower (is_null src @ (value_type $R64)))
+      (with_flags
+       (cmp_imm (OperandSize.Size64) 0 src)
+       (setcc (CC.Z))))
+
+;; Rules for `is_invalid` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Invalid references are represented by the constant value `-1`.
+(rule (lower (is_invalid src @ (value_type $R64)))
+      (with_flags
+       (cmp_imm (OperandSize.Size64) 0xffffffff src)  ;; simm32 0xffff_ffff is sign-extended to -1.
+       (setcc (CC.Z))))
+
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
--- a/cranelift/codegen/src/isa/x64/lower/isle.rs
+++ b/cranelift/codegen/src/isa/x64/lower/isle.rs
@@ -171,6 +171,42 @@ where
        }
    }

+    #[inline]
+    fn avx512bitalg_enabled(&mut self, _: Type) -> Option<()> {
+        if self.isa_flags.use_avx512bitalg_simd() {
+            Some(())
+        } else {
+            None
+        }
+    }
+
+    #[inline]
+    fn use_lzcnt(&mut self, _: Type) -> Option<()> {
+        if self.isa_flags.use_lzcnt() {
+            Some(())
+        } else {
+            None
+        }
+    }
+
+    #[inline]
+    fn use_bmi1(&mut self, _: Type) -> Option<()> {
+        if self.isa_flags.use_bmi1() {
+            Some(())
+        } else {
+            None
+        }
+    }
+
+    #[inline]
+    fn use_popcnt(&mut self, _: Type) -> Option<()> {
+        if self.isa_flags.use_popcnt() {
+            Some(())
+        } else {
+            None
+        }
+    }
+
    #[inline]
    fn imm8_from_value(&mut self, val: Value) -> Option<Imm8Reg> {
        let inst = self.lower_ctx.dfg().value_def(val).inst()?;
@@ -326,6 +362,16 @@ where
        SyntheticAmode::ConstantOffset(mask_table)
    }

+    fn popcount_4bit_table(&mut self) -> VCodeConstant {
+        self.lower_ctx
+            .use_constant(VCodeConstantData::WellKnown(&POPCOUNT_4BIT_TABLE))
+    }
+
+    fn popcount_low_mask(&mut self) -> VCodeConstant {
+        self.lower_ctx
+            .use_constant(VCodeConstantData::WellKnown(&POPCOUNT_LOW_MASK))
+    }
+
    #[inline]
    fn writable_reg_to_xmm(&mut self, r: WritableReg) -> WritableXmm {
        Writable::from_reg(Xmm::new(r.to_reg()).unwrap())
@@ -499,6 +545,18 @@ const I8X16_USHR_MASKS: [u8; 128] = [
    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
 ];

+/// Number of bits set in a given nibble (4-bit value). Used in the
+/// vector implementation of popcount.
+#[rustfmt::skip] // Preserve 4x4 layout.
+const POPCOUNT_4BIT_TABLE: [u8; 16] = [
+    0x00, 0x01, 0x01, 0x02,
+    0x01, 0x02, 0x02, 0x03,
+    0x01, 0x02, 0x02, 0x03,
+    0x02, 0x03, 0x03, 0x04,
+];
+
+const POPCOUNT_LOW_MASK: [u8; 16] = [0x0f; 16];
+
 #[inline]
 fn to_simm32(constant: i64) -> Option<GprMemImm> {
    if constant == ((constant << 32) >> 32) {
--- a/cranelift/codegen/src/isa/x64/lower/isle/generated_code.manifest
+++ b/cranelift/codegen/src/isa/x64/lower/isle/generated_code.manifest
@@ -1,4 +1,4 @@
 src/clif.isle 9ea75a6f790b5c03
-src/prelude.isle 9830498351ddf6a3
-src/isa/x64/inst.isle 5ee89205e6e9a46b
-src/isa/x64/lower.isle 348a808ea5de4cdb
+src/prelude.isle 6b0160bfcac86902
+src/isa/x64/inst.isle 67eb719e568c2a81
+src/isa/x64/lower.isle 142626fe062fd7d7
--- a/cranelift/codegen/src/isa/x64/lower/isle/generated_code.rs
+++ b/cranelift/codegen/src/isa/x64/lower/isle/generated_code.rs
--- a/cranelift/codegen/src/machinst/isle.rs
+++ b/cranelift/codegen/src/machinst/isle.rs
@@ -140,6 +140,26 @@ macro_rules! isle_prelude_methods {
            x.into()
        }

+        #[inline]
+        fn i64_as_u64(&mut self, x: i64) -> u64 {
+            x as u64
+        }
+
+        #[inline]
+        fn u64_add(&mut self, x: u64, y: u64) -> u64 {
+            x.wrapping_add(y)
+        }
+
+        #[inline]
+        fn u64_sub(&mut self, x: u64, y: u64) -> u64 {
+            x.wrapping_sub(y)
+        }
+
+        #[inline]
+        fn u64_and(&mut self, x: u64, y: u64) -> u64 {
+            x & y
+        }
+
        #[inline]
        fn ty_bits(&mut self, ty: Type) -> u8 {
            use std::convert::TryInto;
@@ -151,11 +171,28 @@ macro_rules! isle_prelude_methods {
            ty.bits()
        }

+        #[inline]
+        fn ty_bits_u64(&mut self, ty: Type) -> u64 {
+            ty.bits() as u64
+        }
+
        #[inline]
        fn ty_bytes(&mut self, ty: Type) -> u16 {
            u16::try_from(ty.bytes()).unwrap()
        }

+        #[inline]
+        fn ty_mask(&mut self, ty: Type) -> u64 {
+            match ty.bits() {
+                1 => 1,
+                8 => 0xff,
+                16 => 0xffff,
+                32 => 0xffff_ffff,
+                64 => 0xffff_ffff_ffff_ffff,
+                _ => unimplemented!(),
+            }
+        }
+
        fn fits_in_16(&mut self, ty: Type) -> Option<Type> {
            if ty.bits() <= 16 {
                Some(ty)
--- a/cranelift/codegen/src/prelude.isle
+++ b/cranelift/codegen/src/prelude.isle
@@ -167,6 +167,20 @@
 (decl u32_as_u64 (u32) u64)
 (extern constructor u32_as_u64 u32_as_u64)

+(decl i64_as_u64 (i64) u64)
+(extern constructor i64_as_u64 i64_as_u64)
+
+;;;; Primitive Arithmetic ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(decl u64_add (u64 u64) u64)
+(extern constructor u64_add u64_add)
+
+(decl u64_sub (u64 u64) u64)
+(extern constructor u64_sub u64_sub)
+
+(decl u64_and (u64 u64) u64)
+(extern constructor u64_and u64_and)
+
 ;;;; `cranelift_codegen::ir::Type` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

 (extern const $B1 Type)
@@ -209,6 +223,14 @@
 (decl ty_bits_u16 (Type) u16)
 (extern constructor ty_bits_u16 ty_bits_u16)

+;; Get the bit width of a given type.
+(decl ty_bits_u64 (Type) u64)
+(extern constructor ty_bits_u64 ty_bits_u64)
+
+;; Get a mask for the width of a given type.
+(decl ty_mask (Type) u64)
+(extern constructor ty_mask ty_mask)
+
 ;; Get the byte width of a given type.
 (decl ty_bytes (Type) u16)
 (extern constructor ty_bytes ty_bytes)
@@ -398,9 +420,27 @@
                     (ConsumesFlagsReturnsReg (inst MInst) (result Reg))
                     (ConsumesFlagsTwiceReturnsValueRegs (inst1 MInst)
                                                         (inst2 MInst)
-                                                         (result ValueRegs))))
+                                                         (result ValueRegs))
+                     (ConsumesFlagsFourTimesReturnsValueRegs (inst1 MInst)
+                                                             (inst2 MInst)
+                                                             (inst3 MInst)
+                                                             (inst4 MInst)
+                                                             (result ValueRegs))))


+
+;; Get the produced register out of a ProducesFlags.
+(decl produces_flags_get_reg (ProducesFlags) Reg)
+(rule (produces_flags_get_reg (ProducesFlags.ProducesFlagsReturnsReg _ reg)) reg)
+
+;; Modify a ProducesFlags to use it only for its side-effect, ignoring
+;; its result.
+(decl produces_flags_ignore (ProducesFlags) ProducesFlags)
+(rule (produces_flags_ignore (ProducesFlags.ProducesFlagsReturnsReg inst _))
+                             (ProducesFlags.ProducesFlagsSideEffect inst))
+(rule (produces_flags_ignore (ProducesFlags.ProducesFlagsReturnsResultWithConsumer inst _))
+                             (ProducesFlags.ProducesFlagsSideEffect inst))
+
 ;; Helper for combining two flags-consumer instructions that return a
 ;; single Reg, giving a ConsumesFlags that returns both values in a
 ;; ValueRegs.
@@ -440,12 +480,28 @@
                  (ConsumesFlags.ConsumesFlagsTwiceReturnsValueRegs consumer_inst_1
                                                                    consumer_inst_2
                                                                    consumer_result))
+      ;; We must emit these instructions in order as the creator of
+      ;; the ConsumesFlags may be relying on dataflow dependencies
+      ;; amongst them.
      (let ((_x Unit (emit producer_inst))
-            ;; Note that the order of emission here is swapped, as this seems
-            ;; to generate better register allocation for now with fewer
-            ;; `mov` instructions.
-            (_y Unit (emit consumer_inst_2))
-            (_z Unit (emit consumer_inst_1)))
+            (_y Unit (emit consumer_inst_1))
+            (_z Unit (emit consumer_inst_2)))
+        consumer_result))
+
+(rule (with_flags (ProducesFlags.ProducesFlagsSideEffect producer_inst)
+                  (ConsumesFlags.ConsumesFlagsFourTimesReturnsValueRegs consumer_inst_1
+                                                                        consumer_inst_2
+                                                                        consumer_inst_3
+                                                                        consumer_inst_4
+                                                                        consumer_result))
+      ;; We must emit these instructions in order as the creator of
+      ;; the ConsumesFlags may be relying on dataflow dependencies
+      ;; amongst them.
+      (let ((_x Unit (emit producer_inst))
+            (_y Unit (emit consumer_inst_1))
+            (_z Unit (emit consumer_inst_2))
+            (_w Unit (emit consumer_inst_3))
+            (_v Unit (emit consumer_inst_4)))
        consumer_result))

 (decl with_flags_reg (ProducesFlags ConsumesFlags) Reg)
--- a/cranelift/filetests/filetests/isa/aarch64/bitops.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/bitops.clif
@@ -1184,7 +1184,7 @@ block0(v0: i128, v1: i8):
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
-;   (instruction range: 0 .. 10)
+;   (instruction range: 0 .. 13)
 ;   Inst 0:   lsl x4, x0, x2
 ;   Inst 1:   lsl x3, x1, x2
 ;   Inst 2:   orn w1, wzr, w2
@@ -1192,9 +1192,12 @@ block0(v0: i128, v1: i8):
 ;   Inst 4:   lsr x0, x0, x1
 ;   Inst 5:   orr x0, x3, x0
 ;   Inst 6:   ands xzr, x2, #64
-;   Inst 7:   csel x1, x4, x0, ne
-;   Inst 8:   csel x0, xzr, x4, ne
-;   Inst 9:   ret
+;   Inst 7:   csel x1, xzr, x4, ne
+;   Inst 8:   csel x0, x4, x0, ne
+;   Inst 9:   mov x2, x0
+;   Inst 10:   mov x0, x1
+;   Inst 11:   mov x1, x2
+;   Inst 12:   ret
 ; }}

 function %ishl_i128_i128(i128, i128) -> i128 {
@@ -1207,7 +1210,7 @@ block0(v0: i128, v1: i128):
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
-;   (instruction range: 0 .. 10)
+;   (instruction range: 0 .. 13)
 ;   Inst 0:   lsl x3, x0, x2
 ;   Inst 1:   lsl x1, x1, x2
 ;   Inst 2:   orn w4, wzr, w2
@@ -1215,9 +1218,12 @@ block0(v0: i128, v1: i128):
 ;   Inst 4:   lsr x0, x0, x4
 ;   Inst 5:   orr x0, x1, x0
 ;   Inst 6:   ands xzr, x2, #64
-;   Inst 7:   csel x1, x3, x0, ne
-;   Inst 8:   csel x0, xzr, x3, ne
-;   Inst 9:   ret
+;   Inst 7:   csel x1, xzr, x3, ne
+;   Inst 8:   csel x0, x3, x0, ne
+;   Inst 9:   mov x2, x0
+;   Inst 10:   mov x0, x1
+;   Inst 11:   mov x1, x2
+;   Inst 12:   ret
 ; }}

 function %ushr_i128_i8(i128, i8) -> i128 {
@@ -1230,17 +1236,20 @@ block0(v0: i128, v1: i8):
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
-;   (instruction range: 0 .. 10)
+;   (instruction range: 0 .. 13)
 ;   Inst 0:   lsr x3, x0, x2
 ;   Inst 1:   lsr x0, x1, x2
 ;   Inst 2:   orn w4, wzr, w2
 ;   Inst 3:   lsl x1, x1, #1
 ;   Inst 4:   lsl x1, x1, x4
-;   Inst 5:   orr x3, x3, x1
+;   Inst 5:   orr x1, x3, x1
 ;   Inst 6:   ands xzr, x2, #64
-;   Inst 7:   csel x1, xzr, x0, ne
-;   Inst 8:   csel x0, x0, x3, ne
-;   Inst 9:   ret
+;   Inst 7:   csel x1, x0, x1, ne
+;   Inst 8:   csel x0, xzr, x0, ne
+;   Inst 9:   mov x2, x0
+;   Inst 10:   mov x0, x1
+;   Inst 11:   mov x1, x2
+;   Inst 12:   ret
 ; }}

 function %ushr_i128_i128(i128, i128) -> i128 {
@@ -1253,17 +1262,20 @@ block0(v0: i128, v1: i128):
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
-;   (instruction range: 0 .. 10)
+;   (instruction range: 0 .. 13)
 ;   Inst 0:   lsr x3, x0, x2
 ;   Inst 1:   lsr x0, x1, x2
 ;   Inst 2:   orn w4, wzr, w2
 ;   Inst 3:   lsl x1, x1, #1
 ;   Inst 4:   lsl x1, x1, x4
-;   Inst 5:   orr x3, x3, x1
+;   Inst 5:   orr x1, x3, x1
 ;   Inst 6:   ands xzr, x2, #64
-;   Inst 7:   csel x1, xzr, x0, ne
-;   Inst 8:   csel x0, x0, x3, ne
-;   Inst 9:   ret
+;   Inst 7:   csel x1, x0, x1, ne
+;   Inst 8:   csel x0, xzr, x0, ne
+;   Inst 9:   mov x2, x0
+;   Inst 10:   mov x0, x1
+;   Inst 11:   mov x1, x2
+;   Inst 12:   ret
 ; }}

 function %sshr_i128_i8(i128, i8) -> i128 {
@@ -1276,7 +1288,7 @@ block0(v0: i128, v1: i8):
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
-;   (instruction range: 0 .. 11)
+;   (instruction range: 0 .. 13)
 ;   Inst 0:   lsr x3, x0, x2
 ;   Inst 1:   asr x0, x1, x2
 ;   Inst 2:   orn w4, wzr, w2
@@ -1285,9 +1297,11 @@ block0(v0: i128, v1: i8):
 ;   Inst 5:   asr x1, x1, #63
 ;   Inst 6:   orr x3, x3, x4
 ;   Inst 7:   ands xzr, x2, #64
-;   Inst 8:   csel x1, x1, x0, ne
-;   Inst 9:   csel x0, x0, x3, ne
-;   Inst 10:   ret
+;   Inst 8:   csel x2, x0, x3, ne
+;   Inst 9:   csel x0, x1, x0, ne
+;   Inst 10:   mov x1, x0
+;   Inst 11:   mov x0, x2
+;   Inst 12:   ret
 ; }}

 function %sshr_i128_i128(i128, i128) -> i128 {
@@ -1300,7 +1314,7 @@ block0(v0: i128, v1: i128):
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
-;   (instruction range: 0 .. 11)
+;   (instruction range: 0 .. 13)
 ;   Inst 0:   lsr x3, x0, x2
 ;   Inst 1:   asr x0, x1, x2
 ;   Inst 2:   orn w4, wzr, w2
@@ -1309,8 +1323,10 @@ block0(v0: i128, v1: i128):
 ;   Inst 5:   asr x1, x1, #63
 ;   Inst 6:   orr x3, x3, x4
 ;   Inst 7:   ands xzr, x2, #64
-;   Inst 8:   csel x1, x1, x0, ne
-;   Inst 9:   csel x0, x0, x3, ne
-;   Inst 10:   ret
+;   Inst 8:   csel x2, x0, x3, ne
+;   Inst 9:   csel x0, x1, x0, ne
+;   Inst 10:   mov x1, x0
+;   Inst 11:   mov x0, x2
+;   Inst 12:   ret
 ; }}

--- a/cranelift/filetests/filetests/isa/aarch64/shift-rotate.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/shift-rotate.clif
@@ -16,19 +16,19 @@ block0(v0: i128, v1: i128):
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
-;   (instruction range: 0 .. 24)
+;   (instruction range: 0 .. 25)
 ;   Inst 0:   mov x4, x1
 ;   Inst 1:   orr x1, xzr, #128
 ;   Inst 2:   sub x1, x1, x2
-;   Inst 3:   lsr x3, x0, x2
-;   Inst 4:   lsr x5, x4, x2
+;   Inst 3:   lsr x5, x0, x2
+;   Inst 4:   lsr x3, x4, x2
 ;   Inst 5:   orn w6, wzr, w2
 ;   Inst 6:   lsl x7, x4, #1
 ;   Inst 7:   lsl x6, x7, x6
-;   Inst 8:   orr x6, x3, x6
+;   Inst 8:   orr x5, x5, x6
 ;   Inst 9:   ands xzr, x2, #64
-;   Inst 10:   csel x3, xzr, x5, ne
-;   Inst 11:   csel x2, x5, x6, ne
+;   Inst 10:   csel x2, x3, x5, ne
+;   Inst 11:   csel x3, xzr, x3, ne
 ;   Inst 12:   lsl x5, x0, x1
 ;   Inst 13:   lsl x4, x4, x1
 ;   Inst 14:   orn w6, wzr, w1
@@ -36,11 +36,12 @@ block0(v0: i128, v1: i128):
 ;   Inst 16:   lsr x0, x0, x6
 ;   Inst 17:   orr x0, x4, x0
 ;   Inst 18:   ands xzr, x1, #64
-;   Inst 19:   csel x1, x5, x0, ne
-;   Inst 20:   csel x0, xzr, x5, ne
-;   Inst 21:   orr x1, x3, x1
-;   Inst 22:   orr x0, x2, x0
-;   Inst 23:   ret
+;   Inst 19:   csel x1, xzr, x5, ne
+;   Inst 20:   csel x0, x5, x0, ne
+;   Inst 21:   orr x3, x3, x0
+;   Inst 22:   orr x0, x2, x1
+;   Inst 23:   mov x1, x3
+;   Inst 24:   ret
 ; }}

 function %f0(i64, i64) -> i64 {
@@ -125,7 +126,7 @@ block0(v0: i128, v1: i128):
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
-;   (instruction range: 0 .. 27)
+;   (instruction range: 0 .. 24)
 ;   Inst 0:   mov x4, x0
 ;   Inst 1:   orr x0, xzr, #128
 ;   Inst 2:   sub x0, x0, x2
@@ -136,8 +137,8 @@ block0(v0: i128, v1: i128):
 ;   Inst 7:   lsr x6, x7, x6
 ;   Inst 8:   orr x5, x5, x6
 ;   Inst 9:   ands xzr, x2, #64
-;   Inst 10:   csel x2, x3, x5, ne
-;   Inst 11:   csel x3, xzr, x3, ne
+;   Inst 10:   csel x2, xzr, x3, ne
+;   Inst 11:   csel x3, x3, x5, ne
 ;   Inst 12:   lsr x5, x4, x0
 ;   Inst 13:   lsr x4, x1, x0
 ;   Inst 14:   orn w6, wzr, w0
@@ -145,14 +146,11 @@ block0(v0: i128, v1: i128):
 ;   Inst 16:   lsl x1, x1, x6
 ;   Inst 17:   orr x1, x5, x1
 ;   Inst 18:   ands xzr, x0, #64
-;   Inst 19:   csel x0, xzr, x4, ne
-;   Inst 20:   csel x1, x4, x1, ne
-;   Inst 21:   orr x1, x3, x1
-;   Inst 22:   orr x0, x2, x0
-;   Inst 23:   mov x2, x0
-;   Inst 24:   mov x0, x1
-;   Inst 25:   mov x1, x2
-;   Inst 26:   ret
+;   Inst 19:   csel x0, x4, x1, ne
+;   Inst 20:   csel x1, xzr, x4, ne
+;   Inst 21:   orr x0, x2, x0
+;   Inst 22:   orr x1, x3, x1
+;   Inst 23:   ret
 ; }}

 function %f4(i64, i64) -> i64 {
--- a/cranelift/filetests/filetests/isa/x64/cmp-mem-bug.clif
+++ b/cranelift/filetests/filetests/isa/x64/cmp-mem-bug.clif
@@ -43,7 +43,7 @@ block0(v0: f64, v1: i64):
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
-;   (instruction range: 0 .. 16)
+;   (instruction range: 0 .. 17)
 ;   Inst 0:   pushq   %rbp
 ;   Inst 1:   movq    %rsp, %rbp
 ;   Inst 2:   movsd   0(%rdi), %xmm1
@@ -54,10 +54,12 @@ block0(v0: f64, v1: i64):
 ;   Inst 7:   andq    $1, %rsi
 ;   Inst 8:   ucomisd %xmm0, %xmm1
 ;   Inst 9:   movaps  %xmm0, %xmm1
-;   Inst 10:   jz $check; movsd %xmm0, %xmm1; $check: jnp $next; movsd %xmm0, %xmm1; $next
-;   Inst 11:   movq    %rsi, %rax
-;   Inst 12:   movaps  %xmm1, %xmm0
-;   Inst 13:   movq    %rbp, %rsp
-;   Inst 14:   popq    %rbp
-;   Inst 15:   ret
+;   Inst 10:   jz $next; movsd %xmm0, %xmm1; $next: 
+;   Inst 11:   jnp $next; movsd %xmm0, %xmm1; $next: 
+;   Inst 12:   movq    %rsi, %rax
+;   Inst 13:   movaps  %xmm1, %xmm0
+;   Inst 14:   movq    %rbp, %rsp
+;   Inst 15:   popq    %rbp
+;   Inst 16:   ret
 ; }}
+
--- a/cranelift/filetests/filetests/isa/x64/i128.clif
+++ b/cranelift/filetests/filetests/isa/x64/i128.clif
@@ -600,57 +600,55 @@ block0(v0: i128):
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
-;   (instruction range: 0 .. 50)
+;   (instruction range: 0 .. 48)
 ;   Inst 0:   pushq   %rbp
 ;   Inst 1:   movq    %rsp, %rbp
-;   Inst 2:   movq    %rsi, %rdx
-;   Inst 3:   movq    %rdi, %rsi
-;   Inst 4:   shrq    $1, %rsi
-;   Inst 5:   movabsq $8608480567731124087, %rcx
-;   Inst 6:   andq    %rcx, %rsi
-;   Inst 7:   movq    %rdi, %rax
-;   Inst 8:   subq    %rsi, %rax
-;   Inst 9:   shrq    $1, %rsi
-;   Inst 10:   andq    %rcx, %rsi
-;   Inst 11:   subq    %rsi, %rax
-;   Inst 12:   shrq    $1, %rsi
-;   Inst 13:   andq    %rcx, %rsi
-;   Inst 14:   subq    %rsi, %rax
-;   Inst 15:   movq    %rax, %rsi
-;   Inst 16:   shrq    $4, %rsi
-;   Inst 17:   addq    %rax, %rsi
-;   Inst 18:   movabsq $1085102592571150095, %rdi
-;   Inst 19:   andq    %rdi, %rsi
-;   Inst 20:   movabsq $72340172838076673, %rdi
-;   Inst 21:   imulq   %rdi, %rsi
-;   Inst 22:   shrq    $56, %rsi
-;   Inst 23:   movq    %rdx, %rax
-;   Inst 24:   shrq    $1, %rax
-;   Inst 25:   movabsq $8608480567731124087, %rcx
-;   Inst 26:   andq    %rcx, %rax
-;   Inst 27:   movq    %rdx, %rdi
-;   Inst 28:   subq    %rax, %rdi
-;   Inst 29:   shrq    $1, %rax
-;   Inst 30:   andq    %rcx, %rax
-;   Inst 31:   subq    %rax, %rdi
-;   Inst 32:   shrq    $1, %rax
-;   Inst 33:   andq    %rcx, %rax
-;   Inst 34:   subq    %rax, %rdi
-;   Inst 35:   movq    %rdi, %rax
-;   Inst 36:   shrq    $4, %rax
-;   Inst 37:   addq    %rdi, %rax
-;   Inst 38:   movabsq $1085102592571150095, %rdi
-;   Inst 39:   andq    %rdi, %rax
-;   Inst 40:   movabsq $72340172838076673, %rdi
-;   Inst 41:   imulq   %rdi, %rax
-;   Inst 42:   shrq    $56, %rax
-;   Inst 43:   addq    %rax, %rsi
-;   Inst 44:   xorq    %rdi, %rdi
-;   Inst 45:   movq    %rsi, %rax
-;   Inst 46:   movq    %rdi, %rdx
-;   Inst 47:   movq    %rbp, %rsp
-;   Inst 48:   popq    %rbp
-;   Inst 49:   ret
+;   Inst 2:   movq    %rdi, %rax
+;   Inst 3:   movq    %rax, %rcx
+;   Inst 4:   shrq    $1, %rcx
+;   Inst 5:   movabsq $8608480567731124087, %rdi
+;   Inst 6:   andq    %rdi, %rcx
+;   Inst 7:   subq    %rcx, %rax
+;   Inst 8:   shrq    $1, %rcx
+;   Inst 9:   andq    %rdi, %rcx
+;   Inst 10:   subq    %rcx, %rax
+;   Inst 11:   shrq    $1, %rcx
+;   Inst 12:   andq    %rdi, %rcx
+;   Inst 13:   subq    %rcx, %rax
+;   Inst 14:   movq    %rax, %rdi
+;   Inst 15:   shrq    $4, %rdi
+;   Inst 16:   addq    %rax, %rdi
+;   Inst 17:   movabsq $1085102592571150095, %rax
+;   Inst 18:   andq    %rax, %rdi
+;   Inst 19:   movabsq $72340172838076673, %rax
+;   Inst 20:   imulq   %rax, %rdi
+;   Inst 21:   shrq    $56, %rdi
+;   Inst 22:   movq    %rsi, %rcx
+;   Inst 23:   shrq    $1, %rcx
+;   Inst 24:   movabsq $8608480567731124087, %rax
+;   Inst 25:   andq    %rax, %rcx
+;   Inst 26:   subq    %rcx, %rsi
+;   Inst 27:   shrq    $1, %rcx
+;   Inst 28:   andq    %rax, %rcx
+;   Inst 29:   subq    %rcx, %rsi
+;   Inst 30:   shrq    $1, %rcx
+;   Inst 31:   andq    %rax, %rcx
+;   Inst 32:   subq    %rcx, %rsi
+;   Inst 33:   movq    %rsi, %rax
+;   Inst 34:   shrq    $4, %rax
+;   Inst 35:   addq    %rsi, %rax
+;   Inst 36:   movabsq $1085102592571150095, %rsi
+;   Inst 37:   andq    %rsi, %rax
+;   Inst 38:   movabsq $72340172838076673, %rsi
+;   Inst 39:   imulq   %rsi, %rax
+;   Inst 40:   shrq    $56, %rax
+;   Inst 41:   addq    %rax, %rdi
+;   Inst 42:   xorq    %rsi, %rsi
+;   Inst 43:   movq    %rdi, %rax
+;   Inst 44:   movq    %rsi, %rdx
+;   Inst 45:   movq    %rbp, %rsp
+;   Inst 46:   popq    %rbp
+;   Inst 47:   ret
 ; }}

 function %f20(i128) -> i128 {
@@ -663,108 +661,97 @@ block0(v0: i128):
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
-;   (instruction range: 0 .. 101)
+;   (instruction range: 0 .. 90)
 ;   Inst 0:   pushq   %rbp
 ;   Inst 1:   movq    %rsp, %rbp
-;   Inst 2:   movq    %rdi, %rcx
-;   Inst 3:   movq    %rcx, %rdi
-;   Inst 4:   movabsq $6148914691236517205, %rax
-;   Inst 5:   shrq    $1, %rdi
-;   Inst 6:   andq    %rax, %rdi
-;   Inst 7:   andq    %rcx, %rax
+;   Inst 2:   movq    %rsi, %rcx
+;   Inst 3:   movabsq $6148914691236517205, %rsi
+;   Inst 4:   movq    %rcx, %rax
+;   Inst 5:   andq    %rsi, %rax
+;   Inst 6:   shrq    $1, %rcx
+;   Inst 7:   andq    %rsi, %rcx
 ;   Inst 8:   shlq    $1, %rax
-;   Inst 9:   movq    %rax, %rcx
-;   Inst 10:   orq     %rdi, %rcx
-;   Inst 11:   movq    %rcx, %rdi
-;   Inst 12:   movabsq $3689348814741910323, %rax
-;   Inst 13:   shrq    $2, %rdi
-;   Inst 14:   andq    %rax, %rdi
-;   Inst 15:   andq    %rcx, %rax
-;   Inst 16:   shlq    $2, %rax
-;   Inst 17:   movq    %rax, %rcx
-;   Inst 18:   orq     %rdi, %rcx
-;   Inst 19:   movq    %rcx, %rdi
-;   Inst 20:   movabsq $1085102592571150095, %rax
-;   Inst 21:   shrq    $4, %rdi
-;   Inst 22:   andq    %rax, %rdi
-;   Inst 23:   andq    %rcx, %rax
-;   Inst 24:   shlq    $4, %rax
+;   Inst 9:   orq     %rcx, %rax
+;   Inst 10:   movabsq $3689348814741910323, %rsi
+;   Inst 11:   movq    %rax, %rcx
+;   Inst 12:   andq    %rsi, %rcx
+;   Inst 13:   shrq    $2, %rax
+;   Inst 14:   andq    %rsi, %rax
+;   Inst 15:   shlq    $2, %rcx
+;   Inst 16:   orq     %rax, %rcx
+;   Inst 17:   movabsq $1085102592571150095, %rsi
+;   Inst 18:   movq    %rcx, %rax
+;   Inst 19:   andq    %rsi, %rax
+;   Inst 20:   shrq    $4, %rcx
+;   Inst 21:   andq    %rsi, %rcx
+;   Inst 22:   shlq    $4, %rax
+;   Inst 23:   orq     %rcx, %rax
+;   Inst 24:   movabsq $71777214294589695, %rsi
 ;   Inst 25:   movq    %rax, %rcx
-;   Inst 26:   orq     %rdi, %rcx
-;   Inst 27:   movq    %rcx, %rdi
-;   Inst 28:   movabsq $71777214294589695, %rax
-;   Inst 29:   shrq    $8, %rdi
-;   Inst 30:   andq    %rax, %rdi
-;   Inst 31:   andq    %rcx, %rax
-;   Inst 32:   shlq    $8, %rax
-;   Inst 33:   movq    %rax, %rcx
-;   Inst 34:   orq     %rdi, %rcx
-;   Inst 35:   movq    %rcx, %rdi
-;   Inst 36:   movabsq $281470681808895, %rax
-;   Inst 37:   shrq    $16, %rdi
-;   Inst 38:   andq    %rax, %rdi
-;   Inst 39:   andq    %rcx, %rax
-;   Inst 40:   shlq    $16, %rax
-;   Inst 41:   orq     %rdi, %rax
-;   Inst 42:   movq    %rax, %rcx
-;   Inst 43:   movl    $-1, %edi
-;   Inst 44:   shrq    $32, %rcx
-;   Inst 45:   andq    %rdi, %rcx
-;   Inst 46:   andq    %rax, %rdi
-;   Inst 47:   shlq    $32, %rdi
-;   Inst 48:   orq     %rcx, %rdi
-;   Inst 49:   movq    %rsi, %rcx
-;   Inst 50:   movq    %rcx, %rsi
-;   Inst 51:   movabsq $6148914691236517205, %rax
-;   Inst 52:   shrq    $1, %rsi
-;   Inst 53:   andq    %rax, %rsi
-;   Inst 54:   andq    %rcx, %rax
-;   Inst 55:   shlq    $1, %rax
-;   Inst 56:   movq    %rax, %rcx
-;   Inst 57:   orq     %rsi, %rcx
-;   Inst 58:   movq    %rcx, %rsi
-;   Inst 59:   movabsq $3689348814741910323, %rax
-;   Inst 60:   shrq    $2, %rsi
-;   Inst 61:   andq    %rax, %rsi
-;   Inst 62:   andq    %rcx, %rax
-;   Inst 63:   shlq    $2, %rax
-;   Inst 64:   movq    %rax, %rcx
-;   Inst 65:   orq     %rsi, %rcx
-;   Inst 66:   movq    %rcx, %rsi
-;   Inst 67:   movabsq $1085102592571150095, %rax
-;   Inst 68:   shrq    $4, %rsi
-;   Inst 69:   andq    %rax, %rsi
-;   Inst 70:   andq    %rcx, %rax
-;   Inst 71:   shlq    $4, %rax
-;   Inst 72:   movq    %rax, %rcx
-;   Inst 73:   orq     %rsi, %rcx
-;   Inst 74:   movq    %rcx, %rsi
-;   Inst 75:   movabsq $71777214294589695, %rax
-;   Inst 76:   shrq    $8, %rsi
-;   Inst 77:   andq    %rax, %rsi
-;   Inst 78:   andq    %rcx, %rax
-;   Inst 79:   shlq    $8, %rax
-;   Inst 80:   movq    %rax, %rcx
-;   Inst 81:   orq     %rsi, %rcx
-;   Inst 82:   movq    %rcx, %rsi
-;   Inst 83:   movabsq $281470681808895, %rax
-;   Inst 84:   shrq    $16, %rsi
-;   Inst 85:   andq    %rax, %rsi
-;   Inst 86:   andq    %rcx, %rax
-;   Inst 87:   shlq    $16, %rax
-;   Inst 88:   orq     %rsi, %rax
-;   Inst 89:   movq    %rax, %rsi
-;   Inst 90:   movl    $-1, %ecx
-;   Inst 91:   shrq    $32, %rsi
-;   Inst 92:   andq    %rcx, %rsi
-;   Inst 93:   andq    %rax, %rcx
-;   Inst 94:   shlq    $32, %rcx
-;   Inst 95:   orq     %rsi, %rcx
-;   Inst 96:   movq    %rcx, %rax
-;   Inst 97:   movq    %rdi, %rdx
-;   Inst 98:   movq    %rbp, %rsp
-;   Inst 99:   popq    %rbp
-;   Inst 100:   ret
+;   Inst 26:   andq    %rsi, %rcx
+;   Inst 27:   shrq    $8, %rax
+;   Inst 28:   andq    %rsi, %rax
+;   Inst 29:   shlq    $8, %rcx
+;   Inst 30:   orq     %rax, %rcx
+;   Inst 31:   movabsq $281470681808895, %rsi
+;   Inst 32:   movq    %rcx, %rax
+;   Inst 33:   andq    %rsi, %rax
+;   Inst 34:   shrq    $16, %rcx
+;   Inst 35:   andq    %rsi, %rcx
+;   Inst 36:   shlq    $16, %rax
+;   Inst 37:   orq     %rcx, %rax
+;   Inst 38:   movabsq $4294967295, %rcx
+;   Inst 39:   movq    %rax, %rsi
+;   Inst 40:   andq    %rcx, %rsi
+;   Inst 41:   shrq    $32, %rax
+;   Inst 42:   shlq    $32, %rsi
+;   Inst 43:   orq     %rax, %rsi
+;   Inst 44:   movabsq $6148914691236517205, %rax
+;   Inst 45:   movq    %rdi, %rcx
+;   Inst 46:   andq    %rax, %rcx
+;   Inst 47:   shrq    $1, %rdi
+;   Inst 48:   andq    %rax, %rdi
+;   Inst 49:   shlq    $1, %rcx
+;   Inst 50:   orq     %rdi, %rcx
+;   Inst 51:   movabsq $3689348814741910323, %rdi
+;   Inst 52:   movq    %rcx, %rax
+;   Inst 53:   andq    %rdi, %rax
+;   Inst 54:   shrq    $2, %rcx
+;   Inst 55:   andq    %rdi, %rcx
+;   Inst 56:   shlq    $2, %rax
+;   Inst 57:   orq     %rcx, %rax
+;   Inst 58:   movabsq $1085102592571150095, %rdi
+;   Inst 59:   movq    %rax, %rcx
+;   Inst 60:   andq    %rdi, %rcx
+;   Inst 61:   shrq    $4, %rax
+;   Inst 62:   andq    %rdi, %rax
+;   Inst 63:   shlq    $4, %rcx
+;   Inst 64:   orq     %rax, %rcx
+;   Inst 65:   movabsq $71777214294589695, %rdi
+;   Inst 66:   movq    %rcx, %rax
+;   Inst 67:   andq    %rdi, %rax
+;   Inst 68:   shrq    $8, %rcx
+;   Inst 69:   andq    %rdi, %rcx
+;   Inst 70:   shlq    $8, %rax
+;   Inst 71:   orq     %rcx, %rax
+;   Inst 72:   movabsq $281470681808895, %rdi
+;   Inst 73:   movq    %rax, %rcx
+;   Inst 74:   andq    %rdi, %rcx
+;   Inst 75:   shrq    $16, %rax
+;   Inst 76:   andq    %rdi, %rax
+;   Inst 77:   shlq    $16, %rcx
+;   Inst 78:   orq     %rax, %rcx
+;   Inst 79:   movabsq $4294967295, %rax
+;   Inst 80:   movq    %rcx, %rdi
+;   Inst 81:   andq    %rax, %rdi
+;   Inst 82:   shrq    $32, %rcx
+;   Inst 83:   shlq    $32, %rdi
+;   Inst 84:   orq     %rcx, %rdi
+;   Inst 85:   movq    %rsi, %rax
+;   Inst 86:   movq    %rdi, %rdx
+;   Inst 87:   movq    %rbp, %rsp
+;   Inst 88:   popq    %rbp
+;   Inst 89:   ret
 ; }}

 function %f21(i128, i64) {
@@ -1020,11 +1007,11 @@ block0(v0: i128):
 ;   Inst 4:   cmovzq  %rcx, %rax
 ;   Inst 5:   movl    $63, %esi
 ;   Inst 6:   subq    %rax, %rsi
-;   Inst 7:   movabsq $-1, %rcx
-;   Inst 8:   bsrq    %rdi, %rax
-;   Inst 9:   cmovzq  %rcx, %rax
+;   Inst 7:   movabsq $-1, %rax
+;   Inst 8:   bsrq    %rdi, %rcx
+;   Inst 9:   cmovzq  %rax, %rcx
 ;   Inst 10:   movl    $63, %edi
-;   Inst 11:   subq    %rax, %rdi
+;   Inst 11:   subq    %rcx, %rdi
 ;   Inst 12:   addq    $64, %rdi
 ;   Inst 13:   cmpq    $64, %rsi
 ;   Inst 14:   cmovnzq %rsi, %rdi
@@ -1098,7 +1085,7 @@ block0(v0: i128, v1: i128):
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
-;   (instruction range: 0 .. 25)
+;   (instruction range: 0 .. 24)
 ;   Inst 0:   pushq   %rbp
 ;   Inst 1:   movq    %rsp, %rbp
 ;   Inst 2:   movq    %rdi, %rax
@@ -1116,14 +1103,13 @@ block0(v0: i128, v1: i128):
 ;   Inst 14:   cmovzq  %rcx, %rax
 ;   Inst 15:   orq     %rdi, %rax
 ;   Inst 16:   testq   $64, %rdx
-;   Inst 17:   movq    %rsi, %rdi
-;   Inst 18:   cmovzq  %rax, %rdi
-;   Inst 19:   cmovzq  %rsi, %rcx
-;   Inst 20:   movq    %rcx, %rax
-;   Inst 21:   movq    %rdi, %rdx
-;   Inst 22:   movq    %rbp, %rsp
-;   Inst 23:   popq    %rbp
-;   Inst 24:   ret
+;   Inst 17:   cmovzq  %rsi, %rcx
+;   Inst 18:   cmovzq  %rax, %rsi
+;   Inst 19:   movq    %rcx, %rax
+;   Inst 20:   movq    %rsi, %rdx
+;   Inst 21:   movq    %rbp, %rsp
+;   Inst 22:   popq    %rbp
+;   Inst 23:   ret
 ; }}

 function %f31(i128, i128) -> i128 {
@@ -1136,7 +1122,7 @@ block0(v0: i128, v1: i128):
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
-;   (instruction range: 0 .. 24)
+;   (instruction range: 0 .. 25)
 ;   Inst 0:   pushq   %rbp
 ;   Inst 1:   movq    %rsp, %rbp
 ;   Inst 2:   movq    %rsi, %rax
@@ -1152,15 +1138,16 @@ block0(v0: i128, v1: i128):
 ;   Inst 12:   testq   $127, %rdx
 ;   Inst 13:   cmovzq  %rcx, %rax
 ;   Inst 14:   orq     %rdi, %rax
-;   Inst 15:   xorq    %rdi, %rdi
+;   Inst 15:   xorq    %rcx, %rcx
 ;   Inst 16:   testq   $64, %rdx
-;   Inst 17:   cmovzq  %rsi, %rdi
-;   Inst 18:   cmovzq  %rax, %rsi
-;   Inst 19:   movq    %rsi, %rax
-;   Inst 20:   movq    %rdi, %rdx
-;   Inst 21:   movq    %rbp, %rsp
-;   Inst 22:   popq    %rbp
-;   Inst 23:   ret
+;   Inst 17:   movq    %rsi, %rdi
+;   Inst 18:   cmovzq  %rax, %rdi
+;   Inst 19:   cmovzq  %rsi, %rcx
+;   Inst 20:   movq    %rdi, %rax
+;   Inst 21:   movq    %rcx, %rdx
+;   Inst 22:   movq    %rbp, %rsp
+;   Inst 23:   popq    %rbp
+;   Inst 24:   ret
 ; }}

 function %f32(i128, i128) -> i128 {
@@ -1173,7 +1160,7 @@ block0(v0: i128, v1: i128):
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
-;   (instruction range: 0 .. 25)
+;   (instruction range: 0 .. 26)
 ;   Inst 0:   pushq   %rbp
 ;   Inst 1:   movq    %rsp, %rbp
 ;   Inst 2:   movq    %rdi, %rax
@@ -1192,13 +1179,14 @@ block0(v0: i128, v1: i128):
 ;   Inst 15:   orq     %r8, %rax
 ;   Inst 16:   sarq    $63, %rsi
 ;   Inst 17:   testq   $64, %rdx
-;   Inst 18:   cmovzq  %rdi, %rsi
-;   Inst 19:   cmovzq  %rax, %rdi
-;   Inst 20:   movq    %rdi, %rax
-;   Inst 21:   movq    %rsi, %rdx
-;   Inst 22:   movq    %rbp, %rsp
-;   Inst 23:   popq    %rbp
-;   Inst 24:   ret
+;   Inst 18:   movq    %rdi, %rcx
+;   Inst 19:   cmovzq  %rax, %rcx
+;   Inst 20:   cmovzq  %rdi, %rsi
+;   Inst 21:   movq    %rcx, %rax
+;   Inst 22:   movq    %rsi, %rdx
+;   Inst 23:   movq    %rbp, %rsp
+;   Inst 24:   popq    %rbp
+;   Inst 25:   ret
 ; }}

 function %f33(i128, i128) -> i128 {
@@ -1211,27 +1199,27 @@ block0(v0: i128, v1: i128):
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
-;   (instruction range: 0 .. 46)
+;   (instruction range: 0 .. 48)
 ;   Inst 0:   pushq   %rbp
 ;   Inst 1:   movq    %rsp, %rbp
-;   Inst 2:   movq    %rdi, %r9
+;   Inst 2:   movq    %rdi, %rax
 ;   Inst 3:   movq    %rdx, %rcx
-;   Inst 4:   shlq    %cl, %r9
-;   Inst 5:   movq    %rsi, %rax
+;   Inst 4:   shlq    %cl, %rax
+;   Inst 5:   movq    %rsi, %r8
 ;   Inst 6:   movq    %rdx, %rcx
-;   Inst 7:   shlq    %cl, %rax
+;   Inst 7:   shlq    %cl, %r8
 ;   Inst 8:   movl    $64, %ecx
 ;   Inst 9:   subq    %rdx, %rcx
-;   Inst 10:   movq    %rdi, %r10
-;   Inst 11:   shrq    %cl, %r10
-;   Inst 12:   xorq    %r8, %r8
+;   Inst 10:   movq    %rdi, %r9
+;   Inst 11:   shrq    %cl, %r9
+;   Inst 12:   xorq    %rcx, %rcx
 ;   Inst 13:   testq   $127, %rdx
-;   Inst 14:   cmovzq  %r8, %r10
-;   Inst 15:   orq     %rax, %r10
+;   Inst 14:   cmovzq  %rcx, %r9
+;   Inst 15:   orq     %r8, %r9
 ;   Inst 16:   testq   $64, %rdx
-;   Inst 17:   movq    %r9, %rax
-;   Inst 18:   cmovzq  %r10, %rax
-;   Inst 19:   cmovzq  %r9, %r8
+;   Inst 17:   movq    %rcx, %r8
+;   Inst 18:   cmovzq  %rax, %r8
+;   Inst 19:   cmovzq  %r9, %rax
 ;   Inst 20:   movl    $128, %r9d
 ;   Inst 21:   subq    %rdx, %r9
 ;   Inst 22:   movq    %rdi, %rdx
@@ -1247,17 +1235,19 @@ block0(v0: i128, v1: i128):
 ;   Inst 32:   testq   $127, %r9
 ;   Inst 33:   cmovzq  %rcx, %rsi
 ;   Inst 34:   orq     %rdx, %rsi
-;   Inst 35:   xorq    %rcx, %rcx
+;   Inst 35:   xorq    %rdx, %rdx
 ;   Inst 36:   testq   $64, %r9
-;   Inst 37:   cmovzq  %rdi, %rcx
-;   Inst 38:   cmovzq  %rsi, %rdi
-;   Inst 39:   orq     %rdi, %r8
-;   Inst 40:   orq     %rcx, %rax
-;   Inst 41:   movq    %rax, %rdx
-;   Inst 42:   movq    %r8, %rax
-;   Inst 43:   movq    %rbp, %rsp
-;   Inst 44:   popq    %rbp
-;   Inst 45:   ret
+;   Inst 37:   movq    %rdi, %rcx
+;   Inst 38:   cmovzq  %rsi, %rcx
+;   Inst 39:   movq    %rdx, %rsi
+;   Inst 40:   cmovzq  %rdi, %rsi
+;   Inst 41:   orq     %rcx, %r8
+;   Inst 42:   orq     %rsi, %rax
+;   Inst 43:   movq    %rax, %rdx
+;   Inst 44:   movq    %r8, %rax
+;   Inst 45:   movq    %rbp, %rsp
+;   Inst 46:   popq    %rbp
+;   Inst 47:   ret
 ; }}

 function %f34(i128, i128) -> i128 {
@@ -1270,52 +1260,51 @@ block0(v0: i128, v1: i128):
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
-;   (instruction range: 0 .. 46)
+;   (instruction range: 0 .. 45)
 ;   Inst 0:   pushq   %rbp
 ;   Inst 1:   movq    %rsp, %rbp
 ;   Inst 2:   movq    %rdi, %rax
 ;   Inst 3:   movq    %rdx, %rcx
 ;   Inst 4:   shrq    %cl, %rax
-;   Inst 5:   movq    %rsi, %r8
+;   Inst 5:   movq    %rsi, %r9
 ;   Inst 6:   movq    %rdx, %rcx
-;   Inst 7:   shrq    %cl, %r8
+;   Inst 7:   shrq    %cl, %r9
 ;   Inst 8:   movl    $64, %ecx
 ;   Inst 9:   subq    %rdx, %rcx
-;   Inst 10:   movq    %rsi, %r9
-;   Inst 11:   shlq    %cl, %r9
+;   Inst 10:   movq    %rsi, %r8
+;   Inst 11:   shlq    %cl, %r8
 ;   Inst 12:   xorq    %rcx, %rcx
 ;   Inst 13:   testq   $127, %rdx
-;   Inst 14:   cmovzq  %rcx, %r9
-;   Inst 15:   movq    %r9, %rcx
-;   Inst 16:   orq     %rax, %rcx
-;   Inst 17:   xorq    %rax, %rax
-;   Inst 18:   testq   $64, %rdx
+;   Inst 14:   cmovzq  %rcx, %r8
+;   Inst 15:   orq     %rax, %r8
+;   Inst 16:   xorq    %rcx, %rcx
+;   Inst 17:   testq   $64, %rdx
+;   Inst 18:   movq    %r9, %rax
 ;   Inst 19:   cmovzq  %r8, %rax
-;   Inst 20:   cmovzq  %rcx, %r8
-;   Inst 21:   movl    $128, %r9d
-;   Inst 22:   subq    %rdx, %r9
-;   Inst 23:   movq    %rdi, %rdx
-;   Inst 24:   movq    %r9, %rcx
-;   Inst 25:   shlq    %cl, %rdx
-;   Inst 26:   movq    %r9, %rcx
-;   Inst 27:   shlq    %cl, %rsi
-;   Inst 28:   movl    $64, %ecx
-;   Inst 29:   subq    %r9, %rcx
-;   Inst 30:   shrq    %cl, %rdi
-;   Inst 31:   xorq    %rcx, %rcx
-;   Inst 32:   testq   $127, %r9
-;   Inst 33:   cmovzq  %rcx, %rdi
-;   Inst 34:   orq     %rsi, %rdi
-;   Inst 35:   testq   $64, %r9
-;   Inst 36:   movq    %rdx, %rsi
-;   Inst 37:   cmovzq  %rdi, %rsi
-;   Inst 38:   cmovzq  %rdx, %rcx
-;   Inst 39:   orq     %rcx, %r8
-;   Inst 40:   orq     %rsi, %rax
-;   Inst 41:   movq    %rax, %rdx
-;   Inst 42:   movq    %r8, %rax
-;   Inst 43:   movq    %rbp, %rsp
-;   Inst 44:   popq    %rbp
-;   Inst 45:   ret
+;   Inst 20:   movq    %rcx, %r8
+;   Inst 21:   cmovzq  %r9, %r8
+;   Inst 22:   movl    $128, %r9d
+;   Inst 23:   subq    %rdx, %r9
+;   Inst 24:   movq    %rdi, %rdx
+;   Inst 25:   movq    %r9, %rcx
+;   Inst 26:   shlq    %cl, %rdx
+;   Inst 27:   movq    %r9, %rcx
+;   Inst 28:   shlq    %cl, %rsi
+;   Inst 29:   movl    $64, %ecx
+;   Inst 30:   subq    %r9, %rcx
+;   Inst 31:   shrq    %cl, %rdi
+;   Inst 32:   xorq    %rcx, %rcx
+;   Inst 33:   testq   $127, %r9
+;   Inst 34:   cmovzq  %rcx, %rdi
+;   Inst 35:   orq     %rsi, %rdi
+;   Inst 36:   testq   $64, %r9
+;   Inst 37:   cmovzq  %rdx, %rcx
+;   Inst 38:   cmovzq  %rdi, %rdx
+;   Inst 39:   orq     %rcx, %rax
+;   Inst 40:   orq     %rdx, %r8
+;   Inst 41:   movq    %r8, %rdx
+;   Inst 42:   movq    %rbp, %rsp
+;   Inst 43:   popq    %rbp
+;   Inst 44:   ret
 ; }}

--- a/cranelift/filetests/filetests/isa/x64/popcnt.clif
+++ b/cranelift/filetests/filetests/isa/x64/popcnt.clif
@@ -14,17 +14,17 @@ block0(v0: i64):
 ;   (instruction range: 0 .. 25)
 ;   Inst 0:   pushq   %rbp
 ;   Inst 1:   movq    %rsp, %rbp
-;   Inst 2:   movq    %rdi, %rsi
-;   Inst 3:   shrq    $1, %rsi
-;   Inst 4:   movabsq $8608480567731124087, %rax
-;   Inst 5:   andq    %rax, %rsi
-;   Inst 6:   subq    %rsi, %rdi
-;   Inst 7:   shrq    $1, %rsi
-;   Inst 8:   andq    %rax, %rsi
-;   Inst 9:   subq    %rsi, %rdi
-;   Inst 10:   shrq    $1, %rsi
-;   Inst 11:   andq    %rax, %rsi
-;   Inst 12:   subq    %rsi, %rdi
+;   Inst 2:   movq    %rdi, %rax
+;   Inst 3:   shrq    $1, %rax
+;   Inst 4:   movabsq $8608480567731124087, %rsi
+;   Inst 5:   andq    %rsi, %rax
+;   Inst 6:   subq    %rax, %rdi
+;   Inst 7:   shrq    $1, %rax
+;   Inst 8:   andq    %rsi, %rax
+;   Inst 9:   subq    %rax, %rdi
+;   Inst 10:   shrq    $1, %rax
+;   Inst 11:   andq    %rsi, %rax
+;   Inst 12:   subq    %rax, %rdi
 ;   Inst 13:   movq    %rdi, %rsi
 ;   Inst 14:   shrq    $4, %rsi
 ;   Inst 15:   addq    %rdi, %rsi
@@ -54,17 +54,17 @@ block0(v0: i64):
 ;   Inst 0:   pushq   %rbp
 ;   Inst 1:   movq    %rsp, %rbp
 ;   Inst 2:   movq    0(%rdi), %rdi
-;   Inst 3:   movq    %rdi, %rsi
-;   Inst 4:   shrq    $1, %rsi
-;   Inst 5:   movabsq $8608480567731124087, %rax
-;   Inst 6:   andq    %rax, %rsi
-;   Inst 7:   subq    %rsi, %rdi
-;   Inst 8:   shrq    $1, %rsi
-;   Inst 9:   andq    %rax, %rsi
-;   Inst 10:   subq    %rsi, %rdi
-;   Inst 11:   shrq    $1, %rsi
-;   Inst 12:   andq    %rax, %rsi
-;   Inst 13:   subq    %rsi, %rdi
+;   Inst 3:   movq    %rdi, %rax
+;   Inst 4:   shrq    $1, %rax
+;   Inst 5:   movabsq $8608480567731124087, %rsi
+;   Inst 6:   andq    %rsi, %rax
+;   Inst 7:   subq    %rax, %rdi
+;   Inst 8:   shrq    $1, %rax
+;   Inst 9:   andq    %rsi, %rax
+;   Inst 10:   subq    %rax, %rdi
+;   Inst 11:   shrq    $1, %rax
+;   Inst 12:   andq    %rsi, %rax
+;   Inst 13:   subq    %rax, %rdi
 ;   Inst 14:   movq    %rdi, %rsi
 ;   Inst 15:   shrq    $4, %rsi
 ;   Inst 16:   addq    %rdi, %rsi
@@ -89,29 +89,30 @@ block0(v0: i32):
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
-;   (instruction range: 0 .. 22)
+;   (instruction range: 0 .. 23)
 ;   Inst 0:   pushq   %rbp
 ;   Inst 1:   movq    %rsp, %rbp
-;   Inst 2:   movq    %rdi, %rsi
-;   Inst 3:   shrl    $1, %esi
-;   Inst 4:   andl    $2004318071, %esi
-;   Inst 5:   subl    %esi, %edi
-;   Inst 6:   shrl    $1, %esi
-;   Inst 7:   andl    $2004318071, %esi
-;   Inst 8:   subl    %esi, %edi
-;   Inst 9:   shrl    $1, %esi
-;   Inst 10:   andl    $2004318071, %esi
-;   Inst 11:   subl    %esi, %edi
-;   Inst 12:   movq    %rdi, %rsi
-;   Inst 13:   shrl    $4, %esi
-;   Inst 14:   addl    %edi, %esi
-;   Inst 15:   andl    $252645135, %esi
-;   Inst 16:   imull   $16843009, %esi
-;   Inst 17:   shrl    $24, %esi
-;   Inst 18:   movq    %rsi, %rax
-;   Inst 19:   movq    %rbp, %rsp
-;   Inst 20:   popq    %rbp
-;   Inst 21:   ret
+;   Inst 2:   movq    %rdi, %rax
+;   Inst 3:   shrl    $1, %eax
+;   Inst 4:   movl    $2004318071, %esi
+;   Inst 5:   andl    %esi, %eax
+;   Inst 6:   subl    %eax, %edi
+;   Inst 7:   shrl    $1, %eax
+;   Inst 8:   andl    %esi, %eax
+;   Inst 9:   subl    %eax, %edi
+;   Inst 10:   shrl    $1, %eax
+;   Inst 11:   andl    %esi, %eax
+;   Inst 12:   subl    %eax, %edi
+;   Inst 13:   movq    %rdi, %rsi
+;   Inst 14:   shrl    $4, %esi
+;   Inst 15:   addl    %edi, %esi
+;   Inst 16:   andl    $252645135, %esi
+;   Inst 17:   imull   $16843009, %esi
+;   Inst 18:   shrl    $24, %esi
+;   Inst 19:   movq    %rsi, %rax
+;   Inst 20:   movq    %rbp, %rsp
+;   Inst 21:   popq    %rbp
+;   Inst 22:   ret
 ; }}

 function %popcnt32load(i64) -> i32 {
@@ -125,29 +126,30 @@ block0(v0: i64):
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
-;   (instruction range: 0 .. 23)
+;   (instruction range: 0 .. 24)
 ;   Inst 0:   pushq   %rbp
 ;   Inst 1:   movq    %rsp, %rbp
 ;   Inst 2:   movl    0(%rdi), %edi
-;   Inst 3:   movq    %rdi, %rsi
-;   Inst 4:   shrl    $1, %esi
-;   Inst 5:   andl    $2004318071, %esi
-;   Inst 6:   subl    %esi, %edi
-;   Inst 7:   shrl    $1, %esi
-;   Inst 8:   andl    $2004318071, %esi
-;   Inst 9:   subl    %esi, %edi
-;   Inst 10:   shrl    $1, %esi
-;   Inst 11:   andl    $2004318071, %esi
-;   Inst 12:   subl    %esi, %edi
-;   Inst 13:   movq    %rdi, %rsi
-;   Inst 14:   shrl    $4, %esi
-;   Inst 15:   addl    %edi, %esi
-;   Inst 16:   andl    $252645135, %esi
-;   Inst 17:   imull   $16843009, %esi
-;   Inst 18:   shrl    $24, %esi
-;   Inst 19:   movq    %rsi, %rax
-;   Inst 20:   movq    %rbp, %rsp
-;   Inst 21:   popq    %rbp
-;   Inst 22:   ret
+;   Inst 3:   movq    %rdi, %rax
+;   Inst 4:   shrl    $1, %eax
+;   Inst 5:   movl    $2004318071, %esi
+;   Inst 6:   andl    %esi, %eax
+;   Inst 7:   subl    %eax, %edi
+;   Inst 8:   shrl    $1, %eax
+;   Inst 9:   andl    %esi, %eax
+;   Inst 10:   subl    %eax, %edi
+;   Inst 11:   shrl    $1, %eax
+;   Inst 12:   andl    %esi, %eax
+;   Inst 13:   subl    %eax, %edi
+;   Inst 14:   movq    %rdi, %rsi
+;   Inst 15:   shrl    $4, %esi
+;   Inst 16:   addl    %edi, %esi
+;   Inst 17:   andl    $252645135, %esi
+;   Inst 18:   imull   $16843009, %esi
+;   Inst 19:   shrl    $24, %esi
+;   Inst 20:   movq    %rsi, %rax
+;   Inst 21:   movq    %rbp, %rsp
+;   Inst 22:   popq    %rbp
+;   Inst 23:   ret
 ; }}