peepmatic: Make peepmatic optional to enable

Rather than outright replacing parts of our existing peephole optimization
passes, this makes peepmatic an optional cargo feature that can be enabled.
This lets us take a conservative approach to enabling peepmatic everywhere,
while still getting it in-tree and making it easier to collaborate on
improving it quickly.
Nick Fitzgerald
2020-05-08 12:15:23 -07:00
parent 6e135b3aea
commit 52c6ece5f3
25 changed files with 2284 additions and 100 deletions
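
As a minimal sketch of what opting in looks like (the feature names below come
from this commit; the dependency version is a placeholder), a crate depending
on cranelift-codegen would enable the feature in its Cargo.toml:

    [dependencies]
    # "enable-peepmatic" pulls in peepmatic-runtime and switches the
    # preopt pass over to the peepmatic-generated peephole optimizer.
    cranelift-codegen = { version = "*", features = ["enable-peepmatic"] }

Or, since the top-level crate forwards the feature to cranelift-codegen and
cranelift-filetests, it can be passed on the command line when building this
repository directly:

    cargo build --features enable-peepmatic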

View File

@@ -47,4 +47,5 @@ walkdir = "2.2"
[features]
default = ["disas", "wasm", "cranelift-codegen/all-arch"]
disas = ["capstone"]
enable-peepmatic = ["cranelift-codegen/enable-peepmatic", "cranelift-filetests/enable-peepmatic"]
wasm = ["wat", "cranelift-wasm"]

View File

@@ -24,7 +24,7 @@ gimli = { version = "0.20.0", default-features = false, features = ["write"], op
smallvec = { version = "1.0.0" }
thiserror = "1.0.4"
byteorder = { version = "1.3.2", default-features = false }
peepmatic-runtime = { path = "../peepmatic/crates/runtime" }
peepmatic-runtime = { path = "../peepmatic/crates/runtime", optional = true }
regalloc = "0.0.23"
# It is a goal of the cranelift-codegen crate to have minimal external dependencies.
# Please don't add any unless they are essential to the task of creating binary
@@ -74,9 +74,12 @@ all-arch = [
# For dependent crates that want to serialize some parts of cranelift
enable-serde = ["serde"]
# Recompile our optimizations that are written in the peepmatic DSL into a
# Recompile our optimizations that are written in the `peepmatic` DSL into a
# compact finite-state transducer automaton.
rebuild-peephole-optimizers = ["peepmatic"]
# Enable the use of `peepmatic`-generated peephole optimizers.
enable-peepmatic = ["peepmatic-runtime"]
[badges]
maintenance = { status = "experimental" }

View File

@@ -101,7 +101,6 @@ mod licm;
mod nan_canonicalization;
mod num_uses;
mod partition_slice;
mod peepmatic;
mod postopt;
mod predicates;
mod redundant_reload_remover;
@@ -116,6 +115,9 @@ mod topo_order;
mod unreachable_code;
mod value_label;
#[cfg(feature = "enable-peepmatic")]
mod peepmatic;
pub use crate::result::{CodegenError, CodegenResult};
/// Version number of this crate.

View File

@@ -15,7 +15,6 @@ use crate::ir::{
Block, DataFlowGraph, Function, Inst, InstBuilder, InstructionData, Type, Value,
};
use crate::isa::TargetIsa;
use crate::peepmatic::ValueOrInst;
use crate::timing;
#[inline]
@@ -182,8 +181,12 @@ fn do_divrem_transformation(divrem_info: &DivRemByConstInfo, pos: &mut FuncCurso
// U32 div by 1: identity
// U32 rem by 1: zero
DivRemByConstInfo::DivU32(_, 1) | DivRemByConstInfo::RemU32(_, 1) => {
unreachable!("unsigned division and remainder by one is handled in `preopt.peepmatic`");
DivRemByConstInfo::DivU32(n1, 1) | DivRemByConstInfo::RemU32(n1, 1) => {
if is_rem {
pos.func.dfg.replace(inst).iconst(I32, 0);
} else {
replace_single_result_with_alias(&mut pos.func.dfg, inst, n1);
}
}
// U32 div, rem by a power-of-2
@@ -198,10 +201,7 @@ fn do_divrem_transformation(divrem_info: &DivRemByConstInfo, pos: &mut FuncCurso
let mask = (1u64 << k) - 1;
pos.func.dfg.replace(inst).band_imm(n1, mask as i64);
} else {
unreachable!(
"unsigned division by a power of two is handled in \
`preopt.peepmatic`"
);
pos.func.dfg.replace(inst).ushr_imm(n1, k as i64);
}
}
@@ -251,8 +251,12 @@ fn do_divrem_transformation(divrem_info: &DivRemByConstInfo, pos: &mut FuncCurso
// U64 div by 1: identity
// U64 rem by 1: zero
DivRemByConstInfo::DivU64(_, 1) | DivRemByConstInfo::RemU64(_, 1) => {
unreachable!("unsigned division and remainder by one is handled in `preopt.peepmatic`");
DivRemByConstInfo::DivU64(n1, 1) | DivRemByConstInfo::RemU64(n1, 1) => {
if is_rem {
pos.func.dfg.replace(inst).iconst(I64, 0);
} else {
replace_single_result_with_alias(&mut pos.func.dfg, inst, n1);
}
}
// U64 div, rem by a power-of-2
@@ -267,9 +271,7 @@ fn do_divrem_transformation(divrem_info: &DivRemByConstInfo, pos: &mut FuncCurso
let mask = (1u64 << k) - 1;
pos.func.dfg.replace(inst).band_imm(n1, mask as i64);
} else {
unreachable!(
"unsigned division by a power of two is handled in `preopt.peepmatic`"
);
pos.func.dfg.replace(inst).ushr_imm(n1, k as i64);
}
}
@@ -322,8 +324,12 @@ fn do_divrem_transformation(divrem_info: &DivRemByConstInfo, pos: &mut FuncCurso
// S32 div by 1: identity
// S32 rem by 1: zero
DivRemByConstInfo::DivS32(_, 1) | DivRemByConstInfo::RemS32(_, 1) => {
unreachable!("signed division and remainder by one is handled in `preopt.peepmatic`");
DivRemByConstInfo::DivS32(n1, 1) | DivRemByConstInfo::RemS32(n1, 1) => {
if is_rem {
pos.func.dfg.replace(inst).iconst(I32, 0);
} else {
replace_single_result_with_alias(&mut pos.func.dfg, inst, n1);
}
}
DivRemByConstInfo::DivS32(n1, d) | DivRemByConstInfo::RemS32(n1, d) => {
@@ -393,8 +399,12 @@ fn do_divrem_transformation(divrem_info: &DivRemByConstInfo, pos: &mut FuncCurso
// S64 div by 1: identity
// S64 rem by 1: zero
DivRemByConstInfo::DivS64(_, 1) | DivRemByConstInfo::RemS64(_, 1) => {
unreachable!("division and remaineder by one are handled in `preopt.peepmatic`");
DivRemByConstInfo::DivS64(n1, 1) | DivRemByConstInfo::RemS64(n1, 1) => {
if is_rem {
pos.func.dfg.replace(inst).iconst(I64, 0);
} else {
replace_single_result_with_alias(&mut pos.func.dfg, inst, n1);
}
}
DivRemByConstInfo::DivS64(n1, d) | DivRemByConstInfo::RemS64(n1, d) => {
@@ -598,6 +608,416 @@ fn branch_order(pos: &mut FuncCursor, cfg: &mut ControlFlowGraph, block: Block,
cfg.recompute_block(pos.func, block);
}
#[cfg(feature = "enable-peepmatic")]
mod simplify {
use super::*;
use crate::peepmatic::ValueOrInst;
pub type PeepholeOptimizer<'a, 'b> =
peepmatic_runtime::optimizer::PeepholeOptimizer<'static, 'a, &'b dyn TargetIsa>;
pub fn peephole_optimizer<'a, 'b>(isa: &'b dyn TargetIsa) -> PeepholeOptimizer<'a, 'b> {
crate::peepmatic::preopt(isa)
}
pub fn apply_all<'a, 'b>(
optimizer: &mut PeepholeOptimizer<'a, 'b>,
pos: &mut FuncCursor<'a>,
inst: Inst,
_native_word_width: u32,
) {
// After we apply one optimization, that might make another
// optimization applicable. Keep running the peephole optimizer
// until either:
//
// * No optimization applied, and therefore it doesn't make sense to
// try again, because no optimization will apply again.
//
// * Or when we replaced an instruction with an alias to an existing
// value, because we already ran the peephole optimizer over the
// aliased value's instruction in an early part of the traversal
// over the function.
while let Some(ValueOrInst::Inst(new_inst)) =
optimizer.apply_one(pos, ValueOrInst::Inst(inst))
{
// We transplanted a new instruction into the current
// instruction, so the "new" instruction is actually the same
// one, just with different data.
debug_assert_eq!(new_inst, inst);
}
debug_assert_eq!(pos.current_inst(), Some(inst));
}
}
#[cfg(not(feature = "enable-peepmatic"))]
mod simplify {
use super::*;
use crate::ir::{
dfg::ValueDef,
immediates,
instructions::{Opcode, ValueList},
types::{I16, I32, I8},
};
use std::marker::PhantomData;
pub struct PeepholeOptimizer<'a, 'b> {
phantom: PhantomData<(&'a (), &'b ())>,
}
pub fn peephole_optimizer<'a, 'b>(_: &dyn TargetIsa) -> PeepholeOptimizer<'a, 'b> {
PeepholeOptimizer {
phantom: PhantomData,
}
}
pub fn apply_all<'a, 'b>(
_optimizer: &mut PeepholeOptimizer<'a, 'b>,
pos: &mut FuncCursor<'a>,
inst: Inst,
native_word_width: u32,
) {
simplify(pos, inst, native_word_width);
branch_opt(pos, inst);
}
#[inline]
fn resolve_imm64_value(dfg: &DataFlowGraph, value: Value) -> Option<immediates::Imm64> {
if let ValueDef::Result(candidate_inst, _) = dfg.value_def(value) {
if let InstructionData::UnaryImm {
opcode: Opcode::Iconst,
imm,
} = dfg[candidate_inst]
{
return Some(imm);
}
}
None
}
/// Try to transform [(x << N) >> N] into a (un)signed-extending move.
/// Returns true if the final instruction has been converted to such a move.
fn try_fold_extended_move(
pos: &mut FuncCursor,
inst: Inst,
opcode: Opcode,
arg: Value,
imm: immediates::Imm64,
) -> bool {
if let ValueDef::Result(arg_inst, _) = pos.func.dfg.value_def(arg) {
if let InstructionData::BinaryImm {
opcode: Opcode::IshlImm,
arg: prev_arg,
imm: prev_imm,
} = &pos.func.dfg[arg_inst]
{
if imm != *prev_imm {
return false;
}
let dest_ty = pos.func.dfg.ctrl_typevar(inst);
if dest_ty != pos.func.dfg.ctrl_typevar(arg_inst) || !dest_ty.is_int() {
return false;
}
let imm_bits: i64 = imm.into();
let ireduce_ty = match (dest_ty.lane_bits() as i64).wrapping_sub(imm_bits) {
8 => I8,
16 => I16,
32 => I32,
_ => return false,
};
let ireduce_ty = ireduce_ty.by(dest_ty.lane_count()).unwrap();
// This becomes a no-op, since ireduce_ty has a smaller lane width than
// the argument type (also the destination type).
let arg = *prev_arg;
let narrower_arg = pos.ins().ireduce(ireduce_ty, arg);
if opcode == Opcode::UshrImm {
pos.func.dfg.replace(inst).uextend(dest_ty, narrower_arg);
} else {
pos.func.dfg.replace(inst).sextend(dest_ty, narrower_arg);
}
return true;
}
}
false
}
/// Apply basic simplifications.
///
/// This folds constants with arithmetic to form `_imm` instructions, and other minor
/// simplifications.
///
/// Doesn't apply some simplifications if the native word width (in bytes) is smaller than the
/// controlling type's width of the instruction. This would result in an illegal instruction that
/// would likely be expanded back into an instruction on smaller types with the same initial
/// opcode, creating unnecessary churn.
fn simplify(pos: &mut FuncCursor, inst: Inst, native_word_width: u32) {
match pos.func.dfg[inst] {
InstructionData::Binary { opcode, args } => {
if let Some(mut imm) = resolve_imm64_value(&pos.func.dfg, args[1]) {
let new_opcode = match opcode {
Opcode::Iadd => Opcode::IaddImm,
Opcode::Imul => Opcode::ImulImm,
Opcode::Sdiv => Opcode::SdivImm,
Opcode::Udiv => Opcode::UdivImm,
Opcode::Srem => Opcode::SremImm,
Opcode::Urem => Opcode::UremImm,
Opcode::Band => Opcode::BandImm,
Opcode::Bor => Opcode::BorImm,
Opcode::Bxor => Opcode::BxorImm,
Opcode::Rotl => Opcode::RotlImm,
Opcode::Rotr => Opcode::RotrImm,
Opcode::Ishl => Opcode::IshlImm,
Opcode::Ushr => Opcode::UshrImm,
Opcode::Sshr => Opcode::SshrImm,
Opcode::Isub => {
imm = imm.wrapping_neg();
Opcode::IaddImm
}
Opcode::Ifcmp => Opcode::IfcmpImm,
_ => return,
};
let ty = pos.func.dfg.ctrl_typevar(inst);
if ty.bytes() <= native_word_width {
pos.func
.dfg
.replace(inst)
.BinaryImm(new_opcode, ty, imm, args[0]);
// Repeat for BinaryImm simplification.
simplify(pos, inst, native_word_width);
}
} else if let Some(imm) = resolve_imm64_value(&pos.func.dfg, args[0]) {
let new_opcode = match opcode {
Opcode::Iadd => Opcode::IaddImm,
Opcode::Imul => Opcode::ImulImm,
Opcode::Band => Opcode::BandImm,
Opcode::Bor => Opcode::BorImm,
Opcode::Bxor => Opcode::BxorImm,
Opcode::Isub => Opcode::IrsubImm,
_ => return,
};
let ty = pos.func.dfg.ctrl_typevar(inst);
if ty.bytes() <= native_word_width {
pos.func
.dfg
.replace(inst)
.BinaryImm(new_opcode, ty, imm, args[1]);
}
}
}
InstructionData::Unary { opcode, arg } => {
if let Opcode::AdjustSpDown = opcode {
if let Some(imm) = resolve_imm64_value(&pos.func.dfg, arg) {
// Note this works for both positive and negative immediate values.
pos.func.dfg.replace(inst).adjust_sp_down_imm(imm);
}
}
}
InstructionData::BinaryImm { opcode, arg, imm } => {
let ty = pos.func.dfg.ctrl_typevar(inst);
let mut arg = arg;
let mut imm = imm;
match opcode {
Opcode::IaddImm
| Opcode::ImulImm
| Opcode::BorImm
| Opcode::BandImm
| Opcode::BxorImm => {
// Fold binary_op(C2, binary_op(C1, x)) into binary_op(binary_op(C1, C2), x)
if let ValueDef::Result(arg_inst, _) = pos.func.dfg.value_def(arg) {
if let InstructionData::BinaryImm {
opcode: prev_opcode,
arg: prev_arg,
imm: prev_imm,
} = &pos.func.dfg[arg_inst]
{
if opcode == *prev_opcode
&& ty == pos.func.dfg.ctrl_typevar(arg_inst)
{
let lhs: i64 = imm.into();
let rhs: i64 = (*prev_imm).into();
let new_imm = match opcode {
Opcode::BorImm => lhs | rhs,
Opcode::BandImm => lhs & rhs,
Opcode::BxorImm => lhs ^ rhs,
Opcode::IaddImm => lhs.wrapping_add(rhs),
Opcode::ImulImm => lhs.wrapping_mul(rhs),
_ => panic!("can't happen"),
};
let new_imm = immediates::Imm64::from(new_imm);
let new_arg = *prev_arg;
pos.func
.dfg
.replace(inst)
.BinaryImm(opcode, ty, new_imm, new_arg);
imm = new_imm;
arg = new_arg;
}
}
}
}
Opcode::UshrImm | Opcode::SshrImm => {
if pos.func.dfg.ctrl_typevar(inst).bytes() <= native_word_width
&& try_fold_extended_move(pos, inst, opcode, arg, imm)
{
return;
}
}
_ => {}
};
// Replace operations that are no-ops.
match (opcode, imm.into()) {
(Opcode::IaddImm, 0)
| (Opcode::ImulImm, 1)
| (Opcode::SdivImm, 1)
| (Opcode::UdivImm, 1)
| (Opcode::BorImm, 0)
| (Opcode::BandImm, -1)
| (Opcode::BxorImm, 0)
| (Opcode::RotlImm, 0)
| (Opcode::RotrImm, 0)
| (Opcode::IshlImm, 0)
| (Opcode::UshrImm, 0)
| (Opcode::SshrImm, 0) => {
// Alias the result value with the original argument.
replace_single_result_with_alias(&mut pos.func.dfg, inst, arg);
}
(Opcode::ImulImm, 0) | (Opcode::BandImm, 0) => {
// Replace by zero.
pos.func.dfg.replace(inst).iconst(ty, 0);
}
(Opcode::BorImm, -1) => {
// Replace by minus one.
pos.func.dfg.replace(inst).iconst(ty, -1);
}
_ => {}
}
}
InstructionData::IntCompare { opcode, cond, args } => {
debug_assert_eq!(opcode, Opcode::Icmp);
if let Some(imm) = resolve_imm64_value(&pos.func.dfg, args[1]) {
if pos.func.dfg.ctrl_typevar(inst).bytes() <= native_word_width {
pos.func.dfg.replace(inst).icmp_imm(cond, args[0], imm);
}
}
}
InstructionData::CondTrap { .. }
| InstructionData::Branch { .. }
| InstructionData::Ternary {
opcode: Opcode::Select,
..
} => {
// Fold away a redundant `bint`.
let condition_def = {
let args = pos.func.dfg.inst_args(inst);
pos.func.dfg.value_def(args[0])
};
if let ValueDef::Result(def_inst, _) = condition_def {
if let InstructionData::Unary {
opcode: Opcode::Bint,
arg: bool_val,
} = pos.func.dfg[def_inst]
{
let args = pos.func.dfg.inst_args_mut(inst);
args[0] = bool_val;
}
}
}
_ => {}
}
}
struct BranchOptInfo {
br_inst: Inst,
cmp_arg: Value,
args: ValueList,
new_opcode: Opcode,
}
/// Fold comparisons into branch operations when possible.
///
/// This matches against operations which compare against zero, then use the
/// result in a `brz` or `brnz` branch. It folds those two operations into a
/// single `brz` or `brnz`.
fn branch_opt(pos: &mut FuncCursor, inst: Inst) {
let mut info = if let InstructionData::Branch {
opcode: br_opcode,
args: ref br_args,
..
} = pos.func.dfg[inst]
{
let first_arg = {
let args = pos.func.dfg.inst_args(inst);
args[0]
};
let icmp_inst =
if let ValueDef::Result(icmp_inst, _) = pos.func.dfg.value_def(first_arg) {
icmp_inst
} else {
return;
};
if let InstructionData::IntCompareImm {
opcode: Opcode::IcmpImm,
arg: cmp_arg,
cond: cmp_cond,
imm: cmp_imm,
} = pos.func.dfg[icmp_inst]
{
let cmp_imm: i64 = cmp_imm.into();
if cmp_imm != 0 {
return;
}
// icmp_imm returns non-zero when the comparison is true. So, if
// we're branching on zero, we need to invert the condition.
let cond = match br_opcode {
Opcode::Brz => cmp_cond.inverse(),
Opcode::Brnz => cmp_cond,
_ => return,
};
let new_opcode = match cond {
IntCC::Equal => Opcode::Brz,
IntCC::NotEqual => Opcode::Brnz,
_ => return,
};
BranchOptInfo {
br_inst: inst,
cmp_arg,
args: br_args.clone(),
new_opcode,
}
} else {
return;
}
} else {
return;
};
info.args.as_mut_slice(&mut pos.func.dfg.value_lists)[0] = info.cmp_arg;
if let InstructionData::Branch { ref mut opcode, .. } = pos.func.dfg[info.br_inst] {
*opcode = info.new_opcode;
} else {
panic!();
}
}
}
/// The main pre-opt pass.
pub fn do_preopt<'func, 'isa>(
func: &'func mut Function,
@@ -607,30 +1027,12 @@ pub fn do_preopt<'func, 'isa>(
let _tt = timing::preopt();
let mut pos = FuncCursor::new(func);
let mut preopt = crate::peepmatic::preopt(isa);
let native_word_width = isa.pointer_bytes() as u32;
let mut optimizer = simplify::peephole_optimizer(isa);
while let Some(block) = pos.next_block() {
while let Some(inst) = pos.next_inst() {
// After we apply one optimization, that might make another
// optimization applicable. Keep running the peephole optimizer
// until either:
//
// * No optimization applied, and therefore it doesn't make sense to
// try again, because no optimization will apply again.
//
// * Or when we replaced an instruction with an alias to an existing
// value, because we already ran the peephole optimizer over the
// aliased value's instruction in an early part of the traversal
// over the function.
while let Some(ValueOrInst::Inst(new_inst)) =
preopt.apply_one(&mut pos, ValueOrInst::Inst(inst))
{
// We transplanted a new instruction into the current
// instruction, so the "new" instruction is actually the same
// one, just with different data.
debug_assert_eq!(new_inst, inst);
}
debug_assert_eq!(pos.current_inst(), Some(inst));
simplify::apply_all(&mut optimizer, &mut pos, inst, native_word_width);
// Try to transform divide-by-constant into simpler operations.
if let Some(divrem_info) = get_div_info(inst, &pos.func.dfg) {

View File

@@ -26,3 +26,6 @@ num_cpus = "1.8.0"
region = "2.1.2"
target-lexicon = "0.10"
thiserror = "1.0.15"
[features]
enable-peepmatic = []

View File

@@ -6,9 +6,9 @@ function u0:0(i8) -> i8 fast {
block0(v0: i8):
v1 = iconst.i8 0
v2 = isub v1, v0
; check: v4 = uextend.i32 v0
; nextln: v6 = iconst.i32 0
; nextln: v5 = isub v6, v4
; nextln: v2 = ireduce.i8 v5
; check: uextend.i32
; nextln: iconst.i32
; nextln: isub
; nextln: ireduce.i8
return v2
}

View File

@@ -0,0 +1,81 @@
test peepmatic
target x86_64
function %icmp_to_brz_fold(i32) -> i32 {
block0(v0: i32):
v1 = icmp_imm eq v0, 0
brnz v1, block1
jump block2
block1:
v3 = iconst.i32 1
return v3
block2:
v4 = iconst.i32 2
return v4
}
; sameln: function %icmp_to_brz_fold
; nextln: block0(v0: i32):
; nextln: v1 = icmp_imm eq v0, 0
; nextln: brnz v0, block2
; nextln: jump block1
; nextln:
; nextln: block1:
; nextln: v3 = iconst.i32 1
; nextln: return v3
; nextln:
; nextln: block2:
; nextln: v4 = iconst.i32 2
; nextln: return v4
; nextln: }
function %icmp_to_brz_inverted_fold(i32) -> i32 {
block0(v0: i32):
v1 = icmp_imm ne v0, 0
brz v1, block1
jump block2
block1:
v3 = iconst.i32 1
return v3
block2:
v4 = iconst.i32 2
return v4
}
; sameln: function %icmp_to_brz_inve
; nextln: block0(v0: i32):
; nextln: v1 = icmp_imm ne v0, 0
; nextln: brnz v0, block2
; nextln: jump block1
; nextln:
; nextln: block1:
; nextln: v3 = iconst.i32 1
; nextln: return v3
; nextln:
; nextln: block2:
; nextln: v4 = iconst.i32 2
; nextln: return v4
; nextln: }
function %br_icmp_inversion(i32, i32) -> i32 {
block0(v0: i32, v1: i32):
br_icmp ugt v0, v1, block1
jump block2
block1:
v2 = iconst.i32 1
return v2
block2:
v3 = iconst.i32 2
return v3
}
; sameln: function %br_icmp_inversio
; nextln: block0(v0: i32, v1: i32):
; nextln: br_icmp ule v0, v1, block2
; nextln: jump block1
; nextln:
; nextln: block1:
; nextln: v2 = iconst.i32 1
; nextln: return v2
; nextln:
; nextln: block2:
; nextln: v3 = iconst.i32 2
; nextln: return v3
; nextln: }

View File

@@ -0,0 +1,55 @@
test peepmatic
target x86_64 baseline
; Cases where the denominator is created by an iconst
function %indir_udiv32(i32) -> i32 {
block0(v0: i32):
v1 = iconst.i32 7
v2 = udiv v0, v1
; check: v4 = iconst.i32 0x2492_4925
; nextln: v5 = umulhi v0, v4
; nextln: v6 = isub v0, v5
; nextln: v7 = ushr_imm v6, 1
; nextln: v8 = iadd v7, v5
; nextln: v9 = ushr_imm v8, 2
; nextln: v2 -> v9
return v2
}
function %indir_sdiv32(i32) -> i32 {
block0(v0: i32):
v1 = iconst.i32 -17
v2 = sdiv v0, v1
; check: v4 = iconst.i32 0xffff_ffff_8787_8787
; nextln: v5 = smulhi v0, v4
; nextln: v6 = sshr_imm v5, 3
; nextln: v7 = ushr_imm v6, 31
; nextln: v8 = iadd v6, v7
; nextln: v2 -> v8
return v2
}
function %indir_udiv64(i64) -> i64 {
block0(v0: i64):
v1 = iconst.i64 1337
v2 = udiv v0, v1
; check: v4 = iconst.i64 0xc411_9d95_2866_a139
; nextln: v5 = umulhi v0, v4
; nextln: v6 = ushr_imm v5, 10
; nextln: v2 -> v6
return v2
}
function %indir_sdiv64(i64) -> i64 {
block0(v0: i64):
v1 = iconst.i64 -90210
v2 = sdiv v0, v1
; check: v4 = iconst.i64 0xd181_4ee8_939c_b8bb
; nextln: v5 = smulhi v0, v4
; nextln: v6 = sshr_imm v5, 14
; nextln: v7 = ushr_imm v6, 63
; nextln: v8 = iadd v6, v7
; nextln: v2 -> v8
return v2
}

View File

@@ -0,0 +1,266 @@
test peepmatic
target i686 baseline
; -------- U32 --------
; complex case (mul, sub, shift, add, shift)
function %t_udiv32_p7(i32) -> i32 {
block0(v0: i32):
v1 = udiv_imm v0, 7
; check: iconst.i32 0x2492_4925
; check: umulhi v0, v2
; check: isub v0, v3
; check: ushr_imm v4, 1
; check: iadd v5, v3
; check: v7 = ushr_imm v6, 2
; check: v1 -> v7
return v1
}
; simple case (mul, shift)
function %t_udiv32_p125(i32) -> i32 {
block0(v0: i32):
v1 = udiv_imm v0, 125
; check: iconst.i32 0x1062_4dd3
; check: umulhi v0, v2
; check: v4 = ushr_imm v3, 3
; check: v1 -> v4
return v1
}
; simple case w/ shift by zero (mul)
function %t_udiv32_p641(i32) -> i32 {
block0(v0: i32):
v1 = udiv_imm v0, 641
; check: iconst.i32 0x0066_3d81
; check: v3 = umulhi v0, v2
; check: v1 -> v3
return v1
}
; -------- S32 --------
; simple case w/ shift by zero (mul, add-sign-bit)
function %t_sdiv32_n6(i32) -> i32 {
block0(v0: i32):
v1 = sdiv_imm v0, -6
; check: iconst.i32 0xffff_ffff_d555_5555
; check: smulhi v0, v2
; check: ushr_imm v3, 31
; check: v5 = iadd v3, v4
; check: v1 -> v5
return v1
}
; simple case (mul, shift, add-sign-bit)
function %t_sdiv32_n5(i32) -> i32 {
block0(v0: i32):
v1 = sdiv_imm v0, -5
; check: iconst.i32 0xffff_ffff_9999_9999
; check: smulhi v0, v2
; check: sshr_imm v3, 1
; check: ushr_imm v4, 31
; check: v6 = iadd v4, v5
; check: v1 -> v6
return v1
}
; case d < 0 && M > 0 (mul, sub, shift, add-sign-bit)
function %t_sdiv32_n3(i32) -> i32 {
block0(v0: i32):
v1 = sdiv_imm v0, -3
; check: iconst.i32 0x5555_5555
; check: smulhi v0, v2
; check: isub v3, v0
; check: sshr_imm v4, 1
; check: ushr_imm v5, 31
; check: v7 = iadd v5, v6
; check: v1 -> v7
return v1
}
; simple case w/ shift by zero (mul, add-sign-bit)
function %t_sdiv32_p6(i32) -> i32 {
block0(v0: i32):
v1 = sdiv_imm v0, 6
; check: iconst.i32 0x2aaa_aaab
; check: smulhi v0, v2
; check: ushr_imm v3, 31
; check: v5 = iadd v3, v4
; check: v1 -> v5
return v1
}
; case d > 0 && M < 0 (mull, add, shift, add-sign-bit)
function %t_sdiv32_p7(i32) -> i32 {
block0(v0: i32):
v1 = sdiv_imm v0, 7
; check: iconst.i32 0xffff_ffff_9249_2493
; check: smulhi v0, v2
; check: iadd v3, v0
; check: sshr_imm v4, 2
; check: ushr_imm v5, 31
; check: v7 = iadd v5, v6
; check: v1 -> v7
return v1
}
; simple case (mul, shift, add-sign-bit)
function %t_sdiv32_p625(i32) -> i32 {
block0(v0: i32):
v1 = sdiv_imm v0, 625
; check: iconst.i32 0x68db_8bad
; check: smulhi v0, v2
; check: sshr_imm v3, 8
; check: ushr_imm v4, 31
; check: v6 = iadd v4, v5
; check: v1 -> v6
return v1
}
; -------- U64 --------
; complex case (mul, sub, shift, add, shift)
function %t_udiv64_p7(i64) -> i64 {
block0(v0: i64):
v1 = udiv_imm v0, 7
; check: iconst.i64 0x2492_4924_9249_2493
; check: umulhi v0, v2
; check: isub v0, v3
; check: ushr_imm v4, 1
; check: iadd v5, v3
; check: v7 = ushr_imm v6, 2
; check: v1 -> v7
return v1
}
; simple case (mul, shift)
function %t_udiv64_p9(i64) -> i64 {
block0(v0: i64):
v1 = udiv_imm v0, 9
; check: iconst.i64 0xe38e_38e3_8e38_e38f
; check: umulhi v0, v2
; check: v4 = ushr_imm v3, 3
; check: v1 -> v4
return v1
}
; complex case (mul, sub, shift, add, shift)
function %t_udiv64_p125(i64) -> i64 {
block0(v0: i64):
v1 = udiv_imm v0, 125
; check: iconst.i64 0x0624_dd2f_1a9f_be77
; check: umulhi v0, v2
; check: isub v0, v3
; check: ushr_imm v4, 1
; check: iadd v5, v3
; check: v7 = ushr_imm v6, 6
; check: v1 -> v7
return v1
}
; simple case w/ shift by zero (mul)
function %t_udiv64_p274177(i64) -> i64 {
block0(v0: i64):
v1 = udiv_imm v0, 274177
; check: iconst.i64 0x3d30_f19c_d101
; check: v3 = umulhi v0, v2
; check: v1 -> v3
return v1
}
; -------- S64 --------
; simple case (mul, shift, add-sign-bit)
function %t_sdiv64_n625(i64) -> i64 {
block0(v0: i64):
v1 = sdiv_imm v0, -625
; check: iconst.i64 0xcb92_3a29_c779_a6b5
; check: smulhi v0, v2
; check: sshr_imm v3, 7
; check: ushr_imm v4, 63
; check: v6 = iadd v4, v5
; check: v1 -> v6
return v1
}
; simple case w/ zero shift (mul, add-sign-bit)
function %t_sdiv64_n6(i64) -> i64 {
block0(v0: i64):
v1 = sdiv_imm v0, -6
; check: iconst.i64 0xd555_5555_5555_5555
; check: smulhi v0, v2
; check: ushr_imm v3, 63
; check: v5 = iadd v3, v4
; check: v1 -> v5
return v1
}
; simple case w/ zero shift (mul, add-sign-bit)
function %t_sdiv64_n5(i64) -> i64 {
block0(v0: i64):
v1 = sdiv_imm v0, -5
; check: iconst.i64 0x9999_9999_9999_9999
; check: smulhi v0, v2
; check: sshr_imm v3, 1
; check: ushr_imm v4, 63
; check: v6 = iadd v4, v5
; check: v1 -> v6
return v1
}
; case d < 0 && M > 0 (mul, sub, shift, add-sign-bit)
function %t_sdiv64_n3(i64) -> i64 {
block0(v0: i64):
v1 = sdiv_imm v0, -3
; check: iconst.i64 0x5555_5555_5555_5555
; check: smulhi v0, v2
; check: isub v3, v0
; check: sshr_imm v4, 1
; check: ushr_imm v5, 63
; check: v7 = iadd v5, v6
; check: v1 -> v7
return v1
}
; simple case w/ zero shift (mul, add-sign-bit)
function %t_sdiv64_p6(i64) -> i64 {
block0(v0: i64):
v1 = sdiv_imm v0, 6
; check: iconst.i64 0x2aaa_aaaa_aaaa_aaab
; check: smulhi v0, v2
; check: ushr_imm v3, 63
; check: v5 = iadd v3, v4
; check: v1 -> v5
return v1
}
; case d > 0 && M < 0 (mul, add, shift, add-sign-bit)
function %t_sdiv64_p15(i64) -> i64 {
block0(v0: i64):
v1 = sdiv_imm v0, 15
; check: iconst.i64 0x8888_8888_8888_8889
; check: smulhi v0, v2
; check: iadd v3, v0
; check: sshr_imm v4, 3
; check: ushr_imm v5, 63
; check: v7 = iadd v5, v6
; check: v1 -> v7
return v1
}
; simple case (mul, shift, add-sign-bit)
function %t_sdiv64_p625(i64) -> i64 {
block0(v0: i64):
v1 = sdiv_imm v0, 625
; check: iconst.i64 0x346d_c5d6_3886_594b
; check: smulhi v0, v2
; check: sshr_imm v3, 7
; check: ushr_imm v4, 63
; check: v6 = iadd v4, v5
; check: v1 -> v6
return v1
}

View File

@@ -0,0 +1,292 @@
test peepmatic
target i686 baseline
; -------- U32 --------
; ignored
function %t_udiv32_p0(i32) -> i32 {
block0(v0: i32):
v1 = udiv_imm v0, 0
; check: udiv_imm v0, 0
return v1
}
; converted to a nop
function %t_udiv32_p1(i32) -> i32 {
block0(v0: i32):
v1 = udiv_imm v0, 1
; check: nop
return v1
}
; shift
function %t_udiv32_p2(i32) -> i32 {
block0(v0: i32):
v1 = udiv_imm v0, 2
; check: ushr_imm v0, 1
return v1
}
; shift
function %t_udiv32_p2p31(i32) -> i32 {
block0(v0: i32):
v1 = udiv_imm v0, 0x8000_0000
; check: ushr_imm v0, 31
return v1
}
; -------- U64 --------
; ignored
function %t_udiv64_p0(i64) -> i64 {
block0(v0: i64):
v1 = udiv_imm v0, 0
; check: udiv_imm v0, 0
return v1
}
; converted to a nop
function %t_udiv64_p1(i64) -> i64 {
block0(v0: i64):
v1 = udiv_imm v0, 1
; check: nop
return v1
}
; shift
function %t_udiv64_p2(i64) -> i64 {
block0(v0: i64):
v1 = udiv_imm v0, 2
; check: ushr_imm v0, 1
return v1
}
; shift
function %t_udiv64_p2p63(i64) -> i64 {
block0(v0: i64):
v1 = udiv_imm v0, 0x8000_0000_0000_0000
; check: ushr_imm v0, 63
return v1
}
; -------- S32 --------
; ignored
function %t_sdiv32_p0(i32) -> i32 {
block0(v0: i32):
v1 = sdiv_imm v0, 0
; check: sdiv_imm v0, 0
return v1
}
; converted to a nop
function %t_sdiv32_p1(i32) -> i32 {
block0(v0: i32):
v1 = sdiv_imm v0, 1
; check: nop
return v1
}
; ignored
function %t_sdiv32_n1(i32) -> i32 {
block0(v0: i32):
v1 = sdiv_imm v0, -1
; check: sdiv_imm v0, -1
return v1
}
; shift
function %t_sdiv32_p2(i32) -> i32 {
block0(v0: i32):
v1 = sdiv_imm v0, 2
; check: ushr_imm v0, 31
; check: iadd v0, v2
; check: sshr_imm v3, 1
; check: v1 -> v4
return v1
}
; shift
function %t_sdiv32_n2(i32) -> i32 {
block0(v0: i32):
v1 = sdiv_imm v0, -2
; check: ushr_imm v0, 31
; check: iadd v0, v2
; check: sshr_imm v3, 1
; check: irsub_imm v4, 0
return v1
}
; shift
function %t_sdiv32_p4(i32) -> i32 {
block0(v0: i32):
v1 = sdiv_imm v0, 4
; check: v2 = sshr_imm v0, 1
; check: ushr_imm v2, 30
; check: iadd v0, v3
; check: v5 = sshr_imm v4, 2
; check: v1 -> v5
return v1
}
; shift
function %t_sdiv32_n4(i32) -> i32 {
block0(v0: i32):
v1 = sdiv_imm v0, -4
; check: sshr_imm v0, 1
; check: ushr_imm v2, 30
; check: iadd v0, v3
; check: sshr_imm v4, 2
; check: irsub_imm v5, 0
return v1
}
; shift
function %t_sdiv32_p2p30(i32) -> i32 {
block0(v0: i32):
v1 = sdiv_imm v0, 0x4000_0000
; check: sshr_imm v0, 29
; check: ushr_imm v2, 2
; check: iadd v0, v3
; check: v5 = sshr_imm v4, 30
; check: v1 -> v5
return v1
}
; shift
function %t_sdiv32_n2p30(i32) -> i32 {
block0(v0: i32):
v1 = sdiv_imm v0, -0x4000_0000
; check: sshr_imm v0, 29
; check: ushr_imm v2, 2
; check: iadd v0, v3
; check: sshr_imm v4, 30
; check: irsub_imm v5, 0
return v1
}
; there's no positive version of this, since -(-0x8000_0000) isn't
; representable.
function %t_sdiv32_n2p31(i32) -> i32 {
block0(v0: i32):
v1 = sdiv_imm v0, -0x8000_0000
; check: sshr_imm v0, 30
; check: ushr_imm v2, 1
; check: iadd v0, v3
; check: sshr_imm v4, 31
; check: irsub_imm v5, 0
return v1
}
; -------- S64 --------
; ignored
function %t_sdiv64_p0(i64) -> i64 {
block0(v0: i64):
v1 = sdiv_imm v0, 0
; check: sdiv_imm v0, 0
return v1
}
; converted to a nop
function %t_sdiv64_p1(i64) -> i64 {
block0(v0: i64):
v1 = sdiv_imm v0, 1
; check: nop
return v1
}
; ignored
function %t_sdiv64_n1(i64) -> i64 {
block0(v0: i64):
v1 = sdiv_imm v0, -1
; check: sdiv_imm v0, -1
return v1
}
; shift
function %t_sdiv64_p2(i64) -> i64 {
block0(v0: i64):
v1 = sdiv_imm v0, 2
; check: ushr_imm v0, 63
; check: iadd v0, v2
; check: v4 = sshr_imm v3, 1
; check: v1 -> v4
return v1
}
; shift
function %t_sdiv64_n2(i64) -> i64 {
block0(v0: i64):
v1 = sdiv_imm v0, -2
; check: ushr_imm v0, 63
; check: iadd v0, v2
; check: sshr_imm v3, 1
; check: irsub_imm v4, 0
return v1
}
; shift
function %t_sdiv64_p4(i64) -> i64 {
block0(v0: i64):
v1 = sdiv_imm v0, 4
; check: sshr_imm v0, 1
; check: ushr_imm v2, 62
; check: iadd v0, v3
; check: v5 = sshr_imm v4, 2
; check: v1 -> v5
return v1
}
; shift
function %t_sdiv64_n4(i64) -> i64 {
block0(v0: i64):
v1 = sdiv_imm v0, -4
; check: sshr_imm v0, 1
; check: ushr_imm v2, 62
; check: iadd v0, v3
; check: sshr_imm v4, 2
; check: irsub_imm v5, 0
return v1
}
; shift
function %t_sdiv64_p2p62(i64) -> i64 {
block0(v0: i64):
v1 = sdiv_imm v0, 0x4000_0000_0000_0000
; check: sshr_imm v0, 61
; check: ushr_imm v2, 2
; check: iadd v0, v3
; check: v5 = sshr_imm v4, 62
; check: v1 -> v5
return v1
}
; shift
function %t_sdiv64_n2p62(i64) -> i64 {
block0(v0: i64):
v1 = sdiv_imm v0, -0x4000_0000_0000_0000
; check: sshr_imm v0, 61
; check: ushr_imm v2, 2
; check: iadd v0, v3
; check: sshr_imm v4, 62
; check: irsub_imm v5, 0
return v1
}
; there's no positive version of this, since -(-0x8000_0000_0000_0000) isn't
; representable.
function %t_sdiv64_n2p63(i64) -> i64 {
block0(v0: i64):
v1 = sdiv_imm v0, -0x8000_0000_0000_0000
; check: sshr_imm v0, 62
; check: ushr_imm v2, 1
; check: iadd v0, v3
; check: sshr_imm v4, 63
; check: irsub_imm v5, 0
return v1
}

View File

@@ -1,4 +1,4 @@
test simple_preopt
test peepmatic
target x86_64
;; This file used to trigger assertions where we would keep trying to

View File

@@ -0,0 +1,22 @@
test peepmatic
target x86_64
;; Test that although v5 can be replaced with v1, we don't transplant `load.i32
;; v0` on top of `iadd v3, v4`, because that would move the load past other uses
;; of its result.
function %foo(i64) -> i32 {
block0(v0: i64):
v1 = load.i32 v0
v2 = iconst.i32 16
v3 = iadd_imm v1, -16
v4 = iconst.i32 16
v5 = iadd v3, v4
; check: v1 = load.i32 v0
; nextln: v5 -> v1
; nextln: v2 = iconst.i32 16
; nextln: v3 = iadd_imm v1, -16
; nextln: v4 = iconst.i32 16
; nextln: nop
return v5
}

View File

@@ -0,0 +1,14 @@
test peepmatic
target x86_64
function %wraparound(i64 vmctx) -> f32 system_v {
gv0 = vmctx
gv1 = iadd_imm.i64 gv0, 48
block35(v0: i64):
v88 = iconst.i64 0
v89 = iconst.i64 0x8000_0000_0000_0000
v90 = ishl_imm v88, 0x8000_0000_0000_0000
v91 = sshr v90, v89; check: sshr_imm v90, 0x8000_0000_0000_0000
trap user0
}

View File

@@ -0,0 +1,285 @@
test peepmatic
target i686 baseline
; -------- U32 --------
; complex case (mul, sub, shift, add, shift)
function %t_urem32_p7(i32) -> i32 {
block0(v0: i32):
v1 = urem_imm v0, 7
; check: iconst.i32 0x2492_4925
; check: umulhi v0, v2
; check: isub v0, v3
; check: ushr_imm v4, 1
; check: iadd v5, v3
; check: ushr_imm v6, 2
; check: imul_imm v7, 7
; check: isub v0, v8
return v1
}
; simple case (mul, shift)
function %t_urem32_p125(i32) -> i32 {
block0(v0: i32):
v1 = urem_imm v0, 125
; check: iconst.i32 0x1062_4dd3
; check: umulhi v0, v2
; check: ushr_imm v3, 3
; check: imul_imm v4, 125
; check: isub v0, v5
return v1
}
; simple case w/ shift by zero (mul)
function %t_urem32_p641(i32) -> i32 {
block0(v0: i32):
v1 = urem_imm v0, 641
; check: iconst.i32 0x0066_3d81
; check: umulhi v0, v2
; check: imul_imm v3, 641
; check: isub v0, v4
return v1
}
; -------- S32 --------
; simple case w/ shift by zero (mul, add-sign-bit)
function %t_srem32_n6(i32) -> i32 {
block0(v0: i32):
v1 = srem_imm v0, -6
; check: iconst.i32 0xffff_ffff_d555_5555
; check: smulhi v0, v2
; check: ushr_imm v3, 31
; check: iadd v3, v4
; check: imul_imm v5, -6
; check: isub v0, v6
return v1
}
; simple case (mul, shift, add-sign-bit)
function %t_srem32_n5(i32) -> i32 {
block0(v0: i32):
v1 = srem_imm v0, -5
; check: iconst.i32 0xffff_ffff_9999_9999
; check: smulhi v0, v2
; check: sshr_imm v3, 1
; check: ushr_imm v4, 31
; check: iadd v4, v5
; check: imul_imm v6, -5
; check: isub v0, v7
return v1
}
; case d < 0 && M > 0 (mul, sub, shift, add-sign-bit)
function %t_srem32_n3(i32) -> i32 {
block0(v0: i32):
v1 = srem_imm v0, -3
; check: iconst.i32 0x5555_5555
; check: smulhi v0, v2
; check: isub v3, v0
; check: sshr_imm v4, 1
; check: ushr_imm v5, 31
; check: iadd v5, v6
; check: imul_imm v7, -3
; check: isub v0, v8
return v1
}
; simple case w/ shift by zero (mul, add-sign-bit)
function %t_srem32_p6(i32) -> i32 {
block0(v0: i32):
v1 = srem_imm v0, 6
; check: iconst.i32 0x2aaa_aaab
; check: smulhi v0, v2
; check: ushr_imm v3, 31
; check: iadd v3, v4
; check: imul_imm v5, 6
; check: isub v0, v6
return v1
}
; case d > 0 && M < 0 (mull, add, shift, add-sign-bit)
function %t_srem32_p7(i32) -> i32 {
block0(v0: i32):
v1 = srem_imm v0, 7
; check: iconst.i32 0xffff_ffff_9249_2493
; check: smulhi v0, v2
; check: iadd v3, v0
; check: sshr_imm v4, 2
; check: ushr_imm v5, 31
; check: iadd v5, v6
; check: imul_imm v7, 7
; check: isub v0, v8
return v1
}
; simple case (mul, shift, add-sign-bit)
function %t_srem32_p625(i32) -> i32 {
block0(v0: i32):
v1 = srem_imm v0, 625
; check: iconst.i32 0x68db_8bad
; check: smulhi v0, v2
; check: sshr_imm v3, 8
; check: ushr_imm v4, 31
; check: iadd v4, v5
; check: imul_imm v6, 625
; check: isub v0, v7
return v1
}
; -------- U64 --------
; complex case (mul, sub, shift, add, shift)
function %t_urem64_p7(i64) -> i64 {
block0(v0: i64):
v1 = urem_imm v0, 7
; check: umulhi v0, v2
; check: isub v0, v3
; check: ushr_imm v4, 1
; check: iadd v5, v3
; check: ushr_imm v6, 2
; check: imul_imm v7, 7
; check: isub v0, v8
return v1
}
; simple case (mul, shift)
function %t_urem64_p9(i64) -> i64 {
block0(v0: i64):
v1 = urem_imm v0, 9
; check: iconst.i64 0xe38e_38e3_8e38_e38f
; check: umulhi v0, v2
; check: ushr_imm v3, 3
; check: imul_imm v4, 9
; check: isub v0, v5
return v1
}
; complex case (mul, sub, shift, add, shift)
function %t_urem64_p125(i64) -> i64 {
block0(v0: i64):
v1 = urem_imm v0, 125
; check: iconst.i64 0x0624_dd2f_1a9f_be77
; check: umulhi v0, v2
; check: isub v0, v3
; check: ushr_imm v4, 1
; check: iadd v5, v3
; check: ushr_imm v6, 6
; check: imul_imm v7, 125
; check: isub v0, v8
return v1
}
; simple case w/ shift by zero (mul)
function %t_urem64_p274177(i64) -> i64 {
block0(v0: i64):
v1 = urem_imm v0, 274177
; check: iconst.i64 0x3d30_f19c_d101
; check: umulhi v0, v2
; check: imul_imm v3, 0x0004_2f01
; check: isub v0, v4
return v1
}
; -------- S64 --------
; simple case (mul, shift, add-sign-bit)
function %t_srem64_n625(i64) -> i64 {
block0(v0: i64):
v1 = srem_imm v0, -625
; check: iconst.i64 0xcb92_3a29_c779_a6b5
; check: smulhi v0, v2
; check: sshr_imm v3, 7
; check: ushr_imm v4, 63
; check: iadd v4, v5
; check: imul_imm v6, -625
; check: isub v0, v7
return v1
}
; simple case w/ zero shift (mul, add-sign-bit)
function %t_srem64_n6(i64) -> i64 {
block0(v0: i64):
v1 = srem_imm v0, -6
; check: iconst.i64 0xd555_5555_5555_5555
; check: smulhi v0, v2
; check: ushr_imm v3, 63
; check: iadd v3, v4
; check: imul_imm v5, -6
; check: isub v0, v6
return v1
}
; simple case w/ zero shift (mul, add-sign-bit)
function %t_srem64_n5(i64) -> i64 {
block0(v0: i64):
v1 = srem_imm v0, -5
; check: iconst.i64 0x9999_9999_9999_9999
; check: smulhi v0, v2
; check: sshr_imm v3, 1
; check: ushr_imm v4, 63
; check: iadd v4, v5
; check: imul_imm v6, -5
; check: isub v0, v7
return v1
}
; case d < 0 && M > 0 (mul, sub, shift, add-sign-bit)
function %t_srem64_n3(i64) -> i64 {
block0(v0: i64):
v1 = srem_imm v0, -3
; check: iconst.i64 0x5555_5555_5555_5555
; check: smulhi v0, v2
; check: isub v3, v0
; check: sshr_imm v4, 1
; check: ushr_imm v5, 63
; check: iadd v5, v6
; check: imul_imm v7, -3
; check: isub v0, v8
return v1
}
; simple case w/ zero shift (mul, add-sign-bit)
function %t_srem64_p6(i64) -> i64 {
block0(v0: i64):
v1 = srem_imm v0, 6
; check: iconst.i64 0x2aaa_aaaa_aaaa_aaab
; check: smulhi v0, v2
; check: ushr_imm v3, 63
; check: iadd v3, v4
; check: imul_imm v5, 6
; check: isub v0, v6
return v1
}
; case d > 0 && M < 0 (mul, add, shift, add-sign-bit)
function %t_srem64_p15(i64) -> i64 {
block0(v0: i64):
v1 = srem_imm v0, 15
; check: iconst.i64 0x8888_8888_8888_8889
; check: smulhi v0, v2
; check: iadd v3, v0
; check: sshr_imm v4, 3
; check: ushr_imm v5, 63
; check: iadd v5, v6
; check: imul_imm v7, 15
; check: isub v0, v8
return v1
}
; simple case (mul, shift, add-sign-bit)
function %t_srem64_p625(i64) -> i64 {
block0(v0: i64):
v1 = srem_imm v0, 625
; check: iconst.i64 0x346d_c5d6_3886_594b
; check: smulhi v0, v2
; check: sshr_imm v3, 7
; check: ushr_imm v4, 63
; check: iadd v4, v5
; check: imul_imm v6, 625
; check: isub v0, v7
return v1
}

View File

@@ -0,0 +1,291 @@
test peepmatic
target i686 baseline
; -------- U32 --------
; ignored
function %t_urem32_p0(i32) -> i32 {
block0(v0: i32):
v1 = urem_imm v0, 0
; check: urem_imm v0, 0
return v1
}
; converted to constant zero
function %t_urem32_p1(i32) -> i32 {
block0(v0: i32):
v1 = urem_imm v0, 1
; check: iconst.i32 0
return v1
}
; shift
function %t_urem32_p2(i32) -> i32 {
block0(v0: i32):
v1 = urem_imm v0, 2
; check: band_imm v0, 1
return v1
}
; shift
function %t_urem32_p2p31(i32) -> i32 {
block0(v0: i32):
v1 = urem_imm v0, 0x8000_0000
; check: band_imm v0, 0x7fff_ffff
return v1
}
; -------- U64 --------
; ignored
function %t_urem64_p0(i64) -> i64 {
block0(v0: i64):
v1 = urem_imm v0, 0
; check: urem_imm v0, 0
return v1
}
; converted to constant zero
function %t_urem64_p1(i64) -> i64 {
block0(v0: i64):
v1 = urem_imm v0, 1
; check: iconst.i64 0
return v1
}
; shift
function %t_urem64_p2(i64) -> i64 {
block0(v0: i64):
v1 = urem_imm v0, 2
; check: band_imm v0, 1
return v1
}
; shift
function %t_urem64_p2p63(i64) -> i64 {
block0(v0: i64):
v1 = urem_imm v0, 0x8000_0000_0000_0000
; check: band_imm v0, 0x7fff_ffff_ffff_ffff
return v1
}
; -------- S32 --------
; ignored
function %t_srem32_n1(i32) -> i32 {
block0(v0: i32):
v1 = srem_imm v0, -1
; check: srem_imm v0, -1
return v1
}
; ignored
function %t_srem32_p0(i32) -> i32 {
block0(v0: i32):
v1 = srem_imm v0, 0
; check: srem_imm v0, 0
return v1
}
; converted to constant zero
function %t_srem32_p1(i32) -> i32 {
block0(v0: i32):
v1 = srem_imm v0, 1
; check: iconst.i32 0
return v1
}
; shift
function %t_srem32_p2(i32) -> i32 {
block0(v0: i32):
v1 = srem_imm v0, 2
; check: ushr_imm v0, 31
; check: iadd v0, v2
; check: band_imm v3, -2
; check: isub v0, v4
return v1
}
; shift
function %t_srem32_n2(i32) -> i32 {
block0(v0: i32):
v1 = srem_imm v0, -2
; check: ushr_imm v0, 31
; check: iadd v0, v2
; check: band_imm v3, -2
; check: isub v0, v4
return v1
}
; shift
function %t_srem32_p4(i32) -> i32 {
block0(v0: i32):
v1 = srem_imm v0, 4
; check: sshr_imm v0, 1
; check: ushr_imm v2, 30
; check: iadd v0, v3
; check: band_imm v4, -4
; check: isub v0, v5
return v1
}
; shift
function %t_srem32_n4(i32) -> i32 {
block0(v0: i32):
v1 = srem_imm v0, -4
; check: sshr_imm v0, 1
; check: ushr_imm v2, 30
; check: iadd v0, v3
; check: band_imm v4, -4
; check: isub v0, v5
return v1
}
; shift
function %t_srem32_p2p30(i32) -> i32 {
block0(v0: i32):
v1 = srem_imm v0, 0x4000_0000
; check: sshr_imm v0, 29
; check: ushr_imm v2, 2
; check: iadd v0, v3
; check: band_imm v4, 0xffff_ffff_c000_0000
; check: isub v0, v5
return v1
}
; shift
function %t_srem32_n2p30(i32) -> i32 {
block0(v0: i32):
v1 = srem_imm v0, -0x4000_0000
; check: sshr_imm v0, 29
; check: ushr_imm v2, 2
; check: iadd v0, v3
; check: band_imm v4, 0xffff_ffff_c000_0000
; check: isub v0, v5
return v1
}
; there's no positive version of this, since -(-0x8000_0000) isn't
; representable.
function %t_srem32_n2p31(i32) -> i32 {
block0(v0: i32):
v1 = srem_imm v0, -0x8000_0000
; check: sshr_imm v0, 30
; check: ushr_imm v2, 1
; check: iadd v0, v3
; check: band_imm v4, 0xffff_ffff_8000_0000
; check: isub v0, v5
return v1
}
; -------- S64 --------
; ignored
function %t_srem64_n1(i64) -> i64 {
block0(v0: i64):
v1 = srem_imm v0, -1
; check: srem_imm v0, -1
return v1
}
; ignored
function %t_srem64_p0(i64) -> i64 {
block0(v0: i64):
v1 = srem_imm v0, 0
; check: srem_imm v0, 0
return v1
}
; converted to constant zero
function %t_srem64_p1(i64) -> i64 {
block0(v0: i64):
v1 = srem_imm v0, 1
; check: iconst.i64 0
return v1
}
; shift
function %t_srem64_p2(i64) -> i64 {
block0(v0: i64):
v1 = srem_imm v0, 2
; check: ushr_imm v0, 63
; check: iadd v0, v2
; check: band_imm v3, -2
; check: isub v0, v4
return v1
}
; shift
function %t_srem64_n2(i64) -> i64 {
block0(v0: i64):
v1 = srem_imm v0, -2
; check: ushr_imm v0, 63
; check: iadd v0, v2
; check: band_imm v3, -2
; check: isub v0, v4
return v1
}
; shift
function %t_srem64_p4(i64) -> i64 {
block0(v0: i64):
v1 = srem_imm v0, 4
; check: sshr_imm v0, 1
; check: ushr_imm v2, 62
; check: iadd v0, v3
; check: band_imm v4, -4
; check: isub v0, v5
return v1
}
; shift
function %t_srem64_n4(i64) -> i64 {
block0(v0: i64):
v1 = srem_imm v0, -4
; check: sshr_imm v0, 1
; check: ushr_imm v2, 62
; check: iadd v0, v3
; check: band_imm v4, -4
; check: isub v0, v5
return v1
}
; shift
function %t_srem64_p2p62(i64) -> i64 {
block0(v0: i64):
v1 = srem_imm v0, 0x4000_0000_0000_0000
; check: sshr_imm v0, 61
; check: ushr_imm v2, 2
; check: iadd v0, v3
; check: band_imm v4, 0xc000_0000_0000_0000
; check: isub v0, v5
return v1
}
; shift
function %t_srem64_n2p62(i64) -> i64 {
block0(v0: i64):
v1 = srem_imm v0, -0x4000_0000_0000_0000
; check: sshr_imm v0, 61
; check: ushr_imm v2, 2
; check: iadd v0, v3
; check: band_imm v4, 0xc000_0000_0000_0000
; check: isub v0, v5
return v1
}
; there's no positive version of this, since -(-0x8000_0000_0000_0000) isn't
; representable.
function %t_srem64_n2p63(i64) -> i64 {
block0(v0: i64):
v1 = srem_imm v0, -0x8000_0000_0000_0000
; check: sshr_imm v0, 62
; check: ushr_imm v2, 1
; check: iadd v0, v3
; check: band_imm v4, 0x8000_0000_0000_0000
; check: isub v0, v5
return v1
}

View File

@@ -0,0 +1,22 @@
test peepmatic
target x86_64
function u0:2(i64 , i64) {
gv1 = load.i64 notrap aligned gv0
heap0 = static gv1
block0(v0: i64, v1: i64):
v16 = iconst.i32 6
v17 = heap_addr.i64 heap0, v16, 1
v18 = load.i32 v17
v19 = iconst.i32 4
v20 = icmp ne v18, v19
v21 = bint.i32 v20
brnz v21, block2
jump block4
block4:
jump block1
block2:
jump block1
block1:
return
}

View File

@@ -0,0 +1,60 @@
test peepmatic
target i686
;; 32-bits platforms.
function %iadd_imm(i32) -> i32 {
block0(v0: i32):
v1 = iconst.i32 2
v2 = iadd v0, v1
return v2
}
; sameln: function %iadd_imm
; nextln: block0(v0: i32):
; nextln: v1 = iconst.i32 2
; nextln: v2 = iadd_imm v0, 2
; nextln: return v2
; nextln: }
function %isub_imm(i32) -> i32 {
block0(v0: i32):
v1 = iconst.i32 2
v2 = isub v0, v1
return v2
}
; sameln: function %isub_imm
; nextln: block0(v0: i32):
; nextln: v1 = iconst.i32 2
; nextln: v2 = iadd_imm v0, -2
; nextln: return v2
; nextln: }
function %icmp_imm(i32) -> i32 {
block0(v0: i32):
v1 = iconst.i32 2
v2 = icmp slt v0, v1
v3 = bint.i32 v2
return v3
}
; sameln: function %icmp_imm
; nextln: block0(v0: i32):
; nextln: v1 = iconst.i32 2
; nextln: v2 = icmp_imm slt v0, 2
; nextln: v3 = bint.i32 v2
; nextln: return v3
; nextln: }
;; Don't simplify operations that would become illegal due to lack of native
;; support.
function %iadd_imm(i64) -> i64 {
block0(v0: i64):
v1 = iconst.i64 2
v2 = iadd v0, v1
return v2
}
; sameln: function %iadd_imm
; nextln: block0(v0: i64):
; nextln: v1 = iconst.i64 2
; nextln: v2 = iadd v0, v1
; nextln: return v2
; nextln: }

View File

@@ -0,0 +1,326 @@
test peepmatic
target x86_64
;; 64-bits platforms.
function %iadd_imm(i32) -> i32 {
block0(v0: i32):
v1 = iconst.i32 2
v2 = iadd v0, v1
return v2
}
; sameln: function %iadd_imm
; nextln: block0(v0: i32):
; nextln: v1 = iconst.i32 2
; nextln: v2 = iadd_imm v0, 2
; nextln: return v2
; nextln: }
function %isub_imm(i32) -> i32 {
block0(v0: i32):
v1 = iconst.i32 2
v2 = isub v0, v1
return v2
}
; sameln: function %isub_imm
; nextln: block0(v0: i32):
; nextln: v1 = iconst.i32 2
; nextln: v2 = iadd_imm v0, -2
; nextln: return v2
; nextln: }
function %icmp_imm(i32) -> i32 {
block0(v0: i32):
v1 = iconst.i32 2
v2 = icmp slt v0, v1
v3 = bint.i32 v2
return v3
}
; sameln: function %icmp_imm
; nextln: block0(v0: i32):
; nextln: v1 = iconst.i32 2
; nextln: v2 = icmp_imm slt v0, 2
; nextln: v3 = bint.i32 v2
; nextln: return v3
; nextln: }
function %ifcmp_imm(i32) -> i32 {
block0(v0: i32):
v1 = iconst.i32 2
v2 = ifcmp v0, v1
brif eq v2, block1
jump block2
block1:
v3 = iconst.i32 1
return v3
block2:
v4 = iconst.i32 2
return v4
}
; sameln: function %ifcmp_imm
; nextln: block0(v0: i32):
; nextln: v1 = iconst.i32 2
; nextln: v2 = ifcmp_imm v0, 2
; nextln: brif eq v2, block1
; nextln: jump block2
; nextln:
; nextln: block1:
; nextln: v3 = iconst.i32 1
; nextln: return v3
; nextln:
; nextln: block2:
; nextln: v4 = iconst.i32 2
; nextln: return v4
; nextln: }
function %brz_bint(i32) {
block0(v0: i32):
v3 = icmp_imm slt v0, 0
v1 = bint.i32 v3
v2 = select v1, v1, v1
trapz v1, user0
brz v1, block1
jump block2
block1:
return
block2:
return
}
; sameln: function %brz_bint
; nextln: block0(v0: i32):
; nextln: v3 = icmp_imm slt v0, 0
; nextln: v1 = bint.i32 v3
; nextln: v2 = select v3, v1, v1
; nextln: trapz v3, user0
; nextln: brnz v3, block2
; nextln: jump block1
function %irsub_imm(i32) -> i32 {
block0(v0: i32):
v1 = iconst.i32 2
v2 = isub v1, v0
return v2
}
; sameln: function %irsub_imm
; nextln: block0(v0: i32):
; nextln: v1 = iconst.i32 2
; nextln: v2 = irsub_imm v0, 2
; nextln: return v2
; nextln: }
;; Sign-extensions.
;; 8 -> 16
function %uextend_8_16() -> i16 {
block0:
v0 = iconst.i16 37
v1 = ishl_imm v0, 8
v2 = ushr_imm v1, 8
return v2
}
; sameln: function %uextend_8_16
; nextln: block0:
; nextln: v0 = iconst.i16 37
; nextln: v1 = ishl_imm v0, 8
; nextln: v3 = ireduce.i8 v0
; nextln: v2 = uextend.i16 v3
; nextln: return v2
; nextln: }
function %sextend_8_16() -> i16 {
block0:
v0 = iconst.i16 37
v1 = ishl_imm v0, 8
v2 = sshr_imm v1, 8
return v2
}
; sameln: function %sextend_8_16
; nextln: block0:
; nextln: v0 = iconst.i16 37
; nextln: v1 = ishl_imm v0, 8
; nextln: v3 = ireduce.i8 v0
; nextln: v2 = sextend.i16 v3
; nextln: return v2
; nextln: }
;; 8 -> 32
function %uextend_8_32() -> i32 {
block0:
v0 = iconst.i32 37
v1 = ishl_imm v0, 24
v2 = ushr_imm v1, 24
return v2
}
; sameln: function %uextend_8_32
; nextln: block0:
; nextln: v0 = iconst.i32 37
; nextln: v1 = ishl_imm v0, 24
; nextln: v3 = ireduce.i8 v0
; nextln: v2 = uextend.i32 v3
; nextln: return v2
; nextln: }
function %sextend_8_32() -> i32 {
block0:
v0 = iconst.i32 37
v1 = ishl_imm v0, 24
v2 = sshr_imm v1, 24
return v2
}
; sameln: function %sextend_8_32
; nextln: block0:
; nextln: v0 = iconst.i32 37
; nextln: v1 = ishl_imm v0, 24
; nextln: v3 = ireduce.i8 v0
; nextln: v2 = sextend.i32 v3
; nextln: return v2
; nextln: }
;; 16 -> 32
function %uextend_16_32() -> i32 {
block0:
v0 = iconst.i32 37
v1 = ishl_imm v0, 16
v2 = ushr_imm v1, 16
return v2
}
; sameln: function %uextend_16_32
; nextln: block0:
; nextln: v0 = iconst.i32 37
; nextln: v1 = ishl_imm v0, 16
; nextln: v3 = ireduce.i16 v0
; nextln: v2 = uextend.i32 v3
; nextln: return v2
; nextln: }
function %sextend_16_32() -> i32 {
block0:
v0 = iconst.i32 37
v1 = ishl_imm v0, 16
v2 = sshr_imm v1, 16
return v2
}
; sameln: function %sextend_16_32
; nextln: block0:
; nextln: v0 = iconst.i32 37
; nextln: v1 = ishl_imm v0, 16
; nextln: v3 = ireduce.i16 v0
; nextln: v2 = sextend.i32 v3
; nextln: return v2
; nextln: }
;; 8 -> 64
function %uextend_8_64() -> i64 {
block0:
v0 = iconst.i64 37
v1 = ishl_imm v0, 56
v2 = ushr_imm v1, 56
return v2
}
; sameln: function %uextend_8_64
; nextln: block0:
; nextln: v0 = iconst.i64 37
; nextln: v1 = ishl_imm v0, 56
; nextln: v3 = ireduce.i8 v0
; nextln: v2 = uextend.i64 v3
; nextln: return v2
; nextln: }
function %sextend_8_64() -> i64 {
block0:
v0 = iconst.i64 37
v1 = ishl_imm v0, 56
v2 = sshr_imm v1, 56
return v2
}
; sameln: function %sextend_8_64
; nextln: block0:
; nextln: v0 = iconst.i64 37
; nextln: v1 = ishl_imm v0, 56
; nextln: v3 = ireduce.i8 v0
; nextln: v2 = sextend.i64 v3
; nextln: return v2
; nextln: }
;; 16 -> 64
function %uextend_16_64() -> i64 {
block0:
v0 = iconst.i64 37
v1 = ishl_imm v0, 48
v2 = ushr_imm v1, 48
return v2
}
; sameln: function %uextend_16_64
; nextln: block0:
; nextln: v0 = iconst.i64 37
; nextln: v1 = ishl_imm v0, 48
; nextln: v3 = ireduce.i16 v0
; nextln: v2 = uextend.i64 v3
; nextln: return v2
; nextln: }
function %sextend_16_64() -> i64 {
block0:
v0 = iconst.i64 37
v1 = ishl_imm v0, 48
v2 = sshr_imm v1, 48
return v2
}
; sameln: function %sextend_16_64
; nextln: block0:
; nextln: v0 = iconst.i64 37
; nextln: v1 = ishl_imm v0, 48
; nextln: v3 = ireduce.i16 v0
; nextln: v2 = sextend.i64 v3
; nextln: return v2
; nextln: }
;; 32 -> 64
function %uextend_32_64() -> i64 {
block0:
v0 = iconst.i64 37
v1 = ishl_imm v0, 32
v2 = ushr_imm v1, 32
return v2
}
; sameln: function %uextend_32_64
; nextln: block0:
; nextln: v0 = iconst.i64 37
; nextln: v1 = ishl_imm v0, 32
; nextln: v3 = ireduce.i32 v0
; nextln: v2 = uextend.i64 v3
; nextln: return v2
; nextln: }
function %sextend_32_64() -> i64 {
block0:
v0 = iconst.i64 37
v1 = ishl_imm v0, 32
v2 = sshr_imm v1, 32
return v2
}
; sameln: function %sextend_32_64
; nextln: block0:
; nextln: v0 = iconst.i64 37
; nextln: v1 = ishl_imm v0, 32
; nextln: v3 = ireduce.i32 v0
; nextln: v2 = sextend.i64 v3
; nextln: return v2
; nextln: }
function %add_imm_fold(i32) -> i32 {
block0(v0: i32):
v1 = iadd_imm v0, 42
v2 = iadd_imm v1, -42
return v2
}
; sameln: function %add_imm_fold(i32)
; nextln: block0(v0: i32):
; nextln: v2 -> v0
; nextln: v1 = iadd_imm v0, 42
; nextln: nop
; nextln: return v2

View File

@@ -0,0 +1,17 @@
test peepmatic
target x86_64
;; The `isub` is a no-op, but we can't replace the whole `isub` instruction with
;; its `v2` operand's instruction because `v2` is one of many results. Instead,
;; we need to make an alias `v3 -> v2`.
function %replace_inst_with_alias() -> i32 {
block0:
v0 = iconst.i32 0
v1, v2 = x86_smulx v0, v0
v3 = isub v2, v0
; check: v0 = iconst.i32 0
; nextln: v1, v2 = x86_smulx v0, v0
; nextln: v3 -> v2
return v3
}

View File

@@ -7,13 +7,14 @@ function %indir_udiv32(i32) -> i32 {
block0(v0: i32):
v1 = iconst.i32 7
v2 = udiv v0, v1
; check: v4 = iconst.i32 0x2492_4925
; nextln: v5 = umulhi v0, v4
; nextln: v6 = isub v0, v5
; nextln: v7 = ushr_imm v6, 1
; nextln: v8 = iadd v7, v5
; nextln: v9 = ushr_imm v8, 2
; nextln: v2 -> v9
; check: iconst.i32 7
; check: iconst.i32 0x2492_4925
; check: umulhi v0, v3
; check: isub v0, v4
; check: ushr_imm v5, 1
; check: iadd v6, v4
; check: v8 = ushr_imm v7, 2
; check: v2 -> v8
return v2
}
@@ -21,12 +22,13 @@ function %indir_sdiv32(i32) -> i32 {
block0(v0: i32):
v1 = iconst.i32 -17
v2 = sdiv v0, v1
; check: v4 = iconst.i32 0xffff_ffff_8787_8787
; nextln: v5 = smulhi v0, v4
; nextln: v6 = sshr_imm v5, 3
; nextln: v7 = ushr_imm v6, 31
; nextln: v8 = iadd v6, v7
; nextln: v2 -> v8
; check: iconst.i32 -17
; check: iconst.i32 0xffff_ffff_8787_8787
; check: smulhi v0, v3
; check: sshr_imm v4, 3
; check: ushr_imm v5, 31
; check: v7 = iadd v5, v6
; check: v2 -> v7
return v2
}
@@ -34,10 +36,11 @@ function %indir_udiv64(i64) -> i64 {
block0(v0: i64):
v1 = iconst.i64 1337
v2 = udiv v0, v1
; check: v4 = iconst.i64 0xc411_9d95_2866_a139
; nextln: v5 = umulhi v0, v4
; nextln: v6 = ushr_imm v5, 10
; nextln: v2 -> v6
; check: iconst.i64 1337
; check: iconst.i64 0xc411_9d95_2866_a139
; check: umulhi v0, v3
; check: v5 = ushr_imm v4, 10
; check: v2 -> v5
return v2
}
@@ -45,11 +48,12 @@ function %indir_sdiv64(i64) -> i64 {
block0(v0: i64):
v1 = iconst.i64 -90210
v2 = sdiv v0, v1
; check: v4 = iconst.i64 0xd181_4ee8_939c_b8bb
; nextln: v5 = smulhi v0, v4
; nextln: v6 = sshr_imm v5, 14
; nextln: v7 = ushr_imm v6, 63
; nextln: v8 = iadd v6, v7
; nextln: v2 -> v8
; check: iconst.i64 0xffff_ffff_fffe_9f9e
; check: iconst.i64 0xd181_4ee8_939c_b8bb
; check: smulhi v0, v3
; check: sshr_imm v4, 14
; check: ushr_imm v5, 63
; check: v7 = iadd v5, v6
; check: v2 -> v7
return v2
}

View File

@@ -58,3 +58,4 @@ block0(v0: i64):
; nextln: v2 = iadd v0, v1
; nextln: return v2
; nextln: }

View File

@@ -44,37 +44,6 @@ block0(v0: i32):
; nextln: return v3
; nextln: }
function %ifcmp_imm(i32) -> i32 {
block0(v0: i32):
v1 = iconst.i32 2
v2 = ifcmp v0, v1
brif eq v2, block1
jump block2
block1:
v3 = iconst.i32 1
return v3
block2:
v4 = iconst.i32 2
return v4
}
; sameln: function %ifcmp_imm
; nextln: block0(v0: i32):
; nextln: v1 = iconst.i32 2
; nextln: v2 = ifcmp_imm v0, 2
; nextln: brif eq v2, block1
; nextln: jump block2
; nextln:
; nextln: block1:
; nextln: v3 = iconst.i32 1
; nextln: return v3
; nextln:
; nextln: block2:
; nextln: v4 = iconst.i32 2
; nextln: return v4
; nextln: }
function %brz_bint(i32) {
block0(v0: i32):
v3 = icmp_imm slt v0, 0

View File

@@ -45,6 +45,7 @@ mod test_domtree;
mod test_interpret;
mod test_legalizer;
mod test_licm;
mod test_peepmatic;
mod test_postopt;
mod test_preopt;
mod test_print_cfg;
@@ -128,6 +129,7 @@ fn new_subtest(parsed: &TestCommand) -> subtest::SubtestResult<Box<dyn subtest::
"interpret" => test_interpret::subtest(parsed),
"legalizer" => test_legalizer::subtest(parsed),
"licm" => test_licm::subtest(parsed),
"peepmatic" => test_peepmatic::subtest(parsed),
"postopt" => test_postopt::subtest(parsed),
"preopt" => test_preopt::subtest(parsed),
"print-cfg" => test_print_cfg::subtest(parsed),

View File

@@ -0,0 +1,56 @@
//! Test command for `peepmatic`-generated peephole optimizers.
use crate::subtest::{run_filecheck, Context, SubTest, SubtestResult};
use cranelift_codegen;
use cranelift_codegen::ir::Function;
use cranelift_codegen::print_errors::pretty_error;
use cranelift_reader::TestCommand;
use std::borrow::Cow;
struct TestPreopt;
pub fn subtest(parsed: &TestCommand) -> SubtestResult<Box<dyn SubTest>> {
assert_eq!(parsed.command, "peepmatic");
if parsed.options.is_empty() {
Ok(Box::new(TestPreopt))
} else {
Err(format!("No options allowed on {}", parsed))
}
}
impl SubTest for TestPreopt {
fn name(&self) -> &'static str {
"peepmatic"
}
fn is_mutating(&self) -> bool {
true
}
fn needs_isa(&self) -> bool {
true
}
fn run(&self, func: Cow<Function>, context: &Context) -> SubtestResult<()> {
let mut comp_ctx = cranelift_codegen::Context::for_function(func.into_owned());
let isa = context.isa.expect("preopt needs an ISA");
comp_ctx.compute_cfg();
comp_ctx
.preopt(isa)
.map_err(|e| pretty_error(&comp_ctx.func, context.isa, Into::into(e)))?;
let text = &comp_ctx.func.display(isa).to_string();
log::debug!("After peepmatic-based simple_preopt:\n{}", text);
// Only actually run the filecheck if peepmatic is enabled, because it
// can generate slightly different code (alias a result vs replace an
// instruction) than the non-peepmatic versions of peephole
// optimizations. Note that the non-`peepmatic` results can be tested
// with the `test simple_preopt` subtest.
if cfg!(feature = "enable-peepmatic") {
run_filecheck(&text, context)
} else {
Ok(())
}
}
}

View File

@@ -39,6 +39,16 @@ impl SubTest for TestSimplePreopt {
.map_err(|e| pretty_error(&comp_ctx.func, context.isa, Into::into(e)))?;
let text = &comp_ctx.func.display(isa).to_string();
log::debug!("After simple_preopt:\n{}", text);
run_filecheck(&text, context)
// Only actually run the filecheck if peepmatic is *not* enabled,
// because it can generate slightly different code (alias a result vs
// replace an instruction) than the non-peepmatic versions of peephole
// optimizations. Note that the `peepmatic`-based results can be tested
// with the `test peepmatic` subtest.
if cfg!(feature = "enable-peepmatic") {
Ok(())
} else {
run_filecheck(&text, context)
}
}
}