Optimize immediates and compare-and-branch sequences (#286)

* Add a pre-opt optimization to change constants into immediates.

This converts `iadd` + `iconst` into `iadd_imm`, and so on.
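
As an illustration, here is a minimal sketch of the opcode mapping this rewrite performs, using hypothetical stand-in types rather than the real Cretonne API and covering only a few representative opcodes (the actual rewrite is the `simplify()` function in the preopt hunk further down):

```rust
// Hypothetical stand-in opcode enum, not Cretonne's.
#[derive(Clone, Copy, PartialEq, Debug)]
enum Opcode { Iadd, Isub, Imul, Band, IaddImm, ImulImm, BandImm }

/// Return the immediate-form opcode and the (possibly adjusted) constant,
/// or `None` if this sketch doesn't handle the opcode.
fn to_imm_form(op: Opcode, k: i64) -> Option<(Opcode, i64)> {
    match op {
        Opcode::Iadd => Some((Opcode::IaddImm, k)),
        Opcode::Imul => Some((Opcode::ImulImm, k)),
        Opcode::Band => Some((Opcode::BandImm, k)),
        // `isub x, k` is rewritten as `iadd_imm x, -k`, matching the pass's
        // use of wrapping negation on the immediate.
        Opcode::Isub => Some((Opcode::IaddImm, k.wrapping_neg())),
        _ => None,
    }
}

fn main() {
    assert_eq!(to_imm_form(Opcode::Isub, 5), Some((Opcode::IaddImm, -5)));
    assert_eq!(to_imm_form(Opcode::Iadd, 7), Some((Opcode::IaddImm, 7)));
    assert_eq!(to_imm_form(Opcode::Imul, 3), Some((Opcode::ImulImm, 3)));
    assert_eq!(to_imm_form(Opcode::Band, 1), Some((Opcode::BandImm, 1)));
}
```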

* Optimize away redundant `bint` instructions.

Cretonne has a concept of "Testable" values, which can be either boolean
or integer. When an instruction that needs a "Testable" value receives
the result of a `bint` (a boolean-to-integer conversion), the `bint` is
redundant and can be eliminated.
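
A minimal sketch of the idea, with hypothetical stand-in types rather than the Cretonne data structures: if the testable operand was defined by a `bint`, test the original boolean instead and let DCE clean up the now-unused `bint`.

```rust
// Hypothetical value and instruction types, not Cretonne's.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
struct Value(u32);

enum InstData {
    /// `bint`: boolean-to-integer conversion producing 0 or 1.
    Bint { arg: Value },
    /// Anything else.
    Other,
}

/// Resolve a "Testable" operand past a defining `bint`, if there is one.
fn resolve_testable(operand: Value, def: Option<&InstData>) -> Value {
    match def {
        Some(&InstData::Bint { arg }) => arg,
        _ => operand,
    }
}

fn main() {
    let b = Value(1);                      // a b1 value
    let i = Value(2);                      // `i = bint b`
    let def = InstData::Bint { arg: b };
    // A `brnz i, ...` can test `b` directly; the `bint` becomes dead.
    assert_eq!(resolve_testable(i, Some(&def)), b);
    let _ = InstData::Other;               // silence dead-code warning
}
```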

* Postopt: Optimize using CPU flags.

This introduces a post-legalization optimization pass that converts
compare+branch sequences to use CPU flags values on targets that support them.
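
A minimal sketch of how the condition code for the new `brif` is chosen, using a hypothetical cut-down `IntCC` rather than the real one: `brnz` keeps the comparison's condition, while `brz` (branch if the comparison was false) takes its inverse.

```rust
// Hypothetical subset of integer condition codes, not Cretonne's full IntCC.
#[derive(Clone, Copy, PartialEq, Debug)]
enum IntCC { Equal, NotEqual, SignedLessThan, SignedGreaterThanOrEqual }

impl IntCC {
    fn inverse(self) -> IntCC {
        match self {
            IntCC::Equal => IntCC::NotEqual,
            IntCC::NotEqual => IntCC::Equal,
            IntCC::SignedLessThan => IntCC::SignedGreaterThanOrEqual,
            IntCC::SignedGreaterThanOrEqual => IntCC::SignedLessThan,
        }
    }
}

/// `icmp cond, x, y` + `brz`/`brnz` becomes `ifcmp x, y` + `brif cond', flags`:
/// `brnz` keeps the comparison's condition, `brz` uses its inverse.
fn branch_condition(cmp_cond: IntCC, branch_is_brz: bool) -> IntCC {
    if branch_is_brz { cmp_cond.inverse() } else { cmp_cond }
}

fn main() {
    assert_eq!(branch_condition(IntCC::Equal, true), IntCC::NotEqual);
    assert_eq!(branch_condition(IntCC::SignedLessThan, false), IntCC::SignedLessThan);
}
```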

* Define a form of x86's `urm` that doesn't clobber FLAGS.

movzbl/movsbl/etc. don't clobber FLAGS; define a form of the `urm`
recipe that represents this.

* Implement a DCE pass.

This pass deletes instructions that have no side effects and whose
results are all unused.
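
A minimal sketch of the approach over a hypothetical straight-line instruction list (the real pass, in dce.rs below, walks EBBs in post-order and resolves value aliases): traverse backwards, keep anything with side effects or a live result, and mark the kept instructions' arguments live.

```rust
// Hypothetical instruction representation, not Cretonne's DataFlowGraph.
#[derive(Clone, Debug)]
struct Inst {
    result: Option<usize>,  // index of the value this instruction defines
    args: Vec<usize>,       // value indices it reads
    has_side_effects: bool, // calls, stores, traps, possibly-trapping loads, ...
}

/// Walk the instructions backwards; keep anything with side effects or a used
/// result, and mark the arguments of kept instructions as live.
fn dce(insts: &mut Vec<Inst>, num_values: usize) {
    let mut live = vec![false; num_values];
    let mut kept = Vec::new();
    for inst in insts.drain(..).rev() {
        let result_used = inst.result.map_or(false, |r| live[r]);
        if inst.has_side_effects || result_used {
            for &a in &inst.args {
                live[a] = true;
            }
            kept.push(inst);
        }
        // Otherwise the instruction is dead and is simply dropped.
    }
    kept.reverse();
    *insts = kept;
}

fn main() {
    // v0 = iconst (never used); v1 = something side-effecting that reads v2.
    let mut insts = vec![
        Inst { result: Some(0), args: vec![], has_side_effects: false },
        Inst { result: Some(1), args: vec![2], has_side_effects: true },
    ];
    dce(&mut insts, 3);
    assert_eq!(insts.len(), 1); // only the side-effecting instruction remains
}
```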

* Clarify ambiguity about "32-bit" and "64-bit" in comments.

* Add x86 encodings for icmp_imm.

* Add a testcase for postopt CPU flags optimization.

This covers the basic functionality of transforming compare+branch
sequences to use CPU flags.

* Pattern-match irsub_imm in preopt.
Dan Gohman
2018-03-30 12:30:07 -07:00
committed by GitHub
parent 5377092e5b
commit 6606b88136
22 changed files with 921 additions and 109 deletions


@@ -378,6 +378,8 @@ X86_64.enc(base.trapff, r.trapff, 0)
# Comparisons
#
enc_i32_i64(base.icmp, r.icscc, 0x39)
enc_i32_i64(base.icmp_imm, r.icsccib, 0x83, rrr=7)
enc_i32_i64(base.icmp_imm, r.icsccid, 0x81, rrr=7)
enc_i32_i64(base.ifcmp, r.rcmp, 0x39)
enc_i32_i64(base.ifcmp_imm, r.rcmpib, 0x83, rrr=7)
enc_i32_i64(base.ifcmp_imm, r.rcmpid, 0x81, rrr=7)
@@ -409,11 +411,13 @@ enc_i32_i64(x86.bsr, r.bsf_and_bsr, 0x0F, 0xBD)
#
# This assumes that b1 is represented as an 8-bit low register with the value 0
# or 1.
X86_32.enc(base.bint.i32.b1, *r.urm_abcd(0x0f, 0xb6))
X86_64.enc(base.bint.i64.b1, *r.urm.rex(0x0f, 0xb6)) # zext to i64 implicit.
X86_64.enc(base.bint.i64.b1, *r.urm_abcd(0x0f, 0xb6)) # zext to i64 implicit.
X86_64.enc(base.bint.i32.b1, *r.urm.rex(0x0f, 0xb6))
X86_64.enc(base.bint.i32.b1, *r.urm_abcd(0x0f, 0xb6))
#
# Encode movzbq as movzbl, because it's equivalent and shorter.
X86_32.enc(base.bint.i32.b1, *r.urm_noflags_abcd(0x0f, 0xb6))
X86_64.enc(base.bint.i64.b1, *r.urm_noflags.rex(0x0f, 0xb6))
X86_64.enc(base.bint.i64.b1, *r.urm_noflags_abcd(0x0f, 0xb6))
X86_64.enc(base.bint.i32.b1, *r.urm_noflags.rex(0x0f, 0xb6))
X86_64.enc(base.bint.i32.b1, *r.urm_noflags_abcd(0x0f, 0xb6))
# Numerical conversions.
@@ -430,41 +434,41 @@ X86_64.enc(base.ireduce.i32.i64, r.null, 0)
# instructions for %al/%ax/%eax to %ax/%eax/%rax.
# movsbl
X86_32.enc(base.sextend.i32.i8, *r.urm(0x0f, 0xbe))
X86_64.enc(base.sextend.i32.i8, *r.urm.rex(0x0f, 0xbe))
X86_64.enc(base.sextend.i32.i8, *r.urm(0x0f, 0xbe))
X86_32.enc(base.sextend.i32.i8, *r.urm_noflags(0x0f, 0xbe))
X86_64.enc(base.sextend.i32.i8, *r.urm_noflags.rex(0x0f, 0xbe))
X86_64.enc(base.sextend.i32.i8, *r.urm_noflags(0x0f, 0xbe))
# movswl
X86_32.enc(base.sextend.i32.i16, *r.urm(0x0f, 0xbf))
X86_64.enc(base.sextend.i32.i16, *r.urm.rex(0x0f, 0xbf))
X86_64.enc(base.sextend.i32.i16, *r.urm(0x0f, 0xbf))
X86_32.enc(base.sextend.i32.i16, *r.urm_noflags(0x0f, 0xbf))
X86_64.enc(base.sextend.i32.i16, *r.urm_noflags.rex(0x0f, 0xbf))
X86_64.enc(base.sextend.i32.i16, *r.urm_noflags(0x0f, 0xbf))
# movsbq
X86_64.enc(base.sextend.i64.i8, *r.urm.rex(0x0f, 0xbe, w=1))
X86_64.enc(base.sextend.i64.i8, *r.urm_noflags.rex(0x0f, 0xbe, w=1))
# movswq
X86_64.enc(base.sextend.i64.i16, *r.urm.rex(0x0f, 0xbf, w=1))
X86_64.enc(base.sextend.i64.i16, *r.urm_noflags.rex(0x0f, 0xbf, w=1))
# movslq
X86_64.enc(base.sextend.i64.i32, *r.urm.rex(0x63, w=1))
X86_64.enc(base.sextend.i64.i32, *r.urm_noflags.rex(0x63, w=1))
# movzbl
X86_32.enc(base.uextend.i32.i8, *r.urm(0x0f, 0xb6))
X86_64.enc(base.uextend.i32.i8, *r.urm.rex(0x0f, 0xb6))
X86_64.enc(base.uextend.i32.i8, *r.urm(0x0f, 0xb6))
X86_32.enc(base.uextend.i32.i8, *r.urm_noflags(0x0f, 0xb6))
X86_64.enc(base.uextend.i32.i8, *r.urm_noflags.rex(0x0f, 0xb6))
X86_64.enc(base.uextend.i32.i8, *r.urm_noflags(0x0f, 0xb6))
# movzwl
X86_32.enc(base.uextend.i32.i16, *r.urm(0x0f, 0xb7))
X86_64.enc(base.uextend.i32.i16, *r.urm.rex(0x0f, 0xb7))
X86_64.enc(base.uextend.i32.i16, *r.urm(0x0f, 0xb7))
X86_32.enc(base.uextend.i32.i16, *r.urm_noflags(0x0f, 0xb7))
X86_64.enc(base.uextend.i32.i16, *r.urm_noflags.rex(0x0f, 0xb7))
X86_64.enc(base.uextend.i32.i16, *r.urm_noflags(0x0f, 0xb7))
# movzbq, encoded as movzbl because it's equivalent and shorter
X86_64.enc(base.uextend.i64.i8, *r.urm.rex(0x0f, 0xb6))
X86_64.enc(base.uextend.i64.i8, *r.urm(0x0f, 0xb6))
X86_64.enc(base.uextend.i64.i8, *r.urm_noflags.rex(0x0f, 0xb6))
X86_64.enc(base.uextend.i64.i8, *r.urm_noflags(0x0f, 0xb6))
# movzwq, encoded as movzwl because it's equivalent and shorter
X86_64.enc(base.uextend.i64.i16, *r.urm.rex(0x0f, 0xb7))
X86_64.enc(base.uextend.i64.i16, *r.urm(0x0f, 0xb7))
X86_64.enc(base.uextend.i64.i16, *r.urm_noflags.rex(0x0f, 0xb7))
X86_64.enc(base.uextend.i64.i16, *r.urm_noflags(0x0f, 0xb7))
# A 32-bit register copy clears the high 32 bits.
X86_64.enc(base.uextend.i64.i32, *r.umr.rex(0x89))


@@ -8,7 +8,8 @@ from cdsl.registers import RegClass
from base.formats import Unary, UnaryImm, UnaryBool, Binary, BinaryImm
from base.formats import MultiAry, NullAry
from base.formats import Trap, Call, IndirectCall, Store, Load
from base.formats import IntCompare, FloatCompare, IntCond, FloatCond
from base.formats import IntCompare, IntCompareImm, FloatCompare
from base.formats import IntCond, FloatCond
from base.formats import IntSelect, IntCondTrap, FloatCondTrap
from base.formats import Jump, Branch, BranchInt, BranchFloat
from base.formats import Ternary, FuncAddr, UnaryGlobalVar
@@ -364,7 +365,7 @@ rfumr = TailRecipe(
''')
# XX /r, but for a unary operator with separate input/output register.
# RM form.
# RM form. Clobbers FLAGS.
urm = TailRecipe(
'urm', Unary, size=1, ins=GPR, outs=GPR,
emit='''
@@ -372,10 +373,19 @@ urm = TailRecipe(
modrm_rr(in_reg0, out_reg0, sink);
''')
# XX /r. Same as urm, but input limited to ABCD.
urm_abcd = TailRecipe(
'urm_abcd', Unary, size=1, ins=ABCD, outs=GPR,
when_prefixed=urm,
# XX /r. Same as urm, but doesn't clobber FLAGS.
urm_noflags = TailRecipe(
'urm_noflags', Unary, size=1, ins=GPR, outs=GPR,
clobbers_flags=False,
emit='''
PUT_OP(bits, rex2(in_reg0, out_reg0), sink);
modrm_rr(in_reg0, out_reg0, sink);
''')
# XX /r. Same as urm_noflags, but input limited to ABCD.
urm_noflags_abcd = TailRecipe(
'urm_noflags_abcd', Unary, size=1, ins=ABCD, outs=GPR,
when_prefixed=urm_noflags,
emit='''
PUT_OP(bits, rex2(in_reg0, out_reg0), sink);
modrm_rr(in_reg0, out_reg0, sink);
@@ -1360,6 +1370,61 @@ icscc = TailRecipe(
modrm_rr(out_reg0, 0, sink);
''')
icsccib = TailRecipe(
'icsccib', IntCompareImm, size=2 + 3, ins=GPR, outs=ABCD,
instp=IsSignedInt(IntCompareImm.imm, 8),
emit='''
// Comparison instruction.
PUT_OP(bits, rex1(in_reg0), sink);
modrm_r_bits(in_reg0, bits, sink);
let imm: i64 = imm.into();
sink.put1(imm as u8);
// `setCC` instruction, no REX.
use ir::condcodes::IntCC::*;
let setcc = match cond {
Equal => 0x94,
NotEqual => 0x95,
SignedLessThan => 0x9c,
SignedGreaterThanOrEqual => 0x9d,
SignedGreaterThan => 0x9f,
SignedLessThanOrEqual => 0x9e,
UnsignedLessThan => 0x92,
UnsignedGreaterThanOrEqual => 0x93,
UnsignedGreaterThan => 0x97,
UnsignedLessThanOrEqual => 0x96,
};
sink.put1(0x0f);
sink.put1(setcc);
modrm_rr(out_reg0, 0, sink);
''')
icsccid = TailRecipe(
'icsccid', IntCompareImm, size=5 + 3, ins=GPR, outs=ABCD,
instp=IsSignedInt(IntCompareImm.imm, 32),
emit='''
// Comparison instruction.
PUT_OP(bits, rex1(in_reg0), sink);
modrm_r_bits(in_reg0, bits, sink);
let imm: i64 = imm.into();
sink.put4(imm as u32);
// `setCC` instruction, no REX.
use ir::condcodes::IntCC::*;
let setcc = match cond {
Equal => 0x94,
NotEqual => 0x95,
SignedLessThan => 0x9c,
SignedGreaterThanOrEqual => 0x9d,
SignedGreaterThan => 0x9f,
SignedLessThanOrEqual => 0x9e,
UnsignedLessThan => 0x92,
UnsignedGreaterThanOrEqual => 0x93,
UnsignedGreaterThan => 0x97,
UnsignedLessThanOrEqual => 0x96,
};
sink.put1(0x0f);
sink.put1(setcc);
modrm_rr(out_reg0, 0, sink);
''')
# Make a FloatCompare instruction predicate with the supported condition codes.


@@ -21,9 +21,11 @@ use result::{CtonError, CtonResult};
use settings::{FlagsOrIsa, OptLevel};
use unreachable_code::eliminate_unreachable_code;
use verifier;
use dce::do_dce;
use simple_gvn::do_simple_gvn;
use licm::do_licm;
use preopt::do_preopt;
use postopt::do_postopt;
use timing;
/// Persistent data structures and compilation pipeline.
@@ -92,6 +94,9 @@ impl Context {
self.preopt(isa)?;
}
self.legalize(isa)?;
if isa.flags().opt_level() != OptLevel::Fastest {
self.postopt(isa)?;
}
if isa.flags().opt_level() == OptLevel::Best {
self.compute_domtree();
self.compute_loop_analysis();
@@ -100,6 +105,7 @@ impl Context {
}
self.compute_domtree();
self.eliminate_unreachable_code(isa)?;
self.dce(isa)?;
self.regalloc(isa)?;
self.prologue_epilogue(isa)?;
self.relax_branches(isa)
@@ -153,6 +159,13 @@ impl Context {
}
}
/// Perform dead-code elimination on the function.
pub fn dce<'a, FOI: Into<FlagsOrIsa<'a>>>(&mut self, fisa: FOI) -> CtonResult {
do_dce(&mut self.func, &mut self.domtree);
self.verify_if(fisa)?;
Ok(())
}
/// Perform pre-legalization rewrites on the function.
pub fn preopt(&mut self, isa: &TargetIsa) -> CtonResult {
do_preopt(&mut self.func);
@@ -170,6 +183,13 @@ impl Context {
self.verify_if(isa)
}
/// Perform post-legalization rewrites on the function.
pub fn postopt(&mut self, isa: &TargetIsa) -> CtonResult {
do_postopt(&mut self.func, isa);
self.verify_if(isa)?;
Ok(())
}
/// Compute the control flow graph.
pub fn compute_cfg(&mut self) {
self.cfg.compute(&self.func)

lib/cretonne/src/dce.rs (new file, 68 lines)

@@ -0,0 +1,68 @@
//! A Dead-Code Elimination (DCE) pass.
//!
//! Dead code here means instructions that have no side effects and have no
//! result values used by other instructions.
use cursor::{Cursor, FuncCursor};
use dominator_tree::DominatorTree;
use entity::EntityRef;
use ir::{Function, Inst, Opcode, DataFlowGraph};
use ir::instructions::InstructionData;
use timing;
use std::vec::Vec;
/// Test whether the given opcode is unsafe to even consider for DCE.
fn trivially_unsafe_for_dce(opcode: Opcode) -> bool {
opcode.is_call() || opcode.is_branch() || opcode.is_terminator() ||
opcode.is_return() || opcode.can_trap() || opcode.other_side_effects() ||
opcode.can_store()
}
/// Preserve instructions with used result values.
fn any_inst_results_used(inst: Inst, live: &[bool], dfg: &DataFlowGraph) -> bool {
dfg.inst_results(inst).iter().any(|v| live[v.index()])
}
/// Load instructions without the `notrap` flag are defined to trap when
/// operating on inaccessible memory, so we can't DCE them even if the
/// loaded value is unused.
fn is_load_with_defined_trapping(opcode: Opcode, data: &InstructionData) -> bool {
if !opcode.can_load() {
return false;
}
match *data {
InstructionData::StackLoad { .. } => false,
InstructionData::Load { flags, .. } => !flags.notrap(),
_ => true,
}
}
/// Perform DCE on `func`.
pub fn do_dce(func: &mut Function, domtree: &mut DominatorTree) {
let _tt = timing::dce();
debug_assert!(domtree.is_valid());
let mut live = Vec::with_capacity(func.dfg.num_values());
live.resize(func.dfg.num_values(), false);
for &ebb in domtree.cfg_postorder().iter() {
let mut pos = FuncCursor::new(func).at_bottom(ebb);
while let Some(inst) = pos.prev_inst() {
{
let data = &pos.func.dfg[inst];
let opcode = data.opcode();
if trivially_unsafe_for_dce(opcode) ||
is_load_with_defined_trapping(opcode, &data) ||
any_inst_results_used(inst, &live, &pos.func.dfg)
{
for arg in pos.func.dfg.inst_args(inst) {
let v = pos.func.dfg.resolve_aliases(*arg);
live[v.index()] = true;
}
continue;
}
}
pos.remove_inst();
}
}
}


@@ -21,6 +21,11 @@ impl Imm64 {
pub fn new(x: i64) -> Imm64 {
Imm64(x)
}
/// Return self negated.
pub fn wrapping_neg(self) -> Imm64 {
Imm64(self.0.wrapping_neg())
}
}
impl Into<i64> for Imm64 {


@@ -58,6 +58,10 @@ impl TargetIsa for Isa {
&self.shared_flags
}
fn uses_cpu_flags(&self) -> bool {
true
}
fn register_info(&self) -> RegInfo {
registers::INFO.clone()
}


@@ -158,6 +158,11 @@ pub trait TargetIsa: fmt::Display {
/// Get the ISA-independent flags that were used to make this trait object.
fn flags(&self) -> &settings::Flags;
/// Does the CPU implement scalar comparisons using a CPU flags register?
fn uses_cpu_flags(&self) -> bool {
false
}
/// Get a data structure describing the registers in this ISA.
fn register_info(&self) -> RegInfo;


@@ -68,11 +68,13 @@ mod abi;
mod bitset;
mod constant_hash;
mod context;
mod dce;
mod divconst_magic_numbers;
mod iterators;
mod legalizer;
mod licm;
mod partition_slice;
mod postopt;
mod predicates;
mod preopt;
mod ref_slice;

lib/cretonne/src/postopt.rs (new file, 211 lines)

@@ -0,0 +1,211 @@
//! A post-legalization rewriting pass.
#![allow(non_snake_case)]
use cursor::{Cursor, EncCursor};
use ir::dfg::ValueDef;
use ir::{Function, InstructionData, Value, InstBuilder, Ebb, Inst};
use ir::condcodes::{CondCode, IntCC, FloatCC};
use ir::instructions::{Opcode, ValueList};
use ir::immediates::Imm64;
use isa::TargetIsa;
use timing;
/// Information collected about a compare+branch sequence.
struct CmpBrInfo {
/// The branch instruction.
br_inst: Inst,
/// The icmp, icmp_imm, or fcmp instruction.
cmp_inst: Inst,
/// The destination of the branch.
destination: Ebb,
/// The arguments of the branch.
args: ValueList,
/// The first argument to the comparison. The second is in the `kind` field.
cmp_arg: Value,
/// If the branch is `brz` rather than `brnz`, we need to invert the condition
/// before the branch.
invert_branch_cond: bool,
/// The kind of comparison, and the second argument.
kind: CmpBrKind,
}
enum CmpBrKind {
Icmp { cond: IntCC, arg: Value },
IcmpImm { cond: IntCC, imm: Imm64 },
Fcmp { cond: FloatCC, arg: Value },
}
/// Optimize comparisons to use flags values, to avoid materializing conditions
/// in integer registers.
///
/// For example, optimize icmp/fcmp brz/brnz sequences into ifcmp/ffcmp brif/brff
/// sequences.
fn optimize_cpu_flags(
pos: &mut EncCursor,
inst: Inst,
last_flags_clobber: Option<Inst>,
isa: &TargetIsa,
) {
// Look for compare and branch patterns.
// This code could be considerably simplified with non-lexical lifetimes.
let info = match pos.func.dfg[inst] {
InstructionData::Branch {
opcode,
destination,
ref args,
} => {
let first_arg = args.first(&pos.func.dfg.value_lists).unwrap();
let invert_branch_cond = match opcode {
Opcode::Brz => true,
Opcode::Brnz => false,
_ => panic!(),
};
if let ValueDef::Result(cond_inst, _) = pos.func.dfg.value_def(first_arg) {
match pos.func.dfg[cond_inst] {
InstructionData::IntCompare {
cond,
args: cmp_args,
..
} => {
CmpBrInfo {
br_inst: inst,
cmp_inst: cond_inst,
destination,
args: args.clone(),
cmp_arg: cmp_args[0],
invert_branch_cond,
kind: CmpBrKind::Icmp {
cond,
arg: cmp_args[1],
},
}
}
InstructionData::IntCompareImm {
cond,
arg: cmp_arg,
imm: cmp_imm,
..
} => {
CmpBrInfo {
br_inst: inst,
cmp_inst: cond_inst,
destination,
args: args.clone(),
cmp_arg,
invert_branch_cond,
kind: CmpBrKind::IcmpImm { cond, imm: cmp_imm },
}
}
InstructionData::FloatCompare {
cond,
args: cmp_args,
..
} => {
CmpBrInfo {
br_inst: inst,
cmp_inst: cond_inst,
destination,
args: args.clone(),
cmp_arg: cmp_args[0],
invert_branch_cond,
kind: CmpBrKind::Fcmp {
cond,
arg: cmp_args[1],
},
}
}
_ => return,
}
} else {
return;
}
}
// TODO: trapif, trueif, selectif, and their ff counterparts.
_ => return,
};
// If any instructions clobber the flags between the comparison and the branch,
// don't optimize them.
if last_flags_clobber != Some(info.cmp_inst) {
return;
}
// We found a compare+branch pattern. Transform it to use flags.
let args = info.args.as_slice(&pos.func.dfg.value_lists)[1..].to_vec();
pos.goto_inst(info.cmp_inst);
match info.kind {
CmpBrKind::Icmp { mut cond, arg } => {
let flags = pos.ins().ifcmp(info.cmp_arg, arg);
pos.func.dfg.replace(info.cmp_inst).trueif(cond, flags);
if info.invert_branch_cond {
cond = cond.inverse();
}
pos.func.dfg.replace(info.br_inst).brif(
cond,
flags,
info.destination,
&args,
);
}
CmpBrKind::IcmpImm { mut cond, imm } => {
let flags = pos.ins().ifcmp_imm(info.cmp_arg, imm);
pos.func.dfg.replace(info.cmp_inst).trueif(cond, flags);
if info.invert_branch_cond {
cond = cond.inverse();
}
pos.func.dfg.replace(info.br_inst).brif(
cond,
flags,
info.destination,
&args,
);
}
CmpBrKind::Fcmp { mut cond, arg } => {
let flags = pos.ins().ffcmp(info.cmp_arg, arg);
pos.func.dfg.replace(info.cmp_inst).trueff(cond, flags);
if info.invert_branch_cond {
cond = cond.inverse();
}
pos.func.dfg.replace(info.br_inst).brff(
cond,
flags,
info.destination,
&args,
);
}
}
pos.func.update_encoding(info.cmp_inst, isa).is_ok();
pos.func.update_encoding(info.br_inst, isa).is_ok();
}
//----------------------------------------------------------------------
//
// The main post-opt pass.
pub fn do_postopt(func: &mut Function, isa: &TargetIsa) {
let _tt = timing::postopt();
let mut pos = EncCursor::new(func, isa);
while let Some(_ebb) = pos.next_ebb() {
let mut last_flags_clobber = None;
while let Some(inst) = pos.next_inst() {
if isa.uses_cpu_flags() {
// Optimize instructions to make use of flags.
optimize_cpu_flags(&mut pos, inst, last_flags_clobber, isa);
// Track the most recent seen instruction that clobbers the flags.
if let Some(constraints) =
isa.encoding_info().operand_constraints(
pos.func.encodings[inst],
)
{
if constraints.clobbers_flags {
last_flags_clobber = Some(inst)
}
}
}
}
}
}


@@ -127,28 +127,6 @@ fn get_div_info(inst: Inst, dfg: &DataFlowGraph) -> Option<DivRemByConstInfo> {
return package_up_divrem_info(arg, argL_ty, imm.into(), isSigned, isRem);
}
// TODO: should we actually bother to do this (that is, manually match
// the case that the second argument is an iconst)? Or should we assume
// that some previous constant propagation pass has pushed all such
// immediates to their use points, creating BinaryImm instructions
// instead? For now we take the conservative approach.
if let InstructionData::Binary { opcode, args } = *idata {
let (isSigned, isRem) = match opcode {
Opcode::Udiv => (false, false),
Opcode::Urem => (false, true),
Opcode::Sdiv => (true, false),
Opcode::Srem => (true, true),
_other => return None,
};
let argR: Value = args[1];
if let Some(simm64) = get_const(argR, dfg) {
let argL: Value = args[0];
// Pull the operation size (type) from the left arg
let argL_ty = dfg.value_type(argL);
return package_up_divrem_info(argL, argL_ty, simm64, isSigned, isRem);
}
}
None
}
@@ -473,25 +451,106 @@ fn do_divrem_transformation(divrem_info: &DivRemByConstInfo, pos: &mut FuncCurso
}
}
//----------------------------------------------------------------------
//
// General pattern-match helpers.
/// Find out if `value` actually resolves to a constant, and if so what its
/// value is.
fn get_const(value: Value, dfg: &DataFlowGraph) -> Option<i64> {
match dfg.value_def(value) {
ValueDef::Result(definingInst, resultNo) => {
let definingIData: &InstructionData = &dfg[definingInst];
if let InstructionData::UnaryImm { opcode, imm } = *definingIData {
if opcode == Opcode::Iconst && resultNo == 0 {
return Some(imm.into());
/// Apply basic simplifications.
///
/// This folds constants with arithmetic to form `_imm` instructions, and other
/// minor simplifications.
fn simplify(pos: &mut FuncCursor, inst: Inst) {
match pos.func.dfg[inst] {
InstructionData::Binary { opcode, args } => {
if let ValueDef::Result(iconst_inst, _) = pos.func.dfg.value_def(args[1]) {
if let InstructionData::UnaryImm {
opcode: Opcode::Iconst,
mut imm,
} = pos.func.dfg[iconst_inst]
{
let new_opcode = match opcode {
Opcode::Iadd => Opcode::IaddImm,
Opcode::Imul => Opcode::ImulImm,
Opcode::Sdiv => Opcode::SdivImm,
Opcode::Udiv => Opcode::UdivImm,
Opcode::Srem => Opcode::SremImm,
Opcode::Urem => Opcode::UremImm,
Opcode::Band => Opcode::BandImm,
Opcode::Bor => Opcode::BorImm,
Opcode::Bxor => Opcode::BxorImm,
Opcode::Rotl => Opcode::RotlImm,
Opcode::Rotr => Opcode::RotrImm,
Opcode::Ishl => Opcode::IshlImm,
Opcode::Ushr => Opcode::UshrImm,
Opcode::Sshr => Opcode::SshrImm,
Opcode::Isub => {
imm = imm.wrapping_neg();
Opcode::IaddImm
}
_ => return,
};
let ty = pos.func.dfg.ctrl_typevar(inst);
pos.func.dfg.replace(inst).BinaryImm(
new_opcode,
ty,
imm,
args[0],
);
}
} else if let ValueDef::Result(iconst_inst, _) = pos.func.dfg.value_def(args[0]) {
if let InstructionData::UnaryImm {
opcode: Opcode::Iconst,
mut imm,
} = pos.func.dfg[iconst_inst]
{
let new_opcode = match opcode {
Opcode::Isub => Opcode::IrsubImm,
_ => return,
};
let ty = pos.func.dfg.ctrl_typevar(inst);
pos.func.dfg.replace(inst).BinaryImm(
new_opcode,
ty,
imm,
args[0],
);
}
}
None
}
ValueDef::Param(_definingEbb, _paramNo) => None,
InstructionData::IntCompare { opcode, cond, args } => {
debug_assert_eq!(opcode, Opcode::Icmp);
if let ValueDef::Result(iconst_inst, _) = pos.func.dfg.value_def(args[1]) {
if let InstructionData::UnaryImm {
opcode: Opcode::Iconst,
imm,
} = pos.func.dfg[iconst_inst]
{
pos.func.dfg.replace(inst).icmp_imm(cond, args[0], imm);
}
}
}
InstructionData::CondTrap { .. } |
InstructionData::Branch { .. } |
InstructionData::Ternary { opcode: Opcode::Select, .. } => {
// Fold away a redundant `bint`.
let maybe = {
let args = pos.func.dfg.inst_args(inst);
if let ValueDef::Result(def_inst, _) = pos.func.dfg.value_def(args[0]) {
if let InstructionData::Unary {
opcode: Opcode::Bint,
arg: bool_val,
} = pos.func.dfg[def_inst]
{
Some(bool_val)
} else {
None
}
} else {
None
}
};
if let Some(bool_val) = maybe {
let args = pos.func.dfg.inst_args_mut(inst);
args[0] = bool_val;
}
}
_ => {}
}
}
@@ -503,6 +562,8 @@ pub fn do_preopt(func: &mut Function) {
while let Some(_ebb) = pos.next_ebb() {
while let Some(inst) = pos.next_inst() {
// Apply basic simplifications.
simplify(&mut pos, inst);
//-- BEGIN -- division by constants ----------------


@@ -55,7 +55,9 @@ define_passes!{
flowgraph: "Control flow graph",
domtree: "Dominator tree",
loop_analysis: "Loop analysis",
postopt: "Post-legalization rewriting",
preopt: "Pre-legalization rewriting",
dce: "Dead code elimination",
legalize: "Legalization",
gvn: "Global value numbering",
licm: "Loop invariant code motion",


@@ -28,9 +28,11 @@ mod match_directive;
mod test_binemit;
mod test_cat;
mod test_compile;
mod test_dce;
mod test_domtree;
mod test_legalizer;
mod test_licm;
mod test_postopt;
mod test_preopt;
mod test_print_cfg;
mod test_regalloc;
@@ -73,9 +75,11 @@ fn new_subtest(parsed: &TestCommand) -> subtest::Result<Box<subtest::SubTest>> {
"binemit" => test_binemit::subtest(parsed),
"cat" => test_cat::subtest(parsed),
"compile" => test_compile::subtest(parsed),
"dce" => test_dce::subtest(parsed),
"domtree" => test_domtree::subtest(parsed),
"legalizer" => test_legalizer::subtest(parsed),
"licm" => test_licm::subtest(parsed),
"postopt" => test_postopt::subtest(parsed),
"preopt" => test_preopt::subtest(parsed),
"print-cfg" => test_print_cfg::subtest(parsed),
"regalloc" => test_regalloc::subtest(parsed),


@@ -0,0 +1,53 @@
//! Test command for testing the DCE pass.
//!
//! The `dce` test command runs each function through the DCE pass after ensuring
//! that all instructions are legal for the target.
//!
//! The resulting function is sent to `filecheck`.
use cretonne::ir::Function;
use cretonne;
use cretonne::print_errors::pretty_error;
use cton_reader::TestCommand;
use subtest::{SubTest, Context, Result, run_filecheck};
use std::borrow::Cow;
use std::fmt::Write;
struct TestDCE;
pub fn subtest(parsed: &TestCommand) -> Result<Box<SubTest>> {
assert_eq!(parsed.command, "dce");
if !parsed.options.is_empty() {
Err(format!("No options allowed on {}", parsed))
} else {
Ok(Box::new(TestDCE))
}
}
impl SubTest for TestDCE {
fn name(&self) -> Cow<str> {
Cow::from("dce")
}
fn is_mutating(&self) -> bool {
true
}
fn run(&self, func: Cow<Function>, context: &Context) -> Result<()> {
// Create a compilation context, and drop in the function.
let mut comp_ctx = cretonne::Context::new();
comp_ctx.func = func.into_owned();
comp_ctx.flowgraph();
comp_ctx.compute_loop_analysis();
comp_ctx.dce(context.flags_or_isa()).map_err(|e| {
pretty_error(&comp_ctx.func, context.isa, Into::into(e))
})?;
let mut text = String::new();
write!(&mut text, "{}", &comp_ctx.func).map_err(
|e| e.to_string(),
)?;
run_filecheck(&text, context)
}
}


@@ -0,0 +1,50 @@
//! Test command for testing the postopt pass.
//!
//! The resulting function is sent to `filecheck`.
use cretonne::ir::Function;
use cretonne;
use cretonne::print_errors::pretty_error;
use cton_reader::TestCommand;
use subtest::{SubTest, Context, Result, run_filecheck};
use std::borrow::Cow;
use std::fmt::Write;
struct TestPostopt;
pub fn subtest(parsed: &TestCommand) -> Result<Box<SubTest>> {
assert_eq!(parsed.command, "postopt");
if !parsed.options.is_empty() {
Err(format!("No options allowed on {}", parsed))
} else {
Ok(Box::new(TestPostopt))
}
}
impl SubTest for TestPostopt {
fn name(&self) -> Cow<str> {
Cow::from("postopt")
}
fn is_mutating(&self) -> bool {
true
}
fn run(&self, func: Cow<Function>, context: &Context) -> Result<()> {
// Create a compilation context, and drop in the function.
let mut comp_ctx = cretonne::Context::new();
comp_ctx.func = func.into_owned();
let isa = context.isa.expect("postopt needs an ISA");
comp_ctx.flowgraph();
comp_ctx.postopt(isa).map_err(|e| {
pretty_error(&comp_ctx.func, context.isa, Into::into(e))
})?;
let mut text = String::new();
write!(&mut text, "{}", &comp_ctx.func).map_err(
|e| e.to_string(),
)?;
run_filecheck(&text, context)
}
}