Reuse the DominatorTree postorder traversal in BlockLoweringOrder (#5843)
* Rework the blockorder module to reuse the dom tree's cfg postorder
* Update domtree tests
* Treat br_table with an empty jump table as multiple block exits
* Bless tests
* Change branch_idx to succ_idx and fix the comment
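The gist of the change, as a hedged sketch (names taken from the hunks below; the two wrapper functions here are hypothetical and exist only for illustration): the DominatorTree that Context already computes is now threaded through TargetIsa::compile_function and machinst::compile::compile into BlockLoweringOrder::new, so block lowering reuses the dom tree's cached CFG postorder instead of running its own DFS.

// Sketch only: how the dominator tree now flows into block lowering.
fn compile_stencil_sketch(ctx: &mut Context, isa: &dyn TargetIsa) -> CodegenResult<CompiledCodeStencil> {
    // `optimize` is assumed (as in the hunk below) to have populated ctx.domtree.
    ctx.optimize(isa)?;
    // New: the domtree is passed down instead of being recomputed later.
    isa.compile_function(&ctx.func, &ctx.domtree, ctx.want_disasm)
}

// Inside each backend, compile::compile::<B>(func, domtree, ...) ends up doing:
fn lowering_order_sketch(func: &Function, domtree: &DominatorTree) -> BlockLoweringOrder {
    // Reuses domtree.cfg_postorder() rather than a private DFS over the CFG.
    BlockLoweringOrder::new(func, domtree)
}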
@@ -140,7 +140,7 @@ impl Context {
self.optimize(isa)?;

isa.compile_function(&self.func, self.want_disasm)
isa.compile_function(&self.func, &self.domtree, self.want_disasm)
}

/// Optimize the function, performing all compilation steps up to

@@ -311,9 +311,17 @@ impl DominatorTree {
self.nodes[block].rpo_number = SEEN;
self.stack.push((Visit::Last, block));
if let Some(inst) = func.stencil.layout.last_inst(block) {
// Heuristic: chase the children in reverse. This puts the first
// successor block first in the postorder, all other things being
// equal, which tends to prioritize loop backedges over out-edges,
// putting the edge-block closer to the loop body and minimizing
// live-ranges in linear instruction space. This heuristic doesn't have
// any effect on the computation of dominators, and is purely for other
// consumers of the postorder we cache here.
for block in func.stencil.dfg.insts[inst]
.branch_destination(&func.stencil.dfg.jump_tables)
.iter()
.rev()
{
let succ = block.block(&func.stencil.dfg.value_lists);

@@ -641,7 +649,7 @@ mod tests {
// return
// } block2
// } block0
assert_eq!(dt.cfg_postorder(), &[trap_block, block2, block0]);
assert_eq!(dt.cfg_postorder(), &[block2, trap_block, block0]);

let v2_def = cur.func.dfg.value_def(v2).unwrap_inst();
assert!(!dt.dominates(v2_def, block0, &cur.func.layout));

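The heuristic added above can be illustrated with a small, self-contained sketch (not Cranelift code; the graph and names are made up): an iterative DFS that examines successors back-to-front, so that a block's first-listed successor comes out earliest once the cached postorder is walked in reverse.

// Standalone illustration of the reverse-successor postorder heuristic.
fn postorder(succs: &[Vec<usize>], entry: usize) -> Vec<usize> {
    let mut seen = vec![false; succs.len()];
    let mut order = Vec::new();
    // Each stack entry is (node, number of successors already examined).
    let mut stack = vec![(entry, 0usize)];
    seen[entry] = true;
    while let Some(&(node, idx)) = stack.last() {
        let list = &succs[node];
        if idx == list.len() {
            // All successors visited: emit the node in postorder.
            order.push(node);
            stack.pop();
        } else {
            // Examine successors back-to-front, mirroring `.iter().rev()`.
            stack.last_mut().unwrap().1 += 1;
            let succ = list[list.len() - 1 - idx];
            if !seen[succ] {
                seen[succ] = true;
                stack.push((succ, 0));
            }
        }
    }
    order
}

fn main() {
    // Diamond: 0 -> {1, 2}, 1 -> {3}, 2 -> {3}.
    let g = vec![vec![1, 2], vec![3], vec![3], vec![]];
    // Prints [3, 2, 1, 0]; walked in reverse that is 0, 1, 2, 3, i.e. block
    // 0's first successor (1) is laid out before its second successor (2).
    println!("{:?}", postorder(&g, 0));
}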
@@ -1,5 +1,6 @@
|
||||
//! ARM 64-bit Instruction Set Architecture.
|
||||
|
||||
use crate::dominator_tree::DominatorTree;
|
||||
use crate::ir::condcodes::IntCC;
|
||||
use crate::ir::{Function, Type};
|
||||
use crate::isa::aarch64::settings as aarch64_settings;
|
||||
@@ -56,11 +57,12 @@ impl AArch64Backend {
|
||||
fn compile_vcode(
|
||||
&self,
|
||||
func: &Function,
|
||||
domtree: &DominatorTree,
|
||||
) -> CodegenResult<(VCode<inst::Inst>, regalloc2::Output)> {
|
||||
let emit_info = EmitInfo::new(self.flags.clone());
|
||||
let sigs = SigSet::new::<abi::AArch64MachineDeps>(func, &self.flags)?;
|
||||
let abi = abi::AArch64Callee::new(func, self, &self.isa_flags, &sigs)?;
|
||||
compile::compile::<AArch64Backend>(func, self, abi, emit_info, sigs)
|
||||
compile::compile::<AArch64Backend>(func, domtree, self, abi, emit_info, sigs)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -68,9 +70,10 @@ impl TargetIsa for AArch64Backend {
|
||||
fn compile_function(
|
||||
&self,
|
||||
func: &Function,
|
||||
domtree: &DominatorTree,
|
||||
want_disasm: bool,
|
||||
) -> CodegenResult<CompiledCodeStencil> {
|
||||
let (vcode, regalloc_result) = self.compile_vcode(func)?;
|
||||
let (vcode, regalloc_result) = self.compile_vcode(func, domtree)?;
|
||||
|
||||
let emit_result = vcode.emit(
|
||||
®alloc_result,
|
||||
@@ -241,6 +244,8 @@ pub fn isa_builder(triple: Triple) -> IsaBuilder {
|
||||
mod test {
|
||||
use super::*;
|
||||
use crate::cursor::{Cursor, FuncCursor};
|
||||
use crate::dominator_tree::DominatorTree;
|
||||
use crate::flowgraph::ControlFlowGraph;
|
||||
use crate::ir::types::*;
|
||||
use crate::ir::{AbiParam, Function, InstBuilder, JumpTableData, Signature, UserFuncName};
|
||||
use crate::isa::CallConv;
|
||||
@@ -275,7 +280,12 @@ mod test {
|
||||
shared_flags,
|
||||
isa_flags,
|
||||
);
|
||||
let buffer = backend.compile_function(&mut func, false).unwrap().buffer;
|
||||
let cfg = ControlFlowGraph::with_function(&func);
|
||||
let domtree = DominatorTree::with_function(&func, &cfg);
|
||||
let buffer = backend
|
||||
.compile_function(&mut func, &domtree, false)
|
||||
.unwrap()
|
||||
.buffer;
|
||||
let code = buffer.data();
|
||||
|
||||
// To update this comment, write the golden bytes to a file, and run the following command
|
||||
@@ -328,8 +338,10 @@ mod test {
|
||||
shared_flags,
|
||||
isa_flags,
|
||||
);
|
||||
let cfg = ControlFlowGraph::with_function(&func);
|
||||
let domtree = DominatorTree::with_function(&func, &cfg);
|
||||
let result = backend
|
||||
.compile_function(&mut func, /* want_disasm = */ false)
|
||||
.compile_function(&mut func, &domtree, /* want_disasm = */ false)
|
||||
.unwrap();
|
||||
let code = result.buffer.data();
|
||||
|
||||
@@ -340,21 +352,22 @@ mod test {
|
||||
// 0: 52824689 mov w9, #0x1234 // #4660
|
||||
// 4: 0b09000b add w11, w0, w9
|
||||
// 8: 2a0b03ea mov w10, w11
|
||||
// c: b50000aa cbnz x10, 0x20
|
||||
// 10: 5282468c mov w12, #0x1234 // #4660
|
||||
// 14: 0b0c016e add w14, w11, w12
|
||||
// 18: 2a0e03ed mov w13, w14
|
||||
// 1c: b5ffffad cbnz x13, 0x10
|
||||
// 20: 2a0b03e0 mov w0, w11
|
||||
// 24: b5ffff60 cbnz x0, 0x10
|
||||
// 28: 52824681 mov w1, #0x1234 // #4660
|
||||
// 2c: 4b010160 sub w0, w11, w1
|
||||
// 30: d65f03c0 ret
|
||||
// c: b40000ca cbz x10, 0x24
|
||||
// 10: 2a0b03ed mov w13, w11
|
||||
// 14: b500008d cbnz x13, 0x24
|
||||
// 18: 5282468e mov w14, #0x1234 // #4660
|
||||
// 1c: 4b0e0160 sub w0, w11, w14
|
||||
// 20: d65f03c0 ret
|
||||
// 24: 5282468f mov w15, #0x1234 // #4660
|
||||
// 28: 0b0f0161 add w1, w11, w15
|
||||
// 2c: 2a0103e0 mov w0, w1
|
||||
// 30: b5ffffa0 cbnz x0, 0x24
|
||||
// 34: 17fffff7 b 0x10
|
||||
|
||||
let golden = vec![
|
||||
137, 70, 130, 82, 11, 0, 9, 11, 234, 3, 11, 42, 170, 0, 0, 181, 140, 70, 130, 82, 110,
|
||||
1, 12, 11, 237, 3, 14, 42, 173, 255, 255, 181, 224, 3, 11, 42, 96, 255, 255, 181, 129,
|
||||
70, 130, 82, 96, 1, 1, 75, 192, 3, 95, 214,
|
||||
137, 70, 130, 82, 11, 0, 9, 11, 234, 3, 11, 42, 202, 0, 0, 180, 237, 3, 11, 42, 141, 0,
|
||||
0, 181, 142, 70, 130, 82, 96, 1, 14, 75, 192, 3, 95, 214, 143, 70, 130, 82, 97, 1, 15,
|
||||
11, 224, 3, 1, 42, 160, 255, 255, 181, 247, 255, 255, 23,
|
||||
];
|
||||
|
||||
assert_eq!(code, &golden[..]);
|
||||
@@ -409,8 +422,10 @@ mod test {
|
||||
shared_flags,
|
||||
isa_flags,
|
||||
);
|
||||
let cfg = ControlFlowGraph::with_function(&func);
|
||||
let domtree = DominatorTree::with_function(&func, &cfg);
|
||||
let result = backend
|
||||
.compile_function(&mut func, /* want_disasm = */ false)
|
||||
.compile_function(&mut func, &domtree, /* want_disasm = */ false)
|
||||
.unwrap();
|
||||
let code = result.buffer.data();
|
||||
|
||||
@@ -419,7 +434,7 @@ mod test {
|
||||
// > aarch64-linux-gnu-objdump -b binary -D <file> -m aarch64
|
||||
//
|
||||
// 0: 7100081f cmp w0, #0x2
|
||||
// 4: 54000122 b.cs 0x28 // b.hs, b.nlast
|
||||
// 4: 540001a2 b.cs 0x38 // b.hs, b.nlast
|
||||
// 8: 9a8023e8 csel x8, xzr, x0, cs // cs = hs, nlast
|
||||
// c: d503229f csdb
|
||||
// 10: 10000087 adr x7, 0x20
|
||||
@@ -427,18 +442,18 @@ mod test {
|
||||
// 18: 8b0800e7 add x7, x7, x8
|
||||
// 1c: d61f00e0 br x7
|
||||
// 20: 00000010 udf #16
|
||||
// 24: 00000018 udf #24
|
||||
// 28: 52800060 mov w0, #0x3 // #3
|
||||
// 24: 00000008 udf #8
|
||||
// 28: 52800040 mov w0, #0x2 // #2
|
||||
// 2c: d65f03c0 ret
|
||||
// 30: 52800020 mov w0, #0x1 // #1
|
||||
// 34: d65f03c0 ret
|
||||
// 38: 52800040 mov w0, #0x2 // #2
|
||||
// 38: 52800060 mov w0, #0x3 // #3
|
||||
// 3c: d65f03c0 ret
|
||||
|
||||
let golden = vec![
|
||||
31, 8, 0, 113, 34, 1, 0, 84, 232, 35, 128, 154, 159, 34, 3, 213, 135, 0, 0, 16, 232,
|
||||
88, 168, 184, 231, 0, 8, 139, 224, 0, 31, 214, 16, 0, 0, 0, 24, 0, 0, 0, 96, 0, 128,
|
||||
82, 192, 3, 95, 214, 32, 0, 128, 82, 192, 3, 95, 214, 64, 0, 128, 82, 192, 3, 95, 214,
|
||||
31, 8, 0, 113, 162, 1, 0, 84, 232, 35, 128, 154, 159, 34, 3, 213, 135, 0, 0, 16, 232,
|
||||
88, 168, 184, 231, 0, 8, 139, 224, 0, 31, 214, 16, 0, 0, 0, 8, 0, 0, 0, 64, 0, 128, 82,
|
||||
192, 3, 95, 214, 32, 0, 128, 82, 192, 3, 95, 214, 96, 0, 128, 82, 192, 3, 95, 214,
|
||||
];
|
||||
|
||||
assert_eq!(code, &golden[..]);
|
||||
|
||||
@@ -43,6 +43,7 @@
|
||||
//! The configured target ISA trait object is a `Box<TargetIsa>` which can be used for multiple
|
||||
//! concurrent function compilations.
|
||||
|
||||
use crate::dominator_tree::DominatorTree;
|
||||
pub use crate::isa::call_conv::CallConv;
|
||||
|
||||
use crate::flowgraph;
|
||||
@@ -252,6 +253,7 @@ pub trait TargetIsa: fmt::Display + Send + Sync {
|
||||
fn compile_function(
|
||||
&self,
|
||||
func: &Function,
|
||||
domtree: &DominatorTree,
|
||||
want_disasm: bool,
|
||||
) -> CodegenResult<CompiledCodeStencil>;
|
||||
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
//! risc-v 64-bit Instruction Set Architecture.
|
||||
|
||||
use crate::dominator_tree::DominatorTree;
|
||||
use crate::ir;
|
||||
use crate::ir::condcodes::IntCC;
|
||||
use crate::ir::Function;
|
||||
@@ -56,11 +57,12 @@ impl Riscv64Backend {
|
||||
fn compile_vcode(
|
||||
&self,
|
||||
func: &Function,
|
||||
domtree: &DominatorTree,
|
||||
) -> CodegenResult<(VCode<inst::Inst>, regalloc2::Output)> {
|
||||
let emit_info = EmitInfo::new(self.flags.clone(), self.isa_flags.clone());
|
||||
let sigs = SigSet::new::<abi::Riscv64MachineDeps>(func, &self.flags)?;
|
||||
let abi = abi::Riscv64Callee::new(func, self, &self.isa_flags, &sigs)?;
|
||||
compile::compile::<Riscv64Backend>(func, self, abi, emit_info, sigs)
|
||||
compile::compile::<Riscv64Backend>(func, domtree, self, abi, emit_info, sigs)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -68,9 +70,10 @@ impl TargetIsa for Riscv64Backend {
|
||||
fn compile_function(
|
||||
&self,
|
||||
func: &Function,
|
||||
domtree: &DominatorTree,
|
||||
want_disasm: bool,
|
||||
) -> CodegenResult<CompiledCodeStencil> {
|
||||
let (vcode, regalloc_result) = self.compile_vcode(func)?;
|
||||
let (vcode, regalloc_result) = self.compile_vcode(func, domtree)?;
|
||||
|
||||
let want_disasm = want_disasm || log::log_enabled!(log::Level::Debug);
|
||||
let emit_result = vcode.emit(
|
||||
@@ -216,6 +219,8 @@ pub fn isa_builder(triple: Triple) -> IsaBuilder {
|
||||
mod test {
|
||||
use super::*;
|
||||
use crate::cursor::{Cursor, FuncCursor};
|
||||
use crate::dominator_tree::DominatorTree;
|
||||
use crate::flowgraph::ControlFlowGraph;
|
||||
use crate::ir::types::*;
|
||||
use crate::ir::{AbiParam, Function, InstBuilder, Signature, UserFuncName};
|
||||
use crate::isa::CallConv;
|
||||
@@ -250,7 +255,9 @@ mod test {
|
||||
shared_flags,
|
||||
isa_flags,
|
||||
);
|
||||
let buffer = backend.compile_function(&mut func, true).unwrap();
|
||||
let cfg = ControlFlowGraph::with_function(&func);
|
||||
let domtree = DominatorTree::with_function(&func, &cfg);
|
||||
let buffer = backend.compile_function(&mut func, &domtree, true).unwrap();
|
||||
let code = buffer.buffer.data();
|
||||
|
||||
// To update this comment, write the golden bytes to a file, and run the following command
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
//! IBM Z 64-bit Instruction Set Architecture.
|
||||
|
||||
use crate::dominator_tree::DominatorTree;
|
||||
use crate::ir::condcodes::IntCC;
|
||||
use crate::ir::{Function, Type};
|
||||
use crate::isa::s390x::settings as s390x_settings;
|
||||
@@ -56,11 +57,12 @@ impl S390xBackend {
|
||||
fn compile_vcode(
|
||||
&self,
|
||||
func: &Function,
|
||||
domtree: &DominatorTree,
|
||||
) -> CodegenResult<(VCode<inst::Inst>, regalloc2::Output)> {
|
||||
let emit_info = EmitInfo::new(self.isa_flags.clone());
|
||||
let sigs = SigSet::new::<abi::S390xMachineDeps>(func, &self.flags)?;
|
||||
let abi = abi::S390xCallee::new(func, self, &self.isa_flags, &sigs)?;
|
||||
compile::compile::<S390xBackend>(func, self, abi, emit_info, sigs)
|
||||
compile::compile::<S390xBackend>(func, domtree, self, abi, emit_info, sigs)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -68,10 +70,11 @@ impl TargetIsa for S390xBackend {
|
||||
fn compile_function(
|
||||
&self,
|
||||
func: &Function,
|
||||
domtree: &DominatorTree,
|
||||
want_disasm: bool,
|
||||
) -> CodegenResult<CompiledCodeStencil> {
|
||||
let flags = self.flags();
|
||||
let (vcode, regalloc_result) = self.compile_vcode(func)?;
|
||||
let (vcode, regalloc_result) = self.compile_vcode(func, domtree)?;
|
||||
|
||||
let emit_result = vcode.emit(®alloc_result, want_disasm, flags.machine_code_cfg_info());
|
||||
let frame_size = emit_result.frame_size;
|
||||
@@ -213,6 +216,8 @@ pub fn isa_builder(triple: Triple) -> IsaBuilder {
|
||||
mod test {
|
||||
use super::*;
|
||||
use crate::cursor::{Cursor, FuncCursor};
|
||||
use crate::dominator_tree::DominatorTree;
|
||||
use crate::flowgraph::ControlFlowGraph;
|
||||
use crate::ir::types::*;
|
||||
use crate::ir::UserFuncName;
|
||||
use crate::ir::{AbiParam, Function, InstBuilder, Signature};
|
||||
@@ -248,8 +253,10 @@ mod test {
|
||||
shared_flags,
|
||||
isa_flags,
|
||||
);
|
||||
let cfg = ControlFlowGraph::with_function(&func);
|
||||
let domtree = DominatorTree::with_function(&func, &cfg);
|
||||
let result = backend
|
||||
.compile_function(&mut func, /* want_disasm = */ false)
|
||||
.compile_function(&mut func, &domtree, /* want_disasm = */ false)
|
||||
.unwrap();
|
||||
let code = result.buffer.data();
|
||||
|
||||
@@ -297,8 +304,10 @@ mod test {
|
||||
shared_flags,
|
||||
isa_flags,
|
||||
);
|
||||
let cfg = ControlFlowGraph::with_function(&func);
|
||||
let domtree = DominatorTree::with_function(&func, &cfg);
|
||||
let result = backend
|
||||
.compile_function(&mut func, /* want_disasm = */ false)
|
||||
.compile_function(&mut func, &domtree, /* want_disasm = */ false)
|
||||
.unwrap();
|
||||
let code = result.buffer.data();
|
||||
|
||||
@@ -310,19 +319,20 @@ mod test {
|
||||
//
|
||||
// 0: a7 2a 12 34 ahi %r2,4660
|
||||
// 4: a7 2e 00 00 chi %r2,0
|
||||
// 8: c0 64 00 00 00 0b jglh 0x1e
|
||||
// e: ec 32 12 34 00 d8 ahik %r3,%r2,4660
|
||||
// 14: a7 3e 00 00 chi %r3,0
|
||||
// 18: c0 64 ff ff ff fb jglh 0xe
|
||||
// 1e: a7 2e 00 00 chi %r2,0
|
||||
// 22: c0 64 ff ff ff f6 jglh 0xe
|
||||
// 28: a7 2a ed cc ahi %r2,-4660
|
||||
// 2c: 07 fe br %r14
|
||||
// 8: c0 94 00 00 00 0b jgnlh 0x1e
|
||||
// e: a7 2e 00 00 chi %r2,0
|
||||
// 12: c0 64 00 00 00 06 jglh 0x1e
|
||||
// 18: a7 2a ed cc ahi %r2,-4660
|
||||
// 1c: 07 fe br %r14
|
||||
// 1e: ec 32 12 34 00 d8 ahik %r3,%r2,4660
|
||||
// 24: a7 3e 00 00 chi %r3,0
|
||||
// 28: c0 64 ff ff ff fb jglh 0x1e
|
||||
// 2e: c0 f4 ff ff ff f0 jg 0xe
|
||||
|
||||
let golden = vec![
|
||||
167, 42, 18, 52, 167, 46, 0, 0, 192, 100, 0, 0, 0, 11, 236, 50, 18, 52, 0, 216, 167,
|
||||
62, 0, 0, 192, 100, 255, 255, 255, 251, 167, 46, 0, 0, 192, 100, 255, 255, 255, 246,
|
||||
167, 42, 237, 204, 7, 254,
|
||||
167, 42, 18, 52, 167, 46, 0, 0, 192, 148, 0, 0, 0, 11, 167, 46, 0, 0, 192, 100, 0, 0,
|
||||
0, 6, 167, 42, 237, 204, 7, 254, 236, 50, 18, 52, 0, 216, 167, 62, 0, 0, 192, 100, 255,
|
||||
255, 255, 251, 192, 244, 255, 255, 255, 240,
|
||||
];
|
||||
|
||||
assert_eq!(code, &golden[..]);
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
pub use self::inst::{args, EmitInfo, EmitState, Inst};
|
||||
|
||||
use super::{OwnedTargetIsa, TargetIsa};
|
||||
use crate::dominator_tree::DominatorTree;
|
||||
use crate::ir::{condcodes::IntCC, Function, Type};
|
||||
#[cfg(feature = "unwind")]
|
||||
use crate::isa::unwind::systemv;
|
||||
@@ -48,13 +49,14 @@ impl X64Backend {
|
||||
fn compile_vcode(
|
||||
&self,
|
||||
func: &Function,
|
||||
domtree: &DominatorTree,
|
||||
) -> CodegenResult<(VCode<inst::Inst>, regalloc2::Output)> {
|
||||
// This performs lowering to VCode, register-allocates the code, computes
|
||||
// block layout and finalizes branches. The result is ready for binary emission.
|
||||
let emit_info = EmitInfo::new(self.flags.clone(), self.x64_flags.clone());
|
||||
let sigs = SigSet::new::<abi::X64ABIMachineSpec>(func, &self.flags)?;
|
||||
let abi = abi::X64Callee::new(&func, self, &self.x64_flags, &sigs)?;
|
||||
compile::compile::<Self>(&func, self, abi, emit_info, sigs)
|
||||
let abi = abi::X64Callee::new(func, self, &self.x64_flags, &sigs)?;
|
||||
compile::compile::<Self>(func, domtree, self, abi, emit_info, sigs)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -62,9 +64,10 @@ impl TargetIsa for X64Backend {
|
||||
fn compile_function(
|
||||
&self,
|
||||
func: &Function,
|
||||
domtree: &DominatorTree,
|
||||
want_disasm: bool,
|
||||
) -> CodegenResult<CompiledCodeStencil> {
|
||||
let (vcode, regalloc_result) = self.compile_vcode(func)?;
|
||||
let (vcode, regalloc_result) = self.compile_vcode(func, domtree)?;
|
||||
|
||||
let emit_result = vcode.emit(
|
||||
®alloc_result,
|
||||
@@ -231,6 +234,8 @@ fn isa_constructor(
|
||||
mod test {
|
||||
use super::*;
|
||||
use crate::cursor::{Cursor, FuncCursor};
|
||||
use crate::dominator_tree::DominatorTree;
|
||||
use crate::flowgraph::ControlFlowGraph;
|
||||
use crate::ir::{types::*, RelSourceLoc, SourceLoc, UserFuncName, ValueLabel, ValueLabelStart};
|
||||
use crate::ir::{AbiParam, Function, InstBuilder, JumpTableData, Signature};
|
||||
use crate::isa::CallConv;
|
||||
@@ -341,8 +346,10 @@ mod test {
|
||||
shared_flags,
|
||||
isa_flags,
|
||||
);
|
||||
let cfg = ControlFlowGraph::with_function(&func);
|
||||
let domtree = DominatorTree::with_function(&func, &cfg);
|
||||
let result = backend
|
||||
.compile_function(&mut func, /* want_disasm = */ false)
|
||||
.compile_function(&mut func, &domtree, /* want_disasm = */ false)
|
||||
.unwrap();
|
||||
let code = result.buffer.data();
|
||||
|
||||
@@ -355,27 +362,28 @@ mod test {
|
||||
// 4: 48 89 fe mov rsi,rdi
|
||||
// 7: 81 c6 34 12 00 00 add esi,0x1234
|
||||
// d: 85 f6 test esi,esi
|
||||
// f: 0f 84 1c 00 00 00 je 0x31
|
||||
// f: 0f 84 21 00 00 00 je 0x36
|
||||
// 15: 49 89 f0 mov r8,rsi
|
||||
// 18: 48 89 f0 mov rax,rsi
|
||||
// 1b: 81 e8 34 12 00 00 sub eax,0x1234
|
||||
// 21: 44 01 c0 add eax,r8d
|
||||
// 24: 85 f6 test esi,esi
|
||||
// 26: 0f 85 05 00 00 00 jne 0x31
|
||||
// 26: 0f 85 0a 00 00 00 jne 0x36
|
||||
// 2c: 48 89 ec mov rsp,rbp
|
||||
// 2f: 5d pop rbp
|
||||
// 30: c3 ret
|
||||
// 31: 49 89 f0 mov r8,rsi
|
||||
// 34: 41 81 c0 34 12 00 00 add r8d,0x1234
|
||||
// 3b: 45 85 c0 test r8d,r8d
|
||||
// 3e: 0f 85 ed ff ff ff jne 0x31
|
||||
// 44: e9 cf ff ff ff jmp 0x18
|
||||
// 31: e9 e2 ff ff ff jmp 0x18
|
||||
// 36: 49 89 f0 mov r8,rsi
|
||||
// 39: 41 81 c0 34 12 00 00 add r8d,0x1234
|
||||
// 40: 45 85 c0 test r8d,r8d
|
||||
// 43: 0f 84 cf ff ff ff je 0x18
|
||||
// 49: e9 e8 ff ff ff jmp 0x36
|
||||
|
||||
let golden = vec![
|
||||
85, 72, 137, 229, 72, 137, 254, 129, 198, 52, 18, 0, 0, 133, 246, 15, 132, 28, 0, 0, 0,
|
||||
73, 137, 240, 72, 137, 240, 129, 232, 52, 18, 0, 0, 68, 1, 192, 133, 246, 15, 133, 5,
|
||||
0, 0, 0, 72, 137, 236, 93, 195, 73, 137, 240, 65, 129, 192, 52, 18, 0, 0, 69, 133, 192,
|
||||
15, 133, 237, 255, 255, 255, 233, 207, 255, 255, 255,
|
||||
85, 72, 137, 229, 72, 137, 254, 129, 198, 52, 18, 0, 0, 133, 246, 15, 132, 33, 0, 0, 0,
|
||||
73, 137, 240, 72, 137, 240, 129, 232, 52, 18, 0, 0, 68, 1, 192, 133, 246, 15, 133, 10,
|
||||
0, 0, 0, 72, 137, 236, 93, 195, 233, 226, 255, 255, 255, 73, 137, 240, 65, 129, 192,
|
||||
52, 18, 0, 0, 69, 133, 192, 15, 132, 207, 255, 255, 255, 233, 232, 255, 255, 255,
|
||||
];
|
||||
|
||||
assert_eq!(code, &golden[..]);
|
||||
@@ -450,8 +458,10 @@ mod test {
|
||||
shared_flags,
|
||||
isa_flags,
|
||||
);
|
||||
let cfg = ControlFlowGraph::with_function(&func);
|
||||
let domtree = DominatorTree::with_function(&func, &cfg);
|
||||
let result = backend
|
||||
.compile_function(&mut func, /* want_disasm = */ false)
|
||||
.compile_function(&mut func, &domtree, /* want_disasm = */ false)
|
||||
.unwrap();
|
||||
let code = result.buffer.data();
|
||||
|
||||
@@ -462,7 +472,7 @@ mod test {
|
||||
// 0: 55 push rbp
|
||||
// 1: 48 89 e5 mov rbp,rsp
|
||||
// 4: 83 ff 02 cmp edi,0x2
|
||||
// 7: 0f 83 27 00 00 00 jae 0x34
|
||||
// 7: 0f 83 3b 00 00 00 jae 0x48
|
||||
// d: 44 8b d7 mov r10d,edi
|
||||
// 10: 41 b9 00 00 00 00 mov r9d,0x0
|
||||
// 16: 4d 0f 43 d1 cmovae r10,r9
|
||||
@@ -472,9 +482,9 @@ mod test {
|
||||
// 29: 41 ff e1 jmp r9
|
||||
// 2c: 12 00 adc al,BYTE PTR [rax]
|
||||
// 2e: 00 00 add BYTE PTR [rax],al
|
||||
// 30: 1c 00 sbb al,0x0
|
||||
// 30: 08 00 or BYTE PTR [rax],al
|
||||
// 32: 00 00 add BYTE PTR [rax],al
|
||||
// 34: b8 03 00 00 00 mov eax,0x3
|
||||
// 34: b8 02 00 00 00 mov eax,0x2
|
||||
// 39: 48 89 ec mov rsp,rbp
|
||||
// 3c: 5d pop rbp
|
||||
// 3d: c3 ret
|
||||
@@ -482,16 +492,16 @@ mod test {
|
||||
// 43: 48 89 ec mov rsp,rbp
|
||||
// 46: 5d pop rbp
|
||||
// 47: c3 ret
|
||||
// 48: b8 02 00 00 00 mov eax,0x2
|
||||
// 48: b8 03 00 00 00 mov eax,0x3
|
||||
// 4d: 48 89 ec mov rsp,rbp
|
||||
// 50: 5d pop rbp
|
||||
// 51: c3 ret
|
||||
|
||||
let golden = vec![
|
||||
85, 72, 137, 229, 131, 255, 2, 15, 131, 39, 0, 0, 0, 68, 139, 215, 65, 185, 0, 0, 0, 0,
|
||||
85, 72, 137, 229, 131, 255, 2, 15, 131, 59, 0, 0, 0, 68, 139, 215, 65, 185, 0, 0, 0, 0,
|
||||
77, 15, 67, 209, 76, 141, 13, 11, 0, 0, 0, 79, 99, 84, 145, 0, 77, 1, 209, 65, 255,
|
||||
225, 18, 0, 0, 0, 28, 0, 0, 0, 184, 3, 0, 0, 0, 72, 137, 236, 93, 195, 184, 1, 0, 0, 0,
|
||||
72, 137, 236, 93, 195, 184, 2, 0, 0, 0, 72, 137, 236, 93, 195,
|
||||
225, 18, 0, 0, 0, 8, 0, 0, 0, 184, 2, 0, 0, 0, 72, 137, 236, 93, 195, 184, 1, 0, 0, 0,
|
||||
72, 137, 236, 93, 195, 184, 3, 0, 0, 0, 72, 137, 236, 93, 195,
|
||||
];
|
||||
|
||||
assert_eq!(code, &golden[..]);
|
||||
|
||||
@@ -417,25 +417,23 @@ mod tests {
|
||||
}
|
||||
|
||||
let mut loop_analysis = LoopAnalysis::new();
|
||||
let mut cfg = ControlFlowGraph::new();
|
||||
let mut domtree = DominatorTree::new();
|
||||
cfg.compute(&func);
|
||||
domtree.compute(&func, &cfg);
|
||||
let cfg = ControlFlowGraph::with_function(&func);
|
||||
let domtree = DominatorTree::with_function(&func, &cfg);
|
||||
loop_analysis.compute(&func, &cfg, &domtree);
|
||||
|
||||
let loops = loop_analysis.loops().collect::<Vec<Loop>>();
|
||||
assert_eq!(loops.len(), 3);
|
||||
assert_eq!(loop_analysis.loop_header(loops[0]), block0);
|
||||
assert_eq!(loop_analysis.loop_header(loops[1]), block1);
|
||||
assert_eq!(loop_analysis.loop_header(loops[2]), block3);
|
||||
assert_eq!(loop_analysis.loop_header(loops[1]), block3);
|
||||
assert_eq!(loop_analysis.loop_header(loops[2]), block1);
|
||||
assert_eq!(loop_analysis.loop_parent(loops[1]), Some(loops[0]));
|
||||
assert_eq!(loop_analysis.loop_parent(loops[2]), Some(loops[0]));
|
||||
assert_eq!(loop_analysis.loop_parent(loops[0]), None);
|
||||
assert_eq!(loop_analysis.is_in_loop(block0, loops[0]), true);
|
||||
assert_eq!(loop_analysis.is_in_loop(block1, loops[1]), true);
|
||||
assert_eq!(loop_analysis.is_in_loop(block2, loops[1]), true);
|
||||
assert_eq!(loop_analysis.is_in_loop(block3, loops[2]), true);
|
||||
assert_eq!(loop_analysis.is_in_loop(block4, loops[2]), true);
|
||||
assert_eq!(loop_analysis.is_in_loop(block1, loops[2]), true);
|
||||
assert_eq!(loop_analysis.is_in_loop(block2, loops[2]), true);
|
||||
assert_eq!(loop_analysis.is_in_loop(block3, loops[1]), true);
|
||||
assert_eq!(loop_analysis.is_in_loop(block4, loops[1]), true);
|
||||
assert_eq!(loop_analysis.is_in_loop(block5, loops[0]), true);
|
||||
assert_eq!(loop_analysis.loop_level(block0).level(), 1);
|
||||
assert_eq!(loop_analysis.loop_level(block1).level(), 2);
|
||||
|
||||
@@ -34,27 +34,18 @@
|
||||
//! +--------------+
|
||||
//! / \
|
||||
//! +--------------+ +--------------+
|
||||
//! | (edge 0->1) | |(edge 0->2) |
|
||||
//! | (edge 0->1) | | (edge 0->2) |
|
||||
//! | CLIF block 1 | | CLIF block 2 |
|
||||
//! | (edge 1->3) | | (edge 2->3) |
|
||||
//! +--------------+ +--------------+
|
||||
//! \ /
|
||||
//! +-----------+ +-----------+
|
||||
//! |(edge 1->3)| |(edge 2->3)|
|
||||
//! +-----------+ +-----------+
|
||||
//! \ /
|
||||
//! \ /
|
||||
//! \ /
|
||||
//! +------------+
|
||||
//! |CLIF block 3|
|
||||
//! +------------+
|
||||
//! ```
|
||||
//!
|
||||
//! (note that the edges into CLIF blocks 1 and 2 could be merged with those
|
||||
//! blocks' original bodies, but the out-edges could not because for simplicity
|
||||
//! in the successor-function definition, we only ever merge an edge onto one
|
||||
//! side of an original CLIF block.)
|
||||
//!
|
||||
//! Each `LoweredBlock` names just an original CLIF block, an original CLIF
|
||||
//! block prepended or appended with an edge block (never both, though), or just
|
||||
//! an edge block.
|
||||
//! Each `LoweredBlock` names just an original CLIF block, or just an edge block.
|
||||
//!
|
||||
//! To compute this lowering, we do a DFS over the CLIF-plus-edge-block graph
|
||||
//! (never actually materialized, just defined by a "successors" function), and
|
||||
@@ -69,6 +60,7 @@
|
||||
//! branch editing that in practice elides empty blocks and simplifies some of
|
||||
//! the other redundancies that this scheme produces.
|
||||
|
||||
use crate::dominator_tree::DominatorTree;
|
||||
use crate::entity::SecondaryMap;
|
||||
use crate::fx::{FxHashMap, FxHashSet};
|
||||
use crate::inst_predicates::visit_block_succs;
|
||||
@@ -84,21 +76,11 @@ pub struct BlockLoweringOrder {
|
||||
/// (i) a CLIF block, and (ii) inserted crit-edge blocks before or after;
|
||||
/// see [LoweredBlock] for details.
|
||||
lowered_order: Vec<LoweredBlock>,
|
||||
/// Successors for all lowered blocks, in one serialized vector. Indexed by
|
||||
/// the ranges in `lowered_succ_ranges`.
|
||||
#[allow(dead_code)]
|
||||
lowered_succs: Vec<(Inst, LoweredBlock)>,
|
||||
/// BlockIndex values for successors for all lowered blocks, in the same
|
||||
/// order as `lowered_succs`.
|
||||
lowered_succ_indices: Vec<(Inst, BlockIndex)>,
|
||||
/// Ranges in `lowered_succs` giving the successor lists for each lowered
|
||||
/// BlockIndex values for successors for all lowered blocks, indexing `lowered_order`.
|
||||
lowered_succ_indices: Vec<BlockIndex>,
|
||||
/// Ranges in `lowered_succ_indices` giving the successor lists for each lowered
|
||||
/// block. Indexed by lowering-order index (`BlockIndex`).
|
||||
lowered_succ_ranges: Vec<(usize, usize)>,
|
||||
/// Mapping from CLIF BB to BlockIndex (index in lowered order). Note that
|
||||
/// some CLIF BBs may not be lowered; in particular, we skip unreachable
|
||||
/// blocks.
|
||||
#[allow(dead_code)]
|
||||
orig_map: SecondaryMap<Block, Option<BlockIndex>>,
|
||||
lowered_succ_ranges: Vec<(Option<Inst>, std::ops::Range<usize>)>,
|
||||
/// Cold blocks. These blocks are not reordered in the
|
||||
/// `lowered_order` above; the lowered order must respect RPO
|
||||
/// (uses after defs) in order for lowering to be
|
||||
@@ -110,390 +92,198 @@ pub struct BlockLoweringOrder {
|
||||
indirect_branch_targets: FxHashSet<BlockIndex>,
|
||||
}
|
||||
|
||||
/// The origin of a block in the lowered block-order: either an original CLIF
|
||||
/// block, or an inserted edge-block, or a combination of the two if an edge is
|
||||
/// non-critical.
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
|
||||
pub enum LoweredBlock {
|
||||
/// Block in original CLIF, with no merged edge-blocks.
|
||||
/// Block in original CLIF.
|
||||
Orig {
|
||||
/// Original CLIF block.
|
||||
block: Block,
|
||||
},
|
||||
/// Block in the original CLIF, plus edge-block to one succ (which is the
|
||||
/// one successor of the original block).
|
||||
OrigAndEdge {
|
||||
/// The original CLIF block contained in this lowered block.
|
||||
block: Block,
|
||||
/// The edge (jump) instruction transitioning from this block
|
||||
/// to the next, i.e., corresponding to the included edge-block. This
|
||||
/// will be an instruction in `block`.
|
||||
edge_inst: Inst,
|
||||
/// The successor index in this edge, to distinguish multiple
|
||||
/// edges between the same block pair.
|
||||
succ_idx: usize,
|
||||
/// The successor CLIF block.
|
||||
succ: Block,
|
||||
},
|
||||
/// Block in the original CLIF, preceded by edge-block from one pred (which
|
||||
/// is the one pred of the original block).
|
||||
EdgeAndOrig {
|
||||
/// The previous CLIF block, i.e., the edge block's predecessor.
|
||||
|
||||
/// Critical edge between two CLIF blocks.
|
||||
CriticalEdge {
|
||||
/// The predecessor block.
|
||||
pred: Block,
|
||||
/// The edge (jump) instruction corresponding to the included
|
||||
/// edge-block. This will be an instruction in `pred`.
|
||||
edge_inst: Inst,
|
||||
/// The successor index in this edge, to distinguish multiple
|
||||
/// edges between the same block pair.
|
||||
succ_idx: usize,
|
||||
/// The original CLIF block included in this lowered block.
|
||||
block: Block,
|
||||
},
|
||||
/// Split critical edge between two CLIF blocks. This lowered block does not
|
||||
/// correspond to any original CLIF blocks; it only serves as an insertion
|
||||
/// point for work to happen on the transition from `pred` to `succ`.
|
||||
Edge {
|
||||
/// The predecessor CLIF block.
|
||||
pred: Block,
|
||||
/// The edge (jump) instruction corresponding to this edge's transition.
|
||||
/// This will be an instruction in `pred`.
|
||||
edge_inst: Inst,
|
||||
/// The successor index in this edge, to distinguish multiple
|
||||
/// edges between the same block pair.
|
||||
succ_idx: usize,
|
||||
/// The successor CLIF block.
|
||||
|
||||
/// The successor block.
|
||||
succ: Block,
|
||||
|
||||
/// The index of this branch in the successor edges from `pred`, following the same
|
||||
/// indexing order as `inst_predicates::visit_block_succs`. This is used to distinguish
|
||||
/// multiple edges between the same CLIF blocks.
|
||||
succ_idx: u32,
|
||||
},
|
||||
}
|
||||
|
||||
impl LoweredBlock {
|
||||
/// The associated original (CLIF) block included in this lowered block, if
|
||||
/// any.
|
||||
pub fn orig_block(self) -> Option<Block> {
|
||||
/// Unwrap an `Orig` block.
|
||||
pub fn orig_block(&self) -> Option<Block> {
|
||||
match self {
|
||||
LoweredBlock::Orig { block, .. }
|
||||
| LoweredBlock::OrigAndEdge { block, .. }
|
||||
| LoweredBlock::EdgeAndOrig { block, .. } => Some(block),
|
||||
LoweredBlock::Edge { .. } => None,
|
||||
&LoweredBlock::Orig { block } => Some(block),
|
||||
&LoweredBlock::CriticalEdge { .. } => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// The associated in-edge, if any.
|
||||
/// The associated in-edge predecessor, if this is a critical edge.
|
||||
#[cfg(test)]
|
||||
pub fn in_edge(self) -> Option<(Block, Inst, Block)> {
|
||||
pub fn in_edge(&self) -> Option<Block> {
|
||||
match self {
|
||||
LoweredBlock::EdgeAndOrig {
|
||||
pred,
|
||||
edge_inst,
|
||||
block,
|
||||
..
|
||||
} => Some((pred, edge_inst, block)),
|
||||
_ => None,
|
||||
&LoweredBlock::CriticalEdge { pred, .. } => Some(pred),
|
||||
&LoweredBlock::Orig { .. } => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// the associated out-edge, if any. Also includes edge-only blocks.
|
||||
/// The associated out-edge successor, if this is a critical edge.
|
||||
#[cfg(test)]
|
||||
pub fn out_edge(self) -> Option<(Block, Inst, Block)> {
|
||||
pub fn out_edge(&self) -> Option<Block> {
|
||||
match self {
|
||||
LoweredBlock::OrigAndEdge {
|
||||
block,
|
||||
edge_inst,
|
||||
succ,
|
||||
..
|
||||
} => Some((block, edge_inst, succ)),
|
||||
LoweredBlock::Edge {
|
||||
pred,
|
||||
edge_inst,
|
||||
succ,
|
||||
..
|
||||
} => Some((pred, edge_inst, succ)),
|
||||
_ => None,
|
||||
&LoweredBlock::CriticalEdge { succ, .. } => Some(succ),
|
||||
&LoweredBlock::Orig { .. } => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl BlockLoweringOrder {
|
||||
/// Compute and return a lowered block order for `f`.
|
||||
pub fn new(f: &Function) -> BlockLoweringOrder {
|
||||
pub fn new(f: &Function, domtree: &DominatorTree) -> BlockLoweringOrder {
|
||||
trace!("BlockLoweringOrder: function body {:?}", f);
|
||||
|
||||
// Make sure that we have an entry block, and the entry block is
|
||||
// not marked as cold. (The verifier ensures this as well, but
|
||||
// the user may not have run the verifier, and this property is
|
||||
// critical to avoid a miscompile, so we assert it here too.)
|
||||
let entry = f.layout.entry_block().expect("Must have entry block");
|
||||
assert!(!f.layout.is_cold(entry));
|
||||
|
||||
// Step 1: compute the in-edge and out-edge count of every block.
|
||||
let mut block_in_count = SecondaryMap::with_default(0);
|
||||
let mut block_out_count = SecondaryMap::with_default(0);
|
||||
|
||||
// Cache the block successors to avoid re-examining branches below.
|
||||
let mut block_succs: SmallVec<[(Inst, usize, Block); 128]> = SmallVec::new();
|
||||
let mut block_succ_range = SecondaryMap::with_default((0, 0));
|
||||
// Block successors are stored as `LoweredBlocks` to simplify the construction of
|
||||
// `lowered_succs` in the final result. Initially, all entries are `Orig` values, and are
|
||||
// updated to be `CriticalEdge` when those cases are identified in step 2 below.
|
||||
let mut block_succs: SmallVec<[LoweredBlock; 128]> = SmallVec::new();
|
||||
let mut block_succ_range = SecondaryMap::with_default(0..0);
|
||||
|
||||
let mut indirect_branch_target_clif_blocks = FxHashSet::default();
|
||||
|
||||
for block in f.layout.blocks() {
|
||||
let block_succ_start = block_succs.len();
|
||||
let mut succ_idx = 0;
|
||||
visit_block_succs(f, block, |inst, succ, from_table| {
|
||||
let start = block_succs.len();
|
||||
visit_block_succs(f, block, |_, succ, from_table| {
|
||||
block_out_count[block] += 1;
|
||||
block_in_count[succ] += 1;
|
||||
block_succs.push((inst, succ_idx, succ));
|
||||
succ_idx += 1;
|
||||
block_succs.push(LoweredBlock::Orig { block: succ });
|
||||
|
||||
if from_table {
|
||||
indirect_branch_target_clif_blocks.insert(succ);
|
||||
}
|
||||
});
|
||||
let block_succ_end = block_succs.len();
|
||||
block_succ_range[block] = (block_succ_start, block_succ_end);
|
||||
|
||||
// Ensure that blocks terminated by br_table instructions with an empty jump table are
|
||||
// still treated like conditional blocks from the point of view of critical edge
|
||||
// splitting.
|
||||
if let Some(inst) = f.layout.last_inst(block) {
|
||||
if f.dfg.insts[inst].opcode() == Opcode::Return {
|
||||
// Implicit output edge for any return.
|
||||
block_out_count[block] += 1;
|
||||
if Opcode::BrTable == f.dfg.insts[inst].opcode() {
|
||||
block_out_count[block] = block_out_count[block].max(2);
|
||||
}
|
||||
}
|
||||
}
|
||||
// Implicit input edge for entry block.
|
||||
block_in_count[entry] += 1;
|
||||
|
||||
// All blocks ending in conditional branches or br_tables must
|
||||
// have edge-moves inserted at the top of successor blocks,
|
||||
// not at the end of themselves. This is because the moves
|
||||
// would have to be inserted prior to the branch's register
|
||||
// use; but RA2's model is that the moves happen *on* the
|
||||
// edge, after every def/use in the block. RA2 will check for
|
||||
// "branch register use safety" and panic if such a problem
|
||||
// occurs. To avoid this, we force the below algorithm to
|
||||
// never merge the edge block onto the end of a block that
|
||||
// ends in a conditional branch. We do this by "faking" more
|
||||
// than one successor, even if there is only one.
|
||||
//
|
||||
// (One might ask, isn't that always the case already? It
|
||||
// could not be, in cases of br_table with no table and just a
|
||||
// default label, for example.)
|
||||
for block in f.layout.blocks() {
|
||||
if let Some(inst) = f.layout.last_inst(block) {
|
||||
// If the block has a branch with any "fixed args"
|
||||
// (not blockparam args) ...
|
||||
if f.dfg.insts[inst].opcode().is_branch() && f.dfg.inst_fixed_args(inst).len() > 0 {
|
||||
// ... then force a minimum successor count of
|
||||
// two, so the below algorithm cannot put
|
||||
// edge-moves on the end of the block.
|
||||
block_out_count[block] = std::cmp::max(2, block_out_count[block]);
|
||||
}
|
||||
}
|
||||
let end = block_succs.len();
|
||||
block_succ_range[block] = start..end;
|
||||
}
|
||||
|
||||
// Here we define the implicit CLIF-plus-edges graph. There are
|
||||
// conceptually two such graphs: the original, with every edge explicit,
|
||||
// and the merged one, with blocks (represented by `LoweredBlock`
|
||||
// values) that contain original CLIF blocks, edges, or both. This
|
||||
// function returns a lowered block's successors as per the latter, with
|
||||
// consideration to edge-block merging.
|
||||
//
|
||||
// Note that there is a property of the block-merging rules below
|
||||
// that is very important to ensure we don't miss any lowered blocks:
|
||||
// any block in the implicit CLIF-plus-edges graph will *only* be
|
||||
// included in one block in the merged graph.
|
||||
//
|
||||
// This, combined with the property that every edge block is reachable
|
||||
// only from one predecessor (and hence cannot be reached by a DFS
|
||||
// backedge), means that it is sufficient in our DFS below to track
|
||||
// visited-bits per original CLIF block only, not per edge. This greatly
|
||||
// simplifies the data structures (no need to keep a sparse hash-set of
|
||||
// (block, block) tuples).
|
||||
let compute_lowered_succs = |ret: &mut Vec<(Inst, LoweredBlock)>, block: LoweredBlock| {
|
||||
let start_idx = ret.len();
|
||||
match block {
|
||||
LoweredBlock::Orig { block } | LoweredBlock::EdgeAndOrig { block, .. } => {
|
||||
// At an orig block; successors are always edge blocks,
|
||||
// possibly with orig blocks following.
|
||||
let range = block_succ_range[block];
|
||||
for &(edge_inst, succ_idx, succ) in &block_succs[range.0..range.1] {
|
||||
if block_in_count[succ] == 1 {
|
||||
ret.push((
|
||||
edge_inst,
|
||||
LoweredBlock::EdgeAndOrig {
|
||||
pred: block,
|
||||
edge_inst,
|
||||
succ_idx,
|
||||
block: succ,
|
||||
},
|
||||
));
|
||||
} else {
|
||||
ret.push((
|
||||
edge_inst,
|
||||
LoweredBlock::Edge {
|
||||
pred: block,
|
||||
edge_inst,
|
||||
succ_idx,
|
||||
succ,
|
||||
},
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
LoweredBlock::Edge {
|
||||
succ, edge_inst, ..
|
||||
}
|
||||
| LoweredBlock::OrigAndEdge {
|
||||
succ, edge_inst, ..
|
||||
} => {
|
||||
// At an edge block; successors are always orig blocks,
|
||||
// possibly with edge blocks following.
|
||||
if block_out_count[succ] == 1 {
|
||||
let range = block_succ_range[succ];
|
||||
// check if the one succ is a real CFG edge (vs.
|
||||
// implicit return succ).
|
||||
if range.1 - range.0 > 0 {
|
||||
debug_assert!(range.1 - range.0 == 1);
|
||||
let (succ_edge_inst, succ_succ_idx, succ_succ) = block_succs[range.0];
|
||||
ret.push((
|
||||
edge_inst,
|
||||
LoweredBlock::OrigAndEdge {
|
||||
block: succ,
|
||||
edge_inst: succ_edge_inst,
|
||||
succ_idx: succ_succ_idx,
|
||||
succ: succ_succ,
|
||||
},
|
||||
));
|
||||
} else {
|
||||
ret.push((edge_inst, LoweredBlock::Orig { block: succ }));
|
||||
}
|
||||
} else {
|
||||
ret.push((edge_inst, LoweredBlock::Orig { block: succ }));
|
||||
}
|
||||
}
|
||||
}
|
||||
let end_idx = ret.len();
|
||||
(start_idx, end_idx)
|
||||
};
|
||||
// Step 2: walk the postorder from the domtree in reverse to produce our desired node
|
||||
// lowering order, identifying critical edges to split along the way.
|
||||
|
||||
// Build the explicit LoweredBlock-to-LoweredBlock successors list.
|
||||
let mut lowered_succs = vec![];
|
||||
let mut lowered_succ_indices = vec![];
|
||||
|
||||
// Step 2: Compute RPO traversal of the implicit CLIF-plus-edge-block graph. Use an
|
||||
// explicit stack so we don't overflow the real stack with a deep DFS.
|
||||
#[derive(Debug)]
|
||||
struct StackEntry {
|
||||
this: LoweredBlock,
|
||||
succs: (usize, usize), // range in lowered_succs
|
||||
cur_succ: usize, // index in lowered_succs
|
||||
}
|
||||
|
||||
let mut stack: SmallVec<[StackEntry; 16]> = SmallVec::new();
|
||||
let mut visited = FxHashSet::default();
|
||||
let mut postorder = vec![];
|
||||
|
||||
// Add the entry block.
|
||||
//
|
||||
// FIXME(cfallin): we might be able to use OrigAndEdge. Find a
|
||||
// way to not special-case the entry block here.
|
||||
let block = LoweredBlock::Orig { block: entry };
|
||||
visited.insert(block);
|
||||
let range = compute_lowered_succs(&mut lowered_succs, block);
|
||||
lowered_succ_indices.resize(lowered_succs.len(), 0);
|
||||
stack.push(StackEntry {
|
||||
this: block,
|
||||
succs: range,
|
||||
cur_succ: range.1,
|
||||
});
|
||||
|
||||
while !stack.is_empty() {
|
||||
let stack_entry = stack.last_mut().unwrap();
|
||||
let range = stack_entry.succs;
|
||||
if stack_entry.cur_succ == range.0 {
|
||||
postorder.push((stack_entry.this, range));
|
||||
stack.pop();
|
||||
} else {
|
||||
// Heuristic: chase the children in reverse. This puts the first
|
||||
// successor block first in RPO, all other things being equal,
|
||||
// which tends to prioritize loop backedges over out-edges,
|
||||
// putting the edge-block closer to the loop body and minimizing
|
||||
// live-ranges in linear instruction space.
|
||||
let next = lowered_succs[stack_entry.cur_succ - 1].1;
|
||||
stack_entry.cur_succ -= 1;
|
||||
if visited.contains(&next) {
|
||||
continue;
|
||||
}
|
||||
visited.insert(next);
|
||||
let range = compute_lowered_succs(&mut lowered_succs, next);
|
||||
lowered_succ_indices.resize(lowered_succs.len(), 0);
|
||||
stack.push(StackEntry {
|
||||
this: next,
|
||||
succs: range,
|
||||
cur_succ: range.1,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
postorder.reverse();
|
||||
let rpo = postorder;
|
||||
|
||||
// Step 3: now that we have RPO, build the BlockIndex/BB fwd/rev maps.
|
||||
let mut lowered_order = vec![];
|
||||
let mut cold_blocks = FxHashSet::default();
|
||||
let mut lowered_succ_ranges = vec![];
|
||||
let mut lb_to_bindex = FxHashMap::default();
|
||||
let mut lowered_order = Vec::new();
|
||||
|
||||
for &block in domtree.cfg_postorder().iter().rev() {
|
||||
let lb = LoweredBlock::Orig { block };
|
||||
let bindex = BlockIndex::new(lowered_order.len());
|
||||
lb_to_bindex.insert(lb.clone(), bindex);
|
||||
lowered_order.push(lb);
|
||||
|
||||
if block_out_count[block] > 1 {
|
||||
let range = block_succ_range[block].clone();
|
||||
for (succ_ix, lb) in block_succs[range].iter_mut().enumerate() {
|
||||
let succ = lb.orig_block().unwrap();
|
||||
if block_in_count[succ] > 1 {
|
||||
// Mutate the successor to be a critical edge, as `block` has multiple
|
||||
// edges leaving it, and `succ` has multiple edges entering it.
|
||||
*lb = LoweredBlock::CriticalEdge {
|
||||
pred: block,
|
||||
succ,
|
||||
succ_idx: succ_ix as u32,
|
||||
};
|
||||
let bindex = BlockIndex::new(lowered_order.len());
|
||||
lb_to_bindex.insert(*lb, bindex);
|
||||
lowered_order.push(*lb);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Step 3: build the successor tables given the lowering order. We can't perform this step
|
||||
// during the creation of `lowering_order`, as we need `lb_to_bindex` to be fully populated
|
||||
// first.
|
||||
let mut lowered_succ_indices = Vec::new();
|
||||
let mut cold_blocks = FxHashSet::default();
|
||||
let mut indirect_branch_targets = FxHashSet::default();
|
||||
for (block, succ_range) in rpo.into_iter() {
|
||||
let index = BlockIndex::new(lowered_order.len());
|
||||
lb_to_bindex.insert(block, index);
|
||||
lowered_order.push(block);
|
||||
lowered_succ_ranges.push(succ_range);
|
||||
let lowered_succ_ranges =
|
||||
Vec::from_iter(lowered_order.iter().enumerate().map(|(ix, lb)| {
|
||||
let bindex = BlockIndex::new(ix);
|
||||
let start = lowered_succ_indices.len();
|
||||
let opt_inst = match lb {
|
||||
// Block successors are pulled directly over, as they'll have been mutated when
|
||||
// determining the block order already.
|
||||
&LoweredBlock::Orig { block } => {
|
||||
let range = block_succ_range[block].clone();
|
||||
lowered_succ_indices
|
||||
.extend(block_succs[range].iter().map(|lb| lb_to_bindex[lb]));
|
||||
|
||||
match block {
|
||||
LoweredBlock::Orig { block }
|
||||
| LoweredBlock::OrigAndEdge { block, .. }
|
||||
| LoweredBlock::EdgeAndOrig { block, .. } => {
|
||||
if f.layout.is_cold(block) {
|
||||
cold_blocks.insert(index);
|
||||
if f.layout.is_cold(block) {
|
||||
cold_blocks.insert(bindex);
|
||||
}
|
||||
|
||||
if indirect_branch_target_clif_blocks.contains(&block) {
|
||||
indirect_branch_targets.insert(bindex);
|
||||
}
|
||||
|
||||
let last = f.layout.last_inst(block).unwrap();
|
||||
let opcode = f.dfg.insts[last].opcode();
|
||||
|
||||
assert!(opcode.is_terminator());
|
||||
|
||||
opcode.is_branch().then_some(last)
|
||||
}
|
||||
|
||||
if indirect_branch_target_clif_blocks.contains(&block) {
|
||||
indirect_branch_targets.insert(index);
|
||||
}
|
||||
}
|
||||
LoweredBlock::Edge { pred, succ, .. } => {
|
||||
if f.layout.is_cold(pred) || f.layout.is_cold(succ) {
|
||||
cold_blocks.insert(index);
|
||||
}
|
||||
// Critical edges won't have successor information in block_succ_range, but
|
||||
// they only have a single known successor to record anyway.
|
||||
&LoweredBlock::CriticalEdge { succ, .. } => {
|
||||
let succ_index = lb_to_bindex[&LoweredBlock::Orig { block: succ }];
|
||||
lowered_succ_indices.push(succ_index);
|
||||
|
||||
if indirect_branch_target_clif_blocks.contains(&succ) {
|
||||
indirect_branch_targets.insert(index);
|
||||
// Edges inherit indirect branch and cold block metadata from their
|
||||
// successor.
|
||||
|
||||
if f.layout.is_cold(succ) {
|
||||
cold_blocks.insert(bindex);
|
||||
}
|
||||
|
||||
if indirect_branch_target_clif_blocks.contains(&succ) {
|
||||
indirect_branch_targets.insert(bindex);
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let lowered_succ_indices = lowered_succs
|
||||
.iter()
|
||||
.map(|&(inst, succ)| (inst, lb_to_bindex.get(&succ).cloned().unwrap()))
|
||||
.collect();
|
||||
|
||||
let mut orig_map = SecondaryMap::with_default(None);
|
||||
for (i, lb) in lowered_order.iter().enumerate() {
|
||||
let i = BlockIndex::new(i);
|
||||
if let Some(b) = lb.orig_block() {
|
||||
orig_map[b] = Some(i);
|
||||
}
|
||||
}
|
||||
};
|
||||
let end = lowered_succ_indices.len();
|
||||
(opt_inst, start..end)
|
||||
}));
|
||||
|
||||
let result = BlockLoweringOrder {
|
||||
lowered_order,
|
||||
lowered_succs,
|
||||
lowered_succ_indices,
|
||||
lowered_succ_ranges,
|
||||
orig_map,
|
||||
cold_blocks,
|
||||
indirect_branch_targets,
|
||||
};
|
||||
trace!("BlockLoweringOrder: {:?}", result);
|
||||
|
||||
trace!("BlockLoweringOrder: {:#?}", result);
|
||||
result
|
||||
}
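For reference, the criterion driving the CriticalEdge entries created in the new BlockLoweringOrder::new above, restated as a tiny standalone helper (not Cranelift code):

// An edge pred -> succ must become its own lowered block exactly when it is
// a critical edge: the predecessor has more than one way out and the
// successor has more than one way in, so edge code (e.g. regalloc moves)
// cannot be folded into either side without disturbing other paths.
fn needs_split(pred_out_count: usize, succ_in_count: usize) -> bool {
    pred_out_count > 1 && succ_in_count > 1
}

This is also why, per one of the commit bullets, a block ending in a br_table with an empty jump table has its out-edge count forced up to at least 2: it must still be treated as a multi-exit block for splitting purposes.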
@@ -503,9 +293,9 @@ impl BlockLoweringOrder {
}

/// Get the successor indices for a lowered block.
pub fn succ_indices(&self, block: BlockIndex) -> &[(Inst, BlockIndex)] {
let range = self.lowered_succ_ranges[block.index()];
&self.lowered_succ_indices[range.0..range.1]
pub fn succ_indices(&self, block: BlockIndex) -> (Option<Inst>, &[BlockIndex]) {
let (opt_inst, range) = &self.lowered_succ_ranges[block.index()];
(opt_inst.clone(), &self.lowered_succ_indices[range.clone()])
}

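A hedged usage sketch of the new return shape (mirroring the machinst/lower.rs hunks near the end of this diff; collect_branch_targets is a hypothetical helper, not part of the change):

fn collect_branch_targets(
    order: &BlockLoweringOrder,
    block: BlockIndex,
) -> (Option<Inst>, Vec<MachLabel>) {
    // `opt_inst` is the branch instruction that produced these successors;
    // it is `None` for split critical edges and for blocks whose terminator
    // is not a branch (e.g. a return).
    let (opt_inst, succs) = order.succ_indices(block);
    (opt_inst, succs.iter().map(|&s| MachLabel::from_block(s)).collect())
}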
/// Determine whether the given lowered-block index is cold.
|
||||
@@ -524,12 +314,13 @@ impl BlockLoweringOrder {
|
||||
mod test {
|
||||
use super::*;
|
||||
use crate::cursor::{Cursor, FuncCursor};
|
||||
use crate::flowgraph::ControlFlowGraph;
|
||||
use crate::ir::types::*;
|
||||
use crate::ir::UserFuncName;
|
||||
use crate::ir::{AbiParam, Function, InstBuilder, Signature};
|
||||
use crate::isa::CallConv;
|
||||
|
||||
fn build_test_func(n_blocks: usize, edges: &[(usize, usize)]) -> Function {
|
||||
fn build_test_func(n_blocks: usize, edges: &[(usize, usize)]) -> BlockLoweringOrder {
|
||||
assert!(n_blocks > 0);
|
||||
|
||||
let name = UserFuncName::testcase("test0");
|
||||
@@ -568,42 +359,20 @@ mod test {
|
||||
}
|
||||
}
|
||||
|
||||
func
|
||||
let mut cfg = ControlFlowGraph::new();
|
||||
cfg.compute(&func);
|
||||
let dom_tree = DominatorTree::with_function(&func, &cfg);
|
||||
|
||||
BlockLoweringOrder::new(&func, &dom_tree)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_blockorder_diamond() {
|
||||
let func = build_test_func(4, &[(0, 1), (0, 2), (1, 3), (2, 3)]);
|
||||
let order = BlockLoweringOrder::new(&func);
|
||||
let order = build_test_func(4, &[(0, 1), (0, 2), (1, 3), (2, 3)]);
|
||||
|
||||
assert_eq!(order.lowered_order.len(), 6);
|
||||
|
||||
assert!(order.lowered_order[0].orig_block().unwrap().as_u32() == 0);
|
||||
assert!(order.lowered_order[0].in_edge().is_none());
|
||||
assert!(order.lowered_order[0].out_edge().is_none());
|
||||
|
||||
assert!(order.lowered_order[1].orig_block().unwrap().as_u32() == 1);
|
||||
assert!(order.lowered_order[1].in_edge().unwrap().0.as_u32() == 0);
|
||||
assert!(order.lowered_order[1].in_edge().unwrap().2.as_u32() == 1);
|
||||
|
||||
assert!(order.lowered_order[2].orig_block().is_none());
|
||||
assert!(order.lowered_order[2].in_edge().is_none());
|
||||
assert!(order.lowered_order[2].out_edge().unwrap().0.as_u32() == 1);
|
||||
assert!(order.lowered_order[2].out_edge().unwrap().2.as_u32() == 3);
|
||||
|
||||
assert!(order.lowered_order[3].orig_block().unwrap().as_u32() == 2);
|
||||
assert!(order.lowered_order[3].in_edge().unwrap().0.as_u32() == 0);
|
||||
assert!(order.lowered_order[3].in_edge().unwrap().2.as_u32() == 2);
|
||||
assert!(order.lowered_order[3].out_edge().is_none());
|
||||
|
||||
assert!(order.lowered_order[4].orig_block().is_none());
|
||||
assert!(order.lowered_order[4].in_edge().is_none());
|
||||
assert!(order.lowered_order[4].out_edge().unwrap().0.as_u32() == 2);
|
||||
assert!(order.lowered_order[4].out_edge().unwrap().2.as_u32() == 3);
|
||||
|
||||
assert!(order.lowered_order[5].orig_block().unwrap().as_u32() == 3);
|
||||
assert!(order.lowered_order[5].in_edge().is_none());
|
||||
assert!(order.lowered_order[5].out_edge().is_none());
|
||||
// This test case doesn't need to introduce any critical edges, as all regalloc allocations
|
||||
// can sit on either the entry or exit of blocks 1 and 2.
|
||||
assert_eq!(order.lowered_order.len(), 4);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -618,9 +387,9 @@ mod test {
|
||||
// | /\ |
|
||||
// 5 6
|
||||
//
|
||||
// (3 -> 5, 3 -> 6, 4 -> 6 are critical edges and must be split)
|
||||
// (3 -> 5, and 3 -> 6 are critical edges and must be split)
|
||||
//
|
||||
let func = build_test_func(
|
||||
let order = build_test_func(
|
||||
7,
|
||||
&[
|
||||
(0, 1),
|
||||
@@ -633,72 +402,53 @@ mod test {
|
||||
(4, 6),
|
||||
],
|
||||
);
|
||||
let order = BlockLoweringOrder::new(&func);
|
||||
|
||||
assert_eq!(order.lowered_order.len(), 11);
|
||||
assert_eq!(order.lowered_order.len(), 9);
|
||||
println!("ordered = {:?}", order.lowered_order);
|
||||
|
||||
// block 0
|
||||
assert!(order.lowered_order[0].orig_block().unwrap().as_u32() == 0);
|
||||
assert_eq!(order.lowered_order[0].orig_block().unwrap().as_u32(), 0);
|
||||
assert!(order.lowered_order[0].in_edge().is_none());
|
||||
assert!(order.lowered_order[0].out_edge().is_none());
|
||||
|
||||
// edge 0->1 + block 1
|
||||
assert!(order.lowered_order[1].orig_block().unwrap().as_u32() == 1);
|
||||
assert!(order.lowered_order[1].in_edge().unwrap().0.as_u32() == 0);
|
||||
assert!(order.lowered_order[1].in_edge().unwrap().2.as_u32() == 1);
|
||||
// block 2
|
||||
assert_eq!(order.lowered_order[1].orig_block().unwrap().as_u32(), 2);
|
||||
assert!(order.lowered_order[1].in_edge().is_none());
|
||||
assert!(order.lowered_order[1].out_edge().is_none());
|
||||
|
||||
// edge 1->3 + block 3
|
||||
assert!(order.lowered_order[2].orig_block().unwrap().as_u32() == 3);
|
||||
assert!(order.lowered_order[2].in_edge().unwrap().0.as_u32() == 1);
|
||||
assert!(order.lowered_order[2].in_edge().unwrap().2.as_u32() == 3);
|
||||
// block 1
|
||||
assert_eq!(order.lowered_order[2].orig_block().unwrap().as_u32(), 1);
|
||||
assert!(order.lowered_order[2].in_edge().is_none());
|
||||
assert!(order.lowered_order[2].out_edge().is_none());
|
||||
|
||||
// edge 3->5
|
||||
assert!(order.lowered_order[3].orig_block().is_none());
|
||||
// block 4
|
||||
assert_eq!(order.lowered_order[3].orig_block().unwrap().as_u32(), 4);
|
||||
assert!(order.lowered_order[3].in_edge().is_none());
|
||||
assert!(order.lowered_order[3].out_edge().unwrap().0.as_u32() == 3);
|
||||
assert!(order.lowered_order[3].out_edge().unwrap().2.as_u32() == 5);
|
||||
assert!(order.lowered_order[3].out_edge().is_none());
|
||||
|
||||
// edge 3->6
|
||||
assert!(order.lowered_order[4].orig_block().is_none());
|
||||
// block 3
|
||||
assert_eq!(order.lowered_order[4].orig_block().unwrap().as_u32(), 3);
|
||||
assert!(order.lowered_order[4].in_edge().is_none());
|
||||
assert!(order.lowered_order[4].out_edge().unwrap().0.as_u32() == 3);
|
||||
assert!(order.lowered_order[4].out_edge().unwrap().2.as_u32() == 6);
|
||||
assert!(order.lowered_order[4].out_edge().is_none());
|
||||
|
||||
// edge 1->4 + block 4
|
||||
assert!(order.lowered_order[5].orig_block().unwrap().as_u32() == 4);
|
||||
assert!(order.lowered_order[5].in_edge().unwrap().0.as_u32() == 1);
|
||||
assert!(order.lowered_order[5].in_edge().unwrap().2.as_u32() == 4);
|
||||
assert!(order.lowered_order[5].out_edge().is_none());
|
||||
// critical edge 3 -> 5
|
||||
assert!(order.lowered_order[5].orig_block().is_none());
|
||||
assert_eq!(order.lowered_order[5].in_edge().unwrap().as_u32(), 3);
|
||||
assert_eq!(order.lowered_order[5].out_edge().unwrap().as_u32(), 5);
|
||||
|
||||
// edge 4->6
|
||||
// critical edge 3 -> 6
|
||||
assert!(order.lowered_order[6].orig_block().is_none());
|
||||
assert!(order.lowered_order[6].in_edge().is_none());
|
||||
assert!(order.lowered_order[6].out_edge().unwrap().0.as_u32() == 4);
|
||||
assert!(order.lowered_order[6].out_edge().unwrap().2.as_u32() == 6);
|
||||
assert_eq!(order.lowered_order[6].in_edge().unwrap().as_u32(), 3);
|
||||
assert_eq!(order.lowered_order[6].out_edge().unwrap().as_u32(), 6);
|
||||
|
||||
// block 6
|
||||
assert!(order.lowered_order[7].orig_block().unwrap().as_u32() == 6);
|
||||
assert_eq!(order.lowered_order[7].orig_block().unwrap().as_u32(), 6);
|
||||
assert!(order.lowered_order[7].in_edge().is_none());
|
||||
assert!(order.lowered_order[7].out_edge().is_none());
|
||||
|
||||
// edge 0->2 + block 2
|
||||
assert!(order.lowered_order[8].orig_block().unwrap().as_u32() == 2);
|
||||
assert!(order.lowered_order[8].in_edge().unwrap().0.as_u32() == 0);
|
||||
assert!(order.lowered_order[8].in_edge().unwrap().2.as_u32() == 2);
|
||||
assert!(order.lowered_order[8].out_edge().is_none());
|
||||
|
||||
// edge 2->5
|
||||
assert!(order.lowered_order[9].orig_block().is_none());
|
||||
assert!(order.lowered_order[9].in_edge().is_none());
|
||||
assert!(order.lowered_order[9].out_edge().unwrap().0.as_u32() == 2);
|
||||
assert!(order.lowered_order[9].out_edge().unwrap().2.as_u32() == 5);
|
||||
|
||||
// block 5
|
||||
assert!(order.lowered_order[10].orig_block().unwrap().as_u32() == 5);
|
||||
assert!(order.lowered_order[10].in_edge().is_none());
|
||||
assert!(order.lowered_order[10].out_edge().is_none());
|
||||
assert_eq!(order.lowered_order[8].orig_block().unwrap().as_u32(), 5);
|
||||
assert!(order.lowered_order[8].in_edge().is_none());
|
||||
assert!(order.lowered_order[8].out_edge().is_none());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,6 @@
//! Compilation backend pipeline: optimized IR to VCode / binemit.

use crate::dominator_tree::DominatorTree;
use crate::ir::Function;
use crate::isa::TargetIsa;
use crate::machinst::*;
@@ -12,6 +13,7 @@ use regalloc2::RegallocOptions;
/// for binary emission.
pub fn compile<B: LowerBackend + TargetIsa>(
f: &Function,
domtree: &DominatorTree,
b: &B,
abi: Callee<<<B as LowerBackend>::MInst as MachInst>::ABIMachineSpec>,
emit_info: <B::MInst as MachInstEmit>::Info,
@@ -20,7 +22,7 @@ pub fn compile<B: LowerBackend + TargetIsa>(
let machine_env = b.machine_env();

// Compute lowered block order.
let block_order = BlockLoweringOrder::new(f);
let block_order = BlockLoweringOrder::new(f, domtree);

// Build the lowering context.
let lower = crate::machinst::Lower::new(f, machine_env, abi, emit_info, block_order, sigs)?;

@@ -928,9 +928,12 @@ impl<'func, I: VCodeInst> Lower<'func, I> {
|
||||
}
|
||||
|
||||
fn lower_branch_blockparam_args(&mut self, block: BlockIndex) {
|
||||
for succ_idx in 0..self.vcode.block_order().succ_indices(block).len() {
|
||||
// TODO: why not make `block_order` public?
|
||||
for succ_idx in 0..self.vcode.block_order().succ_indices(block).1.len() {
|
||||
// Avoid immutable borrow by explicitly indexing.
|
||||
let (inst, succ) = self.vcode.block_order().succ_indices(block)[succ_idx];
|
||||
let (opt_inst, succs) = self.vcode.block_order().succ_indices(block);
|
||||
let inst = opt_inst.expect("lower_branch_blockparam_args called on a critical edge!");
|
||||
let succ = succs[succ_idx];
|
||||
|
||||
// The use of `succ_idx` to index `branch_destination` is valid on the assumption that
|
||||
// the traversal order defined in `visit_block_succs` mirrors the order returned by
|
||||
@@ -960,17 +963,9 @@ impl<'func, I: VCodeInst> Lower<'func, I> {
|
||||
targets: &mut SmallVec<[MachLabel; 2]>,
|
||||
) -> Option<Inst> {
|
||||
targets.clear();
|
||||
let mut last_inst = None;
|
||||
for &(inst, succ) in self.vcode.block_order().succ_indices(bindex) {
|
||||
// Basic blocks may end in a single branch instruction, but those instructions may have
|
||||
// multiple destinations. As such, all `inst` values in `succ_indices` must be the
|
||||
// same, or this basic block would have multiple branch instructions present.
|
||||
debug_assert!(last_inst.map_or(true, |prev| prev == inst));
|
||||
last_inst = Some(inst);
|
||||
targets.push(MachLabel::from_block(succ));
|
||||
}
|
||||
|
||||
last_inst
|
||||
let (opt_inst, succs) = self.vcode.block_order().succ_indices(bindex);
|
||||
targets.extend(succs.iter().map(|succ| MachLabel::from_block(*succ)));
|
||||
opt_inst
|
||||
}
|
||||
|
||||
/// Lower the function.
|
||||
@@ -1025,7 +1020,8 @@ impl<'func, I: VCodeInst> Lower<'func, I> {
|
||||
// according to the one successor, and pass them
|
||||
// through; note that the successor must have an
|
||||
// original block.
|
||||
let (_, succ) = self.vcode.block_order().succ_indices(bindex)[0];
|
||||
let (_, succs) = self.vcode.block_order().succ_indices(bindex);
|
||||
let succ = succs[0];
|
||||
|
||||
let orig_succ = lowered_order[succ.index()];
|
||||
let orig_succ = orig_succ
|
||||
|
||||