Cranelift: de-duplicate bounds checks in legalizations (#5190)

* Cranelift: Add the `DataFlowGraph::display_value_inst` convenience method * Cranelift: Add some `trace!` logs to some parts of legalization * Cranelift: de-duplicate bounds checks in legalizations When both (1) "dynamic" memories that need explicit bounds checks and (2) spectre mitigations that perform bounds checks are enabled, reuse the same bounds checks between the two legalizations. This reduces the overhead of explicit bounds checks and spectre mitigations over using virtual memory guard pages with spectre mitigations from ~1.9-2.1x overhead to ~1.6-1.8x overhead. That is about a 14-19% speed up for when dynamic memories and spectre mitigations are enabled. <details> ``` execution :: instructions-retired :: benchmarks/spidermonkey/benchmark.wasm Δ = 3422455129.47 ± 120159.49 (confidence = 99%) virtual-memory-guards.so is 2.09x to 2.09x faster than bounds-checks.so! [6563931659 6564063496.07 6564301535] bounds-checks.so [3141492675 3141608366.60 3141895249] virtual-memory-guards.so execution :: instructions-retired :: benchmarks/bz2/benchmark.wasm Δ = 338716136.87 ± 1.38 (confidence = 99%) virtual-memory-guards.so is 2.08x to 2.08x faster than bounds-checks.so! [651961494 651961495.47 651961497] bounds-checks.so [313245357 313245358.60 313245362] virtual-memory-guards.so execution :: instructions-retired :: benchmarks/pulldown-cmark/benchmark.wasm Δ = 22742944.07 ± 331.73 (confidence = 99%) virtual-memory-guards.so is 1.87x to 1.87x faster than bounds-checks.so! [48841295 48841567.33 48842139] bounds-checks.so [26098439 26098623.27 26099479] virtual-memory-guards.so ``` </details> <details> ``` execution :: instructions-retired :: benchmarks/spidermonkey/benchmark.wasm Δ = 2465900207.27 ± 146476.61 (confidence = 99%) virtual-memory-guards.so is 1.78x to 1.78x faster than de-duped-bounds-checks.so! [5607275431 5607442989.13 5607838342] de-duped-bounds-checks.so [3141445345 3141542781.87 3141711213] virtual-memory-guards.so execution :: instructions-retired :: benchmarks/bz2/benchmark.wasm Δ = 234253620.20 ± 2.33 (confidence = 99%) virtual-memory-guards.so is 1.75x to 1.75x faster than de-duped-bounds-checks.so! [547498977 547498980.93 547498985] de-duped-bounds-checks.so [313245357 313245360.73 313245363] virtual-memory-guards.so execution :: instructions-retired :: benchmarks/pulldown-cmark/benchmark.wasm Δ = 16605659.13 ± 315.78 (confidence = 99%) virtual-memory-guards.so is 1.64x to 1.64x faster than de-duped-bounds-checks.so! [42703971 42704284.40 42704787] de-duped-bounds-checks.so [26098432 26098625.27 26099234] virtual-memory-guards.so ``` </details> <details> ``` execution :: instructions-retired :: benchmarks/bz2/benchmark.wasm Δ = 104462517.13 ± 7.32 (confidence = 99%) de-duped-bounds-checks.so is 1.19x to 1.19x faster than bounds-checks.so! [651961493 651961500.80 651961532] bounds-checks.so [547498981 547498983.67 547498989] de-duped-bounds-checks.so execution :: instructions-retired :: benchmarks/spidermonkey/benchmark.wasm Δ = 956556982.80 ± 103034.59 (confidence = 99%) de-duped-bounds-checks.so is 1.17x to 1.17x faster than bounds-checks.so! [6563930590 6564019842.40 6564243651] bounds-checks.so [5607307146 5607462859.60 5607677763] de-duped-bounds-checks.so execution :: instructions-retired :: benchmarks/pulldown-cmark/benchmark.wasm Δ = 6137307.87 ± 247.75 (confidence = 99%) de-duped-bounds-checks.so is 1.14x to 1.14x faster than bounds-checks.so! [48841303 48841472.93 48842000] bounds-checks.so [42703965 42704165.07 42704718] de-duped-bounds-checks.so ``` </details> * Update test expectations * Add a test for deduplicating bounds checks between dynamic memories and spectre mitigations * Define a struct for the Spectre comparison instead of using a tuple * More trace logging for heap legalization
2022-11-15 08:47:22 -08:00
parent dece901d16
commit c2a7ea7e24
9 changed files with 187 additions and 104 deletions
--- a/cranelift/codegen/src/ir/dfg.rs
+++ b/cranelift/codegen/src/ir/dfg.rs
@@ -595,6 +595,17 @@ impl DataFlowGraph {
        DisplayInst(self, inst)
    }

+    /// Returns an object that displays the given `value`'s defining instruction.
+    ///
+    /// Panics if the value is not defined by an instruction (i.e. it is a basic
+    /// block argument).
+    pub fn display_value_inst(&self, value: Value) -> DisplayInst<'_> {
+        match self.value_def(value) {
+            ir::ValueDef::Result(inst, _) => self.display_inst(inst),
+            ir::ValueDef::Param(_, _) => panic!("value is not defined by an instruction"),
+        }
+    }
+
    /// Get all value arguments on `inst` as a slice.
    pub fn inst_args(&self, inst: Inst) -> &[Value] {
        self.insts[inst].arguments(&self.value_lists)
--- a/cranelift/codegen/src/legalizer/globalvalue.rs
+++ b/cranelift/codegen/src/legalizer/globalvalue.rs
@@ -14,6 +14,12 @@ pub fn expand_global_value(
    isa: &dyn TargetIsa,
    global_value: ir::GlobalValue,
 ) {
+    crate::trace!(
+        "expanding global value: {:?}: {}",
+        inst,
+        func.dfg.display_inst(inst)
+    );
+
    match func.global_values[global_value] {
        ir::GlobalValueData::VMContext => vmctx_addr(inst, func),
        ir::GlobalValueData::IAddImm {
--- a/cranelift/codegen/src/legalizer/heap.rs
+++ b/cranelift/codegen/src/legalizer/heap.rs
@@ -9,6 +9,7 @@ use crate::ir::condcodes::IntCC;
 use crate::ir::immediates::{Uimm32, Uimm8};
 use crate::ir::{self, InstBuilder, RelSourceLoc};
 use crate::isa::TargetIsa;
+use crate::trace;

 /// Expand a `heap_addr` instruction according to the definition of the heap.
 pub fn expand_heap_addr(
@@ -21,6 +22,12 @@ pub fn expand_heap_addr(
    offset_immediate: Uimm32,
    access_size: Uimm8,
 ) {
+    trace!(
+        "expanding heap_addr: {:?}: {}",
+        inst,
+        func.dfg.display_inst(inst)
+    );
+
    match func.heaps[heap].style {
        ir::HeapStyle::Dynamic { bound_gv } => dynamic_addr(
            isa,
@@ -76,6 +83,10 @@ fn dynamic_addr(
        let adj_bound = pos
            .ins()
            .iadd_imm(bound, -(offset_plus_size(offset, access_size) as i64));
+        trace!(
+            "  inserting: {}",
+            pos.func.dfg.display_value_inst(adj_bound)
+        );
        (IntCC::UnsignedGreaterThan, index, adj_bound)
    } else {
        // We need an overflow check for the adjusted offset.
@@ -85,14 +96,30 @@ fn dynamic_addr(
        let adj_offset =
            pos.ins()
                .uadd_overflow_trap(index, access_size_val, ir::TrapCode::HeapOutOfBounds);
+        trace!(
+            "  inserting: {}",
+            pos.func.dfg.display_value_inst(adj_offset)
+        );
        (IntCC::UnsignedGreaterThan, adj_offset, bound)
    };
-    let oob = pos.ins().icmp(cc, lhs, bound);
-    pos.ins().trapnz(oob, ir::TrapCode::HeapOutOfBounds);

    let spectre_oob_comparison = if isa.flags().enable_heap_access_spectre_mitigation() {
-        Some((cc, lhs, bound))
+        // When we emit a spectre-guarded heap access, we do a `select
+        // is_out_of_bounds, NULL, addr` to compute the address, and so the load
+        // will trap if the address is out of bounds, which means we don't need
+        // to do another explicit bounds check like we do below.
+        Some(SpectreOobComparison {
+            cc,
+            lhs,
+            rhs: bound,
+        })
    } else {
+        let oob = pos.ins().icmp(cc, lhs, bound);
+        trace!("  inserting: {}", pos.func.dfg.display_value_inst(oob));
+
+        let trapnz = pos.ins().trapnz(oob, ir::TrapCode::HeapOutOfBounds);
+        trace!("  inserting: {}", pos.func.dfg.display_inst(trapnz));
+
        None
    };

@@ -130,8 +157,10 @@ fn static_addr(
    // This first case is a trivial case where we can statically trap.
    if offset_plus_size(offset, access_size) > bound {
        // This will simply always trap since `offset >= 0`.
-        pos.ins().trap(ir::TrapCode::HeapOutOfBounds);
-        pos.func.dfg.replace(inst).iconst(addr_ty, 0);
+        let trap = pos.ins().trap(ir::TrapCode::HeapOutOfBounds);
+        trace!("  inserting: {}", pos.func.dfg.display_inst(trap));
+        let iconst = pos.func.dfg.replace(inst).iconst(addr_ty, 0);
+        trace!("  inserting: {}", pos.func.dfg.display_value_inst(iconst));

        // Split the block, as the trap is a terminator instruction.
        let curr_block = pos.current_block().expect("Cursor is not in a block");
@@ -180,10 +209,19 @@ fn static_addr(
            (IntCC::UnsignedGreaterThan, index, limit)
        };
        let oob = pos.ins().icmp_imm(cc, lhs, limit_imm);
-        pos.ins().trapnz(oob, ir::TrapCode::HeapOutOfBounds);
+        trace!("  inserting: {}", pos.func.dfg.display_value_inst(oob));
+
+        let trapnz = pos.ins().trapnz(oob, ir::TrapCode::HeapOutOfBounds);
+        trace!("  inserting: {}", pos.func.dfg.display_inst(trapnz));
+
        if isa.flags().enable_heap_access_spectre_mitigation() {
            let limit = pos.ins().iconst(addr_ty, limit_imm);
-            spectre_oob_comparison = Some((cc, lhs, limit));
+            trace!("  inserting: {}", pos.func.dfg.display_value_inst(limit));
+            spectre_oob_comparison = Some(SpectreOobComparison {
+                cc,
+                lhs,
+                rhs: limit,
+            });
        }
    }

@@ -229,6 +267,12 @@ fn cast_index_to_pointer_ty(
    extended_index
 }

+struct SpectreOobComparison {
+    cc: IntCC,
+    lhs: ir::Value,
+    rhs: ir::Value,
+}
+
 /// Emit code for the base address computation of a `heap_addr` instruction.
 fn compute_addr(
    isa: &dyn TargetIsa,
@@ -242,7 +286,7 @@ fn compute_addr(
    // values to compare and the condition code that indicates an out-of bounds
    // condition; on this condition, the conditional move will choose a
    // speculatively safe address (a zero / null pointer) instead.
-    spectre_oob_comparison: Option<(IntCC, ir::Value, ir::Value)>,
+    spectre_oob_comparison: Option<SpectreOobComparison>,
 ) {
    debug_assert_eq!(func.dfg.value_type(index), addr_ty);
    let mut pos = FuncCursor::new(func).at_inst(inst);
@@ -250,13 +294,17 @@ fn compute_addr(

    // Add the heap base address base
    let base = if isa.flags().enable_pinned_reg() && isa.flags().use_pinned_reg_as_heap_base() {
-        pos.ins().get_pinned_reg(isa.pointer_type())
+        let base = pos.ins().get_pinned_reg(isa.pointer_type());
+        trace!("  inserting: {}", pos.func.dfg.display_value_inst(base));
+        base
    } else {
        let base_gv = pos.func.heaps[heap].base;
-        pos.ins().global_value(addr_ty, base_gv)
+        let base = pos.ins().global_value(addr_ty, base_gv);
+        trace!("  inserting: {}", pos.func.dfg.display_value_inst(base));
+        base
    };

-    if let Some((cc, a, b)) = spectre_oob_comparison {
+    if let Some(SpectreOobComparison { cc, lhs, rhs }) = spectre_oob_comparison {
        let final_base = pos.ins().iadd(base, index);
        // NB: The addition of the offset immediate must happen *before* the
        // `select_spectre_guard`. If it happens after, then we potentially are
@@ -264,22 +312,40 @@ fn compute_addr(
        let final_addr = if offset == 0 {
            final_base
        } else {
-            pos.ins().iadd_imm(final_base, offset as i64)
+            let final_addr = pos.ins().iadd_imm(final_base, offset as i64);
+            trace!(
+                "  inserting: {}",
+                pos.func.dfg.display_value_inst(final_addr)
+            );
+            final_addr
        };
        let zero = pos.ins().iconst(addr_ty, 0);
-        let cmp = pos.ins().icmp(cc, a, b);
-        pos.func
+        trace!("  inserting: {}", pos.func.dfg.display_value_inst(zero));
+
+        let cmp = pos.ins().icmp(cc, lhs, rhs);
+        trace!("  inserting: {}", pos.func.dfg.display_value_inst(cmp));
+
+        let value = pos
+            .func
            .dfg
            .replace(inst)
            .select_spectre_guard(cmp, zero, final_addr);
+        trace!("  inserting: {}", pos.func.dfg.display_value_inst(value));
    } else if offset == 0 {
-        pos.func.dfg.replace(inst).iadd(base, index);
+        let addr = pos.func.dfg.replace(inst).iadd(base, index);
+        trace!("  inserting: {}", pos.func.dfg.display_value_inst(addr));
    } else {
        let final_base = pos.ins().iadd(base, index);
-        pos.func
+        trace!(
+            "  inserting: {}",
+            pos.func.dfg.display_value_inst(final_base)
+        );
+        let addr = pos
+            .func
            .dfg
            .replace(inst)
            .iadd_imm(final_base, offset as i64);
+        trace!("  inserting: {}", pos.func.dfg.display_value_inst(addr));
    }
 }

--- a/cranelift/codegen/src/legalizer/mod.rs
+++ b/cranelift/codegen/src/legalizer/mod.rs
@@ -19,6 +19,7 @@ use crate::ir::immediates::Imm64;
 use crate::ir::types::{I128, I64};
 use crate::ir::{self, InstBuilder, InstructionData, MemFlags, Value};
 use crate::isa::TargetIsa;
+use crate::trace;

 mod globalvalue;
 mod heap;
@@ -46,6 +47,8 @@ fn imm_const(pos: &mut FuncCursor, arg: Value, imm: Imm64, is_signed: bool) -> V
 /// Perform a simple legalization by expansion of the function, without
 /// platform-specific transforms.
 pub fn simple_legalize(func: &mut ir::Function, cfg: &mut ControlFlowGraph, isa: &dyn TargetIsa) {
+    trace!("Pre-legalization function:\n{}", func.display());
+
    let mut pos = FuncCursor::new(func);
    let func_begin = pos.position();
    pos.set_position(func_begin);
@@ -246,6 +249,8 @@ pub fn simple_legalize(func: &mut ir::Function, cfg: &mut ControlFlowGraph, isa:
            pos.set_position(prev_pos);
        }
    }
+
+    trace!("Post-legalization function:\n{}", func.display());
 }

 /// Custom expansion for conditional trap instructions.
@@ -257,6 +262,12 @@ fn expand_cond_trap(
    arg: ir::Value,
    code: ir::TrapCode,
 ) {
+    trace!(
+        "expanding conditional trap: {:?}: {}",
+        inst,
+        func.dfg.display_inst(inst)
+    );
+
    // Parse the instruction.
    let trapz = match opcode {
        ir::Opcode::Trapz => true,