Cranelift: de-duplicate bounds checks in legalizations (#5190)

* Cranelift: Add the `DataFlowGraph::display_value_inst` convenience method

* Cranelift: Add some `trace!` logs to some parts of legalization

* Cranelift: de-duplicate bounds checks in legalizations

  When both (1) "dynamic" memories that need explicit bounds checks and (2) spectre mitigations that perform bounds checks are enabled, reuse the same bounds checks between the two legalizations.

  This reduces the overhead of explicit bounds checks and spectre mitigations over using virtual memory guard pages with spectre mitigations from ~1.9-2.1x overhead to ~1.6-1.8x overhead. That is about a 14-19% speed up for when dynamic memories and spectre mitigations are enabled.

  <details>

  ```
  execution :: instructions-retired :: benchmarks/spidermonkey/benchmark.wasm

    Δ = 3422455129.47 ± 120159.49 (confidence = 99%)

    virtual-memory-guards.so is 2.09x to 2.09x faster than bounds-checks.so!

    [6563931659 6564063496.07 6564301535] bounds-checks.so
    [3141492675 3141608366.60 3141895249] virtual-memory-guards.so

  execution :: instructions-retired :: benchmarks/bz2/benchmark.wasm

    Δ = 338716136.87 ± 1.38 (confidence = 99%)

    virtual-memory-guards.so is 2.08x to 2.08x faster than bounds-checks.so!

    [651961494 651961495.47 651961497] bounds-checks.so
    [313245357 313245358.60 313245362] virtual-memory-guards.so

  execution :: instructions-retired :: benchmarks/pulldown-cmark/benchmark.wasm

    Δ = 22742944.07 ± 331.73 (confidence = 99%)

    virtual-memory-guards.so is 1.87x to 1.87x faster than bounds-checks.so!

    [48841295 48841567.33 48842139] bounds-checks.so
    [26098439 26098623.27 26099479] virtual-memory-guards.so
  ```

  </details>

  <details>

  ```
  execution :: instructions-retired :: benchmarks/spidermonkey/benchmark.wasm

    Δ = 2465900207.27 ± 146476.61 (confidence = 99%)

    virtual-memory-guards.so is 1.78x to 1.78x faster than de-duped-bounds-checks.so!

    [5607275431 5607442989.13 5607838342] de-duped-bounds-checks.so
    [3141445345 3141542781.87 3141711213] virtual-memory-guards.so

  execution :: instructions-retired :: benchmarks/bz2/benchmark.wasm

    Δ = 234253620.20 ± 2.33 (confidence = 99%)

    virtual-memory-guards.so is 1.75x to 1.75x faster than de-duped-bounds-checks.so!

    [547498977 547498980.93 547498985] de-duped-bounds-checks.so
    [313245357 313245360.73 313245363] virtual-memory-guards.so

  execution :: instructions-retired :: benchmarks/pulldown-cmark/benchmark.wasm

    Δ = 16605659.13 ± 315.78 (confidence = 99%)

    virtual-memory-guards.so is 1.64x to 1.64x faster than de-duped-bounds-checks.so!

    [42703971 42704284.40 42704787] de-duped-bounds-checks.so
    [26098432 26098625.27 26099234] virtual-memory-guards.so
  ```

  </details>

  <details>

  ```
  execution :: instructions-retired :: benchmarks/bz2/benchmark.wasm

    Δ = 104462517.13 ± 7.32 (confidence = 99%)

    de-duped-bounds-checks.so is 1.19x to 1.19x faster than bounds-checks.so!

    [651961493 651961500.80 651961532] bounds-checks.so
    [547498981 547498983.67 547498989] de-duped-bounds-checks.so

  execution :: instructions-retired :: benchmarks/spidermonkey/benchmark.wasm

    Δ = 956556982.80 ± 103034.59 (confidence = 99%)

    de-duped-bounds-checks.so is 1.17x to 1.17x faster than bounds-checks.so!

    [6563930590 6564019842.40 6564243651] bounds-checks.so
    [5607307146 5607462859.60 5607677763] de-duped-bounds-checks.so

  execution :: instructions-retired :: benchmarks/pulldown-cmark/benchmark.wasm

    Δ = 6137307.87 ± 247.75 (confidence = 99%)

    de-duped-bounds-checks.so is 1.14x to 1.14x faster than bounds-checks.so!

    [48841303 48841472.93 48842000] bounds-checks.so
    [42703965 42704165.07 42704718] de-duped-bounds-checks.so
  ```

  </details>

* Update test expectations

* Add a test for deduplicating bounds checks between dynamic memories and spectre mitigations

* Define a struct for the Spectre comparison instead of using a tuple

* More trace logging for heap legalization
2022-11-15 08:47:22 -08:00
parent dece901d16
commit c2a7ea7e24
9 changed files with 187 additions and 104 deletions
--- a/cranelift/filetests/filetests/isa/aarch64/heap_addr.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/heap_addr.clif
@@ -17,17 +17,12 @@ block0(v0: i64, v1: i32):
 ;   mov w8, w1
 ;   ldr x9, [x0]
 ;   mov x9, x9
-;   subs xzr, x8, x9
-;   b.ls label1 ; b label2
-; block1:
 ;   add x10, x0, x1, UXTW
-;   movz x11, #0
+;   movz x7, #0
 ;   subs xzr, x8, x9
-;   csel x0, x11, x10, hi
+;   csel x0, x7, x10, hi
 ;   csdb
 ;   ret
-; block2:
-;   udf #0xc11f

 function %static_heap_check(i64 vmctx, i32) -> i64 {
    gv0 = vmctx
@@ -69,18 +64,13 @@ block0(v0: i64, v1: i32):
 ;   movz x9, #24
 ;   adds x11, x11, x9
 ;   b.lo 8 ; udf
+;   add x12, x0, x1, UXTW
+;   add x12, x12, #16
+;   movz x13, #0
 ;   subs xzr, x11, x10
-;   b.ls label1 ; b label2
-; block1:
-;   add x13, x0, x1, UXTW
-;   add x13, x13, #16
-;   movz x12, #0
-;   subs xzr, x11, x10
-;   csel x0, x12, x13, hi
+;   csel x0, x13, x12, hi
 ;   csdb
 ;   ret
-; block2:
-;   udf #0xc11f

 function %static_heap_check_with_offset(i64 vmctx, i32) -> i64 {
    gv0 = vmctx
--- a/cranelift/filetests/filetests/isa/riscv64/heap-addr.clif
+++ b/cranelift/filetests/filetests/isa/riscv64/heap-addr.clif
@@ -13,19 +13,14 @@ block0(v0: i64, v1: i32):
 }

 ; block0:
-;   uext.w a7,a1
-;   ld t3,0(a0)
-;   addi t3,t3,0
-;   ule t4,a7,t3##ty=i64
-;   bne t4,zero,taken(label1),not_taken(label2)
-; block1:
-;   add t4,a0,a7
-;   ugt a7,a7,t3##ty=i64
-;   li t0,0
-;   selectif_spectre_guard a0,t0,t4##test=a7
+;   uext.w a6,a1
+;   ld a7,0(a0)
+;   addi t3,a7,0
+;   add a7,a0,a6
+;   ugt a5,a6,t3##ty=i64
+;   li t3,0
+;   selectif_spectre_guard a0,t3,a7##test=a5
 ;   ret
-; block2:
-;   udf##trap_code=heap_oob

 function %static_heap_check(i64 vmctx, i32) -> i64 {
    gv0 = vmctx
@@ -62,23 +57,18 @@ block0(v0: i64, v1: i32):
 }

 ; block0:
-;   uext.w t1,a1
-;   ld t0,0(a0)
-;   li t3,24
-;   add t2,t1,t3
-;   ult a1,t2,t1##ty=i64
-;   trap_if a1,heap_oob
-;   ule a1,t2,t0##ty=i64
-;   bne a1,zero,taken(label1),not_taken(label2)
-; block1:
-;   add a0,a0,t1
-;   addi a0,a0,16
-;   ugt t1,t2,t0##ty=i64
-;   li a1,0
-;   selectif_spectre_guard a0,a1,a0##test=t1
+;   uext.w t0,a1
+;   ld t4,0(a0)
+;   li a7,24
+;   add t1,t0,a7
+;   ult t2,t1,t0##ty=i64
+;   trap_if t2,heap_oob
+;   add t0,a0,t0
+;   addi t0,t0,16
+;   ugt t4,t1,t4##ty=i64
+;   li t1,0
+;   selectif_spectre_guard a0,t1,t0##test=t4
 ;   ret
-; block2:
-;   udf##trap_code=heap_oob

 function %static_heap_check_with_offset(i64 vmctx, i32) -> i64 {
    gv0 = vmctx
--- a/cranelift/filetests/filetests/isa/s390x/heap_addr.clif
+++ b/cranelift/filetests/filetests/isa/s390x/heap_addr.clif
@@ -12,20 +12,14 @@ block0(v0: i64, v1: i32):
 }

 ; block0:
-;   llgfr %r5, %r3
-;   lgr %r4, %r2
-;   lg %r2, 0(%r4)
-;   aghik %r3, %r2, 0
-;   clgr %r5, %r3
-;   jgnh label1 ; jg label2
-; block1:
-;   agrk %r2, %r4, %r5
-;   lghi %r4, 0
-;   clgr %r5, %r3
-;   locgrh %r2, %r4
+;   llgfr %r4, %r3
+;   lghi %r3, 0
+;   ag %r3, 0(%r2)
+;   agr %r2, %r4
+;   lghi %r5, 0
+;   clgr %r4, %r3
+;   locgrh %r2, %r5
 ;   br %r14
-; block2:
-;   trap

 function %static_heap_check(i64 vmctx, i32) -> i64 {
    gv0 = vmctx
@@ -66,18 +60,13 @@ block0(v0: i64, v1: i32):
 ;   lghi %r5, 24
 ;   algfr %r5, %r3
 ;   jle 6 ; trap
-;   clgr %r5, %r4
-;   jgnh label1 ; jg label2
-; block1:
-;   agrk %r3, %r2, %r7
-;   aghik %r2, %r3, 16
+;   agr %r2, %r7
+;   aghi %r2, 16
 ;   lghi %r3, 0
 ;   clgr %r5, %r4
 ;   locgrh %r2, %r3
 ;   lmg %r7, %r15, 56(%r15)
 ;   br %r14
-; block2:
-;   trap

 function %static_heap_check_with_offset(i64 vmctx, i32) -> i64 {
    gv0 = vmctx
--- a/cranelift/filetests/filetests/isa/x64/heap.clif
+++ b/cranelift/filetests/filetests/isa/x64/heap.clif
@@ -33,23 +33,18 @@ block0(v0: i32, v1: i64):
 ;   movq    %rsp, %rbp
 ; block0:
 ;   movl    %edi, %eax
-;   movq    8(%rsi), %rdx
-;   movq    %rax, %rdi
-;   addq    %rdi, $32768, %rdi
+;   movq    8(%rsi), %rdi
+;   movq    %rax, %rcx
+;   addq    %rcx, $32768, %rcx
 ;   jnb ; ud2 heap_oob ;
-;   cmpq    %rdx, %rdi
-;   jbe     label1; j label2
-; block1:
 ;   addq    %rax, 0(%rsi), %rax
 ;   addq    %rax, $32768, %rax
-;   xorq    %rcx, %rcx, %rcx
-;   cmpq    %rdx, %rdi
-;   cmovnbeq %rcx, %rax, %rax
+;   xorq    %rsi, %rsi, %rsi
+;   cmpq    %rdi, %rcx
+;   cmovnbeq %rsi, %rax, %rax
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
-; block2:
-;   ud2 heap_oob

 ;; The heap address calculation for this statically-allocated memory checks that
 ;; the passed offset (%r11) is within bounds (`cmp + jbe + j`) and then includes
@@ -120,25 +115,18 @@ block0(v0: i64, v1: i32):
 ; block0:
 ;   movq    %rdi, %rax
 ;   movl    %esi, %edi
-;   movq    %rax, %rcx
-;   movq    0(%rcx), %rsi
-;   movq    %rdi, %rdx
-;   addq    %rdx, $24, %rdx
+;   movq    0(%rax), %rsi
+;   movq    %rdi, %rcx
+;   addq    %rcx, $24, %rcx
 ;   jnb ; ud2 heap_oob ;
-;   cmpq    %rsi, %rdx
-;   jbe     label1; j label2
-; block1:
-;   movq    %rcx, %rax
 ;   addq    %rax, %rdi, %rax
 ;   addq    %rax, $16, %rax
-;   xorq    %rcx, %rcx, %rcx
-;   cmpq    %rsi, %rdx
-;   cmovnbeq %rcx, %rax, %rax
+;   xorq    %rdi, %rdi, %rdi
+;   cmpq    %rsi, %rcx
+;   cmovnbeq %rdi, %rax, %rax
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
-; block2:
-;   ud2 heap_oob

 function %static_heap_check_with_offset(i64 vmctx, i32) -> i64 {
    gv0 = vmctx
--- a/cranelift/filetests/filetests/legalizer/bounds-checks.clif
+++ b/cranelift/filetests/filetests/legalizer/bounds-checks.clif
@@ -0,0 +1,32 @@
+test legalizer
+set enable_heap_access_spectre_mitigation=true
+target aarch64
+target x86_64
+
+;; Test that when both (1) dynamic memories and (2) heap access spectre
+;; mitigations are enabled, we deduplicate the bounds check between the two.
+
+function %wasm_load(i64 vmctx, i32) -> i32 wasmtime_system_v {
+    gv0 = vmctx
+    gv1 = load.i64 notrap aligned gv0+88
+    gv2 = load.i64 notrap aligned gv0+80
+    heap0 = dynamic gv2, min 0, bound gv1, offset_guard 0x8000_0000, index_type i32
+
+block0(v0: i64, v1: i32):
+    v2 = heap_addr.i64 heap0, v1, 0, 4
+    v3 = load.i32 little heap v2
+    return v3
+}
+
+; check:  block0(v0: i64, v1: i32):
+; nextln:     v4 = uextend.i64 v1
+; nextln:     v5 = load.i64 notrap aligned v0+88
+; nextln:     v6 = iconst.i64 4
+; nextln:     v7 = uadd_overflow_trap v4, v6, heap_oob  ; v6 = 4
+; nextln:     v8 = load.i64 notrap aligned v0+80
+; nextln:     v9 = iadd v8, v4
+; nextln:     v10 = iconst.i64 0
+; nextln:     v11 = icmp ugt v7, v5
+; nextln:     v2 = select_spectre_guard v11, v10, v9  ; v10 = 0
+; nextln:     v3 = load.i32 little heap v2
+; nextln:     return v3