From 18ee645ebe289e1bba5338628e36a1c05ee42b31 Mon Sep 17 00:00:00 2001
From: Alex Crichton <alex@alexcrichton.com>
Date: Mon, 6 Mar 2023 09:29:43 -0600
Subject: [PATCH] Allow hoisting `vconst` instructions out of loops (#5909)

* Allow hoisting `vconst` instructions out of loops

Looking at some SIMD code and at what LLVM and v8 both generate for it,
it appears that a common technique for SIMD loops is to hoist constants
out of the loop since, unlike integer constants, they're nontrivial to
rematerialize. This commit updates the `loop_hoist_level` calculation
with egraphs to have a nonzero default for instructions that have no
arguments (e.g. consts), which enables hoisting these instructions out
of loops.

Note, though, that for now I've limited this to hoisting out of one
loop, not out of all enclosing loops. While theoretically vconsts could
move up to the top of the function, I'd be worried about their impact on
register pressure and having to save/restore around calls or similar, so
hopefully if the hot part of a program is a single loop then hoisting
out of one loop is a reasonable-enough heuristic for now.

Locally on x64 with a benchmark that just encodes binary to hex this saw
a 15% performance improvement taking hex encoding from ~6G/s to ~6.7G/s.

* Test vconst is only hoisted one loop out
---
 cranelift/codegen/src/egraph/elaborate.rs     | 13 +++-
 .../filetests/filetests/egraph/licm.clif      | 69 +++++++++++++++++++
 2 files changed, 81 insertions(+), 1 deletion(-)

diff --git a/cranelift/codegen/src/egraph/elaborate.rs b/cranelift/codegen/src/egraph/elaborate.rs
index d52927ffea..dbb14505a7 100644
--- a/cranelift/codegen/src/egraph/elaborate.rs
+++ b/cranelift/codegen/src/egraph/elaborate.rs
@@ -401,6 +401,17 @@ impl<'a> Elaborator<'a> {
                     let arg_values = &self.elab_result_stack[arg_idx..];
 
                     // Compute max loop depth.
+                    //
+                    // Note that if there are no arguments then this instruction
+                    // is allowed to get hoisted up one loop. This is not
+                    // usually used since no-argument values are things like
+                    // constants which are typically rematerialized, but for the
+                    // `vconst` instruction 128-bit constants aren't as easily
+                    // rematerialized. They're hoisted out of the innermost loop
+                    // but not to the function entry, since hoisting that far
+                    // risks placing too much register pressure on the entire
+                    // function. This is modeled with the `.saturating_sub(1)`
+                    // as the default if there's otherwise no maximum.
                     let loop_hoist_level = arg_values
                         .iter()
                         .map(|&value| {
@@ -423,7 +434,7 @@ impl<'a> Elaborator<'a> {
                             hoist_level
                         })
                         .max()
-                        .unwrap_or(self.loop_stack.len());
+                        .unwrap_or(self.loop_stack.len().saturating_sub(1));
                     trace!(
                         " -> loop hoist level: {:?}; cur loop depth: {:?}, loop_stack: {:?}",
                         loop_hoist_level,
diff --git a/cranelift/filetests/filetests/egraph/licm.clif b/cranelift/filetests/filetests/egraph/licm.clif
index 7d44f53fe8..8d6a5ec329 100644
--- a/cranelift/filetests/filetests/egraph/licm.clif
+++ b/cranelift/filetests/filetests/egraph/licm.clif
@@ -36,3 +36,72 @@ block2(v9: i32):
 ; check:      v10 = iconst.i32 1
 ; check:      v4 = iadd.i32 v1, v10
 ; check:      return v4
+
+function %f(i64x2, i32) -> i64x2 {
+block0(v0: i64x2, v1: i32):
+    jump block1(v0, v1)
+
+block1(v2: i64x2, v3: i32):
+    v4 = vconst.i64x2 0x1000000010000000
+    v5 = iadd v2, v4
+    v6 = iconst.i32 1
+    v7 = isub v3, v6
+    brif v7, block1(v5, v7), block2(v5)
+
+block2(v8: i64x2):
+    return v8
+}
+
+; check:  block0(v0: i64x2, v1: i32):
+; nextln:     v4 = vconst.i64x2 const0
+; nextln:     jump block1(v0, v1)
+; check:  block1(v2: i64x2, v3: i32):
+; check:      v6 = iconst.i32 1
+; check:      v7 = isub v3, v6
+; check:      v5 = iadd v2, v4
+; check:      v8 -> v5
+; check:      brif v7, block1(v5, v7), block2
+; check:  block2:
+; check:      return v5
+
+;; don't lift vconst out of 2 loops, only the inner loop, based on the current
+;; heuristic.
+function %f(i64x2, i32, i32) -> i64x2 {
+block0(v0: i64x2, v1: i32, v2: i32):
+    jump block1(v0, v1, v2)
+
+block1(v3: i64x2, v4: i32, v5: i32):
+    jump block2(v3, v4)
+
+block2(v6: i64x2, v7: i32):
+    v8 = vconst.i64x2 0x1000000010000000
+    v9 = iadd v6, v8
+    v10 = iconst.i32 1
+    v11 = isub v7, v10
+    brif v11, block2(v9, v11), block3(v9)
+
+block3(v12: i64x2):
+    v13 = iconst.i32 1
+    v14 = isub v5, v13
+    brif v14, block1(v9, v4, v14), block4
+
+block4:
+    return v12
+}
+
+; check:  block0(v0: i64x2, v1: i32, v2: i32):
+; check:      jump block1(v0, v2)
+; check:  block1(v3: i64x2, v5: i32):
+; check:      v8 = vconst.i64x2 const0
+; check:      jump block2(v3, v1)
+; check:  block2(v6: i64x2, v7: i32):
+; check:      v10 = iconst.i32 1
+; check:      v11 = isub v7, v10
+; check:      v9 = iadd v6, v8
+; check:      brif v11, block2(v9, v11), block3
+; check:  block3:
+; check:      v15 = iconst.i32 1
+; check:      v14 = isub.i32 v5, v15
+; check:      brif v14, block1(v9, v14), block4
+; check:  block4:
+; check:      return v9