Allow hoisting vconst instructions out of loops (#5909)
* Allow hoisting `vconst` instructions out of loops

  Staring at some SIMD code and at what LLVM and v8 both generate, it appears that a common technique for SIMD loops is to hoist constants out of the loop, since, unlike integer constants, they are nontrivial to rematerialize. This commit updates the `loop_hoist_level` calculation with egraphs to have a nonzero default for instructions that have no arguments (e.g. consts), which enables hoisting these instructions out of loops.

  Note, though, that for now the maximum is hoisting out of one loop, not out of all of them. While theoretically vconsts could move up to the top of the function, I'd be worried about their impact on register pressure and about having to save/restore them around calls or similar. Hopefully, if the hot part of a program is a single loop, hoisting out of one loop is a reasonable-enough heuristic for now.

  Locally on x64, with a benchmark that just encodes binary to hex, this saw a 15% performance improvement, taking hex encoding from ~6G/s to ~6.7G/s.

* Test that `vconst` is only hoisted one loop out
This commit is contained in:
@@ -401,6 +401,17 @@ impl<'a> Elaborator<'a> {
|
||||
let arg_values = &self.elab_result_stack[arg_idx..];
|
||||
|
||||
// Compute max loop depth.
|
||||
//
|
||||
// Note that if there are no arguments then this instruction
|
||||
// is allowed to get hoisted up one loop. This is not
|
||||
// usually used since no-argument values are things like
|
||||
// constants which are typically rematerialized, but for the
|
||||
// `vconst` instruction 128-bit constants aren't as easily
|
||||
// rematerialized. They're hoisted out of inner loops but
|
||||
// not to the function entry which may run the risk of
|
||||
// placing too much register pressure on the entire
|
||||
// function. This is modeled with the `.saturating_sub(1)`
|
||||
// as the default if there's otherwise no maximum.
|
||||
let loop_hoist_level = arg_values
|
||||
.iter()
|
||||
.map(|&value| {
|
||||
@@ -423,7 +434,7 @@ impl<'a> Elaborator<'a> {
|
||||
hoist_level
|
||||
})
|
||||
.max()
|
||||
.unwrap_or(self.loop_stack.len());
|
||||
.unwrap_or(self.loop_stack.len().saturating_sub(1));
|
||||
trace!(
|
||||
" -> loop hoist level: {:?}; cur loop depth: {:?}, loop_stack: {:?}",
|
||||
loop_hoist_level,
|
||||
|
||||
@@ -36,3 +36,72 @@ block2(v9: i32):
|
||||
; check: v10 = iconst.i32 1
|
||||
; check: v4 = iadd.i32 v1, v10
|
||||
; check: return v4
|
||||
|
||||
function %f(i64x2, i32) -> i64x2 {
|
||||
block0(v0: i64x2, v1: i32):
|
||||
jump block1(v0, v1)
|
||||
|
||||
block1(v2: i64x2, v3: i32):
|
||||
v4 = vconst.i64x2 0x1000000010000000
|
||||
v5 = iadd v2, v4
|
||||
v6 = iconst.i32 1
|
||||
v7 = isub v3, v6
|
||||
brif v7, block1(v5, v7), block2(v5)
|
||||
|
||||
block2(v8: i64x2):
|
||||
return v8
|
||||
}
|
||||
|
||||
; check: block0(v0: i64x2, v1: i32):
|
||||
; nextln: v4 = vconst.i64x2 const0
|
||||
; nextln: jump block1(v0, v1)
|
||||
; check: block1(v2: i64x2, v3: i32):
|
||||
; check: v6 = iconst.i32 1
|
||||
; check: v7 = isub v3, v6
|
||||
; check: v5 = iadd v2, v4
|
||||
; check: v8 -> v5
|
||||
; check: brif v7, block1(v5, v7), block2
|
||||
; check: block2:
|
||||
; check: return v5
|
||||
|
||||
;; don't lift vconst out of 2 loops, only the inner loop, based on the current
|
||||
;; heuristic.
|
||||
function %f(i64x2, i32, i32) -> i64x2 {
|
||||
block0(v0: i64x2, v1: i32, v2: i32):
|
||||
jump block1(v0, v1, v2)
|
||||
|
||||
block1(v3: i64x2, v4: i32, v5: i32):
|
||||
jump block2(v3, v4)
|
||||
|
||||
block2(v6: i64x2, v7: i32):
|
||||
v8 = vconst.i64x2 0x1000000010000000
|
||||
v9 = iadd v6, v8
|
||||
v10 = iconst.i32 1
|
||||
v11 = isub v7, v10
|
||||
brif v11, block2(v9, v11), block3(v9)
|
||||
|
||||
block3(v12: i64x2):
|
||||
v13 = iconst.i32 1
|
||||
v14 = isub v5, v13
|
||||
brif v14, block1(v9, v4, v14), block4
|
||||
|
||||
block4:
|
||||
return v12
|
||||
}
|
||||
|
||||
; check: block0(v0: i64x2, v1: i32, v2: i32):
|
||||
; check: jump block1(v0, v2)
|
||||
; check: block1(v3: i64x2, v5: i32):
|
||||
; check: v8 = vconst.i64x2 const0
|
||||
; check: jump block2(v3, v1)
|
||||
; check: block2(v6: i64x2, v7: i32):
|
||||
; check: v10 = iconst.i32 1
|
||||
; check: v11 = isub v7, v10
|
||||
; check: v9 = iadd v6, v8
|
||||
; check: brif v11, block2(v9, v11), block3
|
||||
; check: block3:
|
||||
; check: v15 = iconst.i32 1
|
||||
; check: v14 = isub.i32 v5, v15
|
||||
; check: brif v14, block1(v9, v14), block4
|
||||
; check: block4:
|
||||
; check: return v9
|
||||
|
||||
Reference in New Issue
Block a user