diff --git a/cranelift/codegen/src/egraph/elaborate.rs b/cranelift/codegen/src/egraph/elaborate.rs index d52927ffea..dbb14505a7 100644 --- a/cranelift/codegen/src/egraph/elaborate.rs +++ b/cranelift/codegen/src/egraph/elaborate.rs @@ -401,6 +401,17 @@ impl<'a> Elaborator<'a> { let arg_values = &self.elab_result_stack[arg_idx..]; // Compute max loop depth. + // + // Note that if there are no arguments then this instruction + // is allowed to get hoisted up one loop. This is not + // usually used since no-argument values are things like + // constants which are typically rematerialized, but for the + // `vconst` instruction 128-bit constants aren't as easily + // rematerialized. They're hoisted out of inner loops but + // not to the function entry which may run the risk of + // placing too much register pressure on the entire + // function. This is modeled with the `.saturating_sub(1)` + // as the default if there's otherwise no maximum. let loop_hoist_level = arg_values .iter() .map(|&value| { @@ -423,7 +434,7 @@ impl<'a> Elaborator<'a> { hoist_level }) .max() - .unwrap_or(self.loop_stack.len()); + .unwrap_or(self.loop_stack.len().saturating_sub(1)); trace!( " -> loop hoist level: {:?}; cur loop depth: {:?}, loop_stack: {:?}", loop_hoist_level, diff --git a/cranelift/filetests/filetests/egraph/licm.clif b/cranelift/filetests/filetests/egraph/licm.clif index 7d44f53fe8..8d6a5ec329 100644 --- a/cranelift/filetests/filetests/egraph/licm.clif +++ b/cranelift/filetests/filetests/egraph/licm.clif @@ -36,3 +36,72 @@ block2(v9: i32): ; check: v10 = iconst.i32 1 ; check: v4 = iadd.i32 v1, v10 ; check: return v4 + +function %f(i64x2, i32) -> i64x2 { +block0(v0: i64x2, v1: i32): + jump block1(v0, v1) + +block1(v2: i64x2, v3: i32): + v4 = vconst.i64x2 0x1000000010000000 + v5 = iadd v2, v4 + v6 = iconst.i32 1 + v7 = isub v3, v6 + brif v7, block1(v5, v7), block2(v5) + +block2(v8: i64x2): + return v8 +} + +; check: block0(v0: i64x2, v1: i32): +; nextln: v4 = vconst.i64x2 const0 +; nextln: jump block1(v0, v1) +; check: block1(v2: i64x2, v3: i32): +; check: v6 = iconst.i32 1 +; check: v7 = isub v3, v6 +; check: v5 = iadd v2, v4 +; check: v8 -> v5 +; check: brif v7, block1(v5, v7), block2 +; check: block2: +; check: return v5 + +;; don't lift vconst out of 2 loops, only the inner loop, based on the current +;; heuristic. +function %f(i64x2, i32, i32) -> i64x2 { +block0(v0: i64x2, v1: i32, v2: i32): + jump block1(v0, v1, v2) + +block1(v3: i64x2, v4: i32, v5: i32): + jump block2(v3, v4) + +block2(v6: i64x2, v7: i32): + v8 = vconst.i64x2 0x1000000010000000 + v9 = iadd v6, v8 + v10 = iconst.i32 1 + v11 = isub v7, v10 + brif v11, block2(v9, v11), block3(v9) + +block3(v12: i64x2): + v13 = iconst.i32 1 + v14 = isub v5, v13 + brif v14, block1(v9, v4, v14), block4 + +block4: + return v12 +} + +; check: block0(v0: i64x2, v1: i32, v2: i32): +; check: jump block1(v0, v2) +; check: block1(v3: i64x2, v5: i32): +; check: v8 = vconst.i64x2 const0 +; check: jump block2(v3, v1) +; check: block2(v6: i64x2, v7: i32): +; check: v10 = iconst.i32 1 +; check: v11 = isub v7, v10 +; check: v9 = iadd v6, v8 +; check: brif v11, block2(v9, v11), block3 +; check: block3: +; check: v15 = iconst.i32 1 +; check: v14 = isub.i32 v5, v15 +; check: brif v14, block1(v9, v14), block4 +; check: block4: +; check: return v9