Cranelift: fix branch-of-icmp/fcmp regression: look through uextend. (#5487)

In #5031, we removed `bool` types from CLIF, using integers instead for
"truthy" values. This greatly simplified the IR, and was generally an
improvement.

However, because x86's `SETcc` instruction sets only the low 8 bits of a
register, we chose to use `i8` types as the result of `icmp` and `fcmp`,
to avoid the need for a masking operation when materializing the result.

Unfortunately this means that uses of truthy values often now have
`uextend` operations, especially when coming from Wasm (where truthy
values are naturally `i32`-typed). For example, where we previously had
`(brz (icmp ...))`, we now have `(brz (uextend (icmp ...)))`.

It's arguable whether or not we should switch to `i32` truthy values --
in most cases we can avoid materializing a value that's immediately used
for a branch or select, so a mask would in most cases be unnecessary,
and it would be a win at the IR level -- but irrespective of that, this
change *did* regress our generated code quality: our backends had
patterns for e.g. `(brz (icmp ...))` but not with the `uextend`, so we
were *always* materializing truthy values. Many blocks thus ended with
"cmp; setcc; cmp; test; branch" rather than "cmp; branch".

In #5391 we noticed this and fixed it on x64, but it was a general
problem on aarch64 and riscv64 as well. This PR introduces a
`maybe_uextend` extractor that "looks through" uextends, and uses it
where we consume truthy values, thus fixing the regression. This PR
also adds compile filetests to ensure we don't regress again.

The riscv64 backend has not been updated here because doing so appears
to trigger another issue in its branch handling; fixing that is TBD.
This commit is contained in:
Chris Fallin
2022-12-22 01:43:44 -08:00
committed by GitHub
parent 24a2f8cac6
commit 03463458e4
8 changed files with 204 additions and 45 deletions

View File

@@ -152,7 +152,8 @@ block0(v0: i128, v1: i128):
function %f(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = icmp eq v0, v1
brnz v2, block1
v3 = uextend.i32 v2
brnz v3, block1
jump block2
block1:
@@ -177,7 +178,8 @@ block2:
function %f(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = icmp eq v0, v1
brnz v2, block1
v3 = uextend.i32 v2
brnz v3, block1
jump block1
block1:
@@ -239,7 +241,8 @@ block1:
function %i128_bricmp_eq(i128, i128) {
block0(v0: i128, v1: i128):
v2 = icmp eq v0, v1
brnz v2, block1
v3 = uextend.i32 v2
brnz v3, block1
jump block1
block1:
@@ -260,7 +263,8 @@ block1:
function %i128_bricmp_ne(i128, i128) {
block0(v0: i128, v1: i128):
v2 = icmp ne v0, v1
brnz v2, block1
v3 = uextend.i32 v2
brnz v3, block1
jump block1
block1:
@@ -281,7 +285,8 @@ block1:
function %i128_bricmp_slt(i128, i128) {
block0(v0: i128, v1: i128):
v2 = icmp slt v0, v1
brnz v2, block1
v3 = uextend.i32 v2
brnz v3, block1
jump block1
block1:
@@ -306,7 +311,8 @@ block1:
function %i128_bricmp_ult(i128, i128) {
block0(v0: i128, v1: i128):
v2 = icmp ult v0, v1
brnz v2, block1
v3 = uextend.i32 v2
brnz v3, block1
jump block1
block1:
@@ -331,7 +337,8 @@ block1:
function %i128_bricmp_sle(i128, i128) {
block0(v0: i128, v1: i128):
v2 = icmp sle v0, v1
brnz v2, block1
v3 = uextend.i32 v2
brnz v3, block1
jump block1
block1:
@@ -357,7 +364,8 @@ block1:
function %i128_bricmp_ule(i128, i128) {
block0(v0: i128, v1: i128):
v2 = icmp ule v0, v1
brnz v2, block1
v3 = uextend.i32 v2
brnz v3, block1
jump block1
block1:
@@ -383,7 +391,8 @@ block1:
function %i128_bricmp_sgt(i128, i128) {
block0(v0: i128, v1: i128):
v2 = icmp sgt v0, v1
brnz v2, block1
v3 = uextend.i32 v2
brnz v3, block1
jump block1
block1:
@@ -408,7 +417,8 @@ block1:
function %i128_bricmp_ugt(i128, i128) {
block0(v0: i128, v1: i128):
v2 = icmp ugt v0, v1
brnz v2, block1
v3 = uextend.i32 v2
brnz v3, block1
jump block1
block1:
@@ -433,7 +443,8 @@ block1:
function %i128_bricmp_sge(i128, i128) {
block0(v0: i128, v1: i128):
v2 = icmp sge v0, v1
brnz v2, block1
v3 = uextend.i32 v2
brnz v3, block1
jump block1
block1:
@@ -459,7 +470,8 @@ block1:
function %i128_bricmp_uge(i128, i128) {
block0(v0: i128, v1: i128):
v2 = icmp uge v0, v1
brnz v2, block1
v3 = uextend.i32 v2
brnz v3, block1
jump block1
block1:

View File

@@ -0,0 +1,29 @@
test compile precise-output
target aarch64
function %f0(i32, i32, i64, i64) -> i64 {
block0(v0: i32, v1: i32, v2: i64, v3: i64):
v4 = icmp eq v0, v1
v5 = uextend.i32 v4
v6 = select.i64 v5, v2, v3
return v6
}
; block0:
; subs wzr, w0, w1
; csel x0, x2, x3, eq
; ret
function %f0(f32, f32, i64, i64) -> i64 {
block0(v0: f32, v1: f32, v2: i64, v3: i64):
v4 = fcmp eq v0, v1
v5 = uextend.i32 v4
v6 = select.i64 v5, v2, v3
return v6
}
; block0:
; fcmp s0, s1
; csel x0, x0, x1, eq
; ret

View File

@@ -328,3 +328,69 @@ block202:
; popq %rbp
; ret
function %br_i8_icmp(i32, i32) -> i32 {
block0(v0: i32, v1: i32):
v2 = icmp eq v0, v1
v3 = uextend.i32 v2
brnz v3, block1
jump block2
block1:
v4 = iconst.i32 1
return v4
block2:
v5 = iconst.i32 2
return v5
}
; pushq %rbp
; movq %rsp, %rbp
; block0:
; cmpl %esi, %edi
; jz label1; j label2
; block1:
; movl $1, %eax
; movq %rbp, %rsp
; popq %rbp
; ret
; block2:
; movl $2, %eax
; movq %rbp, %rsp
; popq %rbp
; ret
function %br_i8_fcmp(f32, f32) -> i32 {
block0(v0: f32, v1: f32):
v2 = fcmp eq v0, v1
v3 = uextend.i32 v2
brnz v3, block1
jump block2
block1:
v4 = iconst.i32 1
return v4
block2:
v5 = iconst.i32 2
return v5
}
; pushq %rbp
; movq %rsp, %rbp
; block0:
; ucomiss %xmm1, %xmm0
; jp label2
; jnz label2; j label1
; block1:
; movl $1, %eax
; movq %rbp, %rsp
; popq %rbp
; ret
; block2:
; movl $2, %eax
; movq %rbp, %rsp
; popq %rbp
; ret

View File

@@ -0,0 +1,41 @@
test compile precise-output
target x86_64
function %f0(i32, i32, i64, i64) -> i64 {
block0(v0: i32, v1: i32, v2: i64, v3: i64):
v4 = icmp eq v0, v1
v5 = uextend.i32 v4
v6 = select.i64 v5, v2, v3
return v6
}
; pushq %rbp
; movq %rsp, %rbp
; block0:
; cmpl %esi, %edi
; movq %rcx, %rax
; cmovzq %rdx, %rax, %rax
; movq %rbp, %rsp
; popq %rbp
; ret
function %f0(f32, f32, i64, i64) -> i64 {
block0(v0: f32, v1: f32, v2: i64, v3: i64):
v4 = fcmp eq v0, v1
v5 = uextend.i32 v4
v6 = select.i64 v5, v2, v3
return v6
}
; pushq %rbp
; movq %rsp, %rbp
; block0:
; ucomiss %xmm0, %xmm1
; movq %rdi, %rax
; cmovnzq %rsi, %rax, %rax
; cmovpq %rsi, %rax, %rax
; movq %rbp, %rsp
; popq %rbp
; ret