Move bitselect->vselect optimization to x64 back-end (#5191)

The simplifier was performing an optimization that replaced bitselect
with vselect whenever every byte of the condition mask could be shown
to be either all ones or all zeros.
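
For illustration, the check behind this rewrite is a simple per-byte
test on a constant mask: every byte must be exactly 0x00 or 0xFF.  A
minimal Rust sketch of that test (the helper name and signature are
illustrative only, not taken from the Cranelift sources):

/// Returns true if every byte of a 128-bit mask is 0x00 or 0xFF, i.e. the
/// mask never mixes set and clear bits within a byte, so `bitselect` on it
/// behaves exactly like a byte-wise `vselect`.
/// (Hypothetical helper, for illustration only.)
fn mask_selects_whole_bytes(mask: &[u8; 16]) -> bool {
    mask.iter().all(|&b| b == 0x00 || b == 0xFF)
}

fn main() {
    // Mirrors %good_const_mask_i8x16 in the new test file below.
    let good: [u8; 16] = [0, 0, 0xFF, 0, 0, 0xFF, 0, 0, 0, 0, 0xFF, 0, 0, 0, 0, 0xFF];
    // Mirrors %bad_const_mask: the 0xF0 byte mixes set and clear bits.
    let bad: [u8; 16] = [0, 0, 0xF0, 0, 0, 0xFF, 0, 0, 0, 0, 0xFF, 0, 0, 0, 0, 0xFF];
    assert!(mask_selects_whole_bytes(&good));
    assert!(!mask_selects_whole_bytes(&bad));
}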

This optimization only ever made a difference in codegen on the
x64 target.  Therefore, move it to the x64 back-end and perform it
in ISLE instead.  The resulting codegen should be unchanged, with
slightly improved compile time.
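
The masks that qualify are not only such constants: vector icmp and
fcmp results are all-ones or all-zeros in every lane by construction,
which is what the %mask_from_icmp and %mask_from_fcmp tests below
exercise.  A rough Rust sketch of the case analysis (the enum and
function are hypothetical; the real implementation pattern-matches
CLIF instructions in ISLE):

/// Hypothetical classification of a bitselect condition operand; the real
/// code works on CLIF instructions, this only models the decision.
enum MaskSource {
    /// The mask is a vector constant with the given bytes.
    Const([u8; 16]),
    /// The mask is the result of a vector comparison (icmp/fcmp), which
    /// produces all-ones or all-zeros in every lane.
    Compare,
    /// Anything else: bits within a byte may differ, so keep bitselect.
    Unknown,
}

/// Decide whether bitselect on this mask may be lowered like vselect
/// (on x64, a single pblendvb-style blend).
fn can_lower_as_vselect(mask: &MaskSource) -> bool {
    match mask {
        MaskSource::Compare => true,
        MaskSource::Const(bytes) => bytes.iter().all(|&b| b == 0x00 || b == 0xFF),
        MaskSource::Unknown => false,
    }
}

fn main() {
    // Comparison results always qualify; an arbitrary mask does not.
    assert!(can_lower_as_vselect(&MaskSource::Compare));
    assert!(!can_lower_as_vselect(&MaskSource::Unknown));
}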

This also eliminates a few endian-dependent bitcast operations.
Ulrich Weigand
2022-11-03 21:17:36 +01:00
committed by GitHub
parent 3ef30b5b67
commit 137a8b710f
6 changed files with 152 additions and 135 deletions

@@ -0,0 +1,123 @@
test compile precise-output
set enable_simd
target x86_64 skylake

function %mask_from_icmp(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
    v2 = icmp eq v0, v1
    v3 = bitselect v2, v0, v1
    return v3
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm0, %xmm5
; pcmpeqb %xmm5, %xmm1, %xmm5
; movdqa %xmm0, %xmm8
; movdqa %xmm5, %xmm0
; movdqa %xmm1, %xmm6
; pblendvb %xmm6, %xmm8, %xmm6
; movdqa %xmm6, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

function %mask_from_fcmp(f32x4, f32x4, i32x4, i32x4) -> i32x4 {
block0(v0: f32x4, v1: f32x4, v2: i32x4, v3: i32x4):
    v4 = fcmp eq v0, v1
    v5 = bitselect v4, v2, v3
    return v5
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; cmpps $0, %xmm0, %xmm1, %xmm0
; movdqa %xmm3, %xmm8
; pblendvb %xmm8, %xmm2, %xmm8
; movdqa %xmm8, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

function %mask_casted(i8x16, i8x16, i32x4) -> i8x16 {
block0(v0: i8x16, v1: i8x16, v2: i32x4):
    v3 = bitcast.i8x16 v2
    v4 = bitselect v3, v0, v1
    return v4
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm0, %xmm5
; pand %xmm5, %xmm2, %xmm5
; movdqa %xmm2, %xmm0
; pandn %xmm0, %xmm1, %xmm0
; por %xmm0, %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

function %good_const_mask_i8x16(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
    v3 = vconst.i8x16 [0 0 0xFF 0 0 0xFF 0 0 0 0 0xFF 0 0 0 0 0xFF]
    v4 = bitselect v3, v0, v1
    return v4
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm0, %xmm6
; movdqu const(0), %xmm0
; movdqa %xmm6, %xmm8
; movdqa %xmm1, %xmm6
; pblendvb %xmm6, %xmm8, %xmm6
; movdqa %xmm6, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

function %good_const_mask_i16x8(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
    v3 = vconst.i16x8 [0x0000 0xFF00 0x0000 0x00FF 0x0000 0xFFFF 0x00FF 0xFFFF]
    v4 = bitselect v3, v0, v1
    return v4
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm0, %xmm6
; movdqu const(0), %xmm0
; movdqa %xmm6, %xmm8
; movdqa %xmm1, %xmm6
; pblendvb %xmm6, %xmm8, %xmm6
; movdqa %xmm6, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

function %bad_const_mask(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
    v3 = vconst.i8x16 [0 0 0xF0 0 0 0xFF 0 0 0 0 0xFF 0 0 0 0 0xFF]
    v4 = bitselect v3, v0, v1
    return v4
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqu const(0), %xmm6
; movdqa %xmm6, %xmm9
; movdqa %xmm0, %xmm5
; pand %xmm5, %xmm9, %xmm5
; movdqa %xmm9, %xmm0
; pandn %xmm0, %xmm1, %xmm0
; por %xmm0, %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

@@ -128,28 +128,6 @@ block0(v0: i32x4, v1: i32x4):
; popq %rbp
; ret

function %bitselect_i16x8() -> i16x8 {
block0:
    v0 = vconst.i16x8 [0 0 0 0 0 0 0 0]
    v1 = vconst.i16x8 [0 0 0 0 0 0 0 0]
    v2 = vconst.i16x8 [0 0 0 0 0 0 0 0]
    v3 = bitselect v0, v1, v2
    return v3
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqu const(0), %xmm0
; movdqu const(0), %xmm2
; movdqu const(0), %xmm6
; pand %xmm2, %xmm0, %xmm2
; pandn %xmm0, %xmm6, %xmm0
; por %xmm0, %xmm2, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

function %vselect_i16x8(i16x8, i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8, v2: i16x8):
    v3 = vselect v0, v1, v2