Improve bitselect codegen with knowledge of operand origin (#1783)

* Encode vselect using BLEND instructions on x86

* Legalize vselect to bitselect

* Optimize bitselect to vselect for some operands

* Add run tests for bitselect-vselect optimization

* Address review feedback
This commit is contained in:
teapotd
2020-05-30 04:53:11 +02:00
committed by GitHub
parent 16afca4451
commit e430984ac4
11 changed files with 341 additions and 1 deletions

View File

@@ -0,0 +1,39 @@
;; Header for an executed test: each function below is followed by a run directive
;; giving concrete arguments and the expected result.
;; NOTE(review): opt_level is raised here presumably so that the preopt rewrite of
;; bitselect into vselect is actually applied before execution -- confirm.
test run
set opt_level=speed_and_size
set enable_simd
target x86_64 haswell
;; Test if bitselect->vselect optimization works properly
;; The mask is the result of a lane-wise signed `>=` comparison, reinterpreted as
;; integer lanes. Lanes where v0 >= v1 take v0 and all other lanes take v1, so the
;; expected result below is the lane-wise signed maximum of the two arguments.
function %mask_from_icmp(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = icmp sge v0, v1
v3 = raw_bitcast.i32x4 v2
v4 = bitselect v3, v0, v1
return v4
}
; run: %mask_from_icmp([5 6 7 8], [1 10 20 7]) == [5 10 20 8]
;; The mask is an arbitrary i32x4 argument that is only reinterpreted as i64x2, so its
;; bits carry no all-ones/all-zeros guarantee and the selection has to stay bitwise.
;; With v0 all zeros the result is v1 with the mask bits cleared:
;;   0xFFFFFF & ~0xFFF1 = 0xFF000E   and   0xFFFF4F & ~0xF = 0xFFFF40.
function %mask_casted(i64x2, i64x2, i32x4) -> i64x2 {
block0(v0: i64x2, v1: i64x2, v2: i32x4):
v3 = raw_bitcast.i64x2 v2
v4 = bitselect v3, v0, v1
return v4
}
; run: %mask_casted([0 0], [0xFFFFFF 0xFFFF4F], [0xFFF1 0 0xF 0]) == [0xFF000E 0xFFFF40]
;; Constant mask in which every byte is either 0x00 or 0xFF, so each result byte comes
;; whole from v0 (mask byte 0xFF) or whole from v1 (mask byte 0x00).
;; Example, lane 0: mask 0x0000FF00 keeps byte 1 of 0x1234 (0x12) and byte 0 of 0xAAAA
;; (0xAA), giving 0x12AA.
function %good_const_mask(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = vconst.i32x4 [0x0000FF00 0x00FF00FF 0x00FF00FF 0xFF00FFFF]
v4 = bitselect v2, v0, v1
return v4
}
; run: %good_const_mask([0x1234 0x5678 0x1234 0x5678], [0xAAAA 0xAAAA 0xAAAA 0xAAAA]) == [0x12AA 0xAA78 0xAA34 0x5678]
;; Constant mask whose last two lanes contain partially set bytes (0x0F and 0xF0), so
;; the selection must be done per bit rather than per byte.
;; Lane 2: (0x1234 & 0x00FF000F) | (0xAAAA & ~0x00FF000F) = 0x0004 | 0xAAA0 = 0xAAA4.
;; Lane 3: (0x5678 & 0xFF00FFF0) | (0xAAAA & ~0xFF00FFF0) = 0x5670 | 0x000A = 0x567A.
function %bad_const_mask(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = vconst.i32x4 [0x0000FF00 0x00FF00FF 0x00FF000F 0xFF00FFF0]
v4 = bitselect v2, v0, v1
return v4
}
; run: %bad_const_mask([0x1234 0x5678 0x1234 0x5678], [0xAAAA 0xAAAA 0xAAAA 0xAAAA]) == [0x12AA 0xAA78 0xAAA4 0x567A]

View File

@@ -0,0 +1,27 @@
;; Header for a binary-encoding test: every annotated instruction below carries the
;; exact machine-code bytes it is expected to be encoded as.
;; NOTE(review): the haswell target presumably supplies the ISA features the blend
;; encodings require -- confirm against the x86 encoding predicates.
test binemit
set enable_simd
target x86_64 haswell
;; Byte-lane vselect. The mask is pinned to %xmm0 and the result reuses %xmm5, the
;; register of the third operand. Expected bytes: prefix 66, opcode 0f 38 10 (PBLENDVB
;; per the Intel SDM, with XMM0 as implicit mask), ModRM 0xeb = mod 11, reg 5 (%xmm5),
;; rm 3 (%xmm3).
function %vselect_i8x16(b8x16, i8x16, i8x16) {
block0(v0: b8x16 [%xmm0], v1: i8x16 [%xmm3], v2: i8x16 [%xmm5]):
[-, %xmm5] v3 = vselect v0, v1, v2 ; bin: 66 0f 38 10 eb
return
}
;; 16-bit-lane vselect. The expected bytes are identical to the byte-lane case
;; (opcode 0f 38 10), i.e. the same byte-granular blend is expected for 16-bit lanes.
;; ModRM 0xeb = mod 11, reg 5 (%xmm5), rm 3 (%xmm3); the mask is pinned to %xmm0.
function %vselect_i16x8(b16x8, i16x8, i16x8) {
block0(v0: b16x8 [%xmm0], v1: i16x8 [%xmm3], v2: i16x8 [%xmm5]):
[-, %xmm5] v3 = vselect v0, v1, v2 ; bin: 66 0f 38 10 eb
return
}
;; 32-bit-lane vselect. Expected bytes: prefix 66, opcode 0f 38 14 (BLENDVPS per the
;; Intel SDM, with XMM0 as implicit mask), ModRM 0xeb = mod 11, reg 5 (%xmm5),
;; rm 3 (%xmm3).
function %vselect_i32x4(b32x4, i32x4, i32x4) {
block0(v0: b32x4 [%xmm0], v1: i32x4 [%xmm3], v2: i32x4 [%xmm5]):
[-, %xmm5] v3 = vselect v0, v1, v2 ; bin: 66 0f 38 14 eb
return
}
;; 64-bit-lane vselect. Expected bytes: prefix 66, opcode 0f 38 15 (BLENDVPD per the
;; Intel SDM, with XMM0 as implicit mask), ModRM 0xeb = mod 11, reg 5 (%xmm5),
;; rm 3 (%xmm3).
function %vselect_i64x2(b64x2, i64x2, i64x2) {
block0(v0: b64x2 [%xmm0], v1: i64x2 [%xmm3], v2: i64x2 [%xmm5]):
[-, %xmm5] v3 = vselect v0, v1, v2 ; bin: 66 0f 38 15 eb
return
}

View File

@@ -0,0 +1,45 @@
;; Header for a legalizer test: the directives inside each function describe the
;; instruction sequence expected after legalization.
;; The target line names no CPU model, unlike the binemit and run tests for vselect,
;; which is what makes the fallback expansion below apply.
test legalizer
set enable_simd
target x86_64
;; Test if vselect gets legalized if BLEND* instructions are not available
;; Byte-lane vselect without blend support. Expected expansion: the boolean mask is
;; reinterpreted as i8x16, then the result is built as (v1 AND mask) OR (v2 AND-NOT mask),
;; with the final `bor` redefining the original result value v3.
function %vselect_i8x16(b8x16, i8x16, i8x16) -> i8x16 {
block0(v0: b8x16, v1: i8x16, v2: i8x16):
v3 = vselect v0, v1, v2
; check: v4 = raw_bitcast.i8x16 v0
; nextln: v5 = band v1, v4
; nextln: v6 = band_not v2, v4
; nextln: v3 = bor v5, v6
return v3
}
;; 16-bit-lane vselect without blend support. Same expected expansion as for the other
;; lane widths; only the bitcast type (i16x8) differs: the mask is reinterpreted as
;; integers and combined as (v1 AND mask) OR (v2 AND-NOT mask) into v3.
function %vselect_i16x8(b16x8, i16x8, i16x8) -> i16x8 {
block0(v0: b16x8, v1: i16x8, v2: i16x8):
v3 = vselect v0, v1, v2
; check: v4 = raw_bitcast.i16x8 v0
; nextln: v5 = band v1, v4
; nextln: v6 = band_not v2, v4
; nextln: v3 = bor v5, v6
return v3
}
;; 32-bit-lane vselect without blend support. Expected expansion: mask reinterpreted as
;; i32x4, then (v1 AND mask) OR (v2 AND-NOT mask), with the `bor` producing v3.
function %vselect_i32x4(b32x4, i32x4, i32x4) -> i32x4 {
block0(v0: b32x4, v1: i32x4, v2: i32x4):
v3 = vselect v0, v1, v2
; check: v4 = raw_bitcast.i32x4 v0
; nextln: v5 = band v1, v4
; nextln: v6 = band_not v2, v4
; nextln: v3 = bor v5, v6
return v3
}
;; 64-bit-lane vselect without blend support. Expected expansion: mask reinterpreted as
;; i64x2, then (v1 AND mask) OR (v2 AND-NOT mask), with the `bor` producing v3.
function %vselect_i64x2(b64x2, i64x2, i64x2) -> i64x2 {
block0(v0: b64x2, v1: i64x2, v2: i64x2):
v3 = vselect v0, v1, v2
; check: v4 = raw_bitcast.i64x2 v0
; nextln: v5 = band v1, v4
; nextln: v6 = band_not v2, v4
; nextln: v3 = bor v5, v6
return v3
}

View File

@@ -0,0 +1,43 @@
;; Header for an executed test of vselect: each function below takes no arguments,
;; builds its operands from constants, and is followed by a run directive with the
;; expected result vector.
test run
set enable_simd
target x86_64 haswell
;; Byte-lane vselect on constants. v2 holds the 100-series values and v3 the 200-series,
;; so every expected lane shows which side was chosen: a `true` mask lane yields the
;; 100-series value, a `false` lane yields the 200-series value.
function %vselect_i8x16() -> i8x16 {
block0:
v1 = vconst.b8x16 [false true false true false true true true true true false false false false false false]
v2 = vconst.i8x16 [100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115]
v3 = vconst.i8x16 [200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215]
v4 = vselect v1, v2, v3
return v4
}
; run: %vselect_i8x16() == [200 101 202 103 204 105 106 107 108 109 210 211 212 213 214 215]
;; 16-bit-lane vselect on constants. `true` mask lanes pick from v2 (100-series) and
;; `false` lanes pick from v3 (200-series), as the expected vector shows lane by lane.
function %vselect_i16x8() -> i16x8 {
block0:
v1 = vconst.b16x8 [false true false true false true true true]
v2 = vconst.i16x8 [100 101 102 103 104 105 106 107]
v3 = vconst.i16x8 [200 201 202 203 204 205 206 207]
v4 = vselect v1, v2, v3
return v4
}
; run: %vselect_i16x8() == [200 101 202 103 204 105 106 107]
;; 32-bit-lane vselect on constants with an alternating mask: `true` lanes pick from v2
;; (100-series), `false` lanes pick from v3 (200-series).
function %vselect_i32x4() -> i32x4 {
block0:
v1 = vconst.b32x4 [false true false true]
v2 = vconst.i32x4 [100 101 102 103]
v3 = vconst.i32x4 [200 201 202 203]
v4 = vselect v1, v2, v3
return v4
}
; run: %vselect_i32x4() == [200 101 202 103]
;; 64-bit-lane vselect on constants: lane 0 has a `false` mask and takes v3's value
;; (200), lane 1 has a `true` mask and takes v2's value (101).
function %vselect_i64x2() -> i64x2 {
block0:
v1 = vconst.b64x2 [false true]
v2 = vconst.i64x2 [100 101]
v3 = vconst.i64x2 [200 201]
v4 = vselect v1, v2, v3
return v4
}
; run: %vselect_i64x2() == [200 101]

View File

@@ -0,0 +1,50 @@
;; Header for a test of the simple_preopt pass: the directives inside each function
;; describe the instructions expected after the pass has run. Functions whose mask
;; qualifies expect a vselect; the others expect the bitselect to be left unchanged.
test simple_preopt
target x86_64
;; Test replacement of bitselect with vselect for special masks
;; The mask v3 is just the icmp result v2 reinterpreted as integers. The expected
;; rewrite drops the reinterpretation from the select and uses the boolean vector v2
;; directly as the vselect condition, keeping the result name v4.
function %mask_from_icmp(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
v2 = icmp eq v0, v1
v3 = raw_bitcast.i8x16 v2
v4 = bitselect v3, v0, v1
; check: v4 = vselect v2, v0, v1
return v4
}
;; Negative case: the mask is a reinterpreted i32x4 argument with unknown bit contents,
;; so nothing guarantees whole lanes are all-ones or all-zeros. The bitselect is
;; expected to survive the pass unchanged.
function %mask_casted(i8x16, i8x16, i32x4) -> i8x16 {
block0(v0: i8x16, v1: i8x16, v2: i32x4):
v3 = raw_bitcast.i8x16 v2
v4 = bitselect v3, v0, v1
; check: v4 = bitselect v3, v0, v1
return v4
}
;; Constant byte-lane mask in which every byte is 0 or 0xFF. The expected rewrite
;; reinterprets the constant as a boolean vector (new value v5) and replaces the
;; bitselect by a vselect on the original operands.
function %good_const_mask_i8x16(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
v3 = vconst.i8x16 [0 0 0xFF 0 0 0xFF 0 0 0 0 0xFF 0 0 0 0 0xFF]
v4 = bitselect v3, v0, v1
; check: v5 = raw_bitcast.b8x16 v3
; nextln: v4 = vselect v5, v0, v1
return v4
}
;; Constant 16-bit-lane mask where some lanes are only half set (0xFF00, 0x00FF) but
;; every individual byte is 0x00 or 0xFF. The expected rewrite therefore works at byte
;; granularity: mask and both operands are reinterpreted as 16 byte lanes, a byte-lane
;; vselect is performed, and the result is reinterpreted back to i16x8 as v4.
function %good_const_mask_i16x8(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v3 = vconst.i16x8 [0x0000 0xFF00 0x0000 0x00FF 0x0000 0xFFFF 0x00FF 0xFFFF]
v4 = bitselect v3, v0, v1
; check: v5 = raw_bitcast.b8x16 v3
; nextln: v6 = raw_bitcast.i8x16 v0
; nextln: v7 = raw_bitcast.i8x16 v1
; nextln: v8 = vselect v5, v6, v7
; nextln: v4 = raw_bitcast.i16x8 v8
return v4
}
;; Negative case: the third mask byte is 0xF0, i.e. only partially set, so a per-byte
;; select would give a different result than the bitwise one. The bitselect is expected
;; to survive the pass unchanged.
function %bad_const_mask(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
v3 = vconst.i8x16 [0 0 0xF0 0 0 0xFF 0 0 0 0 0xFF 0 0 0 0 0xFF]
v4 = bitselect v3, v0, v1
; check: v4 = bitselect v3, v0, v1
return v4
}