Optimize vconst for x86 when immediate contains all zeroes or ones

Instead of using MOVUPS to perform an expensive load of the bits from memory, this change uses a predicate to optimize vconst so that no memory access is needed:
 - when every bit of the 128-bit immediate is zero, use PXOR to zero out an XMM register
 - when every bit of the 128-bit immediate is one, use PCMPEQB to set an XMM register to all ones

This still leaves the (now unreferenced) constant data in the constant pool, which may increase code size (TODO: stop emitting constant-pool entries that are no longer loaded)
This commit is contained in:
Andrew Brown
2019-08-28 15:29:40 -07:00
parent 694de912a5
commit 702155b19b
7 changed files with 154 additions and 3 deletions

View File

@@ -5,7 +5,7 @@ target x86_64
function %test_vconst_b8() {
ebb0:
; NOTE(review): this is a rendered diff hunk whose +/- markers were lost; the four
; bindings below are the old (0x00 / 0x01) and new (0x01 / 0x02) versions of the
; same two instructions — v0 and v1 each appear twice. All of them still encode a
; constant-pool load (0f 10 = MOVUPS, address fixed up via PCRelRodata4).
[-, %xmm2] v0 = vconst.b8x16 0x00 ; bin: 0f 10 15 00000008 PCRelRodata4(15)
[-, %xmm3] v1 = vconst.b8x16 0x01 ; bin: 0f 10 1d 00000011 PCRelRodata4(31)
[-, %xmm2] v0 = vconst.b8x16 0x01 ; bin: 0f 10 15 00000008 PCRelRodata4(15)
[-, %xmm3] v1 = vconst.b8x16 0x02 ; bin: 0f 10 1d 00000011 PCRelRodata4(31)
return
}

View File

@@ -0,0 +1,23 @@
test run
set enable_simd
target x86_64
; TODO move to vconst-run.clif
function %test_vconst_zeroes() -> b1 {
ebb0:
v0 = vconst.i8x16 0x00
v1 = extractlane v0, 4
v2 = icmp_imm eq v1, 0
return v2
}
; run
function %test_vconst_ones() -> b1 {
ebb0:
v0 = vconst.i8x16 0xffffffffffffffffffffffffffffffff
v1 = extractlane v0, 2
v2 = icmp_imm eq v1, 0xff
return v2
}
; run

View File

@@ -0,0 +1,12 @@
test binemit
set enable_simd
target x86_64
; TODO move to vconst-compile.clif or vconst-binemit.clif
function %test_vconst_optimizations() {
ebb0:
; all-zeroes immediate: emitted as PXOR %xmm4, %xmm4 (66 0f ef, ModRM e4)
; instead of a MOVUPS constant-pool load
[-, %xmm4] v0 = vconst.b8x16 0x00 ; bin: 66 0f ef e4
; all-ones immediate: emitted as PCMPEQB %xmm7, %xmm7 (66 0f 74, ModRM ff)
; instead of a MOVUPS constant-pool load
[-, %xmm7] v1 = vconst.b8x16 0xffffffffffffffffffffffffffffffff ; bin: 66 0f 74 ff
return
}