Optimize vconst for x86 when immediate contains all zeroes or ones

Instead of using MOVUPS to perform an expensive load of the bits from memory, this change uses a predicate to optimize vconst so that no memory access is needed:
 - when every bit of the 128-bit immediate is zero, use PXOR to zero out an XMM register
 - when every bit of the 128-bit immediate is one, use PCMPEQB to set an XMM register to all ones

This still leaves the (now unreferenced) constant data in the constant pool, which may increase code size (TODO: stop emitting constant-pool entries that are no longer loaded)
This commit is contained in:
Andrew Brown
2019-08-28 15:29:40 -07:00
parent 694de912a5
commit 702155b19b
7 changed files with 154 additions and 3 deletions

View File

@@ -5,7 +5,7 @@ target x86_64
function %test_vconst_b8() {
ebb0:
; NOTE(review): this is a rendered diff hunk whose +/- markers were lost; the four
; bindings below are the old (0x00 / 0x01) and new (0x01 / 0x02) versions of the
; same two instructions — v0 and v1 each appear twice. All of them still encode a
; constant-pool load (0f 10 = MOVUPS, address fixed up via PCRelRodata4).
[-, %xmm2] v0 = vconst.b8x16 0x00 ; bin: 0f 10 15 00000008 PCRelRodata4(15)
[-, %xmm3] v1 = vconst.b8x16 0x01 ; bin: 0f 10 1d 00000011 PCRelRodata4(31)
[-, %xmm2] v0 = vconst.b8x16 0x01 ; bin: 0f 10 15 00000008 PCRelRodata4(15)
[-, %xmm3] v1 = vconst.b8x16 0x02 ; bin: 0f 10 1d 00000011 PCRelRodata4(31)
return
}

View File

@@ -0,0 +1,23 @@
test run
set enable_simd
target x86_64
; TODO move to vconst-run.clif
function %test_vconst_zeroes() -> b1 {
ebb0:
v0 = vconst.i8x16 0x00
v1 = extractlane v0, 4
v2 = icmp_imm eq v1, 0
return v2
}
; run
function %test_vconst_ones() -> b1 {
ebb0:
v0 = vconst.i8x16 0xffffffffffffffffffffffffffffffff
v1 = extractlane v0, 2
v2 = icmp_imm eq v1, 0xff
return v2
}
; run

View File

@@ -0,0 +1,12 @@
test binemit
set enable_simd
target x86_64
; TODO move to vconst-compile.clif or vconst-binemit.clif
function %test_vconst_optimizations() {
ebb0:
; all-zeroes immediate: emitted as PXOR %xmm4, %xmm4 (66 0f ef, ModRM e4)
; instead of a MOVUPS constant-pool load
[-, %xmm4] v0 = vconst.b8x16 0x00 ; bin: 66 0f ef e4
; all-ones immediate: emitted as PCMPEQB %xmm7, %xmm7 (66 0f 74, ModRM ff)
; instead of a MOVUPS constant-pool load
[-, %xmm7] v1 = vconst.b8x16 0xffffffffffffffffffffffffffffffff ; bin: 66 0f 74 ff
return
}