x64: Sink constant loads into xmm instructions (#5880)

A number of places in the x64 backend use 128-bit constants for various
wasm SIMD-related instructions, but most of them currently go through the
`x64_xmm_load_const` helper to load the constant into a register first.
Almost all xmm instructions, however, accept a memory operand, which means
these loads can be folded into the instructions themselves to help reduce
register pressure. This commit adds an automatic conversion from a
`VCodeConstant` to an `XmmMem` value and removes the explicit loads in
favor of forwarding the `XmmMem` value directly to the underlying
instruction. Note that some uses of `x64_xmm_load_const` remain since they
appear in contexts where load sinking won't work (e.g. the constant is the
first operand, not the second, of a non-commutative instruction).
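
As a rough sketch of the mechanism (the types below are simplified
stand-ins for the Cranelift types of the same name, and the lowering
helpers are hypothetical), the change amounts to letting a pooled constant
be used directly wherever an `XmmMem` operand is accepted instead of first
materializing it into a scratch register:

```rust
// Rough sketch only: `VCodeConstant` and `XmmMem` are simplified stand-ins
// for the Cranelift types of the same name; the lowering helpers are
// hypothetical and just print the assembly they would emit.

/// Handle to a 128-bit constant placed in the function's constant pool.
#[derive(Clone, Copy)]
struct VCodeConstant(u32);

/// An operand accepted by most xmm instructions: either a register or a
/// memory location (which covers RIP-relative constant-pool references).
#[allow(dead_code)]
enum XmmMem {
    Reg(u8),
    Const(VCodeConstant),
}

/// The automatic conversion added by this change: a constant can be handed
/// to anything expecting an `XmmMem` operand without an explicit load.
impl From<VCodeConstant> for XmmMem {
    fn from(c: VCodeConstant) -> Self {
        XmmMem::Const(c)
    }
}

/// Before: materialize the constant with a separate `movdqu` into a scratch
/// register, then use the register operand (extra instruction, extra
/// register pressure).
fn lower_pshufb_before(dst: u8, mask: VCodeConstant) -> Vec<String> {
    vec![
        format!("movdqu const({}), %xmm7", mask.0),
        format!("pshufb %xmm7, %xmm{dst}"),
    ]
}

/// After: forward the constant as the instruction's memory operand, sinking
/// the load into the `pshufb` itself.
fn lower_pshufb_after(dst: u8, mask: VCodeConstant) -> Vec<String> {
    let operand = match XmmMem::from(mask) {
        XmmMem::Reg(r) => format!("%xmm{r}"),
        XmmMem::Const(c) => format!("const({})", c.0),
    };
    vec![format!("pshufb {operand}, %xmm{dst}")]
}

fn main() {
    let mask = VCodeConstant(0);
    println!("{:?}", lower_pshufb_before(0, mask)); // two instructions, one scratch reg
    println!("{:?}", lower_pshufb_after(0, mask)); // one instruction, no scratch reg
}
```

This mirrors the filetest updates below, where standalone `movdqu const(N)`
loads disappear and `pshufb`/`paddusb` take `const(N)` (RIP-relative in the
disassembly) operands directly.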
Alex Crichton
2023-02-27 16:02:42 -06:00
committed by GitHub
parent 9b86a0b9b1
commit f2dce812c3
11 changed files with 147 additions and 182 deletions

@@ -17,12 +17,10 @@ block0:
; movq %rsp, %rbp
; block0:
; movdqu const(3), %xmm0
-; movdqu const(2), %xmm4
-; movdqu const(0), %xmm2
-; pshufb %xmm0, %xmm2, %xmm0
-; movdqu const(1), %xmm6
-; pshufb %xmm4, %xmm6, %xmm4
-; por %xmm0, %xmm4, %xmm0
+; movdqu const(2), %xmm2
+; pshufb %xmm0, const(0), %xmm0
+; pshufb %xmm2, const(1), %xmm2
+; por %xmm0, %xmm2, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
@@ -32,13 +30,11 @@ block0:
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
-; movdqu 0x64(%rip), %xmm0
-; movdqu 0x4c(%rip), %xmm4
-; movdqu 0x24(%rip), %xmm2
-; pshufb %xmm2, %xmm0
-; movdqu 0x27(%rip), %xmm6
-; pshufb %xmm6, %xmm4
-; por %xmm4, %xmm0
+; movdqu 0x54(%rip), %xmm0
+; movdqu 0x3c(%rip), %xmm2
+; pshufb 0x13(%rip), %xmm0
+; pshufb 0x1a(%rip), %xmm2
+; por %xmm2, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
@@ -50,10 +46,6 @@ block0:
; addb %al, (%rax)
; addb %al, (%rax)
; addb %al, (%rax)
-; addb %al, (%rax)
-; addb %al, (%rax)
-; addb %al, (%rax)
-; addb %al, (%rax)
; addb $0x80, -0x7f7f7f80(%rax)
; addb $0x80, -0x7f7f7f80(%rax)
; addb $0, 0x101(%rax)
@@ -84,8 +76,7 @@ block0:
; movq %rsp, %rbp
; block0:
; movdqu const(1), %xmm0
-; movdqu const(0), %xmm1
-; pshufb %xmm0, %xmm1, %xmm0
+; pshufb %xmm0, const(0), %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
@@ -96,8 +87,7 @@ block0:
; movq %rsp, %rbp
; block1: ; offset 0x4
; movdqu 0x24(%rip), %xmm0
-; movdqu 0xc(%rip), %xmm1
-; pshufb %xmm1, %xmm0
+; pshufb 0xb(%rip), %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
@@ -109,6 +99,8 @@ block0:
; addb %al, (%rax)
; addb %al, (%rax)
; addb %al, (%rax)
+; addb %al, (%rax)
+; addb %al, (%rax)
; addb %al, (%rcx, %rax)
; addb %al, (%rax)
; addb %al, (%rax)
@@ -131,10 +123,9 @@ block0:
; movq %rsp, %rbp
; block0:
; movdqu const(1), %xmm0
-; movdqu const(1), %xmm2
-; movdqu const(0), %xmm3
-; paddusb %xmm2, %xmm3, %xmm2
-; pshufb %xmm0, %xmm2, %xmm0
+; movdqu const(1), %xmm1
+; paddusb %xmm1, const(0), %xmm1
+; pshufb %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
@@ -145,16 +136,17 @@ block0:
; movq %rsp, %rbp
; block1: ; offset 0x4
; movdqu 0x34(%rip), %xmm0
-; movdqu 0x2c(%rip), %xmm2
-; movdqu 0x14(%rip), %xmm3
-; paddusb %xmm3, %xmm2
-; pshufb %xmm2, %xmm0
+; movdqu 0x2c(%rip), %xmm1
+; paddusb 0x14(%rip), %xmm1
+; pshufb %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
; addb %al, (%rax)
; addb %al, (%rax)
; addb %al, (%rax)
+; addb %al, (%rax)
+; addb %al, (%rax)
; jo 0xa2
; jo 0xa4
; jo 0xa6