From 52b4c48a1b27b80247ccf31c9fe3f64d4f56b423 Mon Sep 17 00:00:00 2001
From: Alex Crichton
Date: Wed, 1 Mar 2023 23:43:42 -0600
Subject: [PATCH] x64: Improve codegen for i8x16.shr_u (#5906)

This catches a case that wasn't handled by #5880, allowing a constant
load to be folded into the instruction that uses it rather than being
loaded into a temporary register first.
---
 cranelift/codegen/src/isa/x64/lower.isle   |  6 ++--
 .../filetests/isa/x64/simd-arith-avx.clif  | 17 ++++++-----
 .../isa/x64/simd-bitwise-compile.clif      | 30 +++++++------------
 3 files changed, 22 insertions(+), 31 deletions(-)

diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle
index 9f83ccbfb9..ec21968b38 100644
--- a/cranelift/codegen/src/isa/x64/lower.isle
+++ b/cranelift/codegen/src/isa/x64/lower.isle
@@ -522,12 +522,10 @@
            ;; Shift `src` using 16x8. Unfortunately, a 16x8 shift will only be
            ;; correct for half of the lanes; the others must be fixed up with
            ;; the mask below.
-           (unmasked Xmm (x64_psrlw src (mov_rmi_to_xmm masked_amt)))
-           (mask_addr SyntheticAmode (ushr_i8x16_mask masked_amt))
-           (mask Reg (x64_load $I8X16 mask_addr (ExtKind.None))))
+           (unmasked Xmm (x64_psrlw src (mov_rmi_to_xmm masked_amt))))
        (sse_and $I8X16
                 unmasked
-                (RegMem.Reg mask))))
+                (ushr_i8x16_mask masked_amt))))
 
 ;; Get the address of the mask to use when fixing up the lanes that weren't
 ;; correctly generated by the 16x8 shift.
diff --git a/cranelift/filetests/filetests/isa/x64/simd-arith-avx.clif b/cranelift/filetests/filetests/isa/x64/simd-arith-avx.clif
index 3e622bd5f9..fdd58032ea 100644
--- a/cranelift/filetests/filetests/isa/x64/simd-arith-avx.clif
+++ b/cranelift/filetests/filetests/isa/x64/simd-arith-avx.clif
@@ -1635,8 +1635,7 @@ block0(v0: i8x16, v1: i32):
 ; vpsrlw %xmm0, %xmm5, %xmm7
 ; lea const(0), %rsi
 ; shlq $4, %r10, %r10
-; movdqu 0(%rsi,%r10,1), %xmm13
-; vpand %xmm7, %xmm13, %xmm0
+; vpand %xmm7, 0(%rsi,%r10,1), %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -1652,11 +1651,13 @@ block0(v0: i8x16, v1: i32):
 ; vpsrlw %xmm5, %xmm0, %xmm7
 ; leaq 0x15(%rip), %rsi
 ; shlq $4, %r10
-; movdqu (%rsi, %r10), %xmm13
-; vpand %xmm13, %xmm7, %xmm0
+; vpand (%rsi, %r10), %xmm7, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; retq
+; addb %al, (%rax)
+; addb %al, (%rax)
+; addb %bh, %bh
 
 function %i8x16_ushr_imm(i8x16) -> i8x16 {
 block0(v0: i8x16):
@@ -1670,8 +1671,7 @@ block0(v0: i8x16):
 ; movq %rsp, %rbp
 ; block0:
 ; vpsrlw %xmm0, $1, %xmm2
-; movdqu const(0), %xmm4
-; vpand %xmm2, %xmm4, %xmm0
+; vpand %xmm2, const(0), %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -1682,14 +1682,15 @@ block0(v0: i8x16):
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
 ; vpsrlw $1, %xmm0, %xmm2
-; movdqu 0xf(%rip), %xmm4
-; vpand %xmm4, %xmm2, %xmm0
+; vpand 0xf(%rip), %xmm2, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; retq
 ; addb %al, (%rax)
 ; addb %al, (%rax)
 ; addb %al, (%rax)
+; addb %al, (%rax)
+; addb %al, (%rax)
 ; jg 0xa1
 ; jg 0xa3
 ; jg 0xa5
diff --git a/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif b/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif
index 1ecdf31ed0..f63cc22313 100644
--- a/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif
+++ b/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif
@@ -485,8 +485,7 @@ block0:
 ; block0:
 ; movdqu const(1), %xmm0
 ; psrlw %xmm0, $1, %xmm0
-; movdqu const(0), %xmm3
-; pand %xmm0, %xmm3, %xmm0
+; pand %xmm0, const(0), %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -496,28 +495,21 @@ block0:
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
-; movdqu 0x34(%rip), %xmm0
+; movdqu 0x24(%rip), %xmm0
 ; psrlw $1, %xmm0
-; movdqu 0x17(%rip), %xmm3
-; pand %xmm3, %xmm0
+; pand 7(%rip), %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; retq
 ; addb %al, (%rax)
-; addb %al, (%rax)
-; addb %al, (%rax)
-; addb %al, (%rax)
-; addb %al, (%rax)
-; addb %al, (%rax)
-; addb %al, (%rax)
-; jg 0xb1
-; jg 0xb3
-; jg 0xb5
-; jg 0xb7
-; jg 0xb9
-; jg 0xbb
-; jg 0xbd
-; jg 0xbf
+; jg 0xa1
+; jg 0xa3
+; jg 0xa5
+; jg 0xa7
+; jg 0xa9
+; jg 0xab
+; jg 0xad
+; jg 0xaf
 ; addb %al, (%rcx)
 ; addb (%rbx), %al
 ; addb $5, %al
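
Note on the lowering above: x64 has no packed 8-bit shift, so `i8x16.shr_u` is lowered as a 16-bit `psrlw` followed by an AND with the constant that `ushr_i8x16_mask` returns, which clears the bits that leak into each byte from its neighboring byte. The sketch below is a plain-Rust scalar model of that equivalence; it is illustrative only and not part of this patch, and the function name is made up rather than a Cranelift API.

// Scalar model of the i8x16.shr_u lowering: shift 16-bit lanes, then AND
// each byte with 0xFF >> amt to clear the bits that leaked in from the
// neighboring byte. (Illustrative only; not Cranelift code.)
fn i8x16_ushr_scalar_model(lanes: [u8; 16], amt: u32) -> [u8; 16] {
    let amt = amt & 7; // shift amounts are masked to the lane width
    let mask = 0xFFu8 >> amt; // per-byte fixup mask
    let mut out = [0u8; 16];
    for (i, pair) in lanes.chunks_exact(2).enumerate() {
        // Model the 16x8 (psrlw-style) shift on two adjacent bytes.
        let wide = u16::from_le_bytes([pair[0], pair[1]]) >> amt;
        let [lo, hi] = wide.to_le_bytes();
        out[2 * i] = lo & mask;
        out[2 * i + 1] = hi & mask;
    }
    out
}

fn main() {
    let v: [u8; 16] = [0x80, 0xFF, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14];
    let shifted = i8x16_ushr_scalar_model(v, 1);
    // Every byte matches a plain per-byte unsigned shift.
    assert!(shifted.iter().zip(v.iter()).all(|(&s, &b)| s == b >> 1));
    println!("{:?}", shifted);
}

With the mask folded directly into `pand`/`vpand` as a memory operand, the separate `movdqu` into a temporary register disappears, which is exactly the change visible in the filetest expectations above.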