From cae3b26623e98044c4cc96a18ef2bd50edf01da4 Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Thu, 16 Feb 2023 14:47:59 -0600 Subject: [PATCH] x64: Improve codegen for vectors with constant shift amounts (#5797) I stumbled across this working on #5795 and figured this was a nice opportunity to improve the codegen here. --- cranelift/codegen/src/isa/x64/inst.isle | 4 + cranelift/codegen/src/isa/x64/lower.isle | 36 +- cranelift/codegen/src/isa/x64/lower/isle.rs | 4 + .../isa/x64/simd-bitwise-compile.clif | 363 +++++++++++++++--- 4 files changed, 344 insertions(+), 63 deletions(-) diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle index cfc2afc618..715d9fd879 100644 --- a/cranelift/codegen/src/isa/x64/inst.isle +++ b/cranelift/codegen/src/isa/x64/inst.isle @@ -1455,6 +1455,10 @@ (decl shift_mask (Type) u32) (extern constructor shift_mask shift_mask) +;; Mask a constant with the type's shift mask +(decl shift_amount_masked (Type Imm64) u32) +(extern constructor shift_amount_masked shift_amount_masked) + ;; Extract a constant `GprMemImm.Imm` from a value operand. (decl simm32_from_value (GprMemImm) Value) (extern extractor simm32_from_value simm32_from_value) diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index e8fd01f840..1275ab2ad2 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -473,7 +473,7 @@ (rule (lower (has_type ty @ $I8X16 (ishl src amt))) (let ( ;; Mask the amount to ensure wrapping behaviour - (masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))) + (masked_amt RegMemImm (mask_xmm_shift ty amt)) ;; Shift `src` using 16x8. Unfortunately, a 16x8 shift will only be ;; correct for half of the lanes; the others must be fixed up with ;; the mask below. @@ -515,16 +515,13 @@ ;; 16x8, 32x4, and 64x2 shifts can each use a single instruction, once the shift amount is masked. (rule (lower (has_type ty @ $I16X8 (ishl src amt))) - (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))) - (x64_psllw src (mov_rmi_to_xmm masked_amt)))) + (x64_psllw src (mov_rmi_to_xmm (mask_xmm_shift ty amt)))) (rule (lower (has_type ty @ $I32X4 (ishl src amt))) - (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))) - (x64_pslld src (mov_rmi_to_xmm masked_amt)))) + (x64_pslld src (mov_rmi_to_xmm (mask_xmm_shift ty amt)))) (rule (lower (has_type ty @ $I64X2 (ishl src amt))) - (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))) - (x64_psllq src (mov_rmi_to_xmm masked_amt)))) + (x64_psllq src (mov_rmi_to_xmm (mask_xmm_shift ty amt)))) ;;;; Rules for `ushr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -580,7 +577,7 @@ (rule (lower (has_type ty @ $I8X16 (ushr src amt))) (let ( ;; Mask the amount to ensure wrapping behaviour - (masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))) + (masked_amt RegMemImm (mask_xmm_shift ty amt)) ;; Shift `src` using 16x8. Unfortunately, a 16x8 shift will only be ;; correct for half of the lanes; the others must be fixed up with ;; the mask below. @@ -625,16 +622,19 @@ ;; 16x8, 32x4, and 64x2 shifts can each use a single instruction, once the shift amount is masked. 
(rule (lower (has_type ty @ $I16X8 (ushr src amt))) - (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))) - (x64_psrlw src (mov_rmi_to_xmm masked_amt)))) + (x64_psrlw src (mov_rmi_to_xmm (mask_xmm_shift ty amt)))) (rule (lower (has_type ty @ $I32X4 (ushr src amt))) - (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))) - (x64_psrld src (mov_rmi_to_xmm masked_amt)))) + (x64_psrld src (mov_rmi_to_xmm (mask_xmm_shift ty amt)))) (rule (lower (has_type ty @ $I64X2 (ushr src amt))) - (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))) - (x64_psrlq src (mov_rmi_to_xmm masked_amt)))) + (x64_psrlq src (mov_rmi_to_xmm (mask_xmm_shift ty amt)))) + +(decl mask_xmm_shift (Type Value) RegMemImm) +(rule (mask_xmm_shift ty amt) + (gpr_to_reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))) +(rule 1 (mask_xmm_shift ty (iconst n)) + (RegMemImm.Imm (shift_amount_masked ty n))) ;;;; Rules for `sshr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -701,7 +701,7 @@ (rule (lower (has_type ty @ $I8X16 (sshr src amt @ (value_type amt_ty)))) (let ((src_ Xmm (put_in_xmm src)) ;; Mask the amount to ensure wrapping behaviour - (masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))) + (masked_amt RegMemImm (mask_xmm_shift ty amt)) ;; In order for `packsswb` later to only use the high byte of each ;; 16x8 lane, we shift right an extra 8 bits, relying on `psraw` to ;; fill in the upper bits appropriately. @@ -728,12 +728,10 @@ ;; that if the shift amount is in a register, it is in an XMM register. (rule (lower (has_type ty @ $I16X8 (sshr src amt))) - (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))) - (x64_psraw src (mov_rmi_to_xmm masked_amt)))) + (x64_psraw src (mov_rmi_to_xmm (mask_xmm_shift ty amt)))) (rule (lower (has_type ty @ $I32X4 (sshr src amt))) - (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))) - (x64_psrad src (mov_rmi_to_xmm masked_amt)))) + (x64_psrad src (mov_rmi_to_xmm (mask_xmm_shift ty amt)))) ;; The `sshr.i64x2` CLIF instruction has no single x86 instruction in the older ;; feature sets. 
Newer ones like AVX512VL + AVX512F include `vpsraq`, a 128-bit diff --git a/cranelift/codegen/src/isa/x64/lower/isle.rs b/cranelift/codegen/src/isa/x64/lower/isle.rs index 7d97d761ed..26766dc3d6 100644 --- a/cranelift/codegen/src/isa/x64/lower/isle.rs +++ b/cranelift/codegen/src/isa/x64/lower/isle.rs @@ -259,6 +259,10 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> { ty.lane_bits() - 1 } + fn shift_amount_masked(&mut self, ty: Type, val: Imm64) -> u32 { + (val.bits() as u32) & self.shift_mask(ty) + } + #[inline] fn simm32_from_value(&mut self, val: Value) -> Option { let inst = self.lower_ctx.dfg().value_def(val).inst()?; diff --git a/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif b/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif index 056f256013..2c56dfd3c8 100644 --- a/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif +++ b/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif @@ -360,6 +360,117 @@ block0(v0: i32): ; addb %al, (%rax) ; addb %al, (%rax) +function %ishl_i8x16_imm(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = iconst.i32 124 + v2 = ishl v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; psllw %xmm0, $4, %xmm0 +; movdqu const(0), %xmm4 +; pand %xmm0, %xmm4, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; psllw $4, %xmm0 +; movdqu 0xf(%rip), %xmm4 +; pand %xmm4, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) + +function %ishl_i16x8_imm(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iconst.i32 1 + v2 = ishl v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; psllw %xmm0, $1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; psllw $1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %ishl_i32x4_imm(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iconst.i32 100 + v2 = ishl v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pslld %xmm0, $4, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pslld $4, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %ishl_i64x2_imm(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = iconst.i32 100 + v2 = ishl v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; psllq %xmm0, $36, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; psllq $0x24, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + function %ushr_i8x16_imm() -> i8x16 { block0: v0 = iconst.i32 1 @@ -373,14 +484,9 @@ block0: ; movq %rsp, %rbp ; block0: ; movdqu const(1), %xmm0 -; movl $1, %r9d -; andq %r9, $7, %r9 -; movd %r9d, %xmm5 -; psrlw %xmm0, %xmm5, %xmm0 -; lea const(0), %rsi -; shlq $4, %r9, %r9 -; movdqu 0(%rsi,%r9,1), %xmm13 -; pand %xmm0, %xmm13, %xmm0 +; psrlw %xmm0, $1, %xmm0 +; movdqu const(0), %xmm3 +; pand %xmm0, %xmm3, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -390,21 +496,109 @@ block0: ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movdqu 0xb4(%rip), %xmm0 -; movl $1, %r9d -; andq $7, %r9 -; movd %r9d, %xmm5 -; psrlw %xmm5, %xmm0 -; leaq 0x1a(%rip), %rsi -; shlq $4, %r9 -; movdqu (%rsi, %r9), %xmm13 -; pand %xmm13, %xmm0 +; 
movdqu 0x34(%rip), %xmm0 +; psrlw $1, %xmm0 +; movdqu 0x17(%rip), %xmm3 +; pand %xmm3, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq ; addb %al, (%rax) ; addb %al, (%rax) -; addb %bh, %bh +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; jg 0xb1 +; jg 0xb3 +; jg 0xb5 +; jg 0xb7 +; jg 0xb9 +; jg 0xbb +; jg 0xbd +; jg 0xbf +; addb %al, (%rcx) +; addb (%rbx), %al +; addb $5, %al + +function %ushr_i16x8_imm(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iconst.i32 1 + v2 = ushr v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; psrlw %xmm0, $1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; psrlw $1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %ushr_i32x4_imm(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iconst.i32 100 + v2 = ushr v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; psrld %xmm0, $4, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; psrld $4, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %ushr_i64x2_imm(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = iconst.i32 100 + v2 = ushr v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; psrlq %xmm0, $36, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; psrlq $0x24, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq function %sshr_i8x16(i32) -> i8x16 { block0(v0: i32): @@ -465,19 +659,15 @@ block0(v0: i8x16, v1: i32): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movl $3, %r10d -; andq %r10, $7, %r10 -; movdqa %xmm0, %xmm13 -; punpcklbw %xmm13, %xmm0, %xmm13 -; movdqa %xmm13, %xmm12 -; movdqa %xmm0, %xmm13 -; punpckhbw %xmm13, %xmm0, %xmm13 -; addl %r10d, $8, %r10d -; movd %r10d, %xmm14 -; movdqa %xmm12, %xmm0 -; psraw %xmm0, %xmm14, %xmm0 -; psraw %xmm13, %xmm14, %xmm13 -; packsswb %xmm0, %xmm13, %xmm0 +; movdqa %xmm0, %xmm7 +; punpcklbw %xmm7, %xmm0, %xmm7 +; movdqa %xmm7, %xmm8 +; movdqa %xmm0, %xmm7 +; punpckhbw %xmm7, %xmm0, %xmm7 +; movdqa %xmm8, %xmm0 +; psraw %xmm0, $11, %xmm0 +; psraw %xmm7, $11, %xmm7 +; packsswb %xmm0, %xmm7, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -487,19 +677,104 @@ block0(v0: i8x16, v1: i32): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movl $3, %r10d -; andq $7, %r10 -; movdqa %xmm0, %xmm13 -; punpcklbw %xmm0, %xmm13 -; movdqa %xmm13, %xmm12 -; movdqa %xmm0, %xmm13 -; punpckhbw %xmm0, %xmm13 -; addl $8, %r10d -; movd %r10d, %xmm14 -; movdqa %xmm12, %xmm0 -; psraw %xmm14, %xmm0 -; psraw %xmm14, %xmm13 -; packsswb %xmm13, %xmm0 +; movdqa %xmm0, %xmm7 +; punpcklbw %xmm0, %xmm7 +; movdqa %xmm7, %xmm8 +; movdqa %xmm0, %xmm7 +; punpckhbw %xmm0, %xmm7 +; movdqa %xmm8, %xmm0 +; psraw $0xb, %xmm0 +; psraw $0xb, %xmm7 +; packsswb %xmm7, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %sshr_i16x8_imm(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iconst.i32 1 + v2 = sshr v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; psraw %xmm0, $1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; psraw $1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %sshr_i32x4_imm(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iconst.i32 
100 + v2 = sshr v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; psrad %xmm0, $4, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; psrad $4, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %sshr_i64x2_imm(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = iconst.i32 100 + v2 = sshr v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pextrd.w $0, %xmm0, %rdx +; pextrd.w $1, %xmm0, %r9 +; sarq $36, %rdx, %rdx +; sarq $36, %r9, %r9 +; uninit %xmm0 +; pinsrd.w $0, %xmm0, %rdx, %xmm0 +; pinsrd.w $1, %xmm0, %r9, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pextrq $0, %xmm0, %rdx +; pextrq $1, %xmm0, %r9 +; sarq $0x24, %rdx +; sarq $0x24, %r9 +; pinsrq $0, %rdx, %xmm0 +; pinsrq $1, %r9, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq
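
Note on the lowering change above: the new `mask_xmm_shift` helper has two rules. When the shift amount is an `iconst`, the higher-priority rule folds it into a `RegMemImm.Imm` by masking it with the lane width at compile time (the `shift_amount_masked` callback computes `val & (lane_bits - 1)`), which is why the filetests show `ishl.i8x16` by 124 lowering to `psllw $4` plus the byte-lane fixup mask, and `ushr.i64x2` by 100 lowering to `psrlq $36`. For non-constant amounts the fallback rule keeps the old behaviour: mask with `and` in a GPR and move the result into an XMM register.

A minimal standalone sketch of that compile-time masking, in plain Rust and independent of the Cranelift types (`lane_bits` here stands in for `Type::lane_bits()`, and the assertions mirror the filetest expectations above):

// Sketch of what `shift_amount_masked` does: a vector shift by `amt`
// only uses `amt % lane_bits`, so a constant amount can be reduced up
// front and emitted in the instruction's immediate field.
fn shift_amount_masked(lane_bits: u32, amt: u64) -> u32 {
    // `lane_bits` is a power of two (8, 16, 32, 64), so `lane_bits - 1`
    // is the same mask the register path applies with `and`.
    (amt as u32) & (lane_bits - 1)
}

fn main() {
    // i8x16 shifted by 124 -> immediate 4 (psllw $4 + mask)
    assert_eq!(shift_amount_masked(8, 124), 4);
    // i32x4 shifted by 100 -> immediate 4 (pslld/psrld/psrad $4)
    assert_eq!(shift_amount_masked(32, 100), 4);
    // i64x2 shifted by 100 -> immediate 36 (psllq/psrlq $36, sarq $0x24)
    assert_eq!(shift_amount_masked(64, 100), 36);
    println!("constant vector shift amounts fold to immediates");
}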