x64: Improve codegen for vectors with constant shift amounts (#5797)

I stumbled across this working on #5795 and figured this was a nice
opportunity to improve the codegen here.
Author: Alex Crichton
Date: 2023-02-16 14:47:59 -06:00
Committed by: GitHub
Parent: 1efee4abdf
Commit: cae3b26623
4 changed files with 344 additions and 63 deletions
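The codegen win comes from the constant case: a vector shift only uses the amount modulo the lane width (that is what `shift_mask` encodes, and what the "wrapping behaviour" comments in the rules below refer to), so when the amount is an `iconst` the mask can be applied at lowering time and emitted as an immediate, instead of emitting a runtime `and` and moving the amount into an XMM register. A minimal Rust sketch of that wrapping behaviour, using hypothetical standalone helpers rather than anything in this diff:

// Scalar model of a 32x4 vector left shift with wrapping shift-amount
// semantics: only the amount modulo the 32-bit lane width matters.
fn ishl_i32x4(lanes: [u32; 4], amt: u64) -> [u32; 4] {
    let amt = (amt & 31) as u32; // the same mask the backend now folds for constants
    lanes.map(|lane| lane << amt)
}

fn main() {
    // Shifting by 35 behaves like shifting by 3, since 35 & 31 == 3.
    assert_eq!(ishl_i32x4([1, 2, 3, 4], 35), ishl_i32x4([1, 2, 3, 4], 3));
}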

@@ -1455,6 +1455,10 @@
 (decl shift_mask (Type) u32)
 (extern constructor shift_mask shift_mask)
 
+;; Mask a constant with the type's shift mask
+(decl shift_amount_masked (Type Imm64) u32)
+(extern constructor shift_amount_masked shift_amount_masked)
+
 ;; Extract a constant `GprMemImm.Imm` from a value operand.
 (decl simm32_from_value (GprMemImm) Value)
 (extern extractor simm32_from_value simm32_from_value)

@@ -473,7 +473,7 @@
 (rule (lower (has_type ty @ $I8X16 (ishl src amt)))
       (let (
             ;; Mask the amount to ensure wrapping behaviour
-            (masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))
+            (masked_amt RegMemImm (mask_xmm_shift ty amt))
             ;; Shift `src` using 16x8. Unfortunately, a 16x8 shift will only be
             ;; correct for half of the lanes; the others must be fixed up with
             ;; the mask below.
@@ -515,16 +515,13 @@
 ;; 16x8, 32x4, and 64x2 shifts can each use a single instruction, once the shift amount is masked.
 (rule (lower (has_type ty @ $I16X8 (ishl src amt)))
-      (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
-        (x64_psllw src (mov_rmi_to_xmm masked_amt))))
+      (x64_psllw src (mov_rmi_to_xmm (mask_xmm_shift ty amt))))
 
 (rule (lower (has_type ty @ $I32X4 (ishl src amt)))
-      (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
-        (x64_pslld src (mov_rmi_to_xmm masked_amt))))
+      (x64_pslld src (mov_rmi_to_xmm (mask_xmm_shift ty amt))))
 
 (rule (lower (has_type ty @ $I64X2 (ishl src amt)))
-      (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
-        (x64_psllq src (mov_rmi_to_xmm masked_amt))))
+      (x64_psllq src (mov_rmi_to_xmm (mask_xmm_shift ty amt))))
 
 ;;;; Rules for `ushr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -580,7 +577,7 @@
 (rule (lower (has_type ty @ $I8X16 (ushr src amt)))
       (let (
             ;; Mask the amount to ensure wrapping behaviour
-            (masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))
+            (masked_amt RegMemImm (mask_xmm_shift ty amt))
             ;; Shift `src` using 16x8. Unfortunately, a 16x8 shift will only be
             ;; correct for half of the lanes; the others must be fixed up with
             ;; the mask below.
@@ -625,16 +622,19 @@
 ;; 16x8, 32x4, and 64x2 shifts can each use a single instruction, once the shift amount is masked.
 (rule (lower (has_type ty @ $I16X8 (ushr src amt)))
-      (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
-        (x64_psrlw src (mov_rmi_to_xmm masked_amt))))
+      (x64_psrlw src (mov_rmi_to_xmm (mask_xmm_shift ty amt))))
 
 (rule (lower (has_type ty @ $I32X4 (ushr src amt)))
-      (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
-        (x64_psrld src (mov_rmi_to_xmm masked_amt))))
+      (x64_psrld src (mov_rmi_to_xmm (mask_xmm_shift ty amt))))
 
 (rule (lower (has_type ty @ $I64X2 (ushr src amt)))
-      (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
-        (x64_psrlq src (mov_rmi_to_xmm masked_amt))))
+      (x64_psrlq src (mov_rmi_to_xmm (mask_xmm_shift ty amt))))
+
+(decl mask_xmm_shift (Type Value) RegMemImm)
+(rule (mask_xmm_shift ty amt)
+      (gpr_to_reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
+(rule 1 (mask_xmm_shift ty (iconst n))
+      (RegMemImm.Imm (shift_amount_masked ty n)))
 
 ;;;; Rules for `sshr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
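The new `mask_xmm_shift` helper added above is where the constant handling happens: `rule 1` has higher priority than the unnumbered fallback, so an `iconst` amount is masked at compile time via `shift_amount_masked` and returned as a `RegMemImm.Imm`, while any other amount keeps the previous behaviour of a runtime `x64_and` on a GPR. A rough Rust model of that split, using made-up stand-in types rather than Cranelift's:

// Hypothetical stand-ins for the two outcomes of `mask_xmm_shift`.
enum ShiftAmount {
    Imm(u32),  // constant amount, masked at compile time (rule 1)
    MaskedReg, // dynamic amount, masked with a runtime `and` (fallback rule)
}

fn mask_xmm_shift(lane_bits: u32, amt: Option<u64>) -> ShiftAmount {
    match amt {
        Some(n) => ShiftAmount::Imm((n as u32) & (lane_bits - 1)),
        None => ShiftAmount::MaskedReg,
    }
}

fn main() {
    // A constant i16x8 shift by 20 folds to an immediate of 4 (20 & 15).
    assert!(matches!(mask_xmm_shift(16, Some(20)), ShiftAmount::Imm(4)));
    // A non-constant amount keeps the runtime-masked register path.
    assert!(matches!(mask_xmm_shift(16, None), ShiftAmount::MaskedReg));
}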
@@ -701,7 +701,7 @@
 (rule (lower (has_type ty @ $I8X16 (sshr src amt @ (value_type amt_ty))))
       (let ((src_ Xmm (put_in_xmm src))
             ;; Mask the amount to ensure wrapping behaviour
-            (masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))
+            (masked_amt RegMemImm (mask_xmm_shift ty amt))
             ;; In order for `packsswb` later to only use the high byte of each
             ;; 16x8 lane, we shift right an extra 8 bits, relying on `psraw` to
             ;; fill in the upper bits appropriately.
@@ -728,12 +728,10 @@
 ;; that if the shift amount is in a register, it is in an XMM register.
 (rule (lower (has_type ty @ $I16X8 (sshr src amt)))
-      (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
-        (x64_psraw src (mov_rmi_to_xmm masked_amt))))
+      (x64_psraw src (mov_rmi_to_xmm (mask_xmm_shift ty amt))))
 
 (rule (lower (has_type ty @ $I32X4 (sshr src amt)))
-      (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
-        (x64_psrad src (mov_rmi_to_xmm masked_amt))))
+      (x64_psrad src (mov_rmi_to_xmm (mask_xmm_shift ty amt))))
 
 ;; The `sshr.i64x2` CLIF instruction has no single x86 instruction in the older
 ;; feature sets. Newer ones like AVX512VL + AVX512F include `vpsraq`, a 128-bit

@@ -259,6 +259,10 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> {
         ty.lane_bits() - 1
     }
 
+    fn shift_amount_masked(&mut self, ty: Type, val: Imm64) -> u32 {
+        (val.bits() as u32) & self.shift_mask(ty)
+    }
+
     #[inline]
     fn simm32_from_value(&mut self, val: Value) -> Option<GprMemImm> {
         let inst = self.lower_ctx.dfg().value_def(val).inst()?;
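One detail worth noting in the `shift_amount_masked` callback: `Imm64::bits` yields the raw payload as an `i64`, and the `as u32` cast truncates before masking, which is harmless because the mask is always smaller than 32 bits. A self-contained sketch of the same arithmetic (a hypothetical free function, not the trait method above), with a couple of worked values:

fn shift_amount_masked(lane_bits: u32, bits: i64) -> u32 {
    // Truncate to 32 bits first, then mask to the lane width; the truncation
    // cannot change the result because the mask fits in the low bits.
    (bits as u32) & (lane_bits - 1)
}

fn main() {
    assert_eq!(shift_amount_masked(32, 35), 3);  // i32x4: 35 & 31
    assert_eq!(shift_amount_masked(16, -1), 15); // i16x8: an all-ones amount folds to 15
}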