x64: Improve codegen for vectors with constant shift amounts (#5797)

I stumbled across this working on #5795 and figured this was a nice
opportunity to improve the codegen here.
Author: Alex Crichton
Date: 2023-02-16 14:47:59 -06:00
Committed by: GitHub
Parent: 1efee4abdf
Commit: cae3b26623
4 changed files with 344 additions and 63 deletions
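The codegen win comes from the constant case: a vector shift only uses the amount modulo the lane width (that is what `shift_mask` encodes, and what the "wrapping behaviour" comments in the rules below refer to), so when the amount is an `iconst` the mask can be applied at lowering time and emitted as an immediate, instead of emitting a runtime `and` and moving the amount into an XMM register. A minimal Rust sketch of that wrapping behaviour, using hypothetical standalone helpers rather than anything in this diff:

// Scalar model of a 32x4 vector left shift with wrapping shift-amount
// semantics: only the amount modulo the 32-bit lane width matters.
fn ishl_i32x4(lanes: [u32; 4], amt: u64) -> [u32; 4] {
    let amt = (amt & 31) as u32; // the same mask the backend now folds for constants
    lanes.map(|lane| lane << amt)
}

fn main() {
    // Shifting by 35 behaves like shifting by 3, since 35 & 31 == 3.
    assert_eq!(ishl_i32x4([1, 2, 3, 4], 35), ishl_i32x4([1, 2, 3, 4], 3));
}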

@@ -1455,6 +1455,10 @@
 (decl shift_mask (Type) u32)
 (extern constructor shift_mask shift_mask)
 
+;; Mask a constant with the type's shift mask
+(decl shift_amount_masked (Type Imm64) u32)
+(extern constructor shift_amount_masked shift_amount_masked)
+
 ;; Extract a constant `GprMemImm.Imm` from a value operand.
 (decl simm32_from_value (GprMemImm) Value)
 (extern extractor simm32_from_value simm32_from_value)

@@ -473,7 +473,7 @@
 (rule (lower (has_type ty @ $I8X16 (ishl src amt)))
       (let (
             ;; Mask the amount to ensure wrapping behaviour
-            (masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))
+            (masked_amt RegMemImm (mask_xmm_shift ty amt))
             ;; Shift `src` using 16x8. Unfortunately, a 16x8 shift will only be
             ;; correct for half of the lanes; the others must be fixed up with
             ;; the mask below.
@@ -515,16 +515,13 @@
 ;; 16x8, 32x4, and 64x2 shifts can each use a single instruction, once the shift amount is masked.
 (rule (lower (has_type ty @ $I16X8 (ishl src amt)))
-      (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
-        (x64_psllw src (mov_rmi_to_xmm masked_amt))))
+      (x64_psllw src (mov_rmi_to_xmm (mask_xmm_shift ty amt))))
 
 (rule (lower (has_type ty @ $I32X4 (ishl src amt)))
-      (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
-        (x64_pslld src (mov_rmi_to_xmm masked_amt))))
+      (x64_pslld src (mov_rmi_to_xmm (mask_xmm_shift ty amt))))
 
 (rule (lower (has_type ty @ $I64X2 (ishl src amt)))
-      (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
-        (x64_psllq src (mov_rmi_to_xmm masked_amt))))
+      (x64_psllq src (mov_rmi_to_xmm (mask_xmm_shift ty amt))))
 
 ;;;; Rules for `ushr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -580,7 +577,7 @@
 (rule (lower (has_type ty @ $I8X16 (ushr src amt)))
       (let (
             ;; Mask the amount to ensure wrapping behaviour
-            (masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))
+            (masked_amt RegMemImm (mask_xmm_shift ty amt))
             ;; Shift `src` using 16x8. Unfortunately, a 16x8 shift will only be
             ;; correct for half of the lanes; the others must be fixed up with
             ;; the mask below.
@@ -625,16 +622,19 @@
 ;; 16x8, 32x4, and 64x2 shifts can each use a single instruction, once the shift amount is masked.
 (rule (lower (has_type ty @ $I16X8 (ushr src amt)))
-      (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
-        (x64_psrlw src (mov_rmi_to_xmm masked_amt))))
+      (x64_psrlw src (mov_rmi_to_xmm (mask_xmm_shift ty amt))))
 
 (rule (lower (has_type ty @ $I32X4 (ushr src amt)))
-      (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
-        (x64_psrld src (mov_rmi_to_xmm masked_amt))))
+      (x64_psrld src (mov_rmi_to_xmm (mask_xmm_shift ty amt))))
 
 (rule (lower (has_type ty @ $I64X2 (ushr src amt)))
-      (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
-        (x64_psrlq src (mov_rmi_to_xmm masked_amt))))
+      (x64_psrlq src (mov_rmi_to_xmm (mask_xmm_shift ty amt))))
+
+(decl mask_xmm_shift (Type Value) RegMemImm)
+(rule (mask_xmm_shift ty amt)
+      (gpr_to_reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
+(rule 1 (mask_xmm_shift ty (iconst n))
+      (RegMemImm.Imm (shift_amount_masked ty n)))
 
 ;;;; Rules for `sshr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
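The new `mask_xmm_shift` helper added above is where the constant handling happens: `rule 1` has higher priority than the unnumbered fallback, so an `iconst` amount is masked at compile time via `shift_amount_masked` and returned as a `RegMemImm.Imm`, while any other amount keeps the previous behaviour of a runtime `x64_and` on a GPR. A rough Rust model of that split, using made-up stand-in types rather than Cranelift's:

// Hypothetical stand-ins for the two outcomes of `mask_xmm_shift`.
enum ShiftAmount {
    Imm(u32),  // constant amount, masked at compile time (rule 1)
    MaskedReg, // dynamic amount, masked with a runtime `and` (fallback rule)
}

fn mask_xmm_shift(lane_bits: u32, amt: Option<u64>) -> ShiftAmount {
    match amt {
        Some(n) => ShiftAmount::Imm((n as u32) & (lane_bits - 1)),
        None => ShiftAmount::MaskedReg,
    }
}

fn main() {
    // A constant i16x8 shift by 20 folds to an immediate of 4 (20 & 15).
    assert!(matches!(mask_xmm_shift(16, Some(20)), ShiftAmount::Imm(4)));
    // A non-constant amount keeps the runtime-masked register path.
    assert!(matches!(mask_xmm_shift(16, None), ShiftAmount::MaskedReg));
}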
@@ -701,7 +701,7 @@
 (rule (lower (has_type ty @ $I8X16 (sshr src amt @ (value_type amt_ty))))
       (let ((src_ Xmm (put_in_xmm src))
             ;; Mask the amount to ensure wrapping behaviour
-            (masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))
+            (masked_amt RegMemImm (mask_xmm_shift ty amt))
             ;; In order for `packsswb` later to only use the high byte of each
             ;; 16x8 lane, we shift right an extra 8 bits, relying on `psraw` to
             ;; fill in the upper bits appropriately.
@@ -728,12 +728,10 @@
 ;; that if the shift amount is in a register, it is in an XMM register.
 (rule (lower (has_type ty @ $I16X8 (sshr src amt)))
-      (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
-        (x64_psraw src (mov_rmi_to_xmm masked_amt))))
+      (x64_psraw src (mov_rmi_to_xmm (mask_xmm_shift ty amt))))
 
 (rule (lower (has_type ty @ $I32X4 (sshr src amt)))
-      (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
-        (x64_psrad src (mov_rmi_to_xmm masked_amt))))
+      (x64_psrad src (mov_rmi_to_xmm (mask_xmm_shift ty amt))))
 
 ;; The `sshr.i64x2` CLIF instruction has no single x86 instruction in the older
 ;; feature sets. Newer ones like AVX512VL + AVX512F include `vpsraq`, a 128-bit

@@ -259,6 +259,10 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> {
         ty.lane_bits() - 1
     }
 
+    fn shift_amount_masked(&mut self, ty: Type, val: Imm64) -> u32 {
+        (val.bits() as u32) & self.shift_mask(ty)
+    }
+
     #[inline]
     fn simm32_from_value(&mut self, val: Value) -> Option<GprMemImm> {
         let inst = self.lower_ctx.dfg().value_def(val).inst()?;
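One detail worth noting in the `shift_amount_masked` callback: `Imm64::bits` yields the raw payload as an `i64`, and the `as u32` cast truncates before masking, which is harmless because the mask is always smaller than 32 bits. A self-contained sketch of the same arithmetic (a hypothetical free function, not the trait method above), with a couple of worked values:

fn shift_amount_masked(lane_bits: u32, bits: i64) -> u32 {
    // Truncate to 32 bits first, then mask to the lane width; the truncation
    // cannot change the result because the mask fits in the low bits.
    (bits as u32) & (lane_bits - 1)
}

fn main() {
    assert_eq!(shift_amount_masked(32, 35), 3);  // i32x4: 35 & 31
    assert_eq!(shift_amount_masked(16, -1), 15); // i16x8: an all-ones amount folds to 15
}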