cranelift: Port ushr SIMD lowerings to ISLE on x64
@@ -595,13 +595,17 @@
 
 ;; When the shift amount is known, we can statically (i.e. at compile time)
 ;; determine the mask to use and only emit that.
+(decl ishl_i8x16_mask_for_const (u32) SyntheticAmode)
+(extern constructor ishl_i8x16_mask_for_const ishl_i8x16_mask_for_const)
 (rule (ishl_i8x16_mask (RegMemImm.Imm amt))
       (ishl_i8x16_mask_for_const amt))
 
 ;; Otherwise, we must emit the entire mask table and dynamically (i.e. at run
-;; time) find the correct mask offset in the table. We do this use `lea` to find
-;; the base address of the mask table and then complex addressing to offset to
-;; the right mask: `base_address + amt << 4`
+;; time) find the correct mask offset in the table. We use `lea` to find the
+;; base address of the mask table and then complex addressing to offset to the
+;; right mask: `base_address + amt << 4`
+(decl ishl_i8x16_mask_table () SyntheticAmode)
+(extern constructor ishl_i8x16_mask_table ishl_i8x16_mask_table)
 (rule (ishl_i8x16_mask (RegMemImm.Reg amt))
       (let ((mask_table SyntheticAmode (ishl_i8x16_mask_table))
             (base_mask_addr Reg (lea mask_table))
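A note on what these masks encode: the 8x16 shift is emulated with a 16x8 `psllw`, after which the high byte of every 16-bit lane has absorbed bits from its low-byte neighbor. ANDing each byte with `0xFF << amt` clears exactly those leaked bits, and the `base_address + amt << 4` addressing implies a table of eight 16-byte rows indexed by shift amount. A minimal Rust sketch of that presumed layout (illustrative only; the real table is emitted behind the extern `ishl_i8x16_mask_table` constructor):

    /// Presumed layout of the `ishl` 8x16 mask table: one 16-byte row per
    /// shift amount 0..=7, every byte of row `amt` equal to `0xFF << amt`.
    fn build_ishl_i8x16_mask_table() -> [u8; 128] {
        let mut table = [0u8; 128];
        for amt in 0..8 {
            // A true 8-bit left shift by `amt` leaves the low `amt` bits of
            // each byte zero, so the mask keeps only the high `8 - amt` bits.
            let mask = 0xFFu8 << amt;
            table[amt * 16..(amt + 1) * 16].fill(mask);
        }
        table
    }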
@@ -613,14 +617,6 @@
 (rule (ishl_i8x16_mask (RegMemImm.Mem amt))
       (ishl_i8x16_mask (RegMemImm.Reg (x64_load $I64 amt (ExtKind.None)))))
-
-;; Get the address of the mask for a constant 8x16 shift amount.
-(decl ishl_i8x16_mask_for_const (u32) SyntheticAmode)
-(extern constructor ishl_i8x16_mask_for_const ishl_i8x16_mask_for_const)
-
-;; Get the address of the mask table for a dynamic 8x16 shift amount.
-(decl ishl_i8x16_mask_table () SyntheticAmode)
-(extern constructor ishl_i8x16_mask_table ishl_i8x16_mask_table)
 
 ;; 16x8, 32x4, and 64x2 shifts can each use a single instruction.
 (rule (lower (has_type $I16X8 (ishl src amt)))
       (value_reg (psllw (put_in_reg src)
@@ -671,6 +667,61 @@
       (let ((amt_ Reg (lo_reg amt)))
         (shr_i128 (put_in_regs src) amt_)))
 
+;; SSE.
+
+;; There are no 8x16 shifts in x64. Do the same 16x8-shift-and-mask thing we do
+;; with 8x16 `ishl`.
+(rule (lower (has_type $I8X16 (ushr src amt)))
+      (let ((src_ Reg (put_in_reg src))
+            (amt_gpr RegMemImm (put_in_reg_mem_imm amt))
+            (amt_xmm RegMemImm (reg_mem_imm_to_xmm amt_gpr))
+            ;; Shift `src` using 16x8. Unfortunately, a 16x8 shift will only be
+            ;; correct for half of the lanes; the others must be fixed up with
+            ;; the mask below.
+            (unmasked Reg (psrlw src_ amt_xmm))
+            (mask_addr SyntheticAmode (ushr_i8x16_mask amt_gpr))
+            (mask Reg (x64_load $I8X16 mask_addr (ExtKind.None))))
+        (value_reg (sse_and $I8X16 unmasked (RegMem.Reg mask)))))
+
+;; Get the address of the mask to use when fixing up the lanes that weren't
+;; correctly generated by the 16x8 shift.
+(decl ushr_i8x16_mask (RegMemImm) SyntheticAmode)
+
+;; When the shift amount is known, we can statically (i.e. at compile time)
+;; determine the mask to use and only emit that.
+(decl ushr_i8x16_mask_for_const (u32) SyntheticAmode)
+(extern constructor ushr_i8x16_mask_for_const ushr_i8x16_mask_for_const)
+(rule (ushr_i8x16_mask (RegMemImm.Imm amt))
+      (ushr_i8x16_mask_for_const amt))
+
+;; Otherwise, we must emit the entire mask table and dynamically (i.e. at run
+;; time) find the correct mask offset in the table. We use `lea` to find the
+;; base address of the mask table and then complex addressing to offset to the
+;; right mask: `base_address + amt << 4`
+(decl ushr_i8x16_mask_table () SyntheticAmode)
+(extern constructor ushr_i8x16_mask_table ushr_i8x16_mask_table)
+(rule (ushr_i8x16_mask (RegMemImm.Reg amt))
+      (let ((mask_table SyntheticAmode (ushr_i8x16_mask_table))
+            (base_mask_addr Reg (lea mask_table))
+            (mask_offset Reg (shl $I64 amt (Imm8Reg.Imm8 4))))
+        (amode_to_synthetic_amode (amode_imm_reg_reg_shift 0
+                                                           base_mask_addr
+                                                           mask_offset
+                                                           0))))
+(rule (ushr_i8x16_mask (RegMemImm.Mem amt))
+      (ushr_i8x16_mask (RegMemImm.Reg (x64_load $I64 amt (ExtKind.None)))))
+
+;; 16x8, 32x4, and 64x2 shifts can each use a single instruction.
+(rule (lower (has_type $I16X8 (ushr src amt)))
+      (value_reg (psrlw (put_in_reg src)
+                        (reg_mem_imm_to_xmm (put_in_reg_mem_imm amt)))))
+(rule (lower (has_type $I32X4 (ushr src amt)))
+      (value_reg (psrld (put_in_reg src)
+                        (reg_mem_imm_to_xmm (put_in_reg_mem_imm amt)))))
+(rule (lower (has_type $I64X2 (ushr src amt)))
+      (value_reg (psrlq (put_in_reg src)
+                        (reg_mem_imm_to_xmm (put_in_reg_mem_imm amt)))))
+
 ;;;; Rules for `sshr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 ;; `i64` and smaller.
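The mask arithmetic for `ushr` mirrors the `ishl` case: after `psrlw`, the low byte of each 16-bit lane has picked up the low bits of its high-byte neighbor, so ANDing every byte with `0xFF >> amt` restores a true per-byte shift. A scalar Rust model of the `psrlw`-then-`sse_and` sequence above, checked against a plain per-byte shift (a sketch of the lane arithmetic, not the backend's code):

    /// Scalar model of the i8x16 `ushr` lowering: shift 16-bit lanes right
    /// (standing in for `psrlw`), then AND with the per-amount fix-up mask.
    fn ushr_i8x16_model(src: [u8; 16], amt: u32) -> [u8; 16] {
        let amt = amt % 8;
        // Leaked bits land in the high bits of each lane's low byte, so the
        // mask keeps only the low `8 - amt` bits of every byte.
        let mask = 0xFFu8 >> amt;
        let mut out = [0u8; 16];
        for lane in 0..8 {
            // One 16-bit lane (little-endian byte pair), shifted as a unit.
            let v = u16::from_le_bytes([src[2 * lane], src[2 * lane + 1]]) >> amt;
            let [lo, hi] = v.to_le_bytes();
            out[2 * lane] = lo & mask;     // the low byte needs the fix-up
            out[2 * lane + 1] = hi & mask; // the high byte is already correct
        }
        out
    }

    fn main() {
        let src = *b"0123456789abcdef";
        for amt in 0..8 {
            // The model agrees with a plain per-byte logical right shift.
            let expected: Vec<u8> = src.iter().map(|b| b >> amt).collect();
            assert_eq!(ushr_i8x16_model(src, amt).to_vec(), expected);
        }
    }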
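The dynamic-amount path's address computation, `base_address + (amt << 4)`, in plain-Rust terms (a hypothetical `mask_row` helper over a table laid out like the sketch after the first hunk; for illustration only):

    /// Analogue of the ISLE rule: `shl $I64 amt (Imm8Reg.Imm8 4)` forms the
    /// byte offset of the 16-byte row, and `amode_imm_reg_reg_shift 0 base
    /// offset 0` adds it to the `lea`-materialized base address.
    fn mask_row(table: &[u8; 128], amt: u64) -> &[u8] {
        // The sketch reduces `amt` modulo 8 defensively; the lowering can
        // rely on the shift amount already being masked to the lane width.
        let offset = ((amt & 7) as usize) << 4; // amt * 16
        &table[offset..offset + 16]
    }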