x64: Refactor and fill out some gpr-vs-xmm bits (#6058)

* x64: Add instruction helpers for `mov{d,q}`

These will soon grow AVX equivalents, so move them into instruction helpers so that clauses for AVX can be added in the future.

* x64: Don't auto-convert between RegMemImm and XmmMemImm

The previous conversion, `mov_rmi_to_xmm`, would move from GPRs to XMM registers, which isn't what many of the other `convert` statements between these newtypes do. This seemed like a possible footgun, so I've removed the auto-conversion and added an explicit helper to go from a `u32` to an `XmmMemImm`.
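
To illustrate the shape of the change (the type definitions below are simplified stand-ins, not Cranelift's actual ones), a constructor that only wraps immediates cannot silently create a GPR-to-XMM move the way the old blanket conversion could:

```rust
// Simplified stand-ins for the newtypes discussed above; the real
// Cranelift definitions differ. `Reg`/`Mem` payloads are placeholders.
#[derive(Clone, Copy, Debug)]
enum RegMemImm {
    Reg(u8),  // a GPR
    Mem(u32), // an address, elided here
    Imm(u32),
}

/// An operand that is known to be usable where an XMM operand is expected.
#[derive(Clone, Copy, Debug)]
struct XmmMemImm(RegMemImm);

/// Hypothetical shape of the explicit `u32 -> XmmMemImm` helper: only an
/// immediate can be wrapped, so no GPR-to-XMM move is created by accident.
fn xmi_imm(imm: u32) -> XmmMemImm {
    XmmMemImm(RegMemImm::Imm(imm))
}

fn main() {
    let shift = xmi_imm(32);
    println!("{shift:?}"); // XmmMemImm(Imm(32))
    // There is intentionally no blanket conversion here: a `RegMemImm::Reg`
    // value would need an explicit move into an XMM register first.
}
```

The helper actually added by this change is the `xmi_imm` seen throughout the diff below.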

* x64: Add AVX encodings of some more GPR-related insns

This commit adds more support for AVX instructions that mix GPRs with XMM registers. This required a few more variants of `Inst` to handle the new instructions.
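
As a rough, hypothetical sketch (the variant and field names below are invented for illustration and do not match Cranelift's real `Inst`), the idea is that the VEX-encoded forms get their own variants alongside the existing SSE ones, so the emitter can produce either encoding:

```rust
// Hypothetical, heavily simplified sketch; the real `Inst` enum, its
// variant names, and its operand types in Cranelift look different.
#[derive(Clone, Copy, Debug)]
struct Gpr(u8);
#[derive(Clone, Copy, Debug)]
struct Xmm(u8);

enum Inst {
    /// SSE form, e.g. `movd`/`movq` from a GPR into an XMM register.
    GprToXmm { src: Gpr, dst: Xmm, is_64: bool },
    /// AVX (VEX-encoded) form, e.g. `vmovd`/`vmovq`.
    GprToXmmVex { src: Gpr, dst: Xmm, is_64: bool },
}

fn mnemonic(inst: &Inst) -> &'static str {
    match inst {
        Inst::GprToXmm { is_64: false, .. } => "movd",
        Inst::GprToXmm { is_64: true, .. } => "movq",
        Inst::GprToXmmVex { is_64: false, .. } => "vmovd",
        Inst::GprToXmmVex { is_64: true, .. } => "vmovq",
    }
}

fn main() {
    let inst = Inst::GprToXmmVex { src: Gpr(0), dst: Xmm(1), is_64: true };
    println!("{}", mnemonic(&inst)); // "vmovq"
}
```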

* Fix vpmovmskb encoding

* Fix xmm-to-gpr encoding of vmovd/vmovq

* Fix typo

* Fix rebase conflict

* Fix rebase conflict with tests

Author: Alex Crichton
Date: 2023-03-22 09:58:09 -05:00
Committed by: GitHub
Parent: a1072007b8
Commit: 2fde25311e
14 changed files with 695 additions and 83 deletions

@@ -883,17 +883,17 @@
 (let ((a0 Xmm a)
 (b0 Xmm b)
 ;; a_hi = A >> 32
-(a_hi Xmm (x64_psrlq a0 (RegMemImm.Imm 32)))
+(a_hi Xmm (x64_psrlq a0 (xmi_imm 32)))
 ;; ah_bl = Ah * Bl
 (ah_bl Xmm (x64_pmuludq a_hi b0))
 ;; b_hi = B >> 32
-(b_hi Xmm (x64_psrlq b0 (RegMemImm.Imm 32)))
+(b_hi Xmm (x64_psrlq b0 (xmi_imm 32)))
 ;; al_bh = Al * Bh
 (al_bh Xmm (x64_pmuludq a0 b_hi))
 ;; aa_bb = ah_bl + al_bh
 (aa_bb Xmm (x64_paddq ah_bl al_bh))
 ;; aa_bb_shifted = aa_bb << 32
-(aa_bb_shifted Xmm (x64_psllq aa_bb (RegMemImm.Imm 32)))
+(aa_bb_shifted Xmm (x64_psllq aa_bb (xmi_imm 32)))
 ;; al_bl = Al * Bl
 (al_bl Xmm (x64_pmuludq a0 b0)))
 ;; al_bl + aa_bb_shifted
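
The rule above builds a 64x64-bit multiply out of 32x32->64 partial products (`pmuludq`). A scalar check of the identity it relies on, not part of this change:

```rust
fn main() {
    // With A = Ah*2^32 + Al and B = Bh*2^32 + Bl, the low 64 bits of A*B are
    //   ((Ah*Bl + Al*Bh) << 32) + Al*Bl     (all arithmetic mod 2^64),
    // which is what the psrlq/pmuludq/paddq/psllq sequence computes per lane.
    fn mul_via_parts(a: u64, b: u64) -> u64 {
        let (a_hi, a_lo) = (a >> 32, a & 0xffff_ffff);
        let (b_hi, b_lo) = (b >> 32, b & 0xffff_ffff);
        let ah_bl = a_hi.wrapping_mul(b_lo); // pmuludq: 32x32 -> 64
        let al_bh = a_lo.wrapping_mul(b_hi);
        let al_bl = a_lo.wrapping_mul(b_lo);
        (ah_bl.wrapping_add(al_bh) << 32).wrapping_add(al_bl)
    }
    for &(a, b) in &[(3u64, 5u64), (u64::MAX, 2), (0xdead_beef_cafe_f00d, 0x1234_5678_9abc_def0)] {
        assert_eq!(mul_via_parts(a, b), a.wrapping_mul(b));
    }
    println!("partial-product multiply matches wrapping_mul");
}
```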
@@ -1087,14 +1087,12 @@
 ;; Special case for `f32x4.abs`.
 (rule (lower (has_type $F32X4 (fabs x)))
 (x64_andps x
-(x64_psrld (vector_all_ones)
-(RegMemImm.Imm 1))))
+(x64_psrld (vector_all_ones) (xmi_imm 1))))
 ;; Special case for `f64x2.abs`.
 (rule (lower (has_type $F64X2 (fabs x)))
 (x64_andpd x
-(x64_psrlq (vector_all_ones)
-(RegMemImm.Imm 1))))
+(x64_psrlq (vector_all_ones) (xmi_imm 1))))
 ;;;; Rules for `fneg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -1106,13 +1104,11 @@
 (rule (lower (has_type $F32X4 (fneg x)))
 (x64_xorps x
-(x64_pslld (vector_all_ones)
-(RegMemImm.Imm 31))))
+(x64_pslld (vector_all_ones) (xmi_imm 31))))
 (rule (lower (has_type $F64X2 (fneg x)))
 (x64_xorpd x
-(x64_psllq (vector_all_ones)
-(RegMemImm.Imm 63))))
+(x64_psllq (vector_all_ones) (xmi_imm 63))))
 ;;;; Rules for `bmask` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
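
The constants in the `fabs` and `fneg` rules above are the usual sign-bit masks built from an all-ones lane: shifting all-ones right by 1 clears only the sign bit, and shifting left by 31 (or 63 for f64) leaves only the sign bit. A scalar check, not part of this change:

```rust
fn main() {
    // fabs: all-ones shifted right by 1 clears only the sign bit.
    let abs_mask = u32::MAX >> 1; // 0x7fff_ffff
    // fneg: all-ones shifted left by 31 leaves only the sign bit.
    let neg_mask = u32::MAX << 31; // 0x8000_0000
    for &x in &[0.0f32, -0.0, 1.5, -3.75, f32::INFINITY, f32::NEG_INFINITY] {
        assert_eq!(f32::from_bits(x.to_bits() & abs_mask), x.abs());
        assert_eq!(f32::from_bits(x.to_bits() ^ neg_mask), -x);
    }
    // The f64 rules use the same shapes with 64-bit lanes.
    assert_eq!(u64::MAX >> 1, 0x7fff_ffff_ffff_ffff);
    assert_eq!(u64::MAX << 63, 0x8000_0000_0000_0000);
    println!("sign-bit mask identities hold");
}
```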
@@ -1918,7 +1914,7 @@
 ;; Note that this is a 16x8 shift, but that's OK; we mask
 ;; off anything that traverses from one byte to the next
 ;; with the low_mask below.
-(shifted_src Xmm (x64_psrlw src (RegMemImm.Imm 4)))
+(shifted_src Xmm (x64_psrlw src (xmi_imm 4)))
 (high_nibbles Xmm (sse_and $I8X16 shifted_src low_mask))
 (lookup Xmm (x64_xmm_load_const $I8X16 (popcount_4bit_table)))
 (bit_counts_low Xmm (x64_pshufb lookup low_nibbles))
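
This is the classic nibble-table popcount: each byte is split into its low and high nibble, both are looked up in a 16-entry table of 4-bit population counts, and the results are added. A scalar version of the same idea, not part of this change:

```rust
fn main() {
    // 4-bit popcount table indexed by a nibble's value (the role played by
    // the constant loaded via x64_xmm_load_const and indexed with pshufb).
    const TABLE: [u8; 16] = [0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4];
    for b in 0u8..=255 {
        let low = TABLE[(b & 0x0f) as usize];
        // The >> 4 here corresponds to the psrlw-by-4 plus low mask above.
        let high = TABLE[(b >> 4) as usize];
        assert_eq!(low + high, b.count_ones() as u8);
    }
    println!("per-byte popcount via nibble table matches count_ones()");
}
```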
@@ -2237,7 +2233,7 @@
 ;; All-ones for NaN, shifted down to leave 10 top bits (1
 ;; sign, 8 exponent, 1 QNaN bit that must remain set)
 ;; cleared.
-(nan_fraction_mask Xmm (x64_psrld is_nan_mask (RegMemImm.Imm 10)))
+(nan_fraction_mask Xmm (x64_psrld is_nan_mask (xmi_imm 10)))
 ;; Do a NAND, so that we retain every bit not set in
 ;; `nan_fraction_mask`. This mask will be all zeroes (so
 ;; we retain every bit) in non-NaN cases, and will have
@@ -2254,7 +2250,7 @@
 (min_or Xmm (x64_orpd min1 min2))
 (is_nan_mask Xmm (x64_cmppd min1 min2 (FcmpImm.Unordered)))
 (min_or_2 Xmm (x64_orpd min_or is_nan_mask))
-(nan_fraction_mask Xmm (x64_psrlq is_nan_mask (RegMemImm.Imm 13)))
+(nan_fraction_mask Xmm (x64_psrlq is_nan_mask (xmi_imm 13)))
 (final Xmm (x64_andnpd nan_fraction_mask min_or_2)))
 final))
@@ -2302,7 +2298,7 @@
 ;; All-ones for NaN, shifted down to leave 10 top bits (1
 ;; sign, 8 exponent, 1 QNaN bit that must remain set)
 ;; cleared.
-(nan_fraction_mask Xmm (x64_psrld is_nan_mask (RegMemImm.Imm 10)))
+(nan_fraction_mask Xmm (x64_psrld is_nan_mask (xmi_imm 10)))
 ;; Do a NAND, so that we retain every bit not set in
 ;; `nan_fraction_mask`. This mask will be all zeroes (so
 ;; we retain every bit) in non-NaN cases, and will have
@@ -2346,7 +2342,7 @@
 ;; All-ones for NaN, shifted down to leave 13 top bits (1
 ;; sign, 11 exponent, 1 QNaN bit that must remain set)
 ;; cleared.
-(nan_fraction_mask Xmm (x64_psrlq is_nan_mask (RegMemImm.Imm 13)))
+(nan_fraction_mask Xmm (x64_psrlq is_nan_mask (xmi_imm 13)))
 ;; Do a NAND, so that we retain every bit not set in
 ;; `nan_fraction_mask`. This mask will be all zeroes (so
 ;; we retain every bit) in non-NaN cases, and will have
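
In these min/max rules the shift amounts encode the float formats: an all-ones NaN lane shifted right by 10 (f32) or 13 (f64) leaves exactly the sign, exponent, and quiet-NaN bits cleared, so the following ANDN keeps only those top bits in NaN lanes and yields a canonical quiet NaN. A scalar check of the mask arithmetic, not part of this change:

```rust
fn main() {
    // f32: all-ones shifted right by 10 leaves the top 10 bits
    // (1 sign + 8 exponent + 1 quiet-NaN bit) cleared.
    let m32 = u32::MAX >> 10;
    assert_eq!(m32, 0x003f_ffff);
    // ANDN keeps every bit *not* set in the mask; applied to a NaN bit
    // pattern it leaves only sign/exponent/quiet bits, a canonical QNaN.
    let some_nan = 0x7fed_cba9u32; // an arbitrary NaN payload
    assert_eq!(!m32 & some_nan, 0x7fc0_0000);
    assert!(f32::from_bits(!m32 & some_nan).is_nan());

    // f64: all-ones shifted right by 13 leaves 1 sign + 11 exponent
    // + 1 quiet-NaN bit cleared.
    let m64 = u64::MAX >> 13;
    assert_eq!(m64, 0x0007_ffff_ffff_ffff);
    let some_nan64 = 0x7ffa_bcde_f012_3456u64;
    assert_eq!(!m64 & some_nan64, 0x7ff8_0000_0000_0000);
    assert!(f64::from_bits(!m64 & some_nan64).is_nan());
}
```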
@@ -3011,8 +3007,8 @@
 (let ((a Xmm val)
 ;; get the low 16 bits
-(a_lo Xmm (x64_pslld a (RegMemImm.Imm 16)))
-(a_lo Xmm (x64_psrld a_lo (RegMemImm.Imm 16)))
+(a_lo Xmm (x64_pslld a (xmi_imm 16)))
+(a_lo Xmm (x64_psrld a_lo (xmi_imm 16)))
 ;; get the high 16 bits
 (a_hi Xmm (x64_psubd a a_lo))
@@ -3022,7 +3018,7 @@
 ;; shift the high bits by 1, convert, and double to get the correct
 ;; value
-(a_hi Xmm (x64_psrld a_hi (RegMemImm.Imm 1)))
+(a_hi Xmm (x64_psrld a_hi (xmi_imm 1)))
 (a_hi Xmm (x64_cvtdq2ps a_hi))
 (a_hi Xmm (x64_addps a_hi a_hi)))
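
The split in this rule works because the high half has its low 16 bits clear: it is a multiple of 2^16, so it is exactly representable in f32, halving it is exact, and the halved value fits in a signed 32-bit lane for `cvtdq2ps`. A scalar model of the sequence, not part of this change, checked against a direct conversion:

```rust
fn main() {
    // Scalar model of the lowering above for a single u32 lane.
    fn u32_to_f32_via_split(a: u32) -> f32 {
        let a_lo = a & 0xffff;         // (a << 16) >> 16
        let a_hi = a - a_lo;           // low 16 bits are zero
        let lo_f = a_lo as i32 as f32; // small positive value, exact
        // a_hi >> 1 is exact and fits in an i32 lane; converting and then
        // doubling (addps with itself) reconstructs a_hi exactly.
        let hi_half_f = (a_hi >> 1) as i32 as f32;
        let hi_f = hi_half_f + hi_half_f;
        hi_f + lo_f
    }
    for &a in &[0u32, 1, 0xffff, 0x0001_0000, 0x1234_5678, 0x8000_0000, 0xffff_ffff] {
        assert_eq!(u32_to_f32_via_split(a), a as f32);
    }
    println!("split conversion matches direct u32 -> f32 conversion");
}
```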
@@ -3060,7 +3056,7 @@
 ;; Set top bit only if < 0
 (tmp Xmm (x64_pand dst tmp))
-(tmp Xmm (x64_psrad tmp (RegMemImm.Imm 31))))
+(tmp Xmm (x64_psrad tmp (xmi_imm 31))))
 ;; On overflow 0x80000000 is returned to a lane.
 ;; Below sets positive overflow lanes to 0x7FFFFFFF
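
`psrad` is an arithmetic right shift, so shifting by 31 smears each lane's sign bit across the lane: all-ones where the lane was negative, zero otherwise, which is the per-lane mask used by the overflow fixup described in the comments above. In scalar form, not part of this change:

```rust
fn main() {
    // Arithmetic right shift by 31 turns a lane into a sign mask:
    // all ones if the lane was negative, zero otherwise.
    assert_eq!((-42i32) >> 31, -1);
    assert_eq!(((-42i32) >> 31) as u32, 0xffff_ffff);
    assert_eq!(7i32 >> 31, 0);
}
```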
@@ -3130,7 +3126,7 @@
 ;; integer that it can represent. In the case of INT_MAX, this value gets
 ;; represented as 0x4f000000 which is the integer value (INT_MAX+1).
 (tmp2 Xmm (x64_pcmpeqd tmp2 tmp2))
-(tmp2 Xmm (x64_psrld tmp2 (RegMemImm.Imm 1)))
+(tmp2 Xmm (x64_psrld tmp2 (xmi_imm 1)))
 (tmp2 Xmm (x64_cvtdq2ps tmp2))
 ;; Make a copy of these lanes and then do the first conversion.
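
The constant built here is why this works: `pcmpeqd` yields all-ones, shifting right by 1 gives 0x7FFFFFFF (INT_MAX), and `cvtdq2ps` rounds that up to 2147483648.0, whose bit pattern is 0x4f000000, i.e. INT_MAX+1 as the comment notes. A scalar check, not part of this change:

```rust
fn main() {
    // All-ones shifted right by 1 is INT_MAX; converting it to f32 (the
    // scalar equivalent of cvtdq2ps) rounds up to 2147483648.0, whose
    // bit pattern is 0x4f000000.
    let int_max = u32::MAX >> 1;        // 0x7fff_ffff
    let as_f32 = int_max as i32 as f32; // rounds to INT_MAX + 1
    assert_eq!(as_f32, 2147483648.0);
    assert_eq!(as_f32.to_bits(), 0x4f00_0000);
}
```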