x64: Lower fcvt_to_{u,s}int{,_sat} in ISLE (#4704)
https://github.com/bytecodealliance/wasmtime/pull/4704
This commit is contained in:
@@ -3062,3 +3062,130 @@
|
||||
|
||||
;; add together the two converted values
|
||||
(x64_addps a_hi a_lo)))
|
||||
|
||||
;; Rules for `fcvt_to_uint` and `fcvt_to_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
(rule (lower (has_type out_ty (fcvt_to_uint val @ (value_type (ty_scalar_float _)))))
|
||||
(cvt_float_to_uint_seq out_ty val $false))
|
||||
|
||||
(rule (lower (has_type out_ty (fcvt_to_uint_sat val @ (value_type (ty_scalar_float _)))))
|
||||
(cvt_float_to_uint_seq out_ty val $true))
|
||||
|
||||
(rule (lower (has_type out_ty (fcvt_to_sint val @ (value_type (ty_scalar_float _)))))
|
||||
(cvt_float_to_sint_seq out_ty val $false))
|
||||
|
||||
(rule (lower (has_type out_ty (fcvt_to_sint_sat val @ (value_type (ty_scalar_float _)))))
|
||||
(cvt_float_to_sint_seq out_ty val $true))
|
||||
|
||||
;; The x64 backend currently only supports these two type combinations.
|
||||
(rule (lower (has_type $I32X4 (fcvt_to_sint_sat val @ (value_type $F32X4))))
|
||||
(let (;; Sets tmp to zero if float is NaN
|
||||
(tmp Xmm (x64_cmpps val val (FcmpImm.Equal)))
|
||||
(dst Xmm (x64_andps val tmp))
|
||||
|
||||
;; Sets top bit of tmp if float is positive
|
||||
;; Setting up to set top bit on negative float values
|
||||
(tmp Xmm (x64_pxor tmp dst))
|
||||
|
||||
;; Convert the packed float to packed doubleword.
|
||||
(dst Xmm (x64_cvttps2dq $F32X4 dst))
|
||||
|
||||
;; Set top bit only if < 0
|
||||
(tmp Xmm (x64_pand dst tmp))
|
||||
(tmp Xmm (x64_psrad tmp (RegMemImm.Imm 31))))
|
||||
|
||||
;; On overflow 0x80000000 is returned to a lane.
|
||||
;; Below sets positive overflow lanes to 0x7FFFFFFF
|
||||
;; Keeps negative overflow lanes as is.
|
||||
(x64_pxor tmp dst)))
|
||||
|
||||
;; The algorithm for converting floats to unsigned ints is a little tricky. The
|
||||
;; complication arises because we are converting from a signed 64-bit int with a positive
|
||||
;; integer range from 1..INT_MAX (0x1..0x7FFFFFFF) to an unsigned integer with an extended
|
||||
;; range from (INT_MAX+1)..UINT_MAX. It's this range from (INT_MAX+1)..UINT_MAX
|
||||
;; (0x80000000..0xFFFFFFFF) that needs to be accounted for as a special case since our
|
||||
;; conversion instruction (cvttps2dq) only converts as high as INT_MAX (0x7FFFFFFF), but
|
||||
;; which conveniently setting underflows and overflows (smaller than MIN_INT or larger than
|
||||
;; MAX_INT) to be INT_MAX+1 (0x80000000). Nothing that the range (INT_MAX+1)..UINT_MAX includes
|
||||
;; precisely INT_MAX values we can correctly account for and convert every value in this range
|
||||
;; if we simply subtract INT_MAX+1 before doing the cvttps2dq conversion. After the subtraction
|
||||
;; every value originally (INT_MAX+1)..UINT_MAX is now the range (0..INT_MAX).
|
||||
;; After the conversion we add INT_MAX+1 back to this converted value, noting again that
|
||||
;; values we are trying to account for were already set to INT_MAX+1 during the original conversion.
|
||||
;; We simply have to create a mask and make sure we are adding together only the lanes that need
|
||||
;; to be accounted for. Digesting it all the steps then are:
|
||||
;;
|
||||
;; Step 1 - Account for NaN and negative floats by setting these src values to zero.
|
||||
;; Step 2 - Make a copy (tmp1) of the src value since we need to convert twice for
|
||||
;; reasons described above.
|
||||
;; Step 3 - Convert the original src values. This will convert properly all floats up to INT_MAX
|
||||
;; Step 4 - Subtract INT_MAX from the copy set (tmp1). Note, all zero and negative values are those
|
||||
;; values that were originally in the range (0..INT_MAX). This will come in handy during
|
||||
;; step 7 when we zero negative lanes.
|
||||
;; Step 5 - Create a bit mask for tmp1 that will correspond to all lanes originally less than
|
||||
;; UINT_MAX that are now less than INT_MAX thanks to the subtraction.
|
||||
;; Step 6 - Convert the second set of values (tmp1)
|
||||
;; Step 7 - Prep the converted second set by zeroing out negative lanes (these have already been
|
||||
;; converted correctly with the first set) and by setting overflow lanes to 0x7FFFFFFF
|
||||
;; as this will allow us to properly saturate overflow lanes when adding to 0x80000000
|
||||
;; Step 8 - Add the orginal converted src and the converted tmp1 where float values originally less
|
||||
;; than and equal to INT_MAX will be unchanged, float values originally between INT_MAX+1 and
|
||||
;; UINT_MAX will add together (INT_MAX) + (SRC - INT_MAX), and float values originally
|
||||
;; greater than UINT_MAX will be saturated to UINT_MAX (0xFFFFFFFF) after adding (0x8000000 + 0x7FFFFFFF).
|
||||
;;
|
||||
;;
|
||||
;; The table below illustrates the result after each step where it matters for the converted set.
|
||||
;; Note the original value range (original src set) is the final dst in Step 8:
|
||||
;;
|
||||
;; Original src set:
|
||||
;; | Original Value Range | Step 1 | Step 3 | Step 8 |
|
||||
;; | -FLT_MIN..FLT_MAX | 0.0..FLT_MAX | 0..INT_MAX(w/overflow) | 0..UINT_MAX(w/saturation) |
|
||||
;;
|
||||
;; Copied src set (tmp1):
|
||||
;; | Step 2 | Step 4 |
|
||||
;; | 0.0..FLT_MAX | (0.0-(INT_MAX+1))..(FLT_MAX-(INT_MAX+1)) |
|
||||
;;
|
||||
;; | Step 6 | Step 7 |
|
||||
;; | (0-(INT_MAX+1))..(UINT_MAX-(INT_MAX+1))(w/overflow) | ((INT_MAX+1)-(INT_MAX+1))..(INT_MAX+1) |
|
||||
(rule (lower (has_type $I32X4 (fcvt_to_uint_sat val @ (value_type $F32X4))))
|
||||
(let (;; Converting to unsigned int so if float src is negative or NaN
|
||||
;; will first set to zero.
|
||||
(tmp2 Xmm (x64_pxor val val)) ;; make a zero
|
||||
(dst Xmm (x64_maxps val tmp2))
|
||||
|
||||
;; Set tmp2 to INT_MAX+1. It is important to note here that after it looks
|
||||
;; like we are only converting INT_MAX (0x7FFFFFFF) but in fact because
|
||||
;; single precision IEEE-754 floats can only accurately represent contingous
|
||||
;; integers up to 2^23 and outside of this range it rounds to the closest
|
||||
;; integer that it can represent. In the case of INT_MAX, this value gets
|
||||
;; represented as 0x4f000000 which is the integer value (INT_MAX+1).
|
||||
(tmp2 Xmm (x64_pcmpeqd tmp2 tmp2))
|
||||
(tmp2 Xmm (x64_psrld tmp2 (RegMemImm.Imm 1)))
|
||||
(tmp2 Xmm (x64_cvtdq2ps tmp2))
|
||||
|
||||
;; Make a copy of these lanes and then do the first conversion.
|
||||
;; Overflow lanes greater than the maximum allowed signed value will
|
||||
;; set to 0x80000000. Negative and NaN lanes will be 0x0
|
||||
(tmp1 Xmm dst)
|
||||
(dst Xmm (x64_cvttps2dq $F32X4 dst))
|
||||
|
||||
;; Set lanes to src - max_signed_int
|
||||
(tmp1 Xmm (x64_subps tmp1 tmp2))
|
||||
|
||||
;; Create mask for all positive lanes to saturate (i.e. greater than
|
||||
;; or equal to the maxmimum allowable unsigned int).
|
||||
(tmp2 Xmm (x64_cmpps tmp2 tmp1 (FcmpImm.LessThanOrEqual)))
|
||||
|
||||
;; Convert those set of lanes that have the max_signed_int factored out.
|
||||
(tmp1 Xmm (x64_cvttps2dq $F32X4 tmp1))
|
||||
|
||||
;; Prepare converted lanes by zeroing negative lanes and prepping lanes
|
||||
;; that have positive overflow (based on the mask) by setting these lanes
|
||||
;; to 0x7FFFFFFF
|
||||
(tmp1 Xmm (x64_pxor tmp1 tmp2))
|
||||
(tmp2 Xmm (x64_pxor tmp2 tmp2)) ;; make another zero
|
||||
(tmp1 Xmm (x64_pmaxsd tmp1 tmp2)))
|
||||
|
||||
;; Add this second set of converted lanes to the original to properly handle
|
||||
;; values greater than max signed int.
|
||||
(x64_paddd tmp1 dst)))
|
||||
|
||||
Reference in New Issue
Block a user