Rework the ISA flag checking extractors for x64 (#4878)
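Using fallible extractors that produce no values for flag checks means that it's not possible to pattern-match cases where those flags are false: a rule can only fire when the flag is set. This change reworks the existing flag-checking extractors to be infallible, returning the flag's boolean value from the context instead, so rules now match on that value explicitly (e.g. `(use_sse41 $true)` in place of `(use_sse41)`).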
This commit is contained in:
@@ -939,8 +939,8 @@
|
||||
|
||||
;; With AVX-512 we can implement `i64x2` multiplication with a single
|
||||
;; instruction.
|
||||
(rule (lower (has_type (and (avx512vl_enabled)
|
||||
(avx512dq_enabled)
|
||||
(rule (lower (has_type (and (avx512vl_enabled $true)
|
||||
(avx512dq_enabled $true)
|
||||
(multi_lane 64 2))
|
||||
(imul x y)))
|
||||
(x64_vpmullq x y))
|
||||
@@ -1167,8 +1167,8 @@
|
||||
(x64_pabsd x))
|
||||
|
||||
;; When AVX512 is available, we can use a single `vpabsq` instruction.
|
||||
(rule (lower (has_type (and (avx512vl_enabled)
|
||||
(avx512f_enabled)
|
||||
(rule (lower (has_type (and (avx512vl_enabled $true)
|
||||
(avx512f_enabled $true)
|
||||
$I64X2)
|
||||
(iabs x)))
|
||||
(x64_vpabsq x))
|
||||
@@ -1733,7 +1733,7 @@
|
||||
(rule 1 (lower
|
||||
(has_type (and
|
||||
(ty_32_or_64 ty)
|
||||
(use_lzcnt))
|
||||
(use_lzcnt $true))
|
||||
(clz src)))
|
||||
(x64_lzcnt ty src))
|
||||
|
||||
@@ -1775,7 +1775,7 @@
|
||||
(rule 1 (lower
|
||||
(has_type (and
|
||||
(ty_32_or_64 ty)
|
||||
(use_bmi1))
|
||||
(use_bmi1 $true))
|
||||
(ctz src)))
|
||||
(x64_tzcnt ty src))
|
||||
|
||||
@@ -1811,21 +1811,21 @@
|
||||
(rule 1 (lower
|
||||
(has_type (and
|
||||
(ty_32_or_64 ty)
|
||||
(use_popcnt))
|
||||
(use_popcnt $true))
|
||||
(popcnt src)))
|
||||
(x64_popcnt ty src))
|
||||
|
||||
(rule 1 (lower
|
||||
(has_type (and
|
||||
(ty_8_or_16 ty)
|
||||
(use_popcnt))
|
||||
(use_popcnt $true))
|
||||
(popcnt src)))
|
||||
(x64_popcnt $I32 (extend_to_gpr src $I32 (ExtendKind.Zero))))
|
||||
|
||||
(rule 1 (lower
|
||||
(has_type (and
|
||||
$I128
|
||||
(use_popcnt))
|
||||
(use_popcnt $true))
|
||||
(popcnt src)))
|
||||
(let ((lo_count Gpr (x64_popcnt $I64 (value_regs_get_gpr src 0)))
|
||||
(hi_count Gpr (x64_popcnt $I64 (value_regs_get_gpr src 1))))
|
||||
@@ -1916,8 +1916,8 @@
|
||||
|
||||
(rule 1 (lower (has_type (and
|
||||
$I8X16
|
||||
(avx512vl_enabled)
|
||||
(avx512bitalg_enabled))
|
||||
(avx512vl_enabled $true)
|
||||
(avx512bitalg_enabled $true))
|
||||
(popcnt src)))
|
||||
(x64_vpopcntb src))
|
||||
|
||||
@@ -2480,13 +2480,13 @@
|
||||
(libcall_3 (LibCall.FmaF32) x y z))
|
||||
(rule (lower (has_type $F64 (fma x y z)))
|
||||
(libcall_3 (LibCall.FmaF64) x y z))
|
||||
(rule 1 (lower (has_type (and (use_fma) $F32) (fma x y z)))
|
||||
(rule 1 (lower (has_type (and (use_fma $true) $F32) (fma x y z)))
|
||||
(x64_vfmadd213ss x y z))
|
||||
(rule 1 (lower (has_type (and (use_fma) $F64) (fma x y z)))
|
||||
(rule 1 (lower (has_type (and (use_fma $true) $F64) (fma x y z)))
|
||||
(x64_vfmadd213sd x y z))
|
||||
(rule (lower (has_type (and (use_fma) $F32X4) (fma x y z)))
|
||||
(rule (lower (has_type (and (use_fma $true) $F32X4) (fma x y z)))
|
||||
(x64_vfmadd213ps x y z))
|
||||
(rule (lower (has_type (and (use_fma) $F64X2) (fma x y z)))
|
||||
(rule (lower (has_type (and (use_fma $true) $F64X2) (fma x y z)))
|
||||
(x64_vfmadd213pd x y z))
|
||||
|
||||
;; Rules for `load*` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
@@ -2993,7 +2993,7 @@
|
||||
;;
|
||||
;; NOTE: the priority of 1 here is to break ties with the next case for $F32X4,
|
||||
;; as it doesn't require either of the avx512 extensions to be enabled.
|
||||
(rule 1 (lower (has_type (and (avx512vl_enabled) (avx512f_enabled) $F32X4)
|
||||
(rule 1 (lower (has_type (and (avx512vl_enabled $true) (avx512f_enabled $true) $F32X4)
|
||||
(fcvt_from_uint src)))
|
||||
(x64_vcvtudq2ps src))
|
||||
|
||||
@@ -3332,82 +3332,82 @@
|
||||
|
||||
;; Rules for `ceil` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
(rule (lower (has_type (use_sse41) (ceil a @ (value_type $F32))))
|
||||
(rule (lower (has_type (use_sse41 $true) (ceil a @ (value_type $F32))))
|
||||
(x64_roundss a (RoundImm.RoundUp)))
|
||||
|
||||
(rule (lower (ceil a @ (value_type $F32)))
|
||||
(libcall_1 (LibCall.CeilF32) a))
|
||||
|
||||
(rule (lower (has_type (use_sse41) (ceil a @ (value_type $F64))))
|
||||
(rule (lower (has_type (use_sse41 $true) (ceil a @ (value_type $F64))))
|
||||
(x64_roundsd a (RoundImm.RoundUp)))
|
||||
|
||||
(rule (lower (ceil a @ (value_type $F64)))
|
||||
(libcall_1 (LibCall.CeilF64) a))
|
||||
|
||||
(rule (lower (has_type (use_sse41) (ceil a @ (value_type $F32X4))))
|
||||
(rule (lower (has_type (use_sse41 $true) (ceil a @ (value_type $F32X4))))
|
||||
(x64_roundps a (RoundImm.RoundUp)))
|
||||
|
||||
(rule (lower (has_type (use_sse41) (ceil a @ (value_type $F64X2))))
|
||||
(rule (lower (has_type (use_sse41 $true) (ceil a @ (value_type $F64X2))))
|
||||
(x64_roundpd a (RoundImm.RoundUp)))
|
||||
|
||||
;; Rules for `floor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
(rule (lower (has_type (use_sse41) (floor a @ (value_type $F32))))
|
||||
(rule (lower (has_type (use_sse41 $true) (floor a @ (value_type $F32))))
|
||||
(x64_roundss a (RoundImm.RoundDown)))
|
||||
|
||||
(rule (lower (floor a @ (value_type $F32)))
|
||||
(libcall_1 (LibCall.FloorF32) a))
|
||||
|
||||
(rule (lower (has_type (use_sse41) (floor a @ (value_type $F64))))
|
||||
(rule (lower (has_type (use_sse41 $true) (floor a @ (value_type $F64))))
|
||||
(x64_roundsd a (RoundImm.RoundDown)))
|
||||
|
||||
(rule (lower (floor a @ (value_type $F64)))
|
||||
(libcall_1 (LibCall.FloorF64) a))
|
||||
|
||||
(rule (lower (has_type (use_sse41) (floor a @ (value_type $F32X4))))
|
||||
(rule (lower (has_type (use_sse41 $true) (floor a @ (value_type $F32X4))))
|
||||
(x64_roundps a (RoundImm.RoundDown)))
|
||||
|
||||
(rule (lower (has_type (use_sse41) (floor a @ (value_type $F64X2))))
|
||||
(rule (lower (has_type (use_sse41 $true) (floor a @ (value_type $F64X2))))
|
||||
(x64_roundpd a (RoundImm.RoundDown)))
|
||||
|
||||
;; Rules for `nearest` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
(rule (lower (has_type (use_sse41) (nearest a @ (value_type $F32))))
|
||||
(rule (lower (has_type (use_sse41 $true) (nearest a @ (value_type $F32))))
|
||||
(x64_roundss a (RoundImm.RoundNearest)))
|
||||
|
||||
(rule (lower (nearest a @ (value_type $F32)))
|
||||
(libcall_1 (LibCall.NearestF32) a))
|
||||
|
||||
(rule (lower (has_type (use_sse41) (nearest a @ (value_type $F64))))
|
||||
(rule (lower (has_type (use_sse41 $true) (nearest a @ (value_type $F64))))
|
||||
(x64_roundsd a (RoundImm.RoundNearest)))
|
||||
|
||||
(rule (lower (nearest a @ (value_type $F64)))
|
||||
(libcall_1 (LibCall.NearestF64) a))
|
||||
|
||||
(rule (lower (has_type (use_sse41) (nearest a @ (value_type $F32X4))))
|
||||
(rule (lower (has_type (use_sse41 $true) (nearest a @ (value_type $F32X4))))
|
||||
(x64_roundps a (RoundImm.RoundNearest)))
|
||||
|
||||
(rule (lower (has_type (use_sse41) (nearest a @ (value_type $F64X2))))
|
||||
(rule (lower (has_type (use_sse41 $true) (nearest a @ (value_type $F64X2))))
|
||||
(x64_roundpd a (RoundImm.RoundNearest)))
|
||||
|
||||
;; Rules for `trunc` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
(rule (lower (has_type (use_sse41) (trunc a @ (value_type $F32))))
|
||||
(rule (lower (has_type (use_sse41 $true) (trunc a @ (value_type $F32))))
|
||||
(x64_roundss a (RoundImm.RoundZero)))
|
||||
|
||||
(rule (lower (trunc a @ (value_type $F32)))
|
||||
(libcall_1 (LibCall.TruncF32) a))
|
||||
|
||||
(rule (lower (has_type (use_sse41) (trunc a @ (value_type $F64))))
|
||||
(rule (lower (has_type (use_sse41 $true) (trunc a @ (value_type $F64))))
|
||||
(x64_roundsd a (RoundImm.RoundZero)))
|
||||
|
||||
(rule (lower (trunc a @ (value_type $F64)))
|
||||
(libcall_1 (LibCall.TruncF64) a))
|
||||
|
||||
(rule (lower (has_type (use_sse41) (trunc a @ (value_type $F32X4))))
|
||||
(rule (lower (has_type (use_sse41 $true) (trunc a @ (value_type $F32X4))))
|
||||
(x64_roundps a (RoundImm.RoundZero)))
|
||||
|
||||
(rule (lower (has_type (use_sse41) (trunc a @ (value_type $F64X2))))
|
||||
(rule (lower (has_type (use_sse41 $true) (trunc a @ (value_type $F64X2))))
|
||||
(x64_roundpd a (RoundImm.RoundZero)))
|
||||
|
||||
;; Rules for `stack_addr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
@@ -3506,7 +3506,7 @@
|
||||
;; For the case where the shuffle mask contains out-of-bounds values (values
|
||||
;; greater than 31) we must mask off those resulting values in the result of
|
||||
;; `vpermi2b`.
|
||||
(rule (lower (has_type (and (avx512vl_enabled) (avx512vbmi_enabled))
|
||||
(rule (lower (has_type (and (avx512vl_enabled $true) (avx512vbmi_enabled $true))
|
||||
(shuffle a b (vec_mask_from_immediate
|
||||
(perm_from_mask_with_zeros mask zeros)))))
|
||||
(x64_andps
|
||||
@@ -3515,7 +3515,7 @@
|
||||
|
||||
;; However, if the shuffle mask contains no out-of-bounds values, we can use
|
||||
;; `vpermi2b` without any masking.
|
||||
(rule (lower (has_type (and (avx512vl_enabled) (avx512vbmi_enabled))
|
||||
(rule (lower (has_type (and (avx512vl_enabled $true) (avx512vbmi_enabled $true))
|
||||
(shuffle a b (vec_mask_from_immediate mask))))
|
||||
(x64_vpermi2b b a (x64_xmm_load_const $I8X16 (perm_from_mask mask))))
|
||||
|
||||
|
||||
Reference in New Issue
Block a user