x64: port icmp to ISLE (#3886)

* x64: port GPR-held `icmp` to ISLE
* x64: port equality `icmp` for i128 type
* x64: port `icmp` for vector types
* x64: rename from_intcc to intcc_to_cc
This commit is contained in:
Andrew Brown
2022-03-18 11:22:09 -07:00
committed by GitHub
parent 8cfb552090
commit e92cbfb283
8 changed files with 1145 additions and 549 deletions

View File

@@ -1440,6 +1440,107 @@
;; `resumable_trap` lowers to a `ud2` carrying the trap code, with the result
;; wrapped in `safepoint`.
(rule (lower (resumable_trap trap_code))
      (safepoint
        (ud2 trap_code)))
;;;; Rules for `icmp` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; For GPR-held values we only need to emit `CMP + SETCC`. We rely here on
;; Cranelift's verification that `a` and `b` are of the same type.
;; Unfortunately for clarity, the registers are flipped here (TODO).
(rule (lower (icmp cond_code lhs @ (value_type (fits_in_64 ty)) rhs))
      ;; The operand size comes from the type of `lhs`. Note that the operands
      ;; are handed to `cmp` in flipped order: `rhs` first, then `lhs`.
      (let ((opsize OperandSize (raw_operand_size_of_type ty)))
        (with_flags
          (cmp opsize rhs lhs)
          (setcc cond_code))))
;; For XMM-held values, we lower to `PCMP*` instructions, sometimes more than
;; one. To note: what is different here about the output values is that each
;; lane will be filled with all 1s or all 0s according to the comparison,
;; whereas for GPR-held values, the result will be simply 0 or 1 (upper bits
;; unset).
(rule (lower (icmp (IntCC.Equal) lhs @ (value_type (vec128 ty)) rhs))
      ;; Lane-wise equality is a single `pcmpeq` on the two operands.
      (pcmpeq ty lhs rhs))
;; To lower a not-equals comparison, we perform an equality comparison
;; (PCMPEQ*) and then invert the bits (PXOR with all 1s).
(rule (lower (icmp (IntCC.NotEqual) lhs @ (value_type (vec128 ty)) rhs))
      (let ((eq_mask Xmm (pcmpeq ty lhs rhs))
            (ones Xmm (vector_all_ones ty)))
        ;; XOR against all 1s flips every bit of the equality mask.
        (pxor eq_mask ones)))
;; Signed comparisons have a single-instruction lowering, unlike their unsigned
;; counterparts. These latter instructions use the unsigned min/max
;; (PMINU*/PMAXU*) and negate the result (PXOR with all 1s).
(rule (lower (icmp (IntCC.SignedGreaterThan) lhs @ (value_type (vec128 ty)) rhs))
      ;; `lhs > rhs` maps directly onto `pcmpgt` with operands in order.
      (pcmpgt ty lhs rhs))
(rule (lower (icmp (IntCC.SignedLessThan) lhs @ (value_type (vec128 ty)) rhs))
      ;; `lhs < rhs` is `rhs > lhs`, so the operands to `pcmpgt` are swapped.
      (pcmpgt ty rhs lhs))
(rule (lower (icmp (IntCC.UnsignedGreaterThan) lhs @ (value_type (vec128 ty)) rhs))
      ;; `max(lhs, rhs) == rhs` marks the lanes where `lhs <= rhs`; inverting
      ;; that mask leaves the lanes where `lhs > rhs`.
      (let ((umax Xmm (pmaxu ty lhs rhs))
            (le_mask Xmm (pcmpeq ty umax rhs))
            (ones Xmm (vector_all_ones ty)))
        (pxor le_mask ones)))
(rule (lower (icmp (IntCC.UnsignedLessThan) lhs @ (value_type (vec128 ty)) rhs))
      ;; `min(lhs, rhs) == rhs` marks the lanes where `lhs >= rhs`; inverting
      ;; that mask leaves the lanes where `lhs < rhs`.
      (let ((umin Xmm (pminu ty lhs rhs))
            (ge_mask Xmm (pcmpeq ty umin rhs))
            (ones Xmm (vector_all_ones ty)))
        (pxor ge_mask ones)))
;; To lower signed and unsigned *-or-equals comparisons, we find the minimum or
;; maximum (PMIN[U|S]*/PMAX[U|S]*) and compare that to one of the terms
;; (PCMPEQ*). Note that there is no 64x2 version of this lowering (see below).
(rule (lower (icmp (IntCC.SignedGreaterThanOrEqual) lhs @ (value_type (vec128 ty)) rhs))
      ;; `lhs >= rhs` exactly when `lhs` is itself the signed maximum.
      (let ((smax Xmm (pmaxs ty lhs rhs)))
        (pcmpeq ty lhs smax)))
(rule (lower (icmp (IntCC.SignedLessThanOrEqual) lhs @ (value_type (vec128 ty)) rhs))
      ;; `lhs <= rhs` exactly when `lhs` is itself the signed minimum.
      (let ((smin Xmm (pmins ty lhs rhs)))
        (pcmpeq ty lhs smin)))
(rule (lower (icmp (IntCC.UnsignedGreaterThanOrEqual) lhs @ (value_type (vec128 ty)) rhs))
      ;; `lhs >= rhs` exactly when `lhs` is itself the unsigned maximum.
      (let ((umax Xmm (pmaxu ty lhs rhs)))
        (pcmpeq ty lhs umax)))
(rule (lower (icmp (IntCC.UnsignedLessThanOrEqual) lhs @ (value_type (vec128 ty)) rhs))
      ;; `lhs <= rhs` exactly when `lhs` is itself the unsigned minimum.
      (let ((umin Xmm (pminu ty lhs rhs)))
        (pcmpeq ty lhs umin)))
;; The PMIN[S|U]Q instruction is only available in AVX512VL/F so we must instead
;; compare with flipped operands (PCMPGT*) and negate the result (PXOR with all
;; 1s), emitting one more instruction than the smaller-lane versions.
(rule (lower (icmp (IntCC.SignedGreaterThanOrEqual) lhs @ (value_type $I64X2) rhs))
      ;; `lhs >= rhs` is the negation of `rhs > lhs`: compare with swapped
      ;; operands, then flip every bit of the resulting mask.
      (let ((lt_mask Xmm (pcmpgt $I64X2 rhs lhs))
            (ones Xmm (vector_all_ones $I64X2)))
        (pxor lt_mask ones)))
(rule (lower (icmp (IntCC.SignedLessThanOrEqual) lhs @ (value_type $I64X2) rhs))
      ;; `lhs <= rhs` is the negation of `lhs > rhs`: compare, then flip every
      ;; bit of the resulting mask.
      (let ((gt_mask Xmm (pcmpgt $I64X2 lhs rhs))
            (ones Xmm (vector_all_ones $I64X2)))
        (pxor gt_mask ones)))
;; TODO: not used by WebAssembly translation
;; (rule (lower (icmp (IntCC.UnsignedGreaterThanOrEqual) a @ (value_type $I64X2) b))
;; TODO: not used by WebAssembly translation
;; (rule (lower (icmp (IntCC.UnsignedLessThanOrEqual) a @ (value_type $I64X2) b))
;; For I128 values (held in two GPRs), the instruction sequences depend on what
;; kind of condition is tested.
(rule (lower (icmp (IntCC.Equal) lhs @ (value_type $I128) rhs))
      (let ((lhs_lo Gpr (value_regs_get_gpr lhs 0))
            (lhs_hi Gpr (value_regs_get_gpr lhs 1))
            (rhs_lo Gpr (value_regs_get_gpr rhs 0))
            (rhs_hi Gpr (value_regs_get_gpr rhs 1))
            ;; Compare the two 64-bit halves independently, capturing the
            ;; "equal" (`Z`) outcome of each comparison in a register.
            (lo_eq Reg (with_flags_reg
                         (cmp (OperandSize.Size64) rhs_lo lhs_lo)
                         (setcc (CC.Z))))
            (hi_eq Reg (with_flags_reg
                         (cmp (OperandSize.Size64) rhs_hi lhs_hi)
                         (setcc (CC.Z))))
            ;; `SETcc` guarantees 0 or 1 in the lowest 8 bits of `lo_eq` and
            ;; `hi_eq`, but the upper bits may be unchanged, which is why the
            ;; final step compares against 1. This `AND` merges the two
            ;; per-half results ahead of that final comparison.
            (both_eq Reg (x64_and $I64 lo_eq hi_eq)))
        ;; The flag-setting behavior of `AND` produces the final answer: `ZF`
        ;; is set when the `AND` result is zero, and a zero result means at
        ;; least one half was not equal. Hence the `SETcc` uses `NZ`.
        (with_flags
          (x64_and_with_flags_paired $I64 both_eq (RegMemImm.Imm 1))
          (setcc (CC.NZ)))))
(rule (lower (icmp (IntCC.NotEqual) lhs @ (value_type $I128) rhs))
      (let ((lhs_lo Gpr (value_regs_get_gpr lhs 0))
            (lhs_hi Gpr (value_regs_get_gpr lhs 1))
            (rhs_lo Gpr (value_regs_get_gpr rhs 0))
            (rhs_hi Gpr (value_regs_get_gpr rhs 1))
            ;; Compare the two 64-bit halves independently, capturing the
            ;; "not equal" (`NZ`) outcome of each comparison in a register.
            (lo_ne Reg (with_flags_reg
                         (cmp (OperandSize.Size64) rhs_lo lhs_lo)
                         (setcc (CC.NZ))))
            (hi_ne Reg (with_flags_reg
                         (cmp (OperandSize.Size64) rhs_hi lhs_hi)
                         (setcc (CC.NZ))))
            ;; The values differ if either half differs, so the per-half
            ;; results are merged with `OR`.
            (any_ne Reg (or $I64 lo_ne hi_ne)))
        ;; Mask against 1 with a flag-setting `AND` and read the result back
        ;; with `NZ`, the same final step as the `IntCC.Equal` lowering.
        (with_flags
          (x64_and_with_flags_paired $I64 any_ne (RegMemImm.Imm 1))
          (setcc (CC.NZ)))))
;;;; Rules for `select` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; CLIF `select` instructions receive a testable argument (i.e. boolean or
@@ -1715,8 +1816,8 @@
(mul Gpr (mul $I32 masked4 (RegMemImm.Imm 0x01010101)))
(final Gpr (shr $I32 mul (Imm8Reg.Imm8 24))))
final))
(rule 1 (lower (has_type (and
$I8X16
(avx512vl_enabled)
@@ -1725,7 +1826,7 @@
(vpopcntb src))
;; For SSE 4.2 we use Mula's algorithm (https://arxiv.org/pdf/1611.07612.pdf):
;;
;; __m128i count_bytes ( __m128i v) {
@@ -1807,7 +1908,7 @@
(shl ty lo4 (Imm8Reg.Imm8 4))
hi4)))
swap4))
(decl do_bitrev16 (Type Gpr) Gpr)
(rule (do_bitrev16 ty src)
(let ((src_ Gpr (do_bitrev8 ty src))
@@ -1819,7 +1920,7 @@
(shl ty lo8 (Imm8Reg.Imm8 8))
hi8)))
swap8))
(decl do_bitrev32 (Type Gpr) Gpr)
(rule (do_bitrev32 ty src)
(let ((src_ Gpr (do_bitrev16 ty src))