x64: port icmp to ISLE (#3886)
* x64: port GPR-held `icmp` to ISLE * x64: port equality `icmp` for i128 type * x64: port `icmp` for vector types * x64: rename from_intcc to intcc_to_cc
This commit is contained in:
@@ -1440,6 +1440,107 @@
|
||||
;; Lower `resumable_trap` by emitting `ud2` with the trap code and passing the
;; result through `safepoint`.
(rule (lower (resumable_trap trap_code))
      (safepoint (ud2 trap_code)))
|
||||
|
||||
;;;; Rules for `icmp` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;; Integer comparisons whose operands live in general-purpose registers lower
;; to a `CMP` followed by `SETcc`. Both operands are assumed to have the same
;; type; checking that is left to Cranelift's verifier. Note that the operands
;; are handed to `cmp` in swapped order (`rhs` first, then `lhs`); this hurts
;; readability and is a known TODO.
(rule (lower (icmp cond lhs @ (value_type (fits_in_64 ty)) rhs))
      (let ((op_size OperandSize (raw_operand_size_of_type ty)))
        (with_flags (cmp op_size rhs lhs) (setcc cond))))
|
||||
|
||||
;; Vector comparisons (operands held in XMM registers) lower to one or more
;; `PCMP*` instructions. Unlike the GPR case, where the result is simply 0 or 1
;; with the upper bits unset, each result lane here is filled with all 1s or
;; all 0s according to the comparison.
;;
;; Lane-wise equality is a single PCMPEQ*.
(rule (lower (icmp (IntCC.Equal) lhs @ (value_type (vec128 ty)) rhs))
      (pcmpeq ty lhs rhs))
|
||||
;; Not-equals: compute lane-wise equality (PCMPEQ*) and then flip every bit of
;; the result by XORing it with an all-ones vector (PXOR).
(rule (lower (icmp (IntCC.NotEqual) lhs @ (value_type (vec128 ty)) rhs))
      (let ((equal_lanes Xmm (pcmpeq ty lhs rhs))
            (ones Xmm (vector_all_ones ty)))
        (pxor equal_lanes ones)))
|
||||
;; Signed greater-than/less-than need only a single instruction. Their unsigned
;; counterparts instead go through the unsigned min/max (PMINU*/PMAXU*), an
;; equality check, and a negation of the result (PXOR with all 1s).
(rule (lower (icmp (IntCC.SignedGreaterThan) lhs @ (value_type (vec128 ty)) rhs))
      (pcmpgt ty lhs rhs))
|
||||
;; Signed `lhs < rhs` is computed as `rhs > lhs`: the same helper as the
;; greater-than rule, with the operands swapped.
(rule (lower (icmp (IntCC.SignedLessThan) lhs @ (value_type (vec128 ty)) rhs))
      (pcmpgt ty rhs lhs))
|
||||
;; Unsigned `lhs > rhs`: a lane of `max(lhs, rhs)` equals `rhs` exactly when
;; `lhs <= rhs`, so compare the unsigned maximum against `rhs` (PCMPEQ*) and
;; invert the result (PXOR with all 1s).
(rule (lower (icmp (IntCC.UnsignedGreaterThan) lhs @ (value_type (vec128 ty)) rhs))
      (let ((larger Xmm (pmaxu ty lhs rhs))
            (not_greater Xmm (pcmpeq ty larger rhs))
            (ones Xmm (vector_all_ones ty)))
        (pxor not_greater ones)))
|
||||
;; Unsigned `lhs < rhs`: a lane of `min(lhs, rhs)` equals `rhs` exactly when
;; `lhs >= rhs`, so compare the unsigned minimum against `rhs` (PCMPEQ*) and
;; invert the result (PXOR with all 1s).
(rule (lower (icmp (IntCC.UnsignedLessThan) lhs @ (value_type (vec128 ty)) rhs))
      (let ((smaller Xmm (pminu ty lhs rhs))
            (not_less Xmm (pcmpeq ty smaller rhs))
            (ones Xmm (vector_all_ones ty)))
        (pxor not_less ones)))
|
||||
;; The signed and unsigned *-or-equals comparisons take the lane-wise minimum
;; or maximum (PMIN[U|S]*/PMAX[U|S]*) and compare it against the first operand
;; (PCMPEQ*). Note that there is no 64x2 version of this lowering (see below).
;;
;; Signed `lhs >= rhs` holds exactly when `max(lhs, rhs) == lhs`.
(rule (lower (icmp (IntCC.SignedGreaterThanOrEqual) lhs @ (value_type (vec128 ty)) rhs))
      (let ((larger Xmm (pmaxs ty lhs rhs)))
        (pcmpeq ty lhs larger)))
|
||||
;; Signed `lhs <= rhs` holds exactly when `min(lhs, rhs) == lhs`.
(rule (lower (icmp (IntCC.SignedLessThanOrEqual) lhs @ (value_type (vec128 ty)) rhs))
      (let ((smaller Xmm (pmins ty lhs rhs)))
        (pcmpeq ty lhs smaller)))
|
||||
;; Unsigned `lhs >= rhs` holds exactly when the unsigned `max(lhs, rhs) == lhs`.
(rule (lower (icmp (IntCC.UnsignedGreaterThanOrEqual) lhs @ (value_type (vec128 ty)) rhs))
      (let ((larger Xmm (pmaxu ty lhs rhs)))
        (pcmpeq ty lhs larger)))
|
||||
;; Unsigned `lhs <= rhs` holds exactly when the unsigned `min(lhs, rhs) == lhs`.
(rule (lower (icmp (IntCC.UnsignedLessThanOrEqual) lhs @ (value_type (vec128 ty)) rhs))
      (let ((smaller Xmm (pminu ty lhs rhs)))
        (pcmpeq ty lhs smaller)))
|
||||
;; PMIN[S|U]Q is only available with AVX512VL/F, so for 64x2 lanes we instead
;; compare with flipped operands (PCMPGT*) and negate the result (PXOR with all
;; 1s). This emits one more instruction than the smaller-lane versions.
;;
;; Signed `lhs >= rhs` is computed as `!(rhs > lhs)`.
(rule (lower (icmp (IntCC.SignedGreaterThanOrEqual) lhs @ (value_type $I64X2) rhs))
      (let ((less_than Xmm (pcmpgt $I64X2 rhs lhs))
            (ones Xmm (vector_all_ones $I64X2)))
        (pxor less_than ones)))
|
||||
;; Signed `lhs <= rhs` on 64x2 lanes is computed as `!(lhs > rhs)`: PCMPGT*
;; followed by a PXOR with all 1s.
(rule (lower (icmp (IntCC.SignedLessThanOrEqual) lhs @ (value_type $I64X2) rhs))
      (let ((greater_than Xmm (pcmpgt $I64X2 lhs rhs))
            (ones Xmm (vector_all_ones $I64X2)))
        (pxor greater_than ones)))
|
||||
;; TODO: not used by WebAssembly translation
|
||||
;; (rule (lower (icmp (IntCC.UnsignedGreaterThanOrEqual) a @ (value_type $I64X2) b))
|
||||
;; TODO: not used by WebAssembly translation
|
||||
;; (rule (lower (icmp (IntCC.UnsignedLessThanOrEqual) a @ (value_type $I64X2) b))
|
||||
|
||||
;; I128 values are held in two GPRs; the instruction sequence depends on which
;; condition is being tested.
;;
;; Equality: both 64-bit halves must compare equal.
(rule (lower (icmp (IntCC.Equal) lhs @ (value_type $I128) rhs))
      (let ((lhs_lo Gpr (value_regs_get_gpr lhs 0))
            (lhs_hi Gpr (value_regs_get_gpr lhs 1))
            (rhs_lo Gpr (value_regs_get_gpr rhs 0))
            (rhs_hi Gpr (value_regs_get_gpr rhs 1))
            (lo_eq Reg (with_flags_reg (cmp (OperandSize.Size64) rhs_lo lhs_lo) (setcc (CC.Z))))
            (hi_eq Reg (with_flags_reg (cmp (OperandSize.Size64) rhs_hi lhs_hi) (setcc (CC.Z))))
            ;; `SETcc` leaves 0 or 1 in the lowest 8 bits of `lo_eq` and
            ;; `hi_eq`, but the upper bits may be unchanged, so the final test
            ;; has to be made against 1. This `AND` merges the two halves for
            ;; that final test.
            (both_eq Reg (x64_and $I64 lo_eq hi_eq)))
        ;; The last `AND` with 1 is used for its flags: `ZF` is set when the
        ;; result is zero, which happens if either half compared unequal.
        ;; `SETcc` with `NZ` therefore yields 1 only when both halves matched.
        (with_flags (x64_and_with_flags_paired $I64 both_eq (RegMemImm.Imm 1)) (setcc (CC.NZ)))))
|
||||
|
||||
;; Inequality for I128: the mirror image of the `IntCC.Equal` lowering. Each
;; 64-bit half is tested with `NZ` and the per-half results are combined with
;; `OR`, so the final value is 1 when either half differs.
(rule (lower (icmp (IntCC.NotEqual) lhs @ (value_type $I128) rhs))
      (let ((lhs_lo Gpr (value_regs_get_gpr lhs 0))
            (lhs_hi Gpr (value_regs_get_gpr lhs 1))
            (rhs_lo Gpr (value_regs_get_gpr rhs 0))
            (rhs_hi Gpr (value_regs_get_gpr rhs 1))
            (lo_ne Reg (with_flags_reg (cmp (OperandSize.Size64) rhs_lo lhs_lo) (setcc (CC.NZ))))
            (hi_ne Reg (with_flags_reg (cmp (OperandSize.Size64) rhs_hi lhs_hi) (setcc (CC.NZ))))
            ;; As in the `IntCC.Equal` rule, only the low bits written by
            ;; `SETcc` are meaningful, hence the final `AND` with 1 below.
            (either_ne Reg (or $I64 lo_ne hi_ne)))
        (with_flags (x64_and_with_flags_paired $I64 either_ne (RegMemImm.Imm 1)) (setcc (CC.NZ)))))
|
||||
|
||||
;;;; Rules for `select` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;; CLIF `select` instructions receive a testable argument (i.e. boolean or
|
||||
@@ -1715,8 +1816,8 @@
|
||||
(mul Gpr (mul $I32 masked4 (RegMemImm.Imm 0x01010101)))
|
||||
(final Gpr (shr $I32 mul (Imm8Reg.Imm8 24))))
|
||||
final))
|
||||
|
||||
|
||||
|
||||
|
||||
(rule 1 (lower (has_type (and
|
||||
$I8X16
|
||||
(avx512vl_enabled)
|
||||
@@ -1725,7 +1826,7 @@
|
||||
(vpopcntb src))
|
||||
|
||||
|
||||
|
||||
|
||||
;; For SSE 4.2 we use Mula's algorithm (https://arxiv.org/pdf/1611.07612.pdf):
|
||||
;;
|
||||
;; __m128i count_bytes ( __m128i v) {
|
||||
@@ -1807,7 +1908,7 @@
|
||||
(shl ty lo4 (Imm8Reg.Imm8 4))
|
||||
hi4)))
|
||||
swap4))
|
||||
|
||||
|
||||
(decl do_bitrev16 (Type Gpr) Gpr)
|
||||
(rule (do_bitrev16 ty src)
|
||||
(let ((src_ Gpr (do_bitrev8 ty src))
|
||||
@@ -1819,7 +1920,7 @@
|
||||
(shl ty lo8 (Imm8Reg.Imm8 8))
|
||||
hi8)))
|
||||
swap8))
|
||||
|
||||
|
||||
(decl do_bitrev32 (Type Gpr) Gpr)
|
||||
(rule (do_bitrev32 ty src)
|
||||
(let ((src_ Gpr (do_bitrev16 ty src))
|
||||
|
||||
Reference in New Issue
Block a user