x64: port icmp to ISLE (#3886)

* x64: port GPR-held `icmp` to ISLE * x64: port equality `icmp` for i128 type * x64: port `icmp` for vector types * x64: rename from_intcc to intcc_to_cc
2022-03-18 11:22:09 -07:00
parent 8cfb552090
commit e92cbfb283
8 changed files with 1145 additions and 549 deletions
--- a/cranelift/codegen/src/isa/x64/lower.isle
+++ b/cranelift/codegen/src/isa/x64/lower.isle
@@ -1440,6 +1440,107 @@
 (rule (lower (resumable_trap code))
      (safepoint (ud2 code)))

+;;;; Rules for `icmp` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; For GPR-held values we only need to emit `CMP + SETCC`. We rely here on
+;; Cranelift's verification that `a` and `b` are of the same type. Note that,
+;; confusingly, the operands are passed to `cmp` flipped: `b`, then `a` (TODO).
+(rule (lower (icmp cc a @ (value_type (fits_in_64 ty)) b))
+      (let ((operand_size OperandSize (raw_operand_size_of_type ty)))
+           (with_flags (cmp operand_size b a) (setcc cc))))
+
+;; For XMM-held values, we lower to `PCMP*` instructions, sometimes more than
+;; one. Note that the output values differ from the GPR-held case: here each
+;; lane will be filled with all 1s or all 0s according to the comparison,
+;; whereas for GPR-held values, the result will be simply 0 or 1 (upper bits
+;; unset).
+(rule (lower (icmp (IntCC.Equal) a @ (value_type (vec128 ty)) b))
+      (pcmpeq ty a b))
+;; Not-equals is the bitwise inverse of equals: run the equality comparison
+;; (PCMPEQ*), then flip every bit of the result (PXOR with all 1s).
+(rule (lower (icmp (IntCC.NotEqual) a @ (value_type (vec128 ty)) b))
+      (let ((lanes_eq Xmm (pcmpeq ty a b))
+            (ones Xmm (vector_all_ones ty)))
+           (pxor lanes_eq ones)))
+;; Signed comparisons have a single-instruction lowering, unlike their unsigned
+;; counterparts. The latter take the unsigned min/max (PMINU*/PMAXU*), compare
+;; it with `b` (PCMPEQ*), and negate the result (PXOR with all 1s).
+(rule (lower (icmp (IntCC.SignedGreaterThan) a @ (value_type (vec128 ty)) b))
+      (pcmpgt ty a b))
+(rule (lower (icmp (IntCC.SignedLessThan) a @ (value_type (vec128 ty)) b))
+      (pcmpgt ty b a))
+(rule (lower (icmp (IntCC.UnsignedGreaterThan) a @ (value_type (vec128 ty)) b))
+      (let ((umax Xmm (pmaxu ty a b))
+            (max_is_b Xmm (pcmpeq ty umax b))
+            (ones Xmm (vector_all_ones ty)))
+           (pxor max_is_b ones)))
+(rule (lower (icmp (IntCC.UnsignedLessThan) a @ (value_type (vec128 ty)) b))
+      (let ((umin Xmm (pminu ty a b))
+            (min_is_b Xmm (pcmpeq ty umin b))
+            (ones Xmm (vector_all_ones ty)))
+           (pxor min_is_b ones)))
+;; To lower signed and unsigned *-or-equals comparisons, we find the minimum or
+;; maximum (PMIN[U|S]*/PMAX[U|S]*) and compare that to `a` (PCMPEQ*). Note that
+;; there is no 64x2 version of this lowering (see below).
+(rule (lower (icmp (IntCC.SignedGreaterThanOrEqual) a @ (value_type (vec128 ty)) b))
+      (let ((smax Xmm (pmaxs ty a b)))
+           (pcmpeq ty a smax)))
+(rule (lower (icmp (IntCC.SignedLessThanOrEqual) a @ (value_type (vec128 ty)) b))
+      (let ((smin Xmm (pmins ty a b)))
+           (pcmpeq ty a smin)))
+(rule (lower (icmp (IntCC.UnsignedGreaterThanOrEqual) a @ (value_type (vec128 ty)) b))
+      (let ((umax Xmm (pmaxu ty a b)))
+           (pcmpeq ty a umax)))
+(rule (lower (icmp (IntCC.UnsignedLessThanOrEqual) a @ (value_type (vec128 ty)) b))
+      (let ((umin Xmm (pminu ty a b)))
+           (pcmpeq ty a umin)))
+;; PMIN[S|U]Q/PMAX[S|U]Q are only available in AVX512VL/F so we must instead
+;; test the opposite strict comparison (PCMPGT*) and negate the result (PXOR
+;; with all 1s), emitting one more instruction than the smaller-lane versions.
+(rule (lower (icmp (IntCC.SignedGreaterThanOrEqual) a @ (value_type $I64X2) b))
+      (let ((b_gt_a Xmm (pcmpgt $I64X2 b a))
+            (ones Xmm (vector_all_ones $I64X2)))
+           (pxor b_gt_a ones)))
+(rule (lower (icmp (IntCC.SignedLessThanOrEqual) a @ (value_type $I64X2) b))
+      (let ((a_gt_b Xmm (pcmpgt $I64X2 a b))
+            (ones Xmm (vector_all_ones $I64X2)))
+           (pxor a_gt_b ones)))
+;; TODO: not used by WebAssembly translation
+;; (rule (lower (icmp (IntCC.UnsignedGreaterThanOrEqual) a @ (value_type $I64X2) b))
+;; TODO: not used by WebAssembly translation
+;; (rule (lower (icmp (IntCC.UnsignedLessThanOrEqual) a @ (value_type $I64X2) b))
+
+;; For I128 values (held in two GPRs), the instruction sequences depend on what
+;; kind of condition is tested.
+(rule (lower (icmp (IntCC.Equal) a @ (value_type $I128) b))
+      (let ((a_lo Gpr (value_regs_get_gpr a 0))
+            (a_hi Gpr (value_regs_get_gpr a 1))
+            (b_lo Gpr (value_regs_get_gpr b 0))
+            (b_hi Gpr (value_regs_get_gpr b 1))
+            (cmp_lo Reg (with_flags_reg (cmp (OperandSize.Size64) b_lo a_lo) (setcc (CC.Z))))
+            (cmp_hi Reg (with_flags_reg (cmp (OperandSize.Size64) b_hi a_hi) (setcc (CC.Z))))
+            ;; At this point, `cmp_lo` and `cmp_hi` each hold 0 or 1 in their
+            ;; lowest 8 bits--`SETcc` guarantees this. The upper bits may be left
+            ;; unchanged, so the final test below masks with 1; this `AND` merges
+            ;; `cmp_lo` and `cmp_hi` so that one test covers both halves.
+            (cmp Reg (x64_and $I64 cmp_lo cmp_hi)))
+           ;; We use the flag-setting behavior of `AND` to produce the final
+           ;; result: `cmp AND 1` is zero (setting `ZF`) when either half
+           ;; compared unequal, and non-zero only when both halves were equal,
+           ;; therefore we `SETcc` with `NZ`.
+           (with_flags (x64_and_with_flags_paired $I64 cmp (RegMemImm.Imm 1)) (setcc (CC.NZ)))))
+
+(rule (lower (icmp (IntCC.NotEqual) a @ (value_type $I128) b))
+      (let ((a_lo Gpr (value_regs_get_gpr a 0))
+            (a_hi Gpr (value_regs_get_gpr a 1))
+            (b_lo Gpr (value_regs_get_gpr b 0))
+            (b_hi Gpr (value_regs_get_gpr b 1))
+            (cmp_lo Reg (with_flags_reg (cmp (OperandSize.Size64) b_lo a_lo) (setcc (CC.NZ))))
+            (cmp_hi Reg (with_flags_reg (cmp (OperandSize.Size64) b_hi a_hi) (setcc (CC.NZ))))
+            ;; See `IntCC.Equal`; `OR` since a mismatch in either half suffices.
+            (cmp Reg (or $I64 cmp_lo cmp_hi)))
+           (with_flags (x64_and_with_flags_paired $I64 cmp (RegMemImm.Imm 1)) (setcc (CC.NZ)))))
+
 ;;;; Rules for `select` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

 ;; CLIF `select` instructions receive a testable argument (i.e. boolean or
@@ -1715,8 +1816,8 @@
            (mul Gpr (mul $I32 masked4 (RegMemImm.Imm 0x01010101)))
            (final Gpr (shr $I32 mul (Imm8Reg.Imm8 24))))
        final))
-                       
-            
+
+
 (rule 1 (lower (has_type (and
                          $I8X16
                          (avx512vl_enabled)
@@ -1725,7 +1826,7 @@
      (vpopcntb src))


-      
+
 ;; For SSE 4.2 we use Mula's algorithm (https://arxiv.org/pdf/1611.07612.pdf):
 ;;
 ;; __m128i count_bytes ( __m128i v) {
@@ -1807,7 +1908,7 @@
                           (shl ty lo4 (Imm8Reg.Imm8 4))
                           hi4)))
        swap4))
-                       
+
 (decl do_bitrev16 (Type Gpr) Gpr)
 (rule (do_bitrev16 ty src)
      (let ((src_ Gpr (do_bitrev8 ty src))
@@ -1819,7 +1920,7 @@
                           (shl ty lo8 (Imm8Reg.Imm8 8))
                           hi8)))
        swap8))
-      
+
 (decl do_bitrev32 (Type Gpr) Gpr)
 (rule (do_bitrev32 ty src)
      (let ((src_ Gpr (do_bitrev16 ty src))