x64: port fcmp to ISLE (#3967)

* x64: port scalar `fcmp` to ISLE. Implement the lowering of CLIF's `fcmp` in ISLE. This adds a new type-matcher, `ty_scalar_float`, for detecting uses of `F32` and `F64`. * isle: rename `vec128` to `ty_vec128`. This refactoring changes the name of the `vec128` matcher function to follow the `ty_*` convention of the other type matchers. It also makes the helper an inline function call. * x64: port vector `fcmp` to ISLE.
2022-03-29 15:41:49 -07:00
parent 819b61b661
commit 5d8dd648d7
13 changed files with 746 additions and 400 deletions
--- a/cranelift/codegen/src/isa/x64/inst.isle
+++ b/cranelift/codegen/src/isa/x64/inst.isle
@@ -2092,7 +2092,11 @@
                    imm
                    size))

-;; Helper for creating `cmpps` instructions.
+;; Helpers for creating `cmpp*` instructions.
+(decl x64_cmpp (Type Xmm XmmMem FcmpImm) Xmm)
+(rule (x64_cmpp $F32X4 x y imm) (x64_cmpps x y imm))
+(rule (x64_cmpp $F64X2 x y imm) (x64_cmppd x y imm))
+
 (decl x64_cmpps (Xmm XmmMem FcmpImm) Xmm)
 (rule (x64_cmpps src1 src2 imm)
      (xmm_rm_r_imm (SseOpcode.Cmpps)
@@ -2101,6 +2105,17 @@
                    (encode_fcmp_imm imm)
                    (OperandSize.Size32)))

+;; Note that `Size32` is intentional despite this being used for 64-bit
+;; operations, since this presumably induces the correct encoding of the
+;; instruction.
+(decl x64_cmppd (Xmm XmmMem FcmpImm) Xmm)
+(rule (x64_cmppd src1 src2 imm)
+      (xmm_rm_r_imm (SseOpcode.Cmppd)
+                    src1
+                    src2
+                    (encode_fcmp_imm imm)
+                    (OperandSize.Size32)))
+
 ;; Helper for creating `pinsrb` instructions.
 (decl x64_pinsrb (Xmm GprMem u8) Xmm)
 (rule (x64_pinsrb src1 src2 lane)
@@ -2321,19 +2336,6 @@
                                           (operand_size_of_type_32_64 (lane_type ty))))))
        dst))

-;; Helper for creating `cmppd` instructions.
-;;
-;; Note that `Size32` is intentional despite this being used for 64-bit
-;; operations, since this presumably induces the correct encoding of the
-;; instruction.
-(decl x64_cmppd (Xmm XmmMem FcmpImm) Xmm)
-(rule (x64_cmppd src1 src2 imm)
-      (xmm_rm_r_imm (SseOpcode.Cmppd)
-                    src1
-                    src2
-                    (encode_fcmp_imm imm)
-                    (OperandSize.Size32)))
-
 ;; Helper for creating `MInst.GprToXmm` instructions.
 (decl gpr_to_xmm (SseOpcode GprMem OperandSize) Xmm)
 (rule (gpr_to_xmm op src size)
--- a/cranelift/codegen/src/isa/x64/lower.isle
+++ b/cranelift/codegen/src/isa/x64/lower.isle
@@ -1454,22 +1454,22 @@
 ;; lane will be filled with all 1s or all 0s according to the comparison,
 ;; whereas for GPR-held values, the result will be simply 0 or 1 (upper bits
 ;; unset).
-(rule (lower (icmp (IntCC.Equal) a @ (value_type (vec128 ty)) b))
+(rule (lower (icmp (IntCC.Equal) a @ (value_type (ty_vec128 ty)) b))
      (x64_pcmpeq ty a b))
 ;; To lower a not-equals comparison, we perform an equality comparison
 ;; (PCMPEQ*) and then invert the bits (PXOR with all 1s).
-(rule (lower (icmp (IntCC.NotEqual) a @ (value_type (vec128 ty)) b))
+(rule (lower (icmp (IntCC.NotEqual) a @ (value_type (ty_vec128 ty)) b))
      (let ((checked Xmm (x64_pcmpeq ty a b))
            (all_ones Xmm (vector_all_ones ty)))
           (x64_pxor checked all_ones)))
 ;; Signed comparisons have a single-instruction lowering, unlike their unsigned
 ;; counterparts. These latter instructions use the unsigned min/max
 ;; (PMINU*/PMAXU*) and negate the result (PXOR with all 1s).
-(rule (lower (icmp (IntCC.SignedGreaterThan) a @ (value_type (vec128 ty)) b))
+(rule (lower (icmp (IntCC.SignedGreaterThan) a @ (value_type (ty_vec128 ty)) b))
      (x64_pcmpgt ty a b))
-(rule (lower (icmp (IntCC.SignedLessThan) a @ (value_type (vec128 ty)) b))
+(rule (lower (icmp (IntCC.SignedLessThan) a @ (value_type (ty_vec128 ty)) b))
      (x64_pcmpgt ty b a))
-(rule (lower (icmp (IntCC.UnsignedGreaterThan) a @ (value_type (vec128 ty)) b))
+(rule (lower (icmp (IntCC.UnsignedGreaterThan) a @ (value_type (ty_vec128 ty)) b))
      ;; N.B.: we must manually prevent load coalescing of these operands; the
      ;; register allocator gets confused otherwise. TODO:
      ;; https://github.com/bytecodealliance/wasmtime/issues/3953.
@@ -1479,7 +1479,7 @@
            (eq Xmm (x64_pcmpeq ty max xmm_b))
            (all_ones Xmm (vector_all_ones ty)))
           (x64_pxor eq all_ones)))
-(rule (lower (icmp (IntCC.UnsignedLessThan) a @ (value_type (vec128 ty)) b))
+(rule (lower (icmp (IntCC.UnsignedLessThan) a @ (value_type (ty_vec128 ty)) b))
      ;; N.B.: see note above.
      (let ((xmm_a Xmm (put_in_xmm a))
            (xmm_b Xmm (put_in_xmm b))
@@ -1490,16 +1490,16 @@
 ;; To lower signed and unsigned *-or-equals comparisons, we find the minimum
 ;; number (PMIN[U|S]*) and compare that to one of the terms (PCMPEQ*). Note that
 ;; there is no 64x2 version of this lowering (see below).
-(rule (lower (icmp (IntCC.SignedGreaterThanOrEqual) a @ (value_type (vec128 ty)) b))
+(rule (lower (icmp (IntCC.SignedGreaterThanOrEqual) a @ (value_type (ty_vec128 ty)) b))
      (let ((max Xmm (x64_pmaxs ty a b)))
           (x64_pcmpeq ty a max)))
-(rule (lower (icmp (IntCC.SignedLessThanOrEqual) a @ (value_type (vec128 ty)) b))
+(rule (lower (icmp (IntCC.SignedLessThanOrEqual) a @ (value_type (ty_vec128 ty)) b))
      (let ((min Xmm (x64_pmins ty a b)))
           (x64_pcmpeq ty a min)))
-(rule (lower (icmp (IntCC.UnsignedGreaterThanOrEqual) a @ (value_type (vec128 ty)) b))
+(rule (lower (icmp (IntCC.UnsignedGreaterThanOrEqual) a @ (value_type (ty_vec128 ty)) b))
      (let ((max Xmm (x64_pmaxu ty a b)))
           (x64_pcmpeq ty a max)))
-(rule (lower (icmp (IntCC.UnsignedLessThanOrEqual) a @ (value_type (vec128 ty)) b))
+(rule (lower (icmp (IntCC.UnsignedLessThanOrEqual) a @ (value_type (ty_vec128 ty)) b))
      (let ((min Xmm (x64_pminu ty a b)))
           (x64_pcmpeq ty a min)))
 ;; The PMIN[S|U]Q instruction is only available in AVX512VL/F so we must instead
@@ -1550,6 +1550,115 @@
            (cmp Reg (x64_or $I64 cmp_lo cmp_hi)))
           (with_flags (x64_test (OperandSize.Size64) (RegMemImm.Imm 1) cmp) (x64_setcc (CC.NZ)))))

+;;;; Rules for `fcmp` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; CLIF's `fcmp` instruction always operates on XMM registers--both scalar and
+;; vector. For the scalar versions, we use the flag-setting behavior of the
+;; `UCOMIS*` instruction to `SETcc` a 0 or 1 in a GPR register. Note that CLIF's
+;; `select` uses the same kind of flag-setting behavior but chooses values other
+;; than 0 or 1.
+;;
+;; Checking the result of `UCOMIS*` is unfortunately difficult in some cases
+;; because we do not have `SETcc` instructions that explicitly check
+;; simultaneously for the condition (i.e., `eq`, `le`, `gt`, etc.) *and*
+;; orderedness. Instead, we must check the flags multiple times. The UCOMIS*
+;; documentation (see Intel's Software Developer's Manual, volume 2, chapter 4)
+;; is helpful:
+;;  - unordered assigns    Z = 1, P = 1, C = 1
+;;  - greater than assigns Z = 0, P = 0, C = 0
+;;  - less than assigns    Z = 0, P = 0, C = 1
+;;  - equal assigns        Z = 1, P = 0, C = 0
+
+(rule (lower (fcmp (FloatCC.Equal) a @ (value_type (ty_scalar_float ty)) b))
+      (let ((maybe ValueRegs (with_flags (x64_ucomis b a)
+                  (consumes_flags_concat
+                        (x64_setcc (CC.NP))
+                        (x64_setcc (CC.Z)))))
+            (maybe_np Gpr (value_regs_get_gpr maybe 0))
+            (maybe_z Gpr (value_regs_get_gpr maybe 1)))
+           (x64_and $I32 maybe_np maybe_z)))
+
+(rule (lower (fcmp (FloatCC.NotEqual) a @ (value_type (ty_scalar_float ty)) b))
+      (let ((maybe ValueRegs (with_flags (x64_ucomis b a)
+                  (consumes_flags_concat
+                        (x64_setcc (CC.P))
+                        (x64_setcc (CC.NZ)))))
+            (maybe_p Gpr (value_regs_get_gpr maybe 0))
+            (maybe_nz Gpr (value_regs_get_gpr maybe 1)))
+           (x64_or $I32 maybe_p maybe_nz)))
+
+;; Some scalar lowerings correspond to one condition code.
+
+(rule (lower (fcmp (FloatCC.Ordered) a @ (value_type (ty_scalar_float ty)) b))
+      (with_flags (x64_ucomis b a) (x64_setcc (CC.NP))))
+(rule (lower (fcmp (FloatCC.Unordered) a @ (value_type (ty_scalar_float ty)) b))
+      (with_flags (x64_ucomis b a) (x64_setcc (CC.P))))
+(rule (lower (fcmp (FloatCC.OrderedNotEqual) a @ (value_type (ty_scalar_float ty)) b))
+      (with_flags (x64_ucomis b a) (x64_setcc (CC.NZ))))
+(rule (lower (fcmp (FloatCC.UnorderedOrEqual) a @ (value_type (ty_scalar_float ty)) b))
+      (with_flags (x64_ucomis b a) (x64_setcc (CC.Z))))
+(rule (lower (fcmp (FloatCC.GreaterThan) a @ (value_type (ty_scalar_float ty)) b))
+      (with_flags (x64_ucomis b a) (x64_setcc (CC.NBE))))
+(rule (lower (fcmp (FloatCC.GreaterThanOrEqual) a @ (value_type (ty_scalar_float ty)) b))
+      (with_flags (x64_ucomis b a) (x64_setcc (CC.NB))))
+(rule (lower (fcmp (FloatCC.UnorderedOrLessThan) a @ (value_type (ty_scalar_float ty)) b))
+      (with_flags (x64_ucomis b a) (x64_setcc (CC.B))))
+(rule (lower (fcmp (FloatCC.UnorderedOrLessThanOrEqual) a @ (value_type (ty_scalar_float ty)) b))
+      (with_flags (x64_ucomis b a) (x64_setcc (CC.BE))))
+
+;; Other scalar lowerings are made possible by flipping the operands and
+;; reversing the condition code.
+
+(rule (lower (fcmp (FloatCC.LessThan) a @ (value_type (ty_scalar_float ty)) b))
+      ;; Same flags as `GreaterThan`.
+      (with_flags (x64_ucomis a b) (x64_setcc (CC.NBE))))
+(rule (lower (fcmp (FloatCC.LessThanOrEqual) a @ (value_type (ty_scalar_float ty)) b))
+      ;; Same flags as `GreaterThanOrEqual`.
+      (with_flags (x64_ucomis a b) (x64_setcc (CC.NB))))
+(rule (lower (fcmp (FloatCC.UnorderedOrGreaterThan) a @ (value_type (ty_scalar_float ty)) b))
+      ;; Same flags as `UnorderedOrLessThan`.
+      (with_flags (x64_ucomis a b) (x64_setcc (CC.B))))
+(rule (lower (fcmp (FloatCC.UnorderedOrGreaterThanOrEqual) a @ (value_type (ty_scalar_float ty)) b))
+      ;; Same flags as `UnorderedOrLessThanOrEqual`.
+      (with_flags (x64_ucomis a b) (x64_setcc (CC.BE))))
+
+;; For vector lowerings, we use `CMPP*` instructions with a 3-bit operand that
+;; determines the comparison to make. Note that comparisons that succeed will
+;; fill the lane with 1s; comparisons that do not will fill the lane with 0s.
+
+(rule (lower (fcmp (FloatCC.Equal) a @ (value_type (ty_vec128 ty)) b))
+      (x64_cmpp ty a b (FcmpImm.Equal)))
+(rule (lower (fcmp (FloatCC.NotEqual) a @ (value_type (ty_vec128 ty)) b))
+      (x64_cmpp ty a b (FcmpImm.NotEqual)))
+(rule (lower (fcmp (FloatCC.LessThan) a @ (value_type (ty_vec128 ty)) b))
+      (x64_cmpp ty a b (FcmpImm.LessThan)))
+(rule (lower (fcmp (FloatCC.LessThanOrEqual) a @ (value_type (ty_vec128 ty)) b))
+      (x64_cmpp ty a b (FcmpImm.LessThanOrEqual)))
+(rule (lower (fcmp (FloatCC.Ordered) a @ (value_type (ty_vec128 ty)) b))
+      (x64_cmpp ty a b (FcmpImm.Ordered)))
+(rule (lower (fcmp (FloatCC.Unordered) a @ (value_type (ty_vec128 ty)) b))
+      (x64_cmpp ty a b (FcmpImm.Unordered)))
+(rule (lower (fcmp (FloatCC.UnorderedOrGreaterThan) a @ (value_type (ty_vec128 ty)) b))
+      (x64_cmpp ty a b (FcmpImm.UnorderedOrGreaterThan)))
+(rule (lower (fcmp (FloatCC.UnorderedOrGreaterThanOrEqual) a @ (value_type (ty_vec128 ty)) b))
+      (x64_cmpp ty a b (FcmpImm.UnorderedOrGreaterThanOrEqual)))
+
+;; Some vector lowerings rely on flipping the operands and using a reversed
+;; comparison code.
+
+(rule (lower (fcmp (FloatCC.GreaterThan) a @ (value_type (ty_vec128 ty)) b))
+      (x64_cmpp ty b a (FcmpImm.LessThan)))
+(rule (lower (fcmp (FloatCC.GreaterThanOrEqual) a @ (value_type (ty_vec128 ty)) b))
+      (x64_cmpp ty b a (FcmpImm.LessThanOrEqual)))
+(rule (lower (fcmp (FloatCC.UnorderedOrLessThan) a @ (value_type (ty_vec128 ty)) b))
+      (x64_cmpp ty b a (FcmpImm.UnorderedOrGreaterThan)))
+(rule (lower (fcmp (FloatCC.UnorderedOrLessThanOrEqual) a @ (value_type (ty_vec128 ty)) b))
+      (x64_cmpp ty b a (FcmpImm.UnorderedOrGreaterThanOrEqual)))
+
+;; Some vector lowerings are simply not supported for certain codes:
+;; - FloatCC::OrderedNotEqual
+;; - FloatCC::UnorderedOrEqual
+
 ;;;; Rules for `select` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

 ;; CLIF `select` instructions receive a testable argument (i.e. boolean or
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -930,104 +930,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
        }

        Opcode::Fcmp => {
-            let cond_code = ctx.data(insn).fp_cond_code().unwrap();
-            let input_ty = ctx.input_ty(insn, 0);
-            if !input_ty.is_vector() {
-                // Unordered is returned by setting ZF, PF, CF <- 111
-                // Greater than by ZF, PF, CF <- 000
-                // Less than by ZF, PF, CF <- 001
-                // Equal by ZF, PF, CF <- 100
-                //
-                // Checking the result of comiss is somewhat annoying because you don't have setcc
-                // instructions that explicitly check simultaneously for the condition (i.e. eq, le,
-                // gt, etc) *and* orderedness.
-                //
-                // So that might mean we need more than one setcc check and then a logical "and" or
-                // "or" to determine both, in some cases.  However knowing that if the parity bit is
-                // set, then the result was considered unordered and knowing that if the parity bit is
-                // set, then both the ZF and CF flag bits must also be set we can get away with using
-                // one setcc for most condition codes.
-
-                let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-
-                match emit_fcmp(ctx, insn, cond_code, FcmpSpec::Normal) {
-                    FcmpCondResult::Condition(cc) => {
-                        ctx.emit(Inst::setcc(cc, dst));
-                    }
-                    FcmpCondResult::AndConditions(cc1, cc2) => {
-                        let tmp = ctx.alloc_tmp(types::I32).only_reg().unwrap();
-                        ctx.emit(Inst::setcc(cc1, tmp));
-                        ctx.emit(Inst::setcc(cc2, dst));
-                        ctx.emit(Inst::alu_rmi_r(
-                            OperandSize::Size32,
-                            AluRmiROpcode::And,
-                            RegMemImm::reg(tmp.to_reg()),
-                            dst,
-                        ));
-                    }
-                    FcmpCondResult::OrConditions(cc1, cc2) => {
-                        let tmp = ctx.alloc_tmp(types::I32).only_reg().unwrap();
-                        ctx.emit(Inst::setcc(cc1, tmp));
-                        ctx.emit(Inst::setcc(cc2, dst));
-                        ctx.emit(Inst::alu_rmi_r(
-                            OperandSize::Size32,
-                            AluRmiROpcode::Or,
-                            RegMemImm::reg(tmp.to_reg()),
-                            dst,
-                        ));
-                    }
-                    FcmpCondResult::InvertedEqualOrConditions(_, _) => unreachable!(),
-                }
-            } else {
-                let op = match input_ty {
-                    types::F32X4 => SseOpcode::Cmpps,
-                    types::F64X2 => SseOpcode::Cmppd,
-                    _ => panic!("Bad input type to fcmp: {}", input_ty),
-                };
-
-                // Since some packed comparisons are not available, some of the condition codes
-                // must be inverted, with a corresponding `flip` of the operands.
-                let (imm, flip) = match cond_code {
-                    FloatCC::GreaterThan => (FcmpImm::LessThan, true),
-                    FloatCC::GreaterThanOrEqual => (FcmpImm::LessThanOrEqual, true),
-                    FloatCC::UnorderedOrLessThan => (FcmpImm::UnorderedOrGreaterThan, true),
-                    FloatCC::UnorderedOrLessThanOrEqual => {
-                        (FcmpImm::UnorderedOrGreaterThanOrEqual, true)
-                    }
-                    FloatCC::OrderedNotEqual | FloatCC::UnorderedOrEqual => {
-                        panic!("unsupported float condition code: {}", cond_code)
-                    }
-                    _ => (FcmpImm::from(cond_code), false),
-                };
-
-                // Determine the operands of the comparison, possibly by flipping them.
-                let (lhs, rhs) = if flip {
-                    (
-                        put_input_in_reg(ctx, inputs[1]),
-                        input_to_reg_mem(ctx, inputs[0]),
-                    )
-                } else {
-                    (
-                        put_input_in_reg(ctx, inputs[0]),
-                        input_to_reg_mem(ctx, inputs[1]),
-                    )
-                };
-
-                // Move the `lhs` to the same register as `dst`; this may not emit an actual move
-                // but ensures that the registers are the same to match x86's read-write operand
-                // encoding.
-                let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-                ctx.emit(Inst::gen_move(dst, lhs, input_ty));
-
-                // Emit the comparison.
-                ctx.emit(Inst::xmm_rm_r_imm(
-                    op,
-                    rhs,
-                    dst,
-                    imm.encode(),
-                    OperandSize::Size32,
-                ));
-            }
+            implemented_in_isle(ctx);
        }

        Opcode::FallthroughReturn | Opcode::Return => {
--- a/cranelift/codegen/src/isa/x64/lower/isle/generated_code.manifest
+++ b/cranelift/codegen/src/isa/x64/lower/isle/generated_code.manifest
@@ -1,4 +1,4 @@
 src/clif.isle 9ea75a6f790b5c03
-src/prelude.isle b2bc986bcbbbb77
-src/isa/x64/inst.isle bfb0fb7143d8dc34
-src/isa/x64/lower.isle ccaee2b83bdf73e1
+src/prelude.isle 74d9514ac948e163
+src/isa/x64/inst.isle a002d62dcfce285
+src/isa/x64/lower.isle d8facef52a4e2ac6
--- a/cranelift/codegen/src/isa/x64/lower/isle/generated_code.rs
+++ b/cranelift/codegen/src/isa/x64/lower/isle/generated_code.rs