From afb417920d0f4741edbbe25e4c10b4e6dcfb6ae0 Mon Sep 17 00:00:00 2001
From: Alex Crichton <alex@alexcrichton.com>
Date: Wed, 29 Mar 2023 11:24:25 -0500
Subject: [PATCH] x64: Deduplicate fcmp emission logic (#6113)

* x64: Deduplicate fcmp emission logic

The `select`-of-`fcmp` lowering duplicated a good deal of `FloatCC`
lowering logic that was already done by `emit_fcmp`, so this commit
refactors these lowering rules to instead delegate to `emit_fcmp` and
then handle that result.

* Swap order of condition codes

Shouldn't affect the correctness of this operation and it's a bit more
natural to write the lowering rule this way.

* Swap the order of comparison operands

No need to swap `a b`, only the `x y` needs swapping.

* Fix x64 printing of `XmmCmove`
---
 cranelift/codegen/src/isa/x64/inst/mod.rs     | 23 +++--
 cranelift/codegen/src/isa/x64/lower.isle      | 99 ++++---------------
 .../filetests/isa/x64/cmp-mem-bug.clif        | 12 +--
 .../filetests/isa/x64/select-i128.clif        |  8 +-
 .../filetests/isa/x64/select-issue-3744.clif  | 35 ++++++-
 .../filetests/filetests/isa/x64/select.clif   |  8 +-
 6 files changed, 73 insertions(+), 112 deletions(-)

diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs
index f9dd071dca..723cffb10a 100644
--- a/cranelift/codegen/src/isa/x64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/x64/inst/mod.rs
@@ -1589,20 +1589,19 @@ impl PrettyPrint for Inst {
                 let alternative = pretty_print_reg(alternative.to_reg(), size, allocs);
                 let dst = pretty_print_reg(dst.to_reg().to_reg(), size, allocs);
                 let consequent = consequent.pretty_print(size, allocs);
+                let suffix = match *ty {
+                    types::F64 => "sd",
+                    types::F32 => "ss",
+                    types::F32X4 => "aps",
+                    types::F64X2 => "apd",
+                    _ => "dqa",
+                };
                 format!(
-                    "mov {}, {}; j{} $next; mov{} {}, {}; $next: ",
+                    "mov{suffix} {alternative}, {dst}; \
+                    j{} $next; \
+                    mov{suffix} {consequent}, {dst}; \
+                    $next:",
                     cc.invert().to_string(),
-                    match *ty {
-                        types::F64 => "sd",
-                        types::F32 => "ss",
-                        types::F32X4 => "aps",
-                        types::F64X2 => "apd",
-                        _ => "dqa",
-                    },
-                    consequent,
-                    dst,
-                    alternative,
-                    dst,
                 )
             }
 
diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle
index ee7ca827bd..3e50417d0f 100644
--- a/cranelift/codegen/src/isa/x64/lower.isle
+++ b/cranelift/codegen/src/isa/x64/lower.isle
@@ -1570,90 +1570,25 @@
 
 ;;;; Rules for `select` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
-;; CLIF `select` instructions receive a testable argument (i.e. boolean or
-;; integer) that determines which of the other two arguments is selected as
-;; output. Since Cranelift booleans are typically generated by a comparison, the
-;; lowerings in this section "look upwards in the tree" to emit the proper
-;; sequence of "selection" instructions.
+;; When a `select` has an `fcmp` as a condition then rely on `emit_fcmp` to
+;; figure out how to perform the comparison.
 ;;
-;; The following rules--for selecting on a floating-point comparison--emit a
-;; `UCOMIS*` instruction and then a conditional move, `cmove`. Note that for
-;; values contained in XMM registers, `cmove` and `cmove_or` may in fact emit a
-;; jump sequence, not `CMOV`. The `cmove` instruction operates on the flags set
-;; by `UCOMIS*`; the key to understanding these is the UCOMIS* documentation
-;; (see Intel's Software Developer's Manual, volume 2, chapter 4):
-;;  - unordered assigns    Z = 1, P = 1, C = 1
-;;  - greater than assigns Z = 0, P = 0, C = 0
-;;  - less than assigns    Z = 0, P = 0, C = 1
-;;  - equal assigns        Z = 1, P = 0, C = 0
-;;
-;; Note that prefixing the flag with `N` means "not," so that `CC.P -> P = 1`
-;; and `CC.NP -> P = 0`. Also, x86 uses mnemonics for certain combinations of
-;; flags; e.g.:
-;;  - `CC.B -> C = 1` (below)
-;;  - `CC.NB -> C = 0` (not below)
-;;  - `CC.BE -> C = 1 OR Z = 1` (below or equal)
-;;  - `CC.NBE -> C = 0 AND Z = 0` (not below or equal)
+;; Note, though, that the `FloatCC.Equal` requires an "and" to happen for two
+;; condition codes which isn't the easiest thing to lower to a `cmove`
+;; instruction. For this reason a `select (fcmp eq ..) ..` is instead
+;; flipped around to be `select (fcmp ne ..) ..` with all operands reversed.
+;; This will produce a `FcmpCondResult.OrCondition` which is easier to codegen
+;; for.
+(rule (lower (has_type ty (select (maybe_uextend (fcmp cc a b)) x y)))
+      (lower_select_fcmp ty (emit_fcmp cc a b) x y))
+(rule 1 (lower (has_type ty (select (maybe_uextend (fcmp (FloatCC.Equal) a b)) x y)))
+        (lower_select_fcmp ty (emit_fcmp (FloatCC.NotEqual) a b) y x))
 
-(rule (lower (has_type ty (select (maybe_uextend (fcmp (FloatCC.Ordered) a b)) x y)))
-      (with_flags (x64_ucomis b a) (cmove_from_values ty (CC.NP) x y)))
-
-(rule (lower (has_type ty (select (maybe_uextend (fcmp (FloatCC.Unordered) a b)) x y)))
-      (with_flags (x64_ucomis b a) (cmove_from_values ty (CC.P) x y)))
-
-(rule (lower (has_type ty (select (maybe_uextend (fcmp (FloatCC.GreaterThan) a b)) x y)))
-      (with_flags (x64_ucomis b a) (cmove_from_values ty (CC.NBE) x y)))
-
-(rule (lower (has_type ty (select (maybe_uextend (fcmp (FloatCC.GreaterThanOrEqual) a b)) x y)))
-      (with_flags (x64_ucomis b a) (cmove_from_values ty (CC.NB) x y)))
-
-(rule (lower (has_type ty (select (maybe_uextend (fcmp (FloatCC.UnorderedOrLessThan) a b)) x y)))
-      (with_flags (x64_ucomis b a) (cmove_from_values ty (CC.B) x y)))
-
-(rule (lower (has_type ty (select (maybe_uextend (fcmp (FloatCC.UnorderedOrLessThanOrEqual) a b)) x y)))
-      (with_flags (x64_ucomis b a) (cmove_from_values ty (CC.BE) x y)))
-
-;; Certain FloatCC variants are implemented by flipping the operands of the
-;; comparison (e.g., "greater than" is lowered the same as "less than" but the
-;; comparison is reversed). This allows us to use a single flag for the `cmove`,
-;; which involves fewer instructions than `cmove_or`.
-;;
-;; But why flip at all, you may ask? Can't we just use `CC.B` (i.e., below) for
-;; `FloatCC.LessThan`? Recall that in these floating-point lowerings, values may
-;; be unordered and we must we want to express that `FloatCC.LessThan` is `LT`,
-;; not `LT | UNO`. By flipping the operands AND inverting the comparison (e.g.,
-;; to `CC.NBE`), we also avoid these unordered cases.
-
-(rule (lower (has_type ty (select (maybe_uextend (fcmp (FloatCC.LessThan) a b)) x y)))
-      (with_flags (x64_ucomis a b) (cmove_from_values ty (CC.NBE) x y)))
-
-(rule (lower (has_type ty (select (maybe_uextend (fcmp (FloatCC.LessThanOrEqual) a b)) x y)))
-      (with_flags (x64_ucomis a b) (cmove_from_values ty (CC.NB) x y)))
-
-(rule (lower (has_type ty (select (maybe_uextend (fcmp (FloatCC.UnorderedOrGreaterThan) a b)) x y)))
-      (with_flags (x64_ucomis a b) (cmove_from_values ty (CC.B) x y)))
-
-(rule (lower (has_type ty (select (maybe_uextend (fcmp (FloatCC.UnorderedOrGreaterThanOrEqual) a b)) x y)))
-      (with_flags (x64_ucomis a b) (cmove_from_values ty (CC.BE) x y)))
-
-;; `FloatCC.Equal` and `FloatCC.NotEqual` can only be implemented with multiple
-;; flag checks. Recall from the flag assignment chart above that equality, e.g.,
-;; will assign `Z = 1`. But so does an unordered comparison: `Z = 1, P = 1, C =
-;; 1`. In order to avoid semantics like `EQ | UNO` for equality, we must ensure
-;; that the values are actually ordered, checking that `P = 0` (note that the
-;; `C` flag is irrelevant here). Since we cannot find a single instruction that
-;; implements a `Z = 1 AND P = 0` check, we invert the flag checks (i.e., `Z = 1
-;; AND P = 0` becomes `Z = 0 OR P = 1`) and also flip the select operands, `x`
-;; and `y`. The same argument applies to `FloatCC.NotEqual`.
-;;
-;; More details about the CLIF semantics for `fcmp` are available at
-;; https://docs.rs/cranelift-codegen/latest/cranelift_codegen/ir/trait.InstBuilder.html#method.fcmp.
-
-(rule (lower (has_type ty (select (maybe_uextend (fcmp (FloatCC.Equal) a b)) x y)))
-      (with_flags (x64_ucomis a b) (cmove_or_from_values ty (CC.NZ) (CC.P) y x)))
-
-(rule (lower (has_type ty (select (maybe_uextend (fcmp (FloatCC.NotEqual) a b)) x y)))
-      (with_flags (x64_ucomis a b) (cmove_or_from_values ty (CC.NZ) (CC.P) x y)))
+(decl lower_select_fcmp (Type FcmpCondResult Value Value) InstOutput)
+(rule (lower_select_fcmp ty (FcmpCondResult.Condition flags cc) x y)
+      (with_flags flags (cmove_from_values ty cc x y)))
+(rule (lower_select_fcmp ty (FcmpCondResult.OrCondition flags cc1 cc2) x y)
+      (with_flags flags (cmove_or_from_values ty cc1 cc2 x y)))
 
 ;; We also can lower `select`s that depend on an `icmp` test, but more simply
 ;; than the `fcmp` variants above. In these cases, we lower to a `CMP`
diff --git a/cranelift/filetests/filetests/isa/x64/cmp-mem-bug.clif b/cranelift/filetests/filetests/isa/x64/cmp-mem-bug.clif
index 459302bebb..a7996bdab2 100644
--- a/cranelift/filetests/filetests/isa/x64/cmp-mem-bug.clif
+++ b/cranelift/filetests/filetests/isa/x64/cmp-mem-bug.clif
@@ -60,10 +60,10 @@ block0(v0: f64, v1: i64):
 ;   setz    %al
 ;   andl    %edi, %eax, %edi
 ;   movzbq  %dil, %rax
-;   ucomisd %xmm0, %xmm9
+;   ucomisd %xmm9, %xmm0
 ;   movdqa  %xmm0, %xmm2
-;   mov z, sd; j%xmm2 $next; mov%xmm0 %xmm0, %xmm0; $next: 
-;   mov np, sd; j%xmm2 $next; mov%xmm0 %xmm0, %xmm0; $next: 
+;   movsd %xmm0, %xmm0; jnp $next; movsd %xmm2, %xmm0; $next:
+;   movsd %xmm0, %xmm0; jz $next; movsd %xmm2, %xmm0; $next:
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
@@ -79,11 +79,11 @@ block0(v0: f64, v1: i64):
 ;   sete %al
 ;   andl %eax, %edi
 ;   movzbq %dil, %rax
-;   ucomisd %xmm0, %xmm9
+;   ucomisd %xmm9, %xmm0
 ;   movdqa %xmm0, %xmm2
-;   je 0x2f
+;   jnp 0x2f
 ;   movsd %xmm2, %xmm0
-;   jnp 0x39
+;   je 0x39
 ;   movsd %xmm2, %xmm0
 ;   movq %rbp, %rsp
 ;   popq %rbp
diff --git a/cranelift/filetests/filetests/isa/x64/select-i128.clif b/cranelift/filetests/filetests/isa/x64/select-i128.clif
index 89576c873f..035322813d 100644
--- a/cranelift/filetests/filetests/isa/x64/select-i128.clif
+++ b/cranelift/filetests/filetests/isa/x64/select-i128.clif
@@ -52,11 +52,11 @@ block0(v0: f32, v1: i128, v2: i128):
 ; block0:
 ;   ucomiss %xmm0, %xmm0
 ;   movq    %rdi, %rax
-;   cmovnzq %rdx, %rax, %rax
 ;   cmovpq  %rdx, %rax, %rax
+;   cmovnzq %rdx, %rax, %rax
 ;   movq    %rsi, %rdx
-;   cmovnzq %rcx, %rdx, %rdx
 ;   cmovpq  %rcx, %rdx, %rdx
+;   cmovnzq %rcx, %rdx, %rdx
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
@@ -68,11 +68,11 @@ block0(v0: f32, v1: i128, v2: i128):
 ; block1: ; offset 0x4
 ;   ucomiss %xmm0, %xmm0
 ;   movq %rdi, %rax
-;   cmovneq %rdx, %rax
 ;   cmovpq %rdx, %rax
+;   cmovneq %rdx, %rax
 ;   movq %rsi, %rdx
-;   cmovneq %rcx, %rdx
 ;   cmovpq %rcx, %rdx
+;   cmovneq %rcx, %rdx
 ;   movq %rbp, %rsp
 ;   popq %rbp
 ;   retq
diff --git a/cranelift/filetests/filetests/isa/x64/select-issue-3744.clif b/cranelift/filetests/filetests/isa/x64/select-issue-3744.clif
index 2e50d6c0a3..eff72d2b30 100644
--- a/cranelift/filetests/filetests/isa/x64/select-issue-3744.clif
+++ b/cranelift/filetests/filetests/isa/x64/select-issue-3744.clif
@@ -1,4 +1,4 @@
-test compile
+test compile precise-output
 target x86_64
 
 ; Check that no intervening moves are inserted when lowering `select` (see
@@ -9,8 +9,35 @@ block0(v0: f32, v1: f32):
     v3 = iconst.i32 1
     v4 = iconst.i32 0
     v5 = select v2, v3, v4
-    ; check: ucomiss
-    ; nextln: cmovnzl
-    ; nextln: cmovpl
     return v5
 }
+
+; VCode:
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   movl    $1, %eax
+;   ucomiss %xmm1, %xmm0
+;   cmovpl  const(0), %eax, %eax
+;   cmovnzl const(0), %eax, %eax
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block1: ; offset 0x4
+;   movl $1, %eax
+;   ucomiss %xmm1, %xmm0
+;   cmovpl 0xd(%rip), %eax
+;   cmovnel 6(%rip), %eax
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+;   addb %al, (%rax)
+;   addb %al, (%rax)
+;   addb %al, (%rax)
+;   addb %al, (%rax)
+
diff --git a/cranelift/filetests/filetests/isa/x64/select.clif b/cranelift/filetests/filetests/isa/x64/select.clif
index 674a01e5fc..0b7445ace3 100644
--- a/cranelift/filetests/filetests/isa/x64/select.clif
+++ b/cranelift/filetests/filetests/isa/x64/select.clif
@@ -44,10 +44,10 @@ block0(v0: f32, v1: f32, v2: i64, v3: i64):
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
-;   ucomiss %xmm0, %xmm1
+;   ucomiss %xmm1, %xmm0
 ;   movq    %rdi, %rax
-;   cmovnzq %rsi, %rax, %rax
 ;   cmovpq  %rsi, %rax, %rax
+;   cmovnzq %rsi, %rax, %rax
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
@@ -57,10 +57,10 @@ block0(v0: f32, v1: f32, v2: i64, v3: i64):
 ;   pushq %rbp
 ;   movq %rsp, %rbp
 ; block1: ; offset 0x4
-;   ucomiss %xmm0, %xmm1
+;   ucomiss %xmm1, %xmm0
 ;   movq %rdi, %rax
-;   cmovneq %rsi, %rax
 ;   cmovpq %rsi, %rax
+;   cmovneq %rsi, %rax
 ;   movq %rbp, %rsp
 ;   popq %rbp
 ;   retq