From afb417920d0f4741edbbe25e4c10b4e6dcfb6ae0 Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Wed, 29 Mar 2023 11:24:25 -0500 Subject: [PATCH] x64: Deduplicate fcmp emission logic (#6113) * x64: Deduplicate fcmp emission logic The `select`-of-`fcmp` lowering duplicated a good deal of `FloatCC` lowering logic that was already done by `emit_fcmp`, so this commit refactors these lowering rules to instead delegate to `emit_fcmp` and then handle that result. * Swap order of condition codes Shouldn't affect the correctness of this operation and it's a bit more natural to write the lowering rule this way. * Swap the order of comparison operands No need to swap `a b`, only the `x y` needs swapping. * Fix x64 printing of `XmmCmove` --- cranelift/codegen/src/isa/x64/inst/mod.rs | 23 +++-- cranelift/codegen/src/isa/x64/lower.isle | 99 ++++--------------- .../filetests/isa/x64/cmp-mem-bug.clif | 12 +-- .../filetests/isa/x64/select-i128.clif | 8 +- .../filetests/isa/x64/select-issue-3744.clif | 35 ++++++- .../filetests/filetests/isa/x64/select.clif | 8 +- 6 files changed, 73 insertions(+), 112 deletions(-) diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs index f9dd071dca..723cffb10a 100644 --- a/cranelift/codegen/src/isa/x64/inst/mod.rs +++ b/cranelift/codegen/src/isa/x64/inst/mod.rs @@ -1589,20 +1589,19 @@ impl PrettyPrint for Inst { let alternative = pretty_print_reg(alternative.to_reg(), size, allocs); let dst = pretty_print_reg(dst.to_reg().to_reg(), size, allocs); let consequent = consequent.pretty_print(size, allocs); + let suffix = match *ty { + types::F64 => "sd", + types::F32 => "ss", + types::F32X4 => "aps", + types::F64X2 => "apd", + _ => "dqa", + }; format!( - "mov {}, {}; j{} $next; mov{} {}, {}; $next: ", + "mov{suffix} {alternative}, {dst}; \ + j{} $next; \ + mov{suffix} {consequent}, {dst}; \ + $next:", cc.invert().to_string(), - match *ty { - types::F64 => "sd", - types::F32 => "ss", - types::F32X4 => "aps", - types::F64X2 => "apd", - _ => "dqa", - }, - consequent, - dst, - alternative, - dst, ) } diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index ee7ca827bd..3e50417d0f 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -1570,90 +1570,25 @@ ;;;; Rules for `select` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; CLIF `select` instructions receive a testable argument (i.e. boolean or -;; integer) that determines which of the other two arguments is selected as -;; output. Since Cranelift booleans are typically generated by a comparison, the -;; lowerings in this section "look upwards in the tree" to emit the proper -;; sequence of "selection" instructions. +;; When a `select` has an `fcmp` as a condition then rely on `emit_fcmp` to +;; figure out how to perform the comparison. ;; -;; The following rules--for selecting on a floating-point comparison--emit a -;; `UCOMIS*` instruction and then a conditional move, `cmove`. Note that for -;; values contained in XMM registers, `cmove` and `cmove_or` may in fact emit a -;; jump sequence, not `CMOV`. The `cmove` instruction operates on the flags set -;; by `UCOMIS*`; the key to understanding these is the UCOMIS* documentation -;; (see Intel's Software Developer's Manual, volume 2, chapter 4): -;; - unordered assigns Z = 1, P = 1, C = 1 -;; - greater than assigns Z = 0, P = 0, C = 0 -;; - less than assigns Z = 0, P = 0, C = 1 -;; - equal assigns Z = 1, P = 0, C = 0 -;; -;; Note that prefixing the flag with `N` means "not," so that `CC.P -> P = 1` -;; and `CC.NP -> P = 0`. Also, x86 uses mnemonics for certain combinations of -;; flags; e.g.: -;; - `CC.B -> C = 1` (below) -;; - `CC.NB -> C = 0` (not below) -;; - `CC.BE -> C = 1 OR Z = 1` (below or equal) -;; - `CC.NBE -> C = 0 AND Z = 0` (not below or equal) +;; Note, though, that the `FloatCC.Equal` requires an "and" to happen for two +;; condition codes which isn't the easiest thing to lower to a `cmove` +;; instruction. For this reason a `select (fcmp eq ..) ..` is instead +;; flipped around to be `select (fcmp ne ..) ..` with all operands reversed. +;; This will produce a `FcmpCondResult.OrCondition` which is easier to codegen +;; for. +(rule (lower (has_type ty (select (maybe_uextend (fcmp cc a b)) x y))) + (lower_select_fcmp ty (emit_fcmp cc a b) x y)) +(rule 1 (lower (has_type ty (select (maybe_uextend (fcmp (FloatCC.Equal) a b)) x y))) + (lower_select_fcmp ty (emit_fcmp (FloatCC.NotEqual) a b) y x)) -(rule (lower (has_type ty (select (maybe_uextend (fcmp (FloatCC.Ordered) a b)) x y))) - (with_flags (x64_ucomis b a) (cmove_from_values ty (CC.NP) x y))) - -(rule (lower (has_type ty (select (maybe_uextend (fcmp (FloatCC.Unordered) a b)) x y))) - (with_flags (x64_ucomis b a) (cmove_from_values ty (CC.P) x y))) - -(rule (lower (has_type ty (select (maybe_uextend (fcmp (FloatCC.GreaterThan) a b)) x y))) - (with_flags (x64_ucomis b a) (cmove_from_values ty (CC.NBE) x y))) - -(rule (lower (has_type ty (select (maybe_uextend (fcmp (FloatCC.GreaterThanOrEqual) a b)) x y))) - (with_flags (x64_ucomis b a) (cmove_from_values ty (CC.NB) x y))) - -(rule (lower (has_type ty (select (maybe_uextend (fcmp (FloatCC.UnorderedOrLessThan) a b)) x y))) - (with_flags (x64_ucomis b a) (cmove_from_values ty (CC.B) x y))) - -(rule (lower (has_type ty (select (maybe_uextend (fcmp (FloatCC.UnorderedOrLessThanOrEqual) a b)) x y))) - (with_flags (x64_ucomis b a) (cmove_from_values ty (CC.BE) x y))) - -;; Certain FloatCC variants are implemented by flipping the operands of the -;; comparison (e.g., "greater than" is lowered the same as "less than" but the -;; comparison is reversed). This allows us to use a single flag for the `cmove`, -;; which involves fewer instructions than `cmove_or`. -;; -;; But why flip at all, you may ask? Can't we just use `CC.B` (i.e., below) for -;; `FloatCC.LessThan`? Recall that in these floating-point lowerings, values may -;; be unordered and we must we want to express that `FloatCC.LessThan` is `LT`, -;; not `LT | UNO`. By flipping the operands AND inverting the comparison (e.g., -;; to `CC.NBE`), we also avoid these unordered cases. - -(rule (lower (has_type ty (select (maybe_uextend (fcmp (FloatCC.LessThan) a b)) x y))) - (with_flags (x64_ucomis a b) (cmove_from_values ty (CC.NBE) x y))) - -(rule (lower (has_type ty (select (maybe_uextend (fcmp (FloatCC.LessThanOrEqual) a b)) x y))) - (with_flags (x64_ucomis a b) (cmove_from_values ty (CC.NB) x y))) - -(rule (lower (has_type ty (select (maybe_uextend (fcmp (FloatCC.UnorderedOrGreaterThan) a b)) x y))) - (with_flags (x64_ucomis a b) (cmove_from_values ty (CC.B) x y))) - -(rule (lower (has_type ty (select (maybe_uextend (fcmp (FloatCC.UnorderedOrGreaterThanOrEqual) a b)) x y))) - (with_flags (x64_ucomis a b) (cmove_from_values ty (CC.BE) x y))) - -;; `FloatCC.Equal` and `FloatCC.NotEqual` can only be implemented with multiple -;; flag checks. Recall from the flag assignment chart above that equality, e.g., -;; will assign `Z = 1`. But so does an unordered comparison: `Z = 1, P = 1, C = -;; 1`. In order to avoid semantics like `EQ | UNO` for equality, we must ensure -;; that the values are actually ordered, checking that `P = 0` (note that the -;; `C` flag is irrelevant here). Since we cannot find a single instruction that -;; implements a `Z = 1 AND P = 0` check, we invert the flag checks (i.e., `Z = 1 -;; AND P = 0` becomes `Z = 0 OR P = 1`) and also flip the select operands, `x` -;; and `y`. The same argument applies to `FloatCC.NotEqual`. -;; -;; More details about the CLIF semantics for `fcmp` are available at -;; https://docs.rs/cranelift-codegen/latest/cranelift_codegen/ir/trait.InstBuilder.html#method.fcmp. - -(rule (lower (has_type ty (select (maybe_uextend (fcmp (FloatCC.Equal) a b)) x y))) - (with_flags (x64_ucomis a b) (cmove_or_from_values ty (CC.NZ) (CC.P) y x))) - -(rule (lower (has_type ty (select (maybe_uextend (fcmp (FloatCC.NotEqual) a b)) x y))) - (with_flags (x64_ucomis a b) (cmove_or_from_values ty (CC.NZ) (CC.P) x y))) +(decl lower_select_fcmp (Type FcmpCondResult Value Value) InstOutput) +(rule (lower_select_fcmp ty (FcmpCondResult.Condition flags cc) x y) + (with_flags flags (cmove_from_values ty cc x y))) +(rule (lower_select_fcmp ty (FcmpCondResult.OrCondition flags cc1 cc2) x y) + (with_flags flags (cmove_or_from_values ty cc1 cc2 x y))) ;; We also can lower `select`s that depend on an `icmp` test, but more simply ;; than the `fcmp` variants above. In these cases, we lower to a `CMP` diff --git a/cranelift/filetests/filetests/isa/x64/cmp-mem-bug.clif b/cranelift/filetests/filetests/isa/x64/cmp-mem-bug.clif index 459302bebb..a7996bdab2 100644 --- a/cranelift/filetests/filetests/isa/x64/cmp-mem-bug.clif +++ b/cranelift/filetests/filetests/isa/x64/cmp-mem-bug.clif @@ -60,10 +60,10 @@ block0(v0: f64, v1: i64): ; setz %al ; andl %edi, %eax, %edi ; movzbq %dil, %rax -; ucomisd %xmm0, %xmm9 +; ucomisd %xmm9, %xmm0 ; movdqa %xmm0, %xmm2 -; mov z, sd; j%xmm2 $next; mov%xmm0 %xmm0, %xmm0; $next: -; mov np, sd; j%xmm2 $next; mov%xmm0 %xmm0, %xmm0; $next: +; movsd %xmm0, %xmm0; jnp $next; movsd %xmm2, %xmm0; $next: +; movsd %xmm0, %xmm0; jz $next; movsd %xmm2, %xmm0; $next: ; movq %rbp, %rsp ; popq %rbp ; ret @@ -79,11 +79,11 @@ block0(v0: f64, v1: i64): ; sete %al ; andl %eax, %edi ; movzbq %dil, %rax -; ucomisd %xmm0, %xmm9 +; ucomisd %xmm9, %xmm0 ; movdqa %xmm0, %xmm2 -; je 0x2f +; jnp 0x2f ; movsd %xmm2, %xmm0 -; jnp 0x39 +; je 0x39 ; movsd %xmm2, %xmm0 ; movq %rbp, %rsp ; popq %rbp diff --git a/cranelift/filetests/filetests/isa/x64/select-i128.clif b/cranelift/filetests/filetests/isa/x64/select-i128.clif index 89576c873f..035322813d 100644 --- a/cranelift/filetests/filetests/isa/x64/select-i128.clif +++ b/cranelift/filetests/filetests/isa/x64/select-i128.clif @@ -52,11 +52,11 @@ block0(v0: f32, v1: i128, v2: i128): ; block0: ; ucomiss %xmm0, %xmm0 ; movq %rdi, %rax -; cmovnzq %rdx, %rax, %rax ; cmovpq %rdx, %rax, %rax +; cmovnzq %rdx, %rax, %rax ; movq %rsi, %rdx -; cmovnzq %rcx, %rdx, %rdx ; cmovpq %rcx, %rdx, %rdx +; cmovnzq %rcx, %rdx, %rdx ; movq %rbp, %rsp ; popq %rbp ; ret @@ -68,11 +68,11 @@ block0(v0: f32, v1: i128, v2: i128): ; block1: ; offset 0x4 ; ucomiss %xmm0, %xmm0 ; movq %rdi, %rax -; cmovneq %rdx, %rax ; cmovpq %rdx, %rax +; cmovneq %rdx, %rax ; movq %rsi, %rdx -; cmovneq %rcx, %rdx ; cmovpq %rcx, %rdx +; cmovneq %rcx, %rdx ; movq %rbp, %rsp ; popq %rbp ; retq diff --git a/cranelift/filetests/filetests/isa/x64/select-issue-3744.clif b/cranelift/filetests/filetests/isa/x64/select-issue-3744.clif index 2e50d6c0a3..eff72d2b30 100644 --- a/cranelift/filetests/filetests/isa/x64/select-issue-3744.clif +++ b/cranelift/filetests/filetests/isa/x64/select-issue-3744.clif @@ -1,4 +1,4 @@ -test compile +test compile precise-output target x86_64 ; Check that no intervening moves are inserted when lowering `select` (see @@ -9,8 +9,35 @@ block0(v0: f32, v1: f32): v3 = iconst.i32 1 v4 = iconst.i32 0 v5 = select v2, v3, v4 - ; check: ucomiss - ; nextln: cmovnzl - ; nextln: cmovpl return v5 } + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movl $1, %eax +; ucomiss %xmm1, %xmm0 +; cmovpl const(0), %eax, %eax +; cmovnzl const(0), %eax, %eax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movl $1, %eax +; ucomiss %xmm1, %xmm0 +; cmovpl 0xd(%rip), %eax +; cmovnel 6(%rip), %eax +; movq %rbp, %rsp +; popq %rbp +; retq +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) + diff --git a/cranelift/filetests/filetests/isa/x64/select.clif b/cranelift/filetests/filetests/isa/x64/select.clif index 674a01e5fc..0b7445ace3 100644 --- a/cranelift/filetests/filetests/isa/x64/select.clif +++ b/cranelift/filetests/filetests/isa/x64/select.clif @@ -44,10 +44,10 @@ block0(v0: f32, v1: f32, v2: i64, v3: i64): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; ucomiss %xmm0, %xmm1 +; ucomiss %xmm1, %xmm0 ; movq %rdi, %rax -; cmovnzq %rsi, %rax, %rax ; cmovpq %rsi, %rax, %rax +; cmovnzq %rsi, %rax, %rax ; movq %rbp, %rsp ; popq %rbp ; ret @@ -57,10 +57,10 @@ block0(v0: f32, v1: f32, v2: i64, v3: i64): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; ucomiss %xmm0, %xmm1 +; ucomiss %xmm1, %xmm0 ; movq %rdi, %rax -; cmovneq %rsi, %rax ; cmovpq %rsi, %rax +; cmovneq %rsi, %rax ; movq %rbp, %rsp ; popq %rbp ; retq