x64: Deduplicate fcmp emission logic (#6113)
* x64: Deduplicate fcmp emission logic The `select`-of-`fcmp` lowering duplicated a good deal of `FloatCC` lowering logic that was already done by `emit_fcmp`, so this commit refactors these lowering rules to instead delegate to `emit_fcmp` and then handle that result. * Swap order of condition codes Shouldn't affect the correctness of this operation and it's a bit more natural to write the lowering rule this way. * Swap the order of comparison operands No need to swap `a b`, only the `x y` needs swapping. * Fix x64 printing of `XmmCmove`
This commit is contained in:
@@ -1589,20 +1589,19 @@ impl PrettyPrint for Inst {
|
||||
let alternative = pretty_print_reg(alternative.to_reg(), size, allocs);
|
||||
let dst = pretty_print_reg(dst.to_reg().to_reg(), size, allocs);
|
||||
let consequent = consequent.pretty_print(size, allocs);
|
||||
let suffix = match *ty {
|
||||
types::F64 => "sd",
|
||||
types::F32 => "ss",
|
||||
types::F32X4 => "aps",
|
||||
types::F64X2 => "apd",
|
||||
_ => "dqa",
|
||||
};
|
||||
format!(
|
||||
"mov {}, {}; j{} $next; mov{} {}, {}; $next: ",
|
||||
"mov{suffix} {alternative}, {dst}; \
|
||||
j{} $next; \
|
||||
mov{suffix} {consequent}, {dst}; \
|
||||
$next:",
|
||||
cc.invert().to_string(),
|
||||
match *ty {
|
||||
types::F64 => "sd",
|
||||
types::F32 => "ss",
|
||||
types::F32X4 => "aps",
|
||||
types::F64X2 => "apd",
|
||||
_ => "dqa",
|
||||
},
|
||||
consequent,
|
||||
dst,
|
||||
alternative,
|
||||
dst,
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
@@ -1570,90 +1570,25 @@
|
||||
|
||||
;;;; Rules for `select` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;; CLIF `select` instructions receive a testable argument (i.e. boolean or
|
||||
;; integer) that determines which of the other two arguments is selected as
|
||||
;; output. Since Cranelift booleans are typically generated by a comparison, the
|
||||
;; lowerings in this section "look upwards in the tree" to emit the proper
|
||||
;; sequence of "selection" instructions.
|
||||
;; When a `select` has an `fcmp` as a condition then rely on `emit_fcmp` to
|
||||
;; figure out how to perform the comparison.
|
||||
;;
|
||||
;; The following rules--for selecting on a floating-point comparison--emit a
|
||||
;; `UCOMIS*` instruction and then a conditional move, `cmove`. Note that for
|
||||
;; values contained in XMM registers, `cmove` and `cmove_or` may in fact emit a
|
||||
;; jump sequence, not `CMOV`. The `cmove` instruction operates on the flags set
|
||||
;; by `UCOMIS*`; the key to understanding these is the UCOMIS* documentation
|
||||
;; (see Intel's Software Developer's Manual, volume 2, chapter 4):
|
||||
;; - unordered assigns Z = 1, P = 1, C = 1
|
||||
;; - greater than assigns Z = 0, P = 0, C = 0
|
||||
;; - less than assigns Z = 0, P = 0, C = 1
|
||||
;; - equal assigns Z = 1, P = 0, C = 0
|
||||
;;
|
||||
;; Note that prefixing the flag with `N` means "not," so that `CC.P -> P = 1`
|
||||
;; and `CC.NP -> P = 0`. Also, x86 uses mnemonics for certain combinations of
|
||||
;; flags; e.g.:
|
||||
;; - `CC.B -> C = 1` (below)
|
||||
;; - `CC.NB -> C = 0` (not below)
|
||||
;; - `CC.BE -> C = 1 OR Z = 1` (below or equal)
|
||||
;; - `CC.NBE -> C = 0 AND Z = 0` (not below or equal)
|
||||
;; Note, though, that the `FloatCC.Equal` requires an "and" to happen for two
|
||||
;; condition codes which isn't the easiest thing to lower to a `cmove`
|
||||
;; instruction. For this reason a `select (fcmp eq ..) ..` is instead
|
||||
;; flipped around to be `select (fcmp ne ..) ..` with all operands reversed.
|
||||
;; This will produce a `FcmpCondResult.OrCondition` which is easier to codegen
|
||||
;; for.
|
||||
(rule (lower (has_type ty (select (maybe_uextend (fcmp cc a b)) x y)))
|
||||
(lower_select_fcmp ty (emit_fcmp cc a b) x y))
|
||||
(rule 1 (lower (has_type ty (select (maybe_uextend (fcmp (FloatCC.Equal) a b)) x y)))
|
||||
(lower_select_fcmp ty (emit_fcmp (FloatCC.NotEqual) a b) y x))
|
||||
|
||||
(rule (lower (has_type ty (select (maybe_uextend (fcmp (FloatCC.Ordered) a b)) x y)))
|
||||
(with_flags (x64_ucomis b a) (cmove_from_values ty (CC.NP) x y)))
|
||||
|
||||
(rule (lower (has_type ty (select (maybe_uextend (fcmp (FloatCC.Unordered) a b)) x y)))
|
||||
(with_flags (x64_ucomis b a) (cmove_from_values ty (CC.P) x y)))
|
||||
|
||||
(rule (lower (has_type ty (select (maybe_uextend (fcmp (FloatCC.GreaterThan) a b)) x y)))
|
||||
(with_flags (x64_ucomis b a) (cmove_from_values ty (CC.NBE) x y)))
|
||||
|
||||
(rule (lower (has_type ty (select (maybe_uextend (fcmp (FloatCC.GreaterThanOrEqual) a b)) x y)))
|
||||
(with_flags (x64_ucomis b a) (cmove_from_values ty (CC.NB) x y)))
|
||||
|
||||
(rule (lower (has_type ty (select (maybe_uextend (fcmp (FloatCC.UnorderedOrLessThan) a b)) x y)))
|
||||
(with_flags (x64_ucomis b a) (cmove_from_values ty (CC.B) x y)))
|
||||
|
||||
(rule (lower (has_type ty (select (maybe_uextend (fcmp (FloatCC.UnorderedOrLessThanOrEqual) a b)) x y)))
|
||||
(with_flags (x64_ucomis b a) (cmove_from_values ty (CC.BE) x y)))
|
||||
|
||||
;; Certain FloatCC variants are implemented by flipping the operands of the
|
||||
;; comparison (e.g., "greater than" is lowered the same as "less than" but the
|
||||
;; comparison is reversed). This allows us to use a single flag for the `cmove`,
|
||||
;; which involves fewer instructions than `cmove_or`.
|
||||
;;
|
||||
;; But why flip at all, you may ask? Can't we just use `CC.B` (i.e., below) for
|
||||
;; `FloatCC.LessThan`? Recall that in these floating-point lowerings, values may
|
||||
;; be unordered and we must we want to express that `FloatCC.LessThan` is `LT`,
|
||||
;; not `LT | UNO`. By flipping the operands AND inverting the comparison (e.g.,
|
||||
;; to `CC.NBE`), we also avoid these unordered cases.
|
||||
|
||||
(rule (lower (has_type ty (select (maybe_uextend (fcmp (FloatCC.LessThan) a b)) x y)))
|
||||
(with_flags (x64_ucomis a b) (cmove_from_values ty (CC.NBE) x y)))
|
||||
|
||||
(rule (lower (has_type ty (select (maybe_uextend (fcmp (FloatCC.LessThanOrEqual) a b)) x y)))
|
||||
(with_flags (x64_ucomis a b) (cmove_from_values ty (CC.NB) x y)))
|
||||
|
||||
(rule (lower (has_type ty (select (maybe_uextend (fcmp (FloatCC.UnorderedOrGreaterThan) a b)) x y)))
|
||||
(with_flags (x64_ucomis a b) (cmove_from_values ty (CC.B) x y)))
|
||||
|
||||
(rule (lower (has_type ty (select (maybe_uextend (fcmp (FloatCC.UnorderedOrGreaterThanOrEqual) a b)) x y)))
|
||||
(with_flags (x64_ucomis a b) (cmove_from_values ty (CC.BE) x y)))
|
||||
|
||||
;; `FloatCC.Equal` and `FloatCC.NotEqual` can only be implemented with multiple
|
||||
;; flag checks. Recall from the flag assignment chart above that equality, e.g.,
|
||||
;; will assign `Z = 1`. But so does an unordered comparison: `Z = 1, P = 1, C =
|
||||
;; 1`. In order to avoid semantics like `EQ | UNO` for equality, we must ensure
|
||||
;; that the values are actually ordered, checking that `P = 0` (note that the
|
||||
;; `C` flag is irrelevant here). Since we cannot find a single instruction that
|
||||
;; implements a `Z = 1 AND P = 0` check, we invert the flag checks (i.e., `Z = 1
|
||||
;; AND P = 0` becomes `Z = 0 OR P = 1`) and also flip the select operands, `x`
|
||||
;; and `y`. The same argument applies to `FloatCC.NotEqual`.
|
||||
;;
|
||||
;; More details about the CLIF semantics for `fcmp` are available at
|
||||
;; https://docs.rs/cranelift-codegen/latest/cranelift_codegen/ir/trait.InstBuilder.html#method.fcmp.
|
||||
|
||||
(rule (lower (has_type ty (select (maybe_uextend (fcmp (FloatCC.Equal) a b)) x y)))
|
||||
(with_flags (x64_ucomis a b) (cmove_or_from_values ty (CC.NZ) (CC.P) y x)))
|
||||
|
||||
(rule (lower (has_type ty (select (maybe_uextend (fcmp (FloatCC.NotEqual) a b)) x y)))
|
||||
(with_flags (x64_ucomis a b) (cmove_or_from_values ty (CC.NZ) (CC.P) x y)))
|
||||
(decl lower_select_fcmp (Type FcmpCondResult Value Value) InstOutput)
|
||||
(rule (lower_select_fcmp ty (FcmpCondResult.Condition flags cc) x y)
|
||||
(with_flags flags (cmove_from_values ty cc x y)))
|
||||
(rule (lower_select_fcmp ty (FcmpCondResult.OrCondition flags cc1 cc2) x y)
|
||||
(with_flags flags (cmove_or_from_values ty cc1 cc2 x y)))
|
||||
|
||||
;; We also can lower `select`s that depend on an `icmp` test, but more simply
|
||||
;; than the `fcmp` variants above. In these cases, we lower to a `CMP`
|
||||
|
||||
@@ -60,10 +60,10 @@ block0(v0: f64, v1: i64):
|
||||
; setz %al
|
||||
; andl %edi, %eax, %edi
|
||||
; movzbq %dil, %rax
|
||||
; ucomisd %xmm0, %xmm9
|
||||
; ucomisd %xmm9, %xmm0
|
||||
; movdqa %xmm0, %xmm2
|
||||
; mov z, sd; j%xmm2 $next; mov%xmm0 %xmm0, %xmm0; $next:
|
||||
; mov np, sd; j%xmm2 $next; mov%xmm0 %xmm0, %xmm0; $next:
|
||||
; movsd %xmm0, %xmm0; jnp $next; movsd %xmm2, %xmm0; $next:
|
||||
; movsd %xmm0, %xmm0; jz $next; movsd %xmm2, %xmm0; $next:
|
||||
; movq %rbp, %rsp
|
||||
; popq %rbp
|
||||
; ret
|
||||
@@ -79,11 +79,11 @@ block0(v0: f64, v1: i64):
|
||||
; sete %al
|
||||
; andl %eax, %edi
|
||||
; movzbq %dil, %rax
|
||||
; ucomisd %xmm0, %xmm9
|
||||
; ucomisd %xmm9, %xmm0
|
||||
; movdqa %xmm0, %xmm2
|
||||
; je 0x2f
|
||||
; jnp 0x2f
|
||||
; movsd %xmm2, %xmm0
|
||||
; jnp 0x39
|
||||
; je 0x39
|
||||
; movsd %xmm2, %xmm0
|
||||
; movq %rbp, %rsp
|
||||
; popq %rbp
|
||||
|
||||
@@ -52,11 +52,11 @@ block0(v0: f32, v1: i128, v2: i128):
|
||||
; block0:
|
||||
; ucomiss %xmm0, %xmm0
|
||||
; movq %rdi, %rax
|
||||
; cmovnzq %rdx, %rax, %rax
|
||||
; cmovpq %rdx, %rax, %rax
|
||||
; cmovnzq %rdx, %rax, %rax
|
||||
; movq %rsi, %rdx
|
||||
; cmovnzq %rcx, %rdx, %rdx
|
||||
; cmovpq %rcx, %rdx, %rdx
|
||||
; cmovnzq %rcx, %rdx, %rdx
|
||||
; movq %rbp, %rsp
|
||||
; popq %rbp
|
||||
; ret
|
||||
@@ -68,11 +68,11 @@ block0(v0: f32, v1: i128, v2: i128):
|
||||
; block1: ; offset 0x4
|
||||
; ucomiss %xmm0, %xmm0
|
||||
; movq %rdi, %rax
|
||||
; cmovneq %rdx, %rax
|
||||
; cmovpq %rdx, %rax
|
||||
; cmovneq %rdx, %rax
|
||||
; movq %rsi, %rdx
|
||||
; cmovneq %rcx, %rdx
|
||||
; cmovpq %rcx, %rdx
|
||||
; cmovneq %rcx, %rdx
|
||||
; movq %rbp, %rsp
|
||||
; popq %rbp
|
||||
; retq
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
test compile
|
||||
test compile precise-output
|
||||
target x86_64
|
||||
|
||||
; Check that no intervening moves are inserted when lowering `select` (see
|
||||
@@ -9,8 +9,35 @@ block0(v0: f32, v1: f32):
|
||||
v3 = iconst.i32 1
|
||||
v4 = iconst.i32 0
|
||||
v5 = select v2, v3, v4
|
||||
; check: ucomiss
|
||||
; nextln: cmovnzl
|
||||
; nextln: cmovpl
|
||||
return v5
|
||||
}
|
||||
|
||||
; VCode:
|
||||
; pushq %rbp
|
||||
; movq %rsp, %rbp
|
||||
; block0:
|
||||
; movl $1, %eax
|
||||
; ucomiss %xmm1, %xmm0
|
||||
; cmovpl const(0), %eax, %eax
|
||||
; cmovnzl const(0), %eax, %eax
|
||||
; movq %rbp, %rsp
|
||||
; popq %rbp
|
||||
; ret
|
||||
;
|
||||
; Disassembled:
|
||||
; block0: ; offset 0x0
|
||||
; pushq %rbp
|
||||
; movq %rsp, %rbp
|
||||
; block1: ; offset 0x4
|
||||
; movl $1, %eax
|
||||
; ucomiss %xmm1, %xmm0
|
||||
; cmovpl 0xd(%rip), %eax
|
||||
; cmovnel 6(%rip), %eax
|
||||
; movq %rbp, %rsp
|
||||
; popq %rbp
|
||||
; retq
|
||||
; addb %al, (%rax)
|
||||
; addb %al, (%rax)
|
||||
; addb %al, (%rax)
|
||||
; addb %al, (%rax)
|
||||
|
||||
|
||||
@@ -44,10 +44,10 @@ block0(v0: f32, v1: f32, v2: i64, v3: i64):
|
||||
; pushq %rbp
|
||||
; movq %rsp, %rbp
|
||||
; block0:
|
||||
; ucomiss %xmm0, %xmm1
|
||||
; ucomiss %xmm1, %xmm0
|
||||
; movq %rdi, %rax
|
||||
; cmovnzq %rsi, %rax, %rax
|
||||
; cmovpq %rsi, %rax, %rax
|
||||
; cmovnzq %rsi, %rax, %rax
|
||||
; movq %rbp, %rsp
|
||||
; popq %rbp
|
||||
; ret
|
||||
@@ -57,10 +57,10 @@ block0(v0: f32, v1: f32, v2: i64, v3: i64):
|
||||
; pushq %rbp
|
||||
; movq %rsp, %rbp
|
||||
; block1: ; offset 0x4
|
||||
; ucomiss %xmm0, %xmm1
|
||||
; ucomiss %xmm1, %xmm0
|
||||
; movq %rdi, %rax
|
||||
; cmovneq %rsi, %rax
|
||||
; cmovpq %rsi, %rax
|
||||
; cmovneq %rsi, %rax
|
||||
; movq %rbp, %rsp
|
||||
; popq %rbp
|
||||
; retq
|
||||
|
||||
Reference in New Issue
Block a user