ISLE: port fmin, fmax, fmin_pseudo, fmax_pseudo on x64. (#3856)
@@ -281,7 +281,8 @@
  (XmmMinMaxSeq (size OperandSize)
                (is_min bool)
                (lhs Xmm)
-               (rhs_dst WritableXmm))
+               (rhs Xmm)
+               (dst WritableXmm))

  ;; Float comparisons/tests: cmp (b w l q) (reg addr imm) reg.
  (XmmCmpRmR (op SseOpcode)
@@ -2430,6 +2431,71 @@
        (_ Unit (emit (MInst.UnaryRmR size (UnaryRmROpcode.Popcnt) src dst))))
    dst))

;; Helper for creating `xmm_min_max_seq` pseudo-instructions.
(decl xmm_min_max_seq (Type bool Xmm Xmm) Xmm)
(rule (xmm_min_max_seq ty is_min lhs rhs)
      (let ((dst WritableXmm (temp_writable_xmm))
            (size OperandSize (operand_size_of_type_32_64 ty))
            (_ Unit (emit (MInst.XmmMinMaxSeq size is_min lhs rhs dst))))
        dst))
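
(The scalar lowering rules added later in this diff call this helper directly, e.g. `(xmm_min_max_seq $F32 $true x y)` for a 32-bit `fmin`.)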

;; Helper for creating `minss` instructions.
(decl minss (Xmm Xmm) Xmm)
(rule (minss x y)
      (let ((dst WritableXmm (temp_writable_xmm))
            (_ Unit (emit (MInst.XmmRmR (SseOpcode.Minss) x y dst))))
        dst))

;; Helper for creating `minsd` instructions.
(decl minsd (Xmm Xmm) Xmm)
(rule (minsd x y)
      (let ((dst WritableXmm (temp_writable_xmm))
            (_ Unit (emit (MInst.XmmRmR (SseOpcode.Minsd) x y dst))))
        dst))

;; Helper for creating `minps` instructions.
(decl minps (Xmm Xmm) Xmm)
(rule (minps x y)
      (let ((dst WritableXmm (temp_writable_xmm))
            (_ Unit (emit (MInst.XmmRmR (SseOpcode.Minps) x y dst))))
        dst))

;; Helper for creating `minpd` instructions.
(decl minpd (Xmm Xmm) Xmm)
(rule (minpd x y)
      (let ((dst WritableXmm (temp_writable_xmm))
            (_ Unit (emit (MInst.XmmRmR (SseOpcode.Minpd) x y dst))))
        dst))

;; Helper for creating `maxss` instructions.
(decl maxss (Xmm Xmm) Xmm)
(rule (maxss x y)
      (let ((dst WritableXmm (temp_writable_xmm))
            (_ Unit (emit (MInst.XmmRmR (SseOpcode.Maxss) x y dst))))
        dst))

;; Helper for creating `maxsd` instructions.
(decl maxsd (Xmm Xmm) Xmm)
(rule (maxsd x y)
      (let ((dst WritableXmm (temp_writable_xmm))
            (_ Unit (emit (MInst.XmmRmR (SseOpcode.Maxsd) x y dst))))
        dst))

;; Helper for creating `maxps` instructions.
(decl maxps (Xmm Xmm) Xmm)
(rule (maxps x y)
      (let ((dst WritableXmm (temp_writable_xmm))
            (_ Unit (emit (MInst.XmmRmR (SseOpcode.Maxps) x y dst))))
        dst))

;; Helper for creating `maxpd` instructions.
(decl maxpd (Xmm Xmm) Xmm)
(rule (maxpd x y)
      (let ((dst WritableXmm (temp_writable_xmm))
            (_ Unit (emit (MInst.XmmRmR (SseOpcode.Maxpd) x y dst))))
        dst))

;;;; Automatic conversions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(convert Gpr InstOutput output_gpr)

@@ -1696,8 +1696,11 @@ pub(crate) fn emit(
            size,
            is_min,
            lhs,
-           rhs_dst,
+           rhs,
+           dst,
        } => {
+           debug_assert_eq!(*rhs, dst.to_reg());
+
            // Generates the following sequence:
            // cmpss/cmpsd %lhs, %rhs_dst
            // jnz do_min_max
@@ -1747,8 +1750,7 @@ pub(crate) fn emit(
                _ => unreachable!(),
            };

-           let inst =
-               Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(lhs.to_reg()), rhs_dst.to_reg().to_reg());
+           let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(lhs.to_reg()), dst.to_reg().to_reg());
            inst.emit(sink, info, state);

            one_way_jmp(sink, CC::NZ, do_min_max);
@@ -1758,7 +1760,7 @@ pub(crate) fn emit(
            // and negative zero. These instructions merge the sign bits in that
            // case, and are no-ops otherwise.
            let op = if *is_min { or_op } else { and_op };
-           let inst = Inst::xmm_rm_r(op, RegMem::reg(lhs.to_reg()), rhs_dst.to_writable_reg());
+           let inst = Inst::xmm_rm_r(op, RegMem::reg(lhs.to_reg()), dst.to_writable_reg());
            inst.emit(sink, info, state);

            let inst = Inst::jmp_known(done);
@@ -1768,17 +1770,13 @@ pub(crate) fn emit(
            // read-only operand: perform an addition between the two operands, which has the
            // desired NaN propagation effects.
            sink.bind_label(propagate_nan);
-           let inst = Inst::xmm_rm_r(add_op, RegMem::reg(lhs.to_reg()), rhs_dst.to_writable_reg());
+           let inst = Inst::xmm_rm_r(add_op, RegMem::reg(lhs.to_reg()), dst.to_writable_reg());
            inst.emit(sink, info, state);

            one_way_jmp(sink, CC::P, done);

            sink.bind_label(do_min_max);
-           let inst = Inst::xmm_rm_r(
-               min_max_op,
-               RegMem::reg(lhs.to_reg()),
-               rhs_dst.to_writable_reg(),
-           );
+           let inst = Inst::xmm_rm_r(min_max_op, RegMem::reg(lhs.to_reg()), dst.to_writable_reg());
            inst.emit(sink, info, state);

            sink.bind_label(done);
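
For orientation, this is the full sequence the pseudo-instruction expands to, reconstructed from the emit code above; the jump to `propagate_nan` sits in an elided hunk, so its exact placement is assumed:

    cmps{s,d}  %lhs, %dst      ; compare; ZF/PF reflect equal/unordered
    jnz  do_min_max            ; operands differ: the native min/max is correct
    jp   propagate_nan         ; unordered: at least one operand is NaN
    ;; ordered and equal: only +0 vs. -0 can differ, so merge the sign bits:
    orp{s,d}/andp{s,d} %lhs, %dst   ; OR for min (prefers -0), AND for max (prefers +0)
    jmp  done
    propagate_nan:
    adds{s,d}  %lhs, %dst      ; NaN + x produces a quiet NaN
    jp   done                  ; PF is still set from the compare above
    do_min_max:
    mins{s,d}/maxs{s,d} %lhs, %dst
    done: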

@@ -450,20 +450,24 @@ impl Inst {
        }
    }

+   #[allow(dead_code)]
    pub(crate) fn xmm_min_max_seq(
        size: OperandSize,
        is_min: bool,
        lhs: Reg,
-       rhs_dst: Writable<Reg>,
+       rhs: Reg,
+       dst: Writable<Reg>,
    ) -> Inst {
        debug_assert!(size.is_one_of(&[OperandSize::Size32, OperandSize::Size64]));
        debug_assert_eq!(lhs.get_class(), RegClass::V128);
-       debug_assert_eq!(rhs_dst.to_reg().get_class(), RegClass::V128);
+       debug_assert_eq!(rhs.get_class(), RegClass::V128);
+       debug_assert_eq!(dst.to_reg().get_class(), RegClass::V128);
        Inst::XmmMinMaxSeq {
            size,
            is_min,
            lhs: Xmm::new(lhs).unwrap(),
-           rhs_dst: WritableXmm::from_writable_reg(rhs_dst).unwrap(),
+           rhs: Xmm::new(rhs).unwrap(),
+           dst: WritableXmm::from_writable_reg(dst).unwrap(),
        }
    }

@@ -900,6 +904,18 @@ impl Inst {
            }
            insts.push(self);
        }
+       Inst::XmmMinMaxSeq { rhs, dst, .. } => {
+           if *rhs != dst.to_reg() {
+               debug_assert!(rhs.is_virtual());
+               insts.push(Self::gen_move(
+                   dst.to_writable_reg(),
+                   rhs.to_reg(),
+                   types::I8X16,
+               ));
+               *rhs = dst.to_reg();
+           }
+           insts.push(self);
+       }
        Inst::Cmove {
            size,
            alternative,
@@ -1330,11 +1346,12 @@ impl PrettyPrint for Inst {

        Inst::XmmMinMaxSeq {
            lhs,
-           rhs_dst,
+           rhs,
+           dst,
            is_min,
            size,
        } => format!(
-           "{} {}, {}",
+           "{} {}, {}, {}",
            ljustify2(
                if *is_min {
                    "xmm min seq ".to_string()
@@ -1344,7 +1361,8 @@ impl PrettyPrint for Inst {
                format!("f{}", size.to_bits())
            ),
            show_ireg_sized(lhs.to_reg(), mb_rru, 8),
-           show_ireg_sized(rhs_dst.to_reg().to_reg(), mb_rru, 8),
+           show_ireg_sized(rhs.to_reg(), mb_rru, 8),
+           show_ireg_sized(dst.to_reg().to_reg(), mb_rru, 8),
        ),

        Inst::XmmRmRImm {
@@ -1924,9 +1942,10 @@ fn x64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
        }
        Inst::XmmUninitializedValue { dst } => collector.add_def(dst.to_writable_reg()),
        Inst::XmmLoadConst { dst, .. } => collector.add_def(*dst),
-       Inst::XmmMinMaxSeq { lhs, rhs_dst, .. } => {
+       Inst::XmmMinMaxSeq { lhs, rhs, dst, .. } => {
+           debug_assert_eq!(*rhs, dst.to_reg());
            collector.add_use(lhs.to_reg());
-           collector.add_mod(rhs_dst.to_writable_reg());
+           collector.add_mod(dst.to_writable_reg());
        }
        Inst::XmmRmiReg {
            src1, src2, dst, ..
@@ -2352,11 +2371,14 @@ pub(crate) fn x64_map_regs<RM: RegMapper>(inst: &mut Inst, mapper: &RM) {
        }
        Inst::XmmMinMaxSeq {
            ref mut lhs,
-           ref mut rhs_dst,
+           ref mut rhs,
+           ref mut dst,
            ..
        } => {
+           debug_assert_eq!(*rhs, dst.to_reg());
            lhs.map_use(mapper);
-           rhs_dst.map_mod(mapper);
+           dst.map_mod(mapper);
+           *rhs = dst.to_reg();
        }
        Inst::XmmMovRM {
            ref mut src,

@@ -2059,3 +2059,194 @@
      (divps x y))
(rule (lower (has_type $F64X2 (fdiv x y)))
      (divpd x y))

;; Rules for `fmin` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type $F32 (fmin x y)))
      (xmm_min_max_seq $F32 $true x y))
(rule (lower (has_type $F64 (fmin x y)))
      (xmm_min_max_seq $F64 $true x y))

;; Vector-typed version. We don't use single pseudoinstructions as
;; above, because we don't need to generate a mini-CFG. Instead, we
;; perform a branchless series of operations.
;;
;; We cannot simply use native min instructions (minps, minpd) because
;; NaN handling is different per CLIF semantics than on
;; x86. Specifically, if an argument is NaN, or the arguments are both
;; zero but of opposite signs, then the x86 instruction always
;; produces the second argument. However, per CLIF semantics, we
;; require that fmin(NaN, _) = fmin(_, NaN) = NaN, and fmin(+0, -0) =
;; fmin(-0, +0) = -0.
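
As a reference point, a minimal scalar model of these CLIF semantics (illustrative Rust, not part of this commit):

    // CLIF fmin: NaN is propagated (as a quiet NaN), and -0 is
    // considered less than +0. fmax is the dual.
    fn clif_fmin(a: f32, b: f32) -> f32 {
        if a.is_nan() || b.is_nan() {
            f32::NAN // canonical quiet NaN
        } else if a == b {
            // IEEE 754 compares +0 == -0; fmin must still prefer -0.
            if a.is_sign_negative() { a } else { b }
        } else if a < b {
            a
        } else {
            b
        }
    }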

(rule (lower (has_type $F32X4 (fmin x y)))
      ;; Compute min(x, y) and min(y, x) with native
      ;; instructions. These will differ in one of the edge cases
      ;; above that we have to handle properly. (Conversely, if they
      ;; don't differ, then the native instruction's answer is the
      ;; right one per CLIF semantics.)
      (let ((min1 Xmm (minps x y))
            (min2 Xmm (minps y x))
            ;; Compute the OR of the two. Note that NaNs have an
            ;; exponent field of all-ones (0xFF for F32), so if either
            ;; result is a NaN, this OR will be. And if either is a
            ;; zero (which has an exponent of 0 and mantissa of 0),
            ;; this captures a sign-bit of 1 (negative) if either
            ;; input is negative.
            ;;
            ;; In the case where we don't have a +/-0 mismatch or
            ;; NaNs, then `min1` and `min2` are equal and `min_or` is
            ;; the correct minimum.
            (min_or Xmm (orps min1 min2))
            ;; "compare unordered" produces a true mask (all ones) in
            ;; a given lane if the min is a NaN. We use this to
            ;; generate a mask to ensure quiet NaNs.
            (is_nan_mask Xmm (cmpps min_or min2 (FcmpImm.Unordered)))
            ;; OR in the NaN mask.
            (min_or_2 Xmm (orps min_or is_nan_mask))
            ;; Shift the NaN mask down so that it covers just the
            ;; fraction below the NaN signalling bit; we'll use this
            ;; to mask off non-canonical NaN payloads.
            ;;
            ;; All-ones for NaN, shifted down to leave 10 top bits (1
            ;; sign, 8 exponent, 1 QNaN bit that must remain set)
            ;; cleared.
            (nan_fraction_mask Xmm (psrld is_nan_mask (RegMemImm.Imm 10)))
            ;; Do an AND-NOT, so that we retain every bit not set in
            ;; `nan_fraction_mask`. This mask will be all zeroes (so
            ;; we retain every bit) in non-NaN cases, and will have
            ;; ones (so we clear those bits) in NaN-payload bits
            ;; otherwise.
            (final Xmm (andnps nan_fraction_mask min_or_2)))
        final))
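
A bit-level check of the quieting step for a single F32 lane (a sketch, not part of the commit): in a NaN lane `min_or_2` is all-ones once the mask has been ORed in, and the shifted mask clears exactly the low 22 payload bits:

    let is_nan_mask: u32 = 0xFFFF_FFFF;             // lane flagged as NaN
    let min_or_2: u32 = 0xFFFF_FFFF;                // lane after ORing the mask in
    let nan_fraction_mask = is_nan_mask >> 10;      // 0x003F_FFFF: low 22 bits
    let final_lane = !nan_fraction_mask & min_or_2; // AND-NOT keeps the top 10 bits
    assert_eq!(final_lane, 0xFFC0_0000);            // all-ones exponent + QNaN bit: a quiet NaN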

;; Likewise for F64 lanes, except that the right-shift is by 13 bits
;; (1 sign, 11 exponent, 1 QNaN bit).
(rule (lower (has_type $F64X2 (fmin x y)))
      (let ((min1 Xmm (minpd x y))
            (min2 Xmm (minpd y x))
            (min_or Xmm (orpd min1 min2))
            (is_nan_mask Xmm (cmppd min1 min2 (FcmpImm.Unordered)))
            (min_or_2 Xmm (orpd min_or is_nan_mask))
            (nan_fraction_mask Xmm (psrlq is_nan_mask (RegMemImm.Imm 13)))
            (final Xmm (andnpd nan_fraction_mask min_or_2)))
        final))

;; Rules for `fmax` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type $F32 (fmax x y)))
      (xmm_min_max_seq $F32 $false x y))
(rule (lower (has_type $F64 (fmax x y)))
      (xmm_min_max_seq $F64 $false x y))

;; The vector version of fmax here is nearly a dual of the fmin
;; sequence above, with a few differences.

(rule (lower (has_type $F32X4 (fmax x y)))
      ;; Compute max(x, y) and max(y, x) with native
      ;; instructions. These will differ in one of the edge cases
      ;; above that we have to handle properly. (Conversely, if they
      ;; don't differ, then the native instruction's answer is the
      ;; right one per CLIF semantics.)
      (let ((max1 Xmm (maxps x y))
            (max2 Xmm (maxps y x))
            ;; Compute the XOR of the two maxima. In the case
            ;; where we don't have a +/-0 mismatch or NaNs, then
            ;; `max1` and `max2` are equal and this XOR is zero.
            (max_xor Xmm (xorps max1 max2))
            ;; OR the XOR into one of the original maxima. If they are
            ;; equal, this does nothing. If max2 was NaN, its exponent
            ;; bits were all-ones, so the XOR's exponent bits were the
            ;; complement of max1's, and the OR of max1 and max_xor has
            ;; an all-ones exponent (is a NaN). If max1 was NaN, then
            ;; its exponent bits were already all-ones, so the OR will
            ;; be a NaN as well.
            (max_blended_nan Xmm (orps max1 max_xor))
            ;; Subtract the XOR. This ensures that if we had +0 and
            ;; -0, we end up with +0.
            (max_blended_nan_positive Xmm (subps max_blended_nan max_xor))
            ;; "compare unordered" produces a true mask (all ones) in
            ;; a given lane if the max is a NaN. We use this to
            ;; generate a mask to ensure quiet NaNs.
            (is_nan_mask Xmm (cmpps max_blended_nan max_blended_nan (FcmpImm.Unordered)))
            ;; Shift the NaN mask down so that it covers just the
            ;; fraction below the NaN signalling bit; we'll use this
            ;; to mask off non-canonical NaN payloads.
            ;;
            ;; All-ones for NaN, shifted down to leave 10 top bits (1
            ;; sign, 8 exponent, 1 QNaN bit that must remain set)
            ;; cleared.
            (nan_fraction_mask Xmm (psrld is_nan_mask (RegMemImm.Imm 10)))
            ;; Do an AND-NOT, so that we retain every bit not set in
            ;; `nan_fraction_mask`. This mask will be all zeroes (so
            ;; we retain every bit) in non-NaN cases, and will have
            ;; ones (so we clear those bits) in NaN-payload bits
            ;; otherwise.
            (final Xmm (andnps nan_fraction_mask max_blended_nan_positive)))
        final))
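
A worked check of the ±0 handling for one F32 lane (a sketch, not part of the commit): with x = +0 and y = -0, the native maxps returns its second operand in both directions, and the XOR/OR/SUB steps land on +0 as CLIF requires:

    let max1 = -0.0f32;                     // maxps(x, y) returns y
    let max2 = 0.0f32;                      // maxps(y, x) returns x
    let max_xor = f32::from_bits(max1.to_bits() ^ max2.to_bits());    // sign bit only
    let blended = f32::from_bits(max1.to_bits() | max_xor.to_bits()); // still -0
    let result = blended - max_xor;         // -0.0 - (-0.0) == +0.0
    assert_eq!(result.to_bits(), 0.0f32.to_bits());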

(rule (lower (has_type $F64X2 (fmax x y)))
      ;; Compute max(x, y) and max(y, x) with native
      ;; instructions. These will differ in one of the edge cases
      ;; above that we have to handle properly. (Conversely, if they
      ;; don't differ, then the native instruction's answer is the
      ;; right one per CLIF semantics.)
      (let ((max1 Xmm (maxpd x y))
            (max2 Xmm (maxpd y x))
            ;; Compute the XOR of the two maxima. In the case
            ;; where we don't have a +/-0 mismatch or NaNs, then
            ;; `max1` and `max2` are equal and this XOR is zero.
            (max_xor Xmm (xorpd max1 max2))
            ;; OR the XOR into one of the original maxima. If they are
            ;; equal, this does nothing. If max2 was NaN, its exponent
            ;; bits were all-ones, so the XOR's exponent bits were the
            ;; complement of max1's, and the OR of max1 and max_xor has
            ;; an all-ones exponent (is a NaN). If max1 was NaN, then
            ;; its exponent bits were already all-ones, so the OR will
            ;; be a NaN as well.
            (max_blended_nan Xmm (orpd max1 max_xor))
            ;; Subtract the XOR. This ensures that if we had +0 and
            ;; -0, we end up with +0.
            (max_blended_nan_positive Xmm (subpd max_blended_nan max_xor))
            ;; `cmppd` with predicate index `3` is `cmpunordpd`, or
            ;; "compare unordered": it produces a true mask (all ones)
            ;; in a given lane if the max is a NaN. We use this to
            ;; generate a mask to ensure quiet NaNs.
            (is_nan_mask Xmm (cmppd max_blended_nan max_blended_nan (FcmpImm.Unordered)))
            ;; Shift the NaN mask down so that it covers just the
            ;; fraction below the NaN signalling bit; we'll use this
            ;; to mask off non-canonical NaN payloads.
            ;;
            ;; All-ones for NaN, shifted down to leave 13 top bits (1
            ;; sign, 11 exponent, 1 QNaN bit that must remain set)
            ;; cleared.
            (nan_fraction_mask Xmm (psrlq is_nan_mask (RegMemImm.Imm 13)))
            ;; Do an AND-NOT, so that we retain every bit not set in
            ;; `nan_fraction_mask`. This mask will be all zeroes (so
            ;; we retain every bit) in non-NaN cases, and will have
            ;; ones (so we clear those bits) in NaN-payload bits
            ;; otherwise.
            (final Xmm (andnpd nan_fraction_mask max_blended_nan_positive)))
        final))

;; Rules for `fmin_pseudo` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type $F32 (fmin_pseudo x y)))
      (minss y x))
(rule (lower (has_type $F64 (fmin_pseudo x y)))
      (minsd y x))
(rule (lower (has_type $F32X4 (fmin_pseudo x y)))
      (minps y x))
(rule (lower (has_type $F64X2 (fmin_pseudo x y)))
      (minpd y x))

;; Rules for `fmax_pseudo` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type $F32 (fmax_pseudo x y)))
      (maxss y x))
(rule (lower (has_type $F64 (fmax_pseudo x y)))
      (maxsd y x))
(rule (lower (has_type $F32X4 (fmax_pseudo x y)))
      (maxps y x))
(rule (lower (has_type $F64X2 (fmax_pseudo x y)))
      (maxpd y x))
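
For reference, the pseudo-op semantics these rules implement reduce to plain comparisons (illustrative Rust, not part of this commit). The operands are passed to the x86 min/max in reversed order because those instructions return their second source operand on NaN or equal inputs, which is exactly what the pseudo ops require:

    // fmin_pseudo(a, b) = (b < a) ? b : a; fmax_pseudo(a, b) = (a < b) ? b : a.
    // Any comparison involving NaN is false, so both return `a` on NaN.
    fn fmin_pseudo(a: f64, b: f64) -> f64 { if b < a { b } else { a } }
    fn fmax_pseudo(a: f64, b: f64) -> f64 { if a < b { b } else { a } }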

@@ -910,7 +910,11 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
            | Opcode::Fadd
            | Opcode::Fsub
            | Opcode::Fmul
-           | Opcode::Fdiv => implemented_in_isle(ctx),
+           | Opcode::Fdiv
+           | Opcode::Fmin
+           | Opcode::Fmax
+           | Opcode::FminPseudo
+           | Opcode::FmaxPseudo => implemented_in_isle(ctx),

            Opcode::Icmp => {
                let condcode = ctx.data(insn).cond_code().unwrap();
@@ -1278,235 +1282,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
            };
        }

-       Opcode::Fmin | Opcode::Fmax => {
-           let lhs = put_input_in_reg(ctx, inputs[0]);
-           let rhs = put_input_in_reg(ctx, inputs[1]);
-           let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-           let is_min = op == Opcode::Fmin;
-           let output_ty = ty.unwrap();
-           ctx.emit(Inst::gen_move(dst, rhs, output_ty));
-           if !output_ty.is_vector() {
-               let op_size = match output_ty {
-                   types::F32 => OperandSize::Size32,
-                   types::F64 => OperandSize::Size64,
-                   _ => panic!("unexpected type {:?} for fmin/fmax", output_ty),
-               };
-               ctx.emit(Inst::xmm_min_max_seq(op_size, is_min, lhs, dst));
-           } else {
-               // X64's implementation of floating point min and floating point max does not
-               // propagate NaNs and +0's in a way that is friendly to the SIMD spec. For the
-               // scalar approach we use jumps to handle cases where NaN and +0 propagation is
-               // not consistent with what is needed. However, for packed floating point min
-               // and max we implement a different approach to avoid the sequence of jumps
-               // that would be required on a per-lane basis. Because we do not need to lower
-               // labels and jumps, but do need ctx for creating temporaries, we implement the
-               // lowering here in lower.rs instead of emit.rs as is done for scalars.
-               // The outline of the approach is as follows:
-               //
-               // First we perform the min/max in both directions. This is because, in the
-               // case of an operand's lane containing a NaN, or in the case of the lanes of
-               // the two operands containing 0 but with mismatched signs, x64 will return the
-               // second operand regardless of its contents. So in order to make sure we
-               // capture NaNs and normalize NaNs and 0 values, we compute the operation in
-               // both directions and merge the results. Then we normalize the results through
-               // operations that create a mask for the lanes containing NaNs; we use that
-               // mask to adjust NaNs to quiet NaNs and normalize 0s.
-               //
-               // The following sequence is generated for min:
-               //
-               // movap{s,d} %lhs, %tmp
-               // minp{s,d} %dst, %tmp
-               // minp{s,d} %lhs, %dst
-               // orp{s,d} %dst, %tmp
-               // cmpp{s,d} %tmp, %dst, $3
-               // orp{s,d} %dst, %tmp
-               // psrl{d,q} {$10, $13}, %dst
-               // andnp{s,d} %tmp, %dst
-               //
-               // and for max the sequence is:
-               //
-               // movap{s,d} %lhs, %tmp
-               // maxp{s,d} %dst, %tmp
-               // maxp{s,d} %lhs, %dst
-               // xorp{s,d} %tmp, %dst
-               // orp{s,d} %dst, %tmp
-               // subp{s,d} %dst, %tmp
-               // cmpp{s,d} %tmp, %dst, $3
-               // psrl{d,q} {$10, $13}, %dst
-               // andnp{s,d} %tmp, %dst
-
-               if is_min {
-                   let (mov_op, min_op, or_op, cmp_op, shift_op, shift_by, andn_op) =
-                       match output_ty {
-                           types::F32X4 => (
-                               SseOpcode::Movaps,
-                               SseOpcode::Minps,
-                               SseOpcode::Orps,
-                               SseOpcode::Cmpps,
-                               SseOpcode::Psrld,
-                               10,
-                               SseOpcode::Andnps,
-                           ),
-                           types::F64X2 => (
-                               SseOpcode::Movapd,
-                               SseOpcode::Minpd,
-                               SseOpcode::Orpd,
-                               SseOpcode::Cmppd,
-                               SseOpcode::Psrlq,
-                               13,
-                               SseOpcode::Andnpd,
-                           ),
-                           _ => unimplemented!("unsupported op type {:?}", output_ty),
-                       };
-
-                   // Copy lhs into tmp.
-                   let tmp_xmm1 = ctx.alloc_tmp(output_ty).only_reg().unwrap();
-                   ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(lhs), tmp_xmm1));
-
-                   // Perform min in reverse direction.
-                   ctx.emit(Inst::xmm_rm_r(min_op, RegMem::from(dst), tmp_xmm1));
-
-                   // Perform min in original direction.
-                   ctx.emit(Inst::xmm_rm_r(min_op, RegMem::reg(lhs), dst));
-
-                   // X64 handles propagation of -0's and NaNs differently between left and
-                   // right operands. After doing the min in both directions, this OR will
-                   // guarantee capture of -0's and NaNs in our tmp register.
-                   ctx.emit(Inst::xmm_rm_r(or_op, RegMem::from(dst), tmp_xmm1));
-
-                   // Compare unordered to create a mask for lanes containing NaNs, and then
-                   // use that mask to saturate the NaN-containing lanes in the tmp register
-                   // with 1s.
-                   // TODO: Would a check for NaN and then a jump be better here in the
-                   // common case than continuing on to normalize NaNs that might not exist?
-                   let cond = FcmpImm::from(FloatCC::Unordered);
-                   ctx.emit(Inst::xmm_rm_r_imm(
-                       cmp_op,
-                       RegMem::reg(tmp_xmm1.to_reg()),
-                       dst,
-                       cond.encode(),
-                       OperandSize::Size32,
-                   ));
-                   ctx.emit(Inst::xmm_rm_r(or_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
-
-                   // The dst register holds a mask for lanes containing NaNs.
-                   // We take that mask and shift it in preparation for creating a different
-                   // mask to normalize NaNs (create a quiet NaN) by zeroing out the
-                   // appropriate number of least significant bits. We shift right each lane
-                   // by 10 bits (1 sign + 8 exp. + 1 MSB sig.) for F32X4 and by 13 bits
-                   // (1 sign + 11 exp. + 1 MSB sig.) for F64X2.
-                   ctx.emit(Inst::xmm_rmi_reg(shift_op, RegMemImm::imm(shift_by), dst));
-
-                   // Finally we do an AND-NOT with the tmp register to produce the final
-                   // result in dst.
-                   ctx.emit(Inst::xmm_rm_r(andn_op, RegMem::reg(tmp_xmm1.to_reg()), dst));
-               } else {
-                   let (
-                       mov_op,
-                       max_op,
-                       xor_op,
-                       or_op,
-                       sub_op,
-                       cmp_op,
-                       shift_op,
-                       shift_by,
-                       andn_op,
-                   ) = match output_ty {
-                       types::F32X4 => (
-                           SseOpcode::Movaps,
-                           SseOpcode::Maxps,
-                           SseOpcode::Xorps,
-                           SseOpcode::Orps,
-                           SseOpcode::Subps,
-                           SseOpcode::Cmpps,
-                           SseOpcode::Psrld,
-                           10,
-                           SseOpcode::Andnps,
-                       ),
-                       types::F64X2 => (
-                           SseOpcode::Movapd,
-                           SseOpcode::Maxpd,
-                           SseOpcode::Xorpd,
-                           SseOpcode::Orpd,
-                           SseOpcode::Subpd,
-                           SseOpcode::Cmppd,
-                           SseOpcode::Psrlq,
-                           13,
-                           SseOpcode::Andnpd,
-                       ),
-                       _ => unimplemented!("unsupported op type {:?}", output_ty),
-                   };
-
-                   // Copy lhs into tmp.
-                   let tmp_xmm1 = ctx.alloc_tmp(types::F32).only_reg().unwrap();
-                   ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(lhs), tmp_xmm1));
-
-                   // Perform max in reverse direction.
-                   ctx.emit(Inst::xmm_rm_r(max_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
-
-                   // Perform max in original direction.
-                   ctx.emit(Inst::xmm_rm_r(max_op, RegMem::reg(lhs), dst));
-
-                   // Get the difference between the two results and store it in tmp.
-                   // Max uses a different approach than min to account for potential
-                   // discrepancies with plus/minus 0.
-                   ctx.emit(Inst::xmm_rm_r(xor_op, RegMem::reg(tmp_xmm1.to_reg()), dst));
-
-                   // X64 handles propagation of -0's and NaNs differently between left and
-                   // right operands. After doing the max in both directions, this OR will
-                   // guarantee capture of 0's and NaNs in our tmp register.
-                   ctx.emit(Inst::xmm_rm_r(or_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
-
-                   // Capture NaNs and sign discrepancies.
-                   ctx.emit(Inst::xmm_rm_r(sub_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
-
-                   // Compare unordered to create a mask for lanes containing NaNs, and then
-                   // use that mask to saturate the NaN-containing lanes in the tmp register
-                   // with 1s.
-                   let cond = FcmpImm::from(FloatCC::Unordered);
-                   ctx.emit(Inst::xmm_rm_r_imm(
-                       cmp_op,
-                       RegMem::reg(tmp_xmm1.to_reg()),
-                       dst,
-                       cond.encode(),
-                       OperandSize::Size32,
-                   ));
-
-                   // The dst register holds a mask for lanes containing NaNs.
-                   // We take that mask and shift it in preparation for creating a different
-                   // mask to normalize NaNs (create a quiet NaN) by zeroing out the
-                   // appropriate number of least significant bits. We shift right each lane
-                   // by 10 bits (1 sign + 8 exp. + 1 MSB sig.) for F32X4 and by 13 bits
-                   // (1 sign + 11 exp. + 1 MSB sig.) for F64X2.
-                   ctx.emit(Inst::xmm_rmi_reg(shift_op, RegMemImm::imm(shift_by), dst));
-
-                   // Finally we do an AND-NOT with the tmp register to produce the final
-                   // result in dst.
-                   ctx.emit(Inst::xmm_rm_r(andn_op, RegMem::reg(tmp_xmm1.to_reg()), dst));
-               }
-           }
-       }
-
-       Opcode::FminPseudo | Opcode::FmaxPseudo => {
-           // We can't guarantee the RHS (if a load) is 128-bit aligned, so we
-           // must avoid merging a load here.
-           let lhs = RegMem::reg(put_input_in_reg(ctx, inputs[0]));
-           let rhs = put_input_in_reg(ctx, inputs[1]);
-           let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-           let ty = ty.unwrap();
-           ctx.emit(Inst::gen_move(dst, rhs, ty));
-           let sse_opcode = match (ty, op) {
-               (types::F32, Opcode::FminPseudo) => SseOpcode::Minss,
-               (types::F32, Opcode::FmaxPseudo) => SseOpcode::Maxss,
-               (types::F64, Opcode::FminPseudo) => SseOpcode::Minsd,
-               (types::F64, Opcode::FmaxPseudo) => SseOpcode::Maxsd,
-               (types::F32X4, Opcode::FminPseudo) => SseOpcode::Minps,
-               (types::F32X4, Opcode::FmaxPseudo) => SseOpcode::Maxps,
-               (types::F64X2, Opcode::FminPseudo) => SseOpcode::Minpd,
-               (types::F64X2, Opcode::FmaxPseudo) => SseOpcode::Maxpd,
-               _ => unimplemented!("unsupported type {} for {}", ty, op),
-           };
-           ctx.emit(Inst::xmm_rm_r(sse_opcode, lhs, dst));
-       }

        Opcode::Sqrt => {
            // We can't guarantee the RHS (if a load) is 128-bit aligned, so we
            // must avoid merging a load here.

@@ -1,4 +1,4 @@
 src/clif.isle 9ea75a6f790b5c03
 src/prelude.isle b2bc986bcbbbb77
-src/isa/x64/inst.isle 9a8a3babd8257100
-src/isa/x64/lower.isle f0f4af691241209e
+src/isa/x64/inst.isle 40f495d3ca5ae547
+src/isa/x64/lower.isle faa2a07bba48a813

File diff suppressed because it is too large