x64: Lower fcvt_to_{u,s}int{,_sat} in ISLE (#4704)
https://github.com/bytecodealliance/wasmtime/pull/4704
This commit is contained in:
@@ -3047,6 +3047,10 @@
|
|||||||
(_ Unit (emit (MInst.GprToXmm (SseOpcode.Cvtsi2sd) x dst size))))
|
(_ Unit (emit (MInst.GprToXmm (SseOpcode.Cvtsi2sd) x dst size))))
|
||||||
dst))
|
dst))
|
||||||
|
|
||||||
|
(decl x64_cvttps2dq (Type XmmMem) Xmm)
|
||||||
|
(rule (x64_cvttps2dq ty x)
|
||||||
|
(xmm_unary_rm_r (SseOpcode.Cvttps2dq) x))
|
||||||
|
|
||||||
(decl cvt_u64_to_float_seq (Type Gpr) Xmm)
|
(decl cvt_u64_to_float_seq (Type Gpr) Xmm)
|
||||||
(rule (cvt_u64_to_float_seq ty src)
|
(rule (cvt_u64_to_float_seq ty src)
|
||||||
(let ((size OperandSize (raw_operand_size_of_type ty))
|
(let ((size OperandSize (raw_operand_size_of_type ty))
|
||||||
@@ -3058,6 +3062,34 @@
|
|||||||
(_ Unit (emit (MInst.CvtUint64ToFloatSeq size src_copy dst tmp_gpr1 tmp_gpr2))))
|
(_ Unit (emit (MInst.CvtUint64ToFloatSeq size src_copy dst tmp_gpr1 tmp_gpr2))))
|
||||||
dst))
|
dst))
|
||||||
|
|
||||||
|
(decl cvt_float_to_uint_seq (Type Value bool) Gpr)
|
||||||
|
(rule (cvt_float_to_uint_seq out_ty src @ (value_type src_ty) is_saturating)
|
||||||
|
(let ((out_size OperandSize (raw_operand_size_of_type out_ty))
|
||||||
|
(src_size OperandSize (raw_operand_size_of_type src_ty))
|
||||||
|
|
||||||
|
(tmp WritableXmm (temp_writable_xmm))
|
||||||
|
(_ Unit (emit (gen_move src_ty tmp src)))
|
||||||
|
|
||||||
|
(dst WritableGpr (temp_writable_gpr))
|
||||||
|
(tmp_xmm WritableXmm (temp_writable_xmm))
|
||||||
|
(tmp_gpr WritableGpr (temp_writable_gpr))
|
||||||
|
(_ Unit (emit (MInst.CvtFloatToUintSeq out_size src_size is_saturating tmp dst tmp_gpr tmp_xmm))))
|
||||||
|
dst))
|
||||||
|
|
||||||
|
(decl cvt_float_to_sint_seq (Type Value bool) Gpr)
|
||||||
|
(rule (cvt_float_to_sint_seq out_ty src @ (value_type src_ty) is_saturating)
|
||||||
|
(let ((out_size OperandSize (raw_operand_size_of_type out_ty))
|
||||||
|
(src_size OperandSize (raw_operand_size_of_type src_ty))
|
||||||
|
|
||||||
|
(tmp WritableXmm (temp_writable_xmm))
|
||||||
|
(_ Unit (emit (gen_move src_ty tmp src)))
|
||||||
|
|
||||||
|
(dst WritableGpr (temp_writable_gpr))
|
||||||
|
(tmp_xmm WritableXmm (temp_writable_xmm))
|
||||||
|
(tmp_gpr WritableGpr (temp_writable_gpr))
|
||||||
|
(_ Unit (emit (MInst.CvtFloatToSintSeq out_size src_size is_saturating tmp dst tmp_gpr tmp_xmm))))
|
||||||
|
dst))
|
||||||
|
|
||||||
(decl fcvt_uint_mask_const () VCodeConstant)
|
(decl fcvt_uint_mask_const () VCodeConstant)
|
||||||
(extern constructor fcvt_uint_mask_const fcvt_uint_mask_const)
|
(extern constructor fcvt_uint_mask_const fcvt_uint_mask_const)
|
||||||
|
|
||||||
|
|||||||
@@ -408,58 +408,6 @@ impl Inst {
|
|||||||
Inst::XmmCmpRmR { op, src, dst }
|
Inst::XmmCmpRmR { op, src, dst }
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn cvt_float_to_sint_seq(
|
|
||||||
src_size: OperandSize,
|
|
||||||
dst_size: OperandSize,
|
|
||||||
is_saturating: bool,
|
|
||||||
src: Writable<Reg>,
|
|
||||||
dst: Writable<Reg>,
|
|
||||||
tmp_gpr: Writable<Reg>,
|
|
||||||
tmp_xmm: Writable<Reg>,
|
|
||||||
) -> Inst {
|
|
||||||
debug_assert!(src_size.is_one_of(&[OperandSize::Size32, OperandSize::Size64]));
|
|
||||||
debug_assert!(dst_size.is_one_of(&[OperandSize::Size32, OperandSize::Size64]));
|
|
||||||
debug_assert!(src.to_reg().class() == RegClass::Float);
|
|
||||||
debug_assert!(tmp_xmm.to_reg().class() == RegClass::Float);
|
|
||||||
debug_assert!(tmp_gpr.to_reg().class() == RegClass::Int);
|
|
||||||
debug_assert!(dst.to_reg().class() == RegClass::Int);
|
|
||||||
Inst::CvtFloatToSintSeq {
|
|
||||||
src_size,
|
|
||||||
dst_size,
|
|
||||||
is_saturating,
|
|
||||||
src: WritableXmm::from_writable_reg(src).unwrap(),
|
|
||||||
dst: WritableGpr::from_writable_reg(dst).unwrap(),
|
|
||||||
tmp_gpr: WritableGpr::from_writable_reg(tmp_gpr).unwrap(),
|
|
||||||
tmp_xmm: WritableXmm::from_writable_reg(tmp_xmm).unwrap(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn cvt_float_to_uint_seq(
|
|
||||||
src_size: OperandSize,
|
|
||||||
dst_size: OperandSize,
|
|
||||||
is_saturating: bool,
|
|
||||||
src: Writable<Reg>,
|
|
||||||
dst: Writable<Reg>,
|
|
||||||
tmp_gpr: Writable<Reg>,
|
|
||||||
tmp_xmm: Writable<Reg>,
|
|
||||||
) -> Inst {
|
|
||||||
debug_assert!(src_size.is_one_of(&[OperandSize::Size32, OperandSize::Size64]));
|
|
||||||
debug_assert!(dst_size.is_one_of(&[OperandSize::Size32, OperandSize::Size64]));
|
|
||||||
debug_assert!(src.to_reg().class() == RegClass::Float);
|
|
||||||
debug_assert!(tmp_xmm.to_reg().class() == RegClass::Float);
|
|
||||||
debug_assert!(tmp_gpr.to_reg().class() == RegClass::Int);
|
|
||||||
debug_assert!(dst.to_reg().class() == RegClass::Int);
|
|
||||||
Inst::CvtFloatToUintSeq {
|
|
||||||
src_size,
|
|
||||||
dst_size,
|
|
||||||
is_saturating,
|
|
||||||
src: WritableXmm::from_writable_reg(src).unwrap(),
|
|
||||||
dst: WritableGpr::from_writable_reg(dst).unwrap(),
|
|
||||||
tmp_gpr: WritableGpr::from_writable_reg(tmp_gpr).unwrap(),
|
|
||||||
tmp_xmm: WritableXmm::from_writable_reg(tmp_xmm).unwrap(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[allow(dead_code)]
|
#[allow(dead_code)]
|
||||||
pub(crate) fn xmm_min_max_seq(
|
pub(crate) fn xmm_min_max_seq(
|
||||||
size: OperandSize,
|
size: OperandSize,
|
||||||
@@ -1257,7 +1205,7 @@ impl PrettyPrint for Inst {
|
|||||||
dst_size,
|
dst_size,
|
||||||
tmp_xmm,
|
tmp_xmm,
|
||||||
tmp_gpr,
|
tmp_gpr,
|
||||||
..
|
is_saturating,
|
||||||
} => {
|
} => {
|
||||||
let src = pretty_print_reg(src.to_reg().to_reg(), src_size.to_bytes(), allocs);
|
let src = pretty_print_reg(src.to_reg().to_reg(), src_size.to_bytes(), allocs);
|
||||||
let dst = pretty_print_reg(dst.to_reg().to_reg(), dst_size.to_bytes(), allocs);
|
let dst = pretty_print_reg(dst.to_reg().to_reg(), dst_size.to_bytes(), allocs);
|
||||||
@@ -1266,9 +1214,10 @@ impl PrettyPrint for Inst {
|
|||||||
format!(
|
format!(
|
||||||
"{} {}, {}, {}, {}",
|
"{} {}, {}, {}, {}",
|
||||||
ljustify(format!(
|
ljustify(format!(
|
||||||
"cvt_float{}_to_sint{}_seq",
|
"cvt_float{}_to_sint{}{}_seq",
|
||||||
src_size.to_bits(),
|
src_size.to_bits(),
|
||||||
dst_size.to_bits()
|
dst_size.to_bits(),
|
||||||
|
if *is_saturating { "_sat" } else { "" },
|
||||||
)),
|
)),
|
||||||
src,
|
src,
|
||||||
dst,
|
dst,
|
||||||
@@ -1284,7 +1233,7 @@ impl PrettyPrint for Inst {
|
|||||||
dst_size,
|
dst_size,
|
||||||
tmp_gpr,
|
tmp_gpr,
|
||||||
tmp_xmm,
|
tmp_xmm,
|
||||||
..
|
is_saturating,
|
||||||
} => {
|
} => {
|
||||||
let src = pretty_print_reg(src.to_reg().to_reg(), src_size.to_bytes(), allocs);
|
let src = pretty_print_reg(src.to_reg().to_reg(), src_size.to_bytes(), allocs);
|
||||||
let dst = pretty_print_reg(dst.to_reg().to_reg(), dst_size.to_bytes(), allocs);
|
let dst = pretty_print_reg(dst.to_reg().to_reg(), dst_size.to_bytes(), allocs);
|
||||||
@@ -1293,9 +1242,10 @@ impl PrettyPrint for Inst {
|
|||||||
format!(
|
format!(
|
||||||
"{} {}, {}, {}, {}",
|
"{} {}, {}, {}, {}",
|
||||||
ljustify(format!(
|
ljustify(format!(
|
||||||
"cvt_float{}_to_uint{}_seq",
|
"cvt_float{}_to_uint{}{}_seq",
|
||||||
src_size.to_bits(),
|
src_size.to_bits(),
|
||||||
dst_size.to_bits()
|
dst_size.to_bits(),
|
||||||
|
if *is_saturating { "_sat" } else { "" },
|
||||||
)),
|
)),
|
||||||
src,
|
src,
|
||||||
dst,
|
dst,
|
||||||
|
|||||||
@@ -3062,3 +3062,130 @@
|
|||||||
|
|
||||||
;; add together the two converted values
|
;; add together the two converted values
|
||||||
(x64_addps a_hi a_lo)))
|
(x64_addps a_hi a_lo)))
|
||||||
|
|
||||||
|
;; Rules for `fcvt_to_uint` and `fcvt_to_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
|
(rule (lower (has_type out_ty (fcvt_to_uint val @ (value_type (ty_scalar_float _)))))
|
||||||
|
(cvt_float_to_uint_seq out_ty val $false))
|
||||||
|
|
||||||
|
(rule (lower (has_type out_ty (fcvt_to_uint_sat val @ (value_type (ty_scalar_float _)))))
|
||||||
|
(cvt_float_to_uint_seq out_ty val $true))
|
||||||
|
|
||||||
|
(rule (lower (has_type out_ty (fcvt_to_sint val @ (value_type (ty_scalar_float _)))))
|
||||||
|
(cvt_float_to_sint_seq out_ty val $false))
|
||||||
|
|
||||||
|
(rule (lower (has_type out_ty (fcvt_to_sint_sat val @ (value_type (ty_scalar_float _)))))
|
||||||
|
(cvt_float_to_sint_seq out_ty val $true))
|
||||||
|
|
||||||
|
;; The x64 backend currently only supports these two type combinations.
|
||||||
|
(rule (lower (has_type $I32X4 (fcvt_to_sint_sat val @ (value_type $F32X4))))
|
||||||
|
(let (;; Sets tmp to zero if float is NaN
|
||||||
|
(tmp Xmm (x64_cmpps val val (FcmpImm.Equal)))
|
||||||
|
(dst Xmm (x64_andps val tmp))
|
||||||
|
|
||||||
|
;; Sets top bit of tmp if float is positive
|
||||||
|
;; Setting up to set top bit on negative float values
|
||||||
|
(tmp Xmm (x64_pxor tmp dst))
|
||||||
|
|
||||||
|
;; Convert the packed float to packed doubleword.
|
||||||
|
(dst Xmm (x64_cvttps2dq $F32X4 dst))
|
||||||
|
|
||||||
|
;; Set top bit only if < 0
|
||||||
|
(tmp Xmm (x64_pand dst tmp))
|
||||||
|
(tmp Xmm (x64_psrad tmp (RegMemImm.Imm 31))))
|
||||||
|
|
||||||
|
;; On overflow 0x80000000 is returned to a lane.
|
||||||
|
;; Below sets positive overflow lanes to 0x7FFFFFFF
|
||||||
|
;; Keeps negative overflow lanes as is.
|
||||||
|
(x64_pxor tmp dst)))
|
||||||
|
|
||||||
|
;; The algorithm for converting floats to unsigned ints is a little tricky. The
|
||||||
|
;; complication arises because we are converting from a signed 32-bit int with a positive
|
||||||
|
;; integer range from 1..INT_MAX (0x1..0x7FFFFFFF) to an unsigned integer with an extended
|
||||||
|
;; range from (INT_MAX+1)..UINT_MAX. It's this range from (INT_MAX+1)..UINT_MAX
|
||||||
|
;; (0x80000000..0xFFFFFFFF) that needs to be accounted for as a special case since our
|
||||||
|
;; conversion instruction (cvttps2dq) only converts as high as INT_MAX (0x7FFFFFFF), but
|
||||||
|
;; which conveniently sets underflows and overflows (smaller than MIN_INT or larger than
|
||||||
|
;; MAX_INT) to be INT_MAX+1 (0x80000000). Noting that the range (INT_MAX+1)..UINT_MAX includes
|
||||||
|
;; precisely INT_MAX values we can correctly account for and convert every value in this range
|
||||||
|
;; if we simply subtract INT_MAX+1 before doing the cvttps2dq conversion. After the subtraction
|
||||||
|
;; every value originally (INT_MAX+1)..UINT_MAX is now the range (0..INT_MAX).
|
||||||
|
;; After the conversion we add INT_MAX+1 back to this converted value, noting again that
|
||||||
|
;; values we are trying to account for were already set to INT_MAX+1 during the original conversion.
|
||||||
|
;; We simply have to create a mask and make sure we are adding together only the lanes that need
|
||||||
|
;; to be accounted for. Digesting it all the steps then are:
|
||||||
|
;;
|
||||||
|
;; Step 1 - Account for NaN and negative floats by setting these src values to zero.
|
||||||
|
;; Step 2 - Make a copy (tmp1) of the src value since we need to convert twice for
|
||||||
|
;; reasons described above.
|
||||||
|
;; Step 3 - Convert the original src values. This will convert properly all floats up to INT_MAX
|
||||||
|
;; Step 4 - Subtract INT_MAX from the copy set (tmp1). Note, all zero and negative values are those
|
||||||
|
;; values that were originally in the range (0..INT_MAX). This will come in handy during
|
||||||
|
;; step 7 when we zero negative lanes.
|
||||||
|
;; Step 5 - Create a bit mask for tmp1 that will correspond to all lanes originally less than
|
||||||
|
;; UINT_MAX that are now less than INT_MAX thanks to the subtraction.
|
||||||
|
;; Step 6 - Convert the second set of values (tmp1)
|
||||||
|
;; Step 7 - Prep the converted second set by zeroing out negative lanes (these have already been
|
||||||
|
;; converted correctly with the first set) and by setting overflow lanes to 0x7FFFFFFF
|
||||||
|
;; as this will allow us to properly saturate overflow lanes when adding to 0x80000000
|
||||||
|
;; Step 8 - Add the original converted src and the converted tmp1 where float values originally less
|
||||||
|
;; than and equal to INT_MAX will be unchanged, float values originally between INT_MAX+1 and
|
||||||
|
;; UINT_MAX will add together (INT_MAX) + (SRC - INT_MAX), and float values originally
|
||||||
|
;; greater than UINT_MAX will be saturated to UINT_MAX (0xFFFFFFFF) after adding (0x80000000 + 0x7FFFFFFF).
|
||||||
|
;;
|
||||||
|
;;
|
||||||
|
;; The table below illustrates the result after each step where it matters for the converted set.
|
||||||
|
;; Note the original value range (original src set) is the final dst in Step 8:
|
||||||
|
;;
|
||||||
|
;; Original src set:
|
||||||
|
;; | Original Value Range | Step 1 | Step 3 | Step 8 |
|
||||||
|
;; | -FLT_MIN..FLT_MAX | 0.0..FLT_MAX | 0..INT_MAX(w/overflow) | 0..UINT_MAX(w/saturation) |
|
||||||
|
;;
|
||||||
|
;; Copied src set (tmp1):
|
||||||
|
;; | Step 2 | Step 4 |
|
||||||
|
;; | 0.0..FLT_MAX | (0.0-(INT_MAX+1))..(FLT_MAX-(INT_MAX+1)) |
|
||||||
|
;;
|
||||||
|
;; | Step 6 | Step 7 |
|
||||||
|
;; | (0-(INT_MAX+1))..(UINT_MAX-(INT_MAX+1))(w/overflow) | ((INT_MAX+1)-(INT_MAX+1))..(INT_MAX+1) |
|
||||||
|
(rule (lower (has_type $I32X4 (fcvt_to_uint_sat val @ (value_type $F32X4))))
|
||||||
|
(let (;; Converting to unsigned int so if float src is negative or NaN
|
||||||
|
;; will first set to zero.
|
||||||
|
(tmp2 Xmm (x64_pxor val val)) ;; make a zero
|
||||||
|
(dst Xmm (x64_maxps val tmp2))
|
||||||
|
|
||||||
|
;; Set tmp2 to INT_MAX+1. It is important to note here that after it looks
|
||||||
|
;; like we are only converting INT_MAX (0x7FFFFFFF) but in fact because
|
||||||
|
;; single precision IEEE-754 floats can only accurately represent contiguous
|
||||||
|
;; integers up to 2^24 and outside of this range it rounds to the closest
|
||||||
|
;; integer that it can represent. In the case of INT_MAX, this value gets
|
||||||
|
;; represented as 0x4f000000 which is the integer value (INT_MAX+1).
|
||||||
|
(tmp2 Xmm (x64_pcmpeqd tmp2 tmp2))
|
||||||
|
(tmp2 Xmm (x64_psrld tmp2 (RegMemImm.Imm 1)))
|
||||||
|
(tmp2 Xmm (x64_cvtdq2ps tmp2))
|
||||||
|
|
||||||
|
;; Make a copy of these lanes and then do the first conversion.
|
||||||
|
;; Overflow lanes greater than the maximum allowed signed value will
|
||||||
|
;; set to 0x80000000. Negative and NaN lanes will be 0x0
|
||||||
|
(tmp1 Xmm dst)
|
||||||
|
(dst Xmm (x64_cvttps2dq $F32X4 dst))
|
||||||
|
|
||||||
|
;; Set lanes to src - max_signed_int
|
||||||
|
(tmp1 Xmm (x64_subps tmp1 tmp2))
|
||||||
|
|
||||||
|
;; Create mask for all positive lanes to saturate (i.e. greater than
|
||||||
|
;; or equal to the maximum allowable unsigned int).
|
||||||
|
(tmp2 Xmm (x64_cmpps tmp2 tmp1 (FcmpImm.LessThanOrEqual)))
|
||||||
|
|
||||||
|
;; Convert the set of lanes that have the max_signed_int factored out.
|
||||||
|
(tmp1 Xmm (x64_cvttps2dq $F32X4 tmp1))
|
||||||
|
|
||||||
|
;; Prepare converted lanes by zeroing negative lanes and prepping lanes
|
||||||
|
;; that have positive overflow (based on the mask) by setting these lanes
|
||||||
|
;; to 0x7FFFFFFF
|
||||||
|
(tmp1 Xmm (x64_pxor tmp1 tmp2))
|
||||||
|
(tmp2 Xmm (x64_pxor tmp2 tmp2)) ;; make another zero
|
||||||
|
(tmp1 Xmm (x64_pmaxsd tmp1 tmp2)))
|
||||||
|
|
||||||
|
;; Add this second set of converted lanes to the original to properly handle
|
||||||
|
;; values greater than max signed int.
|
||||||
|
(x64_paddd tmp1 dst)))
|
||||||
|
|||||||
@@ -557,232 +557,14 @@ fn lower_insn_to_regs(
|
|||||||
| Opcode::SelectifSpectreGuard
|
| Opcode::SelectifSpectreGuard
|
||||||
| Opcode::FcvtFromSint
|
| Opcode::FcvtFromSint
|
||||||
| Opcode::FcvtLowFromSint
|
| Opcode::FcvtLowFromSint
|
||||||
| Opcode::FcvtFromUint => {
|
| Opcode::FcvtFromUint
|
||||||
|
| Opcode::FcvtToUint
|
||||||
|
| Opcode::FcvtToSint
|
||||||
|
| Opcode::FcvtToUintSat
|
||||||
|
| Opcode::FcvtToSintSat => {
|
||||||
implemented_in_isle(ctx);
|
implemented_in_isle(ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
Opcode::FcvtToUint | Opcode::FcvtToUintSat | Opcode::FcvtToSint | Opcode::FcvtToSintSat => {
|
|
||||||
let src = put_input_in_reg(ctx, inputs[0]);
|
|
||||||
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
|
|
||||||
|
|
||||||
let input_ty = ctx.input_ty(insn, 0);
|
|
||||||
if !input_ty.is_vector() {
|
|
||||||
let src_size = if input_ty == types::F32 {
|
|
||||||
OperandSize::Size32
|
|
||||||
} else {
|
|
||||||
assert_eq!(input_ty, types::F64);
|
|
||||||
OperandSize::Size64
|
|
||||||
};
|
|
||||||
|
|
||||||
let output_ty = ty.unwrap();
|
|
||||||
let dst_size = if output_ty == types::I32 {
|
|
||||||
OperandSize::Size32
|
|
||||||
} else {
|
|
||||||
assert_eq!(output_ty, types::I64);
|
|
||||||
OperandSize::Size64
|
|
||||||
};
|
|
||||||
|
|
||||||
let to_signed = op == Opcode::FcvtToSint || op == Opcode::FcvtToSintSat;
|
|
||||||
let is_sat = op == Opcode::FcvtToUintSat || op == Opcode::FcvtToSintSat;
|
|
||||||
|
|
||||||
let src_copy = ctx.alloc_tmp(input_ty).only_reg().unwrap();
|
|
||||||
ctx.emit(Inst::gen_move(src_copy, src, input_ty));
|
|
||||||
|
|
||||||
let tmp_xmm = ctx.alloc_tmp(input_ty).only_reg().unwrap();
|
|
||||||
let tmp_gpr = ctx.alloc_tmp(output_ty).only_reg().unwrap();
|
|
||||||
|
|
||||||
if to_signed {
|
|
||||||
ctx.emit(Inst::cvt_float_to_sint_seq(
|
|
||||||
src_size, dst_size, is_sat, src_copy, dst, tmp_gpr, tmp_xmm,
|
|
||||||
));
|
|
||||||
} else {
|
|
||||||
ctx.emit(Inst::cvt_float_to_uint_seq(
|
|
||||||
src_size, dst_size, is_sat, src_copy, dst, tmp_gpr, tmp_xmm,
|
|
||||||
));
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if op == Opcode::FcvtToSintSat {
|
|
||||||
// Sets destination to zero if float is NaN
|
|
||||||
assert_eq!(types::F32X4, ctx.input_ty(insn, 0));
|
|
||||||
let tmp = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
|
|
||||||
ctx.emit(Inst::xmm_unary_rm_r(
|
|
||||||
SseOpcode::Movapd,
|
|
||||||
RegMem::reg(src),
|
|
||||||
tmp,
|
|
||||||
));
|
|
||||||
ctx.emit(Inst::gen_move(dst, src, input_ty));
|
|
||||||
let cond = FcmpImm::from(FloatCC::Equal);
|
|
||||||
ctx.emit(Inst::xmm_rm_r_imm(
|
|
||||||
SseOpcode::Cmpps,
|
|
||||||
RegMem::reg(tmp.to_reg()),
|
|
||||||
tmp,
|
|
||||||
cond.encode(),
|
|
||||||
OperandSize::Size32,
|
|
||||||
));
|
|
||||||
ctx.emit(Inst::xmm_rm_r(
|
|
||||||
SseOpcode::Andps,
|
|
||||||
RegMem::reg(tmp.to_reg()),
|
|
||||||
dst,
|
|
||||||
));
|
|
||||||
|
|
||||||
// Sets top bit of tmp if float is positive
|
|
||||||
// Setting up to set top bit on negative float values
|
|
||||||
ctx.emit(Inst::xmm_rm_r(
|
|
||||||
SseOpcode::Pxor,
|
|
||||||
RegMem::reg(dst.to_reg()),
|
|
||||||
tmp,
|
|
||||||
));
|
|
||||||
|
|
||||||
// Convert the packed float to packed doubleword.
|
|
||||||
ctx.emit(Inst::xmm_unary_rm_r(
|
|
||||||
SseOpcode::Cvttps2dq,
|
|
||||||
RegMem::reg(dst.to_reg()),
|
|
||||||
dst,
|
|
||||||
));
|
|
||||||
|
|
||||||
// Set top bit only if < 0
|
|
||||||
// Saturate lane with sign (top) bit.
|
|
||||||
ctx.emit(Inst::xmm_rm_r(
|
|
||||||
SseOpcode::Pand,
|
|
||||||
RegMem::reg(dst.to_reg()),
|
|
||||||
tmp,
|
|
||||||
));
|
|
||||||
ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrad, RegMemImm::imm(31), tmp));
|
|
||||||
|
|
||||||
// On overflow 0x80000000 is returned to a lane.
|
|
||||||
// Below sets positive overflow lanes to 0x7FFFFFFF
|
|
||||||
// Keeps negative overflow lanes as is.
|
|
||||||
ctx.emit(Inst::xmm_rm_r(
|
|
||||||
SseOpcode::Pxor,
|
|
||||||
RegMem::reg(tmp.to_reg()),
|
|
||||||
dst,
|
|
||||||
));
|
|
||||||
} else if op == Opcode::FcvtToUintSat {
|
|
||||||
// The algorithm for converting floats to unsigned ints is a little tricky. The
|
|
||||||
// complication arises because we are converting from a signed 32-bit int with a positive
|
|
||||||
// integer range from 1..INT_MAX (0x1..0x7FFFFFFF) to an unsigned integer with an extended
|
|
||||||
// range from (INT_MAX+1)..UINT_MAX. It's this range from (INT_MAX+1)..UINT_MAX
|
|
||||||
// (0x80000000..0xFFFFFFFF) that needs to be accounted for as a special case since our
|
|
||||||
// conversion instruction (cvttps2dq) only converts as high as INT_MAX (0x7FFFFFFF), but
|
|
||||||
// which conveniently sets underflows and overflows (smaller than MIN_INT or larger than
|
|
||||||
// MAX_INT) to be INT_MAX+1 (0x80000000). Noting that the range (INT_MAX+1)..UINT_MAX includes
|
|
||||||
// precisely INT_MAX values we can correctly account for and convert every value in this range
|
|
||||||
// if we simply subtract INT_MAX+1 before doing the cvttps2dq conversion. After the subtraction
|
|
||||||
// every value originally (INT_MAX+1)..UINT_MAX is now the range (0..INT_MAX).
|
|
||||||
// After the conversion we add INT_MAX+1 back to this converted value, noting again that
|
|
||||||
// values we are trying to account for were already set to INT_MAX+1 during the original conversion.
|
|
||||||
// We simply have to create a mask and make sure we are adding together only the lanes that need
|
|
||||||
// to be accounted for. Digesting it all the steps then are:
|
|
||||||
//
|
|
||||||
// Step 1 - Account for NaN and negative floats by setting these src values to zero.
|
|
||||||
// Step 2 - Make a copy (tmp1) of the src value since we need to convert twice for
|
|
||||||
// reasons described above.
|
|
||||||
// Step 3 - Convert the original src values. This will convert properly all floats up to INT_MAX
|
|
||||||
// Step 4 - Subtract INT_MAX from the copy set (tmp1). Note, all zero and negative values are those
|
|
||||||
// values that were originally in the range (0..INT_MAX). This will come in handy during
|
|
||||||
// step 7 when we zero negative lanes.
|
|
||||||
// Step 5 - Create a bit mask for tmp1 that will correspond to all lanes originally less than
|
|
||||||
// UINT_MAX that are now less than INT_MAX thanks to the subtraction.
|
|
||||||
// Step 6 - Convert the second set of values (tmp1)
|
|
||||||
// Step 7 - Prep the converted second set by zeroing out negative lanes (these have already been
|
|
||||||
// converted correctly with the first set) and by setting overflow lanes to 0x7FFFFFFF
|
|
||||||
// as this will allow us to properly saturate overflow lanes when adding to 0x80000000
|
|
||||||
// Step 8 - Add the original converted src and the converted tmp1 where float values originally less
|
|
||||||
// than and equal to INT_MAX will be unchanged, float values originally between INT_MAX+1 and
|
|
||||||
// UINT_MAX will add together (INT_MAX) + (SRC - INT_MAX), and float values originally
|
|
||||||
// greater than UINT_MAX will be saturated to UINT_MAX (0xFFFFFFFF) after adding (0x80000000 + 0x7FFFFFFF).
|
|
||||||
//
|
|
||||||
//
|
|
||||||
// The table below illustrates the result after each step where it matters for the converted set.
|
|
||||||
// Note the original value range (original src set) is the final dst in Step 8:
|
|
||||||
//
|
|
||||||
// Original src set:
|
|
||||||
// | Original Value Range | Step 1 | Step 3 | Step 8 |
|
|
||||||
// | -FLT_MIN..FLT_MAX | 0.0..FLT_MAX | 0..INT_MAX(w/overflow) | 0..UINT_MAX(w/saturation) |
|
|
||||||
//
|
|
||||||
// Copied src set (tmp1):
|
|
||||||
// | Step 2 | Step 4 |
|
|
||||||
// | 0.0..FLT_MAX | (0.0-(INT_MAX+1))..(FLT_MAX-(INT_MAX+1)) |
|
|
||||||
//
|
|
||||||
// | Step 6 | Step 7 |
|
|
||||||
// | (0-(INT_MAX+1))..(UINT_MAX-(INT_MAX+1))(w/overflow) | ((INT_MAX+1)-(INT_MAX+1))..(INT_MAX+1) |
|
|
||||||
|
|
||||||
// Create temporaries
|
|
||||||
assert_eq!(types::F32X4, ctx.input_ty(insn, 0));
|
|
||||||
let tmp1 = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
|
|
||||||
let tmp2 = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
|
|
||||||
|
|
||||||
// Converting to unsigned int so if float src is negative or NaN
|
|
||||||
// will first set to zero.
|
|
||||||
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp2), tmp2));
|
|
||||||
ctx.emit(Inst::gen_move(dst, src, input_ty));
|
|
||||||
ctx.emit(Inst::xmm_rm_r(SseOpcode::Maxps, RegMem::from(tmp2), dst));
|
|
||||||
|
|
||||||
// Set tmp2 to INT_MAX+1. It is important to note here that after it looks
|
|
||||||
// like we are only converting INT_MAX (0x7FFFFFFF) but in fact because
|
|
||||||
// single precision IEEE-754 floats can only accurately represent contiguous
|
|
||||||
// integers up to 2^24 and outside of this range it rounds to the closest
|
|
||||||
// integer that it can represent. In the case of INT_MAX, this value gets
|
|
||||||
// represented as 0x4f000000 which is the integer value (INT_MAX+1).
|
|
||||||
|
|
||||||
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pcmpeqd, RegMem::from(tmp2), tmp2));
|
|
||||||
ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrld, RegMemImm::imm(1), tmp2));
|
|
||||||
ctx.emit(Inst::xmm_unary_rm_r(
|
|
||||||
SseOpcode::Cvtdq2ps,
|
|
||||||
RegMem::from(tmp2),
|
|
||||||
tmp2,
|
|
||||||
));
|
|
||||||
|
|
||||||
// Make a copy of these lanes and then do the first conversion.
|
|
||||||
// Overflow lanes greater than the maximum allowed signed value will
|
|
||||||
// set to 0x80000000. Negative and NaN lanes will be 0x0
|
|
||||||
ctx.emit(Inst::xmm_mov(SseOpcode::Movaps, RegMem::from(dst), tmp1));
|
|
||||||
ctx.emit(Inst::xmm_unary_rm_r(
|
|
||||||
SseOpcode::Cvttps2dq,
|
|
||||||
RegMem::from(dst),
|
|
||||||
dst,
|
|
||||||
));
|
|
||||||
|
|
||||||
// Set lanes to src - max_signed_int
|
|
||||||
ctx.emit(Inst::xmm_rm_r(SseOpcode::Subps, RegMem::from(tmp2), tmp1));
|
|
||||||
|
|
||||||
// Create mask for all positive lanes to saturate (i.e. greater than
|
|
||||||
// or equal to the maximum allowable unsigned int).
|
|
||||||
let cond = FcmpImm::from(FloatCC::LessThanOrEqual);
|
|
||||||
ctx.emit(Inst::xmm_rm_r_imm(
|
|
||||||
SseOpcode::Cmpps,
|
|
||||||
RegMem::from(tmp1),
|
|
||||||
tmp2,
|
|
||||||
cond.encode(),
|
|
||||||
OperandSize::Size32,
|
|
||||||
));
|
|
||||||
|
|
||||||
// Convert the set of lanes that have the max_signed_int factored out.
|
|
||||||
ctx.emit(Inst::xmm_unary_rm_r(
|
|
||||||
SseOpcode::Cvttps2dq,
|
|
||||||
RegMem::from(tmp1),
|
|
||||||
tmp1,
|
|
||||||
));
|
|
||||||
|
|
||||||
// Prepare converted lanes by zeroing negative lanes and prepping lanes
|
|
||||||
// that have positive overflow (based on the mask) by setting these lanes
|
|
||||||
// to 0x7FFFFFFF
|
|
||||||
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp2), tmp1));
|
|
||||||
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp2), tmp2));
|
|
||||||
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmaxsd, RegMem::from(tmp2), tmp1));
|
|
||||||
|
|
||||||
// Add this second set of converted lanes to the original to properly handle
|
|
||||||
// values greater than max signed int.
|
|
||||||
ctx.emit(Inst::xmm_rm_r(SseOpcode::Paddd, RegMem::from(tmp1), dst));
|
|
||||||
} else {
|
|
||||||
// Since this branch is also guarded by a check for vector types
|
|
||||||
// neither Opcode::FcvtToUint nor Opcode::FcvtToSint can reach here
|
|
||||||
// due to vector variants not existing. The first two branches will
|
|
||||||
// cover all reachable cases.
|
|
||||||
unreachable!();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Opcode::IaddPairwise => {
|
Opcode::IaddPairwise => {
|
||||||
if let (Some(swiden_low), Some(swiden_high)) = (
|
if let (Some(swiden_low), Some(swiden_high)) = (
|
||||||
matches_input(ctx, inputs[0], Opcode::SwidenLow),
|
matches_input(ctx, inputs[0], Opcode::SwidenLow),
|
||||||
|
|||||||
@@ -200,3 +200,275 @@ block0(v0: i32x4):
|
|||||||
; popq %rbp
|
; popq %rbp
|
||||||
; ret
|
; ret
|
||||||
|
|
||||||
|
function %f13(f32) -> i32 {
|
||||||
|
block0(v0: f32):
|
||||||
|
v1 = fcvt_to_uint.i32 v0
|
||||||
|
return v1
|
||||||
|
}
|
||||||
|
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; cvt_float32_to_uint32_seq %xmm0, %eax, %r10, %xmm6
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
|
||||||
|
function %f14(f32) -> i64 {
|
||||||
|
block0(v0: f32):
|
||||||
|
v1 = fcvt_to_uint.i64 v0
|
||||||
|
return v1
|
||||||
|
}
|
||||||
|
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; cvt_float32_to_uint64_seq %xmm0, %rax, %r10, %xmm6
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
|
||||||
|
function %f15(f64) -> i32 {
|
||||||
|
block0(v0: f64):
|
||||||
|
v1 = fcvt_to_uint.i32 v0
|
||||||
|
return v1
|
||||||
|
}
|
||||||
|
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; cvt_float64_to_uint32_seq %xmm0, %eax, %r10, %xmm6
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
|
||||||
|
function %f16(f64) -> i64 {
|
||||||
|
block0(v0: f64):
|
||||||
|
v1 = fcvt_to_uint.i64 v0
|
||||||
|
return v1
|
||||||
|
}
|
||||||
|
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; cvt_float64_to_uint64_seq %xmm0, %rax, %r10, %xmm6
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
|
||||||
|
function %f17(f32) -> i32 {
|
||||||
|
block0(v0: f32):
|
||||||
|
v1 = fcvt_to_uint_sat.i32 v0
|
||||||
|
return v1
|
||||||
|
}
|
||||||
|
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; cvt_float32_to_uint32_sat_seq %xmm0, %eax, %r10, %xmm6
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
|
||||||
|
function %f18(f32) -> i64 {
|
||||||
|
block0(v0: f32):
|
||||||
|
v1 = fcvt_to_uint_sat.i64 v0
|
||||||
|
return v1
|
||||||
|
}
|
||||||
|
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; cvt_float32_to_uint64_sat_seq %xmm0, %rax, %r10, %xmm6
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
|
||||||
|
function %f19(f64) -> i32 {
|
||||||
|
block0(v0: f64):
|
||||||
|
v1 = fcvt_to_uint_sat.i32 v0
|
||||||
|
return v1
|
||||||
|
}
|
||||||
|
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; cvt_float64_to_uint32_sat_seq %xmm0, %eax, %r10, %xmm6
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
|
||||||
|
function %f20(f64) -> i64 {
|
||||||
|
block0(v0: f64):
|
||||||
|
v1 = fcvt_to_uint_sat.i64 v0
|
||||||
|
return v1
|
||||||
|
}
|
||||||
|
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; cvt_float64_to_uint64_sat_seq %xmm0, %rax, %r10, %xmm6
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
|
||||||
|
function %f21(f32) -> i32 {
|
||||||
|
block0(v0: f32):
|
||||||
|
v1 = fcvt_to_sint.i32 v0
|
||||||
|
return v1
|
||||||
|
}
|
||||||
|
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; cvt_float32_to_sint32_seq %xmm0, %eax, %r10, %xmm6
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
|
||||||
|
function %f22(f32) -> i64 {
|
||||||
|
block0(v0: f32):
|
||||||
|
v1 = fcvt_to_sint.i64 v0
|
||||||
|
return v1
|
||||||
|
}
|
||||||
|
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; cvt_float32_to_sint64_seq %xmm0, %rax, %r10, %xmm6
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
|
||||||
|
function %f23(f64) -> i32 {
|
||||||
|
block0(v0: f64):
|
||||||
|
v1 = fcvt_to_sint.i32 v0
|
||||||
|
return v1
|
||||||
|
}
|
||||||
|
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; cvt_float64_to_sint32_seq %xmm0, %eax, %r10, %xmm6
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
|
||||||
|
function %f24(f64) -> i64 {
|
||||||
|
block0(v0: f64):
|
||||||
|
v1 = fcvt_to_sint.i64 v0
|
||||||
|
return v1
|
||||||
|
}
|
||||||
|
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; cvt_float64_to_sint64_seq %xmm0, %rax, %r10, %xmm6
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
|
||||||
|
;; %f25: scalar `fcvt_to_sint_sat` taking an f32 argument and producing an i32 result.
;; The listing after the closing brace shows it lowering to `cvt_float32_to_sint32_sat_seq`.
function %f25(f32) -> i32 {
|
||||||
|
block0(v0: f32):
|
||||||
|
v1 = fcvt_to_sint_sat.i32 v0
|
||||||
|
return v1
|
||||||
|
}
|
||||||
|
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; cvt_float32_to_sint32_sat_seq %xmm0, %eax, %r10, %xmm6
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
|
||||||
|
;; %f26: scalar `fcvt_to_sint_sat` taking an f32 argument and producing an i64 result.
;; The listing after the closing brace shows it lowering to `cvt_float32_to_sint64_sat_seq`.
function %f26(f32) -> i64 {
|
||||||
|
block0(v0: f32):
|
||||||
|
v1 = fcvt_to_sint_sat.i64 v0
|
||||||
|
return v1
|
||||||
|
}
|
||||||
|
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; cvt_float32_to_sint64_sat_seq %xmm0, %rax, %r10, %xmm6
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
|
||||||
|
;; %f27: scalar `fcvt_to_sint_sat` taking an f64 argument and producing an i32 result.
;; The listing after the closing brace shows it lowering to `cvt_float64_to_sint32_sat_seq`.
function %f27(f64) -> i32 {
|
||||||
|
block0(v0: f64):
|
||||||
|
v1 = fcvt_to_sint_sat.i32 v0
|
||||||
|
return v1
|
||||||
|
}
|
||||||
|
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; cvt_float64_to_sint32_sat_seq %xmm0, %eax, %r10, %xmm6
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
|
||||||
|
;; %f28: scalar `fcvt_to_sint_sat` taking an f64 argument and producing an i64 result.
;; The listing after the closing brace shows it lowering to `cvt_float64_to_sint64_sat_seq`.
function %f28(f64) -> i64 {
|
||||||
|
block0(v0: f64):
|
||||||
|
v1 = fcvt_to_sint_sat.i64 v0
|
||||||
|
return v1
|
||||||
|
}
|
||||||
|
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; cvt_float64_to_sint64_sat_seq %xmm0, %rax, %r10, %xmm6
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
|
||||||
|
;; %f29: vector `fcvt_to_uint_sat` taking an f32x4 argument and producing an i32x4 result.
;; Unlike the scalar cases, the listing after the closing brace is an inline SSE sequence
;; (it opens with pxor/maxps, uses cvttps2dq twice, and ends with paddd), not a `*_seq` pseudo-instruction.
function %f29(f32x4) -> i32x4 {
|
||||||
|
block0(v0: f32x4):
|
||||||
|
v1 = fcvt_to_uint_sat.i32x4 v0
|
||||||
|
return v1
|
||||||
|
}
|
||||||
|
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; pxor %xmm3, %xmm3, %xmm3
|
||||||
|
; maxps %xmm0, %xmm3, %xmm0
|
||||||
|
; pcmpeqd %xmm8, %xmm8, %xmm8
|
||||||
|
; psrld %xmm8, $1, %xmm8
|
||||||
|
; cvtdq2ps %xmm8, %xmm14
|
||||||
|
; cvttps2dq %xmm0, %xmm13
|
||||||
|
; subps %xmm0, %xmm14, %xmm0
|
||||||
|
; cmpps $2, %xmm14, %xmm0, %xmm14
|
||||||
|
; cvttps2dq %xmm0, %xmm0
|
||||||
|
; pxor %xmm0, %xmm14, %xmm0
|
||||||
|
; pxor %xmm7, %xmm7, %xmm7
|
||||||
|
; pmaxsd %xmm0, %xmm7, %xmm0
|
||||||
|
; paddd %xmm0, %xmm13, %xmm0
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
|
||||||
|
;; %f30: vector `fcvt_to_sint_sat` taking an f32x4 argument and producing an i32x4 result.
;; The listing after the closing brace is an inline SSE sequence built from cmpps, andps,
;; a single cvttps2dq, pand, psrad and pxor, not a `*_seq` pseudo-instruction.
function %f30(f32x4) -> i32x4 {
|
||||||
|
block0(v0: f32x4):
|
||||||
|
v1 = fcvt_to_sint_sat.i32x4 v0
|
||||||
|
return v1
|
||||||
|
}
|
||||||
|
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; movdqa %xmm0, %xmm5
|
||||||
|
; cmpps $0, %xmm5, %xmm0, %xmm5
|
||||||
|
; andps %xmm0, %xmm5, %xmm0
|
||||||
|
; pxor %xmm5, %xmm0, %xmm5
|
||||||
|
; cvttps2dq %xmm0, %xmm9
|
||||||
|
; movdqa %xmm9, %xmm0
|
||||||
|
; pand %xmm0, %xmm5, %xmm0
|
||||||
|
; psrad %xmm0, $31, %xmm0
|
||||||
|
; pxor %xmm0, %xmm9, %xmm0
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
|
||||||
|
|||||||
@@ -28,6 +28,7 @@ block0(v0:f32x4):
|
|||||||
}
|
}
|
||||||
; run: %fcvt_to_sint_sat([0x0.0 -0x1.0 0x1.0 0x1.0p100]) == [0 -1 1 0x7FFFFFFF]
|
; run: %fcvt_to_sint_sat([0x0.0 -0x1.0 0x1.0 0x1.0p100]) == [0 -1 1 0x7FFFFFFF]
|
||||||
; run: %fcvt_to_sint_sat([-0x8.1 0x0.0 0x0.0 -0x1.0p100]) == [-8 0 0 0x80000000]
|
; run: %fcvt_to_sint_sat([-0x8.1 0x0.0 0x0.0 -0x1.0p100]) == [-8 0 0 0x80000000]
|
||||||
|
; run: %fcvt_to_sint_sat([+NaN +NaN +NaN +NaN]) == [0 0 0 0]
|
||||||
|
|
||||||
function %fcvt_to_uint_sat(f32x4) -> i32x4 {
|
function %fcvt_to_uint_sat(f32x4) -> i32x4 {
|
||||||
block0(v0:f32x4):
|
block0(v0:f32x4):
|
||||||
@@ -37,3 +38,4 @@ block0(v0:f32x4):
|
|||||||
; run: %fcvt_to_uint_sat([0x1.0 0x4.2 0x4.6 0x1.0p100]) == [1 4 4 0xFFFFFFFF]
|
; run: %fcvt_to_uint_sat([0x1.0 0x4.2 0x4.6 0x1.0p100]) == [1 4 4 0xFFFFFFFF]
|
||||||
; run: %fcvt_to_uint_sat([-0x8.1 -0x0.0 0x0.0 -0x1.0p100]) == [0 0 0 0]
|
; run: %fcvt_to_uint_sat([-0x8.1 -0x0.0 0x0.0 -0x1.0p100]) == [0 0 0 0]
|
||||||
; run: %fcvt_to_uint_sat([0xB2D05E00.0 0.0 0.0 0.0]) == [3000000000 0 0 0]
|
; run: %fcvt_to_uint_sat([0xB2D05E00.0 0.0 0.0 0.0]) == [3000000000 0 0 0]
|
||||||
|
; run: %fcvt_to_uint_sat([+NaN +NaN +NaN +NaN]) == [0 0 0 0]
|
||||||
|
|||||||
Reference in New Issue
Block a user