x64: Lower fcvt_from_uint in ISLE (#4684)
* Add a test for the existing behavior of fcvt_from_uint * Migrate the I8, I16, I32 cases of fcvt_from_uint * Implement the I64 case of fcvt_from_uint * Add a test for the existing behavior of fcvt_from_uint.f64x2 * Migrate fcvt_from_uint.f64x2 to ISLE * Lower the last case of `fcvt_from_uint` * Add a test for `fcvt_from_uint` * Finish lowering fcvt_from_uint * Format
This commit is contained in:
@@ -1662,6 +1662,10 @@
|
||||
(rule (x64_movdqu from)
|
||||
(xmm_unary_rm_r (SseOpcode.Movdqu) from))
|
||||
|
||||
(decl x64_movapd (XmmMem) Xmm)
|
||||
(rule (x64_movapd src)
|
||||
(xmm_unary_rm_r (SseOpcode.Movapd) src))
|
||||
|
||||
(decl x64_pmovsxbw (XmmMem) Xmm)
|
||||
(rule (x64_pmovsxbw from)
|
||||
(xmm_unary_rm_r (SseOpcode.Pmovsxbw) from))
|
||||
@@ -2276,6 +2280,11 @@
|
||||
(rule (x64_punpcklwd src1 src2)
|
||||
(xmm_rm_r $I16X8 (SseOpcode.Punpcklwd) src1 src2))
|
||||
|
||||
;; Helper for creating `unpcklps` instructions.
|
||||
(decl x64_unpcklps (Xmm XmmMem) Xmm)
|
||||
(rule (x64_unpcklps src1 src2)
|
||||
(xmm_rm_r $I16X8 (SseOpcode.Unpcklps) src1 src2))
|
||||
|
||||
;; Helper for creating `andnps` instructions.
|
||||
(decl x64_andnps (Xmm XmmMem) Xmm)
|
||||
(rule (x64_andnps src1 src2)
|
||||
@@ -2628,6 +2637,11 @@
|
||||
(_ Unit (emit (MInst.XmmUnaryRmREvex op src dst))))
|
||||
dst))
|
||||
|
||||
;; Helper for creating `vcvtudq2ps` instructions.
|
||||
(decl x64_vcvtudq2ps (XmmMem) Xmm)
|
||||
(rule (x64_vcvtudq2ps src)
|
||||
(xmm_unary_rm_r_evex (Avx512Opcode.Vcvtudq2ps) src))
|
||||
|
||||
;; Helper for creating `vpabsq` instructions.
|
||||
(decl x64_vpabsq (XmmMem) Xmm)
|
||||
(rule (x64_vpabsq src)
|
||||
@@ -3018,6 +3032,23 @@
|
||||
(_ Unit (emit (MInst.GprToXmm (SseOpcode.Cvtsi2sd) x dst size))))
|
||||
dst))
|
||||
|
||||
;; Helper for emitting the `CvtUint64ToFloatSeq` pseudo-instruction, which
;; converts an unsigned 64-bit integer to a scalar float. `ty` is the
;; destination float type (it selects the operand size of the sequence).
(decl cvt_u64_to_float_seq (Type Gpr) Xmm)
(rule (cvt_u64_to_float_seq ty src)
      (let ((size OperandSize (raw_operand_size_of_type ty))
            (src_copy WritableGpr (temp_writable_gpr))
            (dst WritableXmm (temp_writable_xmm))
            ;; Two integer scratch registers used internally by the sequence.
            (tmp_gpr1 WritableGpr (temp_writable_gpr))
            (tmp_gpr2 WritableGpr (temp_writable_gpr))
            ;; Copy `src` into a fresh temp first: the pseudo-instruction takes
            ;; its source as a *writable* gpr, so it may overwrite it.
            (_ Unit (emit (gen_move $I64 src_copy src)))
            (_ Unit (emit (MInst.CvtUint64ToFloatSeq size src_copy dst tmp_gpr1 tmp_gpr2))))
        dst))
|
||||
|
||||
;; Constant-pool masks used by the `fcvt_from_uint.f64x2` lowering below.
;; Both are defined as extern constructors on the Rust side (the `UINT_MASK`
;; and `UINT_MASK_HIGH` byte arrays).
(decl fcvt_uint_mask_const () VCodeConstant)
(extern constructor fcvt_uint_mask_const fcvt_uint_mask_const)

(decl fcvt_uint_mask_high_const () VCodeConstant)
(extern constructor fcvt_uint_mask_high_const fcvt_uint_mask_high_const)
|
||||
|
||||
;; Helpers for creating `pcmpeq*` instructions.
|
||||
(decl x64_pcmpeq (Type Xmm XmmMem) Xmm)
|
||||
(rule (x64_pcmpeq $I8X16 x y) (x64_pcmpeqb x y))
|
||||
|
||||
@@ -26,6 +26,16 @@ impl Inst {
|
||||
dst: WritableGpr::from_writable_reg(src).unwrap(),
|
||||
}
|
||||
}
|
||||
|
||||
    /// Build an EVEX-encoded (AVX-512) unary XMM instruction: `dst <- op(src)`.
    ///
    /// `src` may be a register or a memory operand; both operands must be in
    /// the float register class (checked by the assertions below).
    fn xmm_unary_rm_r_evex(op: Avx512Opcode, src: RegMem, dst: Writable<Reg>) -> Inst {
        src.assert_regclass_is(RegClass::Float);
        debug_assert!(dst.to_reg().class() == RegClass::Float);
        Inst::XmmUnaryRmREvex {
            op,
            src: XmmMem::new(src).unwrap(),
            dst: WritableXmm::from_writable_reg(dst).unwrap(),
        }
    }
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -308,16 +308,6 @@ impl Inst {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn xmm_unary_rm_r_evex(op: Avx512Opcode, src: RegMem, dst: Writable<Reg>) -> Inst {
|
||||
src.assert_regclass_is(RegClass::Float);
|
||||
debug_assert!(dst.to_reg().class() == RegClass::Float);
|
||||
Inst::XmmUnaryRmREvex {
|
||||
op,
|
||||
src: XmmMem::new(src).unwrap(),
|
||||
dst: WritableXmm::from_writable_reg(dst).unwrap(),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn xmm_rm_r(op: SseOpcode, src: RegMem, dst: Writable<Reg>) -> Self {
|
||||
src.assert_regclass_is(RegClass::Float);
|
||||
debug_assert!(dst.to_reg().class() == RegClass::Float);
|
||||
@@ -418,27 +408,6 @@ impl Inst {
|
||||
Inst::XmmCmpRmR { op, src, dst }
|
||||
}
|
||||
|
||||
    /// Build the multi-instruction sequence that converts an unsigned 64-bit
    /// integer in `src` into a scalar float in `dst`.
    ///
    /// `dst_size` selects the destination width (Size32 or Size64, asserted
    /// below). `src` is taken as a writable register because the sequence may
    /// overwrite it; `tmp_gpr1`/`tmp_gpr2` are integer scratch registers used
    /// internally by the sequence.
    pub(crate) fn cvt_u64_to_float_seq(
        dst_size: OperandSize,
        src: Writable<Reg>,
        tmp_gpr1: Writable<Reg>,
        tmp_gpr2: Writable<Reg>,
        dst: Writable<Reg>,
    ) -> Inst {
        debug_assert!(dst_size.is_one_of(&[OperandSize::Size32, OperandSize::Size64]));
        debug_assert!(src.to_reg().class() == RegClass::Int);
        debug_assert!(tmp_gpr1.to_reg().class() == RegClass::Int);
        debug_assert!(tmp_gpr2.to_reg().class() == RegClass::Int);
        debug_assert!(dst.to_reg().class() == RegClass::Float);
        Inst::CvtUint64ToFloatSeq {
            src: WritableGpr::from_writable_reg(src).unwrap(),
            dst: WritableXmm::from_writable_reg(dst).unwrap(),
            tmp_gpr1: WritableGpr::from_writable_reg(tmp_gpr1).unwrap(),
            tmp_gpr2: WritableGpr::from_writable_reg(tmp_gpr2).unwrap(),
            dst_size,
        }
    }
|
||||
|
||||
pub(crate) fn cvt_float_to_sint_seq(
|
||||
src_size: OperandSize,
|
||||
dst_size: OperandSize,
|
||||
|
||||
@@ -2985,3 +2985,76 @@
|
||||
|
||||
(rule (lower (fcvt_low_from_sint a @ (value_type ty)))
|
||||
(x64_cvtdq2pd ty a))
|
||||
|
||||
;; Rules for `fcvt_from_uint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Converting a u8/u16/u32 to f32 is easy: zero-extend to 64 bits, then use the
;; 64-bit *signed* conversion, which cannot overflow for these input widths.
(rule (lower (has_type $F32 (fcvt_from_uint val @ (value_type (fits_in_32 (ty_int ty))))))
      (x64_cvtsi2ss $I64 (extend_to_gpr val $I64 (ExtendKind.Zero))))

;; Same zero-extend-then-signed-convert trick for an f64 destination.
(rule (lower (has_type $F64 (fcvt_from_uint val @ (value_type (fits_in_32 (ty_int ty))))))
      (x64_cvtsi2sd $I64 (extend_to_gpr val $I64 (ExtendKind.Zero))))

;; A full u64 input has no non-overflowing signed equivalent, so it needs the
;; dedicated multi-instruction sequence.
(rule (lower (has_type ty (fcvt_from_uint val @ (value_type $I64))))
      (cvt_u64_to_float_seq ty val))
|
||||
|
||||
;; Algorithm uses unpcklps to help create a float that is equivalent
;; 0x1.0p52 + double(src). 0x1.0p52 is unique because at this exponent
;; every value of the mantissa represents a corresponding uint32 number.
;; When we subtract 0x1.0p52 we are left with double(src).
(rule (lower (has_type $F64X2 (fcvt_from_uint (uwiden_low val @ (value_type $I32X4)))))
      (let ((uint_mask Xmm (x64_xmm_load_const $I32X4 (fcvt_uint_mask_const)))
            ;; Interleave the low u32 lanes of `val` with the mask's lanes,
            ;; forming the bit pattern of `0x1.0p52 + double(src[i])` per lane.
            (res Xmm (x64_unpcklps val uint_mask))
            (uint_mask_high Xmm (x64_xmm_load_const $I32X4 (fcvt_uint_mask_high_const))))
        ;; Subtract 0x1.0p52 from each lane, leaving `double(src[i])`.
        (x64_subpd res uint_mask_high)))
|
||||
|
||||
;; When AVX512VL and AVX512F are available,
|
||||
;; `fcvt_from_uint` can be lowered to a single instruction.
|
||||
;;
|
||||
;; NOTE: the priority of 1 here is to break ties with the next case for $F32X4,
|
||||
;; as it doesn't require either of the avx512 extensions to be enabled.
|
||||
(rule 1 (lower (has_type (and (avx512vl_enabled) (avx512f_enabled) $F32X4)
|
||||
(fcvt_from_uint src)))
|
||||
(x64_vcvtudq2ps src))
|
||||
|
||||
;; Converting packed unsigned integers to packed floats
|
||||
;; requires a few steps. There is no single instruction
|
||||
;; lowering for converting unsigned floats but there is for
|
||||
;; converting packed signed integers to float (cvtdq2ps). In
|
||||
;; the steps below we isolate the upper half (16 bits) and
|
||||
;; lower half (16 bits) of each lane and then we convert
|
||||
;; each half separately using cvtdq2ps meant for signed
|
||||
;; integers. In order for this to work for the upper half
|
||||
;; bits we must shift right by 1 (divide by 2) these bits in
|
||||
;; order to ensure the most significant bit is 0 not signed,
|
||||
;; and then after the conversion we double the value.
|
||||
;; Finally we add the converted values where addition will
|
||||
;; correctly round.
|
||||
;;
|
||||
;; Sequence:
|
||||
;; -> A = 0xffffffff
|
||||
;; -> Ah = 0xffff0000
|
||||
;; -> Al = 0x0000ffff
|
||||
;; -> Convert(Al) // Convert int to float
|
||||
;; -> Ah = Ah >> 1 // Shift right 1 to assure Ah conversion isn't treated as signed
|
||||
;; -> Convert(Ah) // Convert .. with no loss of significant digits from previous shift
|
||||
;; -> Ah = Ah + Ah // Double Ah to account for shift right before the conversion.
|
||||
;; -> dst = Ah + Al // Add the two floats together
|
||||
(rule (lower (has_type $F32X4 (fcvt_from_uint a)))
|
||||
(let (;; get the low 16 bits
|
||||
(a_lo Xmm (x64_pslld a (RegMemImm.Imm 16)))
|
||||
(a_lo Xmm (x64_psrld a_lo (RegMemImm.Imm 16)))
|
||||
|
||||
;; get the high 16 bits
|
||||
(a_hi Xmm (x64_psubd a a_lo))
|
||||
|
||||
;; convert the low 16 bits
|
||||
(a_lo Xmm (x64_cvtdq2ps a_lo))
|
||||
|
||||
;; shift the high bits by 1, convert, and double to get the correct
|
||||
;; value
|
||||
(a_hi Xmm (x64_psrld a_hi (RegMemImm.Imm 1)))
|
||||
(a_hi Xmm (x64_cvtdq2ps a_hi))
|
||||
(a_hi Xmm (x64_addps a_hi a_hi)))
|
||||
|
||||
;; add together the two converted values
|
||||
(x64_addps a_hi a_lo)))
|
||||
|
||||
@@ -166,57 +166,6 @@ fn input_to_reg_mem<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput) -> RegM
|
||||
)
|
||||
}
|
||||
|
||||
/// An extension specification for `extend_input_to_reg`.
#[derive(Clone, Copy)]
enum ExtSpec {
    // Zero-extend the input up to 32 bits.
    #[allow(dead_code)]
    ZeroExtendTo32,
    // Zero-extend the input up to 64 bits.
    ZeroExtendTo64,
    // Sign-extend the input up to 32 bits.
    #[allow(dead_code)]
    SignExtendTo32,
    // Sign-extend the input up to 64 bits.
    #[allow(dead_code)] // not used just yet but may be used in the future!
    SignExtendTo64,
}
|
||||
|
||||
/// Put the given input into a register, marking it as used, and do a zero- or signed- extension if
/// required. (This obviously causes side-effects.)
fn extend_input_to_reg<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    spec: InsnInput,
    ext_spec: ExtSpec,
) -> Reg {
    // Width (in bits) the caller asked for.
    let requested_size = match ext_spec {
        ExtSpec::ZeroExtendTo32 | ExtSpec::SignExtendTo32 => 32,
        ExtSpec::ZeroExtendTo64 | ExtSpec::SignExtendTo64 => 64,
    };
    let input_size = ctx.input_ty(spec.insn, spec.input).bits();

    let requested_ty = if requested_size == 32 {
        types::I32
    } else {
        types::I64
    };

    let ext_mode = match (input_size, requested_size) {
        // Input is already the requested width: no extension instruction needed.
        (a, b) if a == b => return put_input_in_reg(ctx, spec),
        // 1-bit to 8-bit: the value already occupies a register, so no
        // explicit extension is emitted for this case either.
        (1, 8) => return put_input_in_reg(ctx, spec),
        (a, b) => ExtMode::new(a.try_into().unwrap(), b.try_into().unwrap())
            .unwrap_or_else(|| panic!("invalid extension: {} -> {}", a, b)),
    };

    // Note: `input_to_reg_mem` and `alloc_tmp` both have side-effects on
    // `ctx`, so their ordering relative to `emit` below matters.
    let src = input_to_reg_mem(ctx, spec);
    let dst = ctx.alloc_tmp(requested_ty).only_reg().unwrap();
    match ext_spec {
        ExtSpec::ZeroExtendTo32 | ExtSpec::ZeroExtendTo64 => {
            ctx.emit(Inst::movzx_rm_r(ext_mode, src, dst))
        }
        ExtSpec::SignExtendTo32 | ExtSpec::SignExtendTo64 => {
            ctx.emit(Inst::movsx_rm_r(ext_mode, src, dst))
        }
    }
    dst.to_reg()
}
|
||||
|
||||
fn input_to_imm<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput) -> Option<u64> {
|
||||
ctx.get_input_as_source_or_const(spec.insn, spec.input)
|
||||
.constant
|
||||
@@ -629,207 +578,11 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
||||
| Opcode::Selectif
|
||||
| Opcode::SelectifSpectreGuard
|
||||
| Opcode::FcvtFromSint
|
||||
| Opcode::FcvtLowFromSint => {
|
||||
| Opcode::FcvtLowFromSint
|
||||
| Opcode::FcvtFromUint => {
|
||||
implemented_in_isle(ctx);
|
||||
}
|
||||
|
||||
Opcode::FcvtFromUint => {
|
||||
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
|
||||
let ty = ty.unwrap();
|
||||
let input_ty = ctx.input_ty(insn, 0);
|
||||
let output_ty = ctx.output_ty(insn, 0);
|
||||
|
||||
if !ty.is_vector() {
|
||||
match input_ty {
|
||||
types::I8 | types::I16 | types::I32 => {
|
||||
// Conversion from an unsigned int smaller than 64-bit is easy: zero-extend +
|
||||
// do a signed conversion (which won't overflow).
|
||||
let opcode = if ty == types::F32 {
|
||||
SseOpcode::Cvtsi2ss
|
||||
} else {
|
||||
assert_eq!(ty, types::F64);
|
||||
SseOpcode::Cvtsi2sd
|
||||
};
|
||||
|
||||
let src = RegMem::reg(extend_input_to_reg(
|
||||
ctx,
|
||||
inputs[0],
|
||||
ExtSpec::ZeroExtendTo64,
|
||||
));
|
||||
ctx.emit(Inst::gpr_to_xmm(opcode, src, OperandSize::Size64, dst));
|
||||
}
|
||||
|
||||
types::I64 => {
|
||||
let src = put_input_in_reg(ctx, inputs[0]);
|
||||
|
||||
let src_copy = ctx.alloc_tmp(types::I64).only_reg().unwrap();
|
||||
ctx.emit(Inst::gen_move(src_copy, src, types::I64));
|
||||
|
||||
let tmp_gpr1 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
|
||||
let tmp_gpr2 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
|
||||
ctx.emit(Inst::cvt_u64_to_float_seq(
|
||||
if ty == types::F64 {
|
||||
OperandSize::Size64
|
||||
} else {
|
||||
OperandSize::Size32
|
||||
},
|
||||
src_copy,
|
||||
tmp_gpr1,
|
||||
tmp_gpr2,
|
||||
dst,
|
||||
));
|
||||
}
|
||||
_ => panic!("unexpected input type for FcvtFromUint: {:?}", input_ty),
|
||||
};
|
||||
} else if output_ty == types::F64X2 {
|
||||
if let Some(uwiden) = matches_input(ctx, inputs[0], Opcode::UwidenLow) {
|
||||
let uwiden_input = InsnInput {
|
||||
insn: uwiden,
|
||||
input: 0,
|
||||
};
|
||||
let src = put_input_in_reg(ctx, uwiden_input);
|
||||
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
|
||||
let input_ty = ctx.input_ty(uwiden, 0);
|
||||
|
||||
// Matches_input further obfuscates which Wasm instruction this is ultimately
|
||||
// lowering. Check here that the types are as expected for F64x2ConvertLowI32x4U.
|
||||
debug_assert!(input_ty == types::I32X4);
|
||||
|
||||
// Algorithm uses unpcklps to help create a float that is equivalent
|
||||
// 0x1.0p52 + double(src). 0x1.0p52 is unique because at this exponent
|
||||
// every value of the mantissa represents a corresponding uint32 number.
|
||||
// When we subtract 0x1.0p52 we are left with double(src).
|
||||
let uint_mask = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
|
||||
ctx.emit(Inst::gen_move(dst, src, types::I32X4));
|
||||
|
||||
static UINT_MASK: [u8; 16] = [
|
||||
0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00,
|
||||
];
|
||||
|
||||
let uint_mask_const =
|
||||
ctx.use_constant(VCodeConstantData::WellKnown(&UINT_MASK));
|
||||
|
||||
ctx.emit(Inst::xmm_load_const(
|
||||
uint_mask_const,
|
||||
uint_mask,
|
||||
types::I32X4,
|
||||
));
|
||||
|
||||
// Creates 0x1.0p52 + double(src)
|
||||
ctx.emit(Inst::xmm_rm_r(
|
||||
SseOpcode::Unpcklps,
|
||||
RegMem::from(uint_mask),
|
||||
dst,
|
||||
));
|
||||
|
||||
static UINT_MASK_HIGH: [u8; 16] = [
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x30, 0x43,
|
||||
];
|
||||
|
||||
let uint_mask_high_const =
|
||||
ctx.use_constant(VCodeConstantData::WellKnown(&UINT_MASK_HIGH));
|
||||
let uint_mask_high = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
|
||||
ctx.emit(Inst::xmm_load_const(
|
||||
uint_mask_high_const,
|
||||
uint_mask_high,
|
||||
types::I32X4,
|
||||
));
|
||||
|
||||
// 0x1.0p52 + double(src) - 0x1.0p52
|
||||
ctx.emit(Inst::xmm_rm_r(
|
||||
SseOpcode::Subpd,
|
||||
RegMem::from(uint_mask_high),
|
||||
dst,
|
||||
));
|
||||
} else {
|
||||
panic!("Unsupported FcvtFromUint conversion types: {}", ty);
|
||||
}
|
||||
} else {
|
||||
assert_eq!(ctx.input_ty(insn, 0), types::I32X4);
|
||||
let src = put_input_in_reg(ctx, inputs[0]);
|
||||
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
|
||||
|
||||
if isa_flags.use_avx512vl_simd() && isa_flags.use_avx512f_simd() {
|
||||
// When AVX512VL and AVX512F are available,
|
||||
// `fcvt_from_uint` can be lowered to a single instruction.
|
||||
ctx.emit(Inst::xmm_unary_rm_r_evex(
|
||||
Avx512Opcode::Vcvtudq2ps,
|
||||
RegMem::reg(src),
|
||||
dst,
|
||||
));
|
||||
} else {
|
||||
// Converting packed unsigned integers to packed floats
|
||||
// requires a few steps. There is no single instruction
|
||||
// lowering for converting unsigned floats but there is for
|
||||
// converting packed signed integers to float (cvtdq2ps). In
|
||||
// the steps below we isolate the upper half (16 bits) and
|
||||
// lower half (16 bits) of each lane and then we convert
|
||||
// each half separately using cvtdq2ps meant for signed
|
||||
// integers. In order for this to work for the upper half
|
||||
// bits we must shift right by 1 (divide by 2) these bits in
|
||||
// order to ensure the most significant bit is 0 not signed,
|
||||
// and then after the conversion we double the value.
|
||||
// Finally we add the converted values where addition will
|
||||
// correctly round.
|
||||
//
|
||||
// Sequence:
|
||||
// -> A = 0xffffffff
|
||||
// -> Ah = 0xffff0000
|
||||
// -> Al = 0x0000ffff
|
||||
// -> Convert(Al) // Convert int to float
|
||||
// -> Ah = Ah >> 1 // Shift right 1 to assure Ah conversion isn't treated as signed
|
||||
// -> Convert(Ah) // Convert .. with no loss of significant digits from previous shift
|
||||
// -> Ah = Ah + Ah // Double Ah to account for shift right before the conversion.
|
||||
// -> dst = Ah + Al // Add the two floats together
|
||||
|
||||
// Create a temporary register
|
||||
let tmp = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
|
||||
ctx.emit(Inst::xmm_unary_rm_r(
|
||||
SseOpcode::Movapd,
|
||||
RegMem::reg(src),
|
||||
tmp,
|
||||
));
|
||||
ctx.emit(Inst::gen_move(dst, src, ty));
|
||||
|
||||
// Get the low 16 bits
|
||||
ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Pslld, RegMemImm::imm(16), tmp));
|
||||
ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrld, RegMemImm::imm(16), tmp));
|
||||
|
||||
// Get the high 16 bits
|
||||
ctx.emit(Inst::xmm_rm_r(SseOpcode::Psubd, RegMem::from(tmp), dst));
|
||||
|
||||
// Convert the low 16 bits
|
||||
ctx.emit(Inst::xmm_unary_rm_r(
|
||||
SseOpcode::Cvtdq2ps,
|
||||
RegMem::from(tmp),
|
||||
tmp,
|
||||
));
|
||||
|
||||
// Shift the high bits by 1, convert, and double to get the correct value.
|
||||
ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrld, RegMemImm::imm(1), dst));
|
||||
ctx.emit(Inst::xmm_unary_rm_r(
|
||||
SseOpcode::Cvtdq2ps,
|
||||
RegMem::from(dst),
|
||||
dst,
|
||||
));
|
||||
ctx.emit(Inst::xmm_rm_r(
|
||||
SseOpcode::Addps,
|
||||
RegMem::reg(dst.to_reg()),
|
||||
dst,
|
||||
));
|
||||
|
||||
// Add together the two converted values.
|
||||
ctx.emit(Inst::xmm_rm_r(
|
||||
SseOpcode::Addps,
|
||||
RegMem::reg(tmp.to_reg()),
|
||||
dst,
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Opcode::FcvtToUint | Opcode::FcvtToUintSat | Opcode::FcvtToSint | Opcode::FcvtToSintSat => {
|
||||
let src = put_input_in_reg(ctx, inputs[0]);
|
||||
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
|
||||
|
||||
@@ -770,6 +770,18 @@ where
|
||||
fn jump_table_size(&mut self, targets: &BoxVecMachLabel) -> u32 {
|
||||
targets.len() as u32
|
||||
}
|
||||
|
||||
    #[inline]
    fn fcvt_uint_mask_const(&mut self) -> VCodeConstant {
        // Register the 16-byte `UINT_MASK` pattern in the constant pool; this
        // backs the ISLE `fcvt_uint_mask_const` extern constructor.
        self.lower_ctx
            .use_constant(VCodeConstantData::WellKnown(&UINT_MASK))
    }
|
||||
|
||||
    #[inline]
    fn fcvt_uint_mask_high_const(&mut self) -> VCodeConstant {
        // Register the 16-byte `UINT_MASK_HIGH` pattern in the constant pool;
        // this backs the ISLE `fcvt_uint_mask_high_const` extern constructor.
        self.lower_ctx
            .use_constant(VCodeConstantData::WellKnown(&UINT_MASK_HIGH))
    }
|
||||
}
|
||||
|
||||
impl<C> IsleContext<'_, C, Flags, IsaFlags, 6>
|
||||
@@ -891,3 +903,11 @@ fn to_simm32(constant: i64) -> Option<GprMemImm> {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
// Mask for the `fcvt_from_uint.f64x2` lowering: the first two 32-bit lanes
// hold 0x43300000 (little-endian bytes), the high word of the f64 constant
// 0x1.0p52. Interleaved (via unpcklps) with a u32 lane, each resulting f64
// is `0x1.0p52 + double(src)`.
const UINT_MASK: [u8; 16] = [
    0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
];
|
||||
|
||||
// The f64x2 constant [0x1.0p52, 0x1.0p52] as little-endian bytes; subtracted
// after the unpcklps step to recover `double(src)` from `0x1.0p52 + double(src)`.
const UINT_MASK_HIGH: [u8; 16] = [
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43,
];
|
||||
|
||||
Reference in New Issue
Block a user