x64: Lower fcvt_from_uint in ISLE (#4684)

* Add a test for the existing behavior of fcvt_from_uint

* Migrate the I8, I16, I32 cases of fcvt_from_uint

* Implement the I64 case of fcvt_from_uint

* Add a test for the existing behavior of fcvt_from_uint.f64x2

* Migrate fcvt_from_uint.f64x2 to ISLE

* Lower the last case of `fcvt_from_uint`

* Add a test for `fcvt_from_uint`

* Finish lowering fcvt_from_uint

* Format
Trevor Elliott
2022-08-11 12:28:41 -07:00
committed by GitHub
parent c4fd6a95da
commit 0c2e0494bd
8 changed files with 223 additions and 280 deletions

View File

@@ -1662,6 +1662,10 @@
(rule (x64_movdqu from)
(xmm_unary_rm_r (SseOpcode.Movdqu) from))
(decl x64_movapd (XmmMem) Xmm)
(rule (x64_movapd src)
(xmm_unary_rm_r (SseOpcode.Movapd) src))
(decl x64_pmovsxbw (XmmMem) Xmm)
(rule (x64_pmovsxbw from)
(xmm_unary_rm_r (SseOpcode.Pmovsxbw) from))
@@ -2276,6 +2280,11 @@
(rule (x64_punpcklwd src1 src2)
(xmm_rm_r $I16X8 (SseOpcode.Punpcklwd) src1 src2))
;; Helper for creating `unpcklps` instructions.
(decl x64_unpcklps (Xmm XmmMem) Xmm)
(rule (x64_unpcklps src1 src2)
(xmm_rm_r $I16X8 (SseOpcode.Unpcklps) src1 src2))
;; Helper for creating `andnps` instructions.
(decl x64_andnps (Xmm XmmMem) Xmm)
(rule (x64_andnps src1 src2)
@@ -2628,6 +2637,11 @@
(_ Unit (emit (MInst.XmmUnaryRmREvex op src dst))))
dst))
;; Helper for creating `vcvtudq2ps` instructions.
(decl x64_vcvtudq2ps (XmmMem) Xmm)
(rule (x64_vcvtudq2ps src)
(xmm_unary_rm_r_evex (Avx512Opcode.Vcvtudq2ps) src))
;; Helper for creating `vpabsq` instructions.
(decl x64_vpabsq (XmmMem) Xmm)
(rule (x64_vpabsq src)
@@ -3018,6 +3032,23 @@
(_ Unit (emit (MInst.GprToXmm (SseOpcode.Cvtsi2sd) x dst size))))
dst))
(decl cvt_u64_to_float_seq (Type Gpr) Xmm)
(rule (cvt_u64_to_float_seq ty src)
(let ((size OperandSize (raw_operand_size_of_type ty))
(src_copy WritableGpr (temp_writable_gpr))
(dst WritableXmm (temp_writable_xmm))
(tmp_gpr1 WritableGpr (temp_writable_gpr))
(tmp_gpr2 WritableGpr (temp_writable_gpr))
(_ Unit (emit (gen_move $I64 src_copy src)))
(_ Unit (emit (MInst.CvtUint64ToFloatSeq size src_copy dst tmp_gpr1 tmp_gpr2))))
dst))
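
For intuition, here is a minimal Rust sketch of the scalar strategy this pseudo-instruction is assumed to expand to (the actual emitted sequence lives in the emit code; the helper name below is hypothetical):

fn u64_to_f64(x: u64) -> f64 {
    if (x as i64) >= 0 {
        // High bit clear: a plain signed conversion is exact.
        x as i64 as f64
    } else {
        // High bit set: halve with a sticky low bit so the final
        // rounding still accounts for the lost bit, convert as
        // signed, then double the result.
        let halved = (x >> 1) | (x & 1);
        let f = halved as i64 as f64;
        f + f
    }
}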
(decl fcvt_uint_mask_const () VCodeConstant)
(extern constructor fcvt_uint_mask_const fcvt_uint_mask_const)
(decl fcvt_uint_mask_high_const () VCodeConstant)
(extern constructor fcvt_uint_mask_high_const fcvt_uint_mask_high_const)
;; Helpers for creating `pcmpeq*` instructions.
(decl x64_pcmpeq (Type Xmm XmmMem) Xmm)
(rule (x64_pcmpeq $I8X16 x y) (x64_pcmpeqb x y))

View File

@@ -26,6 +26,16 @@ impl Inst {
dst: WritableGpr::from_writable_reg(src).unwrap(),
}
}
fn xmm_unary_rm_r_evex(op: Avx512Opcode, src: RegMem, dst: Writable<Reg>) -> Inst {
src.assert_regclass_is(RegClass::Float);
debug_assert!(dst.to_reg().class() == RegClass::Float);
Inst::XmmUnaryRmREvex {
op,
src: XmmMem::new(src).unwrap(),
dst: WritableXmm::from_writable_reg(dst).unwrap(),
}
}
}
#[test]

View File

@@ -308,16 +308,6 @@ impl Inst {
}
}
pub(crate) fn xmm_unary_rm_r_evex(op: Avx512Opcode, src: RegMem, dst: Writable<Reg>) -> Inst {
src.assert_regclass_is(RegClass::Float);
debug_assert!(dst.to_reg().class() == RegClass::Float);
Inst::XmmUnaryRmREvex {
op,
src: XmmMem::new(src).unwrap(),
dst: WritableXmm::from_writable_reg(dst).unwrap(),
}
}
pub(crate) fn xmm_rm_r(op: SseOpcode, src: RegMem, dst: Writable<Reg>) -> Self {
src.assert_regclass_is(RegClass::Float);
debug_assert!(dst.to_reg().class() == RegClass::Float);
@@ -418,27 +408,6 @@ impl Inst {
Inst::XmmCmpRmR { op, src, dst }
}
pub(crate) fn cvt_u64_to_float_seq(
dst_size: OperandSize,
src: Writable<Reg>,
tmp_gpr1: Writable<Reg>,
tmp_gpr2: Writable<Reg>,
dst: Writable<Reg>,
) -> Inst {
debug_assert!(dst_size.is_one_of(&[OperandSize::Size32, OperandSize::Size64]));
debug_assert!(src.to_reg().class() == RegClass::Int);
debug_assert!(tmp_gpr1.to_reg().class() == RegClass::Int);
debug_assert!(tmp_gpr2.to_reg().class() == RegClass::Int);
debug_assert!(dst.to_reg().class() == RegClass::Float);
Inst::CvtUint64ToFloatSeq {
src: WritableGpr::from_writable_reg(src).unwrap(),
dst: WritableXmm::from_writable_reg(dst).unwrap(),
tmp_gpr1: WritableGpr::from_writable_reg(tmp_gpr1).unwrap(),
tmp_gpr2: WritableGpr::from_writable_reg(tmp_gpr2).unwrap(),
dst_size,
}
}
pub(crate) fn cvt_float_to_sint_seq(
src_size: OperandSize,
dst_size: OperandSize,

View File

@@ -2985,3 +2985,76 @@
(rule (lower (fcvt_low_from_sint a @ (value_type ty)))
(x64_cvtdq2pd ty a))
;; Rules for `fcvt_from_uint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type $F32 (fcvt_from_uint val @ (value_type (fits_in_32 (ty_int ty))))))
(x64_cvtsi2ss $I64 (extend_to_gpr val $I64 (ExtendKind.Zero))))
(rule (lower (has_type $F64 (fcvt_from_uint val @ (value_type (fits_in_32 (ty_int ty))))))
(x64_cvtsi2sd $I64 (extend_to_gpr val $I64 (ExtendKind.Zero))))
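These two rules exploit the fact that a zero-extended 8/16/32-bit value is always non-negative as a 64-bit signed integer, so the signed conversion cannot overflow. A scalar Rust sketch of the same idea (the helper name is illustrative, not part of this codebase):

fn u32_to_f32_via_signed(x: u32) -> f32 {
    let wide = x as u64 as i64; // zero-extend; always non-negative
    wide as f32                 // signed conversion, as cvtsi2ss does with a 64-bit source
}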
(rule (lower (has_type ty (fcvt_from_uint val @ (value_type $I64))))
(cvt_u64_to_float_seq ty val))
;; The algorithm uses unpcklps to help create a double that is equivalent
;; to 0x1.0p52 + double(src). 0x1.0p52 works because at this exponent
;; every value of the mantissa corresponds to a distinct uint32 value;
;; subtracting 0x1.0p52 then leaves exactly double(src).
(rule (lower (has_type $F64X2 (fcvt_from_uint (uwiden_low val @ (value_type $I32X4)))))
(let ((uint_mask Xmm (x64_xmm_load_const $I32X4 (fcvt_uint_mask_const)))
(res Xmm (x64_unpcklps val uint_mask))
(uint_mask_high Xmm (x64_xmm_load_const $I32X4 (fcvt_uint_mask_high_const))))
(x64_subpd res uint_mask_high)))
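A scalar Rust sketch of the bit trick the comment above describes, assuming the usual little-endian f64 bit layout (the helper name is hypothetical):

fn u32_to_f64_trick(x: u32) -> f64 {
    // Place x in the low mantissa bits of 0x1.0p52; the resulting bit
    // pattern decodes to exactly 2^52 + x.
    let bits = (0x4330_0000u64 << 32) | x as u64;
    f64::from_bits(bits) - 4_503_599_627_370_496.0 // subtract 2^52
}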
;; When AVX512VL and AVX512F are available,
;; `fcvt_from_uint` can be lowered to a single instruction.
;;
;; NOTE: the priority of 1 here breaks the tie with the next $F32X4 case,
;; which doesn't require either of the AVX-512 extensions to be enabled.
(rule 1 (lower (has_type (and (avx512vl_enabled) (avx512f_enabled) $F32X4)
(fcvt_from_uint src)))
(x64_vcvtudq2ps src))
;; Converting packed unsigned integers to packed floats
;; requires a few steps. There is no single-instruction
;; lowering for converting unsigned integers, but there is
;; one for converting packed signed integers to floats
;; (cvtdq2ps). In the steps below we isolate the upper half
;; (16 bits) and lower half (16 bits) of each lane and then
;; convert each half separately using cvtdq2ps, which is
;; meant for signed integers. For this to work on the upper
;; half we must first shift those bits right by 1 (divide by
;; 2) so the most significant bit is 0 and the value is not
;; treated as signed, and then double the result after the
;; conversion. Finally we add the two converted halves, and
;; that addition performs the correct rounding.
;;
;; Sequence:
;; -> A = 0xffffffff
;; -> Ah = 0xffff0000
;; -> Al = 0x0000ffff
;; -> Convert(Al) // Convert int to float
;; -> Ah = Ah >> 1 // Shift right 1 to ensure Ah's conversion isn't treated as signed
;; -> Convert(Ah) // Convert, with no loss of significant digits from the previous shift
;; -> Ah = Ah + Ah // Double Ah to account for the shift right before the conversion
;; -> dst = Ah + Al // Add the two floats together
(rule (lower (has_type $F32X4 (fcvt_from_uint a)))
(let (;; get the low 16 bits
(a_lo Xmm (x64_pslld a (RegMemImm.Imm 16)))
(a_lo Xmm (x64_psrld a_lo (RegMemImm.Imm 16)))
;; get the high 16 bits
(a_hi Xmm (x64_psubd a a_lo))
;; convert the low 16 bits
(a_lo Xmm (x64_cvtdq2ps a_lo))
;; shift the high bits by 1, convert, and double to get the correct
;; value
(a_hi Xmm (x64_psrld a_hi (RegMemImm.Imm 1)))
(a_hi Xmm (x64_cvtdq2ps a_hi))
(a_hi Xmm (x64_addps a_hi a_hi)))
;; add together the two converted values
(x64_addps a_hi a_lo)))
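
Per lane, the sequence above behaves like this Rust sketch (assuming round-to-nearest-even, which matches cvtdq2ps; the helper name is illustrative):

fn u32_to_f32_lane(a: u32) -> f32 {
    let a_lo = a & 0xffff;       // pslld 16 / psrld 16: keep the low 16 bits
    let a_hi = a - a_lo;         // psubd: the high 16 bits
    let lo = a_lo as i32 as f32; // exact: fits in the 24-bit mantissa
    // Shift right by 1 so the sign bit is clear (no bits are lost since
    // a_hi's low 16 bits are zero), convert, then double.
    let hi = ((a_hi >> 1) as i32 as f32) * 2.0;
    hi + lo                      // the single rounding happens here
}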

View File

@@ -166,57 +166,6 @@ fn input_to_reg_mem<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput) -> RegM
)
}
/// An extension specification for `extend_input_to_reg`.
#[derive(Clone, Copy)]
enum ExtSpec {
#[allow(dead_code)]
ZeroExtendTo32,
ZeroExtendTo64,
#[allow(dead_code)]
SignExtendTo32,
#[allow(dead_code)] // not used just yet but may be used in the future!
SignExtendTo64,
}
/// Put the given input into a register, marking it as used, and zero- or sign-extend it if
/// required. (This obviously causes side-effects.)
fn extend_input_to_reg<C: LowerCtx<I = Inst>>(
ctx: &mut C,
spec: InsnInput,
ext_spec: ExtSpec,
) -> Reg {
let requested_size = match ext_spec {
ExtSpec::ZeroExtendTo32 | ExtSpec::SignExtendTo32 => 32,
ExtSpec::ZeroExtendTo64 | ExtSpec::SignExtendTo64 => 64,
};
let input_size = ctx.input_ty(spec.insn, spec.input).bits();
let requested_ty = if requested_size == 32 {
types::I32
} else {
types::I64
};
let ext_mode = match (input_size, requested_size) {
(a, b) if a == b => return put_input_in_reg(ctx, spec),
(1, 8) => return put_input_in_reg(ctx, spec),
(a, b) => ExtMode::new(a.try_into().unwrap(), b.try_into().unwrap())
.unwrap_or_else(|| panic!("invalid extension: {} -> {}", a, b)),
};
let src = input_to_reg_mem(ctx, spec);
let dst = ctx.alloc_tmp(requested_ty).only_reg().unwrap();
match ext_spec {
ExtSpec::ZeroExtendTo32 | ExtSpec::ZeroExtendTo64 => {
ctx.emit(Inst::movzx_rm_r(ext_mode, src, dst))
}
ExtSpec::SignExtendTo32 | ExtSpec::SignExtendTo64 => {
ctx.emit(Inst::movsx_rm_r(ext_mode, src, dst))
}
}
dst.to_reg()
}
fn input_to_imm<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput) -> Option<u64> {
ctx.get_input_as_source_or_const(spec.insn, spec.input)
.constant
@@ -629,207 +578,11 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
| Opcode::Selectif
| Opcode::SelectifSpectreGuard
| Opcode::FcvtFromSint
| Opcode::FcvtLowFromSint => {
| Opcode::FcvtLowFromSint
| Opcode::FcvtFromUint => {
implemented_in_isle(ctx);
}
Opcode::FcvtFromUint => {
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let ty = ty.unwrap();
let input_ty = ctx.input_ty(insn, 0);
let output_ty = ctx.output_ty(insn, 0);
if !ty.is_vector() {
match input_ty {
types::I8 | types::I16 | types::I32 => {
// Conversion from an unsigned int smaller than 64-bit is easy: zero-extend +
// do a signed conversion (which won't overflow).
let opcode = if ty == types::F32 {
SseOpcode::Cvtsi2ss
} else {
assert_eq!(ty, types::F64);
SseOpcode::Cvtsi2sd
};
let src = RegMem::reg(extend_input_to_reg(
ctx,
inputs[0],
ExtSpec::ZeroExtendTo64,
));
ctx.emit(Inst::gpr_to_xmm(opcode, src, OperandSize::Size64, dst));
}
types::I64 => {
let src = put_input_in_reg(ctx, inputs[0]);
let src_copy = ctx.alloc_tmp(types::I64).only_reg().unwrap();
ctx.emit(Inst::gen_move(src_copy, src, types::I64));
let tmp_gpr1 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
let tmp_gpr2 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
ctx.emit(Inst::cvt_u64_to_float_seq(
if ty == types::F64 {
OperandSize::Size64
} else {
OperandSize::Size32
},
src_copy,
tmp_gpr1,
tmp_gpr2,
dst,
));
}
_ => panic!("unexpected input type for FcvtFromUint: {:?}", input_ty),
};
} else if output_ty == types::F64X2 {
if let Some(uwiden) = matches_input(ctx, inputs[0], Opcode::UwidenLow) {
let uwiden_input = InsnInput {
insn: uwiden,
input: 0,
};
let src = put_input_in_reg(ctx, uwiden_input);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let input_ty = ctx.input_ty(uwiden, 0);
// `matches_input` further obscures which Wasm instruction this is ultimately
// lowering. Check here that the types are as expected for F64x2ConvertLowI32x4U.
debug_assert!(input_ty == types::I32X4);
// The algorithm uses unpcklps to help create a double that is equivalent
// to 0x1.0p52 + double(src). 0x1.0p52 works because at this exponent
// every value of the mantissa corresponds to a distinct uint32 value;
// subtracting 0x1.0p52 then leaves exactly double(src).
let uint_mask = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
ctx.emit(Inst::gen_move(dst, src, types::I32X4));
static UINT_MASK: [u8; 16] = [
0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00,
];
let uint_mask_const =
ctx.use_constant(VCodeConstantData::WellKnown(&UINT_MASK));
ctx.emit(Inst::xmm_load_const(
uint_mask_const,
uint_mask,
types::I32X4,
));
// Creates 0x1.0p52 + double(src)
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Unpcklps,
RegMem::from(uint_mask),
dst,
));
static UINT_MASK_HIGH: [u8; 16] = [
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x30, 0x43,
];
let uint_mask_high_const =
ctx.use_constant(VCodeConstantData::WellKnown(&UINT_MASK_HIGH));
let uint_mask_high = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
ctx.emit(Inst::xmm_load_const(
uint_mask_high_const,
uint_mask_high,
types::I32X4,
));
// 0x1.0p52 + double(src) - 0x1.0p52
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Subpd,
RegMem::from(uint_mask_high),
dst,
));
} else {
panic!("Unsupported FcvtFromUint conversion types: {}", ty);
}
} else {
assert_eq!(ctx.input_ty(insn, 0), types::I32X4);
let src = put_input_in_reg(ctx, inputs[0]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
if isa_flags.use_avx512vl_simd() && isa_flags.use_avx512f_simd() {
// When AVX512VL and AVX512F are available,
// `fcvt_from_uint` can be lowered to a single instruction.
ctx.emit(Inst::xmm_unary_rm_r_evex(
Avx512Opcode::Vcvtudq2ps,
RegMem::reg(src),
dst,
));
} else {
// Converting packed unsigned integers to packed floats
// requires a few steps. There is no single-instruction
// lowering for converting unsigned integers, but there is
// one for converting packed signed integers to floats
// (cvtdq2ps). In the steps below we isolate the upper half
// (16 bits) and lower half (16 bits) of each lane and then
// convert each half separately using cvtdq2ps, which is
// meant for signed integers. For this to work on the upper
// half we must first shift those bits right by 1 (divide by
// 2) so the most significant bit is 0 and the value is not
// treated as signed, and then double the result after the
// conversion. Finally we add the two converted halves, and
// that addition performs the correct rounding.
//
// Sequence:
// -> A = 0xffffffff
// -> Ah = 0xffff0000
// -> Al = 0x0000ffff
// -> Convert(Al) // Convert int to float
// -> Ah = Ah >> 1 // Shift right 1 to ensure Ah's conversion isn't treated as signed
// -> Convert(Ah) // Convert, with no loss of significant digits from the previous shift
// -> Ah = Ah + Ah // Double Ah to account for the shift right before the conversion
// -> dst = Ah + Al // Add the two floats together
// Create a temporary register
let tmp = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
ctx.emit(Inst::xmm_unary_rm_r(
SseOpcode::Movapd,
RegMem::reg(src),
tmp,
));
ctx.emit(Inst::gen_move(dst, src, ty));
// Get the low 16 bits
ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Pslld, RegMemImm::imm(16), tmp));
ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrld, RegMemImm::imm(16), tmp));
// Get the high 16 bits
ctx.emit(Inst::xmm_rm_r(SseOpcode::Psubd, RegMem::from(tmp), dst));
// Convert the low 16 bits
ctx.emit(Inst::xmm_unary_rm_r(
SseOpcode::Cvtdq2ps,
RegMem::from(tmp),
tmp,
));
// Shift the high bits by 1, convert, and double to get the correct value.
ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrld, RegMemImm::imm(1), dst));
ctx.emit(Inst::xmm_unary_rm_r(
SseOpcode::Cvtdq2ps,
RegMem::from(dst),
dst,
));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Addps,
RegMem::reg(dst.to_reg()),
dst,
));
// Add together the two converted values.
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Addps,
RegMem::reg(tmp.to_reg()),
dst,
));
}
}
}
Opcode::FcvtToUint | Opcode::FcvtToUintSat | Opcode::FcvtToSint | Opcode::FcvtToSintSat => {
let src = put_input_in_reg(ctx, inputs[0]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();

View File

@@ -770,6 +770,18 @@ where
fn jump_table_size(&mut self, targets: &BoxVecMachLabel) -> u32 {
targets.len() as u32
}
#[inline]
fn fcvt_uint_mask_const(&mut self) -> VCodeConstant {
self.lower_ctx
.use_constant(VCodeConstantData::WellKnown(&UINT_MASK))
}
#[inline]
fn fcvt_uint_mask_high_const(&mut self) -> VCodeConstant {
self.lower_ctx
.use_constant(VCodeConstantData::WellKnown(&UINT_MASK_HIGH))
}
}
impl<C> IsleContext<'_, C, Flags, IsaFlags, 6>
@@ -891,3 +903,11 @@ fn to_simm32(constant: i64) -> Option<GprMemImm> {
None
}
}
const UINT_MASK: [u8; 16] = [
0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
];
const UINT_MASK_HIGH: [u8; 16] = [
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43,
];
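
As a sanity check on these byte arrays (assuming little-endian lanes): each initialized 32-bit lane is 0x43300000, the high word of the f64 bit pattern of 0x1.0p52. A small Rust check of that assumption:

fn main() {
    let lane = u32::from_le_bytes([0x00, 0x00, 0x30, 0x43]);
    assert_eq!(lane, 0x4330_0000);
    // 0x4330_0000_0000_0000 is exactly 2^52 as an f64.
    assert_eq!(f64::from_bits((lane as u64) << 32), 2.0f64.powi(52));
}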