x64: lower fcvt_from_uint to VCVTUDQ2PS when possible
When AVX512VL and AVX512F are available, use a single instruction (`VCVTUDQ2PS`) instead of a lengthy 9-instruction sequence. This optimization is a port from the legacy x86 backend.
This commit is contained in:
@@ -1000,6 +1000,7 @@ impl fmt::Display for SseOpcode {
|
|||||||
|
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
pub enum Avx512Opcode {
|
pub enum Avx512Opcode {
|
||||||
|
Vcvtudq2ps,
|
||||||
Vpabsq,
|
Vpabsq,
|
||||||
Vpmullq,
|
Vpmullq,
|
||||||
}
|
}
|
||||||
@@ -1008,6 +1009,9 @@ impl Avx512Opcode {
|
|||||||
/// Which `InstructionSet`s support the opcode?
|
/// Which `InstructionSet`s support the opcode?
|
||||||
pub(crate) fn available_from(&self) -> SmallVec<[InstructionSet; 2]> {
|
pub(crate) fn available_from(&self) -> SmallVec<[InstructionSet; 2]> {
|
||||||
match self {
|
match self {
|
||||||
|
Avx512Opcode::Vcvtudq2ps => {
|
||||||
|
smallvec![InstructionSet::AVX512F, InstructionSet::AVX512VL]
|
||||||
|
}
|
||||||
Avx512Opcode::Vpabsq => smallvec![InstructionSet::AVX512F, InstructionSet::AVX512VL],
|
Avx512Opcode::Vpabsq => smallvec![InstructionSet::AVX512F, InstructionSet::AVX512VL],
|
||||||
Avx512Opcode::Vpmullq => smallvec![InstructionSet::AVX512VL, InstructionSet::AVX512DQ],
|
Avx512Opcode::Vpmullq => smallvec![InstructionSet::AVX512VL, InstructionSet::AVX512DQ],
|
||||||
}
|
}
|
||||||
@@ -1017,6 +1021,7 @@ impl Avx512Opcode {
|
|||||||
impl fmt::Debug for Avx512Opcode {
|
impl fmt::Debug for Avx512Opcode {
|
||||||
fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
|
fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
|
||||||
let name = match self {
|
let name = match self {
|
||||||
|
Avx512Opcode::Vcvtudq2ps => "vcvtudq2ps",
|
||||||
Avx512Opcode::Vpabsq => "vpabsq",
|
Avx512Opcode::Vpabsq => "vpabsq",
|
||||||
Avx512Opcode::Vpmullq => "vpmullq",
|
Avx512Opcode::Vpmullq => "vpmullq",
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -1408,16 +1408,17 @@ pub(crate) fn emit(
|
|||||||
}
|
}
|
||||||
|
|
||||||
Inst::XmmUnaryRmREvex { op, src, dst } => {
|
Inst::XmmUnaryRmREvex { op, src, dst } => {
|
||||||
let opcode = match op {
|
let (prefix, map, w, opcode) = match op {
|
||||||
Avx512Opcode::Vpabsq => 0x1f,
|
Avx512Opcode::Vpabsq => (LegacyPrefixes::_66, OpcodeMap::_0F38, true, 0x1f),
|
||||||
|
Avx512Opcode::Vcvtudq2ps => (LegacyPrefixes::_F2, OpcodeMap::_0F, false, 0x7a),
|
||||||
_ => unimplemented!("Opcode {:?} not implemented", op),
|
_ => unimplemented!("Opcode {:?} not implemented", op),
|
||||||
};
|
};
|
||||||
match src {
|
match src {
|
||||||
RegMem::Reg { reg: src } => EvexInstruction::new()
|
RegMem::Reg { reg: src } => EvexInstruction::new()
|
||||||
.length(EvexVectorLength::V128)
|
.length(EvexVectorLength::V128)
|
||||||
.prefix(LegacyPrefixes::_66)
|
.prefix(prefix)
|
||||||
.map(OpcodeMap::_0F38)
|
.map(map)
|
||||||
.w(true)
|
.w(w)
|
||||||
.opcode(opcode)
|
.opcode(opcode)
|
||||||
.reg(dst.to_reg().get_hw_encoding())
|
.reg(dst.to_reg().get_hw_encoding())
|
||||||
.rm(src.get_hw_encoding())
|
.rm(src.get_hw_encoding())
|
||||||
|
|||||||
@@ -3889,6 +3889,12 @@ fn test_x64_emit() {
|
|||||||
"vpabsq %xmm2, %xmm8",
|
"vpabsq %xmm2, %xmm8",
|
||||||
));
|
));
|
||||||
|
|
||||||
|
insns.push((
|
||||||
|
Inst::xmm_unary_rm_r_evex(Avx512Opcode::Vcvtudq2ps, RegMem::reg(xmm2), w_xmm8),
|
||||||
|
"62717F087AC2",
|
||||||
|
"vcvtudq2ps %xmm2, %xmm8",
|
||||||
|
));
|
||||||
|
|
||||||
// Xmm to int conversions, and conversely.
|
// Xmm to int conversions, and conversely.
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
|
|||||||
@@ -4069,15 +4069,32 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
_ => panic!("unexpected input type for FcvtFromUint: {:?}", input_ty),
|
_ => panic!("unexpected input type for FcvtFromUint: {:?}", input_ty),
|
||||||
};
|
};
|
||||||
} else {
|
} else {
|
||||||
// Converting packed unsigned integers to packed floats requires a few steps.
|
assert_eq!(ctx.input_ty(insn, 0), types::I32X4);
|
||||||
// There is no single instruction lowering for converting unsigned floats but there
|
let src = put_input_in_reg(ctx, inputs[0]);
|
||||||
// is for converting packed signed integers to float (cvtdq2ps). In the steps below
|
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
|
||||||
// we isolate the upper half (16 bits) and lower half (16 bits) of each lane and
|
|
||||||
// then we convert each half separately using cvtdq2ps meant for signed integers.
|
if isa_flags.use_avx512f_simd() || isa_flags.use_avx512vl_simd() {
|
||||||
// In order for this to work for the upper half bits we must shift right by 1
|
// When either AVX512VL or AVX512F are available,
|
||||||
// (divide by 2) these bits in order to ensure the most significant bit is 0 not
|
// `fcvt_from_uint` can be lowered to a single instruction.
|
||||||
// signed, and then after the conversion we double the value. Finally we add the
|
ctx.emit(Inst::xmm_unary_rm_r_evex(
|
||||||
// converted values where addition will correctly round.
|
Avx512Opcode::Vcvtudq2ps,
|
||||||
|
RegMem::reg(src),
|
||||||
|
dst,
|
||||||
|
));
|
||||||
|
} else {
|
||||||
|
// Converting packed unsigned integers to packed floats
|
||||||
|
// requires a few steps. There is no single instruction
|
||||||
|
// lowering for converting unsigned integers to floats but there is for
|
||||||
|
// converting packed signed integers to float (cvtdq2ps). In
|
||||||
|
// the steps below we isolate the upper half (16 bits) and
|
||||||
|
// lower half (16 bits) of each lane and then we convert
|
||||||
|
// each half separately using cvtdq2ps meant for signed
|
||||||
|
// integers. In order for this to work for the upper half
|
||||||
|
// bits we must shift right by 1 (divide by 2) these bits in
|
||||||
|
// order to ensure the most significant bit is 0 not signed,
|
||||||
|
// and then after the conversion we double the value.
|
||||||
|
// Finally we add the converted values where addition will
|
||||||
|
// correctly round.
|
||||||
//
|
//
|
||||||
// Sequence:
|
// Sequence:
|
||||||
// -> A = 0xffffffff
|
// -> A = 0xffffffff
|
||||||
@@ -4089,10 +4106,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
// -> Ah = Ah + Ah // Double Ah to account for shift right before the conversion.
|
// -> Ah = Ah + Ah // Double Ah to account for shift right before the conversion.
|
||||||
// -> dst = Ah + Al // Add the two floats together
|
// -> dst = Ah + Al // Add the two floats together
|
||||||
|
|
||||||
assert_eq!(ctx.input_ty(insn, 0), types::I32X4);
|
|
||||||
let src = put_input_in_reg(ctx, inputs[0]);
|
|
||||||
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
|
|
||||||
|
|
||||||
// Create a temporary register
|
// Create a temporary register
|
||||||
let tmp = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
|
let tmp = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
|
||||||
ctx.emit(Inst::xmm_unary_rm_r(
|
ctx.emit(Inst::xmm_unary_rm_r(
|
||||||
@@ -4129,6 +4142,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
));
|
));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
Opcode::FcvtToUint | Opcode::FcvtToUintSat | Opcode::FcvtToSint | Opcode::FcvtToSintSat => {
|
Opcode::FcvtToUint | Opcode::FcvtToUintSat | Opcode::FcvtToSint | Opcode::FcvtToSintSat => {
|
||||||
let src = put_input_in_reg(ctx, inputs[0]);
|
let src = put_input_in_reg(ctx, inputs[0]);
|
||||||
|
|||||||
@@ -2,17 +2,21 @@ test run
|
|||||||
set enable_simd
|
set enable_simd
|
||||||
target x86_64 machinst
|
target x86_64 machinst
|
||||||
|
|
||||||
function %fcvt_from_sint() -> b1 {
|
function %fcvt_from_sint(i32x4) -> f32x4 {
|
||||||
block0:
|
block0(v0: i32x4):
|
||||||
v0 = vconst.i32x4 [-1 0 1 123456789]
|
|
||||||
v1 = fcvt_from_sint.f32x4 v0
|
v1 = fcvt_from_sint.f32x4 v0
|
||||||
|
return v1
|
||||||
v2 = vconst.f32x4 [-0x1.0 0.0 0x1.0 0x75bcd18.0] ; 123456789 rounds to 123456792.0, an error of 3
|
|
||||||
v3 = fcmp eq v1, v2
|
|
||||||
v4 = vall_true v3
|
|
||||||
return v4
|
|
||||||
}
|
}
|
||||||
; run
|
; run: %fcvt_from_sint([-1 0 1 123456789]) == [-0x1.0 0.0 0x1.0 0x75bcd18.0]
|
||||||
|
; Note that 123456789 rounds to 123456792.0, an error of 3
|
||||||
|
|
||||||
|
function %fcvt_from_uint(i32x4) -> f32x4 {
|
||||||
|
block0(v0: i32x4):
|
||||||
|
v1 = fcvt_from_uint.f32x4 v0
|
||||||
|
return v1
|
||||||
|
}
|
||||||
|
; run: %fcvt_from_uint([0xFFFFFFFF 0 1 123456789]) == [0x100000000.0 0.0 0x1.0 0x75bcd18.0]
|
||||||
|
; Note that 0xFFFFFFFF is decimal 4,294,967,295 and is rounded up by 1 to 4,294,967,296 in f32x4.
|
||||||
|
|
||||||
function %fcvt_to_sint_sat(f32x4) -> i32x4 {
|
function %fcvt_to_sint_sat(f32x4) -> i32x4 {
|
||||||
block0(v0:f32x4):
|
block0(v0:f32x4):
|
||||||
|
|||||||
Reference in New Issue
Block a user