x64: lower fcvt_from_uint to VCVTUDQ2PS when possible

When AVX512VL and AVX512F are available, use a single instruction (`VCVTUDQ2PS`) instead of a lengthy 9-instruction sequence. This optimization is a port from the legacy x86 backend.
2021-05-19 12:20:11 -07:00
parent 3b3b126fe2
commit 54b45d28a3
5 changed files with 93 additions and 63 deletions
--- a/cranelift/codegen/src/isa/x64/inst/args.rs
+++ b/cranelift/codegen/src/isa/x64/inst/args.rs
@@ -1000,6 +1000,7 @@ impl fmt::Display for SseOpcode {

 #[derive(Clone)]
 pub enum Avx512Opcode {
+    Vcvtudq2ps,
    Vpabsq,
    Vpmullq,
 }
@@ -1008,6 +1009,9 @@ impl Avx512Opcode {
    /// Which `InstructionSet`s support the opcode?
    pub(crate) fn available_from(&self) -> SmallVec<[InstructionSet; 2]> {
        match self {
+            Avx512Opcode::Vcvtudq2ps => {
+                smallvec![InstructionSet::AVX512F, InstructionSet::AVX512VL]
+            }
            Avx512Opcode::Vpabsq => smallvec![InstructionSet::AVX512F, InstructionSet::AVX512VL],
            Avx512Opcode::Vpmullq => smallvec![InstructionSet::AVX512VL, InstructionSet::AVX512DQ],
        }
@@ -1017,6 +1021,7 @@ impl Avx512Opcode {
 impl fmt::Debug for Avx512Opcode {
    fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
        let name = match self {
+            Avx512Opcode::Vcvtudq2ps => "vcvtudq2ps",
            Avx512Opcode::Vpabsq => "vpabsq",
            Avx512Opcode::Vpmullq => "vpmullq",
        };