x64: lower fcvt_from_uint to VCVTUDQ2PS when possible

When AVX512VL and AVX512F are available, use a single instruction (`VCVTUDQ2PS`) instead of a lengthy 9-instruction sequence. This optimization is a port from the legacy x86 backend.
2021-05-19 12:20:11 -07:00
parent 3b3b126fe2
commit 54b45d28a3
5 changed files with 93 additions and 63 deletions
--- a/cranelift/codegen/src/isa/x64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -1408,16 +1408,17 @@ pub(crate) fn emit(
        }

        Inst::XmmUnaryRmREvex { op, src, dst } => {
-            let opcode = match op {
-                Avx512Opcode::Vpabsq => 0x1f,
+            let (prefix, map, w, opcode) = match op {
+                Avx512Opcode::Vpabsq => (LegacyPrefixes::_66, OpcodeMap::_0F38, true, 0x1f),
+                Avx512Opcode::Vcvtudq2ps => (LegacyPrefixes::_F2, OpcodeMap::_0F, false, 0x7a),
                _ => unimplemented!("Opcode {:?} not implemented", op),
            };
            match src {
                RegMem::Reg { reg: src } => EvexInstruction::new()
                    .length(EvexVectorLength::V128)
-                    .prefix(LegacyPrefixes::_66)
-                    .map(OpcodeMap::_0F38)
-                    .w(true)
+                    .prefix(prefix)
+                    .map(map)
+                    .w(w)
                    .opcode(opcode)
                    .reg(dst.to_reg().get_hw_encoding())
                    .rm(src.get_hw_encoding())