From 54b45d28a33ae85867a1b21ad702c09c0674946a Mon Sep 17 00:00:00 2001
From: Andrew Brown <andrew.brown@intel.com>
Date: Wed, 19 May 2021 12:20:11 -0700
Subject: [PATCH] x64: lower fcvt_from_uint to VCVTUDQ2PS when possible

When AVX512VL and AVX512F are available, use a single instruction
(`VCVTUDQ2PS`) instead of a length 9-instruction sequence. This
optimization is a port from the legacy x86 backend.
---
 cranelift/codegen/src/isa/x64/inst/args.rs    |   5 +
 cranelift/codegen/src/isa/x64/inst/emit.rs    |  11 +-
 .../codegen/src/isa/x64/inst/emit_tests.rs    |   6 +
 cranelift/codegen/src/isa/x64/lower.rs        | 112 ++++++++++--------
 .../isa/x64/simd-conversion-run.clif          |  22 ++--
 5 files changed, 93 insertions(+), 63 deletions(-)

diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs
index 77b173b42d..005df224dc 100644
--- a/cranelift/codegen/src/isa/x64/inst/args.rs
+++ b/cranelift/codegen/src/isa/x64/inst/args.rs
@@ -1000,6 +1000,7 @@ impl fmt::Display for SseOpcode {
 
 #[derive(Clone)]
 pub enum Avx512Opcode {
+    Vcvtudq2ps,
     Vpabsq,
     Vpmullq,
 }
@@ -1008,6 +1009,9 @@ impl Avx512Opcode {
     /// Which `InstructionSet`s support the opcode?
     pub(crate) fn available_from(&self) -> SmallVec<[InstructionSet; 2]> {
         match self {
+            Avx512Opcode::Vcvtudq2ps => {
+                smallvec![InstructionSet::AVX512F, InstructionSet::AVX512VL]
+            }
             Avx512Opcode::Vpabsq => smallvec![InstructionSet::AVX512F, InstructionSet::AVX512VL],
             Avx512Opcode::Vpmullq => smallvec![InstructionSet::AVX512VL, InstructionSet::AVX512DQ],
         }
@@ -1017,6 +1021,7 @@ impl Avx512Opcode {
 impl fmt::Debug for Avx512Opcode {
     fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
         let name = match self {
+            Avx512Opcode::Vcvtudq2ps => "vcvtudq2ps",
             Avx512Opcode::Vpabsq => "vpabsq",
             Avx512Opcode::Vpmullq => "vpmullq",
         };
diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs
index f57212a3be..5e94395797 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -1408,16 +1408,17 @@ pub(crate) fn emit(
         }
 
         Inst::XmmUnaryRmREvex { op, src, dst } => {
-            let opcode = match op {
-                Avx512Opcode::Vpabsq => 0x1f,
+            let (prefix, map, w, opcode) = match op {
+                Avx512Opcode::Vpabsq => (LegacyPrefixes::_66, OpcodeMap::_0F38, true, 0x1f),
+                Avx512Opcode::Vcvtudq2ps => (LegacyPrefixes::_F2, OpcodeMap::_0F, false, 0x7a),
                 _ => unimplemented!("Opcode {:?} not implemented", op),
             };
             match src {
                 RegMem::Reg { reg: src } => EvexInstruction::new()
                     .length(EvexVectorLength::V128)
-                    .prefix(LegacyPrefixes::_66)
-                    .map(OpcodeMap::_0F38)
-                    .w(true)
+                    .prefix(prefix)
+                    .map(map)
+                    .w(w)
                     .opcode(opcode)
                     .reg(dst.to_reg().get_hw_encoding())
                     .rm(src.get_hw_encoding())
diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
index a288327fd5..efe62b4cea 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
@@ -3889,6 +3889,12 @@ fn test_x64_emit() {
         "vpabsq  %xmm2, %xmm8",
     ));
 
+    insns.push((
+        Inst::xmm_unary_rm_r_evex(Avx512Opcode::Vcvtudq2ps, RegMem::reg(xmm2), w_xmm8),
+        "62717F087AC2",
+        "vcvtudq2ps %xmm2, %xmm8",
+    ));
+
     // Xmm to int conversions, and conversely.
 
     insns.push((
diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs
index a7bd6ad372..2aaa8f69b5 100644
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -4069,64 +4069,78 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     _ => panic!("unexpected input type for FcvtFromUint: {:?}", input_ty),
                 };
             } else {
-                // Converting packed unsigned integers to packed floats requires a few steps.
-                // There is no single instruction lowering for converting unsigned floats but there
-                // is for converting packed signed integers to float (cvtdq2ps). In the steps below
-                // we isolate the upper half (16 bits) and lower half (16 bits) of each lane and
-                // then we convert each half separately using cvtdq2ps meant for signed integers.
-                // In order for this to work for the upper half bits we must shift right by 1
-                // (divide by 2) these bits in order to ensure the most significant bit is 0 not
-                // signed, and then after the conversion we double the value. Finally we add the
-                // converted values where addition will correctly round.
-                //
-                // Sequence:
-                // -> A = 0xffffffff
-                // -> Ah = 0xffff0000
-                // -> Al = 0x0000ffff
-                // -> Convert(Al) // Convert int to float
-                // -> Ah = Ah >> 1 // Shift right 1 to assure Ah conversion isn't treated as signed
-                // -> Convert(Ah) // Convert .. with no loss of significant digits from previous shift
-                // -> Ah = Ah + Ah // Double Ah to account for shift right before the conversion.
-                // -> dst = Ah + Al // Add the two floats together
-
                 assert_eq!(ctx.input_ty(insn, 0), types::I32X4);
                 let src = put_input_in_reg(ctx, inputs[0]);
                 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
 
-                // Create a temporary register
-                let tmp = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
-                ctx.emit(Inst::xmm_unary_rm_r(
-                    SseOpcode::Movapd,
-                    RegMem::reg(src),
-                    tmp,
-                ));
-                ctx.emit(Inst::gen_move(dst, src, ty));
+                if isa_flags.use_avx512f_simd() || isa_flags.use_avx512vl_simd() {
+                    // When either AVX512VL or AVX512F are available,
+                    // `fcvt_from_uint` can be lowered to a single instruction.
+                    ctx.emit(Inst::xmm_unary_rm_r_evex(
+                        Avx512Opcode::Vcvtudq2ps,
+                        RegMem::reg(src),
+                        dst,
+                    ));
+                } else {
+                    // Converting packed unsigned integers to packed floats
+                    // requires a few steps. There is no single instruction
+                    // lowering for converting unsigned floats but there is for
+                    // converting packed signed integers to float (cvtdq2ps). In
+                    // the steps below we isolate the upper half (16 bits) and
+                    // lower half (16 bits) of each lane and then we convert
+                    // each half separately using cvtdq2ps meant for signed
+                    // integers. In order for this to work for the upper half
+                    // bits we must shift right by 1 (divide by 2) these bits in
+                    // order to ensure the most significant bit is 0 not signed,
+                    // and then after the conversion we double the value.
+                    // Finally we add the converted values where addition will
+                    // correctly round.
+                    //
+                    // Sequence:
+                    // -> A = 0xffffffff
+                    // -> Ah = 0xffff0000
+                    // -> Al = 0x0000ffff
+                    // -> Convert(Al) // Convert int to float
+                    // -> Ah = Ah >> 1 // Shift right 1 to assure Ah conversion isn't treated as signed
+                    // -> Convert(Ah) // Convert .. with no loss of significant digits from previous shift
+                    // -> Ah = Ah + Ah // Double Ah to account for shift right before the conversion.
+                    // -> dst = Ah + Al // Add the two floats together
 
-                // Get the low 16 bits
-                ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Pslld, RegMemImm::imm(16), tmp));
-                ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrld, RegMemImm::imm(16), tmp));
+                    // Create a temporary register
+                    let tmp = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
+                    ctx.emit(Inst::xmm_unary_rm_r(
+                        SseOpcode::Movapd,
+                        RegMem::reg(src),
+                        tmp,
+                    ));
+                    ctx.emit(Inst::gen_move(dst, src, ty));
 
-                // Get the high 16 bits
-                ctx.emit(Inst::xmm_rm_r(SseOpcode::Psubd, RegMem::from(tmp), dst));
+                    // Get the low 16 bits
+                    ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Pslld, RegMemImm::imm(16), tmp));
+                    ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrld, RegMemImm::imm(16), tmp));
 
-                // Convert the low 16 bits
-                ctx.emit(Inst::xmm_rm_r(SseOpcode::Cvtdq2ps, RegMem::from(tmp), tmp));
+                    // Get the high 16 bits
+                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Psubd, RegMem::from(tmp), dst));
 
-                // Shift the high bits by 1, convert, and double to get the correct value.
-                ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrld, RegMemImm::imm(1), dst));
-                ctx.emit(Inst::xmm_rm_r(SseOpcode::Cvtdq2ps, RegMem::from(dst), dst));
-                ctx.emit(Inst::xmm_rm_r(
-                    SseOpcode::Addps,
-                    RegMem::reg(dst.to_reg()),
-                    dst,
-                ));
+                    // Convert the low 16 bits
+                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Cvtdq2ps, RegMem::from(tmp), tmp));
 
-                // Add together the two converted values.
-                ctx.emit(Inst::xmm_rm_r(
-                    SseOpcode::Addps,
-                    RegMem::reg(tmp.to_reg()),
-                    dst,
-                ));
+                    // Shift the high bits by 1, convert, and double to get the correct value.
+                    ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrld, RegMemImm::imm(1), dst));
+                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Cvtdq2ps, RegMem::from(dst), dst));
+                    ctx.emit(Inst::xmm_rm_r(
+                        SseOpcode::Addps,
+                        RegMem::reg(dst.to_reg()),
+                        dst,
+                    ));
+
+                    // Add together the two converted values.
+                    ctx.emit(Inst::xmm_rm_r(
+                        SseOpcode::Addps,
+                        RegMem::reg(tmp.to_reg()),
+                        dst,
+                    ));
+                }
             }
         }
 
diff --git a/cranelift/filetests/filetests/isa/x64/simd-conversion-run.clif b/cranelift/filetests/filetests/isa/x64/simd-conversion-run.clif
index d6f353eee2..6d40ad9f37 100644
--- a/cranelift/filetests/filetests/isa/x64/simd-conversion-run.clif
+++ b/cranelift/filetests/filetests/isa/x64/simd-conversion-run.clif
@@ -2,17 +2,21 @@ test run
 set enable_simd
 target x86_64 machinst
 
-function %fcvt_from_sint() -> b1 {
-block0:
-    v0 = vconst.i32x4 [-1 0 1 123456789]
+function %fcvt_from_sint(i32x4) -> f32x4 {
+block0(v0: i32x4):
     v1 = fcvt_from_sint.f32x4 v0
-
-    v2 = vconst.f32x4 [-0x1.0 0.0 0x1.0 0x75bcd18.0] ; 123456789 rounds to 123456792.0, an error of 3
-    v3 = fcmp eq v1, v2
-    v4 = vall_true v3
-    return v4
+    return v1
 }
-; run
+; run: %fcvt_from_sint([-1 0 1 123456789]) == [-0x1.0 0.0 0x1.0 0x75bcd18.0]
+; Note that 123456789 rounds to 123456792.0, an error of 3
+
+function %fcvt_from_uint(i32x4) -> f32x4 {
+block0(v0: i32x4):
+    v1 = fcvt_from_uint.f32x4 v0
+    return v1
+}
+; run: %fcvt_from_uint([0xFFFFFFFF 0 1 123456789]) == [0x100000000.0 0.0 0x1.0 0x75bcd18.0]
+; Note that 0xFFFFFFFF is decimal 4,294,967,295 and is rounded up 1 to 4,294,967,296 in f32x4.
 
 function %fcvt_to_sint_sat(f32x4) -> i32x4 {
 block0(v0:f32x4):