x64: lower fcvt_from_uint to VCVTUDQ2PS when possible

When AVX512VL and AVX512F are available, use a single instruction (`VCVTUDQ2PS`) instead of a lengthy 9-instruction sequence. This optimization is a port from the legacy x86 backend.
2021-05-19 12:20:11 -07:00
parent 3b3b126fe2
commit 54b45d28a3
5 changed files with 93 additions and 63 deletions
--- a/cranelift/codegen/src/isa/x64/inst/args.rs
+++ b/cranelift/codegen/src/isa/x64/inst/args.rs
@@ -1000,6 +1000,7 @@ impl fmt::Display for SseOpcode {
 #[derive(Clone)]
 pub enum Avx512Opcode {
    Vcvtudq2ps,
    Vpabsq,
    Vpmullq,
 }
@@ -1008,6 +1009,9 @@ impl Avx512Opcode {
    /// Which `InstructionSet`s support the opcode?
    pub(crate) fn available_from(&self) -> SmallVec<[InstructionSet; 2]> {
        match self {
            Avx512Opcode::Vcvtudq2ps => {
                smallvec![InstructionSet::AVX512F, InstructionSet::AVX512VL]
            }
            Avx512Opcode::Vpabsq => smallvec![InstructionSet::AVX512F, InstructionSet::AVX512VL],
            Avx512Opcode::Vpmullq => smallvec![InstructionSet::AVX512VL, InstructionSet::AVX512DQ],
        }
@@ -1017,6 +1021,7 @@ impl Avx512Opcode {
 impl fmt::Debug for Avx512Opcode {
    fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
        let name = match self {
            Avx512Opcode::Vcvtudq2ps => "vcvtudq2ps",
            Avx512Opcode::Vpabsq => "vpabsq",
            Avx512Opcode::Vpmullq => "vpmullq",
        };
--- a/cranelift/codegen/src/isa/x64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -1408,16 +1408,17 @@ pub(crate) fn emit(
        }
        Inst::XmmUnaryRmREvex { op, src, dst } => {
-            let opcode = match op {
+            let (prefix, map, w, opcode) = match op {
-                Avx512Opcode::Vpabsq => 0x1f,
+                Avx512Opcode::Vpabsq => (LegacyPrefixes::_66, OpcodeMap::_0F38, true, 0x1f),
                Avx512Opcode::Vcvtudq2ps => (LegacyPrefixes::_F2, OpcodeMap::_0F, false, 0x7a),
                _ => unimplemented!("Opcode {:?} not implemented", op),
            };
            match src {
                RegMem::Reg { reg: src } => EvexInstruction::new()
                    .length(EvexVectorLength::V128)
-                    .prefix(LegacyPrefixes::_66)
+                    .prefix(prefix)
-                    .map(OpcodeMap::_0F38)
+                    .map(map)
-                    .w(true)
+                    .w(w)
                    .opcode(opcode)
                    .reg(dst.to_reg().get_hw_encoding())
                    .rm(src.get_hw_encoding())
--- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
@@ -3889,6 +3889,12 @@ fn test_x64_emit() {
        "vpabsq  %xmm2, %xmm8",
    ));
    insns.push((
        Inst::xmm_unary_rm_r_evex(Avx512Opcode::Vcvtudq2ps, RegMem::reg(xmm2), w_xmm8),
        "62717F087AC2",
        "vcvtudq2ps %xmm2, %xmm8",
    ));
    // Xmm to int conversions, and conversely.
    insns.push((
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -4069,15 +4069,32 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                    _ => panic!("unexpected input type for FcvtFromUint: {:?}", input_ty),
                };
            } else {
-                // Converting packed unsigned integers to packed floats requires a few steps.
+                assert_eq!(ctx.input_ty(insn, 0), types::I32X4);
-                // There is no single instruction lowering for converting unsigned floats but there
+                let src = put_input_in_reg(ctx, inputs[0]);
-                // is for converting packed signed integers to float (cvtdq2ps). In the steps below
+                let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-                // we isolate the upper half (16 bits) and lower half (16 bits) of each lane and
+
-                // then we convert each half separately using cvtdq2ps meant for signed integers.
+                if isa_flags.use_avx512f_simd() && isa_flags.use_avx512vl_simd() {
-                // In order for this to work for the upper half bits we must shift right by 1
+                    // When both AVX512VL and AVX512F are available,
-                // (divide by 2) these bits in order to ensure the most significant bit is 0 not
+                    // `fcvt_from_uint` can be lowered to a single instruction.
-                // signed, and then after the conversion we double the value. Finally we add the
+                    ctx.emit(Inst::xmm_unary_rm_r_evex(
-                // converted values where addition will correctly round.
+                        Avx512Opcode::Vcvtudq2ps,
                        RegMem::reg(src),
                        dst,
                    ));
                } else {
                    // Converting packed unsigned integers to packed floats
                    // requires a few steps. There is no single instruction
                    // lowering for converting unsigned floats but there is for
                    // converting packed signed integers to float (cvtdq2ps). In
                    // the steps below we isolate the upper half (16 bits) and
                    // lower half (16 bits) of each lane and then we convert
                    // each half separately using cvtdq2ps meant for signed
                    // integers. In order for this to work for the upper half
                    // bits we must shift right by 1 (divide by 2) these bits in
                    // order to ensure the most significant bit is 0 not signed,
                    // and then after the conversion we double the value.
                    // Finally we add the converted values where addition will
                    // correctly round.
                    //
                    // Sequence:
                    // -> A = 0xffffffff
@@ -4089,10 +4106,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                    // -> Ah = Ah + Ah // Double Ah to account for shift right before the conversion.
                    // -> dst = Ah + Al // Add the two floats together
                assert_eq!(ctx.input_ty(insn, 0), types::I32X4);
                let src = put_input_in_reg(ctx, inputs[0]);
                let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                    // Create a temporary register
                    let tmp = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
                    ctx.emit(Inst::xmm_unary_rm_r(
@@ -4129,6 +4142,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                    ));
                }
            }
        }
        Opcode::FcvtToUint | Opcode::FcvtToUintSat | Opcode::FcvtToSint | Opcode::FcvtToSintSat => {
            let src = put_input_in_reg(ctx, inputs[0]);
--- a/cranelift/filetests/filetests/isa/x64/simd-conversion-run.clif
+++ b/cranelift/filetests/filetests/isa/x64/simd-conversion-run.clif
@@ -2,17 +2,21 @@ test run
 set enable_simd
 target x86_64 machinst
-function %fcvt_from_sint() -> b1 {
+function %fcvt_from_sint(i32x4) -> f32x4 {
-block0:
+block0(v0: i32x4):
    v0 = vconst.i32x4 [-1 0 1 123456789]
    v1 = fcvt_from_sint.f32x4 v0
-
+    return v1
    v2 = vconst.f32x4 [-0x1.0 0.0 0x1.0 0x75bcd18.0] ; 123456789 rounds to 123456792.0, an error of 3
    v3 = fcmp eq v1, v2
    v4 = vall_true v3
    return v4
 }
-; run
+; run: %fcvt_from_sint([-1 0 1 123456789]) == [-0x1.0 0.0 0x1.0 0x75bcd18.0]
 ; Note that 123456789 rounds to 123456792.0, an error of 3
 function %fcvt_from_uint(i32x4) -> f32x4 {
 block0(v0: i32x4):
    v1 = fcvt_from_uint.f32x4 v0
    return v1
 }
 ; run: %fcvt_from_uint([0xFFFFFFFF 0 1 123456789]) == [0x100000000.0 0.0 0x1.0 0x75bcd18.0]
 ; Note that 0xFFFFFFFF is decimal 4,294,967,295 and is rounded up 1 to 4,294,967,296 in f32x4.
 function %fcvt_to_sint_sat(f32x4) -> i32x4 {
 block0(v0:f32x4):