Add x86_vcvtudq2ps instruction

This instruction converts unsigned i32x4 to f32x4 and is available in several AVX512 feature sets.
2020-05-26 13:08:18 -07:00
parent 9788b02dd5
commit 546fc9ddf1
6 changed files with 74 additions and 0 deletions
--- a/cranelift/codegen/meta/src/isa/x86/encodings.rs
+++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs
@@ -1655,10 +1655,12 @@ fn define_simd(
    let x86_ptest = x86.by_name("x86_ptest");
    let x86_punpckh = x86.by_name("x86_punpckh");
    let x86_punpckl = x86.by_name("x86_punpckl");
+    let x86_vcvtudq2ps = x86.by_name("x86_vcvtudq2ps");

    // Shorthands for recipes.
    let rec_blend = r.template("blend");
    let rec_evex_reg_vvvv_rm_128 = r.template("evex_reg_vvvv_rm_128");
+    let rec_evex_reg_rm_128 = r.template("evex_reg_rm_128");
    let rec_f_ib = r.template("f_ib");
    let rec_fa = r.template("fa");
    let rec_fa_ib = r.template("fa_ib");
@@ -1702,6 +1704,7 @@ fn define_simd(
    let use_sse41_simd = settings.predicate_by_name("use_sse41_simd");
    let use_sse42_simd = settings.predicate_by_name("use_sse42_simd");
    let use_avx512dq_simd = settings.predicate_by_name("use_avx512dq_simd");
+    let use_avx512vl_simd = settings.predicate_by_name("use_avx512vl_simd");

    // SIMD vector size: eventually multiple vector sizes may be supported but for now only
    // SSE-sized vectors are available.
@@ -1885,6 +1888,12 @@ fn define_simd(
            .bind(vector(F32, sse_vector_size))
            .bind(vector(I32, sse_vector_size));
        e.enc_both(fcvt_from_sint_32, rec_furm.opcodes(&CVTDQ2PS));
+
+        e.enc_32_64_maybe_isap(
+            x86_vcvtudq2ps,
+            rec_evex_reg_rm_128.opcodes(&VCVTUDQ2PS),
+            Some(use_avx512vl_simd), // TODO need an OR predicate to join with AVX512F
+        );
    }

    // SIMD vconst for special cases (all zeroes, all ones)
--- a/cranelift/codegen/meta/src/isa/x86/instructions.rs
+++ b/cranelift/codegen/meta/src/isa/x86/instructions.rs
@@ -145,6 +145,37 @@ pub(crate) fn define(
        .operands_out(vec![a]),
    );

+    let f32x4 = &TypeVar::new(
+        "f32x4",
+        "A floating point number",
+        TypeSetBuilder::new()
+            .floats(32..32)
+            .simd_lanes(4..4)
+            .build(),
+    );
+    let i32x4 = &TypeVar::new(
+        "i32x4",
+        "An integer type with the same number of lanes",
+        TypeSetBuilder::new().ints(32..32).simd_lanes(4..4).build(),
+    );
+    let x = &Operand::new("x", i32x4);
+    let a = &Operand::new("a", f32x4);
+
+    ig.push(
+        Inst::new(
+            "x86_vcvtudq2ps",
+            r#"
+        Convert unsigned integer to floating point.
+
+        Convert packed doubleword unsigned integers to packed single-precision floating-point 
+        values. This instruction does not trap.
+        "#,
+            &formats.unary,
+        )
+        .operands_in(vec![x])
+        .operands_out(vec![a]),
+    );
+
    let x = &Operand::new("x", Float);
    let a = &Operand::new("a", Float);
    let y = &Operand::new("y", Float);
--- a/cranelift/codegen/meta/src/isa/x86/opcodes.rs
+++ b/cranelift/codegen/meta/src/isa/x86/opcodes.rs
@@ -665,6 +665,12 @@ pub static UCOMISS: [u8; 2] = [0x0f, 0x2e];
 /// Raise invalid opcode instruction.
 pub static UNDEFINED2: [u8; 2] = [0x0f, 0x0b];

+/// Convert four packed unsigned doubleword integers from xmm2/m128/m32bcst to packed
+/// single-precision floating-point values in xmm1 with writemask k1. Rounding behavior
+/// is controlled by MXCSR but can be overridden by EVEX.L'L in static rounding mode
+/// (AVX512VL, AVX512F).
+pub static VCVTUDQ2PS: [u8; 3] = [0xf2, 0x0f, 0x7a];
+
 /// imm{16,32} XOR r/m{16,32,64}, possibly sign-extended.
 pub static XOR_IMM: [u8; 1] = [0x81];

--- a/cranelift/codegen/meta/src/isa/x86/recipes.rs
+++ b/cranelift/codegen/meta/src/isa/x86/recipes.rs
@@ -3417,5 +3417,23 @@ pub(crate) fn define<'shared>(
        regs).rex_kind(RecipePrefixKind::Evex)
    );

+    recipes.add_template(
+        Template::new(
+            EncodingRecipeBuilder::new("evex_reg_rm_128", &formats.unary, 1)
+                .operands_in(vec![fpr])
+                .operands_out(vec![fpr])
+                .emit(
+                    r#"
+                // instruction encoding operands: reg (op1, w), rm (op2, r)
+                // this maps to:                  out_reg0,     in_reg0
+                let context = EvexContext::Other { length: EvexVectorLength::V128 };
+                let masking = EvexMasking::None;
+                put_evex(bits, out_reg0, 0, in_reg0, context, masking, sink); // params: reg, vvvv, rm
+                modrm_rr(in_reg0, out_reg0, sink); // params: rm, reg
+                "#,
+                ),
+            regs).rex_kind(RecipePrefixKind::Evex)
+    );
+
    recipes
 }
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -2066,6 +2066,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
        | Opcode::X86Packss
        | Opcode::X86Punpckh
        | Opcode::X86Punpckl
+        | Opcode::X86Vcvtudq2ps
        | Opcode::X86ElfTlsGetAddr
        | Opcode::X86MachoTlsGetAddr => {
            panic!("x86-specific opcode in supposedly arch-neutral IR!");
--- a/cranelift/filetests/filetests/isa/x86/simd-avx512-conversion-binemit.clif
+++ b/cranelift/filetests/filetests/isa/x86/simd-avx512-conversion-binemit.clif
@@ -0,0 +1,9 @@
+test binemit
+set enable_simd
+target x86_64 has_avx512vl=true
+
+function %fcvt_from_uint(i32x4) {
+block0(v0: i32x4 [%xmm2]):
+[-, %xmm6]  v1 = x86_vcvtudq2ps v0 ; bin: 62 f1 7f 08 7a f2
+    return
+}