Add x86-specific instruction for i64x2 multiplication

Without this special instruction, it is impossible to legalize an i64x2 imul to both the single AVX512 instruction and an equivalent SSE instruction sequence (the SSE fallback is sketched below). This extra instruction will be rendered unnecessary by the new x64 backend.
Andrew Brown
2020-05-20 16:17:16 -07:00
parent 8701645493
commit 9ba9fd0f64
6 changed files with 35 additions and 3 deletions
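
For context, the SSE-only fallback the commit message alludes to is the standard decomposition of a 64x2 multiply into 32-bit partial products. Below is a minimal sketch in Rust intrinsics; it is illustrative only -- this commit wires up just the AVX512 encoding, and the helper name mul_i64x2_sse2 is invented here:

#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

// i64x2 multiply using only SSE2. Splitting each 64-bit lane into 32-bit
// halves gives a * b = lo(a)*lo(b) + ((lo(a)*hi(b) + hi(a)*lo(b)) << 32),
// where each partial product comes from pmuludq (32x32 -> 64, unsigned).
// The hi(a)*hi(b) term vanishes mod 2^64, which is exactly the lane-wise
// wrapping the new instruction's docstring describes.
#[cfg(target_arch = "x86_64")]
fn mul_i64x2_sse2(a: __m128i, b: __m128i) -> __m128i {
    // SAFETY: SSE2 is baseline on x86_64, so these intrinsics are always available.
    unsafe {
        let lo = _mm_mul_epu32(a, b);                       // lo(a) * lo(b)
        let c1 = _mm_mul_epu32(a, _mm_srli_epi64::<32>(b)); // lo(a) * hi(b)
        let c2 = _mm_mul_epu32(_mm_srli_epi64::<32>(a), b); // hi(a) * lo(b)
        let cross = _mm_slli_epi64::<32>(_mm_add_epi64(c1, c2));
        _mm_add_epi64(lo, cross)
    }
}

On AVX512DQ hardware, this whole sequence collapses into the single VPMULLQ emitted by the encoding added below.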

View File

@@ -1645,6 +1645,7 @@ fn define_simd(
     let x86_pmaxu = x86.by_name("x86_pmaxu");
     let x86_pmins = x86.by_name("x86_pmins");
     let x86_pminu = x86.by_name("x86_pminu");
+    let x86_pmullq = x86.by_name("x86_pmullq");
     let x86_pshufb = x86.by_name("x86_pshufb");
     let x86_pshufd = x86.by_name("x86_pshufd");
     let x86_psll = x86.by_name("x86_psll");
@@ -2101,9 +2102,8 @@ fn define_simd(
     // SIMD integer multiplication for I64x2 using AVX512.
     {
-        let imul = imul.bind(vector(I64, sse_vector_size));
         e.enc_32_64_maybe_isap(
-            imul,
+            x86_pmullq,
             rec_evex_reg_vvvv_rm_128.opcodes(&PMULLQ).w(),
             Some(use_avx512dq_simd), // TODO need an OR predicate to join with AVX512VL
         );

View File

@@ -532,6 +532,23 @@ pub(crate) fn define(
         .operands_out(vec![a]),
     );
 
+    let x = &Operand::new("x", I64x2);
+    let y = &Operand::new("y", I64x2);
+    let a = &Operand::new("a", I64x2);
+    ig.push(
+        Inst::new(
+            "x86_pmullq",
+            r#"
+        Multiply Packed Integers -- Multiply two I64x2 vectors lane-wise, wrapping on
+        overflow, to produce an I64x2 result. This instruction exists to provide a distinct
+        encoding for CPUs with newer vector features.
+        "#,
+            &formats.binary,
+        )
+        .operands_in(vec![x, y])
+        .operands_out(vec![a]),
+    );
+
     let x = &Operand::new("x", TxN);
     let y = &Operand::new("y", TxN);
     let f = &Operand::new("f", iflags);
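
The lane-wise wrapping described in the docstring is ordinary modular 64-bit multiplication applied per lane. A scalar Rust model -- a hypothetical helper, not Cranelift code:

// Hypothetical scalar model of x86_pmullq's semantics: each 64-bit lane is
// multiplied independently with wrapping (modular) arithmetic.
fn pmullq(x: [i64; 2], y: [i64; 2]) -> [i64; 2] {
    [x[0].wrapping_mul(y[0]), x[1].wrapping_mul(y[1])]
}

fn main() {
    assert_eq!(pmullq([1, 2], [2, 2]), [2, 4]); // same operands as the binary test below
    assert_eq!(pmullq([i64::MAX, 0], [2, 0])[0], -2); // overflow wraps, no trap
}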

View File

@@ -359,6 +359,7 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
     let icmp = insts.by_name("icmp");
     let imax = insts.by_name("imax");
     let imin = insts.by_name("imin");
+    let imul = insts.by_name("imul");
     let ineg = insts.by_name("ineg");
     let insertlane = insts.by_name("insertlane");
     let ishl = insts.by_name("ishl");
@@ -763,6 +764,12 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
         );
     }
 
+    // SIMD imul
+    {
+        let imul = imul.bind(vector(I64, sse_vector_size));
+        narrow.legalize(def!(c = imul(a, b)), vec![def!(c = x86_pmullq(a, b))]);
+    }
+
     narrow.custom_legalize(shuffle, "convert_shuffle");
     narrow.custom_legalize(extractlane, "convert_extractlane");
     narrow.custom_legalize(insertlane, "convert_insertlane");

View File

@@ -1911,6 +1911,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         | Opcode::X86Pmaxu
         | Opcode::X86Pmins
         | Opcode::X86Pminu
+        | Opcode::X86Pmullq
         | Opcode::X86Packss
         | Opcode::X86Punpckh
         | Opcode::X86Punpckl

View File

@@ -69,3 +69,10 @@ block0:
 ; nextln: v1 = band v0, v4
     return
 }
+
+function %imul(i64x2, i64x2) {
+block0(v0: i64x2, v1: i64x2):
+    v2 = imul v0, v1
+    ; check: v2 = x86_pmullq v0, v1
+    return
+}

View File

@@ -6,7 +6,7 @@ function %imul_i64x2() {
 block0:
     [-, %xmm1] v0 = vconst.i64x2 [1 2]
     [-, %xmm2] v1 = vconst.i64x2 [2 2]
-    [-, %xmm14] v2 = imul v0, v1 ; bin: 62 72 f5 08 40 f2
+    [-, %xmm14] v2 = x86_pmullq v0, v1 ; bin: 62 72 f5 08 40 f2
     ; 62, mandatory EVEX prefix
     ; 72 = 0111 0010, R is set (MSB in %xmm14) while X, B, and R' are unset (note these are all inverted); mm is set to 0F38
     ; f5 = 1111 0101, W is set (64-bit op), vvvv set to 1 (inverted), bit 2 always set, pp set to 01
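
To make the byte-by-byte commentary above concrete, here is a small standalone sketch that assembles the same EVEX-encoded instruction and checks it against the test's expected bytes. It is not Cranelift's encoder, and evex_vpmullq is a name invented here:

// EVEX prefix layout for VPMULLQ xmm, xmm, xmm, per the commentary above
// (the register-extension bits are stored inverted). `reg` is the destination,
// `vvvv` the first source, `rm` the second source, all in xmm0-xmm15.
fn evex_vpmullq(reg: u8, vvvv: u8, rm: u8) -> [u8; 6] {
    // P0: R X B R' 0 0 m m -- all four extension bits inverted; mm=0b10 selects the 0F38 map.
    let p0: u8 = (((reg >> 3) & 1) ^ 1) << 7 // R: fourth bit of the destination register
        | 1 << 6  // X: no index-register extension
        | 1 << 5  // B: no base-register extension
        | 1 << 4  // R': destination below xmm16
        | 0b10;   // mm: 0F38 opcode map
    // P1: W vvvv 1 pp -- W=1 (64-bit op), vvvv stored inverted, pp=0b01 (66 prefix).
    let p1: u8 = 1 << 7 | ((!vvvv & 0xF) << 3) | 1 << 2 | 0b01;
    // P2: z L'L b V' aaa -- 128-bit vector length (L'L=00), V'=1, no masking.
    let p2: u8 = 0b0000_1000;
    // ModRM: mod=0b11 (register direct), reg, rm.
    let modrm: u8 = 0b11 << 6 | (reg & 7) << 3 | (rm & 7);
    [0x62, p0, p1, p2, 0x40, modrm] // 0x40 is the PMULLQ opcode byte
}

fn main() {
    // xmm14 = xmm1 * xmm2, matching `; bin: 62 72 f5 08 40 f2` in the test above.
    assert_eq!(evex_vpmullq(14, 1, 2), [0x62, 0x72, 0xF5, 0x08, 0x40, 0xF2]);
}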