From 2964023a77f94939242aad5381e32ff458e3793c Mon Sep 17 00:00:00 2001 From: Yury Delendik Date: Thu, 24 Dec 2020 07:52:50 -0600 Subject: [PATCH] [SIMD][x86_64] Add encoding for PMADDWD (#2530) * [SIMD][x86_64] Add encoding for PMADDWD * also for "experimental_x64" --- .../codegen/meta/src/isa/x86/encodings.rs | 4 ++++ cranelift/codegen/meta/src/isa/x86/opcodes.rs | 3 +++ cranelift/codegen/src/isa/x64/inst/args.rs | 3 +++ cranelift/codegen/src/isa/x64/inst/emit.rs | 1 + .../codegen/src/isa/x64/inst/emit_tests.rs | 6 ++++++ cranelift/codegen/src/isa/x64/lower.rs | 18 ++++++++++++++++++ .../isa/x86/simd-arithmetic-binemit.clif | 6 ++++++ 7 files changed, 41 insertions(+) diff --git a/cranelift/codegen/meta/src/isa/x86/encodings.rs b/cranelift/codegen/meta/src/isa/x86/encodings.rs index de48c57c5d..bdfcef9c6f 100644 --- a/cranelift/codegen/meta/src/isa/x86/encodings.rs +++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs @@ -1691,6 +1691,7 @@ fn define_simd( let usub_sat = shared.by_name("usub_sat"); let vconst = shared.by_name("vconst"); let vselect = shared.by_name("vselect"); + let widening_pairwise_dot_product_s = shared.by_name("widening_pairwise_dot_product_s"); let x86_cvtt2si = x86.by_name("x86_cvtt2si"); let x86_insertps = x86.by_name("x86_insertps"); let x86_fmax = x86.by_name("x86_fmax"); @@ -2213,6 +2214,9 @@ fn define_simd( // SIMD multiplication with lane expansion. e.enc_both_inferred(x86_pmuludq, rec_fa.opcodes(&PMULUDQ)); + // SIMD signed multiplication of word lanes, adding adjacent pairs of products, from SSE2. + e.enc_both_inferred(widening_pairwise_dot_product_s, rec_fa.opcodes(&PMADDWD)); + // SIMD integer multiplication for I64x2 using a AVX512. 
{ e.enc_32_64_maybe_isap( diff --git a/cranelift/codegen/meta/src/isa/x86/opcodes.rs b/cranelift/codegen/meta/src/isa/x86/opcodes.rs index 595d13ba2b..2e72a1744d 100644 --- a/cranelift/codegen/meta/src/isa/x86/opcodes.rs +++ b/cranelift/codegen/meta/src/isa/x86/opcodes.rs @@ -508,6 +508,9 @@ pub static VPMULLQ: [u8; 4] = [0x66, 0x0f, 0x38, 0x40]; /// in xmm2/m128, and store the quadword results in xmm1 (SSE2). pub static PMULUDQ: [u8; 3] = [0x66, 0x0f, 0xf4]; +/// Multiply the packed signed word integers, add adjacent doubleword results (SSE2). +pub static PMADDWD: [u8; 3] = [0x66, 0x0f, 0xf5]; + /// Pop top of stack into r{16,32,64}; increment stack pointer. pub static POP_REG: [u8; 1] = [0x58]; diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs index 7e3b3f22a2..4c61954630 100644 --- a/cranelift/codegen/src/isa/x64/inst/args.rs +++ b/cranelift/codegen/src/isa/x64/inst/args.rs @@ -498,6 +498,7 @@ pub enum SseOpcode { Pinsrb, Pinsrw, Pinsrd, + Pmaddwd, Pmaxsb, Pmaxsw, Pmaxsd, @@ -661,6 +662,7 @@ impl SseOpcode { | SseOpcode::Pcmpgtd | SseOpcode::Pextrw | SseOpcode::Pinsrw + | SseOpcode::Pmaddwd | SseOpcode::Pmaxsw | SseOpcode::Pmaxub | SseOpcode::Pminsw @@ -842,6 +844,7 @@ impl fmt::Debug for SseOpcode { SseOpcode::Pinsrb => "pinsrb", SseOpcode::Pinsrw => "pinsrw", SseOpcode::Pinsrd => "pinsrd", + SseOpcode::Pmaddwd => "pmaddwd", SseOpcode::Pmaxsb => "pmaxsb", SseOpcode::Pmaxsw => "pmaxsw", SseOpcode::Pmaxsd => "pmaxsd", diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index b655178bdf..fb32635a92 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -1873,6 +1873,7 @@ pub(crate) fn emit( SseOpcode::Pcmpgtw => (LegacyPrefixes::_66, 0x0F65, 2), SseOpcode::Pcmpgtd => (LegacyPrefixes::_66, 0x0F66, 2), SseOpcode::Pcmpgtq => (LegacyPrefixes::_66, 0x0F3837, 3), + SseOpcode::Pmaddwd => (LegacyPrefixes::_66, 0x0FF5, 2), 
SseOpcode::Pmaxsb => (LegacyPrefixes::_66, 0x0F383C, 3), SseOpcode::Pmaxsw => (LegacyPrefixes::_66, 0x0FEE, 2), SseOpcode::Pmaxsd => (LegacyPrefixes::_66, 0x0F383D, 3), diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs index 0786746672..e2afa80e2a 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs @@ -3067,6 +3067,12 @@ fn test_x64_emit() { "pmuludq %xmm8, %xmm9", )); + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pmaddwd, RegMem::reg(xmm8), w_xmm1), + "66410FF5C8", + "pmaddwd %xmm8, %xmm1", + )); + insns.push(( Inst::xmm_rm_r(SseOpcode::Pmaxsb, RegMem::reg(xmm15), w_xmm6), "66410F383CF7", diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index b15cb62a73..6e6198c44b 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -2235,6 +2235,24 @@ fn lower_insn_to_regs>( } } + Opcode::WideningPairwiseDotProductS => { + let lhs = put_input_in_reg(ctx, inputs[0]); + let rhs = input_to_reg_mem(ctx, inputs[1]); + let dst = get_output_reg(ctx, outputs[0]); + let ty = ty.unwrap(); + + ctx.emit(Inst::gen_move(dst, lhs, ty)); + + if ty == types::I32X4 { + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmaddwd, rhs, dst)); + } else { + panic!( + "Opcode::WideningPairwiseDotProductS: unsupported laneage: {:?}", + ty + ); + } + } + Opcode::Fadd | Opcode::Fsub | Opcode::Fmul | Opcode::Fdiv => { let lhs = put_input_in_reg(ctx, inputs[0]); let rhs = input_to_reg_mem(ctx, inputs[1]); diff --git a/cranelift/filetests/filetests/isa/x86/simd-arithmetic-binemit.clif b/cranelift/filetests/filetests/isa/x86/simd-arithmetic-binemit.clif index 8df5c2afa8..cd942338dd 100644 --- a/cranelift/filetests/filetests/isa/x86/simd-arithmetic-binemit.clif +++ b/cranelift/filetests/filetests/isa/x86/simd-arithmetic-binemit.clif @@ -108,3 +108,9 @@ block0(v0: i64x2 [%xmm3], v1: i64x2 [%xmm5]): [-, %xmm3] v2 = 
x86_pmuludq v0, v1 ; bin: 66 0f f4 dd return v2 } + +function %pmaddwd(i16x8, i16x8) -> i32x4 { +block0(v0: i16x8 [%xmm8], v1: i16x8 [%xmm9]): +[-, %xmm8] v2 = widening_pairwise_dot_product_s v0, v1 ; bin: 66 45 0f f5 c1 + return v2 +}