From a31336996c184eeccdd7d2e5e120751fc56bf5a7 Mon Sep 17 00:00:00 2001 From: Johnnie Birch <45402135+jlb6740@users.noreply.github.com> Date: Mon, 17 Aug 2020 13:44:10 -0700 Subject: [PATCH] Add support for some packed multiplication for new x64 backend Adds support for i32x4 and i16x8, and lowering for pmuludq in preparation for i64x2. --- cranelift/codegen/src/isa/x64/inst/args.rs | 12 ++- cranelift/codegen/src/isa/x64/inst/emit.rs | 89 ++++++++++--------- .../codegen/src/isa/x64/inst/emit_tests.rs | 18 ++++ cranelift/codegen/src/isa/x64/lower.rs | 5 ++ 4 files changed, 80 insertions(+), 44 deletions(-) diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs index 9885102b66..600381496f 100644 --- a/cranelift/codegen/src/isa/x64/inst/args.rs +++ b/cranelift/codegen/src/isa/x64/inst/args.rs @@ -395,6 +395,9 @@ pub enum SseOpcode { Paddd, Paddq, Paddw, + Pmulld, + Pmullw, + Pmuludq, Psllw, Pslld, Psllq, @@ -491,6 +494,8 @@ impl SseOpcode { | SseOpcode::Paddd | SseOpcode::Paddq | SseOpcode::Paddw + | SseOpcode::Pmullw + | SseOpcode::Pmuludq | SseOpcode::Psllw | SseOpcode::Pslld | SseOpcode::Psllq @@ -510,7 +515,9 @@ impl SseOpcode { | SseOpcode::Ucomisd | SseOpcode::Xorpd => SSE2, - SseOpcode::Insertps | SseOpcode::Roundss | SseOpcode::Roundsd => SSE41, + SseOpcode::Insertps | SseOpcode::Pmulld | SseOpcode::Roundss | SseOpcode::Roundsd => { + SSE41 + } } } @@ -579,6 +586,9 @@ impl fmt::Debug for SseOpcode { SseOpcode::Paddd => "paddd", SseOpcode::Paddq => "paddq", SseOpcode::Paddw => "paddw", + SseOpcode::Pmulld => "pmulld", + SseOpcode::Pmullw => "pmullw", + SseOpcode::Pmuludq => "pmuludq", SseOpcode::Psllw => "psllw", SseOpcode::Pslld => "pslld", SseOpcode::Psllq => "psllq", diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index f0932b73c4..6118284674 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -1632,57 
+1632,60 @@ pub(crate) fn emit( dst: reg_g, } => { let rex = RexFlags::clear_w(); - let (prefix, opcode) = match op { - SseOpcode::Addps => (LegacyPrefix::None, 0x0F58), - SseOpcode::Addpd => (LegacyPrefix::_66, 0x0F58), - SseOpcode::Addss => (LegacyPrefix::_F3, 0x0F58), - SseOpcode::Addsd => (LegacyPrefix::_F2, 0x0F58), - SseOpcode::Andpd => (LegacyPrefix::_66, 0x0F54), - SseOpcode::Andps => (LegacyPrefix::None, 0x0F54), - SseOpcode::Andnps => (LegacyPrefix::None, 0x0F55), - SseOpcode::Andnpd => (LegacyPrefix::_66, 0x0F55), - SseOpcode::Divps => (LegacyPrefix::None, 0x0F5E), - SseOpcode::Divpd => (LegacyPrefix::_66, 0x0F5E), - SseOpcode::Divss => (LegacyPrefix::_F3, 0x0F5E), - SseOpcode::Divsd => (LegacyPrefix::_F2, 0x0F5E), - SseOpcode::Minps => (LegacyPrefix::None, 0x0F5D), - SseOpcode::Minpd => (LegacyPrefix::_66, 0x0F5D), - SseOpcode::Minss => (LegacyPrefix::_F3, 0x0F5D), - SseOpcode::Minsd => (LegacyPrefix::_F2, 0x0F5D), - SseOpcode::Maxps => (LegacyPrefix::None, 0x0F5F), - SseOpcode::Maxpd => (LegacyPrefix::_66, 0x0F5F), - SseOpcode::Maxss => (LegacyPrefix::_F3, 0x0F5F), - SseOpcode::Maxsd => (LegacyPrefix::_F2, 0x0F5F), - SseOpcode::Mulps => (LegacyPrefix::None, 0x0F59), - SseOpcode::Mulpd => (LegacyPrefix::_66, 0x0F59), - SseOpcode::Mulss => (LegacyPrefix::_F3, 0x0F59), - SseOpcode::Mulsd => (LegacyPrefix::_F2, 0x0F59), - SseOpcode::Orpd => (LegacyPrefix::_66, 0x0F56), - SseOpcode::Orps => (LegacyPrefix::None, 0x0F56), - SseOpcode::Paddb => (LegacyPrefix::_66, 0x0FFC), - SseOpcode::Paddd => (LegacyPrefix::_66, 0x0FFE), - SseOpcode::Paddq => (LegacyPrefix::_66, 0x0FD4), - SseOpcode::Paddw => (LegacyPrefix::_66, 0x0FFD), - SseOpcode::Psubb => (LegacyPrefix::_66, 0x0FF8), - SseOpcode::Psubd => (LegacyPrefix::_66, 0x0FFA), - SseOpcode::Psubq => (LegacyPrefix::_66, 0x0FFB), - SseOpcode::Psubw => (LegacyPrefix::_66, 0x0FF9), - SseOpcode::Subps => (LegacyPrefix::None, 0x0F5C), - SseOpcode::Subpd => (LegacyPrefix::_66, 0x0F5C), - SseOpcode::Subss => 
(LegacyPrefix::_F3, 0x0F5C), - SseOpcode::Subsd => (LegacyPrefix::_F2, 0x0F5C), - SseOpcode::Xorps => (LegacyPrefix::None, 0x0F57), - SseOpcode::Xorpd => (LegacyPrefix::_66, 0x0F57), + let (prefix, opcode, length) = match op { + SseOpcode::Addps => (LegacyPrefix::None, 0x0F58, 2), + SseOpcode::Addpd => (LegacyPrefix::_66, 0x0F58, 2), + SseOpcode::Addss => (LegacyPrefix::_F3, 0x0F58, 2), + SseOpcode::Addsd => (LegacyPrefix::_F2, 0x0F58, 2), + SseOpcode::Andpd => (LegacyPrefix::_66, 0x0F54, 2), + SseOpcode::Andps => (LegacyPrefix::None, 0x0F54, 2), + SseOpcode::Andnps => (LegacyPrefix::None, 0x0F55, 2), + SseOpcode::Andnpd => (LegacyPrefix::_66, 0x0F55, 2), + SseOpcode::Divps => (LegacyPrefix::None, 0x0F5E, 2), + SseOpcode::Divpd => (LegacyPrefix::_66, 0x0F5E, 2), + SseOpcode::Divss => (LegacyPrefix::_F3, 0x0F5E, 2), + SseOpcode::Divsd => (LegacyPrefix::_F2, 0x0F5E, 2), + SseOpcode::Minps => (LegacyPrefix::None, 0x0F5D, 2), + SseOpcode::Minpd => (LegacyPrefix::_66, 0x0F5D, 2), + SseOpcode::Minss => (LegacyPrefix::_F3, 0x0F5D, 2), + SseOpcode::Minsd => (LegacyPrefix::_F2, 0x0F5D, 2), + SseOpcode::Maxps => (LegacyPrefix::None, 0x0F5F, 2), + SseOpcode::Maxpd => (LegacyPrefix::_66, 0x0F5F, 2), + SseOpcode::Maxss => (LegacyPrefix::_F3, 0x0F5F, 2), + SseOpcode::Maxsd => (LegacyPrefix::_F2, 0x0F5F, 2), + SseOpcode::Mulps => (LegacyPrefix::None, 0x0F59, 2), + SseOpcode::Mulpd => (LegacyPrefix::_66, 0x0F59, 2), + SseOpcode::Mulss => (LegacyPrefix::_F3, 0x0F59, 2), + SseOpcode::Mulsd => (LegacyPrefix::_F2, 0x0F59, 2), + SseOpcode::Orpd => (LegacyPrefix::_66, 0x0F56, 2), + SseOpcode::Orps => (LegacyPrefix::None, 0x0F56, 2), + SseOpcode::Paddb => (LegacyPrefix::_66, 0x0FFC, 2), + SseOpcode::Paddd => (LegacyPrefix::_66, 0x0FFE, 2), + SseOpcode::Paddq => (LegacyPrefix::_66, 0x0FD4, 2), + SseOpcode::Paddw => (LegacyPrefix::_66, 0x0FFD, 2), + SseOpcode::Pmulld => (LegacyPrefix::_66, 0x0F3840, 3), + SseOpcode::Pmullw => (LegacyPrefix::_66, 0x0FD5, 2), + SseOpcode::Pmuludq => 
(LegacyPrefix::_66, 0x0FF4, 2), + SseOpcode::Psubb => (LegacyPrefix::_66, 0x0FF8, 2), + SseOpcode::Psubd => (LegacyPrefix::_66, 0x0FFA, 2), + SseOpcode::Psubq => (LegacyPrefix::_66, 0x0FFB, 2), + SseOpcode::Psubw => (LegacyPrefix::_66, 0x0FF9, 2), + SseOpcode::Subps => (LegacyPrefix::None, 0x0F5C, 2), + SseOpcode::Subpd => (LegacyPrefix::_66, 0x0F5C, 2), + SseOpcode::Subss => (LegacyPrefix::_F3, 0x0F5C, 2), + SseOpcode::Subsd => (LegacyPrefix::_F2, 0x0F5C, 2), + SseOpcode::Xorps => (LegacyPrefix::None, 0x0F57, 2), + SseOpcode::Xorpd => (LegacyPrefix::_66, 0x0F57, 2), _ => unimplemented!("Opcode {:?} not implemented", op), }; match src_e { RegMem::Reg { reg: reg_e } => { - emit_std_reg_reg(sink, prefix, opcode, 2, reg_g.to_reg(), *reg_e, rex); + emit_std_reg_reg(sink, prefix, opcode, length, reg_g.to_reg(), *reg_e, rex); } RegMem::Mem { addr } => { let addr = &addr.finalize(state); - emit_std_reg_mem(sink, prefix, opcode, 2, reg_g.to_reg(), addr, rex); + emit_std_reg_mem(sink, prefix, opcode, length, reg_g.to_reg(), addr, rex); } } } diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs index 877c2282b4..e0f2ea1acd 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs @@ -3062,6 +3062,24 @@ fn test_x64_emit() { "psubq %xmm8, %xmm1", )); + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pmulld, RegMem::reg(xmm15), w_xmm6), + "66410F3840F7", + "pmulld %xmm15, %xmm6", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pmullw, RegMem::reg(xmm14), w_xmm1), + "66410FD5CE", + "pmullw %xmm14, %xmm1", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pmuludq, RegMem::reg(xmm8), w_xmm9), + "66450FF4C8", + "pmuludq %xmm8, %xmm9", + )); + // XMM_Mov_R_M: float stores insns.push(( Inst::xmm_mov_r_m(SseOpcode::Movss, xmm15, Amode::imm_reg(128, r12), None), diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index 
66d16c894d..f4eb306882 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -357,6 +357,11 @@ fn lower_insn_to_regs>( types::I64X2 => SseOpcode::Psubq, _ => panic!("Unsupported type for packed Isub instruction"), }, + Opcode::Imul => match ty { + types::I16X8 => SseOpcode::Pmullw, + types::I32X4 => SseOpcode::Pmulld, + _ => panic!("Unsupported type for packed Imul instruction"), + }, _ => panic!("Unsupported packed instruction"), }; let lhs = input_to_reg(ctx, inputs[0]);