Add support for some packed multiplication for new x64 backend

Adds support for i32x4 and i16x8, and lowering for pmuludq in
preparation for i64x2.
This commit is contained in:
Johnnie Birch
2020-08-17 13:44:10 -07:00
parent 81b3450114
commit a31336996c
4 changed files with 80 additions and 44 deletions

View File

@@ -395,6 +395,9 @@ pub enum SseOpcode {
Paddd,
Paddq,
Paddw,
Pmulld,
Pmullw,
Pmuludq,
Psllw,
Pslld,
Psllq,
@@ -491,6 +494,8 @@ impl SseOpcode {
| SseOpcode::Paddd
| SseOpcode::Paddq
| SseOpcode::Paddw
| SseOpcode::Pmullw
| SseOpcode::Pmuludq
| SseOpcode::Psllw
| SseOpcode::Pslld
| SseOpcode::Psllq
@@ -510,7 +515,9 @@ impl SseOpcode {
| SseOpcode::Ucomisd
| SseOpcode::Xorpd => SSE2,
SseOpcode::Insertps | SseOpcode::Roundss | SseOpcode::Roundsd => SSE41,
SseOpcode::Insertps | SseOpcode::Pmulld | SseOpcode::Roundss | SseOpcode::Roundsd => {
SSE41
}
}
}
@@ -579,6 +586,9 @@ impl fmt::Debug for SseOpcode {
SseOpcode::Paddd => "paddd",
SseOpcode::Paddq => "paddq",
SseOpcode::Paddw => "paddw",
SseOpcode::Pmulld => "pmulld",
SseOpcode::Pmullw => "pmullw",
SseOpcode::Pmuludq => "pmuludq",
SseOpcode::Psllw => "psllw",
SseOpcode::Pslld => "pslld",
SseOpcode::Psllq => "psllq",

View File

@@ -1632,57 +1632,60 @@ pub(crate) fn emit(
dst: reg_g,
} => {
let rex = RexFlags::clear_w();
let (prefix, opcode) = match op {
SseOpcode::Addps => (LegacyPrefix::None, 0x0F58),
SseOpcode::Addpd => (LegacyPrefix::_66, 0x0F58),
SseOpcode::Addss => (LegacyPrefix::_F3, 0x0F58),
SseOpcode::Addsd => (LegacyPrefix::_F2, 0x0F58),
SseOpcode::Andpd => (LegacyPrefix::_66, 0x0F54),
SseOpcode::Andps => (LegacyPrefix::None, 0x0F54),
SseOpcode::Andnps => (LegacyPrefix::None, 0x0F55),
SseOpcode::Andnpd => (LegacyPrefix::_66, 0x0F55),
SseOpcode::Divps => (LegacyPrefix::None, 0x0F5E),
SseOpcode::Divpd => (LegacyPrefix::_66, 0x0F5E),
SseOpcode::Divss => (LegacyPrefix::_F3, 0x0F5E),
SseOpcode::Divsd => (LegacyPrefix::_F2, 0x0F5E),
SseOpcode::Minps => (LegacyPrefix::None, 0x0F5D),
SseOpcode::Minpd => (LegacyPrefix::_66, 0x0F5D),
SseOpcode::Minss => (LegacyPrefix::_F3, 0x0F5D),
SseOpcode::Minsd => (LegacyPrefix::_F2, 0x0F5D),
SseOpcode::Maxps => (LegacyPrefix::None, 0x0F5F),
SseOpcode::Maxpd => (LegacyPrefix::_66, 0x0F5F),
SseOpcode::Maxss => (LegacyPrefix::_F3, 0x0F5F),
SseOpcode::Maxsd => (LegacyPrefix::_F2, 0x0F5F),
SseOpcode::Mulps => (LegacyPrefix::None, 0x0F59),
SseOpcode::Mulpd => (LegacyPrefix::_66, 0x0F59),
SseOpcode::Mulss => (LegacyPrefix::_F3, 0x0F59),
SseOpcode::Mulsd => (LegacyPrefix::_F2, 0x0F59),
SseOpcode::Orpd => (LegacyPrefix::_66, 0x0F56),
SseOpcode::Orps => (LegacyPrefix::None, 0x0F56),
SseOpcode::Paddb => (LegacyPrefix::_66, 0x0FFC),
SseOpcode::Paddd => (LegacyPrefix::_66, 0x0FFE),
SseOpcode::Paddq => (LegacyPrefix::_66, 0x0FD4),
SseOpcode::Paddw => (LegacyPrefix::_66, 0x0FFD),
SseOpcode::Psubb => (LegacyPrefix::_66, 0x0FF8),
SseOpcode::Psubd => (LegacyPrefix::_66, 0x0FFA),
SseOpcode::Psubq => (LegacyPrefix::_66, 0x0FFB),
SseOpcode::Psubw => (LegacyPrefix::_66, 0x0FF9),
SseOpcode::Subps => (LegacyPrefix::None, 0x0F5C),
SseOpcode::Subpd => (LegacyPrefix::_66, 0x0F5C),
SseOpcode::Subss => (LegacyPrefix::_F3, 0x0F5C),
SseOpcode::Subsd => (LegacyPrefix::_F2, 0x0F5C),
SseOpcode::Xorps => (LegacyPrefix::None, 0x0F57),
SseOpcode::Xorpd => (LegacyPrefix::_66, 0x0F57),
let (prefix, opcode, length) = match op {
SseOpcode::Addps => (LegacyPrefix::None, 0x0F58, 2),
SseOpcode::Addpd => (LegacyPrefix::_66, 0x0F58, 2),
SseOpcode::Addss => (LegacyPrefix::_F3, 0x0F58, 2),
SseOpcode::Addsd => (LegacyPrefix::_F2, 0x0F58, 2),
SseOpcode::Andpd => (LegacyPrefix::_66, 0x0F54, 2),
SseOpcode::Andps => (LegacyPrefix::None, 0x0F54, 2),
SseOpcode::Andnps => (LegacyPrefix::None, 0x0F55, 2),
SseOpcode::Andnpd => (LegacyPrefix::_66, 0x0F55, 2),
SseOpcode::Divps => (LegacyPrefix::None, 0x0F5E, 2),
SseOpcode::Divpd => (LegacyPrefix::_66, 0x0F5E, 2),
SseOpcode::Divss => (LegacyPrefix::_F3, 0x0F5E, 2),
SseOpcode::Divsd => (LegacyPrefix::_F2, 0x0F5E, 2),
SseOpcode::Minps => (LegacyPrefix::None, 0x0F5D, 2),
SseOpcode::Minpd => (LegacyPrefix::_66, 0x0F5D, 2),
SseOpcode::Minss => (LegacyPrefix::_F3, 0x0F5D, 2),
SseOpcode::Minsd => (LegacyPrefix::_F2, 0x0F5D, 2),
SseOpcode::Maxps => (LegacyPrefix::None, 0x0F5F, 2),
SseOpcode::Maxpd => (LegacyPrefix::_66, 0x0F5F, 2),
SseOpcode::Maxss => (LegacyPrefix::_F3, 0x0F5F, 2),
SseOpcode::Maxsd => (LegacyPrefix::_F2, 0x0F5F, 2),
SseOpcode::Mulps => (LegacyPrefix::None, 0x0F59, 2),
SseOpcode::Mulpd => (LegacyPrefix::_66, 0x0F59, 2),
SseOpcode::Mulss => (LegacyPrefix::_F3, 0x0F59, 2),
SseOpcode::Mulsd => (LegacyPrefix::_F2, 0x0F59, 2),
SseOpcode::Orpd => (LegacyPrefix::_66, 0x0F56, 2),
SseOpcode::Orps => (LegacyPrefix::None, 0x0F56, 2),
SseOpcode::Paddb => (LegacyPrefix::_66, 0x0FFC, 2),
SseOpcode::Paddd => (LegacyPrefix::_66, 0x0FFE, 2),
SseOpcode::Paddq => (LegacyPrefix::_66, 0x0FD4, 2),
SseOpcode::Paddw => (LegacyPrefix::_66, 0x0FFD, 2),
SseOpcode::Pmulld => (LegacyPrefix::_66, 0x0F3840, 3),
SseOpcode::Pmullw => (LegacyPrefix::_66, 0x0FD5, 2),
SseOpcode::Pmuludq => (LegacyPrefix::_66, 0x0FF4, 2),
SseOpcode::Psubb => (LegacyPrefix::_66, 0x0FF8, 2),
SseOpcode::Psubd => (LegacyPrefix::_66, 0x0FFA, 2),
SseOpcode::Psubq => (LegacyPrefix::_66, 0x0FFB, 2),
SseOpcode::Psubw => (LegacyPrefix::_66, 0x0FF9, 2),
SseOpcode::Subps => (LegacyPrefix::None, 0x0F5C, 2),
SseOpcode::Subpd => (LegacyPrefix::_66, 0x0F5C, 2),
SseOpcode::Subss => (LegacyPrefix::_F3, 0x0F5C, 2),
SseOpcode::Subsd => (LegacyPrefix::_F2, 0x0F5C, 2),
SseOpcode::Xorps => (LegacyPrefix::None, 0x0F57, 2),
SseOpcode::Xorpd => (LegacyPrefix::_66, 0x0F57, 2),
_ => unimplemented!("Opcode {:?} not implemented", op),
};
match src_e {
RegMem::Reg { reg: reg_e } => {
emit_std_reg_reg(sink, prefix, opcode, 2, reg_g.to_reg(), *reg_e, rex);
emit_std_reg_reg(sink, prefix, opcode, length, reg_g.to_reg(), *reg_e, rex);
}
RegMem::Mem { addr } => {
let addr = &addr.finalize(state);
emit_std_reg_mem(sink, prefix, opcode, 2, reg_g.to_reg(), addr, rex);
emit_std_reg_mem(sink, prefix, opcode, length, reg_g.to_reg(), addr, rex);
}
}
}

View File

@@ -3062,6 +3062,24 @@ fn test_x64_emit() {
"psubq %xmm8, %xmm1",
));
insns.push((
Inst::xmm_rm_r(SseOpcode::Pmulld, RegMem::reg(xmm15), w_xmm6),
"66410F3840F7",
"pmulld %xmm15, %xmm6",
));
insns.push((
Inst::xmm_rm_r(SseOpcode::Pmullw, RegMem::reg(xmm14), w_xmm1),
"66410FD5CE",
"pmullw %xmm14, %xmm1",
));
insns.push((
Inst::xmm_rm_r(SseOpcode::Pmuludq, RegMem::reg(xmm8), w_xmm9),
"66450FF4C8",
"pmuludq %xmm8, %xmm9",
));
// XMM_Mov_R_M: float stores
insns.push((
Inst::xmm_mov_r_m(SseOpcode::Movss, xmm15, Amode::imm_reg(128, r12), None),

View File

@@ -357,6 +357,11 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
types::I64X2 => SseOpcode::Psubq,
_ => panic!("Unsupported type for packed Isub instruction"),
},
Opcode::Imul => match ty {
types::I16X8 => SseOpcode::Pmullw,
types::I32X4 => SseOpcode::Pmulld,
_ => panic!("Unsupported type for packed Imul instruction"),
},
_ => panic!("Unsupported packed instruction"),
};
let lhs = input_to_reg(ctx, inputs[0]);