[SIMD][x86_64] Add encoding for PMADDWD (#2530)
* [SIMD][x86_64] Add encoding for PMADDWD * also for "experimental_x64"
This commit is contained in:
@@ -1691,6 +1691,7 @@ fn define_simd(
|
|||||||
let usub_sat = shared.by_name("usub_sat");
|
let usub_sat = shared.by_name("usub_sat");
|
||||||
let vconst = shared.by_name("vconst");
|
let vconst = shared.by_name("vconst");
|
||||||
let vselect = shared.by_name("vselect");
|
let vselect = shared.by_name("vselect");
|
||||||
|
let widening_pairwise_dot_product_s = shared.by_name("widening_pairwise_dot_product_s");
|
||||||
let x86_cvtt2si = x86.by_name("x86_cvtt2si");
|
let x86_cvtt2si = x86.by_name("x86_cvtt2si");
|
||||||
let x86_insertps = x86.by_name("x86_insertps");
|
let x86_insertps = x86.by_name("x86_insertps");
|
||||||
let x86_fmax = x86.by_name("x86_fmax");
|
let x86_fmax = x86.by_name("x86_fmax");
|
||||||
@@ -2213,6 +2214,9 @@ fn define_simd(
|
|||||||
// SIMD multiplication with lane expansion.
|
// SIMD multiplication with lane expansion.
|
||||||
e.enc_both_inferred(x86_pmuludq, rec_fa.opcodes(&PMULUDQ));
|
e.enc_both_inferred(x86_pmuludq, rec_fa.opcodes(&PMULUDQ));
|
||||||
|
|
||||||
|
// SIMD multiplication followed by addition of adjacent pairs (PMADDWD), from SSE2.
|
||||||
|
e.enc_both_inferred(widening_pairwise_dot_product_s, rec_fa.opcodes(&PMADDWD));
|
||||||
|
|
||||||
// SIMD integer multiplication for I64x2 using AVX512.
|
// SIMD integer multiplication for I64x2 using AVX512.
|
||||||
{
|
{
|
||||||
e.enc_32_64_maybe_isap(
|
e.enc_32_64_maybe_isap(
|
||||||
|
|||||||
@@ -508,6 +508,9 @@ pub static VPMULLQ: [u8; 4] = [0x66, 0x0f, 0x38, 0x40];
|
|||||||
/// in xmm2/m128, and store the quadword results in xmm1 (SSE2).
|
/// in xmm2/m128, and store the quadword results in xmm1 (SSE2).
|
||||||
pub static PMULUDQ: [u8; 3] = [0x66, 0x0f, 0xf4];
|
pub static PMULUDQ: [u8; 3] = [0x66, 0x0f, 0xf4];
|
||||||
|
|
||||||
|
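/// Multiply the packed word integers in xmm1 by the packed word integers in xmm2/m128, add
/// adjacent doubleword results, and store the doubleword results in xmm1 (SSE2).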
|
||||||
|
pub static PMADDWD: [u8; 3] = [0x66, 0x0f, 0xf5];
|
||||||
|
|
||||||
/// Pop top of stack into r{16,32,64}; increment stack pointer.
|
/// Pop top of stack into r{16,32,64}; increment stack pointer.
|
||||||
pub static POP_REG: [u8; 1] = [0x58];
|
pub static POP_REG: [u8; 1] = [0x58];
|
||||||
|
|
||||||
|
|||||||
@@ -498,6 +498,7 @@ pub enum SseOpcode {
|
|||||||
Pinsrb,
|
Pinsrb,
|
||||||
Pinsrw,
|
Pinsrw,
|
||||||
Pinsrd,
|
Pinsrd,
|
||||||
|
Pmaddwd,
|
||||||
Pmaxsb,
|
Pmaxsb,
|
||||||
Pmaxsw,
|
Pmaxsw,
|
||||||
Pmaxsd,
|
Pmaxsd,
|
||||||
@@ -661,6 +662,7 @@ impl SseOpcode {
|
|||||||
| SseOpcode::Pcmpgtd
|
| SseOpcode::Pcmpgtd
|
||||||
| SseOpcode::Pextrw
|
| SseOpcode::Pextrw
|
||||||
| SseOpcode::Pinsrw
|
| SseOpcode::Pinsrw
|
||||||
|
| SseOpcode::Pmaddwd
|
||||||
| SseOpcode::Pmaxsw
|
| SseOpcode::Pmaxsw
|
||||||
| SseOpcode::Pmaxub
|
| SseOpcode::Pmaxub
|
||||||
| SseOpcode::Pminsw
|
| SseOpcode::Pminsw
|
||||||
@@ -842,6 +844,7 @@ impl fmt::Debug for SseOpcode {
|
|||||||
SseOpcode::Pinsrb => "pinsrb",
|
SseOpcode::Pinsrb => "pinsrb",
|
||||||
SseOpcode::Pinsrw => "pinsrw",
|
SseOpcode::Pinsrw => "pinsrw",
|
||||||
SseOpcode::Pinsrd => "pinsrd",
|
SseOpcode::Pinsrd => "pinsrd",
|
||||||
|
SseOpcode::Pmaddwd => "pmaddwd",
|
||||||
SseOpcode::Pmaxsb => "pmaxsb",
|
SseOpcode::Pmaxsb => "pmaxsb",
|
||||||
SseOpcode::Pmaxsw => "pmaxsw",
|
SseOpcode::Pmaxsw => "pmaxsw",
|
||||||
SseOpcode::Pmaxsd => "pmaxsd",
|
SseOpcode::Pmaxsd => "pmaxsd",
|
||||||
|
|||||||
@@ -1873,6 +1873,7 @@ pub(crate) fn emit(
|
|||||||
SseOpcode::Pcmpgtw => (LegacyPrefixes::_66, 0x0F65, 2),
|
SseOpcode::Pcmpgtw => (LegacyPrefixes::_66, 0x0F65, 2),
|
||||||
SseOpcode::Pcmpgtd => (LegacyPrefixes::_66, 0x0F66, 2),
|
SseOpcode::Pcmpgtd => (LegacyPrefixes::_66, 0x0F66, 2),
|
||||||
SseOpcode::Pcmpgtq => (LegacyPrefixes::_66, 0x0F3837, 3),
|
SseOpcode::Pcmpgtq => (LegacyPrefixes::_66, 0x0F3837, 3),
|
||||||
|
SseOpcode::Pmaddwd => (LegacyPrefixes::_66, 0x0FF5, 2),
|
||||||
SseOpcode::Pmaxsb => (LegacyPrefixes::_66, 0x0F383C, 3),
|
SseOpcode::Pmaxsb => (LegacyPrefixes::_66, 0x0F383C, 3),
|
||||||
SseOpcode::Pmaxsw => (LegacyPrefixes::_66, 0x0FEE, 2),
|
SseOpcode::Pmaxsw => (LegacyPrefixes::_66, 0x0FEE, 2),
|
||||||
SseOpcode::Pmaxsd => (LegacyPrefixes::_66, 0x0F383D, 3),
|
SseOpcode::Pmaxsd => (LegacyPrefixes::_66, 0x0F383D, 3),
|
||||||
|
|||||||
@@ -3067,6 +3067,12 @@ fn test_x64_emit() {
|
|||||||
"pmuludq %xmm8, %xmm9",
|
"pmuludq %xmm8, %xmm9",
|
||||||
));
|
));
|
||||||
|
|
||||||
|
insns.push((
|
||||||
|
Inst::xmm_rm_r(SseOpcode::Pmaddwd, RegMem::reg(xmm8), w_xmm1),
|
||||||
|
"66410FF5C8",
|
||||||
|
"pmaddwd %xmm8, %xmm1",
|
||||||
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Pmaxsb, RegMem::reg(xmm15), w_xmm6),
|
Inst::xmm_rm_r(SseOpcode::Pmaxsb, RegMem::reg(xmm15), w_xmm6),
|
||||||
"66410F383CF7",
|
"66410F383CF7",
|
||||||
|
|||||||
@@ -2235,6 +2235,24 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Opcode::WideningPairwiseDotProductS => {
|
||||||
|
let lhs = put_input_in_reg(ctx, inputs[0]);
|
||||||
|
let rhs = input_to_reg_mem(ctx, inputs[1]);
|
||||||
|
let dst = get_output_reg(ctx, outputs[0]);
|
||||||
|
let ty = ty.unwrap();
|
||||||
|
|
||||||
|
ctx.emit(Inst::gen_move(dst, lhs, ty));
|
||||||
|
|
||||||
|
if ty == types::I32X4 {
|
||||||
|
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmaddwd, rhs, dst));
|
||||||
|
} else {
|
||||||
|
panic!(
|
||||||
|
"Opcode::WideningPairwiseDotProductS: unsupported laneage: {:?}",
|
||||||
|
ty
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
Opcode::Fadd | Opcode::Fsub | Opcode::Fmul | Opcode::Fdiv => {
|
Opcode::Fadd | Opcode::Fsub | Opcode::Fmul | Opcode::Fdiv => {
|
||||||
let lhs = put_input_in_reg(ctx, inputs[0]);
|
let lhs = put_input_in_reg(ctx, inputs[0]);
|
||||||
let rhs = input_to_reg_mem(ctx, inputs[1]);
|
let rhs = input_to_reg_mem(ctx, inputs[1]);
|
||||||
|
|||||||
@@ -108,3 +108,9 @@ block0(v0: i64x2 [%xmm3], v1: i64x2 [%xmm5]):
|
|||||||
[-, %xmm3] v2 = x86_pmuludq v0, v1 ; bin: 66 0f f4 dd
|
[-, %xmm3] v2 = x86_pmuludq v0, v1 ; bin: 66 0f f4 dd
|
||||||
return v2
|
return v2
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function %pmaddwd(i16x8, i16x8) -> i32x4 {
|
||||||
|
block0(v0: i16x8 [%xmm8], v1: i16x8 [%xmm9]):
|
||||||
|
[-, %xmm8] v2 = widening_pairwise_dot_product_s v0, v1 ; bin: 66 45 0f f5 c1
|
||||||
|
return v2
|
||||||
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user