Add x86-specific instruction for i64x2 multiplication

Without this special instruction, it is impossible to legalize an i64x2 imul to both the single AVX512 instruction and an equivalent SSE instruction sequence (the SSE fallback is sketched below). This extra instruction will be rendered unnecessary by the new x64 backend.
Andrew Brown
2020-05-20 16:17:16 -07:00
parent 8701645493
commit 9ba9fd0f64
6 changed files with 35 additions and 3 deletions
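
For context, the SSE-only fallback the commit message alludes to is the standard decomposition of a 64x2 multiply into 32-bit partial products. Below is a minimal sketch in Rust intrinsics; it is illustrative only -- this commit wires up just the AVX512 encoding, and the helper name mul_i64x2_sse2 is invented here:

#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

// i64x2 multiply using only SSE2. Splitting each 64-bit lane into 32-bit
// halves gives a * b = lo(a)*lo(b) + ((lo(a)*hi(b) + hi(a)*lo(b)) << 32),
// where each partial product comes from pmuludq (32x32 -> 64, unsigned).
// The hi(a)*hi(b) term vanishes mod 2^64, which is exactly the lane-wise
// wrapping the new instruction's docstring describes.
#[cfg(target_arch = "x86_64")]
fn mul_i64x2_sse2(a: __m128i, b: __m128i) -> __m128i {
    // SAFETY: SSE2 is baseline on x86_64, so these intrinsics are always available.
    unsafe {
        let lo = _mm_mul_epu32(a, b);                       // lo(a) * lo(b)
        let c1 = _mm_mul_epu32(a, _mm_srli_epi64::<32>(b)); // lo(a) * hi(b)
        let c2 = _mm_mul_epu32(_mm_srli_epi64::<32>(a), b); // hi(a) * lo(b)
        let cross = _mm_slli_epi64::<32>(_mm_add_epi64(c1, c2));
        _mm_add_epi64(lo, cross)
    }
}

On AVX512DQ hardware, this whole sequence collapses into the single VPMULLQ emitted by the encoding added below.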

View File

@@ -1645,6 +1645,7 @@ fn define_simd(
     let x86_pmaxu = x86.by_name("x86_pmaxu");
     let x86_pmins = x86.by_name("x86_pmins");
     let x86_pminu = x86.by_name("x86_pminu");
+    let x86_pmullq = x86.by_name("x86_pmullq");
     let x86_pshufb = x86.by_name("x86_pshufb");
     let x86_pshufd = x86.by_name("x86_pshufd");
     let x86_psll = x86.by_name("x86_psll");
@@ -2101,9 +2102,8 @@ fn define_simd(
     // SIMD integer multiplication for I64x2 using AVX512.
     {
-        let imul = imul.bind(vector(I64, sse_vector_size));
         e.enc_32_64_maybe_isap(
-            imul,
+            x86_pmullq,
             rec_evex_reg_vvvv_rm_128.opcodes(&PMULLQ).w(),
             Some(use_avx512dq_simd), // TODO need an OR predicate to join with AVX512VL
         );

View File

@@ -532,6 +532,23 @@ pub(crate) fn define(
         .operands_out(vec![a]),
     );
 
+    let x = &Operand::new("x", I64x2);
+    let y = &Operand::new("y", I64x2);
+    let a = &Operand::new("a", I64x2);
+    ig.push(
+        Inst::new(
+            "x86_pmullq",
+            r#"
+        Multiply Packed Integers -- Multiply two I64x2 vectors lane-wise, wrapping on
+        overflow, to produce an I64x2 result. This instruction exists to provide a distinct
+        encoding for CPUs with newer vector features.
+        "#,
+            &formats.binary,
+        )
+        .operands_in(vec![x, y])
+        .operands_out(vec![a]),
+    );
+
     let x = &Operand::new("x", TxN);
     let y = &Operand::new("y", TxN);
     let f = &Operand::new("f", iflags);
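
The lane-wise wrapping described in the docstring is ordinary modular 64-bit multiplication applied per lane. A scalar Rust model -- a hypothetical helper, not Cranelift code:

// Hypothetical scalar model of x86_pmullq's semantics: each 64-bit lane is
// multiplied independently with wrapping (modular) arithmetic.
fn pmullq(x: [i64; 2], y: [i64; 2]) -> [i64; 2] {
    [x[0].wrapping_mul(y[0]), x[1].wrapping_mul(y[1])]
}

fn main() {
    assert_eq!(pmullq([1, 2], [2, 2]), [2, 4]); // same operands as the binary test below
    assert_eq!(pmullq([i64::MAX, 0], [2, 0])[0], -2); // overflow wraps, no trap
}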

View File

@@ -359,6 +359,7 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
     let icmp = insts.by_name("icmp");
     let imax = insts.by_name("imax");
     let imin = insts.by_name("imin");
+    let imul = insts.by_name("imul");
     let ineg = insts.by_name("ineg");
     let insertlane = insts.by_name("insertlane");
     let ishl = insts.by_name("ishl");
@@ -763,6 +764,12 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
         );
     }
 
+    // SIMD imul
+    {
+        let imul = imul.bind(vector(I64, sse_vector_size));
+        narrow.legalize(def!(c = imul(a, b)), vec![def!(c = x86_pmullq(a, b))]);
+    }
+
     narrow.custom_legalize(shuffle, "convert_shuffle");
     narrow.custom_legalize(extractlane, "convert_extractlane");
     narrow.custom_legalize(insertlane, "convert_insertlane");

View File

@@ -1911,6 +1911,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         | Opcode::X86Pmaxu
         | Opcode::X86Pmins
         | Opcode::X86Pminu
+        | Opcode::X86Pmullq
         | Opcode::X86Packss
         | Opcode::X86Punpckh
         | Opcode::X86Punpckl

View File

@@ -69,3 +69,10 @@ block0:
 ; nextln: v1 = band v0, v4
     return
 }
+
+function %imul(i64x2, i64x2) {
+block0(v0: i64x2, v1: i64x2):
+    v2 = imul v0, v1
+    ; check: v2 = x86_pmullq v0, v1
+    return
+}

View File

@@ -6,7 +6,7 @@ function %imul_i64x2() {
 block0:
     [-, %xmm1] v0 = vconst.i64x2 [1 2]
     [-, %xmm2] v1 = vconst.i64x2 [2 2]
-    [-, %xmm14] v2 = imul v0, v1 ; bin: 62 72 f5 08 40 f2
+    [-, %xmm14] v2 = x86_pmullq v0, v1 ; bin: 62 72 f5 08 40 f2
     ; 62, mandatory EVEX prefix
     ; 72 = 0111 0010, R is set (MSB in %xmm14) while X, B, and R' are unset (note these are all inverted); mm is set to 0F38
     ; f5 = 1111 0101, W is set (64-bit op), vvvv set to 1 (inverted), bit 2 always set, pp set to 01
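
To make the byte-by-byte commentary above concrete, here is a small standalone sketch that assembles the same EVEX-encoded instruction and checks it against the test's expected bytes. It is not Cranelift's encoder, and evex_vpmullq is a name invented here:

// EVEX prefix layout for VPMULLQ xmm, xmm, xmm, per the commentary above
// (the register-extension bits are stored inverted). `reg` is the destination,
// `vvvv` the first source, `rm` the second source, all in xmm0-xmm15.
fn evex_vpmullq(reg: u8, vvvv: u8, rm: u8) -> [u8; 6] {
    // P0: R X B R' 0 0 m m -- all four extension bits inverted; mm=0b10 selects the 0F38 map.
    let p0: u8 = (((reg >> 3) & 1) ^ 1) << 7 // R: fourth bit of the destination register
        | 1 << 6  // X: no index-register extension
        | 1 << 5  // B: no base-register extension
        | 1 << 4  // R': destination below xmm16
        | 0b10;   // mm: 0F38 opcode map
    // P1: W vvvv 1 pp -- W=1 (64-bit op), vvvv stored inverted, pp=0b01 (66 prefix).
    let p1: u8 = 1 << 7 | ((!vvvv & 0xF) << 3) | 1 << 2 | 0b01;
    // P2: z L'L b V' aaa -- 128-bit vector length (L'L=00), V'=1, no masking.
    let p2: u8 = 0b0000_1000;
    // ModRM: mod=0b11 (register direct), reg, rm.
    let modrm: u8 = 0b11 << 6 | (reg & 7) << 3 | (rm & 7);
    [0x62, p0, p1, p2, 0x40, modrm] // 0x40 is the PMULLQ opcode byte
}

fn main() {
    // xmm14 = xmm1 * xmm2, matching `; bin: 62 72 f5 08 40 f2` in the test above.
    assert_eq!(evex_vpmullq(14, 1, 2), [0x62, 0x72, 0xF5, 0x08, 0x40, 0xF2]);
}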