x64: lower i64x2.imul to VPMULLQ when possible

This adds the machinery to encode the VPMULLQ instruction, which is
available when both the AVX512VL and AVX512DQ feature sets are present.
When they are, we lower i64x2.imul to this single instruction instead of
a lengthy 12-instruction sequence.
This commit is contained in:
Andrew Brown
2021-05-10 16:25:03 -07:00
parent 5929a5e6ee
commit e676589b0c
5 changed files with 195 additions and 91 deletions

View File

@@ -462,6 +462,7 @@ pub(crate) enum InstructionSet {
BMI2,
AVX512F,
AVX512VL,
AVX512DQ,
}
/// Some SSE operations requiring 2 operands r/m and r.
@@ -994,6 +995,7 @@ impl fmt::Display for SseOpcode {
/// Opcodes of the AVX512 instructions known to this backend.
#[derive(Clone)]
pub enum Avx512Opcode {
/// Printed as `vpabsq`; `available_from` lists AVX512F and AVX512VL for it.
Vpabsq,
/// Printed as `vpmullq`; `available_from` lists AVX512VL and AVX512DQ for it.
Vpmullq,
}
impl Avx512Opcode {
@@ -1001,6 +1003,7 @@ impl Avx512Opcode {
pub(crate) fn available_from(&self) -> SmallVec<[InstructionSet; 2]> {
match self {
Avx512Opcode::Vpabsq => smallvec![InstructionSet::AVX512F, InstructionSet::AVX512VL],
Avx512Opcode::Vpmullq => smallvec![InstructionSet::AVX512VL, InstructionSet::AVX512DQ],
}
}
}
@@ -1009,6 +1012,7 @@ impl fmt::Debug for Avx512Opcode {
/// Writes the lowercase assembly mnemonic of the opcode.
fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
    // The mnemonic is a plain string literal, so it can be written directly
    // without going through the formatting machinery.
    fmt.write_str(match self {
        Avx512Opcode::Vpabsq => "vpabsq",
        Avx512Opcode::Vpmullq => "vpmullq",
    })
}

View File

@@ -128,6 +128,7 @@ pub(crate) fn emit(
InstructionSet::BMI2 => info.isa_flags.has_bmi2(),
InstructionSet::AVX512F => info.isa_flags.has_avx512f(),
InstructionSet::AVX512VL => info.isa_flags.has_avx512vl(),
InstructionSet::AVX512DQ => info.isa_flags.has_avx512dq(),
}
};
@@ -1409,6 +1410,7 @@ pub(crate) fn emit(
Inst::XmmUnaryRmREvex { op, src, dst } => {
let opcode = match op {
Avx512Opcode::Vpabsq => 0x1f,
_ => unimplemented!("Opcode {:?} not implemented", op),
};
match src {
RegMem::Reg { reg: src } => EvexInstruction::new()
@@ -1545,6 +1547,31 @@ pub(crate) fn emit(
}
}
// Three-operand, EVEX-encoded XMM instruction taking `src1`, `src2` and `dst`.
Inst::XmmRmREvex {
op,
src1,
src2,
dst,
} => {
// Only VPMULLQ has an opcode byte assigned here; any other `Avx512Opcode`
// reaches `unimplemented!` and panics.
let opcode = match op {
Avx512Opcode::Vpmullq => 0x40,
_ => unimplemented!("Opcode {:?} not implemented", op),
};
match src1 {
// Register form: 128-bit vector length, 0x66 prefix, 0F38 opcode map, W bit set.
// `dst` is placed in the `reg` field, `src1` in `rm`, and `src2` in `vvvvv`.
RegMem::Reg { reg: src } => EvexInstruction::new()
.length(EvexVectorLength::V128)
.prefix(LegacyPrefixes::_66)
.map(OpcodeMap::_0F38)
.w(true)
.opcode(opcode)
.reg(dst.to_reg().get_hw_encoding())
.rm(src.get_hw_encoding())
.vvvvv(src2.get_hw_encoding())
.encode(sink),
// Any non-register `src1` (i.e. a memory operand) is not implemented yet.
_ => todo!(),
};
}
Inst::XmmMinMaxSeq {
size,
is_min,

View File

@@ -3555,6 +3555,12 @@ fn test_x64_emit() {
"pmullw %xmm14, %xmm1",
));
insns.push((
Inst::xmm_rm_r_evex(Avx512Opcode::Vpmullq, RegMem::reg(xmm14), xmm10, w_xmm1),
"62D2AD0840CE",
"vpmullq %xmm14, %xmm10, %xmm1",
));
insns.push((
Inst::xmm_rm_r(SseOpcode::Pmuludq, RegMem::reg(xmm8), w_xmm9),
"66450FF4C8",
@@ -4283,6 +4289,7 @@ fn test_x64_emit() {
isa_flag_builder.enable("has_ssse3").unwrap();
isa_flag_builder.enable("has_sse41").unwrap();
isa_flag_builder.enable("has_avx512f").unwrap();
isa_flag_builder.enable("has_avx512dq").unwrap();
let isa_flags = x64::settings::Flags::new(&flags, isa_flag_builder);
let rru = regs::create_reg_universe_systemv(&flags);

View File

@@ -212,6 +212,13 @@ pub enum Inst {
dst: Writable<Reg>,
},
/// XMM binary op using an EVEX encoding, with two separate sources and a destination:
/// `src1` may be a register or memory operand, `src2` and `dst` are registers.
XmmRmREvex {
op: Avx512Opcode,
src1: RegMem,
src2: Reg,
dst: Writable<Reg>,
},
/// XMM (scalar or vector) unary op: mov between XMM registers (32 64) (reg addr) reg, sqrt,
/// etc.
///
@@ -577,7 +584,7 @@ impl Inst {
| Inst::XmmToGpr { op, .. }
| Inst::XmmUnaryRmR { op, .. } => smallvec![op.available_from()],
Inst::XmmUnaryRmREvex { op, .. } => op.available_from(),
Inst::XmmUnaryRmREvex { op, .. } | Inst::XmmRmREvex { op, .. } => op.available_from(),
}
}
}
@@ -724,6 +731,23 @@ impl Inst {
Inst::XmmRmR { op, src, dst }
}
/// Builds an `Inst::XmmRmREvex` from an AVX512 opcode, two sources and a destination.
///
/// All three operands are expected to be in the `V128` register class; this is
/// checked for `src1` via `assert_regclass_is` and for `src2`/`dst` with debug
/// assertions.
pub(crate) fn xmm_rm_r_evex(
    op: Avx512Opcode,
    src1: RegMem,
    src2: Reg,
    dst: Writable<Reg>,
) -> Self {
    // Validate the operand register classes before constructing the instruction.
    src1.assert_regclass_is(RegClass::V128);
    debug_assert!(src2.get_class() == RegClass::V128);
    debug_assert!(dst.to_reg().get_class() == RegClass::V128);

    Inst::XmmRmREvex { op, src1, src2, dst }
}
pub(crate) fn xmm_uninit_value(dst: Writable<Reg>) -> Self {
debug_assert!(dst.to_reg().get_class() == RegClass::V128);
Inst::XmmUninitializedValue { dst }
@@ -1425,6 +1449,20 @@ impl PrettyPrint for Inst {
show_ireg_sized(dst.to_reg(), mb_rru, 8),
),
// Printed as `<op> <src1>, <src2>, <dst>`, i.e. with the destination last.
Inst::XmmRmREvex {
op,
src1,
src2,
dst,
..
} => format!(
"{} {}, {}, {}",
ljustify(op.to_string()),
src1.show_rru_sized(mb_rru, 8),
show_ireg_sized(*src2, mb_rru, 8),
show_ireg_sized(dst.to_reg(), mb_rru, 8),
),
Inst::XmmMinMaxSeq {
lhs,
rhs_dst,
@@ -1898,6 +1936,13 @@ fn x64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
collector.add_mod(*dst);
}
}
// Both sources are recorded as uses and `dst` as a definition (`add_def`, not
// `add_mod`), so `dst` is not treated as an input of the instruction.
Inst::XmmRmREvex {
src1, src2, dst, ..
} => {
src1.get_regs_as_uses(collector);
collector.add_use(*src2);
collector.add_def(*dst);
}
Inst::XmmRmRImm { op, src, dst, .. } => {
if inst.produces_const() {
// No need to account for src, since src == dst.
@@ -2283,6 +2328,16 @@ fn x64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
map_mod(mapper, dst);
}
}
// Remap the registers of both sources as uses and the destination as a definition.
Inst::XmmRmREvex {
ref mut src1,
ref mut src2,
ref mut dst,
..
} => {
src1.map_uses(mapper);
map_use(mapper, src2);
map_def(mapper, dst);
}
Inst::XmmRmiReg {
ref mut src,
ref mut dst,