x64: lower i64x2.imul to VPMULLQ when possible
This adds the machinery to encode the VPMULLQ instruction, whose 128-bit form requires both the AVX512VL and AVX512DQ feature sets. When both feature sets are available, we use this single instruction instead of a lengthy 12-instruction sequence.
This commit is contained in:
@@ -462,6 +462,7 @@ pub(crate) enum InstructionSet {
|
|||||||
BMI2,
|
BMI2,
|
||||||
AVX512F,
|
AVX512F,
|
||||||
AVX512VL,
|
AVX512VL,
|
||||||
|
AVX512DQ,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Some SSE operations requiring 2 operands r/m and r.
|
/// Some SSE operations requiring 2 operands r/m and r.
|
||||||
@@ -994,6 +995,7 @@ impl fmt::Display for SseOpcode {
|
|||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
pub enum Avx512Opcode {
|
pub enum Avx512Opcode {
|
||||||
Vpabsq,
|
Vpabsq,
|
||||||
|
Vpmullq,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Avx512Opcode {
|
impl Avx512Opcode {
|
||||||
@@ -1001,6 +1003,7 @@ impl Avx512Opcode {
|
|||||||
pub(crate) fn available_from(&self) -> SmallVec<[InstructionSet; 2]> {
|
pub(crate) fn available_from(&self) -> SmallVec<[InstructionSet; 2]> {
|
||||||
match self {
|
match self {
|
||||||
Avx512Opcode::Vpabsq => smallvec![InstructionSet::AVX512F, InstructionSet::AVX512VL],
|
Avx512Opcode::Vpabsq => smallvec![InstructionSet::AVX512F, InstructionSet::AVX512VL],
|
||||||
|
Avx512Opcode::Vpmullq => smallvec![InstructionSet::AVX512VL, InstructionSet::AVX512DQ],
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1009,6 +1012,7 @@ impl fmt::Debug for Avx512Opcode {
|
|||||||
fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
|
fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
|
||||||
let name = match self {
|
let name = match self {
|
||||||
Avx512Opcode::Vpabsq => "vpabsq",
|
Avx512Opcode::Vpabsq => "vpabsq",
|
||||||
|
Avx512Opcode::Vpmullq => "vpmullq",
|
||||||
};
|
};
|
||||||
write!(fmt, "{}", name)
|
write!(fmt, "{}", name)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -128,6 +128,7 @@ pub(crate) fn emit(
|
|||||||
InstructionSet::BMI2 => info.isa_flags.has_bmi2(),
|
InstructionSet::BMI2 => info.isa_flags.has_bmi2(),
|
||||||
InstructionSet::AVX512F => info.isa_flags.has_avx512f(),
|
InstructionSet::AVX512F => info.isa_flags.has_avx512f(),
|
||||||
InstructionSet::AVX512VL => info.isa_flags.has_avx512vl(),
|
InstructionSet::AVX512VL => info.isa_flags.has_avx512vl(),
|
||||||
|
InstructionSet::AVX512DQ => info.isa_flags.has_avx512dq(),
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -1409,6 +1410,7 @@ pub(crate) fn emit(
|
|||||||
Inst::XmmUnaryRmREvex { op, src, dst } => {
|
Inst::XmmUnaryRmREvex { op, src, dst } => {
|
||||||
let opcode = match op {
|
let opcode = match op {
|
||||||
Avx512Opcode::Vpabsq => 0x1f,
|
Avx512Opcode::Vpabsq => 0x1f,
|
||||||
|
_ => unimplemented!("Opcode {:?} not implemented", op),
|
||||||
};
|
};
|
||||||
match src {
|
match src {
|
||||||
RegMem::Reg { reg: src } => EvexInstruction::new()
|
RegMem::Reg { reg: src } => EvexInstruction::new()
|
||||||
@@ -1545,6 +1547,31 @@ pub(crate) fn emit(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Emit an EVEX-encoded XMM instruction with two source operands. The
// destination is placed in the `reg` field, `src1` in the `rm` field and
// `src2` in the `vvvvv` field of the EVEX builder.
Inst::XmmRmREvex {
|
||||||
|
op,
|
||||||
|
src1,
|
||||||
|
src2,
|
||||||
|
dst,
|
||||||
|
} => {
|
||||||
|
// Select the opcode byte; only `Vpmullq` is handled by this arm.
let opcode = match op {
|
||||||
|
Avx512Opcode::Vpmullq => 0x40,
|
||||||
|
// Any other AVX512 opcode reaching this instruction format panics.
_ => unimplemented!("Opcode {:?} not implemented", op),
|
||||||
|
};
|
||||||
|
match src1 {
|
||||||
|
// Register-register form: 128-bit vector length, 0x66 prefix,
// 0F38 opcode map, W bit set.
RegMem::Reg { reg: src } => EvexInstruction::new()
|
||||||
|
.length(EvexVectorLength::V128)
|
||||||
|
.prefix(LegacyPrefixes::_66)
|
||||||
|
.map(OpcodeMap::_0F38)
|
||||||
|
.w(true)
|
||||||
|
.opcode(opcode)
|
||||||
|
.reg(dst.to_reg().get_hw_encoding())
|
||||||
|
.rm(src.get_hw_encoding())
|
||||||
|
.vvvvv(src2.get_hw_encoding())
|
||||||
|
.encode(sink),
|
||||||
|
// Any `src1` that is not `RegMem::Reg` is not handled yet and panics.
_ => todo!(),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
Inst::XmmMinMaxSeq {
|
Inst::XmmMinMaxSeq {
|
||||||
size,
|
size,
|
||||||
is_min,
|
is_min,
|
||||||
|
|||||||
@@ -3555,6 +3555,12 @@ fn test_x64_emit() {
|
|||||||
"pmullw %xmm14, %xmm1",
|
"pmullw %xmm14, %xmm1",
|
||||||
));
|
));
|
||||||
|
|
||||||
|
insns.push((
|
||||||
|
Inst::xmm_rm_r_evex(Avx512Opcode::Vpmullq, RegMem::reg(xmm14), xmm10, w_xmm1),
|
||||||
|
"62D2AD0840CE",
|
||||||
|
"vpmullq %xmm14, %xmm10, %xmm1",
|
||||||
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Pmuludq, RegMem::reg(xmm8), w_xmm9),
|
Inst::xmm_rm_r(SseOpcode::Pmuludq, RegMem::reg(xmm8), w_xmm9),
|
||||||
"66450FF4C8",
|
"66450FF4C8",
|
||||||
@@ -4283,6 +4289,7 @@ fn test_x64_emit() {
|
|||||||
isa_flag_builder.enable("has_ssse3").unwrap();
|
isa_flag_builder.enable("has_ssse3").unwrap();
|
||||||
isa_flag_builder.enable("has_sse41").unwrap();
|
isa_flag_builder.enable("has_sse41").unwrap();
|
||||||
isa_flag_builder.enable("has_avx512f").unwrap();
|
isa_flag_builder.enable("has_avx512f").unwrap();
|
||||||
|
isa_flag_builder.enable("has_avx512dq").unwrap();
|
||||||
let isa_flags = x64::settings::Flags::new(&flags, isa_flag_builder);
|
let isa_flags = x64::settings::Flags::new(&flags, isa_flag_builder);
|
||||||
|
|
||||||
let rru = regs::create_reg_universe_systemv(&flags);
|
let rru = regs::create_reg_universe_systemv(&flags);
|
||||||
|
|||||||
@@ -212,6 +212,13 @@ pub enum Inst {
|
|||||||
dst: Writable<Reg>,
|
dst: Writable<Reg>,
|
||||||
},
|
},
|
||||||
|
|
||||||
|
/// XMM (vector) binary op using the EVEX encoding: reads a register-or-memory
/// source (`src1`) and a register source (`src2`) and writes a separate
/// destination register (`dst`). The concrete operation is selected by `op`.
XmmRmREvex {
|
||||||
|
op: Avx512Opcode,
|
||||||
|
src1: RegMem,
|
||||||
|
src2: Reg,
|
||||||
|
dst: Writable<Reg>,
|
||||||
|
},
|
||||||
|
|
||||||
/// XMM (scalar or vector) unary op: mov between XMM registers (32 64) (reg addr) reg, sqrt,
|
/// XMM (scalar or vector) unary op: mov between XMM registers (32 64) (reg addr) reg, sqrt,
|
||||||
/// etc.
|
/// etc.
|
||||||
///
|
///
|
||||||
@@ -577,7 +584,7 @@ impl Inst {
|
|||||||
| Inst::XmmToGpr { op, .. }
|
| Inst::XmmToGpr { op, .. }
|
||||||
| Inst::XmmUnaryRmR { op, .. } => smallvec![op.available_from()],
|
| Inst::XmmUnaryRmR { op, .. } => smallvec![op.available_from()],
|
||||||
|
|
||||||
Inst::XmmUnaryRmREvex { op, .. } => op.available_from(),
|
Inst::XmmUnaryRmREvex { op, .. } | Inst::XmmRmREvex { op, .. } => op.available_from(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -724,6 +731,23 @@ impl Inst {
|
|||||||
Inst::XmmRmR { op, src, dst }
|
Inst::XmmRmR { op, src, dst }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Build an `Inst::XmmRmREvex` for the AVX512 opcode `op`, taking `src1`
/// (register or memory) and `src2` (register) as sources and `dst` as the
/// destination.
///
/// All three operands are expected to be of register class `V128`: `src1`
/// is checked via `assert_regclass_is`, while `src2` and `dst` are checked
/// with `debug_assert!` (i.e. only in builds with debug assertions).
pub(crate) fn xmm_rm_r_evex(
|
||||||
|
op: Avx512Opcode,
|
||||||
|
src1: RegMem,
|
||||||
|
src2: Reg,
|
||||||
|
dst: Writable<Reg>,
|
||||||
|
) -> Self {
|
||||||
|
src1.assert_regclass_is(RegClass::V128);
|
||||||
|
debug_assert!(src2.get_class() == RegClass::V128);
|
||||||
|
debug_assert!(dst.to_reg().get_class() == RegClass::V128);
|
||||||
|
Inst::XmmRmREvex {
|
||||||
|
op,
|
||||||
|
src1,
|
||||||
|
src2,
|
||||||
|
dst,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub(crate) fn xmm_uninit_value(dst: Writable<Reg>) -> Self {
|
pub(crate) fn xmm_uninit_value(dst: Writable<Reg>) -> Self {
|
||||||
debug_assert!(dst.to_reg().get_class() == RegClass::V128);
|
debug_assert!(dst.to_reg().get_class() == RegClass::V128);
|
||||||
Inst::XmmUninitializedValue { dst }
|
Inst::XmmUninitializedValue { dst }
|
||||||
@@ -1425,6 +1449,20 @@ impl PrettyPrint for Inst {
|
|||||||
show_ireg_sized(dst.to_reg(), mb_rru, 8),
|
show_ireg_sized(dst.to_reg(), mb_rru, 8),
|
||||||
),
|
),
|
||||||
|
|
||||||
|
Inst::XmmRmREvex {
|
||||||
|
op,
|
||||||
|
src1,
|
||||||
|
src2,
|
||||||
|
dst,
|
||||||
|
..
|
||||||
|
} => format!(
|
||||||
|
"{} {}, {}, {}",
|
||||||
|
ljustify(op.to_string()),
|
||||||
|
src1.show_rru_sized(mb_rru, 8),
|
||||||
|
show_ireg_sized(*src2, mb_rru, 8),
|
||||||
|
show_ireg_sized(dst.to_reg(), mb_rru, 8),
|
||||||
|
),
|
||||||
|
|
||||||
Inst::XmmMinMaxSeq {
|
Inst::XmmMinMaxSeq {
|
||||||
lhs,
|
lhs,
|
||||||
rhs_dst,
|
rhs_dst,
|
||||||
@@ -1898,6 +1936,13 @@ fn x64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
|
|||||||
collector.add_mod(*dst);
|
collector.add_mod(*dst);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Inst::XmmRmREvex {
|
||||||
|
src1, src2, dst, ..
|
||||||
|
} => {
|
||||||
|
src1.get_regs_as_uses(collector);
|
||||||
|
collector.add_use(*src2);
|
||||||
|
collector.add_def(*dst);
|
||||||
|
}
|
||||||
Inst::XmmRmRImm { op, src, dst, .. } => {
|
Inst::XmmRmRImm { op, src, dst, .. } => {
|
||||||
if inst.produces_const() {
|
if inst.produces_const() {
|
||||||
// No need to account for src, since src == dst.
|
// No need to account for src, since src == dst.
|
||||||
@@ -2283,6 +2328,16 @@ fn x64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
|
|||||||
map_mod(mapper, dst);
|
map_mod(mapper, dst);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Inst::XmmRmREvex {
|
||||||
|
ref mut src1,
|
||||||
|
ref mut src2,
|
||||||
|
ref mut dst,
|
||||||
|
..
|
||||||
|
} => {
|
||||||
|
src1.map_uses(mapper);
|
||||||
|
map_use(mapper, src2);
|
||||||
|
map_def(mapper, dst);
|
||||||
|
}
|
||||||
Inst::XmmRmiReg {
|
Inst::XmmRmiReg {
|
||||||
ref mut src,
|
ref mut src,
|
||||||
ref mut dst,
|
ref mut dst,
|
||||||
|
|||||||
@@ -1663,7 +1663,22 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
Opcode::Imul => {
|
Opcode::Imul => {
|
||||||
let ty = ty.unwrap();
|
let ty = ty.unwrap();
|
||||||
if ty == types::I64X2 {
|
if ty == types::I64X2 {
|
||||||
// For I64X2 multiplication we describe a lane A as being
|
// Eventually one of these should be `input_to_reg_mem` (TODO).
|
||||||
|
let lhs = put_input_in_reg(ctx, inputs[0]);
|
||||||
|
let rhs = put_input_in_reg(ctx, inputs[1]);
|
||||||
|
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
|
||||||
|
|
||||||
|
// The 128-bit form of VPMULLQ requires *both* AVX512VL and AVX512DQ, so the
// single-instruction path is only taken when both are enabled. Checking
// AVX512F or AVX512VL alone could select VPMULLQ on a target without DQ.
// NOTE(review): assumes a `use_avx512dq_simd` predicate exists alongside
// `use_avx512vl_simd` -- confirm against the x64 settings definitions.
if isa_flags.use_avx512vl_simd() && isa_flags.use_avx512dq_simd() {
|
||||||
|
// With the right AVX512 features (VL, DQ) this operation
|
||||||
|
// can lower to a single operation.
|
||||||
|
ctx.emit(Inst::xmm_rm_r_evex(
|
||||||
|
Avx512Opcode::Vpmullq,
|
||||||
|
RegMem::reg(rhs),
|
||||||
|
lhs,
|
||||||
|
dst,
|
||||||
|
));
|
||||||
|
} else {
|
||||||
|
// Otherwise, for I64X2 multiplication we describe a lane A as being
|
||||||
// composed of a 32-bit upper half "Ah" and a 32-bit lower half
|
// composed of a 32-bit upper half "Ah" and a 32-bit lower half
|
||||||
// "Al". The 32-bit long hand multiplication can then be written
|
// "Al". The 32-bit long hand multiplication can then be written
|
||||||
// as:
|
// as:
|
||||||
@@ -1698,11 +1713,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
// A' = A' + B'
|
// A' = A' + B'
|
||||||
// dst = A'
|
// dst = A'
|
||||||
|
|
||||||
// Get inputs rhs=A and lhs=B and the dst register
|
|
||||||
let lhs = put_input_in_reg(ctx, inputs[0]);
|
|
||||||
let rhs = put_input_in_reg(ctx, inputs[1]);
|
|
||||||
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
|
|
||||||
|
|
||||||
// A' = A
|
// A' = A
|
||||||
let rhs_1 = ctx.alloc_tmp(types::I64X2).only_reg().unwrap();
|
let rhs_1 = ctx.alloc_tmp(types::I64X2).only_reg().unwrap();
|
||||||
ctx.emit(Inst::gen_move(rhs_1, rhs, ty));
|
ctx.emit(Inst::gen_move(rhs_1, rhs, ty));
|
||||||
@@ -1762,6 +1772,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
rhs_1,
|
rhs_1,
|
||||||
));
|
));
|
||||||
ctx.emit(Inst::gen_move(dst, rhs_1.to_reg(), ty));
|
ctx.emit(Inst::gen_move(dst, rhs_1.to_reg(), ty));
|
||||||
|
}
|
||||||
} else if ty.lane_count() > 1 {
|
} else if ty.lane_count() > 1 {
|
||||||
// Emit single instruction lowerings for the remaining vector
|
// Emit single instruction lowerings for the remaining vector
|
||||||
// multiplications.
|
// multiplications.
|
||||||
|
|||||||
Reference in New Issue
Block a user