x64: lower i8x16.popcnt to VPOPCNTB when possible
When AVX512VL or AVX512BITALG is available, Wasm SIMD's `i8x16.popcnt` instruction can be lowered to a single x64 instruction, `VPOPCNTB`, instead of 8+ instructions.
This commit is contained in:
@@ -40,6 +40,12 @@ pub(crate) fn define(shared: &SettingGroup) -> SettingGroup {
|
|||||||
"AVX2: CPUID.07H:EBX.AVX2[bit 5]",
|
"AVX2: CPUID.07H:EBX.AVX2[bit 5]",
|
||||||
false,
|
false,
|
||||||
);
|
);
|
||||||
|
let has_avx512bitalg = settings.add_bool(
|
||||||
|
"has_avx512bitalg",
|
||||||
|
"Has support for AVX512BITALG.",
|
||||||
|
"AVX512BITALG: CPUID.07H:ECX.AVX512BITALG[bit 12]",
|
||||||
|
false,
|
||||||
|
);
|
||||||
let has_avx512dq = settings.add_bool(
|
let has_avx512dq = settings.add_bool(
|
||||||
"has_avx512dq",
|
"has_avx512dq",
|
||||||
"Has support for AVX512DQ.",
|
"Has support for AVX512DQ.",
|
||||||
@@ -108,6 +114,10 @@ pub(crate) fn define(shared: &SettingGroup) -> SettingGroup {
|
|||||||
|
|
||||||
settings.add_predicate("use_avx_simd", predicate!(shared_enable_simd && has_avx));
|
settings.add_predicate("use_avx_simd", predicate!(shared_enable_simd && has_avx));
|
||||||
settings.add_predicate("use_avx2_simd", predicate!(shared_enable_simd && has_avx2));
|
settings.add_predicate("use_avx2_simd", predicate!(shared_enable_simd && has_avx2));
|
||||||
|
settings.add_predicate(
|
||||||
|
"use_avx512bitalg_simd",
|
||||||
|
predicate!(shared_enable_simd && has_avx512bitalg),
|
||||||
|
);
|
||||||
settings.add_predicate(
|
settings.add_predicate(
|
||||||
"use_avx512dq_simd",
|
"use_avx512dq_simd",
|
||||||
predicate!(shared_enable_simd && has_avx512dq),
|
predicate!(shared_enable_simd && has_avx512dq),
|
||||||
|
|||||||
@@ -460,9 +460,10 @@ pub(crate) enum InstructionSet {
|
|||||||
BMI1,
|
BMI1,
|
||||||
#[allow(dead_code)] // never constructed (yet).
|
#[allow(dead_code)] // never constructed (yet).
|
||||||
BMI2,
|
BMI2,
|
||||||
|
AVX512BITALG,
|
||||||
|
AVX512DQ,
|
||||||
AVX512F,
|
AVX512F,
|
||||||
AVX512VL,
|
AVX512VL,
|
||||||
AVX512DQ,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Some SSE operations requiring 2 operands r/m and r.
|
/// Some SSE operations requiring 2 operands r/m and r.
|
||||||
@@ -1003,6 +1004,7 @@ pub enum Avx512Opcode {
|
|||||||
Vcvtudq2ps,
|
Vcvtudq2ps,
|
||||||
Vpabsq,
|
Vpabsq,
|
||||||
Vpmullq,
|
Vpmullq,
|
||||||
|
Vpopcntb,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Avx512Opcode {
|
impl Avx512Opcode {
|
||||||
@@ -1014,6 +1016,9 @@ impl Avx512Opcode {
|
|||||||
}
|
}
|
||||||
Avx512Opcode::Vpabsq => smallvec![InstructionSet::AVX512F, InstructionSet::AVX512VL],
|
Avx512Opcode::Vpabsq => smallvec![InstructionSet::AVX512F, InstructionSet::AVX512VL],
|
||||||
Avx512Opcode::Vpmullq => smallvec![InstructionSet::AVX512VL, InstructionSet::AVX512DQ],
|
Avx512Opcode::Vpmullq => smallvec![InstructionSet::AVX512VL, InstructionSet::AVX512DQ],
|
||||||
|
Avx512Opcode::Vpopcntb => {
|
||||||
|
smallvec![InstructionSet::AVX512VL, InstructionSet::AVX512BITALG]
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1024,6 +1029,7 @@ impl fmt::Debug for Avx512Opcode {
|
|||||||
Avx512Opcode::Vcvtudq2ps => "vcvtudq2ps",
|
Avx512Opcode::Vcvtudq2ps => "vcvtudq2ps",
|
||||||
Avx512Opcode::Vpabsq => "vpabsq",
|
Avx512Opcode::Vpabsq => "vpabsq",
|
||||||
Avx512Opcode::Vpmullq => "vpmullq",
|
Avx512Opcode::Vpmullq => "vpmullq",
|
||||||
|
Avx512Opcode::Vpopcntb => "vpopcntb",
|
||||||
};
|
};
|
||||||
write!(fmt, "{}", name)
|
write!(fmt, "{}", name)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -126,9 +126,10 @@ pub(crate) fn emit(
|
|||||||
InstructionSet::Lzcnt => info.isa_flags.use_lzcnt(),
|
InstructionSet::Lzcnt => info.isa_flags.use_lzcnt(),
|
||||||
InstructionSet::BMI1 => info.isa_flags.use_bmi1(),
|
InstructionSet::BMI1 => info.isa_flags.use_bmi1(),
|
||||||
InstructionSet::BMI2 => info.isa_flags.has_bmi2(),
|
InstructionSet::BMI2 => info.isa_flags.has_bmi2(),
|
||||||
|
InstructionSet::AVX512BITALG => info.isa_flags.has_avx512bitalg(),
|
||||||
InstructionSet::AVX512F => info.isa_flags.has_avx512f(),
|
InstructionSet::AVX512F => info.isa_flags.has_avx512f(),
|
||||||
InstructionSet::AVX512VL => info.isa_flags.has_avx512vl(),
|
|
||||||
InstructionSet::AVX512DQ => info.isa_flags.has_avx512dq(),
|
InstructionSet::AVX512DQ => info.isa_flags.has_avx512dq(),
|
||||||
|
InstructionSet::AVX512VL => info.isa_flags.has_avx512vl(),
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -1409,8 +1410,9 @@ pub(crate) fn emit(
|
|||||||
|
|
||||||
Inst::XmmUnaryRmREvex { op, src, dst } => {
|
Inst::XmmUnaryRmREvex { op, src, dst } => {
|
||||||
let (prefix, map, w, opcode) = match op {
|
let (prefix, map, w, opcode) = match op {
|
||||||
Avx512Opcode::Vpabsq => (LegacyPrefixes::_66, OpcodeMap::_0F38, true, 0x1f),
|
|
||||||
Avx512Opcode::Vcvtudq2ps => (LegacyPrefixes::_F2, OpcodeMap::_0F, false, 0x7a),
|
Avx512Opcode::Vcvtudq2ps => (LegacyPrefixes::_F2, OpcodeMap::_0F, false, 0x7a),
|
||||||
|
Avx512Opcode::Vpabsq => (LegacyPrefixes::_66, OpcodeMap::_0F38, true, 0x1f),
|
||||||
|
Avx512Opcode::Vpopcntb => (LegacyPrefixes::_66, OpcodeMap::_0F38, false, 0x54),
|
||||||
_ => unimplemented!("Opcode {:?} not implemented", op),
|
_ => unimplemented!("Opcode {:?} not implemented", op),
|
||||||
};
|
};
|
||||||
match src {
|
match src {
|
||||||
|
|||||||
@@ -3895,6 +3895,12 @@ fn test_x64_emit() {
|
|||||||
"vcvtudq2ps %xmm2, %xmm8",
|
"vcvtudq2ps %xmm2, %xmm8",
|
||||||
));
|
));
|
||||||
|
|
||||||
|
insns.push((
|
||||||
|
Inst::xmm_unary_rm_r_evex(Avx512Opcode::Vpopcntb, RegMem::reg(xmm2), w_xmm8),
|
||||||
|
"62727D0854C2",
|
||||||
|
"vpopcntb %xmm2, %xmm8",
|
||||||
|
));
|
||||||
|
|
||||||
// Xmm to int conversions, and conversely.
|
// Xmm to int conversions, and conversely.
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
@@ -4308,6 +4314,7 @@ fn test_x64_emit() {
|
|||||||
isa_flag_builder.enable("has_sse41").unwrap();
|
isa_flag_builder.enable("has_sse41").unwrap();
|
||||||
isa_flag_builder.enable("has_avx512f").unwrap();
|
isa_flag_builder.enable("has_avx512f").unwrap();
|
||||||
isa_flag_builder.enable("has_avx512dq").unwrap();
|
isa_flag_builder.enable("has_avx512dq").unwrap();
|
||||||
|
isa_flag_builder.enable("has_avx512vl").unwrap();
|
||||||
let isa_flags = x64::settings::Flags::new(&flags, isa_flag_builder);
|
let isa_flags = x64::settings::Flags::new(&flags, isa_flag_builder);
|
||||||
|
|
||||||
let rru = regs::create_reg_universe_systemv(&flags);
|
let rru = regs::create_reg_universe_systemv(&flags);
|
||||||
|
|||||||
@@ -3079,7 +3079,22 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
));
|
));
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// For SIMD 4.4 we use Mula's algroithm (https://arxiv.org/pdf/1611.07612.pdf)
|
// Lower `popcount` for vectors.
|
||||||
|
let ty = ty.unwrap();
|
||||||
|
let src = put_input_in_reg(ctx, inputs[0]);
|
||||||
|
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
|
||||||
|
|
||||||
|
if isa_flags.use_avx512vl_simd() || isa_flags.use_avx512bitalg_simd() {
|
||||||
|
// When either AVX512VL or AVX512BITALG is available,
|
||||||
|
// `popcnt.i8x16` can be lowered to a single instruction.
|
||||||
|
assert_eq!(ty, types::I8X16);
|
||||||
|
ctx.emit(Inst::xmm_unary_rm_r_evex(
|
||||||
|
Avx512Opcode::Vpopcntb,
|
||||||
|
RegMem::reg(src),
|
||||||
|
dst,
|
||||||
|
));
|
||||||
|
} else {
|
||||||
|
// For SIMD 4.4 we use Mula's algorithm (https://arxiv.org/pdf/1611.07612.pdf)
|
||||||
//
|
//
|
||||||
//__m128i count_bytes ( __m128i v) {
|
//__m128i count_bytes ( __m128i v) {
|
||||||
// __m128i lookup = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
|
// __m128i lookup = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
|
||||||
@@ -3096,15 +3111,10 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
// The algorithm uses shifts to isolate 4 bit sections of the vector, pshufb as part of the
|
// The algorithm uses shifts to isolate 4 bit sections of the vector, pshufb as part of the
|
||||||
// lookup process, and adds together the results.
|
// lookup process, and adds together the results.
|
||||||
|
|
||||||
// Get input vector and destination
|
|
||||||
let ty = ty.unwrap();
|
|
||||||
let lhs = put_input_in_reg(ctx, inputs[0]);
|
|
||||||
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
|
|
||||||
|
|
||||||
// __m128i lookup = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
|
// __m128i lookup = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
|
||||||
static POPCOUNT_4BIT: [u8; 16] = [
|
static POPCOUNT_4BIT: [u8; 16] = [
|
||||||
0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03, 0x02,
|
0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03,
|
||||||
0x03, 0x03, 0x04,
|
0x02, 0x03, 0x03, 0x04,
|
||||||
];
|
];
|
||||||
let lookup = ctx.use_constant(VCodeConstantData::WellKnown(&POPCOUNT_4BIT));
|
let lookup = ctx.use_constant(VCodeConstantData::WellKnown(&POPCOUNT_4BIT));
|
||||||
|
|
||||||
@@ -3117,10 +3127,10 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
// __m128i lo = _mm_and_si128 (v, low_mask);
|
// __m128i lo = _mm_and_si128 (v, low_mask);
|
||||||
let lo = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
|
let lo = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
|
||||||
ctx.emit(Inst::gen_move(lo, low_mask.to_reg(), types::I8X16));
|
ctx.emit(Inst::gen_move(lo, low_mask.to_reg(), types::I8X16));
|
||||||
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pand, RegMem::reg(lhs), lo));
|
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pand, RegMem::reg(src), lo));
|
||||||
|
|
||||||
// __m128i hi = _mm_and_si128 (_mm_srli_epi16 (v, 4), low_mask);
|
// __m128i hi = _mm_and_si128 (_mm_srli_epi16 (v, 4), low_mask);
|
||||||
ctx.emit(Inst::gen_move(dst, lhs, ty));
|
ctx.emit(Inst::gen_move(dst, src, ty));
|
||||||
ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrlw, RegMemImm::imm(4), dst));
|
ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrlw, RegMemImm::imm(4), dst));
|
||||||
let tmp = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
|
let tmp = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
|
||||||
ctx.emit(Inst::gen_move(tmp, low_mask.to_reg(), types::I8X16));
|
ctx.emit(Inst::gen_move(tmp, low_mask.to_reg(), types::I8X16));
|
||||||
@@ -3156,6 +3166,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
));
|
));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
Opcode::Bitrev => {
|
Opcode::Bitrev => {
|
||||||
let ty = ctx.input_ty(insn, 0);
|
let ty = ctx.input_ty(insn, 0);
|
||||||
|
|||||||
@@ -91,6 +91,9 @@ pub fn builder_with_options(
|
|||||||
if std::is_x86_feature_detected!("bmi2") {
|
if std::is_x86_feature_detected!("bmi2") {
|
||||||
isa_builder.enable("has_bmi2").unwrap();
|
isa_builder.enable("has_bmi2").unwrap();
|
||||||
}
|
}
|
||||||
|
if std::is_x86_feature_detected!("avx512bitalg") {
|
||||||
|
isa_builder.enable("has_avx512bitalg").unwrap();
|
||||||
|
}
|
||||||
if std::is_x86_feature_detected!("avx512dq") {
|
if std::is_x86_feature_detected!("avx512dq") {
|
||||||
isa_builder.enable("has_avx512dq").unwrap();
|
isa_builder.enable("has_avx512dq").unwrap();
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user