x64: lower i8x16.popcnt to VPOPCNTB when possible

When AVX512VL and AVX512BITALG are available, Wasm SIMD's `popcnt`
instruction can be lowered to a single x64 instruction, `VPOPCNTB`,
instead of 8+ instructions.
This commit is contained in:
Andrew Brown
2021-05-24 11:21:07 -07:00
parent 2b0649c74c
commit 459fce3467
6 changed files with 107 additions and 68 deletions

View File

@@ -40,6 +40,12 @@ pub(crate) fn define(shared: &SettingGroup) -> SettingGroup {
"AVX2: CPUID.07H:EBX.AVX2[bit 5]", "AVX2: CPUID.07H:EBX.AVX2[bit 5]",
false, false,
); );
let has_avx512bitalg = settings.add_bool(
"has_avx512bitalg",
"Has support for AVX512BITALG.",
"AVX512BITALG: CPUID.07H:ECX.AVX512BITALG[bit 12]",
false,
);
let has_avx512dq = settings.add_bool( let has_avx512dq = settings.add_bool(
"has_avx512dq", "has_avx512dq",
"Has support for AVX512DQ.", "Has support for AVX512DQ.",
@@ -108,6 +114,10 @@ pub(crate) fn define(shared: &SettingGroup) -> SettingGroup {
settings.add_predicate("use_avx_simd", predicate!(shared_enable_simd && has_avx)); settings.add_predicate("use_avx_simd", predicate!(shared_enable_simd && has_avx));
settings.add_predicate("use_avx2_simd", predicate!(shared_enable_simd && has_avx2)); settings.add_predicate("use_avx2_simd", predicate!(shared_enable_simd && has_avx2));
settings.add_predicate(
"use_avx512bitalg_simd",
predicate!(shared_enable_simd && has_avx512bitalg),
);
settings.add_predicate( settings.add_predicate(
"use_avx512dq_simd", "use_avx512dq_simd",
predicate!(shared_enable_simd && has_avx512dq), predicate!(shared_enable_simd && has_avx512dq),

View File

@@ -460,9 +460,10 @@ pub(crate) enum InstructionSet {
BMI1, BMI1,
#[allow(dead_code)] // never constructed (yet). #[allow(dead_code)] // never constructed (yet).
BMI2, BMI2,
AVX512BITALG,
AVX512DQ,
AVX512F, AVX512F,
AVX512VL, AVX512VL,
AVX512DQ,
} }
/// Some SSE operations requiring 2 operands r/m and r. /// Some SSE operations requiring 2 operands r/m and r.
@@ -1003,6 +1004,7 @@ pub enum Avx512Opcode {
Vcvtudq2ps, Vcvtudq2ps,
Vpabsq, Vpabsq,
Vpmullq, Vpmullq,
Vpopcntb,
} }
impl Avx512Opcode { impl Avx512Opcode {
@@ -1014,6 +1016,9 @@ impl Avx512Opcode {
} }
Avx512Opcode::Vpabsq => smallvec![InstructionSet::AVX512F, InstructionSet::AVX512VL], Avx512Opcode::Vpabsq => smallvec![InstructionSet::AVX512F, InstructionSet::AVX512VL],
Avx512Opcode::Vpmullq => smallvec![InstructionSet::AVX512VL, InstructionSet::AVX512DQ], Avx512Opcode::Vpmullq => smallvec![InstructionSet::AVX512VL, InstructionSet::AVX512DQ],
Avx512Opcode::Vpopcntb => {
smallvec![InstructionSet::AVX512VL, InstructionSet::AVX512BITALG]
}
} }
} }
} }
@@ -1024,6 +1029,7 @@ impl fmt::Debug for Avx512Opcode {
Avx512Opcode::Vcvtudq2ps => "vcvtudq2ps", Avx512Opcode::Vcvtudq2ps => "vcvtudq2ps",
Avx512Opcode::Vpabsq => "vpabsq", Avx512Opcode::Vpabsq => "vpabsq",
Avx512Opcode::Vpmullq => "vpmullq", Avx512Opcode::Vpmullq => "vpmullq",
Avx512Opcode::Vpopcntb => "vpopcntb",
}; };
write!(fmt, "{}", name) write!(fmt, "{}", name)
} }

View File

@@ -126,9 +126,10 @@ pub(crate) fn emit(
InstructionSet::Lzcnt => info.isa_flags.use_lzcnt(), InstructionSet::Lzcnt => info.isa_flags.use_lzcnt(),
InstructionSet::BMI1 => info.isa_flags.use_bmi1(), InstructionSet::BMI1 => info.isa_flags.use_bmi1(),
InstructionSet::BMI2 => info.isa_flags.has_bmi2(), InstructionSet::BMI2 => info.isa_flags.has_bmi2(),
InstructionSet::AVX512BITALG => info.isa_flags.has_avx512bitalg(),
InstructionSet::AVX512F => info.isa_flags.has_avx512f(), InstructionSet::AVX512F => info.isa_flags.has_avx512f(),
InstructionSet::AVX512VL => info.isa_flags.has_avx512vl(),
InstructionSet::AVX512DQ => info.isa_flags.has_avx512dq(), InstructionSet::AVX512DQ => info.isa_flags.has_avx512dq(),
InstructionSet::AVX512VL => info.isa_flags.has_avx512vl(),
} }
}; };
@@ -1409,8 +1410,9 @@ pub(crate) fn emit(
Inst::XmmUnaryRmREvex { op, src, dst } => { Inst::XmmUnaryRmREvex { op, src, dst } => {
let (prefix, map, w, opcode) = match op { let (prefix, map, w, opcode) = match op {
Avx512Opcode::Vpabsq => (LegacyPrefixes::_66, OpcodeMap::_0F38, true, 0x1f),
Avx512Opcode::Vcvtudq2ps => (LegacyPrefixes::_F2, OpcodeMap::_0F, false, 0x7a), Avx512Opcode::Vcvtudq2ps => (LegacyPrefixes::_F2, OpcodeMap::_0F, false, 0x7a),
Avx512Opcode::Vpabsq => (LegacyPrefixes::_66, OpcodeMap::_0F38, true, 0x1f),
Avx512Opcode::Vpopcntb => (LegacyPrefixes::_66, OpcodeMap::_0F38, false, 0x54),
_ => unimplemented!("Opcode {:?} not implemented", op), _ => unimplemented!("Opcode {:?} not implemented", op),
}; };
match src { match src {

View File

@@ -3895,6 +3895,12 @@ fn test_x64_emit() {
"vcvtudq2ps %xmm2, %xmm8", "vcvtudq2ps %xmm2, %xmm8",
)); ));
insns.push((
Inst::xmm_unary_rm_r_evex(Avx512Opcode::Vpopcntb, RegMem::reg(xmm2), w_xmm8),
"62727D0854C2",
"vpopcntb %xmm2, %xmm8",
));
// Xmm to int conversions, and conversely. // Xmm to int conversions, and conversely.
insns.push(( insns.push((
@@ -4308,6 +4314,7 @@ fn test_x64_emit() {
isa_flag_builder.enable("has_sse41").unwrap(); isa_flag_builder.enable("has_sse41").unwrap();
isa_flag_builder.enable("has_avx512f").unwrap(); isa_flag_builder.enable("has_avx512f").unwrap();
isa_flag_builder.enable("has_avx512dq").unwrap(); isa_flag_builder.enable("has_avx512dq").unwrap();
isa_flag_builder.enable("has_avx512vl").unwrap();
let isa_flags = x64::settings::Flags::new(&flags, isa_flag_builder); let isa_flags = x64::settings::Flags::new(&flags, isa_flag_builder);
let rru = regs::create_reg_universe_systemv(&flags); let rru = regs::create_reg_universe_systemv(&flags);

View File

@@ -3079,81 +3079,92 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
)); ));
} }
} else { } else {
// For SIMD 4.4 we use Mula's algroithm (https://arxiv.org/pdf/1611.07612.pdf) // Lower `popcount` for vectors.
//
//__m128i count_bytes ( __m128i v) {
// __m128i lookup = _mm_setr_epi8(0 ,1 ,1 ,2 ,1 ,2 ,2 ,3 ,1 ,2 ,2 ,3 ,2 ,3 ,3 ,4) ;
// __m128i low_mask = _mm_set1_epi8 (0 x0f ) ;
// __m128i lo = _mm_and_si128 (v, low_mask ) ;
// __m128i hi = _mm_and_si128 (_mm_srli_epi16 (v, 4) , low_mask ) ;
// __m128i cnt1 = _mm_shuffle_epi8 (lookup , lo) ;
// __m128i cnt2 = _mm_shuffle_epi8 (lookup , hi) ;
// return _mm_add_epi8 (cnt1 , cnt2 ) ;
//}
//
// Details of the above algorithm can be found in the reference noted above, but the basics
// are to create a lookup table that pre populates the popcnt values for each number [0,15].
// The algorithm uses shifts to isolate 4 bit sections of the vector, pshufb as part of the
// lookup process, and adds together the results.
// Get input vector and destination
let ty = ty.unwrap(); let ty = ty.unwrap();
let lhs = put_input_in_reg(ctx, inputs[0]); let src = put_input_in_reg(ctx, inputs[0]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
// __m128i lookup = _mm_setr_epi8(0 ,1 ,1 ,2 ,1 ,2 ,2 ,3 ,1 ,2 ,2 ,3 ,2 ,3 ,3 ,4); if isa_flags.use_avx512vl_simd() || isa_flags.use_avx512bitalg_simd() {
static POPCOUNT_4BIT: [u8; 16] = [ // When either AVX512VL or AVX512BITALG are available,
0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03, 0x02, // `popcnt.i8x16` can be lowered to a single instruction.
0x03, 0x03, 0x04, assert_eq!(ty, types::I8X16);
]; ctx.emit(Inst::xmm_unary_rm_r_evex(
let lookup = ctx.use_constant(VCodeConstantData::WellKnown(&POPCOUNT_4BIT)); Avx512Opcode::Vpopcntb,
RegMem::reg(src),
dst,
));
} else {
// For SIMD 4.4 we use Mula's algorithm (https://arxiv.org/pdf/1611.07612.pdf)
//
//__m128i count_bytes ( __m128i v) {
// __m128i lookup = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
// __m128i low_mask = _mm_set1_epi8 (0x0f);
// __m128i lo = _mm_and_si128 (v, low_mask);
// __m128i hi = _mm_and_si128 (_mm_srli_epi16 (v, 4), low_mask);
// __m128i cnt1 = _mm_shuffle_epi8 (lookup, lo);
// __m128i cnt2 = _mm_shuffle_epi8 (lookup, hi);
// return _mm_add_epi8 (cnt1, cnt2);
//}
//
// Details of the above algorithm can be found in the reference noted above, but the basics
// are to create a lookup table that pre populates the popcnt values for each number [0,15].
// The algorithm uses shifts to isolate 4 bit sections of the vector, pshufb as part of the
// lookup process, and adds together the results.
// Create a mask for lower 4bits of each subword. // __m128i lookup = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
static LOW_MASK: [u8; 16] = [0x0F; 16]; static POPCOUNT_4BIT: [u8; 16] = [
let low_mask_const = ctx.use_constant(VCodeConstantData::WellKnown(&LOW_MASK)); 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03,
let low_mask = ctx.alloc_tmp(types::I8X16).only_reg().unwrap(); 0x02, 0x03, 0x03, 0x04,
ctx.emit(Inst::xmm_load_const(low_mask_const, low_mask, ty)); ];
let lookup = ctx.use_constant(VCodeConstantData::WellKnown(&POPCOUNT_4BIT));
// __m128i lo = _mm_and_si128 (v, low_mask ); // Create a mask for lower 4bits of each subword.
let lo = ctx.alloc_tmp(types::I8X16).only_reg().unwrap(); static LOW_MASK: [u8; 16] = [0x0F; 16];
ctx.emit(Inst::gen_move(lo, low_mask.to_reg(), types::I8X16)); let low_mask_const = ctx.use_constant(VCodeConstantData::WellKnown(&LOW_MASK));
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pand, RegMem::reg(lhs), lo)); let low_mask = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
ctx.emit(Inst::xmm_load_const(low_mask_const, low_mask, ty));
// __m128i hi = _mm_and_si128 (_mm_srli_epi16 (v, 4) , low_mask ) ; // __m128i lo = _mm_and_si128 (v, low_mask);
ctx.emit(Inst::gen_move(dst, lhs, ty)); let lo = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrlw, RegMemImm::imm(4), dst)); ctx.emit(Inst::gen_move(lo, low_mask.to_reg(), types::I8X16));
let tmp = ctx.alloc_tmp(types::I8X16).only_reg().unwrap(); ctx.emit(Inst::xmm_rm_r(SseOpcode::Pand, RegMem::reg(src), lo));
ctx.emit(Inst::gen_move(tmp, low_mask.to_reg(), types::I8X16));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pand,
RegMem::reg(dst.to_reg()),
tmp,
));
// __m128i cnt1 = _mm_shuffle_epi8 (lookup , lo) ; // __m128i hi = _mm_and_si128 (_mm_srli_epi16 (v, 4), low_mask);
let tmp2 = ctx.alloc_tmp(types::I8X16).only_reg().unwrap(); ctx.emit(Inst::gen_move(dst, src, ty));
ctx.emit(Inst::xmm_load_const(lookup, tmp2, ty)); ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrlw, RegMemImm::imm(4), dst));
ctx.emit(Inst::gen_move(dst, tmp2.to_reg(), types::I8X16)); let tmp = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
ctx.emit(Inst::gen_move(tmp, low_mask.to_reg(), types::I8X16));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pand,
RegMem::reg(dst.to_reg()),
tmp,
));
ctx.emit(Inst::xmm_rm_r( // __m128i cnt1 = _mm_shuffle_epi8 (lookup, lo);
SseOpcode::Pshufb, let tmp2 = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
RegMem::reg(lo.to_reg()), ctx.emit(Inst::xmm_load_const(lookup, tmp2, ty));
dst, ctx.emit(Inst::gen_move(dst, tmp2.to_reg(), types::I8X16));
));
// __m128i cnt2 = _mm_shuffle_epi8 (lookup , hi) ; ctx.emit(Inst::xmm_rm_r(
ctx.emit(Inst::xmm_rm_r( SseOpcode::Pshufb,
SseOpcode::Pshufb, RegMem::reg(lo.to_reg()),
RegMem::reg(tmp.to_reg()), dst,
tmp2, ));
));
// return _mm_add_epi8 (cnt1 , cnt2 ) ; // __m128i cnt2 = _mm_shuffle_epi8 (lookup , hi) ;
ctx.emit(Inst::xmm_rm_r( ctx.emit(Inst::xmm_rm_r(
SseOpcode::Paddb, SseOpcode::Pshufb,
RegMem::reg(tmp2.to_reg()), RegMem::reg(tmp.to_reg()),
dst, tmp2,
)); ));
// return _mm_add_epi8 (cnt1 , cnt2 ) ;
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Paddb,
RegMem::reg(tmp2.to_reg()),
dst,
));
}
} }
} }

View File

@@ -91,6 +91,9 @@ pub fn builder_with_options(
if std::is_x86_feature_detected!("bmi2") { if std::is_x86_feature_detected!("bmi2") {
isa_builder.enable("has_bmi2").unwrap(); isa_builder.enable("has_bmi2").unwrap();
} }
if std::is_x86_feature_detected!("avx512bitalg") {
isa_builder.enable("has_avx512bitalg").unwrap();
}
if std::is_x86_feature_detected!("avx512dq") { if std::is_x86_feature_detected!("avx512dq") {
isa_builder.enable("has_avx512dq").unwrap(); isa_builder.enable("has_avx512dq").unwrap();
} }