diff --git a/cranelift/codegen/meta/src/isa/x86/settings.rs b/cranelift/codegen/meta/src/isa/x86/settings.rs
index 70b829787d..67071558d9 100644
--- a/cranelift/codegen/meta/src/isa/x86/settings.rs
+++ b/cranelift/codegen/meta/src/isa/x86/settings.rs
@@ -40,6 +40,12 @@ pub(crate) fn define(shared: &SettingGroup) -> SettingGroup {
         "AVX2: CPUID.07H:EBX.AVX2[bit 5]",
         false,
     );
+    let has_avx512bitalg = settings.add_bool(
+        "has_avx512bitalg",
+        "Has support for AVX512BITALG.",
+        "AVX512BITALG: CPUID.07H:ECX.AVX512BITALG[bit 12]",
+        false,
+    );
     let has_avx512dq = settings.add_bool(
         "has_avx512dq",
         "Has support for AVX512DQ.",
@@ -108,6 +114,10 @@ pub(crate) fn define(shared: &SettingGroup) -> SettingGroup {
     settings.add_predicate("use_avx_simd", predicate!(shared_enable_simd && has_avx));
     settings.add_predicate("use_avx2_simd", predicate!(shared_enable_simd && has_avx2));

+    settings.add_predicate(
+        "use_avx512bitalg_simd",
+        predicate!(shared_enable_simd && has_avx512bitalg),
+    );
     settings.add_predicate(
         "use_avx512dq_simd",
         predicate!(shared_enable_simd && has_avx512dq),
diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs
index 271159ea79..44e359d22e 100644
--- a/cranelift/codegen/src/isa/x64/inst/args.rs
+++ b/cranelift/codegen/src/isa/x64/inst/args.rs
@@ -460,9 +460,10 @@ pub(crate) enum InstructionSet {
     BMI1,
     #[allow(dead_code)] // never constructed (yet).
     BMI2,
+    AVX512BITALG,
+    AVX512DQ,
     AVX512F,
     AVX512VL,
-    AVX512DQ,
 }

 /// Some SSE operations requiring 2 operands r/m and r.
@@ -1003,6 +1004,7 @@ pub enum Avx512Opcode {
     Vcvtudq2ps,
     Vpabsq,
     Vpmullq,
+    Vpopcntb,
 }

 impl Avx512Opcode {
@@ -1014,6 +1016,9 @@ impl Avx512Opcode {
             }
             Avx512Opcode::Vpabsq => smallvec![InstructionSet::AVX512F, InstructionSet::AVX512VL],
             Avx512Opcode::Vpmullq => smallvec![InstructionSet::AVX512VL, InstructionSet::AVX512DQ],
+            Avx512Opcode::Vpopcntb => {
+                smallvec![InstructionSet::AVX512VL, InstructionSet::AVX512BITALG]
+            }
         }
     }
 }
@@ -1024,6 +1029,7 @@ impl fmt::Debug for Avx512Opcode {
             Avx512Opcode::Vcvtudq2ps => "vcvtudq2ps",
             Avx512Opcode::Vpabsq => "vpabsq",
             Avx512Opcode::Vpmullq => "vpmullq",
+            Avx512Opcode::Vpopcntb => "vpopcntb",
         };
         write!(fmt, "{}", name)
     }
diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs
index 5e94395797..b5c6c43c26 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -126,9 +126,10 @@ pub(crate) fn emit(
             InstructionSet::Lzcnt => info.isa_flags.use_lzcnt(),
             InstructionSet::BMI1 => info.isa_flags.use_bmi1(),
             InstructionSet::BMI2 => info.isa_flags.has_bmi2(),
+            InstructionSet::AVX512BITALG => info.isa_flags.has_avx512bitalg(),
             InstructionSet::AVX512F => info.isa_flags.has_avx512f(),
-            InstructionSet::AVX512VL => info.isa_flags.has_avx512vl(),
             InstructionSet::AVX512DQ => info.isa_flags.has_avx512dq(),
+            InstructionSet::AVX512VL => info.isa_flags.has_avx512vl(),
         }
     };

@@ -1409,8 +1410,9 @@ pub(crate) fn emit(

         Inst::XmmUnaryRmREvex { op, src, dst } => {
             let (prefix, map, w, opcode) = match op {
-                Avx512Opcode::Vpabsq => (LegacyPrefixes::_66, OpcodeMap::_0F38, true, 0x1f),
                 Avx512Opcode::Vcvtudq2ps => (LegacyPrefixes::_F2, OpcodeMap::_0F, false, 0x7a),
+                Avx512Opcode::Vpabsq => (LegacyPrefixes::_66, OpcodeMap::_0F38, true, 0x1f),
+                Avx512Opcode::Vpopcntb => (LegacyPrefixes::_66, OpcodeMap::_0F38, false, 0x54),
                 _ => unimplemented!("Opcode {:?} not implemented", op),
             };
             match src {
diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
index efe62b4cea..d08216612c 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
@@ -3895,6 +3895,12 @@ fn test_x64_emit() {
         "vcvtudq2ps %xmm2, %xmm8",
     ));

+    insns.push((
+        Inst::xmm_unary_rm_r_evex(Avx512Opcode::Vpopcntb, RegMem::reg(xmm2), w_xmm8),
+        "62727D0854C2",
+        "vpopcntb %xmm2, %xmm8",
+    ));
+
     // Xmm to int conversions, and conversely.

     insns.push((
@@ -4308,6 +4314,7 @@ fn test_x64_emit() {
     isa_flag_builder.enable("has_sse41").unwrap();
     isa_flag_builder.enable("has_avx512f").unwrap();
     isa_flag_builder.enable("has_avx512dq").unwrap();
+    isa_flag_builder.enable("has_avx512vl").unwrap();

     let isa_flags = x64::settings::Flags::new(&flags, isa_flag_builder);
     let rru = regs::create_reg_universe_systemv(&flags);
diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs
index 48395dc4ea..b87f243344 100644
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -3079,81 +3079,92 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     ));
                 }
             } else {
-                // For SIMD 4.4 we use Mula's algroithm (https://arxiv.org/pdf/1611.07612.pdf)
-                //
-                //__m128i count_bytes ( __m128i v) {
-                // __m128i lookup = _mm_setr_epi8(0 ,1 ,1 ,2 ,1 ,2 ,2 ,3 ,1 ,2 ,2 ,3 ,2 ,3 ,3 ,4) ;
-                // __m128i low_mask = _mm_set1_epi8 (0 x0f ) ;
-                // __m128i lo = _mm_and_si128 (v, low_mask ) ;
-                // __m128i hi = _mm_and_si128 (_mm_srli_epi16 (v, 4) , low_mask ) ;
-                // __m128i cnt1 = _mm_shuffle_epi8 (lookup , lo) ;
-                // __m128i cnt2 = _mm_shuffle_epi8 (lookup , hi) ;
-                // return _mm_add_epi8 (cnt1 , cnt2 ) ;
-                //}
-                //
-                // Details of the above algorithm can be found in the reference noted above, but the basics
-                // are to create a lookup table that pre populates the popcnt values for each number [0,15].
-                // The algorithm uses shifts to isolate 4 bit sections of the vector, pshufb as part of the
-                // lookup process, and adds together the results.
-
-                // Get input vector and destination
+                // Lower `popcount` for vectors.
                 let ty = ty.unwrap();
-                let lhs = put_input_in_reg(ctx, inputs[0]);
+                let src = put_input_in_reg(ctx, inputs[0]);
                 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
+
+                if isa_flags.use_avx512vl_simd() && isa_flags.use_avx512bitalg_simd() {
+                    // When AVX512VL and AVX512BITALG are available,
+                    // `popcnt.i8x16` can be lowered to a single instruction.
+                    assert_eq!(ty, types::I8X16);
+                    ctx.emit(Inst::xmm_unary_rm_r_evex(
+                        Avx512Opcode::Vpopcntb,
+                        RegMem::reg(src),
+                        dst,
+                    ));
+                } else {
+                    // For SIMD 4.4 we use Mula's algorithm (https://arxiv.org/pdf/1611.07612.pdf)
+                    //
+                    //__m128i count_bytes ( __m128i v) {
+                    // __m128i lookup = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
+                    // __m128i low_mask = _mm_set1_epi8 (0x0f);
+                    // __m128i lo = _mm_and_si128 (v, low_mask);
+                    // __m128i hi = _mm_and_si128 (_mm_srli_epi16 (v, 4), low_mask);
+                    // __m128i cnt1 = _mm_shuffle_epi8 (lookup, lo);
+                    // __m128i cnt2 = _mm_shuffle_epi8 (lookup, hi);
+                    // return _mm_add_epi8 (cnt1, cnt2);
+                    //}
+                    //
+                    // Details of the above algorithm can be found in the reference noted above, but the basics
+                    // are to create a lookup table that pre populates the popcnt values for each number [0,15].
+                    // The algorithm uses shifts to isolate 4 bit sections of the vector, pshufb as part of the
+                    // lookup process, and adds together the results.

-                // __m128i lookup = _mm_setr_epi8(0 ,1 ,1 ,2 ,1 ,2 ,2 ,3 ,1 ,2 ,2 ,3 ,2 ,3 ,3 ,4);
-                static POPCOUNT_4BIT: [u8; 16] = [
-                    0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03, 0x02,
-                    0x03, 0x03, 0x04,
-                ];
-                let lookup = ctx.use_constant(VCodeConstantData::WellKnown(&POPCOUNT_4BIT));
+                    // __m128i lookup = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
+                    static POPCOUNT_4BIT: [u8; 16] = [
+                        0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03,
+                        0x02, 0x03, 0x03, 0x04,
+                    ];
+                    let lookup = ctx.use_constant(VCodeConstantData::WellKnown(&POPCOUNT_4BIT));

-                // Create a mask for lower 4bits of each subword.
-                static LOW_MASK: [u8; 16] = [0x0F; 16];
-                let low_mask_const = ctx.use_constant(VCodeConstantData::WellKnown(&LOW_MASK));
-                let low_mask = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
-                ctx.emit(Inst::xmm_load_const(low_mask_const, low_mask, ty));
+                    // Create a mask for lower 4bits of each subword.
+                    static LOW_MASK: [u8; 16] = [0x0F; 16];
+                    let low_mask_const = ctx.use_constant(VCodeConstantData::WellKnown(&LOW_MASK));
+                    let low_mask = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
+                    ctx.emit(Inst::xmm_load_const(low_mask_const, low_mask, ty));

-                // __m128i lo = _mm_and_si128 (v, low_mask );
-                let lo = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
-                ctx.emit(Inst::gen_move(lo, low_mask.to_reg(), types::I8X16));
-                ctx.emit(Inst::xmm_rm_r(SseOpcode::Pand, RegMem::reg(lhs), lo));
+                    // __m128i lo = _mm_and_si128 (v, low_mask);
+                    let lo = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
+                    ctx.emit(Inst::gen_move(lo, low_mask.to_reg(), types::I8X16));
+                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Pand, RegMem::reg(src), lo));

-                // __m128i hi = _mm_and_si128 (_mm_srli_epi16 (v, 4) , low_mask ) ;
-                ctx.emit(Inst::gen_move(dst, lhs, ty));
-                ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrlw, RegMemImm::imm(4), dst));
-                let tmp = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
-                ctx.emit(Inst::gen_move(tmp, low_mask.to_reg(), types::I8X16));
-                ctx.emit(Inst::xmm_rm_r(
-                    SseOpcode::Pand,
-                    RegMem::reg(dst.to_reg()),
-                    tmp,
-                ));
+                    // __m128i hi = _mm_and_si128 (_mm_srli_epi16 (v, 4), low_mask);
+                    ctx.emit(Inst::gen_move(dst, src, ty));
+                    ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrlw, RegMemImm::imm(4), dst));
+                    let tmp = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
+                    ctx.emit(Inst::gen_move(tmp, low_mask.to_reg(), types::I8X16));
+                    ctx.emit(Inst::xmm_rm_r(
+                        SseOpcode::Pand,
+                        RegMem::reg(dst.to_reg()),
+                        tmp,
+                    ));

-                // __m128i cnt1 = _mm_shuffle_epi8 (lookup , lo) ;
-                let tmp2 = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
-                ctx.emit(Inst::xmm_load_const(lookup, tmp2, ty));
-                ctx.emit(Inst::gen_move(dst, tmp2.to_reg(), types::I8X16));
+                    // __m128i cnt1 = _mm_shuffle_epi8 (lookup, lo);
+                    let tmp2 = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
+                    ctx.emit(Inst::xmm_load_const(lookup, tmp2, ty));
+                    ctx.emit(Inst::gen_move(dst, tmp2.to_reg(), types::I8X16));

-                ctx.emit(Inst::xmm_rm_r(
-                    SseOpcode::Pshufb,
-                    RegMem::reg(lo.to_reg()),
-                    dst,
-                ));
+                    ctx.emit(Inst::xmm_rm_r(
+                        SseOpcode::Pshufb,
+                        RegMem::reg(lo.to_reg()),
+                        dst,
+                    ));

-                // __m128i cnt2 = _mm_shuffle_epi8 (lookup , hi) ;
-                ctx.emit(Inst::xmm_rm_r(
-                    SseOpcode::Pshufb,
-                    RegMem::reg(tmp.to_reg()),
-                    tmp2,
-                ));
+                    // __m128i cnt2 = _mm_shuffle_epi8 (lookup , hi) ;
+                    ctx.emit(Inst::xmm_rm_r(
+                        SseOpcode::Pshufb,
+                        RegMem::reg(tmp.to_reg()),
+                        tmp2,
+                    ));

-                // return _mm_add_epi8 (cnt1 , cnt2 ) ;
-                ctx.emit(Inst::xmm_rm_r(
-                    SseOpcode::Paddb,
-                    RegMem::reg(tmp2.to_reg()),
-                    dst,
-                ));
+                    // return _mm_add_epi8 (cnt1 , cnt2 ) ;
+                    ctx.emit(Inst::xmm_rm_r(
+                        SseOpcode::Paddb,
+                        RegMem::reg(tmp2.to_reg()),
+                        dst,
+                    ));
+                }
             }
         }

diff --git a/cranelift/native/src/lib.rs b/cranelift/native/src/lib.rs
index e425baeb8f..82b3e98eec 100644
--- a/cranelift/native/src/lib.rs
+++ b/cranelift/native/src/lib.rs
@@ -91,6 +91,9 @@ pub fn builder_with_options(
         if std::is_x86_feature_detected!("bmi2") {
             isa_builder.enable("has_bmi2").unwrap();
         }
+        if std::is_x86_feature_detected!("avx512bitalg") {
+            isa_builder.enable("has_avx512bitalg").unwrap();
+        }
         if std::is_x86_feature_detected!("avx512dq") {
             isa_builder.enable("has_avx512dq").unwrap();
         }
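
Reviewer note (not part of the patch): for anyone unfamiliar with the lookup trick in the SSE fallback, the scalar Rust sketch below shows the same per-byte computation. The lowering performs these steps across all 16 lanes at once with PAND, PSRLW, PSHUFB, and PADDB, while the AVX512VL+AVX512BITALG path collapses the whole sequence into a single vpopcntb. The helper name `popcnt_bytes` is made up for illustration only.

    // Nibble-lookup popcount: the bit count of a byte is the sum of the bit
    // counts of its low and high 4-bit halves, each read from a 16-entry table.
    fn popcnt_bytes(v: [u8; 16]) -> [u8; 16] {
        // Popcount of every 4-bit value 0..=15; mirrors the POPCOUNT_4BIT constant.
        const POPCOUNT_4BIT: [u8; 16] = [0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4];
        let mut out = [0u8; 16];
        for (dst, &byte) in out.iter_mut().zip(v.iter()) {
            let lo = (byte & 0x0f) as usize; // low nibble (PAND with LOW_MASK)
            let hi = (byte >> 4) as usize; // high nibble (PSRLW by 4, then PAND)
            *dst = POPCOUNT_4BIT[lo] + POPCOUNT_4BIT[hi]; // table lookups (PSHUFB), then PADDB
        }
        out
    }

    fn main() {
        let v = *b"0123456789abcdef";
        let counts = popcnt_bytes(v);
        assert_eq!(counts[0], b'0'.count_ones() as u8);
        println!("{:?}", counts);
    }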