x64: lower i8x16.shuffle to VPERMI2B when possible

When shuffling values from two different registers, the existing SSE-based
x64 lowering for `i8x16.shuffle` must shuffle each register separately (with
`PSHUFB`) and then OR the two results together. With `VPERMI2B`, available
when both AVX512VL and AVX512VBMI are present, the same shuffle takes a single
instruction once the shuffle mask has been loaded into the destination
register. This change uses `VPERMI2B` for that case when the CPU supports it.
This commit is contained in:
Andrew Brown
2021-05-24 10:06:33 -07:00
parent 51edea9e57
commit 2a9f458ea3
7 changed files with 100 additions and 35 deletions

View File

@@ -58,6 +58,12 @@ pub(crate) fn define(shared: &SettingGroup) -> SettingGroup {
"AVX512VL: CPUID.07H:EBX.AVX512VL[bit 31]", "AVX512VL: CPUID.07H:EBX.AVX512VL[bit 31]",
false, false,
); );
let has_avx512vbmi = settings.add_bool(
"has_avx512vbmi",
"Has support for AVX512VBMI.",
"AVX512VBMI: CPUID.07H:ECX.AVX512VBMI[bit 1]",
false,
);
let has_avx512f = settings.add_bool( let has_avx512f = settings.add_bool(
"has_avx512f", "has_avx512f",
"Has support for AVX512F.", "Has support for AVX512F.",
@@ -126,6 +132,10 @@ pub(crate) fn define(shared: &SettingGroup) -> SettingGroup {
"use_avx512vl_simd", "use_avx512vl_simd",
predicate!(shared_enable_simd && has_avx512vl), predicate!(shared_enable_simd && has_avx512vl),
); );
settings.add_predicate(
"use_avx512vbmi_simd",
predicate!(shared_enable_simd && has_avx512vbmi),
);
settings.add_predicate( settings.add_predicate(
"use_avx512f_simd", "use_avx512f_simd",
predicate!(shared_enable_simd && has_avx512f), predicate!(shared_enable_simd && has_avx512f),

View File

@@ -463,6 +463,7 @@ pub(crate) enum InstructionSet {
AVX512BITALG, AVX512BITALG,
AVX512DQ, AVX512DQ,
AVX512F, AVX512F,
AVX512VBMI,
AVX512VL, AVX512VL,
} }
@@ -999,10 +1000,11 @@ impl fmt::Display for SseOpcode {
} }
} }
#[derive(Clone)] #[derive(Clone, PartialEq)]
pub enum Avx512Opcode { pub enum Avx512Opcode {
Vcvtudq2ps, Vcvtudq2ps,
Vpabsq, Vpabsq,
Vpermi2b,
Vpmullq, Vpmullq,
Vpopcntb, Vpopcntb,
} }
@@ -1015,6 +1017,9 @@ impl Avx512Opcode {
smallvec![InstructionSet::AVX512F, InstructionSet::AVX512VL] smallvec![InstructionSet::AVX512F, InstructionSet::AVX512VL]
} }
Avx512Opcode::Vpabsq => smallvec![InstructionSet::AVX512F, InstructionSet::AVX512VL], Avx512Opcode::Vpabsq => smallvec![InstructionSet::AVX512F, InstructionSet::AVX512VL],
Avx512Opcode::Vpermi2b => {
smallvec![InstructionSet::AVX512VL, InstructionSet::AVX512VBMI]
}
Avx512Opcode::Vpmullq => smallvec![InstructionSet::AVX512VL, InstructionSet::AVX512DQ], Avx512Opcode::Vpmullq => smallvec![InstructionSet::AVX512VL, InstructionSet::AVX512DQ],
Avx512Opcode::Vpopcntb => { Avx512Opcode::Vpopcntb => {
smallvec![InstructionSet::AVX512VL, InstructionSet::AVX512BITALG] smallvec![InstructionSet::AVX512VL, InstructionSet::AVX512BITALG]
@@ -1028,6 +1033,7 @@ impl fmt::Debug for Avx512Opcode {
let name = match self { let name = match self {
Avx512Opcode::Vcvtudq2ps => "vcvtudq2ps", Avx512Opcode::Vcvtudq2ps => "vcvtudq2ps",
Avx512Opcode::Vpabsq => "vpabsq", Avx512Opcode::Vpabsq => "vpabsq",
Avx512Opcode::Vpermi2b => "vpermi2b",
Avx512Opcode::Vpmullq => "vpmullq", Avx512Opcode::Vpmullq => "vpmullq",
Avx512Opcode::Vpopcntb => "vpopcntb", Avx512Opcode::Vpopcntb => "vpopcntb",
}; };

View File

@@ -127,8 +127,9 @@ pub(crate) fn emit(
InstructionSet::BMI1 => info.isa_flags.use_bmi1(), InstructionSet::BMI1 => info.isa_flags.use_bmi1(),
InstructionSet::BMI2 => info.isa_flags.has_bmi2(), InstructionSet::BMI2 => info.isa_flags.has_bmi2(),
InstructionSet::AVX512BITALG => info.isa_flags.has_avx512bitalg(), InstructionSet::AVX512BITALG => info.isa_flags.has_avx512bitalg(),
InstructionSet::AVX512F => info.isa_flags.has_avx512f(),
InstructionSet::AVX512DQ => info.isa_flags.has_avx512dq(), InstructionSet::AVX512DQ => info.isa_flags.has_avx512dq(),
InstructionSet::AVX512F => info.isa_flags.has_avx512f(),
InstructionSet::AVX512VBMI => info.isa_flags.has_avx512vbmi(),
InstructionSet::AVX512VL => info.isa_flags.has_avx512vl(), InstructionSet::AVX512VL => info.isa_flags.has_avx512vl(),
} }
}; };
@@ -1558,8 +1559,9 @@ pub(crate) fn emit(
src2, src2,
dst, dst,
} => { } => {
let opcode = match op { let (w, opcode) = match op {
Avx512Opcode::Vpmullq => 0x40, Avx512Opcode::Vpermi2b => (false, 0x75),
Avx512Opcode::Vpmullq => (true, 0x40),
_ => unimplemented!("Opcode {:?} not implemented", op), _ => unimplemented!("Opcode {:?} not implemented", op),
}; };
match src1 { match src1 {
@@ -1567,7 +1569,7 @@ pub(crate) fn emit(
.length(EvexVectorLength::V128) .length(EvexVectorLength::V128)
.prefix(LegacyPrefixes::_66) .prefix(LegacyPrefixes::_66)
.map(OpcodeMap::_0F38) .map(OpcodeMap::_0F38)
.w(true) .w(w)
.opcode(opcode) .opcode(opcode)
.reg(dst.to_reg().get_hw_encoding()) .reg(dst.to_reg().get_hw_encoding())
.rm(src.get_hw_encoding()) .rm(src.get_hw_encoding())

View File

@@ -3573,6 +3573,18 @@ fn test_x64_emit() {
"vpmullq %xmm14, %xmm10, %xmm1", "vpmullq %xmm14, %xmm10, %xmm1",
)); ));
insns.push((
Inst::xmm_rm_r_evex(Avx512Opcode::Vpermi2b, RegMem::reg(xmm14), xmm10, w_xmm1),
"62D22D0875CE",
"vpermi2b %xmm14, %xmm10, %xmm1",
));
insns.push((
Inst::xmm_rm_r_evex(Avx512Opcode::Vpermi2b, RegMem::reg(xmm1), xmm0, w_xmm2),
"62F27D0875D1",
"vpermi2b %xmm1, %xmm0, %xmm2",
));
insns.push(( insns.push((
Inst::xmm_rm_r(SseOpcode::Pmuludq, RegMem::reg(xmm8), w_xmm9), Inst::xmm_rm_r(SseOpcode::Pmuludq, RegMem::reg(xmm8), w_xmm9),
"66450FF4C8", "66450FF4C8",
@@ -4315,6 +4327,7 @@ fn test_x64_emit() {
isa_flag_builder.enable("has_avx512f").unwrap(); isa_flag_builder.enable("has_avx512f").unwrap();
isa_flag_builder.enable("has_avx512dq").unwrap(); isa_flag_builder.enable("has_avx512dq").unwrap();
isa_flag_builder.enable("has_avx512vl").unwrap(); isa_flag_builder.enable("has_avx512vl").unwrap();
isa_flag_builder.enable("has_avx512vbmi").unwrap();
let isa_flags = x64::settings::Flags::new(&flags, isa_flag_builder); let isa_flags = x64::settings::Flags::new(&flags, isa_flag_builder);
let rru = regs::create_reg_universe_systemv(&flags); let rru = regs::create_reg_universe_systemv(&flags);

View File

@@ -1944,11 +1944,18 @@ fn x64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
} }
} }
Inst::XmmRmREvex { Inst::XmmRmREvex {
src1, src2, dst, .. op,
src1,
src2,
dst,
..
} => { } => {
src1.get_regs_as_uses(collector); src1.get_regs_as_uses(collector);
collector.add_use(*src2); collector.add_use(*src2);
collector.add_def(*dst); match *op {
Avx512Opcode::Vpermi2b => collector.add_mod(*dst),
_ => collector.add_def(*dst),
}
} }
Inst::XmmRmRImm { op, src, dst, .. } => { Inst::XmmRmRImm { op, src, dst, .. } => {
if inst.produces_const() { if inst.produces_const() {
@@ -2336,6 +2343,7 @@ fn x64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
} }
} }
Inst::XmmRmREvex { Inst::XmmRmREvex {
op,
ref mut src1, ref mut src1,
ref mut src2, ref mut src2,
ref mut dst, ref mut dst,
@@ -2343,7 +2351,10 @@ fn x64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
} => { } => {
src1.map_uses(mapper); src1.map_uses(mapper);
map_use(mapper, src2); map_use(mapper, src2);
map_def(mapper, dst); match *op {
Avx512Opcode::Vpermi2b => map_mod(mapper, dst),
_ => map_def(mapper, dst),
}
} }
Inst::XmmRmiReg { Inst::XmmRmiReg {
ref mut src, ref mut src,

View File

@@ -5550,6 +5550,26 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
// shuffle the `dst` register (remember that, in this case, it is the same as // shuffle the `dst` register (remember that, in this case, it is the same as
// `src` so we disregard this register). // `src` so we disregard this register).
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp), dst)); ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp), dst));
} else {
if isa_flags.use_avx512vl_simd() && isa_flags.use_avx512vbmi_simd() {
assert!(
mask.iter().all(|b| *b < 32),
"shuffle mask values must be between 0 and 31"
);
// Load the mask into the destination register.
let constant = ctx.use_constant(VCodeConstantData::Generated(mask.into()));
ctx.emit(Inst::xmm_load_const(constant, dst, ty));
// VPERMI2B has the exact semantics of Wasm's shuffle:
// permute the bytes in `src1` and `src2` using byte indexes
// in `dst` and store the byte results in `dst`.
ctx.emit(Inst::xmm_rm_r_evex(
Avx512Opcode::Vpermi2b,
RegMem::reg(rhs),
lhs,
dst,
));
} else { } else {
// If `lhs` and `rhs` are different, we must shuffle each separately and then OR // If `lhs` and `rhs` are different, we must shuffle each separately and then OR
// them together. This is necessary due to PSHUFB semantics. As in the case above, // them together. This is necessary due to PSHUFB semantics. As in the case above,
@@ -5558,7 +5578,8 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
// PSHUFB the `lhs` argument into `tmp0`, placing zeroes for unused lanes. // PSHUFB the `lhs` argument into `tmp0`, placing zeroes for unused lanes.
let tmp0 = ctx.alloc_tmp(lhs_ty).only_reg().unwrap(); let tmp0 = ctx.alloc_tmp(lhs_ty).only_reg().unwrap();
ctx.emit(Inst::gen_move(tmp0, lhs, lhs_ty)); ctx.emit(Inst::gen_move(tmp0, lhs, lhs_ty));
let constructed_mask = mask.iter().cloned().map(zero_unknown_lane_index).collect(); let constructed_mask =
mask.iter().cloned().map(zero_unknown_lane_index).collect();
let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask)); let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask));
let tmp1 = ctx.alloc_tmp(types::I8X16).only_reg().unwrap(); let tmp1 = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
ctx.emit(Inst::xmm_load_const(constant, tmp1, ty)); ctx.emit(Inst::xmm_load_const(constant, tmp1, ty));
@@ -5578,8 +5599,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
// OR the shuffled registers (the mechanism and lane-size for OR-ing the registers // OR the shuffled registers (the mechanism and lane-size for OR-ing the registers
// is not important). // is not important).
ctx.emit(Inst::xmm_rm_r(SseOpcode::Orps, RegMem::from(tmp0), dst)); ctx.emit(Inst::xmm_rm_r(SseOpcode::Orps, RegMem::from(tmp0), dst));
}
// TODO when AVX512 is enabled we should replace this sequence with a single VPERMB
} }
} }

View File

@@ -97,11 +97,14 @@ pub fn builder_with_options(
if std::is_x86_feature_detected!("avx512dq") { if std::is_x86_feature_detected!("avx512dq") {
isa_builder.enable("has_avx512dq").unwrap(); isa_builder.enable("has_avx512dq").unwrap();
} }
if std::is_x86_feature_detected!("avx512f") {
isa_builder.enable("has_avx512f").unwrap();
}
if std::is_x86_feature_detected!("avx512vl") { if std::is_x86_feature_detected!("avx512vl") {
isa_builder.enable("has_avx512vl").unwrap(); isa_builder.enable("has_avx512vl").unwrap();
} }
if std::is_x86_feature_detected!("avx512f") { if std::is_x86_feature_detected!("avx512vbmi") {
isa_builder.enable("has_avx512f").unwrap(); isa_builder.enable("has_avx512vbmi").unwrap();
} }
if std::is_x86_feature_detected!("lzcnt") { if std::is_x86_feature_detected!("lzcnt") {
isa_builder.enable("has_lzcnt").unwrap(); isa_builder.enable("has_lzcnt").unwrap();