x64: lower i8x16.shuffle to VPERMI2B when possible

When shuffling values from two different registers, the SSE-only x64
lowering for `i8x16.shuffle` must first shuffle each register separately
and then OR the results together. With `VPERMI2B`, available when the
CPU supports both AVX512VL and AVX512VBMI, the same shuffle takes a
single instruction once the shuffle mask has been moved into the
destination register. This change lowers to `VPERMI2B` in that case
whenever the CPU supports it.
This commit is contained in:
Andrew Brown
2021-05-24 10:06:33 -07:00
parent 51edea9e57
commit 2a9f458ea3
7 changed files with 100 additions and 35 deletions

View File

@@ -127,8 +127,9 @@ pub(crate) fn emit(
InstructionSet::BMI1 => info.isa_flags.use_bmi1(),
InstructionSet::BMI2 => info.isa_flags.has_bmi2(),
InstructionSet::AVX512BITALG => info.isa_flags.has_avx512bitalg(),
InstructionSet::AVX512F => info.isa_flags.has_avx512f(),
InstructionSet::AVX512DQ => info.isa_flags.has_avx512dq(),
InstructionSet::AVX512F => info.isa_flags.has_avx512f(),
InstructionSet::AVX512VBMI => info.isa_flags.has_avx512vbmi(),
InstructionSet::AVX512VL => info.isa_flags.has_avx512vl(),
}
};
@@ -1558,8 +1559,9 @@ pub(crate) fn emit(
src2,
dst,
} => {
let opcode = match op {
Avx512Opcode::Vpmullq => 0x40,
let (w, opcode) = match op {
Avx512Opcode::Vpermi2b => (false, 0x75),
Avx512Opcode::Vpmullq => (true, 0x40),
_ => unimplemented!("Opcode {:?} not implemented", op),
};
match src1 {
@@ -1567,7 +1569,7 @@ pub(crate) fn emit(
.length(EvexVectorLength::V128)
.prefix(LegacyPrefixes::_66)
.map(OpcodeMap::_0F38)
.w(true)
.w(w)
.opcode(opcode)
.reg(dst.to_reg().get_hw_encoding())
.rm(src.get_hw_encoding())