x64: lower i8x16.shuffle to VPERMI2B when possible

When shuffling values from two different registers using only SSE instructions, the x64 lowering for `i8x16.shuffle` must first shuffle each register separately and then OR the results together. With `VPERMI2B`, which is available when both AVX512VL and AVX512VBMI are supported, the same shuffle can be done in a single instruction once the shuffle mask has been moved into the destination register. This change lowers to `VPERMI2B` in that case when the CPU supports it.
2021-05-24 10:06:33 -07:00
parent 51edea9e57
commit 2a9f458ea3
7 changed files with 100 additions and 35 deletions
--- a/cranelift/codegen/src/isa/x64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -127,8 +127,9 @@ pub(crate) fn emit(
            InstructionSet::BMI1 => info.isa_flags.use_bmi1(),
            InstructionSet::BMI2 => info.isa_flags.has_bmi2(),
            InstructionSet::AVX512BITALG => info.isa_flags.has_avx512bitalg(),
-            InstructionSet::AVX512F => info.isa_flags.has_avx512f(),
            InstructionSet::AVX512DQ => info.isa_flags.has_avx512dq(),
+            InstructionSet::AVX512F => info.isa_flags.has_avx512f(),
+            InstructionSet::AVX512VBMI => info.isa_flags.has_avx512vbmi(),
            InstructionSet::AVX512VL => info.isa_flags.has_avx512vl(),
        }
    };
@@ -1558,8 +1559,9 @@ pub(crate) fn emit(
            src2,
            dst,
        } => {
-            let opcode = match op {
-                Avx512Opcode::Vpmullq => 0x40,
+            let (w, opcode) = match op {
+                Avx512Opcode::Vpermi2b => (false, 0x75),
+                Avx512Opcode::Vpmullq => (true, 0x40),
                _ => unimplemented!("Opcode {:?} not implemented", op),
            };
            match src1 {
@@ -1567,7 +1569,7 @@ pub(crate) fn emit(
                    .length(EvexVectorLength::V128)
                    .prefix(LegacyPrefixes::_66)
                    .map(OpcodeMap::_0F38)
-                    .w(true)
+                    .w(w)
                    .opcode(opcode)
                    .reg(dst.to_reg().get_hw_encoding())
                    .rm(src.get_hw_encoding())