x64: lower i8x16.shuffle to VPERMI2B when possible
When shuffling values from two different registers, the x64 lowering for `i8x16.shuffle` must first shuffle each register separately and then OR the results with SSE instructions. With `VPERMI2B`, available in AVX512VL + AVX512VBMI, this can be done in a single instruction after the shuffle mask has been moved into the destination register. This change uses `VPERMI2B` for that case when the CPU supports it.
This commit is contained in:
@@ -58,6 +58,12 @@ pub(crate) fn define(shared: &SettingGroup) -> SettingGroup {
|
||||
"AVX512VL: CPUID.07H:EBX.AVX512VL[bit 31]",
|
||||
false,
|
||||
);
|
||||
let has_avx512vbmi = settings.add_bool(
|
||||
"has_avx512vbmi",
|
||||
"Has support for AVX512VBMI.",
|
||||
"AVX512VBMI: CPUID.07H:ECX.AVX512VBMI[bit 1]",
|
||||
false,
|
||||
);
|
||||
let has_avx512f = settings.add_bool(
|
||||
"has_avx512f",
|
||||
"Has support for AVX512F.",
|
||||
@@ -126,6 +132,10 @@ pub(crate) fn define(shared: &SettingGroup) -> SettingGroup {
|
||||
"use_avx512vl_simd",
|
||||
predicate!(shared_enable_simd && has_avx512vl),
|
||||
);
|
||||
settings.add_predicate(
|
||||
"use_avx512vbmi_simd",
|
||||
predicate!(shared_enable_simd && has_avx512vbmi),
|
||||
);
|
||||
settings.add_predicate(
|
||||
"use_avx512f_simd",
|
||||
predicate!(shared_enable_simd && has_avx512f),
|
||||
|
||||
@@ -463,6 +463,7 @@ pub(crate) enum InstructionSet {
|
||||
AVX512BITALG,
|
||||
AVX512DQ,
|
||||
AVX512F,
|
||||
AVX512VBMI,
|
||||
AVX512VL,
|
||||
}
|
||||
|
||||
@@ -999,10 +1000,11 @@ impl fmt::Display for SseOpcode {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
#[derive(Clone, PartialEq)]
|
||||
pub enum Avx512Opcode {
|
||||
Vcvtudq2ps,
|
||||
Vpabsq,
|
||||
Vpermi2b,
|
||||
Vpmullq,
|
||||
Vpopcntb,
|
||||
}
|
||||
@@ -1015,6 +1017,9 @@ impl Avx512Opcode {
|
||||
smallvec![InstructionSet::AVX512F, InstructionSet::AVX512VL]
|
||||
}
|
||||
Avx512Opcode::Vpabsq => smallvec![InstructionSet::AVX512F, InstructionSet::AVX512VL],
|
||||
Avx512Opcode::Vpermi2b => {
|
||||
smallvec![InstructionSet::AVX512VL, InstructionSet::AVX512VBMI]
|
||||
}
|
||||
Avx512Opcode::Vpmullq => smallvec![InstructionSet::AVX512VL, InstructionSet::AVX512DQ],
|
||||
Avx512Opcode::Vpopcntb => {
|
||||
smallvec![InstructionSet::AVX512VL, InstructionSet::AVX512BITALG]
|
||||
@@ -1028,6 +1033,7 @@ impl fmt::Debug for Avx512Opcode {
|
||||
let name = match self {
|
||||
Avx512Opcode::Vcvtudq2ps => "vcvtudq2ps",
|
||||
Avx512Opcode::Vpabsq => "vpabsq",
|
||||
Avx512Opcode::Vpermi2b => "vpermi2b",
|
||||
Avx512Opcode::Vpmullq => "vpmullq",
|
||||
Avx512Opcode::Vpopcntb => "vpopcntb",
|
||||
};
|
||||
|
||||
@@ -127,8 +127,9 @@ pub(crate) fn emit(
|
||||
InstructionSet::BMI1 => info.isa_flags.use_bmi1(),
|
||||
InstructionSet::BMI2 => info.isa_flags.has_bmi2(),
|
||||
InstructionSet::AVX512BITALG => info.isa_flags.has_avx512bitalg(),
|
||||
InstructionSet::AVX512F => info.isa_flags.has_avx512f(),
|
||||
InstructionSet::AVX512DQ => info.isa_flags.has_avx512dq(),
|
||||
InstructionSet::AVX512F => info.isa_flags.has_avx512f(),
|
||||
InstructionSet::AVX512VBMI => info.isa_flags.has_avx512vbmi(),
|
||||
InstructionSet::AVX512VL => info.isa_flags.has_avx512vl(),
|
||||
}
|
||||
};
|
||||
@@ -1558,8 +1559,9 @@ pub(crate) fn emit(
|
||||
src2,
|
||||
dst,
|
||||
} => {
|
||||
let opcode = match op {
|
||||
Avx512Opcode::Vpmullq => 0x40,
|
||||
let (w, opcode) = match op {
|
||||
Avx512Opcode::Vpermi2b => (false, 0x75),
|
||||
Avx512Opcode::Vpmullq => (true, 0x40),
|
||||
_ => unimplemented!("Opcode {:?} not implemented", op),
|
||||
};
|
||||
match src1 {
|
||||
@@ -1567,7 +1569,7 @@ pub(crate) fn emit(
|
||||
.length(EvexVectorLength::V128)
|
||||
.prefix(LegacyPrefixes::_66)
|
||||
.map(OpcodeMap::_0F38)
|
||||
.w(true)
|
||||
.w(w)
|
||||
.opcode(opcode)
|
||||
.reg(dst.to_reg().get_hw_encoding())
|
||||
.rm(src.get_hw_encoding())
|
||||
|
||||
@@ -3573,6 +3573,18 @@ fn test_x64_emit() {
|
||||
"vpmullq %xmm14, %xmm10, %xmm1",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::xmm_rm_r_evex(Avx512Opcode::Vpermi2b, RegMem::reg(xmm14), xmm10, w_xmm1),
|
||||
"62D22D0875CE",
|
||||
"vpermi2b %xmm14, %xmm10, %xmm1",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::xmm_rm_r_evex(Avx512Opcode::Vpermi2b, RegMem::reg(xmm1), xmm0, w_xmm2),
|
||||
"62F27D0875D1",
|
||||
"vpermi2b %xmm1, %xmm0, %xmm2",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::xmm_rm_r(SseOpcode::Pmuludq, RegMem::reg(xmm8), w_xmm9),
|
||||
"66450FF4C8",
|
||||
@@ -4315,6 +4327,7 @@ fn test_x64_emit() {
|
||||
isa_flag_builder.enable("has_avx512f").unwrap();
|
||||
isa_flag_builder.enable("has_avx512dq").unwrap();
|
||||
isa_flag_builder.enable("has_avx512vl").unwrap();
|
||||
isa_flag_builder.enable("has_avx512vbmi").unwrap();
|
||||
let isa_flags = x64::settings::Flags::new(&flags, isa_flag_builder);
|
||||
|
||||
let rru = regs::create_reg_universe_systemv(&flags);
|
||||
|
||||
@@ -1944,11 +1944,18 @@ fn x64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
|
||||
}
|
||||
}
|
||||
Inst::XmmRmREvex {
|
||||
src1, src2, dst, ..
|
||||
op,
|
||||
src1,
|
||||
src2,
|
||||
dst,
|
||||
..
|
||||
} => {
|
||||
src1.get_regs_as_uses(collector);
|
||||
collector.add_use(*src2);
|
||||
collector.add_def(*dst);
|
||||
match *op {
|
||||
Avx512Opcode::Vpermi2b => collector.add_mod(*dst),
|
||||
_ => collector.add_def(*dst),
|
||||
}
|
||||
}
|
||||
Inst::XmmRmRImm { op, src, dst, .. } => {
|
||||
if inst.produces_const() {
|
||||
@@ -2336,6 +2343,7 @@ fn x64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
|
||||
}
|
||||
}
|
||||
Inst::XmmRmREvex {
|
||||
op,
|
||||
ref mut src1,
|
||||
ref mut src2,
|
||||
ref mut dst,
|
||||
@@ -2343,7 +2351,10 @@ fn x64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
|
||||
} => {
|
||||
src1.map_uses(mapper);
|
||||
map_use(mapper, src2);
|
||||
map_def(mapper, dst);
|
||||
match *op {
|
||||
Avx512Opcode::Vpermi2b => map_mod(mapper, dst),
|
||||
_ => map_def(mapper, dst),
|
||||
}
|
||||
}
|
||||
Inst::XmmRmiReg {
|
||||
ref mut src,
|
||||
|
||||
@@ -5550,6 +5550,26 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
||||
// shuffle the `dst` register (remember that, in this case, it is the same as
|
||||
// `src` so we disregard this register).
|
||||
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp), dst));
|
||||
} else {
|
||||
if isa_flags.use_avx512vl_simd() && isa_flags.use_avx512vbmi_simd() {
|
||||
assert!(
|
||||
mask.iter().all(|b| *b < 32),
|
||||
"shuffle mask values must be between 0 and 31"
|
||||
);
|
||||
|
||||
// Load the mask into the destination register.
|
||||
let constant = ctx.use_constant(VCodeConstantData::Generated(mask.into()));
|
||||
ctx.emit(Inst::xmm_load_const(constant, dst, ty));
|
||||
|
||||
// VPERMI2B has the exact semantics of Wasm's shuffle:
|
||||
// permute the bytes in `src1` and `src2` using byte indexes
|
||||
// in `dst` and store the byte results in `dst`.
|
||||
ctx.emit(Inst::xmm_rm_r_evex(
|
||||
Avx512Opcode::Vpermi2b,
|
||||
RegMem::reg(rhs),
|
||||
lhs,
|
||||
dst,
|
||||
));
|
||||
} else {
|
||||
// If `lhs` and `rhs` are different, we must shuffle each separately and then OR
|
||||
// them together. This is necessary due to PSHUFB semantics. As in the case above,
|
||||
@@ -5558,7 +5578,8 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
||||
// PSHUFB the `lhs` argument into `tmp0`, placing zeroes for unused lanes.
|
||||
let tmp0 = ctx.alloc_tmp(lhs_ty).only_reg().unwrap();
|
||||
ctx.emit(Inst::gen_move(tmp0, lhs, lhs_ty));
|
||||
let constructed_mask = mask.iter().cloned().map(zero_unknown_lane_index).collect();
|
||||
let constructed_mask =
|
||||
mask.iter().cloned().map(zero_unknown_lane_index).collect();
|
||||
let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask));
|
||||
let tmp1 = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
|
||||
ctx.emit(Inst::xmm_load_const(constant, tmp1, ty));
|
||||
@@ -5578,8 +5599,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
||||
// OR the shuffled registers (the mechanism and lane-size for OR-ing the registers
|
||||
// is not important).
|
||||
ctx.emit(Inst::xmm_rm_r(SseOpcode::Orps, RegMem::from(tmp0), dst));
|
||||
|
||||
// TODO when AVX512 is enabled we should replace this sequence with a single VPERMB
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -97,11 +97,14 @@ pub fn builder_with_options(
|
||||
if std::is_x86_feature_detected!("avx512dq") {
|
||||
isa_builder.enable("has_avx512dq").unwrap();
|
||||
}
|
||||
if std::is_x86_feature_detected!("avx512f") {
|
||||
isa_builder.enable("has_avx512f").unwrap();
|
||||
}
|
||||
if std::is_x86_feature_detected!("avx512vl") {
|
||||
isa_builder.enable("has_avx512vl").unwrap();
|
||||
}
|
||||
if std::is_x86_feature_detected!("avx512f") {
|
||||
isa_builder.enable("has_avx512f").unwrap();
|
||||
if std::is_x86_feature_detected!("avx512vbmi") {
|
||||
isa_builder.enable("has_avx512vbmi").unwrap();
|
||||
}
|
||||
if std::is_x86_feature_detected!("lzcnt") {
|
||||
isa_builder.enable("has_lzcnt").unwrap();
|
||||
|
||||
Reference in New Issue
Block a user