x64: lower i8x16.shuffle to VPERMI2B when possible
When shuffling values from two different registers, the x64 lowering for `i8x16.shuffle` must first shuffle each register separately and then OR the results with SSE instructions. With `VPERMI2B`, available in AVX512VL + AVX512VBMI, this can be done in a single instruction after the shuffle mask has been moved into the destination register. This change uses `VPERMI2B` for that case when the CPU supports it.
This commit is contained in:
@@ -58,6 +58,12 @@ pub(crate) fn define(shared: &SettingGroup) -> SettingGroup {
|
|||||||
"AVX512VL: CPUID.07H:EBX.AVX512VL[bit 31]",
|
"AVX512VL: CPUID.07H:EBX.AVX512VL[bit 31]",
|
||||||
false,
|
false,
|
||||||
);
|
);
|
||||||
|
let has_avx512vbmi = settings.add_bool(
|
||||||
|
"has_avx512vbmi",
|
||||||
|
"Has support for AVX512VBMI.",
|
||||||
|
"AVX512VBMI: CPUID.07H:ECX.AVX512VBMI[bit 1]",
|
||||||
|
false,
|
||||||
|
);
|
||||||
let has_avx512f = settings.add_bool(
|
let has_avx512f = settings.add_bool(
|
||||||
"has_avx512f",
|
"has_avx512f",
|
||||||
"Has support for AVX512F.",
|
"Has support for AVX512F.",
|
||||||
@@ -126,6 +132,10 @@ pub(crate) fn define(shared: &SettingGroup) -> SettingGroup {
|
|||||||
"use_avx512vl_simd",
|
"use_avx512vl_simd",
|
||||||
predicate!(shared_enable_simd && has_avx512vl),
|
predicate!(shared_enable_simd && has_avx512vl),
|
||||||
);
|
);
|
||||||
|
settings.add_predicate(
|
||||||
|
"use_avx512vbmi_simd",
|
||||||
|
predicate!(shared_enable_simd && has_avx512vbmi),
|
||||||
|
);
|
||||||
settings.add_predicate(
|
settings.add_predicate(
|
||||||
"use_avx512f_simd",
|
"use_avx512f_simd",
|
||||||
predicate!(shared_enable_simd && has_avx512f),
|
predicate!(shared_enable_simd && has_avx512f),
|
||||||
|
|||||||
@@ -463,6 +463,7 @@ pub(crate) enum InstructionSet {
|
|||||||
AVX512BITALG,
|
AVX512BITALG,
|
||||||
AVX512DQ,
|
AVX512DQ,
|
||||||
AVX512F,
|
AVX512F,
|
||||||
|
AVX512VBMI,
|
||||||
AVX512VL,
|
AVX512VL,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -999,10 +1000,11 @@ impl fmt::Display for SseOpcode {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone)]
|
#[derive(Clone, PartialEq)]
|
||||||
pub enum Avx512Opcode {
|
pub enum Avx512Opcode {
|
||||||
Vcvtudq2ps,
|
Vcvtudq2ps,
|
||||||
Vpabsq,
|
Vpabsq,
|
||||||
|
Vpermi2b,
|
||||||
Vpmullq,
|
Vpmullq,
|
||||||
Vpopcntb,
|
Vpopcntb,
|
||||||
}
|
}
|
||||||
@@ -1015,6 +1017,9 @@ impl Avx512Opcode {
|
|||||||
smallvec![InstructionSet::AVX512F, InstructionSet::AVX512VL]
|
smallvec![InstructionSet::AVX512F, InstructionSet::AVX512VL]
|
||||||
}
|
}
|
||||||
Avx512Opcode::Vpabsq => smallvec![InstructionSet::AVX512F, InstructionSet::AVX512VL],
|
Avx512Opcode::Vpabsq => smallvec![InstructionSet::AVX512F, InstructionSet::AVX512VL],
|
||||||
|
Avx512Opcode::Vpermi2b => {
|
||||||
|
smallvec![InstructionSet::AVX512VL, InstructionSet::AVX512VBMI]
|
||||||
|
}
|
||||||
Avx512Opcode::Vpmullq => smallvec![InstructionSet::AVX512VL, InstructionSet::AVX512DQ],
|
Avx512Opcode::Vpmullq => smallvec![InstructionSet::AVX512VL, InstructionSet::AVX512DQ],
|
||||||
Avx512Opcode::Vpopcntb => {
|
Avx512Opcode::Vpopcntb => {
|
||||||
smallvec![InstructionSet::AVX512VL, InstructionSet::AVX512BITALG]
|
smallvec![InstructionSet::AVX512VL, InstructionSet::AVX512BITALG]
|
||||||
@@ -1028,6 +1033,7 @@ impl fmt::Debug for Avx512Opcode {
|
|||||||
let name = match self {
|
let name = match self {
|
||||||
Avx512Opcode::Vcvtudq2ps => "vcvtudq2ps",
|
Avx512Opcode::Vcvtudq2ps => "vcvtudq2ps",
|
||||||
Avx512Opcode::Vpabsq => "vpabsq",
|
Avx512Opcode::Vpabsq => "vpabsq",
|
||||||
|
Avx512Opcode::Vpermi2b => "vpermi2b",
|
||||||
Avx512Opcode::Vpmullq => "vpmullq",
|
Avx512Opcode::Vpmullq => "vpmullq",
|
||||||
Avx512Opcode::Vpopcntb => "vpopcntb",
|
Avx512Opcode::Vpopcntb => "vpopcntb",
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -127,8 +127,9 @@ pub(crate) fn emit(
|
|||||||
InstructionSet::BMI1 => info.isa_flags.use_bmi1(),
|
InstructionSet::BMI1 => info.isa_flags.use_bmi1(),
|
||||||
InstructionSet::BMI2 => info.isa_flags.has_bmi2(),
|
InstructionSet::BMI2 => info.isa_flags.has_bmi2(),
|
||||||
InstructionSet::AVX512BITALG => info.isa_flags.has_avx512bitalg(),
|
InstructionSet::AVX512BITALG => info.isa_flags.has_avx512bitalg(),
|
||||||
InstructionSet::AVX512F => info.isa_flags.has_avx512f(),
|
|
||||||
InstructionSet::AVX512DQ => info.isa_flags.has_avx512dq(),
|
InstructionSet::AVX512DQ => info.isa_flags.has_avx512dq(),
|
||||||
|
InstructionSet::AVX512F => info.isa_flags.has_avx512f(),
|
||||||
|
InstructionSet::AVX512VBMI => info.isa_flags.has_avx512vbmi(),
|
||||||
InstructionSet::AVX512VL => info.isa_flags.has_avx512vl(),
|
InstructionSet::AVX512VL => info.isa_flags.has_avx512vl(),
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@@ -1558,8 +1559,9 @@ pub(crate) fn emit(
|
|||||||
src2,
|
src2,
|
||||||
dst,
|
dst,
|
||||||
} => {
|
} => {
|
||||||
let opcode = match op {
|
let (w, opcode) = match op {
|
||||||
Avx512Opcode::Vpmullq => 0x40,
|
Avx512Opcode::Vpermi2b => (false, 0x75),
|
||||||
|
Avx512Opcode::Vpmullq => (true, 0x40),
|
||||||
_ => unimplemented!("Opcode {:?} not implemented", op),
|
_ => unimplemented!("Opcode {:?} not implemented", op),
|
||||||
};
|
};
|
||||||
match src1 {
|
match src1 {
|
||||||
@@ -1567,7 +1569,7 @@ pub(crate) fn emit(
|
|||||||
.length(EvexVectorLength::V128)
|
.length(EvexVectorLength::V128)
|
||||||
.prefix(LegacyPrefixes::_66)
|
.prefix(LegacyPrefixes::_66)
|
||||||
.map(OpcodeMap::_0F38)
|
.map(OpcodeMap::_0F38)
|
||||||
.w(true)
|
.w(w)
|
||||||
.opcode(opcode)
|
.opcode(opcode)
|
||||||
.reg(dst.to_reg().get_hw_encoding())
|
.reg(dst.to_reg().get_hw_encoding())
|
||||||
.rm(src.get_hw_encoding())
|
.rm(src.get_hw_encoding())
|
||||||
|
|||||||
@@ -3573,6 +3573,18 @@ fn test_x64_emit() {
|
|||||||
"vpmullq %xmm14, %xmm10, %xmm1",
|
"vpmullq %xmm14, %xmm10, %xmm1",
|
||||||
));
|
));
|
||||||
|
|
||||||
|
insns.push((
|
||||||
|
Inst::xmm_rm_r_evex(Avx512Opcode::Vpermi2b, RegMem::reg(xmm14), xmm10, w_xmm1),
|
||||||
|
"62D22D0875CE",
|
||||||
|
"vpermi2b %xmm14, %xmm10, %xmm1",
|
||||||
|
));
|
||||||
|
|
||||||
|
insns.push((
|
||||||
|
Inst::xmm_rm_r_evex(Avx512Opcode::Vpermi2b, RegMem::reg(xmm1), xmm0, w_xmm2),
|
||||||
|
"62F27D0875D1",
|
||||||
|
"vpermi2b %xmm1, %xmm0, %xmm2",
|
||||||
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Pmuludq, RegMem::reg(xmm8), w_xmm9),
|
Inst::xmm_rm_r(SseOpcode::Pmuludq, RegMem::reg(xmm8), w_xmm9),
|
||||||
"66450FF4C8",
|
"66450FF4C8",
|
||||||
@@ -4315,6 +4327,7 @@ fn test_x64_emit() {
|
|||||||
isa_flag_builder.enable("has_avx512f").unwrap();
|
isa_flag_builder.enable("has_avx512f").unwrap();
|
||||||
isa_flag_builder.enable("has_avx512dq").unwrap();
|
isa_flag_builder.enable("has_avx512dq").unwrap();
|
||||||
isa_flag_builder.enable("has_avx512vl").unwrap();
|
isa_flag_builder.enable("has_avx512vl").unwrap();
|
||||||
|
isa_flag_builder.enable("has_avx512vbmi").unwrap();
|
||||||
let isa_flags = x64::settings::Flags::new(&flags, isa_flag_builder);
|
let isa_flags = x64::settings::Flags::new(&flags, isa_flag_builder);
|
||||||
|
|
||||||
let rru = regs::create_reg_universe_systemv(&flags);
|
let rru = regs::create_reg_universe_systemv(&flags);
|
||||||
|
|||||||
@@ -1944,11 +1944,18 @@ fn x64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
Inst::XmmRmREvex {
|
Inst::XmmRmREvex {
|
||||||
src1, src2, dst, ..
|
op,
|
||||||
|
src1,
|
||||||
|
src2,
|
||||||
|
dst,
|
||||||
|
..
|
||||||
} => {
|
} => {
|
||||||
src1.get_regs_as_uses(collector);
|
src1.get_regs_as_uses(collector);
|
||||||
collector.add_use(*src2);
|
collector.add_use(*src2);
|
||||||
collector.add_def(*dst);
|
match *op {
|
||||||
|
Avx512Opcode::Vpermi2b => collector.add_mod(*dst),
|
||||||
|
_ => collector.add_def(*dst),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
Inst::XmmRmRImm { op, src, dst, .. } => {
|
Inst::XmmRmRImm { op, src, dst, .. } => {
|
||||||
if inst.produces_const() {
|
if inst.produces_const() {
|
||||||
@@ -2336,6 +2343,7 @@ fn x64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
Inst::XmmRmREvex {
|
Inst::XmmRmREvex {
|
||||||
|
op,
|
||||||
ref mut src1,
|
ref mut src1,
|
||||||
ref mut src2,
|
ref mut src2,
|
||||||
ref mut dst,
|
ref mut dst,
|
||||||
@@ -2343,7 +2351,10 @@ fn x64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
|
|||||||
} => {
|
} => {
|
||||||
src1.map_uses(mapper);
|
src1.map_uses(mapper);
|
||||||
map_use(mapper, src2);
|
map_use(mapper, src2);
|
||||||
map_def(mapper, dst);
|
match *op {
|
||||||
|
Avx512Opcode::Vpermi2b => map_mod(mapper, dst),
|
||||||
|
_ => map_def(mapper, dst),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
Inst::XmmRmiReg {
|
Inst::XmmRmiReg {
|
||||||
ref mut src,
|
ref mut src,
|
||||||
|
|||||||
@@ -5550,6 +5550,26 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
// shuffle the `dst` register (remember that, in this case, it is the same as
|
// shuffle the `dst` register (remember that, in this case, it is the same as
|
||||||
// `src` so we disregard this register).
|
// `src` so we disregard this register).
|
||||||
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp), dst));
|
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp), dst));
|
||||||
|
} else {
|
||||||
|
if isa_flags.use_avx512vl_simd() && isa_flags.use_avx512vbmi_simd() {
|
||||||
|
assert!(
|
||||||
|
mask.iter().all(|b| *b < 32),
|
||||||
|
"shuffle mask values must be between 0 and 31"
|
||||||
|
);
|
||||||
|
|
||||||
|
// Load the mask into the destination register.
|
||||||
|
let constant = ctx.use_constant(VCodeConstantData::Generated(mask.into()));
|
||||||
|
ctx.emit(Inst::xmm_load_const(constant, dst, ty));
|
||||||
|
|
||||||
|
// VPERMI2B has the exact semantics of Wasm's shuffle:
|
||||||
|
// permute the bytes in `src1` and `src2` using byte indexes
|
||||||
|
// in `dst` and store the byte results in `dst`.
|
||||||
|
ctx.emit(Inst::xmm_rm_r_evex(
|
||||||
|
Avx512Opcode::Vpermi2b,
|
||||||
|
RegMem::reg(rhs),
|
||||||
|
lhs,
|
||||||
|
dst,
|
||||||
|
));
|
||||||
} else {
|
} else {
|
||||||
// If `lhs` and `rhs` are different, we must shuffle each separately and then OR
|
// If `lhs` and `rhs` are different, we must shuffle each separately and then OR
|
||||||
// them together. This is necessary due to PSHUFB semantics. As in the case above,
|
// them together. This is necessary due to PSHUFB semantics. As in the case above,
|
||||||
@@ -5558,7 +5578,8 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
// PSHUFB the `lhs` argument into `tmp0`, placing zeroes for unused lanes.
|
// PSHUFB the `lhs` argument into `tmp0`, placing zeroes for unused lanes.
|
||||||
let tmp0 = ctx.alloc_tmp(lhs_ty).only_reg().unwrap();
|
let tmp0 = ctx.alloc_tmp(lhs_ty).only_reg().unwrap();
|
||||||
ctx.emit(Inst::gen_move(tmp0, lhs, lhs_ty));
|
ctx.emit(Inst::gen_move(tmp0, lhs, lhs_ty));
|
||||||
let constructed_mask = mask.iter().cloned().map(zero_unknown_lane_index).collect();
|
let constructed_mask =
|
||||||
|
mask.iter().cloned().map(zero_unknown_lane_index).collect();
|
||||||
let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask));
|
let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask));
|
||||||
let tmp1 = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
|
let tmp1 = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
|
||||||
ctx.emit(Inst::xmm_load_const(constant, tmp1, ty));
|
ctx.emit(Inst::xmm_load_const(constant, tmp1, ty));
|
||||||
@@ -5578,8 +5599,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
// OR the shuffled registers (the mechanism and lane-size for OR-ing the registers
|
// OR the shuffled registers (the mechanism and lane-size for OR-ing the registers
|
||||||
// is not important).
|
// is not important).
|
||||||
ctx.emit(Inst::xmm_rm_r(SseOpcode::Orps, RegMem::from(tmp0), dst));
|
ctx.emit(Inst::xmm_rm_r(SseOpcode::Orps, RegMem::from(tmp0), dst));
|
||||||
|
}
|
||||||
// TODO when AVX512 is enabled we should replace this sequence with a single VPERMB
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -97,11 +97,14 @@ pub fn builder_with_options(
|
|||||||
if std::is_x86_feature_detected!("avx512dq") {
|
if std::is_x86_feature_detected!("avx512dq") {
|
||||||
isa_builder.enable("has_avx512dq").unwrap();
|
isa_builder.enable("has_avx512dq").unwrap();
|
||||||
}
|
}
|
||||||
|
if std::is_x86_feature_detected!("avx512f") {
|
||||||
|
isa_builder.enable("has_avx512f").unwrap();
|
||||||
|
}
|
||||||
if std::is_x86_feature_detected!("avx512vl") {
|
if std::is_x86_feature_detected!("avx512vl") {
|
||||||
isa_builder.enable("has_avx512vl").unwrap();
|
isa_builder.enable("has_avx512vl").unwrap();
|
||||||
}
|
}
|
||||||
if std::is_x86_feature_detected!("avx512f") {
|
if std::is_x86_feature_detected!("avx512vbmi") {
|
||||||
isa_builder.enable("has_avx512f").unwrap();
|
isa_builder.enable("has_avx512vbmi").unwrap();
|
||||||
}
|
}
|
||||||
if std::is_x86_feature_detected!("lzcnt") {
|
if std::is_x86_feature_detected!("lzcnt") {
|
||||||
isa_builder.enable("has_lzcnt").unwrap();
|
isa_builder.enable("has_lzcnt").unwrap();
|
||||||
|
|||||||
Reference in New Issue
Block a user