x64: lower i8x16.shuffle to VPERMI2B when possible

When shuffling values from two different registers, the x64 lowering for
`i8x16.shuffle` must first shuffle each register separately and then OR
the results with SSE instructions. With `VPERMI2B`, available in
AVX512VL + AVX512VBMI, this can be done in a single instruction after
the shuffle mask has been moved into the destination register. This
change uses `VPERMI2B` for that case when the CPU supports it.
This commit is contained in:
Andrew Brown
2021-05-24 10:06:33 -07:00
parent 51edea9e57
commit 2a9f458ea3
7 changed files with 100 additions and 35 deletions

View File

@@ -5551,35 +5551,55 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
// `src` so we disregard this register).
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp), dst));
} else {
// If `lhs` and `rhs` are different, we must shuffle each separately and then OR
// them together. This is necessary due to PSHUFB semantics. As in the case above,
// we build the `constructed_mask` for each case statically.
if isa_flags.use_avx512vl_simd() && isa_flags.use_avx512vbmi_simd() {
assert!(
mask.iter().all(|b| *b < 32),
"shuffle mask values must be between 0 and 31"
);
// PSHUFB the `lhs` argument into `tmp0`, placing zeroes for unused lanes.
let tmp0 = ctx.alloc_tmp(lhs_ty).only_reg().unwrap();
ctx.emit(Inst::gen_move(tmp0, lhs, lhs_ty));
let constructed_mask = mask.iter().cloned().map(zero_unknown_lane_index).collect();
let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask));
let tmp1 = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
ctx.emit(Inst::xmm_load_const(constant, tmp1, ty));
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp1), tmp0));
// Load the mask into the destination register.
let constant = ctx.use_constant(VCodeConstantData::Generated(mask.into()));
ctx.emit(Inst::xmm_load_const(constant, dst, ty));
// PSHUFB the second argument, placing zeroes for unused lanes.
let constructed_mask = mask
.iter()
.map(|b| b.wrapping_sub(16))
.map(zero_unknown_lane_index)
.collect();
let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask));
let tmp2 = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
ctx.emit(Inst::xmm_load_const(constant, tmp2, ty));
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp2), dst));
// VPERMI2B has the exact semantics of Wasm's shuffle:
// permute the bytes in `src1` and `src2` using byte indexes
// in `dst` and store the byte results in `dst`.
ctx.emit(Inst::xmm_rm_r_evex(
Avx512Opcode::Vpermi2b,
RegMem::reg(rhs),
lhs,
dst,
));
} else {
// If `lhs` and `rhs` are different, we must shuffle each separately and then OR
// them together. This is necessary due to PSHUFB semantics. As in the case above,
// we build the `constructed_mask` for each case statically.
// OR the shuffled registers (the mechanism and lane-size for OR-ing the registers
// is not important).
ctx.emit(Inst::xmm_rm_r(SseOpcode::Orps, RegMem::from(tmp0), dst));
// PSHUFB the `lhs` argument into `tmp0`, placing zeroes for unused lanes.
let tmp0 = ctx.alloc_tmp(lhs_ty).only_reg().unwrap();
ctx.emit(Inst::gen_move(tmp0, lhs, lhs_ty));
let constructed_mask =
mask.iter().cloned().map(zero_unknown_lane_index).collect();
let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask));
let tmp1 = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
ctx.emit(Inst::xmm_load_const(constant, tmp1, ty));
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp1), tmp0));
// TODO when AVX512 is enabled we should replace this sequence with a single VPERMB
// PSHUFB the second argument, placing zeroes for unused lanes.
let constructed_mask = mask
.iter()
.map(|b| b.wrapping_sub(16))
.map(zero_unknown_lane_index)
.collect();
let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask));
let tmp2 = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
ctx.emit(Inst::xmm_load_const(constant, tmp2, ty));
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp2), dst));
// OR the shuffled registers (the mechanism and lane-size for OR-ing the registers
// is not important).
ctx.emit(Inst::xmm_rm_r(SseOpcode::Orps, RegMem::from(tmp0), dst));
}
}
}