[machinst x64]: add shuffle implementation

This commit is contained in:
Andrew Brown
2020-09-23 08:39:33 -07:00
parent f4836f9ca9
commit a64abf9b76
4 changed files with 79 additions and 1 deletions

View File

@@ -425,6 +425,7 @@ pub enum SseOpcode {
Pmulld,
Pmullw,
Pmuludq,
Pshufb,
Pshufd,
Psllw,
Pslld,
@@ -557,7 +558,7 @@ impl SseOpcode {
| SseOpcode::Ucomisd
| SseOpcode::Xorpd => SSE2,
SseOpcode::Pabsb | SseOpcode::Pabsw | SseOpcode::Pabsd => SSSE3,
SseOpcode::Pabsb | SseOpcode::Pabsw | SseOpcode::Pabsd | SseOpcode::Pshufb => SSSE3,
SseOpcode::Insertps
| SseOpcode::Pextrb
@@ -672,6 +673,7 @@ impl fmt::Debug for SseOpcode {
SseOpcode::Pmulld => "pmulld",
SseOpcode::Pmullw => "pmullw",
SseOpcode::Pmuludq => "pmuludq",
SseOpcode::Pshufb => "pshufb",
SseOpcode::Pshufd => "pshufd",
SseOpcode::Psllw => "psllw",
SseOpcode::Pslld => "pslld",

View File

@@ -1797,6 +1797,7 @@ pub(crate) fn emit(
SseOpcode::Pmulld => (LegacyPrefixes::_66, 0x0F3840, 3),
SseOpcode::Pmullw => (LegacyPrefixes::_66, 0x0FD5, 2),
SseOpcode::Pmuludq => (LegacyPrefixes::_66, 0x0FF4, 2),
SseOpcode::Pshufb => (LegacyPrefixes::_66, 0x0F3800, 3),
SseOpcode::Psubb => (LegacyPrefixes::_66, 0x0FF8, 2),
SseOpcode::Psubd => (LegacyPrefixes::_66, 0x0FFA, 2),
SseOpcode::Psubq => (LegacyPrefixes::_66, 0x0FFB, 2),

View File

@@ -3243,6 +3243,12 @@ fn test_x64_emit() {
"pxor %xmm11, %xmm2",
));
insns.push((
Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::reg(xmm11), w_xmm2),
"66410F3800D3",
"pshufb %xmm11, %xmm2",
));
// XMM_Mov_R_M: float stores
insns.push((
Inst::xmm_mov_r_m(SseOpcode::Movss, xmm15, Amode::imm_reg(128, r12), None),

View File

@@ -2640,6 +2640,75 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
ctx.emit(Inst::gen_move(dst, src, ty));
}
// Lower `shuffle` to PSHUFB. The lowering below treats each byte of the immediate mask as a
// lane index into the concatenation of the two inputs: 0-15 select a byte of `lhs` (input 0),
// 16-31 select a byte of `rhs` (input 1), and any other value produces a zeroed lane. PSHUFB
// reads from a single source register, so the two-operand case shuffles each input separately
// with its own constant mask and ORs the results together.
Opcode::Shuffle => {
let ty = ty.unwrap();
let dst = get_output_reg(ctx, outputs[0]);
let lhs_ty = ctx.input_ty(insn, 0);
let lhs = put_input_in_reg(ctx, inputs[0]);
let rhs = put_input_in_reg(ctx, inputs[1]);
// Fetch the shuffle mask bytes from the instruction's immediate; the copy is owned so it
// can be iterated more than once below.
let mask = if let &InstructionData::Shuffle { mask, .. } = ctx.data(insn) {
ctx.get_immediate(mask).clone()
} else {
unreachable!("shuffle should always have the shuffle format")
};
// A mask-building helper: in 128-bit SIMD, 0-15 indicate which lane to read from and a
// 1 in the most significant position zeroes the lane.
let zero_unknown_lane_index = |b: u8| if b > 15 { 0b10000000 } else { b };
// `dst` starts out as a copy of `rhs`; every PSHUFB that targets `dst` below therefore
// shuffles the bytes of `rhs` in place.
ctx.emit(Inst::gen_move(dst, rhs, ty));
if rhs == lhs {
// If `lhs` and `rhs` are the same we can use a single PSHUFB to shuffle the XMM
// register. We statically build `constructed_mask` to zero out any unknown lane
// indices (may not be completely necessary: verification could fail incorrect mask
// values) and fix the indexes to all point to the `dst` vector.
let constructed_mask = mask
.iter()
// An index greater than 15 may still refer to a lane of the second operand (16-31);
// since both operands are the same register, rebase it to 0-15. Indices of 32 and
// above remain greater than 15 after the subtraction and are zeroed by the next step.
.map(|&b| if b > 15 { b.wrapping_sub(16) } else { b })
.map(zero_unknown_lane_index)
.collect();
let tmp = ctx.alloc_tmp(RegClass::V128, types::I8X16);
ctx.emit(Inst::xmm_load_const_seq(constructed_mask, tmp, ty));
// After loading the constructed mask in a temporary register, we use this to
// shuffle the `dst` register (remember that, in this case, `dst` holds the value shared
// by `lhs` and `rhs`, so neither input register needs to be read again).
let tmp = RegMem::reg(tmp.to_reg());
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, tmp, dst));
} else {
// If `lhs` and `rhs` are different, we must shuffle each separately and then OR
// them together. This is necessary due to PSHUFB semantics. As in the case above,
// we build the `constructed_mask` for each case statically.
// PSHUFB the `lhs` argument into `tmp0`, placing zeroes for unused lanes.
// `tmp0` is a scratch copy of `lhs` so the input register itself is not overwritten.
let tmp0 = ctx.alloc_tmp(RegClass::V128, lhs_ty);
ctx.emit(Inst::gen_move(tmp0, lhs, lhs_ty));
// Indices 0-15 are kept as-is; everything else (lanes taken from `rhs`, or out of range)
// becomes a zeroing index for this first shuffle.
let constructed_mask = mask.iter().cloned().map(zero_unknown_lane_index).collect();
let tmp1 = ctx.alloc_tmp(RegClass::V128, types::I8X16);
ctx.emit(Inst::xmm_load_const_seq(constructed_mask, tmp1, ty));
let tmp1 = RegMem::reg(tmp1.to_reg());
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, tmp1, tmp0));
// PSHUFB the second argument, placing zeroes for unused lanes.
// Subtracting 16 rebases indices 16-31 to 0-15. Indices 0-15 (lanes taken from `lhs`)
// wrap around to 240-255 and indices of 32 and above stay greater than 15, so both
// groups are turned into zeroing indices by the helper.
let constructed_mask = mask
.iter()
.map(|b| b.wrapping_sub(16))
.map(zero_unknown_lane_index)
.collect();
let tmp2 = ctx.alloc_tmp(RegClass::V128, types::I8X16);
ctx.emit(Inst::xmm_load_const_seq(constructed_mask, tmp2, ty));
let tmp2 = RegMem::reg(tmp2.to_reg());
// `dst` still holds the copy of `rhs` made before the branch.
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, tmp2, dst));
// OR the shuffled registers (the mechanism and lane-size for OR-ing the registers
// is not important).
// Each lane is non-zero in at most one of `tmp0` and `dst`, so the OR merges the two
// partial results without them interfering.
let tmp0 = RegMem::reg(tmp0.to_reg());
ctx.emit(Inst::xmm_rm_r(SseOpcode::Orps, tmp0, dst));
// TODO when AVX512 is enabled we should replace this sequence with a single VPERMB
}
}
Opcode::Insertlane => {
// The instruction format maps to variables like: %dst = insertlane %in_vec, %src, %lane
let ty = ty.unwrap();