[machinst x64]: add shuffle implementation
This commit is contained in:
@@ -425,6 +425,7 @@ pub enum SseOpcode {
|
|||||||
Pmulld,
|
Pmulld,
|
||||||
Pmullw,
|
Pmullw,
|
||||||
Pmuludq,
|
Pmuludq,
|
||||||
|
Pshufb,
|
||||||
Pshufd,
|
Pshufd,
|
||||||
Psllw,
|
Psllw,
|
||||||
Pslld,
|
Pslld,
|
||||||
@@ -557,7 +558,7 @@ impl SseOpcode {
|
|||||||
| SseOpcode::Ucomisd
|
| SseOpcode::Ucomisd
|
||||||
| SseOpcode::Xorpd => SSE2,
|
| SseOpcode::Xorpd => SSE2,
|
||||||
|
|
||||||
SseOpcode::Pabsb | SseOpcode::Pabsw | SseOpcode::Pabsd => SSSE3,
|
SseOpcode::Pabsb | SseOpcode::Pabsw | SseOpcode::Pabsd | SseOpcode::Pshufb => SSSE3,
|
||||||
|
|
||||||
SseOpcode::Insertps
|
SseOpcode::Insertps
|
||||||
| SseOpcode::Pextrb
|
| SseOpcode::Pextrb
|
||||||
@@ -672,6 +673,7 @@ impl fmt::Debug for SseOpcode {
|
|||||||
SseOpcode::Pmulld => "pmulld",
|
SseOpcode::Pmulld => "pmulld",
|
||||||
SseOpcode::Pmullw => "pmullw",
|
SseOpcode::Pmullw => "pmullw",
|
||||||
SseOpcode::Pmuludq => "pmuludq",
|
SseOpcode::Pmuludq => "pmuludq",
|
||||||
|
SseOpcode::Pshufb => "pshufb",
|
||||||
SseOpcode::Pshufd => "pshufd",
|
SseOpcode::Pshufd => "pshufd",
|
||||||
SseOpcode::Psllw => "psllw",
|
SseOpcode::Psllw => "psllw",
|
||||||
SseOpcode::Pslld => "pslld",
|
SseOpcode::Pslld => "pslld",
|
||||||
|
|||||||
@@ -1797,6 +1797,7 @@ pub(crate) fn emit(
|
|||||||
SseOpcode::Pmulld => (LegacyPrefixes::_66, 0x0F3840, 3),
|
SseOpcode::Pmulld => (LegacyPrefixes::_66, 0x0F3840, 3),
|
||||||
SseOpcode::Pmullw => (LegacyPrefixes::_66, 0x0FD5, 2),
|
SseOpcode::Pmullw => (LegacyPrefixes::_66, 0x0FD5, 2),
|
||||||
SseOpcode::Pmuludq => (LegacyPrefixes::_66, 0x0FF4, 2),
|
SseOpcode::Pmuludq => (LegacyPrefixes::_66, 0x0FF4, 2),
|
||||||
|
SseOpcode::Pshufb => (LegacyPrefixes::_66, 0x0F3800, 3),
|
||||||
SseOpcode::Psubb => (LegacyPrefixes::_66, 0x0FF8, 2),
|
SseOpcode::Psubb => (LegacyPrefixes::_66, 0x0FF8, 2),
|
||||||
SseOpcode::Psubd => (LegacyPrefixes::_66, 0x0FFA, 2),
|
SseOpcode::Psubd => (LegacyPrefixes::_66, 0x0FFA, 2),
|
||||||
SseOpcode::Psubq => (LegacyPrefixes::_66, 0x0FFB, 2),
|
SseOpcode::Psubq => (LegacyPrefixes::_66, 0x0FFB, 2),
|
||||||
|
|||||||
@@ -3243,6 +3243,12 @@ fn test_x64_emit() {
|
|||||||
"pxor %xmm11, %xmm2",
|
"pxor %xmm11, %xmm2",
|
||||||
));
|
));
|
||||||
|
|
||||||
|
insns.push((
|
||||||
|
Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::reg(xmm11), w_xmm2),
|
||||||
|
"66410F3800D3",
|
||||||
|
"pshufb %xmm11, %xmm2",
|
||||||
|
));
|
||||||
|
|
||||||
// XMM_Mov_R_M: float stores
|
// XMM_Mov_R_M: float stores
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_mov_r_m(SseOpcode::Movss, xmm15, Amode::imm_reg(128, r12), None),
|
Inst::xmm_mov_r_m(SseOpcode::Movss, xmm15, Amode::imm_reg(128, r12), None),
|
||||||
|
|||||||
@@ -2640,6 +2640,75 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
ctx.emit(Inst::gen_move(dst, src, ty));
|
ctx.emit(Inst::gen_move(dst, src, ty));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Opcode::Shuffle => {
|
||||||
|
let ty = ty.unwrap();
|
||||||
|
let dst = get_output_reg(ctx, outputs[0]);
|
||||||
|
let lhs_ty = ctx.input_ty(insn, 0);
|
||||||
|
let lhs = put_input_in_reg(ctx, inputs[0]);
|
||||||
|
let rhs = put_input_in_reg(ctx, inputs[1]);
|
||||||
|
let mask = if let &InstructionData::Shuffle { mask, .. } = ctx.data(insn) {
|
||||||
|
ctx.get_immediate(mask).clone()
|
||||||
|
} else {
|
||||||
|
unreachable!("shuffle should always have the shuffle format")
|
||||||
|
};
|
||||||
|
|
||||||
|
// A mask-building helper: in 128-bit SIMD, 0-15 indicate which lane to read from and a
|
||||||
|
// 1 in the most significant position zeroes the lane.
|
||||||
|
let zero_unknown_lane_index = |b: u8| if b > 15 { 0b10000000 } else { b };
|
||||||
|
|
||||||
|
ctx.emit(Inst::gen_move(dst, rhs, ty));
|
||||||
|
if rhs == lhs {
|
||||||
|
// If `lhs` and `rhs` are the same we can use a single PSHUFB to shuffle the XMM
|
||||||
|
// register. We statically build `constructed_mask` to zero out any unknown lane
|
||||||
|
// indices (may not be completely necessary: verification could reject incorrect mask
|
||||||
|
// values) and fix the indexes to all point to the `dst` vector.
|
||||||
|
let constructed_mask = mask
|
||||||
|
.iter()
|
||||||
|
// A mask index greater than 15 may still refer to a lane of the second operand (`rhs`).
|
||||||
|
.map(|&b| if b > 15 { b.wrapping_sub(16) } else { b })
|
||||||
|
.map(zero_unknown_lane_index)
|
||||||
|
.collect();
|
||||||
|
let tmp = ctx.alloc_tmp(RegClass::V128, types::I8X16);
|
||||||
|
ctx.emit(Inst::xmm_load_const_seq(constructed_mask, tmp, ty));
|
||||||
|
// After loading the constructed mask in a temporary register, we use this to
|
||||||
|
// shuffle the `dst` register (remember that, in this case, it is the same as
|
||||||
|
// `lhs`/`rhs`, so we disregard those registers).
|
||||||
|
let tmp = RegMem::reg(tmp.to_reg());
|
||||||
|
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, tmp, dst));
|
||||||
|
} else {
|
||||||
|
// If `lhs` and `rhs` are different, we must shuffle each separately and then OR
|
||||||
|
// them together. This is necessary due to PSHUFB semantics. As in the case above,
|
||||||
|
// we build the `constructed_mask` for each case statically.
|
||||||
|
|
||||||
|
// PSHUFB the `lhs` argument into `tmp0`, placing zeroes for unused lanes.
|
||||||
|
let tmp0 = ctx.alloc_tmp(RegClass::V128, lhs_ty);
|
||||||
|
ctx.emit(Inst::gen_move(tmp0, lhs, lhs_ty));
|
||||||
|
let constructed_mask = mask.iter().cloned().map(zero_unknown_lane_index).collect();
|
||||||
|
let tmp1 = ctx.alloc_tmp(RegClass::V128, types::I8X16);
|
||||||
|
ctx.emit(Inst::xmm_load_const_seq(constructed_mask, tmp1, ty));
|
||||||
|
let tmp1 = RegMem::reg(tmp1.to_reg());
|
||||||
|
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, tmp1, tmp0));
|
||||||
|
|
||||||
|
// PSHUFB the second argument, placing zeroes for unused lanes.
|
||||||
|
let constructed_mask = mask
|
||||||
|
.iter()
|
||||||
|
.map(|b| b.wrapping_sub(16))
|
||||||
|
.map(zero_unknown_lane_index)
|
||||||
|
.collect();
|
||||||
|
let tmp2 = ctx.alloc_tmp(RegClass::V128, types::I8X16);
|
||||||
|
ctx.emit(Inst::xmm_load_const_seq(constructed_mask, tmp2, ty));
|
||||||
|
let tmp2 = RegMem::reg(tmp2.to_reg());
|
||||||
|
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, tmp2, dst));
|
||||||
|
|
||||||
|
// OR the shuffled registers (the mechanism and lane-size for OR-ing the registers
|
||||||
|
// is not important).
|
||||||
|
let tmp0 = RegMem::reg(tmp0.to_reg());
|
||||||
|
ctx.emit(Inst::xmm_rm_r(SseOpcode::Orps, tmp0, dst));
|
||||||
|
|
||||||
|
// TODO when AVX512 is enabled we should replace this sequence with a single VPERMB
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
Opcode::Insertlane => {
|
Opcode::Insertlane => {
|
||||||
// The instruction format maps to variables like: %dst = insertlane %in_vec, %src, %lane
|
// The instruction format maps to variables like: %dst = insertlane %in_vec, %src, %lane
|
||||||
let ty = ty.unwrap();
|
let ty = ty.unwrap();
|
||||||
|
|||||||
Reference in New Issue
Block a user