From a64abf9b7650ed9f2916b030cb92cb9dfbb1adb2 Mon Sep 17 00:00:00 2001
From: Andrew Brown
Date: Wed, 23 Sep 2020 08:39:33 -0700
Subject: [PATCH] [machinst x64]: add shuffle implementation

---
 cranelift/codegen/src/isa/x64/inst/args.rs    |  4 +-
 cranelift/codegen/src/isa/x64/inst/emit.rs    |  1 +
 .../codegen/src/isa/x64/inst/emit_tests.rs    |  6 ++
 cranelift/codegen/src/isa/x64/lower.rs        | 69 +++++++++++++++++++
 4 files changed, 79 insertions(+), 1 deletion(-)

diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs
index f763945766..719cfa760a 100644
--- a/cranelift/codegen/src/isa/x64/inst/args.rs
+++ b/cranelift/codegen/src/isa/x64/inst/args.rs
@@ -425,6 +425,7 @@ pub enum SseOpcode {
     Pmulld,
     Pmullw,
     Pmuludq,
+    Pshufb,
     Pshufd,
     Psllw,
     Pslld,
@@ -557,7 +558,7 @@ impl SseOpcode {
             | SseOpcode::Ucomisd
             | SseOpcode::Xorpd => SSE2,
 
-            SseOpcode::Pabsb | SseOpcode::Pabsw | SseOpcode::Pabsd => SSSE3,
+            SseOpcode::Pabsb | SseOpcode::Pabsw | SseOpcode::Pabsd | SseOpcode::Pshufb => SSSE3,
 
             SseOpcode::Insertps
             | SseOpcode::Pextrb
@@ -672,6 +673,7 @@ impl fmt::Debug for SseOpcode {
             SseOpcode::Pmulld => "pmulld",
             SseOpcode::Pmullw => "pmullw",
             SseOpcode::Pmuludq => "pmuludq",
+            SseOpcode::Pshufb => "pshufb",
             SseOpcode::Pshufd => "pshufd",
             SseOpcode::Psllw => "psllw",
             SseOpcode::Pslld => "pslld",
diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs
index acfc6f27aa..7094752633 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -1797,6 +1797,7 @@ pub(crate) fn emit(
                 SseOpcode::Pmulld => (LegacyPrefixes::_66, 0x0F3840, 3),
                 SseOpcode::Pmullw => (LegacyPrefixes::_66, 0x0FD5, 2),
                 SseOpcode::Pmuludq => (LegacyPrefixes::_66, 0x0FF4, 2),
+                SseOpcode::Pshufb => (LegacyPrefixes::_66, 0x0F3800, 3),
                 SseOpcode::Psubb => (LegacyPrefixes::_66, 0x0FF8, 2),
                 SseOpcode::Psubd => (LegacyPrefixes::_66, 0x0FFA, 2),
                 SseOpcode::Psubq => (LegacyPrefixes::_66, 0x0FFB, 2),
diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
index 8945435f6c..23446ff27e 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
@@ -3243,6 +3243,12 @@ fn test_x64_emit() {
         "pxor %xmm11, %xmm2",
     ));
 
+    insns.push((
+        Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::reg(xmm11), w_xmm2),
+        "66410F3800D3",
+        "pshufb %xmm11, %xmm2",
+    ));
+
     // XMM_Mov_R_M: float stores
     insns.push((
         Inst::xmm_mov_r_m(SseOpcode::Movss, xmm15, Amode::imm_reg(128, r12), None),
diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs
index 2da82f40f7..d1e7b31bb6 100644
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -2640,6 +2640,75 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             ctx.emit(Inst::gen_move(dst, src, ty));
         }
 
+        Opcode::Shuffle => {
+            let ty = ty.unwrap();
+            let dst = get_output_reg(ctx, outputs[0]);
+            let lhs_ty = ctx.input_ty(insn, 0);
+            let lhs = put_input_in_reg(ctx, inputs[0]);
+            let rhs = put_input_in_reg(ctx, inputs[1]);
+            let mask = if let &InstructionData::Shuffle { mask, .. } = ctx.data(insn) {
+                ctx.get_immediate(mask).clone()
+            } else {
+                unreachable!("shuffle should always have the shuffle format")
+            };
+
+            // A mask-building helper: in 128-bit SIMD, 0-15 indicate which lane to read from and
+            // a 1 in the most significant bit zeroes the lane.
+            let zero_unknown_lane_index = |b: u8| if b > 15 { 0b10000000 } else { b };
+
+            ctx.emit(Inst::gen_move(dst, rhs, ty));
+            if rhs == lhs {
+                // If `lhs` and `rhs` are the same we can use a single PSHUFB to shuffle the XMM
+                // register. We statically build `constructed_mask` to zero out any unknown lane
+                // indices (this may not be strictly necessary, since the verifier should reject
+                // invalid mask values) and fix the indices so they all point into the `dst` vector.
+                let constructed_mask = mask
+                    .iter()
+                    // An index above 15 refers to the same register here, so wrap it back into 0-15.
+                    .map(|&b| if b > 15 { b.wrapping_sub(16) } else { b })
+                    .map(zero_unknown_lane_index)
+                    .collect();
+                let tmp = ctx.alloc_tmp(RegClass::V128, types::I8X16);
+                ctx.emit(Inst::xmm_load_const_seq(constructed_mask, tmp, ty));
+                // After loading the constructed mask into a temporary register, we use it to
+                // shuffle `dst` in place (remember that, in this case, `dst` already holds the
+                // value of the single source register).
+                let tmp = RegMem::reg(tmp.to_reg());
+                ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, tmp, dst));
+            } else {
+                // If `lhs` and `rhs` are different, we must shuffle each separately and then OR
+                // them together, because PSHUFB can only select lanes from a single source
+                // register. As in the case above, we build each `constructed_mask` statically.
+
+                // PSHUFB the `lhs` argument into `tmp0`, placing zeroes for unused lanes.
+                let tmp0 = ctx.alloc_tmp(RegClass::V128, lhs_ty);
+                ctx.emit(Inst::gen_move(tmp0, lhs, lhs_ty));
+                let constructed_mask = mask.iter().cloned().map(zero_unknown_lane_index).collect();
+                let tmp1 = ctx.alloc_tmp(RegClass::V128, types::I8X16);
+                ctx.emit(Inst::xmm_load_const_seq(constructed_mask, tmp1, ty));
+                let tmp1 = RegMem::reg(tmp1.to_reg());
+                ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, tmp1, tmp0));
+
+                // PSHUFB the `rhs` argument (already moved into `dst`), placing zeroes for unused lanes.
+                let constructed_mask = mask
+                    .iter()
+                    .map(|b| b.wrapping_sub(16))
+                    .map(zero_unknown_lane_index)
+                    .collect();
+                let tmp2 = ctx.alloc_tmp(RegClass::V128, types::I8X16);
+                ctx.emit(Inst::xmm_load_const_seq(constructed_mask, tmp2, ty));
+                let tmp2 = RegMem::reg(tmp2.to_reg());
+                ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, tmp2, dst));
+
+                // OR the two shuffled registers together (the exact OR instruction and lane size
+                // do not matter; any bitwise OR works).
+                let tmp0 = RegMem::reg(tmp0.to_reg());
+                ctx.emit(Inst::xmm_rm_r(SseOpcode::Orps, tmp0, dst));
+
+                // TODO: when AVX512 is enabled, replace this sequence with a single VPERMB.
+            }
+        }
+
         Opcode::Insertlane => {
             // The instruction format maps to variables like: %dst = insertlane %in_vec, %src, %lane
             let ty = ty.unwrap();
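
Reviewer note (not part of the patch): the two-register path above derives one PSHUFB control mask per input and ORs the shuffled results. Below is a minimal standalone sketch of that mask construction in plain Rust; the names `pshufb_masks` and `zero_unknown` are illustrative and do not come from the patch.

// Given a CLIF shuffle mask of 16 byte indices in 0..32, build the two PSHUFB
// control masks used when the inputs differ. Indices 0..16 select lanes from the
// first operand, indices 16..32 from the second; an out-of-range index becomes
// 0x80, which makes PSHUFB write a zero byte for that lane.
fn pshufb_masks(shuffle_mask: &[u8; 16]) -> ([u8; 16], [u8; 16]) {
    // A lane index above 15 is out of range for the current operand, so set the
    // high bit to zero that lane in the PSHUFB result.
    let zero_unknown = |b: u8| if b > 15 { 0b1000_0000 } else { b };

    let mut lhs_mask = [0u8; 16];
    let mut rhs_mask = [0u8; 16];
    for (i, &b) in shuffle_mask.iter().enumerate() {
        // For the lhs shuffle, indices 16..32 are "unknown" and get zeroed.
        lhs_mask[i] = zero_unknown(b);
        // For the rhs shuffle, rebase indices 16..32 down to 0..16; indices 0..16
        // wrap below zero, exceed 15, and get zeroed instead.
        rhs_mask[i] = zero_unknown(b.wrapping_sub(16));
    }
    (lhs_mask, rhs_mask)
}

fn main() {
    // Interleave the low eight lanes of two vectors: even positions from lhs,
    // odd positions from rhs.
    let mask = [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23];
    let (lhs_mask, rhs_mask) = pshufb_masks(&mask);
    // lhs_mask keeps indices 0..8 and zeroes the rhs positions; rhs_mask does the
    // opposite, so OR-ing the two PSHUFB results yields the requested shuffle.
    println!("{:?}", lhs_mask);
    println!("{:?}", rhs_mask);
}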